def main(number):
    """Fetch the bilibili video page ``av<number>`` and hand it to ``getInfo``.

    The page is loaded in a PhantomJS browser with image loading disabled,
    parsed with BeautifulSoup (``lxml`` parser) and passed to ``getInfo``.
    Failures while loading or processing the page are logged and swallowed.

    :param number: video id appended to ``http://www.bilibili.com/video/av``.
    :return: None
    """
    import logging

    url = 'http://www.bilibili.com/video/av' + str(number) + '/'
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
    )
    dcap["phantomjs.page.settings.loadImages"] = False
    # phantomjs.exe is expected in G:\Anaconda3\phantomjs\bin
    driver = webdriver.PhantomJS(executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                                 desired_capabilities=dcap)
    try:
        try:
            driver.get(url)
            content = driver.page_source
        finally:
            # Single teardown point: the browser is quit exactly once,
            # whether or not the page could be loaded.
            driver.quit()
        soup = BeautifulSoup(content, 'lxml')
        getInfo(soup)
    except Exception:
        # Best effort: keep the caller running, but leave a trace of the error.
        logging.getLogger(__name__).exception("failed to process %s", url)
# Example source code for the Python class PHANTOMJS
def getSoup(start, stop):
    """Process the bilibili video pages ``av<start>`` .. ``av<stop>`` (inclusive).

    Each page is loaded in its own PhantomJS browser (image loading
    disabled), parsed with BeautifulSoup (``lxml`` parser) and passed to
    ``getInfo``.  A failure on one page is logged and the loop moves on to
    the next id; the browser is always quit, even when loading fails.

    :param start: first video id.
    :param stop: last video id (inclusive).
    :return: None
    """
    import logging

    log = logging.getLogger(__name__)
    for number in range(start, stop + 1):
        url = 'http://www.bilibili.com/video/av' + str(number) + '/'
        try:
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = (
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
            )
            dcap["phantomjs.page.settings.loadImages"] = False
            # phantomjs.exe is expected in G:\Anaconda3\phantomjs\bin
            driver = webdriver.PhantomJS(executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                                         desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # Always release the PhantomJS process for this page.
                driver.quit()
            soup = BeautifulSoup(content, 'lxml')
            getInfo(soup)
        except Exception:
            # One bad page must not abort the remaining ids.
            log.exception("failed to process %s", url)
def get_webdriver(self):
    """Create a PhantomJS driver configured from ``self.webdriver_config``.

    The config's ``proxy`` (if set) is passed on the PhantomJS command line,
    its ``header['User-Agent']`` (if a header is set) is applied as user
    agent, and its ``timeout`` becomes the page-load timeout.

    :return: the newly created driver.
    """
    config = self.webdriver_config

    # Command-line flags are only needed when a proxy is configured.
    phantom_flags = (
        [
            "--proxy=" + config.proxy,
            "--proxy-type=http",
            "--ignore-ssl-errors=true",
        ]
        if config.proxy
        else []
    )

    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    if config.header:
        user_agent = config.header['User-Agent']
        # The same value goes into the page setting and the custom header.
        for key in ("phantomjs.page.settings.userAgent",
                    "phantomjs.page.customHeaders.User-Agent"):
            capabilities[key] = user_agent
    capabilities["takesScreenshot"] = True

    phantom = webdriver.PhantomJS(
        config.phantomjs_path,
        service_args=phantom_flags,
        desired_capabilities=capabilities,
    )
    phantom.set_page_load_timeout(config.timeout)
    return phantom
def process_request(self, request, spider):
    """Render selected requests in PhantomJS and return the resulting page.

    Only requests whose URL has the character ``'c'`` at index 26 are
    rendered; for every other request (including URLs shorter than 27
    characters) the method returns ``None``.

    For a rendered request the page is loaded with a random user agent from
    ``self.user_agent_list``, the method sleeps 15-22 seconds, tries to click
    the "showDetail" link, and wraps the final page source in an
    ``HtmlResponse``.

    :param request: object exposing the target ``url``.
    :param spider: unused.
    :return: an ``HtmlResponse`` for rendered requests, otherwise ``None``.
    """
    # Slicing instead of indexing: a short URL yields '' rather than IndexError.
    if request.url[26:27] != 'c':
        return None

    ua = random.choice(self.user_agent_list)
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = ua
    dcap["phantomjs.page.settings.loadImages"] = False
    # Raw string: same path as before, without invalid escape sequences.
    driver = webdriver.PhantomJS(executable_path=r'E:\Webdriver\phantomjs-2.1.1-windows\bin\phantomjs.exe',
                                 desired_capabilities=dcap)
    try:
        driver.get(request.url)
        sleep_time = random.randint(15, 22)
        time.sleep(sleep_time)
        try:
            detail = driver.find_element_by_xpath('//a[@ng-click="showDetail = btnOnClick(showDetail)"]')
            detail.click()
        except Exception:
            # Clicking the detail link is best effort; the page is used as is.
            pass
        body = driver.page_source
        url = driver.current_url
    finally:
        # Release the PhantomJS process even if loading the page failed.
        driver.quit()
    return HtmlResponse(url=url, body=body, request=request, encoding='utf-8')
def __init__(self, url):
    """Remember ``url`` and start the PhantomJS browser used for it.

    Sets placeholder book metadata, creates the driver as ``self.d`` with a
    generic user agent, pre-seeds two ``.ebookjapan.jp`` cookies, sizes the
    window and creates a 60-second ``WebDriverWait`` as ``self.wait``.

    :param url: stored as ``self.url``.
    """
    self.url = url
    self.book_name = "N/A"
    self.book_volume = None

    # Present a generic user agent.
    capabilities = dict(DC.PHANTOMJS)
    capabilities["phantomjs.page.settings.userAgent"] = USER_AGENT
    phantom_flags = [
        "--ignore-ssl-errors=true",
        "--ssl-protocol=any",
        "--web-security=false",
        "--ssl-protocol=TLSv1",
    ]
    self.d = webdriver.PhantomJS(desired_capabilities=capabilities,
                                 service_args=phantom_flags)

    # Cookies that make the site think the ToS were accepted earlier.
    preset_cookies = (
        ("tachiyomi_auto_reader", "Browser"),
        ("tachiyomi_user_policy", "on"),
    )
    for cookie_name, cookie_value in preset_cookies:
        self.d.add_cookie({"name": cookie_name, "value": cookie_value,
                           "domain": ".ebookjapan.jp", "path": "/"})
    self.d.set_window_size(1120, 550)

    # Shared explicit-wait helper.
    self.wait = WebDriverWait(self.d, 60)
def get_pages(self):
    '''
    Get all pages' urls of this chapter using selenium and PhantomJS.

    The chapter page (``self.chapter_url``) is loaded with image loading
    disabled; the ``<option>`` entries of the ``select_page()`` drop-down
    are extracted from the page source with regular expressions.  On any
    error the traceback is printed and the result is an empty list.

    return:
        a list of tuple (page_num, page_url), also stored in ``self.pages``
    '''
    r_slt = r'onchange="select_page\(\)">([\s\S]*?)</select>'
    r_p = r'<option value="(.*?)".*?>?(\d*?)?<'
    # Stays None when PhantomJS itself cannot be started, so the cleanup
    # below does not fail on an unbound name.
    driver = None
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Images are not needed to read the page list.
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(desired_capabilities=dcap)
        driver.get(self.chapter_url)
        text = driver.page_source
        st = re.findall(r_slt, text)[0]
        self.pages = [(int(p[-1]), p[0]) for p in re.findall(r_p, st)]
    except Exception:
        # KeyboardInterrupt is not an Exception and still propagates.
        traceback.print_exc()
        self.pages = []
    finally:
        if driver is not None:
            driver.quit()
    print('Got {l} pages in chapter {ch}'.format(l=len(self.pages), ch=self.chapter_title))
    return self.pages
def _get_PhantomJS(self):
    """Start PhantomJS and store the driver in ``self.webdriver``.

    Proxy flags are derived from ``self.proxy`` (when set) and a random
    non-mobile user agent is applied.

    :return: True when the driver was created; False when creation raised
        a connection error or any step raised ``WebDriverException``
        (the error is logged in both cases).
    """
    try:
        phantom_flags = []
        if self.proxy:
            phantom_flags += [
                '--proxy={}:{}'.format(self.proxy.host, self.proxy.port),
                '--proxy-type={}'.format(self.proxy.proto),
            ]
            # Credentials are only passed when both parts are present.
            if self.proxy.username and self.proxy.password:
                credentials = '--proxy-auth={}:{}'.format(
                    self.proxy.username,
                    self.proxy.password
                )
                phantom_flags.append(credentials)

        agent = random_user_agent(mobile=False)
        logger.info('useragent: {}'.format(agent))

        capabilities = dict(DesiredCapabilities.PHANTOMJS)
        capabilities["phantomjs.page.settings.userAgent"] = agent

        try:
            self.webdriver = webdriver.PhantomJS(
                executable_path=self.config['executable_path'],
                service_args=phantom_flags,
                desired_capabilities=capabilities
            )
        except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as conn_err:
            logger.error(conn_err)
            return False
        return True
    except WebDriverException as driver_err:
        logger.error(driver_err)
        return False
def getSoup(start, stop):
    """Process the ``space.bilibili.com`` pages for ids ``start`` .. ``stop``.

    For every id the page is loaded in PhantomJS (image loading disabled),
    parsed with BeautifulSoup (``lxml`` parser) and passed to ``getInfo``;
    the fan uids are fetched through ``GetFansUid.GetFansUid`` and the
    record is printed and handed to ``saveData``.

    When an id fails, "get page error" is printed and processing continues
    with the next id.  The upper bound never changes, and the browser is
    always quit.

    :param start: first id.
    :param stop: last id (inclusive).
    :return: None
    """
    for number in range(start, stop + 1):
        url = 'http://space.bilibili.com/' + str(number) + '/#!/'
        try:
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = (
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
            )
            dcap["phantomjs.page.settings.loadImages"] = False  # do not load images
            driver = webdriver.PhantomJS(desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # quit() (not just close()) so the PhantomJS process ends.
                driver.quit()
            soup = BeautifulSoup(content, 'lxml')
            username = getInfo(soup)
            uid = number  # the page id doubles as the uid
            get_fans_uid = GetFansUid.GetFansUid(number)
            fansuid, fansnumber = get_fans_uid.get_uids()
            print(uid, username, fansnumber)
            saveData(uid, username, fansnumber, fansuid)
        except Exception:
            # Skip this id and carry on; no recursion, no widened range.
            print("get page error")
# ????
def main(number):
    """Process the ``space.bilibili.com`` page of one id.

    The page is loaded in PhantomJS (image loading disabled), parsed with
    BeautifulSoup (``lxml`` parser) and passed to ``getInfo``; the fan uids
    come from ``GetFansUid(number).get_uids()`` and the record is handed to
    ``saveData``.  Failures while loading or processing are logged and
    swallowed.

    :param number: id used in the URL and stored as the uid.
    :return: None
    """
    import logging

    url = 'http://space.bilibili.com/' + str(number) + '/#!/'
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
    )
    dcap["phantomjs.page.settings.loadImages"] = False  # do not load images
    driver = webdriver.PhantomJS(executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                                 desired_capabilities=dcap)
    try:
        try:
            driver.get(url)
            content = driver.page_source
        finally:
            # Single teardown point: the browser is quit exactly once.
            driver.quit()
        soup = BeautifulSoup(content, 'lxml')
        username = getInfo(soup)
        uid = number  # the page id doubles as the uid
        get_fans_uid = GetFansUid(number)
        fansuid, fansnumber = get_fans_uid.get_uids()
        saveData(uid, username, fansnumber, fansuid)
    except Exception:
        # Best effort: keep the caller running, but leave a trace of the error.
        logging.getLogger(__name__).exception("failed to process %s", url)
def getSoup(start, stop):
    """Process the ``space.bilibili.com`` pages for ids ``start`` .. ``stop``.

    Each page is loaded in its own PhantomJS browser (image loading
    disabled), parsed with BeautifulSoup (``lxml`` parser) and passed to
    ``getInfo``.  A failure on one id is logged and the loop moves on to
    the next one; the browser is always quit, even when loading fails.

    :param start: first id.
    :param stop: last id (inclusive).
    :return: None
    """
    import logging

    log = logging.getLogger(__name__)
    for number in range(start, stop + 1):
        url = 'http://space.bilibili.com/' + str(number) + '/#!/'
        try:
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = (
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
            )
            dcap["phantomjs.page.settings.loadImages"] = False  # do not load images
            # phantomjs.exe is expected in G:\Anaconda3\phantomjs\bin
            driver = webdriver.PhantomJS(executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                                         desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # Always release the PhantomJS process for this page.
                driver.quit()
            soup = BeautifulSoup(content, 'lxml')
            getInfo(soup)
        except Exception:
            # One bad page must not abort the remaining ids.
            log.exception("failed to process %s", url)
# ????
def getSoup(start, stop):
    """Process the ``space.bilibili.com`` pages for ids ``start`` .. ``stop``.

    For every id the page is loaded in PhantomJS (image loading disabled),
    parsed with BeautifulSoup (``lxml`` parser) and passed to ``getInfo``;
    the fan uids are fetched through ``GetFansUid`` and the record is handed
    to ``saveData``.

    When an id fails, "get page error" is printed and processing continues
    with the next id.  The upper bound never changes, and the browser is
    always quit.

    :param start: first id.
    :param stop: last id (inclusive).
    :return: None
    """
    for number in range(start, stop + 1):
        url = 'http://space.bilibili.com/' + str(number) + '/#!/'
        try:
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = (
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
            )
            dcap["phantomjs.page.settings.loadImages"] = False  # do not load images
            driver = webdriver.PhantomJS(executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                                         desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # Release the PhantomJS process even when loading fails.
                driver.quit()
            soup = BeautifulSoup(content, 'lxml')
            username = getInfo(soup)
            uid = number  # the page id doubles as the uid
            get_fans_uid = GetFansUid(number)
            fansuid, fansnumber = get_fans_uid.get_uids()
            saveData(uid, username, fansnumber, fansuid)
        except Exception:
            # Skip this id and carry on; no recursion, no widened range.
            print("get page error")
# ????
def getSoup(start, stop):
    """Process the ``space.bilibili.com`` pages for ids ``start`` .. ``stop``.

    For every id the page is loaded in PhantomJS (image loading disabled),
    parsed with BeautifulSoup (``lxml`` parser) and passed to ``getInfo``;
    the followed uids are fetched through ``GetFollowUid`` and the record is
    handed to ``saveData``.

    When an id fails, "get page error" is printed and processing continues
    with the next id.  The upper bound never changes, and the browser is
    always quit.

    :param start: first id.
    :param stop: last id (inclusive).
    :return: None
    """
    for number in range(start, stop + 1):
        url = 'http://space.bilibili.com/' + str(number) + '/#!/'
        try:
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = (
                "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"
            )
            dcap["phantomjs.page.settings.loadImages"] = False  # do not load images
            driver = webdriver.PhantomJS(executable_path='G:\\Anaconda3\\phantomjs\\bin\\phantomjs.exe',
                                         desired_capabilities=dcap)
            try:
                driver.get(url)
                content = driver.page_source
            finally:
                # Release the PhantomJS process even when loading fails.
                driver.quit()
            soup = BeautifulSoup(content, 'lxml')
            username = getInfo(soup)
            uid = number  # the page id doubles as the uid
            get_gz_uid = GetFollowUid(number)
            gzsuid, gznumber = get_gz_uid.get_uids()
            saveData(uid, username, gznumber, gzsuid)
        except Exception:
            # Skip this id and carry on; no recursion, no widened range.
            print("get page error")
# ????
def __init__(
    self,
    executable_path="phantomjs",
    port=0,
    desired_capabilities=DesiredCapabilities.PHANTOMJS,
    service_args=None,
    service_log_path=None,
):
    """
    Launch the PhantomJS / Ghostdriver service and open a driver session on it.

    :Args:
     - executable_path - path to the phantomjs executable; the default assumes it is on the $PATH
     - port - port for the service; when left as 0 a free port will be found
     - desired_capabilities: Dictionary object with non-browser specific
       capabilities only, such as "proxy" or "loggingPref".
     - service_args : A List of command line arguments to pass to PhantomJS
     - service_log_path: Path for phantomjs service to log to.
    """
    ghostdriver = Service(
        executable_path,
        port=port,
        service_args=service_args,
        log_path=service_log_path,
    )
    self.service = ghostdriver
    ghostdriver.start()

    try:
        session_options = {
            "command_executor": ghostdriver.service_url,
            "desired_capabilities": desired_capabilities,
        }
        RemoteWebDriver.__init__(self, **session_options)
    except BaseException:
        # Session setup failed: call quit() before passing the error on.
        self.quit()
        raise

    self._is_remote = False
def setUpClass(cls):
    """Start the PhantomJS browser stored as ``cls.driver``.

    The browser gets a 1920x1080 window and a 10 second implicit wait.
    """
    super(TestEditHandler, cls).setUpClass()
    # Disabled option kept for reference:
    #   capabilities['loggingPrefs'] = {'browser': 'ALL'}
    cls.driver = browser = webdriver.PhantomJS(
        desired_capabilities=DesiredCapabilities.PHANTOMJS
    )
    browser.set_window_size(1920, 1080)
    browser.implicitly_wait(10)
def initialize_driver(self, driver=None):
    """Select and store the Selenium driver in ``self.driver``.

    Order of precedence: a remote Chrome session when
    ``self.command_executor`` is set, otherwise a local PhantomJS or Chrome
    browser according to ``self.which_driver``, otherwise the ``driver``
    argument.  The chosen driver gets a 240 second page-load timeout.

    :param driver: driver to use when no other option applies.
    """
    def chrome_options_with_proxy():
        # Chrome options shared by the remote and the local Chrome branches.
        opts = Options()
        opts.add_argument("--disable-notifications")
        if self.proxy:
            opts.add_argument('--proxy-server=%s' % self.proxy)
        return opts

    if self.command_executor:
        remote_caps = chrome_options_with_proxy().to_capabilities()
        self.driver = webdriver.Remote(
            command_executor=self.command_executor,
            desired_capabilities=remote_caps
        )
    elif self.which_driver == 'phantomjs':
        phantom_caps = dict(DesiredCapabilities.PHANTOMJS)
        phantom_caps["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53 "
            "(KHTML, like Gecko) Chrome/15.0.87"
        )
        driver = webdriver.PhantomJS(desired_capabilities=phantom_caps)
        driver.set_window_size(1400, 1000)
        self.driver = driver
    elif self.which_driver == 'chrome':
        self.driver = webdriver.Chrome(chrome_options=chrome_options_with_proxy())
    else:
        # Fall back to the driver passed in by the caller.
        self.driver = driver

    # Applies to whichever driver was selected above.
    self.driver.set_page_load_timeout(time_to_wait=240)
def __init__(
    self,
    executable_path="phantomjs",
    port=0,
    desired_capabilities=DesiredCapabilities.PHANTOMJS,
    service_args=None,
    service_log_path=None,
):
    """
    Start a PhantomJS / Ghostdriver service, then create the driver on top of it.

    :Args:
     - executable_path - path to the phantomjs executable; the default assumes it is on the $PATH
     - port - port for the service; when left as 0 a free port will be found
     - desired_capabilities: Dictionary object with non-browser specific
       capabilities only, such as "proxy" or "loggingPref".
     - service_args : A List of command line arguments to pass to PhantomJS
     - service_log_path: Path for phantomjs service to log to.
    """
    phantom_service = Service(
        executable_path,
        port=port,
        service_args=service_args,
        log_path=service_log_path,
    )
    self.service = phantom_service
    phantom_service.start()

    try:
        RemoteWebDriver.__init__(
            self,
            command_executor=phantom_service.service_url,
            desired_capabilities=desired_capabilities,
        )
    except BaseException:
        # The session could not be created: call quit(), then re-raise.
        self.quit()
        raise

    self._is_remote = False
def open_driver(self):
    """Quit the current driver and, if needed, start a new PhantomJS one.

    After ``self.quit_driver()`` a new driver is created unless
    ``self.driver`` still exists and is a PhantomJS ``WebDriver``.  The new
    driver uses ``self.user_agent``, ``self.timeout``, ``self.proxy`` and
    ``self.phantomjs_binary_path`` and is stored in ``self.driver``.
    """
    self.quit_driver()

    already_open = hasattr(self, 'driver') and isinstance(
        self.driver, webdriver.phantomjs.webdriver.WebDriver)
    if already_open:
        return

    # Background reading for the settings below:
    # http://engineering.shapesecurity.com/2015/01/detecting-phantomjs-based-visitors.html
    # https://coderwall.com/p/9jgaeq/set-phantomjs-user-agent-string
    # http://phantomjs.org/api/webpage/property/settings.html
    # http://stackoverflow.com/questions/23390974/phantomjs-keeping-cache
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities.update({
        'phantomjs.page.settings.userAgent': self.user_agent,
        'phantomjs.page.settings.loadImages': 'false',
        'phantomjs.page.settings.clearMemoryCaches': 'true',
        # Resource timeout in milliseconds, never below two seconds.
        'phantomjs.page.settings.resourceTimeout': max(2000, int(self.timeout * 1000)),
        'acceptSslCerts': True,
        'applicationCacheEnabled': True,
        'handlesAlerts': False,
        'phantomjs.page.customHeaders': {
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, sdch',
        },
    })

    cli_flags = ['--disk-cache=false', '--ignore-ssl-errors=false', '--ssl-protocol=TLSv1.2']
    if self.proxy is not None:
        cli_flags.insert(0, '--proxy={}'.format(self.proxy))

    # The binary path is passed positionally only when one is configured.
    binary = () if self.phantomjs_binary_path is None else (self.phantomjs_binary_path,)
    phantom = webdriver.PhantomJS(*binary, desired_capabilities=capabilities,
                                  service_args=cli_flags)

    phantom.set_window_size(1296, 1018)  # Tor browser size on Linux
    for apply_timeout in (phantom.implicitly_wait,
                          phantom.set_page_load_timeout,
                          phantom.set_script_timeout):
        apply_timeout(self.timeout)
    self.driver = phantom
def __init__(self, executable_path="phantomjs", port=0,
             desired_capabilities=DesiredCapabilities.PHANTOMJS,
             service_args=None, service_log_path=None):
    """
    Bring up the PhantomJS / Ghostdriver service and attach a new driver to it.

    :Args:
     - executable_path - path to the phantomjs executable; the default assumes it is on the $PATH
     - port - port for the service; when left as 0 a free port will be found
     - desired_capabilities: Dictionary object with non-browser specific
       capabilities only, such as "proxy" or "loggingPref".
     - service_args : A List of command line arguments to pass to PhantomJS
     - service_log_path: Path for phantomjs service to log to.
    """
    service_options = dict(port=port, service_args=service_args,
                           log_path=service_log_path)
    self.service = Service(executable_path, **service_options)
    self.service.start()

    try:
        endpoint = self.service.service_url
        RemoteWebDriver.__init__(self, command_executor=endpoint,
                                 desired_capabilities=desired_capabilities)
    except Exception:
        # The session could not be created: call quit(), then re-raise.
        self.quit()
        raise

    self._is_remote = False
def load_phantomjs(config):
    """Start PhantomJS webdriver with the given configuration.

    A proxy is taken from the ``HTTPS_PROXY`` environment variable, falling
    back to ``HTTP_PROXY``; empty values count as unset.  A leading
    ``http://`` or ``https://`` is removed before the value is passed to
    PhantomJS, and a value without a scheme is passed through unchanged.

    Args:
        config (dict): The configuration loaded previously in Cabu.

    Returns:
        webdriver (selenium.webdriver): An instance of phantomJS webdriver.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    service_args = [
        '--ignore-ssl-errors=true',
        '--ssl-protocol=any',
        '--web-security=false'
    ]

    # `or` instead of a default argument: an empty HTTPS_PROXY must not hide
    # a usable HTTP_PROXY.
    proxy_address = os.environ.get('HTTPS_PROXY') or os.environ.get('HTTP_PROXY')
    if proxy_address:
        # Strip the scheme without assuming it is present, so https:// and
        # bare host:port values no longer crash on a failed match.
        proxy_ip = re.sub(r'^https?://', '', proxy_address)
        service_args.append('--proxy=%s' % proxy_ip)
        service_args.append('--proxy-type=http')

    if 'HEADERS' in config and config['HEADERS']:
        dcap = Headers(config).set_headers(dcap)

    return webdriver.PhantomJS(
        desired_capabilities=dcap,
        service_args=service_args,
        service_log_path=os.path.devnull
    )
def test_phantomjs_headers_loading(self):
    """The configured User-Agent ends up in the PhantomJS custom headers."""
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    headers = Headers(self.config).set_headers(dcap)
    # assertEqual: the assertEquals alias is deprecated and gone in Python 3.12.
    self.assertEqual(
        headers['phantomjs.page.customHeaders.User-Agent'],
        'Mozilla/6.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36'
        ' (KHTML, like Gecko) Chrome/48.0.2564.103 Safari/537.36'
    )
def test_chrome_headers_loading(self):
    """Setting headers raises once the app's driver name is 'Chrome'."""
    self.app.config['DRIVER_NAME'] = 'Chrome'
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    # The lambda keeps the Headers construction inside the asserted call.
    self.assertRaises(
        Exception,
        lambda: Headers(self.config).set_headers(capabilities)
    )
def create_selenium_driver(self):
    """Return a new PhantomJS driver with a Chrome user agent and a 1024x768 window."""
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities.update({
        "phantomjs.page.settings.userAgent":
            "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
    })
    phantom = webdriver.PhantomJS(desired_capabilities=capabilities)
    phantom.set_window_size(1024, 768)
    return phantom
def get_browser():
    """Create a PhantomJS browser with custom request headers.

    The headers and the user agent are written to a private copy of
    ``DesiredCapabilities.PHANTOMJS``.  They therefore apply from the first
    call on, and the shared class-level dictionary is left untouched.

    :return: the new browser, with a 1920x1080 window.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap.update({
        'phantomjs.page.customHeaders.Accept-Language': 'zh-CN,zh;q=0.8',
        'phantomjs.page.customHeaders.Connection': 'keep-alive',
        'phantomjs.page.customHeaders.Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'phantomjs.page.customHeaders.Accept-Encoding': 'gzip, deflate, sdch',
        'phantomjs.page.customHeaders.Cache-Control': 'max-age=0',
        "phantomjs.page.settings.userAgent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36",
    })
    phantomjs_path = "G:\\programeSoftwares\\python2.7\\Scripts\\phantomjs.exe"
    browser = webdriver.PhantomJS(desired_capabilities=dcap, executable_path=phantomjs_path)
    browser.set_window_size(1920, 1080)
    return browser
def __init__(
    self,
    executable_path="phantomjs",
    port=0,
    desired_capabilities=DesiredCapabilities.PHANTOMJS,
    service_args=None,
    service_log_path=None,
):
    """
    Create a PhantomJS / Ghostdriver instance: start the service first, the driver second.

    :Args:
     - executable_path - path to the phantomjs executable; the default assumes it is on the $PATH
     - port - port for the service; when left as 0 a free port will be found
     - desired_capabilities: Dictionary object with non-browser specific
       capabilities only, such as "proxy" or "loggingPref".
     - service_args : A List of command line arguments to pass to PhantomJS
     - service_log_path: Path for phantomjs service to log to.
    """
    self.service = service = Service(
        executable_path,
        port=port,
        service_args=service_args,
        log_path=service_log_path,
    )
    service.start()

    try:
        RemoteWebDriver.__init__(
            self,
            command_executor=service.service_url,
            desired_capabilities=desired_capabilities,
        )
    except BaseException:
        # Driver creation failed: call quit() and let the error propagate.
        self.quit()
        raise

    self._is_remote = False
def visit(self, url, xpath=None, timeout=60, retry=1, load_images=False, **kwargs):
    """Open ``url`` in a fresh PhantomJS browser and return the page source.

    A previously stored ``self.browser`` is quit first.  The new browser
    uses ``self.ua`` (or a Firefox default) as user agent and ``self.proxy``
    when set, and is stored in ``self.browser`` once the page was requested.
    Capabilities are built on a private copy, so the shared
    ``DesiredCapabilities.PHANTOMJS`` dictionary is not modified.

    :param url: address to load.
    :param xpath: optional element to look up after loading, with
        ``timeout`` as the implicit wait; a failed lookup is only printed.
    :param timeout: implicit wait in seconds, used only with ``xpath``.
    :param retry: number of additional load attempts after a failure.
    :param load_images: when false, PhantomJS is started with
        ``--load-images=false``.
    :return: the page source, or ``None`` when the browser could not be
        started or the page is the empty HTML skeleton.
    """
    if self.browser:
        self.browser.quit()
        # Drop the stale reference so it is not quit again later.
        self.browser = None

    user_agent = self.ua if self.ua else 'Mozilla/5.0 (Windows NT 6.1; rv:42.0) Gecko/20100101 Firefox/42.0'
    service_args = list()
    if not load_images:
        service_args += ['--load-images=false']
    if self.proxy:
        service_args += ['--proxy=%s' % self.proxy]

    try:
        # Private copy: the user agent must not leak into other drivers.
        capabilities = dict(DesiredCapabilities.PHANTOMJS)
        capabilities['phantomjs.page.settings.userAgent'] = user_agent
        browser = webdriver.PhantomJS(service_args=service_args if service_args else None,
                                      desired_capabilities=capabilities)
    except Exception as e:
        print(str(e))
        return None

    count = 0
    while (retry + 1) > count:
        count += 1
        try:
            browser.get(url)
            break
        except Exception as e:
            print(str(e))

    if xpath:
        browser.implicitly_wait(timeout)
        try:
            browser.find_element_by_xpath(xpath)
        except Exception as e:
            print(str(e))

    self.browser = browser
    result = browser.page_source
    return result if result != '<html><head></head><body></body></html>' else None
def get(self, url, xpath, timeout, retry, service_args, desired_capabilities):
    """Load ``url`` in PhantomJS and return cookies and page source as JSON.

    Capabilities are built on a private copy, so the shared
    ``DesiredCapabilities.PHANTOMJS`` dictionary is not modified.  The
    browser is quit on every path.

    :param url: address to load.
    :param xpath: optional element to look up after loading, with
        ``timeout`` as the implicit wait; a failed lookup is only printed.
    :param timeout: implicit wait in seconds, used only with ``xpath``.
    :param retry: number of additional load attempts after a failure.
    :param service_args: JSON-encoded list of PhantomJS command line
        arguments, or a false value for none.
    :param desired_capabilities: JSON-encoded dict of extra capabilities,
        or a false value for none.
    :return: a JSON string with the keys ``cookies`` and ``text``; ``''``
        when anything fails or the page is the empty HTML skeleton.
    """
    browser = None
    try:
        result = dict()
        # Decode the JSON arguments first so bad input fails before a
        # browser process is started.
        extra_capabilities = json.loads(desired_capabilities) if desired_capabilities else {}
        phantom_args = json.loads(service_args) if service_args else None

        capabilities = dict(DesiredCapabilities.PHANTOMJS)
        capabilities.update(extra_capabilities)
        browser = webdriver.PhantomJS(service_args=phantom_args,
                                      desired_capabilities=capabilities)
        count = 0
        while (retry + 1) > count:
            count += 1
            try:
                browser.get(url)
                break
            except Exception as e:
                print(str(e))
        if xpath:
            browser.implicitly_wait(timeout)
            try:
                browser.find_element_by_xpath(xpath)
            except Exception as e:
                print(str(e))
        text = browser.page_source
        if text == '<html><head></head><body></body></html>':
            return ''
        result['cookies'] = browser.get_cookies()
        # Kept as text: json.dumps cannot serialise bytes on Python 3, and
        # on Python 2 the encoded and unencoded forms dump identically.
        result['text'] = text
        return json.dumps(result)
    except Exception as e:
        print(str(e))
        return ''
    finally:
        # Single cleanup point for success, empty page and failure.
        if browser:
            browser.quit()
def download_articles_ph(self, url):
    '''
    Download a page with PhantomJS and return its source.

    The browser is started without image loading, uses a 30 second
    page-load timeout and waits one second after loading.  Failures are
    reported on stdout (timestamp, url, error) and are not raised.

    :param url: address of the page to download
    :return: the page source, or None when url is None or the download failed
    '''
    if url is None:
        return None
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = (
        UA
    )
    dcap["takesScreenshot"] = (False)
    try:
        driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no'])
    except Exception as e:
        print(datetime.datetime.now())
        print(url)
        print(e)
        return None

    try:
        driver.set_page_load_timeout(30)
        driver.get(url)
        time.sleep(1)
        html = driver.page_source
        return html
    except Exception as e:
        # Report the error like the start-up failure above does; a bare
        # except would also swallow KeyboardInterrupt.
        print(datetime.datetime.now())
        print(url)
        print(e)
        return None
    finally:
        driver.quit()
def maintain_cookies_ph(self):
    """Collect five cookie sets from ``http://weixin.sogou.com/``.

    Each set comes from a separate PhantomJS browser (started without image
    loading) that is always quit, even when loading the page fails.

    :return: list with five entries, each the result of ``get_cookies()``.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = UA
    cookie = []
    # Gather 5 cookie sets.
    for _ in range(5):
        driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no', ])
        try:
            driver.get("http://weixin.sogou.com/")
            cookie.append(driver.get_cookies())
        finally:
            # Do not leave a PhantomJS process behind when the visit fails.
            driver.quit()
    return cookie
def host_worker(hostQueue, fileQueue, timeout, user_agent, verbose):
    """Screenshot every host taken from ``hostQueue``.

    A host without an ``http://`` or ``https://`` prefix is tried with both
    schemes (http first).  For every URL that ``host_reachable`` accepts and
    ``save_image`` stores successfully, ``{url: filename}`` is put on
    ``fileQueue``.  The PhantomJS browser shared by all fetches is quit when
    the worker finishes or fails.

    :param hostQueue: queue of host names or URLs to process.
    :param fileQueue: queue receiving ``{url: image filename}`` dicts.
    :param timeout: page-load timeout, also passed to ``host_reachable``.
    :param user_agent: user agent string for PhantomJS.
    :param verbose: when true, progress messages are printed.
    """
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap["phantomjs.page.settings.userAgent"] = user_agent
    dcap["accept_untrusted_certs"] = True
    driver = webdriver.PhantomJS(service_args=['--ignore-ssl-errors=true'], desired_capabilities=dcap)  # or add to your PATH
    try:
        driver.set_window_size(1024, 768)  # optional
        driver.set_page_load_timeout(timeout)
        # NOTE(review): empty() followed by get() can block if another
        # worker takes the last item in between - confirm how workers share
        # this queue.
        while not hostQueue.empty():
            host = hostQueue.get()
            if host.startswith("http://") or host.startswith("https://"):
                targets = [host]
            else:
                targets = ["http://" + host, "https://" + host]
            for target in targets:
                filename = os.path.join("output", "images", str(uuid4()) + ".png")
                if verbose:
                    print("Fetching %s" % target)
                if host_reachable(target, timeout) and save_image(target, filename, driver):
                    fileQueue.put({target: filename})
                elif verbose:
                    print("%s is unreachable or timed out" % target)
    finally:
        # The driver is local to this worker, so nobody else can release it.
        driver.quit()
def __init__(self, user_agent=None, cookies_file=None):
    """
    Set up the PhantomJS selenium driver.

    Stores the arguments, creates the driver from the executable named in
    ``config['general']['phantomjs']``, calls ``self.load_cookies()`` and
    then applies a 30 second implicit wait and a 1024x768 window.

    :param user_agent: optional user agent string for the browser
    :param cookies_file: stored as ``self.cookies_file``
    :return:
    """
    self.conf = config
    self.user_agent = user_agent
    self.cookies_file = cookies_file

    # Setting names: http://phantomjs.org/api/webpage/property/settings.html
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities.update({
        'phantomjs.page.settings.loadImages': False,
        'phantomjs.page.settings.webSecurityEnabled': False,
        'phantomjs.page.settings.localToRemoteUrlAccessEnabled': True,
    })
    if user_agent:
        capabilities['phantomjs.page.settings.userAgent'] = user_agent

    phantom_binary = self.conf['general']['phantomjs']
    self.driver = webdriver.PhantomJS(
        desired_capabilities=capabilities,
        executable_path=phantom_binary,
    )

    self.load_cookies()
    self.driver.implicitly_wait(30)
    self.driver.set_window_size(1024, 768)