def get_htmls(self, urls, get_proxie_or_not=False):
    """
    Download every URL in ``urls`` and return the parsed pages.

    Each URL is requested through ``self.session`` with a 30 second timeout.
    If the response status is not 200, the same URL is loaded with a
    PhantomJS webdriver that carries ``self.headers`` as custom headers.
    URLs whose request raises ``requests.ConnectTimeout`` are reported on
    stdout and skipped; any other exception propagates to the caller.

    :type urls: list
    :type get_proxie_or_not: bool
    :param urls: URLs to download
    :param get_proxie_or_not: when true, obtain a proxy address from
        ``Proxies`` and install it on ``self.session`` before each request
    :return: list of ``BeautifulSoup`` objects, one per page retrieved
    :raises ValueError: if no page could be retrieved (including when
        ``urls`` is empty)
    """
    list_html = []
    for url in urls:
        try:
            if get_proxie_or_not:
                # A fresh proxy is requested for every URL.
                p = Proxies()
                p.get_ip_and_port()
                self.session.proxies = {
                    "http": p.ip_and_port,
                    "https": p.ip_and_port,
                }
            response = self.session.get(url, timeout=30)
            if response.status_code == 200:
                html = response.content
            else:
                # Plain request was rejected: retry with selenium + PhantomJS,
                # forwarding the configured request headers to the browser.
                desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
                for key, value in self.headers.items():
                    capability = 'phantomjs.page.customHeaders.{}'.format(key)
                    desired_capabilities[capability] = value
                driver = webdriver.PhantomJS(
                    desired_capabilities=desired_capabilities
                )
                try:
                    # Load the URL being processed (the original loaded
                    # self.start_url here, returning the wrong page).
                    driver.get(url)
                    html = driver.page_source
                finally:
                    # Always shut the browser down, even if loading failed,
                    # so no PhantomJS process is left behind.
                    driver.quit()
            # Sanity check: neither path above is expected to yield None.
            assert html is not None
            list_html.append(BeautifulSoup(html, 'lxml'))
        except requests.ConnectTimeout:
            print("url????")
    if not list_html:
        print("??html???????????")
        raise ValueError("no html could be retrieved from the given urls")
    return list_html
评论列表
文章目录