def get_urls(self, get_proxie_or_not=False):
"""
:type get_proxie_or_not: bool
:param get_proxie_or_not: ??????ip
:return: ?????url???????url??
"""
list_url = []
try:
if get_proxie_or_not:
p = Proxies()
p.get_ip_and_port()
self.session.proxies = {
"http": p.ip_and_port,
"https": p.ip_and_port
}
response = self.session.get(self.start_url, timeout=30)
if response.status_code == 200:
html = response.content
else:
# ??selenium+phantomjs???????
# ??phantomjs????????
desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
headers = self.headers
for key, value in headers.iteritems():
desired_capabilities['phantomjs.page.customHeaders.{}'.format(key)] = value
driver = webdriver.PhantomJS(
desired_capabilities=desired_capabilities
)
driver.get(self.start_url)
html = driver.page_source
driver.quit()
soup = BeautifulSoup(html, 'lxml')
# ?????????????BeautifulSoup?????
urls = soup.find()
assert urls is not None
repeat_num = 0
for url in urls:
if url['href'] not in list_url:
list_url.append(url['href'])
else:
repeat_num += 1
print "??%d??????????" % repeat_num
except requests.ConnectTimeout:
print "url????????????????"
if list_url:
return list_url
else:
print "??url????????"
raise ValueError