def get_pages(self):
    '''
    Get all pages' urls of the chapter using selenium and PhantomJS.

    Loads ``self.chapter_url`` in a PhantomJS driver, locates the page
    selector (the ``<select>`` whose ``onchange`` is ``select_page()``) in
    the rendered page source and reads one entry per ``<option>``.

    The result is stored in ``self.pages`` and also returned. Any
    ``Exception`` raised along the way (driver start-up, page load,
    selector not found, option without a numeric label) is printed with
    ``traceback.print_exc()`` and leaves ``self.pages`` as an empty list.

    return:
        a list of tuple (page_num,page_url)
    '''
    # Captures everything between the page selector's opening tag and </select>.
    r_slt = r'onchange="select_page\(\)">([\s\S]*?)</select>'
    # Captures (option value, digits of the option label) for each <option>.
    r_p = r'<option value="(.*?)".*?>?(\d*?)?<'

    # Bind the name before the try block: if creating the driver fails, the
    # finally clause must not hit an unbound ``driver`` and mask the real error.
    driver = None
    try:
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Do not load images; only the page source is needed.
        dcap["phantomjs.page.settings.loadImages"] = False
        driver = webdriver.PhantomJS(desired_capabilities=dcap)
        driver.get(self.chapter_url)
        text = driver.page_source
        # IndexError here (selector not found) is handled below like any other error.
        st = re.findall(r_slt, text)[0]
        self.pages = [(int(p[-1]), p[0]) for p in re.findall(r_p, st)]
    except Exception:
        # KeyboardInterrupt is not an Exception subclass, so it is not
        # swallowed here and propagates to the caller after cleanup.
        traceback.print_exc()
        self.pages = []
    finally:
        # Only shut the driver down if it was actually started.
        if driver is not None:
            driver.quit()
    print('Got {l} pages in chapter {ch}'.format(l=len(self.pages),ch=self.chapter_title))
    return self.pages
评论列表
文章目录