def start_requests(self):
    """Seed the spider by driving a real IE browser through lagou.com
    search-result pages and yielding one Request per job-posting link.

    For each search keyword, opens the site, submits the search, then
    walks up to 16 result pages, yielding a Request (callback:
    self.parse_page) for every position link found.

    Yields:
        scrapy Request objects for individual job-posting pages.
    """
    search_fields = [u'']  # TODO: restore `self.query` once search terms are wired in
    root_url = 'https://www.lagou.com'
    for field in search_fields:
        # NOTE(review): hard-coded IE driver path — consider making this configurable.
        driver = webdriver.Ie('C:\Program Files\Internet Explorer\IEDriverServer.exe')
        try:
            driver.get(root_url)
            time.sleep(1)  # give the site's welcome dialog time to appear/close
            driver.find_element_by_id('search_input').send_keys('%s' % field)
            driver.find_element_by_id('search_button').click()
            time.sleep(2)  # wait for redirection to the results page

            # The pager's "next" span tells us whether there are result pages
            # at all; its class attribute drives the pagination loop below.
            try:
                class_name = driver.find_element_by_xpath(
                    '//span[@action="next"]').get_attribute('class')
                print(class_name)
            except Exception:
                print("position is too sparse!")
                continue  # no pager column -> nothing to crawl for this field

            page = 0
            # Crawl at most 16 pages. The trailing space in "pager_next " is
            # what the site renders while a further page is still available.
            while class_name == "pager_next " and page < 16:
                time.sleep(1)
                for position_link in driver.find_elements_by_class_name('position_link'):
                    job_url = position_link.get_attribute('href')
                    print('adding new seed url: %s' % job_url)
                    # BUGFIX: yield each URL exactly once. The original
                    # appended to a growing list and re-yielded the ENTIRE
                    # list on every page, producing duplicate requests.
                    yield Request(url=job_url, callback=self.parse_page)
                page += 1
                print("parsing page: %d" % page)
                # BUGFIX: the original called driver.close() here, destroying
                # the window it was about to click "next" in. The browser must
                # stay open until pagination for this field is finished.
                driver.find_element_by_class_name('pager_next ').click()
                class_name = driver.find_element_by_xpath(
                    '//span[@action="next"]').get_attribute('class')
        finally:
            # Always release the browser, even when the search fails early
            # (the original leaked the driver on the sparse-results path).
            driver.quit()
lagouspider.py 文件源码
python
阅读 24
收藏 0
点赞 0
评论 0
评论列表
文章目录