lagouspider.py source code

python

Project: THUDataPiCrawler-old-version  Author: THUDataPI
# imports needed by this method (not shown in the original listing)
import time

from scrapy import Request
from selenium import webdriver


def start_requests(self):
    search_fields = [u'']
    # search_fields = self.query
    root_url = 'https://www.lagou.com'
    for field in search_fields:
        # open the IE browser; a raw string keeps the backslashes in the path intact
        driver = webdriver.Ie(r'C:\Program Files\Internet Explorer\IEDriverServer.exe')
        driver.get(root_url)  # open the root url
        time.sleep(1)  # wait for the landing dialog to close
        driver.find_element_by_id('search_input').send_keys(field)
        driver.find_element_by_id('search_button').click()
        time.sleep(2)  # wait for the redirect
        i = 0
        flag = True  # if there is no pager column, skip this search
        try:
            class_name = driver.find_element_by_xpath('//span[@action="next"]').get_attribute('class')
            print(class_name)
        except Exception:
            flag = False
            print("position is too sparse!")
        ##### crawl at most 16 pages #####
        while flag and class_name == "pager_next " and i < 16:  # note the trailing space after pager_next
            time.sleep(1)
            position_links = driver.find_elements_by_class_name('position_link')
            for position_link in position_links:
                job_url = position_link.get_attribute('href')
                print('adding new seed url: %s' % job_url)
                # yield each link as it is found, so no URL is requested twice
                yield Request(url=job_url, callback=self.parse_page)
            i += 1
            print("parsing page:", i)
            # go to the next page via the pager's "next" button
            # (find_element_by_class_name cannot match a class name containing a space)
            driver.find_element_by_xpath('//span[@action="next"]').click()
            class_name = driver.find_element_by_xpath('//span[@action="next"]').get_attribute('class')
        driver.quit()  # release the browser once this field has been crawled
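
The callback self.parse_page is referenced above but not included in this listing. As a minimal sketch of what such a callback could look like (the selector and field names below are illustrative assumptions, not taken from the project):

def parse_page(self, response):
    # hypothetical callback sketch: the selector and field names are
    # assumptions, not part of the original project
    yield {
        'url': response.url,
        'title': response.css('title::text').get(),
    }

With the spider registered in a Scrapy project, it would be started as usual, e.g. scrapy crawl <spider_name>, and start_requests would drive the browser to collect the seed URLs before Scrapy fetches each job page.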