crawldetails.py 文件源码

python
阅读 16 收藏 0 点赞 0 评论 0

项目:crawllagou 作者: ScarecrowFu 项目源码 文件源码
def parse_detail(self, response):
    """Build one ``CrawldetailsItem`` from a job-detail page response.

    Every field is extracted independently, so a node that is missing from
    the page only leaves that one field unset instead of discarding all the
    fields that come after it.

    Args:
        response: the page response; ``response.meta['kd']`` is copied into
            the item and ``response.url`` is recorded as ``url``.

    Yields:
        The item, holding every field that could be extracted. Fields whose
        extraction failed are left unset (a message is printed for each).
    """
    item = CrawldetailsItem()
    sel = Selector(response)

    def first(xpath):
        # First match of the XPath; raises IndexError when nothing matches.
        return sel.xpath(xpath).extract()[0]

    def second_token(xpath):
        # Strip the markup from the first matching node and keep the second
        # space-separated token of its text.
        # NOTE(review): encoding before split(' ') relies on Python 2 byte
        # strings; under Python 3 bytes.split(' ') raises TypeError.
        text = BeautifulSoup(first(xpath)).get_text().encode("utf-8")
        return text.split(' ')[1].strip()

    # (field name, zero-argument extractor), in the original assignment order.
    extractors = [
        ("kd", lambda: response.meta['kd']),
        ("title", lambda: self.get_text(
            sel, '//*[@id="job_detail"]/dt/h1/@title')),
        ("company", lambda: first(
            '//*[@id="container"]/div[2]/dl/dt/a/div/h2/text()').strip()),
        ("city", lambda: first(
            '//*[@id="job_detail"]/dd[1]/p[1]/span[2]/text()')),
        ("address", lambda: first(
            '//*[@id="container"]/div[2]/dl/dd/div[1]/text()')),
        ("industry", lambda: second_token(
            '//*[@id="container"]/div[2]/dl/dd/ul[1]/li[1]')),
        ("scale", lambda: second_token(
            '//*[@id="container"]/div[2]/dl/dd/ul[1]/li[2]')),
        ("phase", lambda: second_token(
            '//*[@id="container"]/div[2]/dl/dd/ul[2]/li')),
        ("salary", lambda: first(
            '//*[@id="job_detail"]/dd[1]/p[1]/span[1]/text()')),
        ("experience", lambda: first(
            '//*[@id="job_detail"]/dd[1]/p[1]/span[3]/text()')),
        ("education", lambda: first(
            '//*[@id="job_detail"]/dd[1]/p[1]/span[4]/text()')),
        ("description", lambda: self.get_text(
            sel, '//*[@id="job_detail"]/dd[2]')),
        ("url", lambda: response.url),
        # Drops the last 8 characters of the text node.
        # NOTE(review): presumably a fixed trailing label - confirm.
        ("published", lambda: first(
            '//*[@id="job_detail"]/dd[1]/p[3]/text()')[:-8]),
        ("tag", lambda: self.get_text(
            sel, '//*[@id="job_detail"]/dd[1]/p[2]/text()')),
    ]

    for field, extract in extractors:
        try:
            item[field] = extract()
        except Exception as e:
            # Best effort: report which field failed and on which page, then
            # carry on with the remaining fields. %r keeps the message safe
            # to print even when the error text is not ASCII.
            print("parse_detail: field %r failed for %s: %r"
                  % (field, response.url, e))

    yield item
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号