def parse(self, response):
    """Build one ``Job`` item for every job listing found in *response*.

    Each ``div`` whose class attribute contains ``-job-item`` is treated as
    one listing. Single-valued fields are read with ``extract_first`` so a
    listing that lacks a node yields an empty string (``'n/a'`` for the
    salary) instead of raising ``IndexError`` and discarding every other
    item on the page.

    Args:
        response: The response object handed to ``Selector``.

    Returns:
        list: The populated ``Job`` items, in the order the listing
        selector returned them.
    """
    def first(node, query, default=''):
        # First match of `query` under `node`, or `default` when nothing matches.
        return node.xpath(query).extract_first(default=default)

    hxs = Selector(response)
    jobs = hxs.xpath('//div[contains(@class, "-job-item")]')
    items = []
    for job in jobs:
        item = Job()
        # Title and URL are stored exactly as extracted (no stripping).
        item["title"] = first(job, './/a[@class="job-link"]/text()')
        item["company"] = first(job, './/div[@class="-name"]/text()').strip()
        # \W+ removes every non-word character, including interior spaces
        # and punctuation, so the stored location is a compact token.
        item["location"] = re.sub(
            r'\W+', '',
            first(job, './/div[@class="-location"]/text()').strip(),
        )
        item["url"] = first(job, './/a[@class="job-link"]/@href')
        item["date_posted"] = first(
            job, './/p[contains(@class, "-posted-date")]/text()'
        ).strip()
        item["salary"] = first(
            job, './/span[@class="-salary"]/text()', default='n/a'
        ).strip()
        # Tags are multi-valued: keep the full (possibly empty) list.
        item["tags"] = job.css('.-tags p a.post-tag::text').extract()
        item["crawl_timestamp"] = datetime.now().strftime("%H:%M:%S %Y-%m-%d")
        item["job_board"] = "stackOverflow"
        items.append(item)
    return items
评论列表
文章目录