def parse_job_list_page(self, response):
    """Parse a job feed and yield one follow-up request per new entry.

    The body of ``response`` is handed to ``feedparser``. For every feed
    entry, a ``Request`` to the entry's link is yielded with
    ``self.parse_job_page`` as callback; a ``JobItem`` pre-filled with the
    entry's title, description and publication datetime travels along in
    ``request.meta['item']``.

    Iteration ends for good (plain ``return``) at the first entry whose
    publication time is not later than ``self._last_job_date``; the
    comparison is logged under ``ACTION_MARKER_FOUND`` before stopping.
    This presumably relies on the feed being ordered newest-first --
    TODO confirm against the feeds actually crawled.
    """
    self.get_connector().log(self.name, self.ACTION_CRAWL_LIST, response.url)

    feed = feedparser.parse(response.body)
    for entry in feed.entries:
        detail_url = entry.link

        # NOTE(review): feedparser documents ``published_parsed`` as a UTC
        # struct_time, while ``mktime`` interprets its argument as local
        # time -- confirm that ``_last_job_date`` follows the same
        # convention.
        published_struct = entry.published_parsed
        published_at = datetime.fromtimestamp(mktime(published_struct))

        # Both sides are reduced to epoch seconds so the comparison does
        # not depend on the concrete date/datetime type of the marker.
        entry_epoch = mktime(published_at.timetuple())
        marker_epoch = mktime(self._last_job_date.timetuple())

        if marker_epoch >= entry_epoch:
            # Reached an entry that is not newer than the last known job:
            # record the comparison and stop crawling this list.
            comparison = "%s <= %s" % (entry_epoch, marker_epoch)
            self.get_connector().log(
                self.name,
                self.ACTION_MARKER_FOUND,
                comparison,
            )
            return

        item = JobItem()
        detail_request = Request(detail_url, self.parse_job_page)
        # The item rides along with the request so that the detail-page
        # callback can pick it up from ``response.meta``.
        detail_request.meta['item'] = item

        item['title'] = entry.title
        item['description'] = entry.description
        item['publication_datetime'] = published_at

        yield detail_request
评论列表
文章目录