def process_item(self, item, spider):
if spider.name == 'RssCrawler':
# Search the CurrentVersion table for a version of the article
try:
self.cursor.execute(self.compare_versions, (item['url'],))
except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
pymysql.IntegrityError, TypeError) as error:
self.log.error("Something went wrong in rss query: %s", error)
# Save the result of the query. Must be done before the add,
# otherwise the result will be overwritten in the buffer
old_version = self.cursor.fetchone()
if old_version is not None:
# Compare the two download dates. index 3 of old_version
# corresponds to the download_date attribute in the DB
if (datetime.datetime.strptime(
item['download_date'], "%y-%m-%d %H:%M:%S") -
old_version[3]) \
< datetime.timedelta(hours=self.delta_time):
raise DropItem("Article in DB too recent. Not saving.")
return item
评论列表
文章目录