pipelines.py 文件源码

python
阅读 26 收藏 0 点赞 0 评论 0

项目:news-please 作者: fhamborg 项目源码 文件源码
def process_item(self, item, spider):
        """Drop RSS-crawled items whose stored version is still too recent.

        Only items produced by the spider named 'RssCrawler' are checked; any
        other item is returned untouched. For RSS items, the query in
        ``self.compare_versions`` is run with the item's URL and the stored
        download date (column index 3 of the returned row) is compared with
        the item's ``download_date``.

        :param item: mapping-like item; ``item['url']`` and
            ``item['download_date']`` are read.
        :param spider: the spider that produced the item; only ``spider.name``
            is inspected.
        :return: the unchanged ``item`` when it may continue down the pipeline.
        :raises DropItem: if the stored version was downloaded less than
            ``self.delta_time`` hours before this item.
        """
        if spider.name != 'RssCrawler':
            return item

        # Search the CurrentVersion table for a version of the article
        try:
            self.cursor.execute(self.compare_versions, (item['url'],))
        except (pymysql.err.OperationalError, pymysql.ProgrammingError, pymysql.InternalError,
                pymysql.IntegrityError, TypeError) as error:
            self.log.error("Something went wrong in rss query: %s", error)
            # The lookup failed, so the cursor holds no result set for this
            # item. Calling fetchone() now could raise or hand back a row left
            # over from an earlier query, so let the item pass unchecked.
            return item

        # Save the result of the query. Must be done before the add,
        #   otherwise the result will be overwritten in the buffer
        old_version = self.cursor.fetchone()
        if old_version is None:
            # No stored version to compare against.
            return item

        # NOTE(review): "%y" parses a two-digit year; confirm this matches the
        #   format in which download_date is produced upstream.
        downloaded_at = datetime.datetime.strptime(
            item['download_date'], "%y-%m-%d %H:%M:%S")

        # Compare the two download dates. index 3 of old_version
        #   corresponds to the download_date attribute in the DB
        age = downloaded_at - old_version[3]
        if age < datetime.timedelta(hours=self.delta_time):
            raise DropItem("Article in DB too recent. Not saving.")

        return item
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号