pipelines.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:mailingListScraper 作者: gaalcaras 项目源码 文件源码
def process_item(self, item, spider):
    """Turn the raw time fields of a scraped item into formatted timestamps.

    For each raw field (``timeSent``, ``timeReceived``) the matching
    timestamp field (``timestampSent``, ``timestampReceived``) is set to the
    parsed time rendered as ``%Y-%m-%d %H:%M:%S%z``.

    * An empty (``""``) or ``None`` raw value sets both the raw field and
      the timestamp field to ``"NA"``.
    * A raw value that cannot be parsed, even after trimming it down to
      the part ending in ``HH:MM:SS`` plus an optional offset, leaves the
      raw field untouched, sets the timestamp field to ``"NA"`` and logs a
      warning that includes ``item['url']``.
    * A parsed time without zone information gets a default zone chosen
      from ``spider.name``.

    Args:
        item: Mapping-like scraped item; read for ``timeSent``,
            ``timeReceived`` and (on parse failure) ``url``.
        spider: The running spider; only its ``name`` is used.

    Returns:
        The same ``item`` object, modified in place.
    """
    times = {
        'timestampSent': 'timeSent',
        'timestampReceived': 'timeReceived',
    }

    time_format = "%Y-%m-%d %H:%M:%S%z"

    # Default zone per spider, according to the email server setting,
    # stored as (label, offset in seconds): -18000 s is UTC-5 and
    # -14400 s is UTC-4.
    default_offsets = {
        'hypermail': ('EST', -18000),
        'marc': ('EDT', -14400),
    }

    # "... HH:MM:SS +0200"
    # NOTE(review): the class [+,-] also accepts a literal comma as the
    # sign character - confirm whether that is intended.
    pattern = r'(.* \d{2}:\d{2}:\d{2}(\s?[+,-]\d{4})?)'

    # Built on first use, so items that never need it never touch it.
    def_tz = None

    for key, val in times.items():
        raw = item[val]

        if raw is None or raw == "":
            item[val] = "NA"
            item[key] = "NA"
            continue

        try:
            parsed_time = dateParser(raw)
        except (ValueError, OverflowError):
            # Retry on the leading part of the string that ends with the
            # clock time (and optional numeric offset), dropping whatever
            # trailing text confused the parser.
            parsed_time = None
            simpler = re.search(pattern, raw)
            if simpler is not None:
                try:
                    parsed_time = dateParser(simpler.group(1))
                except (ValueError, OverflowError):
                    # The trimmed string is still not a date; fall
                    # through to the "NA" handling below instead of
                    # letting the exception abort the pipeline.
                    parsed_time = None

            if parsed_time is None:
                LOGGER.warning(
                    '<%s> ParseTimeFields could not parse %s, '
                    '%s will be NA.',
                    item['url'], val, key,
                )
                item[key] = "NA"
                continue

        if parsed_time.tzinfo is None:
            if def_tz is None:
                offset = default_offsets.get(spider.name)
                if offset is not None:
                    def_tz = tz.tzoffset(*offset)
                else:
                    # Unknown spider: there is no server setting to rely
                    # on, so fall back to UTC rather than failing on an
                    # unbound default zone.
                    def_tz = tz.tzutc()
            parsed_time = parsed_time.replace(tzinfo=def_tz)

        item[key] = parsed_time.strftime(time_format)

    return item
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号