def process_item(self, item, spider):
    """Turn the raw time fields of ``item`` into normalised timestamps.

    For every raw field (``timeSent``, ``timeReceived``) the matching
    ``timestampSent`` / ``timestampReceived`` field is filled with the
    parsed time rendered as ``%Y-%m-%d %H:%M:%S%z``.

    * An empty raw field sets both the raw field and its timestamp field
      to ``"NA"``.
    * A raw field that cannot be parsed, even after trimming it down to
      the part ending in ``HH:MM:SS [+-]ZZZZ``, gets a timestamp of
      ``"NA"`` and a warning is logged; the raw field is left untouched.
    * A parsed time without zone information receives the default zone
      of the spider. For a spider without a known default the time is
      left naive, so ``%z`` renders as an empty string.

    Args:
        item: Mapping-like object holding ``timeSent`` and
            ``timeReceived``; ``url`` is read only when a warning is
            logged.
        spider: Object whose ``name`` selects the default time zone.

    Returns:
        The same ``item`` object, modified in place.
    """
    times = {
        'timestampSent': 'timeSent',
        'timestampReceived': 'timeReceived',
    }
    time_format = "%Y-%m-%d %H:%M:%S%z"
    # Fallback pattern: keep everything up to "... HH:MM:SS +0200".
    # NOTE(review): the sign class also accepts ',' - confirm whether that
    # is intended or whether it should be [+-].
    pattern = r'(.* \d{2}:\d{2}:\d{2}(\s?[+,-]\d{4})?)'

    # Define a default time zone according to the email server setting.
    # Offsets are presumably seconds east of UTC (-18000 s = -5 h,
    # -14400 s = -4 h); verify against the ``tz`` module in use.
    if spider.name == 'hypermail':
        def_tz = tz.tzoffset('EST', -18000)
    elif spider.name == 'marc':
        def_tz = tz.tzoffset('EDT', -14400)
    else:
        # Unknown spider: there is no server setting to fall back on, so
        # naive times stay naive instead of failing on an unbound name.
        def_tz = None

    for key, val in times.items():
        if item[val] == "":
            item[val] = "NA"
            item[key] = "NA"
            continue
        try:
            parsed_time = dateParser(item[val])
        except ValueError:
            simpler = re.search(pattern, item[val])
            try:
                parsed_time = dateParser(simpler.group(1))
            except (AttributeError, ValueError):
                # AttributeError: the pattern did not match (simpler is
                # None). ValueError: it matched, but the trimmed text is
                # still not a parseable date.
                LOGGER.warning(
                    '<%s> ParseTimeFields could not parse %s, %s will be NA.',
                    item['url'], val, key,
                )
                item[key] = "NA"
                continue
        if parsed_time.tzinfo is None and def_tz is not None:
            parsed_time = parsed_time.replace(tzinfo=def_tz)
        item[key] = parsed_time.strftime(time_format)
    return item
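# 评论列表 (comment list) - web-page residue, not part of the code
# 文章目录 (table of contents) - web-page residue, not part of the code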