def list_archive_timestamps(url, min_date, max_date, user_agent):
    """
    List the available archives between min_date and max_date for the given URL.

    The CDX endpoint built from WEB_ARCHIVE_CDX_TEMPLATE is queried with JSON
    output, and only the captures whose status code is '200' are kept.

    :param url: URL whose archives are looked up
    :param min_date: start of the range; rendered with
        WEB_ARCHIVE_TIMESTAMP_FORMAT through its strftime method
    :param max_date: end of the range; rendered the same way as min_date
    :param user_agent: value sent in the User-Agent request header
    :return: list of datetime objects parsed from the capture timestamps,
        in the order the server returned them (empty when nothing matched)
    """
    logger.info('Listing the archives for the url {url}'.format(url=url))
    # Construct the URL used to download the memento list
    parameters = {
        'url': url,
        'output': 'json',
        'from': min_date.strftime(WEB_ARCHIVE_TIMESTAMP_FORMAT),
        'to': max_date.strftime(WEB_ARCHIVE_TIMESTAMP_FORMAT),
    }
    cdx_url = WEB_ARCHIVE_CDX_TEMPLATE.format(params=urlencode(parameters))
    req = Request(cdx_url, None, {'User-Agent': user_agent})
    with urlopen(req) as cdx:
        memento_json = cdx.read().decode("utf-8")

    # A blank body cannot be parsed as JSON; treat it as "no archives"
    # instead of letting json.loads raise.
    if not memento_json.strip():
        return []
    rows = json.loads(memento_json)
    if not rows:
        return []

    # The first row contains the column names; use it to locate the two
    # columns that are actually needed rather than assuming a fixed layout.
    header, captures = rows[0], rows[1:]
    try:
        timestamp_idx = header.index('timestamp')
        status_idx = header.index('statuscode')
    except ValueError:
        # Header does not carry the expected names: fall back to the
        # positions the seven-column layout uses.
        timestamp_idx, status_idx = 1, 4

    # Ignore archives with a status code != OK
    return [
        datetime.strptime(capture[timestamp_idx], WEB_ARCHIVE_TIMESTAMP_FORMAT)
        for capture in captures
        if capture[status_idx] == '200'
    ]
评论列表
文章目录