def _html_download_is_complete(fname, tail_size=10):
    """
    Return True if the last `tail_size` bytes of `fname` contain </html>.

    The file is opened in binary mode and the seek offset is clamped at 0,
    so a file shorter than `tail_size` bytes (e.g. an empty file left by a
    dropped connection) is reported as incomplete instead of raising IOError.
    """
    with open(fname, 'rb') as f:
        # Find the file size, then read at most tail_size bytes from the end
        f.seek(0, 2)
        size = f.tell()
        f.seek(max(size - tail_size, 0))
        tail = f.read(tail_size)
    return b"</html>" in tail


def mirror_modis_dates_html(base_url, mirror_dir, use_wget=False):
    """
    Download all MODIS date listing pages to a local directory.
    Usually, a MODIS listing for a date should not change (only new dates
    should be added), so there should be no need to re-download.

    Pages whose <date>.html file already exists in `mirror_dir` are skipped.

    Args:
        base_url: passed to collect_all_dates_pages to obtain the
            (date, url) pairs to mirror
        mirror_dir: directory in which the <date>.html files are stored
            (created if needed)
        use_wget: if True, download with /usr/bin/wget instead of urllib

    Returns:
        True if at least one page was downloaded, False otherwise

    Raises:
        urllib.ContentTooShortError: if a freshly downloaded page does not
            end with </html>. The partial file is deleted first, so that the
            next run downloads it again instead of skipping it.
        subprocess.CalledProcessError: if wget exits with a non-zero status
    """
    ndownloads = 0
    dates_urls = collect_all_dates_pages(base_url)
    utils.mkdir_p(mirror_dir)
    for date, url in dates_urls:
        fname = os.path.join(mirror_dir, date + '.html')
        if os.path.exists(fname):
            continue

        # Same output as the former `print 'Downloading ', fname` statement
        print('Downloading  %s' % fname)
        if use_wget:
            # Argument list (no shell) so that characters such as '&', '?'
            # or spaces in the url / path are not interpreted by a shell
            subprocess.check_call(['/usr/bin/wget', url, '-O', fname])
        else:
            urllib.urlretrieve(url, fname)
        ndownloads += 1

        # The MODIS MOLT repository server doesn't return Content-Length
        # so urllib cannot tell if it downloaded the whole html or was
        # just disconnected, which could lead to incomplete HTML being
        # downloaded. So we check if the downloaded file ends with </html>
        if not _html_download_is_complete(fname):
            # Don't leave the truncated file behind : the os.path.exists
            # check above would otherwise skip it forever on later runs
            os.remove(fname)
            raise urllib.ContentTooShortError(
                "Couldn't find </html> in downloaded file, probably a partial download", ""
            )

        # Just avoid firing requests as fast as possible
        time.sleep(0.1)
    return ndownloads > 0
评论列表
文章目录