def downloadUrls(self, urls):
    """Fetch each relative URL and yield ``(relative_url, raw_bytes)`` pairs.

    Parameters
    ----------
    urls : iterable of str
        Paths appended to ``self.base_url`` to form the absolute URL.

    Yields
    ------
    tuple(str, bytes)
        The original relative URL and the raw (unprocessed) response body.

    Raises
    ------
    urllib.error.URLError
        Also covers ``HTTPError`` (its subclass). The failing URL is
        logged via ``log()`` before the exception is re-raised.
    """
    # HTTPError/URLError are declared in urllib.error; referencing them
    # through urllib.request (as the original did) works only via an
    # undocumented re-export.
    import urllib.error

    for u in urls:
        url = self.base_url + u
        request = urllib.request.Request(url)
        # The server's .htaccess checks for this header, and if it exists
        # returns unprocessed data.
        request.add_header('User-agent', 'our-web-crawler')
        try:
            # Context manager closes the response deterministically
            # (the original leaked the connection).
            with urllib.request.urlopen(request) as response:
                data = response.read()
        except urllib.error.URLError:
            # One handler covers both HTTPError and URLError; behavior
            # (log the offending URL, then propagate) is unchanged.
            log(url)
            raise
        yield (u, data)
# NOTE(review): removed two trailing page-scrape artifacts that were not
# code ("评论列表" = comment list, "文章目录" = article table of contents);
# as bare statements they would have raised NameError at import time.