def saveContentOfURL(target_url):
    """Fetch ``target_url`` and write the cleaned text of each ``<p>`` element.

    A URL already present in ``searched_url`` is skipped. Otherwise the page
    is downloaded, gunzipped when the response declares
    ``Content-Encoding: gzip``, decoded as gb2312 with undecodable bytes
    dropped, and every ``<p>(.*?)</p>`` match is passed through
    ``cleanArticle`` and then ``file_operator.writeFile``.

    Failures are printed instead of raised, and the URL is added to
    ``searched_url`` whether or not the fetch succeeded.

    Args:
        target_url: URL handed to ``urllib.request.urlopen``.

    Returns:
        None.
    """
    # Skip URLs that have already been processed.
    if target_url in searched_url:
        return
    try:
        # Issue the GET request; the ``with`` block guarantees the response
        # (and its underlying socket) is closed even if reading fails.
        with urllib.request.urlopen(target_url) as article_response:
            raw_data = article_response.read()
            content_encoding = article_response.getheader("Content-Encoding")
        # Decompress the body when the server sent it gzip-encoded.
        if content_encoding == "gzip":
            raw_data = gzip.decompress(raw_data)
        # Decode as gb2312; 'ignore' drops bytes that cannot be decoded
        # instead of raising UnicodeDecodeError.
        article_data = raw_data.decode('gb2312', 'ignore')
        # Clean the text captured inside each <p></p> pair and write it out.
        forEachMatch(
            pattern_str='<p>(.*?)</p>',
            to_match_str=article_data,
            func=lambda match: file_operator.writeFile(
                cleanArticle(match.group(1))),
        )
    except urllib.error.URLError:
        print(target_url, 'is a wrong url')
    except Exception as message:
        # Best-effort: report any other ordinary error and carry on.
        # ``Exception`` (not ``BaseException``) so that KeyboardInterrupt and
        # SystemExit still propagate and the program can be stopped.
        print(message)
    # Record the URL as processed so it is not fetched again.
    searched_url.add(target_url)
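# NOTE(review): original comment was mis-encoded; it presumably describes cleaning the text found inside <p></p> tags - confirm against the definition that follows.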
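# NOTE(review): the two lines below appear to be stray web-page text, not code.
# Comment list
# Article table of contents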