reptile.py 文件源码-python代码片段

reptile.py 文件源码

python

阅读 27 收藏 0 点赞 0 评论 0

def saveContentOfURL(target_url):
    """Download ``target_url`` and write the text of each ``<p>`` element out.

    The response body is gunzipped when the server labels it
    ``Content-Encoding: gzip``, decoded as GB2312 (bytes that cannot be
    decoded are dropped), and every ``<p>...</p>`` match is passed through
    ``cleanArticle`` before being handed to ``file_operator.writeFile``.

    URLs already present in ``searched_url`` are skipped. The URL is added to
    ``searched_url`` after the attempt whether or not it succeeded, so a
    failing URL is not fetched again.

    Args:
        target_url: Address of the page to download.

    Returns:
        None. Ordinary errors are printed instead of raised;
        ``KeyboardInterrupt`` and ``SystemExit`` propagate to the caller.
    """
    # Skip URLs that have already been attempted.
    if target_url in searched_url:
        return
    try:
        # The context manager closes the connection even if reading fails.
        with urllib.request.urlopen(target_url) as article_response:
            raw_data = article_response.read()
            content_encoding = article_response.getheader("Content-Encoding")
        # The server may send the body gzip-compressed; unpack it first.
        if content_encoding == "gzip":
            raw_data = gzip.decompress(raw_data)
        # Decode as GB2312; 'ignore' drops bytes that are not valid in that
        # encoding instead of aborting the whole page.
        article_data = raw_data.decode('gb2312', 'ignore')
        # Clean and write out the content of every <p></p> element.
        forEachMatch(pattern_str='<p>(.*?)</p>', to_match_str=article_data,
                     func=lambda match: file_operator.writeFile(cleanArticle(match.group(1))))
    except urllib.error.URLError:
        print(target_url, 'is a wrong url')
    except Exception as message:
        # Best-effort crawl: report and move on. Exception (not BaseException)
        # so that Ctrl-C and sys.exit() still stop the program.
        print(message)
    # Record the URL as visited, including after a failure.
    searched_url.add(target_url)


# Processes the content found inside <p></p> tags (original comment text was lost to mis-encoding).