import urllib3
from bs4 import BeautifulSoup


def getWashPostText(url, token):
    # Takes the URL of an article and returns the article text
    # minus the crud - HTML, JavaScript, etc. - along with the page title.
    try:
        # PoolManager must be instantiated before issuing a request.
        http = urllib3.PoolManager()
        page = http.request('GET', url).data.decode('utf-8')
    except Exception:
        # If we are unable to download the URL, return None for both values.
        return (None, None)
    soup = BeautifulSoup(page, 'html.parser')
    if soup is None:
        return (None, None)
    # The error checks passed and the page was parsed successfully.
    text = ""
    if soup.find_all(token):
        # Search the page for the tokens that demarcate the article,
        # usually <article></article>. find_all returns a (possibly empty)
        # list, never None, so a truthiness check is used here.
        # Keep the markup on this first pass so the paragraph tags survive
        # for the second pass below.
        text = ''.join(map(str, soup.find_all(token)))
        soup2 = BeautifulSoup(text, 'html.parser')
        if soup2.find_all('p'):
            # Second pass: keep only the text inside the paragraph tags.
            text = ''.join(map(lambda p: p.text, soup2.find_all('p')))
    title = soup.title.text if soup.title is not None else None
    return text, title
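A minimal usage sketch follows, assuming the target pages wrap the story body in <article> elements as the comments above suggest; the URL below is a placeholder rather than a real article link.

article_url = 'https://www.washingtonpost.com/example-section/example-story.html'  # placeholder URL
body, title = getWashPostText(article_url, 'article')
if body is None:
    print('Could not download or parse the page')
else:
    print(title)
    print(body[:200])  # first 200 characters of the cleaned article text

The two-pass parse (first the article containers, then the <p> tags inside them) is what drops captions, scripts, and other markup nested alongside the paragraphs.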