NewsArticleClass.py 文件源码-python代码片段

NewsArticleClass.py 文件源码

python

阅读 22 收藏 0 点赞 0 评论 0

项目：Python-Scripts-Repo-on-Data-Science 作者: qalhata 项目源码文件源码

def scrapeSource(url, magicFrag='2017',
                 scraperFunction=getNYTText, token='None'):
    """Fetch the page at ``url`` and iterate over every ``<a>`` tag in it.

    The page is downloaded with a ``urllib3.PoolManager`` and parsed with
    ``BeautifulSoup``. For each anchor, a non-empty ``body`` is stored in
    the ``urlBodies`` dict under ``url`` and ``url`` is printed.

    NOTE(review): this listing appears to be incomplete. In the code
    visible here:
      * ``magicFrag``, ``scraperFunction`` and ``token`` are never used;
      * ``body`` is never assigned inside the function;
      * there is no ``return`` statement, so ``urlBodies`` is not handed
        back to the caller.
    Presumably the original extracted each link's href, filtered it with
    ``magicFrag`` and called ``scraperFunction`` to obtain ``body`` --
    confirm against the upstream source before relying on this function.

    Args:
        url: Address of the page to download.
        magicFrag: Unused in the visible code; presumably a substring
            that article links must contain -- TODO confirm.
        scraperFunction: Unused in the visible code; presumably the
            per-site text extractor (defaults to ``getNYTText``) --
            TODO confirm.
        token: Unused in the visible code. Note the default is the
            string ``'None'``, not the ``None`` object.
    """
    urlBodies = {}
    # `requests` is only a local name for a urllib3 PoolManager; it is not
    # the third-party `requests` library.
    requests = urllib3.PoolManager()
    response = requests.request('GET', url)
    soup = BeautifulSoup(response.data)
    # The lines above download the page and build the BeautifulSoup tree.
    # Now we find links.
    # Links are always of the form <a href='url'> link-text </a>
    for a in soup.findAll('a'):
        try:
            # NOTE(review): the original comment here referred to a call
            # to the individual scraperFunction (for NYT & washPost) on
            # "the line above", but no such line is present in this
            # listing. Unless `body` is a module-level name defined
            # elsewhere, the test below raises NameError, which the bare
            # `except` swallows.
            if body and len(body) > 0:
                # `url` is still the function argument at this point; it
                # is not reassigned anywhere in the visible code.
                urlBodies[url] = body
                print(url)
        except:
            # Bare except: catches every exception raised in the try body.
            # The counter is reset to 0 before each increment, so it is
            # always 1 here, and it is never read in the visible code.
            numErrors = 0
            numErrors += 1