def scrapeSource(url, magicFrag='2017',
                 scraperFunction=getNYTText, token='None'):
    """Fetch the page at ``url`` and scrape the articles it links to.

    Every ``<a>`` tag on the page is inspected.  A link is scraped when it
    has an ``href``, has not already been stored, and (unless ``magicFrag``
    is ``None``) contains ``magicFrag`` as a substring.

    Args:
        url: Address of the index page whose links are harvested.
        magicFrag: Substring a link must contain to be scraped; ``None``
            disables the filter.
        scraperFunction: Callable that extracts the article body for a
            single link (site-specific, e.g. one per newspaper).
        token: Passed through unchanged to ``scraperFunction``.  Note the
            default is the string ``'None'``, not the ``None`` object.

    Returns:
        dict mapping each scraped link to the non-empty body returned by
        ``scraperFunction``.  Links whose scrape fails or yields an empty
        body are left out.
    """
    urlBodies = {}
    numErrors = 0

    # Set up the BeautifulSoup page.  The pool manager is deliberately not
    # called ``requests`` so it cannot be confused with that library.
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    soup = BeautifulSoup(response.data)

    # Links are always of the form <a href='url'> link-text </a>
    for a in soup.findAll('a'):
        link = a.get('href')
        # Anchors without an href (named anchors, JS hooks) carry no target.
        if not link or link in urlBodies:
            continue
        if magicFrag is not None and magicFrag not in link:
            continue

        try:
            # Site-specific scraper (NYT, Washington Post, ...).
            # NOTE(review): the (link, token) call signature is reconstructed
            # from the otherwise unused parameters -- confirm against the
            # scraper functions defined elsewhere in this file.
            body = scraperFunction(link, token)
        except Exception:
            # Best effort: one bad article must not abort the whole crawl,
            # but Ctrl-C / SystemExit are allowed to propagate.
            numErrors += 1
            continue

        if body:
            urlBodies[link] = body
        print(link)

    if numErrors:
        print('%d link(s) could not be scraped' % numErrors)
    return urlBodies
NewsArticleClass.py 文件源码
python
阅读 22
收藏 0
点赞 0
评论 0
评论列表
文章目录