import urllib3
from bs4 import BeautifulSoup


def getWashPostText(url, token):
    # Takes the URL of an article and returns the article text
    # minus the crud - HTML, JavaScript, etc. - along with the page title.
    try:
        # PoolManager must be instantiated before issuing a request.
        http = urllib3.PoolManager()
        page = http.request('GET', url).data.decode('utf-8')
    except Exception:
        # If we are unable to download the URL, return None for both values.
        return (None, None)
    soup = BeautifulSoup(page, 'html.parser')
    if soup is None:
        return (None, None)
    # The error checks passed and the page was parsed successfully.
    text = ""
    if soup.find_all(token):
        # Search the page for the tokens that demarcate the article,
        # usually <article></article>. find_all returns a (possibly empty)
        # list, never None, so a truthiness check is used here.
        # Keep the markup on this first pass so the paragraph tags survive
        # for the second pass below.
        text = ''.join(map(str, soup.find_all(token)))
        soup2 = BeautifulSoup(text, 'html.parser')
        if soup2.find_all('p'):
            # Second pass: keep only the text inside the paragraph tags.
            text = ''.join(map(lambda p: p.text, soup2.find_all('p')))
    title = soup.title.text if soup.title is not None else None
    return text, title
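A minimal usage sketch follows, assuming the target pages wrap the story body in <article> elements as the comments above suggest; the URL below is a placeholder rather than a real article link.

article_url = 'https://www.washingtonpost.com/example-section/example-story.html'  # placeholder URL
body, title = getWashPostText(article_url, 'article')
if body is None:
    print('Could not download or parse the page')
else:
    print(title)
    print(body[:200])  # first 200 characters of the cleaned article text

The two-pass parse (first the article containers, then the <p> tags inside them) is what drops captions, scripts, and other markup nested alongside the paragraphs.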