def get_only_text_washingtonpost_url(url):
    """Download a Washington Post article and return its title and body text.

    The extraction relies on the page layout: the story is expected to sit
    inside ``<article>`` elements, with the body held in ``<p>`` tags.

    Args:
        url: Address of the article page to download.

    Returns:
        A ``(title, text)`` tuple. ``title`` is the text of the page's
        ``<title>`` element, or ``''`` when the page has none. ``text`` is
        the text of every ``<p>`` located inside an ``<article>`` element,
        joined with single spaces (``''`` when there are none).
    """
    # Function-scope import so the block runs on either interpreter line:
    # urlopen lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    response = urlopen(url)
    try:
        # The page is assumed to be UTF-8 encoded, as in the original code.
        page = response.read().decode('utf8')
    finally:
        # Release the connection even if reading or decoding fails.
        response.close()

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(page, 'html.parser')

    # Pick the paragraphs straight from the parsed tree. Flattening the
    # articles to plain text first and re-parsing that text would discard
    # the <p> tags this step is looking for. Filtering on the ancestor also
    # yields each paragraph once, even when <article> elements are nested.
    paragraphs = [p.text for p in soup.find_all('p')
                  if p.find_parent('article') is not None]
    text = ' '.join(paragraphs)

    # A page without a <title> element gives an empty title, not a crash.
    title = soup.title.text if soup.title is not None else ''
    return title, text
#######################################################################
# TEST
######################################################################
NewsAutosummarize.py 文件源码
python
阅读 33
收藏 0
点赞 0
评论 0
评论列表
文章目录