def get_html_text(url, timeout=10):
    """Download a page and return its body text as space-separated words.

    The page is fetched with ``requests``, ``<script>`` and ``<style>``
    elements are removed with regular expressions, the text nodes under
    ``<body>`` are collected with lxml, and the result is segmented with
    ``jieba``. Tokens that are empty, purely numeric, or listed in
    ``./utils/stopwords.txt`` are dropped.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Defaults to 10.

    Returns:
        The remaining tokens joined by single spaces, or an empty string
        when the page has no markup left after script/style removal.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Without a timeout a stalled server would block the caller forever.
    response = requests.get(url, timeout=timeout)

    # When the server declares no charset, requests decodes text/* bodies as
    # ISO-8859-1, which garbles non-Latin pages. Use the encoding detected
    # from the content instead.
    content_type = response.headers.get('Content-Type', '')
    if 'charset' not in content_type.lower():
        response.encoding = response.apparent_encoding

    origin_text = response.text
    # re.M is omitted: the patterns have no ^/$ anchors, so it had no effect.
    origin_text = re.sub(r'<script.*?>.*?</script>', '', origin_text,
                         flags=re.I | re.DOTALL)
    origin_text = re.sub(r'<style.*?>.*?</style>', '', origin_text,
                         flags=re.I | re.DOTALL)

    # lxml raises on an empty document, so bail out early.
    if not origin_text.strip():
        return ''

    doc = html.fromstring(origin_text)
    fragments = (node.strip() for node in doc.xpath('//body//text()'))
    text = ' '.join(fragment for fragment in fragments if fragment)

    # NOTE(review): assumes read_stopwords() returns an iterable of words;
    # a set makes each membership test O(1) - confirm against the helper.
    stopwords = set(read_stopwords('./utils/stopwords.txt'))

    words = (token.strip() for token in jieba.cut(text))
    return ' '.join(
        word for word in words
        if word and not word.isdigit() and word not in stopwords
    )
评论列表
文章目录