def strip_tags(html, invalid_tags):
    """Flatten coreference-annotated markup into plain text.

    Processing steps:

    1. Parse ``html`` with BeautifulSoup's ``html.parser``.
    2. Replace every tag whose name is in ``invalid_tags`` with the text of
       its contents. Child tags are flattened by a recursive call, which
       (like every call) returns text with all markup removed.
    3. Walk the ``<coref>`` elements. The first element seen for a
       ``set-id`` records its text; a later element with the same ``set-id``
       is replaced by that recorded text when any of its lower-cased tokens
       is in ``pronouns``.
    4. Serialise the tree, collapse runs of tabs/newlines into one space,
       turn each ``</s><s>`` boundary into a newline and drop every
       remaining tag.

    Relies on names defined outside this function: ``BeautifulSoup``,
    ``NavigableString``, ``re``, ``nltk``, ``unicode`` and ``pronouns``
    (used through ``.intersection``, so presumably a set of pronoun
    strings -- confirm against its definition).

    Args:
        html: Markup to process.
        invalid_tags: Container of tag names to unwrap in step 2.

    Returns:
        The document text with all tags removed.

    Raises:
        KeyError: If a ``<coref>`` element has no ``set-id`` attribute.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Step 2: unwrap the unwanted tags. find_all() returns a snapshot list,
    # so replacing nodes while looping over it is safe.
    for tag in soup.find_all(True):
        if tag.name not in invalid_tags:
            continue
        pieces = []
        for child in tag.contents:
            if not isinstance(child, NavigableString):
                # Nested tag: flatten it to text through the same pipeline.
                child = strip_tags(unicode(child), invalid_tags)
            pieces.append(unicode(child))
        tag.replace_with("".join(pieces))

    # Step 3: substitute pronoun mentions with the first mention of their
    # coreference set. Keys of first_mention double as the "seen" set.
    first_mention = {}
    for coref in soup.find_all("coref"):
        set_id = coref["set-id"]
        if set_id not in first_mention:
            first_mention[set_id] = coref.get_text()
            continue
        tokens = nltk.word_tokenize(coref.get_text().lower())
        if pronouns.intersection(tokens):
            coref.replace_with(first_mention[set_id])

    # Step 4: serialise and reduce to plain text. Whitespace is collapsed
    # first so that adjacent sentence tags match the literal "</s><s>".
    text = re.sub(r"(\t|\r?\n)+", " ", str(soup))
    text = re.sub("</s><s>", "\n", text)
    return re.sub("<[^>]*>", "", text)
评论列表
文章目录