coref.py 文件源码

python
阅读 32 收藏 0 点赞 0 评论 0

项目:QuestionAnswerNLP 作者: debjyoti385 项目源码 文件源码
def strip_tags(html, invalid_tags):
    """Convert coref-annotated markup to plain text, substituting pronouns.

    Processing happens in three passes:

    1. Every tag whose name is in ``invalid_tags`` is replaced by the
       concatenation of its children; non-text children are first run
       through ``strip_tags`` recursively.
    2. ``<coref>`` tags are visited in document order. The text of the first
       tag seen for each ``set-id`` is remembered. A later tag with the same
       ``set-id`` is replaced by that remembered text when its lower-cased
       tokens share at least one entry with ``pronouns``.
    3. The tree is serialised, runs of tabs/newlines become a single space,
       each ``</s><s>`` boundary becomes a newline, and all remaining tags
       are removed.

    Relies on ``BeautifulSoup``, ``NavigableString``, ``nltk``, ``re`` and a
    set-like ``pronouns`` collection being available from the enclosing
    module (none of them are defined in this function).

    :param html: markup to clean.
    :param invalid_tags: container of tag names to unwrap in pass 1.
    :return: the cleaned text produced by ``str(soup)`` plus the regex
        substitutions of pass 3.
    :raises KeyError: if a ``<coref>`` tag has no ``set-id`` attribute.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Pass 1: unwrap unwanted tags, keeping their (recursively cleaned) text.
    for tag in soup.find_all(True):
        if tag.name not in invalid_tags:
            continue
        parts = []
        for child in tag.contents:
            if not isinstance(child, NavigableString):
                # NOTE(review): the recursive call returns str(soup); on
                # Python 2 that is presumably a UTF-8 byte string, so the
                # unicode() below may fail on non-ASCII text -- confirm.
                child = strip_tags(unicode(child), invalid_tags)
            parts.append(unicode(child))
        tag.replaceWith("".join(parts))

    # Pass 2: map each coreference set to the text of its first mention and
    # substitute that text for later mentions that contain a pronoun.
    set2text = {}
    for coref in soup.find_all("coref"):
        set_id = coref['set-id']
        if set_id not in set2text:
            set2text[set_id] = coref.get_text()
            continue
        tokens = nltk.word_tokenize(coref.get_text().lower())
        if pronouns.intersection(tokens):
            coref.replaceWith(set2text[set_id])

    # Pass 3: flatten whitespace, turn sentence boundaries into newlines,
    # then drop whatever markup is left.
    text = re.sub(r"(\t|\r?\n)+", " ", str(soup))
    text = re.sub("</s><s>", "\n", text)
    text = re.sub(r"<[^>]*>", "", text)
    return text
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号