find_entities.py 文件源码

python
阅读 45 收藏 0 点赞 0 评论 0

项目:Medical_NER 作者: murhafh 项目源码 文件源码
def print_symptoms_from_page(url = '', model = '', stanford_jar = ''):
    """Print the distinct symptom phrases found in the text of a web page.

    The page text is obtained through ``HTMLReader(url).get_text_from_page()``,
    split into sentences and word tokens with NLTK, and each token list is
    labelled by ``NERTagger(model, stanford_jar, encoding='utf-8')``.
    Each maximal run of consecutive tokens labelled ``'SYMP'`` is joined with
    single spaces into one symptom phrase.  Phrases are de-duplicated and
    printed, one per line, after a header line giving the count.

    Args:
        url: Passed to ``HTMLReader``.
        model: First argument passed to ``NERTagger``.
        stanford_jar: Second argument passed to ``NERTagger``.

    Returns:
        None.  The results are written to standard output; the order of the
        printed phrases is the iteration order of a ``set`` and is therefore
        unspecified.
    """
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()

    st = NERTagger(model, stanford_jar, encoding='utf-8')
    for sentence in nltk.sent_tokenize(cleaned_text):
        tags = st.tag(nltk.word_tokenize(sentence))
        current = []  # tokens of the symptom span currently being collected
        for tag in tags:
            # Only 'SYMP' tokens extend a span.  Previously a span ran until
            # the next 'O' label, so an adjacent entity with a different
            # label was merged into the symptom text.
            if tag[1] == 'SYMP':
                current.append(tag[0])
            elif current:
                symptoms.add(' '.join(current))
                current = []
        # Flush a span that runs up to the end of the sentence.
        if current:
            symptoms.add(' '.join(current))

    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print("Found %d symptoms:" % len(symptoms))
    for symptom in symptoms:
        print(symptom)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号