def print_symptoms_from_page(url='', model='', stanford_jar=''):
    """Fetch a web page, NER-tag its text, and print the unique symptom
    phrases found.

    Parameters:
        url: page to fetch (handed to HTMLReader).
        model: path to the Stanford NER model file.
        stanford_jar: path to the Stanford NER jar.

    Side effects: prints the number of distinct symptoms found, then one
    symptom phrase per line. Returns None.
    """
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()

    symptoms = set()
    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        # st.tag returns a list of (token, tag) pairs for the sentence.
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                # Collect the full multi-token symptom phrase: keep
                # consuming tokens while they are still tagged 'SYMP'.
                # (The original consumed any non-'O' tag, which would
                # wrongly merge an adjacent entity of a *different*
                # type into the symptom phrase.)
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] == 'SYMP':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1

    # Parenthesized single-argument print works under both Python 2 and
    # Python 3 (the original used the Python-2-only print statement).
    print("Found %d symptoms:" % len(symptoms))
    for symptom in symptoms:
        print(symptom)
# (removed non-code scraping artifact: "评论列表" / "文章目录" — blog
#  "comment list" / "article TOC" boilerplate that was pasted into the file
#  and would be a Python syntax error)