# assumes rf_categorize(), features, and describe() are defined elsewhere in the pipeline
import nltk
from nltk.corpus import wordnet as wn


def extract_one(email):
    # use the random forest classifier to find the email category
    category = rf_categorize(email)
    if category != 'negatives_clean':
        fields = features[category]
        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []
        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])
        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            # rank candidate entities for this field by accumulated similarity
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        # return categorized email with ranked field guesses (similarity scores)
        return [category, email, matches]
    # emails categorized as clean negatives are skipped (implicitly returns None)
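
A minimal usage sketch, assuming the rest of the pipeline (rf_categorize, features, and describe) is already in place; the sample emails below are hypothetical:

# hypothetical driver: run extraction over a couple of raw email bodies
sample_emails = [
    "Meeting with John Smith at Acme Corp in New York on Friday.",
    "Your invoice from Globex Corporation is ready for review.",
]

for raw in sample_emails:
    result = extract_one(raw)
    if result is None:
        continue  # email was categorized as a clean negative
    category, email, matches = result
    print(category)
    for field_guess in matches:
        print(field_guess)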