def extract_all(use_random_forest):
    """Collect labelled emails and extract field-matching named entities.

    Gathers `[label, text]` email pairs either from the random-forest
    classifier (`rf_model`) or from every local DB collection, skipping the
    'negatives_clean' label in both cases. For each email it runs NLTK
    NE-chunking, deduplicates multi-word entities, and scores each entity
    against every template field via WordNet path similarity over the words
    of `describe(entity)`.

    Parameters:
        use_random_forest: truthy -> source emails from `rf_model()`;
            falsy -> read all records from the local database.

    Returns:
        list of `[label, email_text, matches]`, where `matches` is a list of
        `{field_name: [[score, entity_text], ...]}` dicts, each match list
        sorted by descending score.

    NOTE(review): relies on module-level `features`, `describe`, `utils`,
    `nltk`, and `wn` (WordNet) — assumed imported elsewhere in this file.
    """
    if use_random_forest:
        emails = rf_model()
        emails = [email for email in emails if email[0] != 'negatives_clean']
    else:
        # BUG FIX: `emails` was never initialized on this branch, so the
        # append below raised NameError whenever use_random_forest was falsy.
        emails = []
        db = utils.get_local_db()
        for collection in db.collection_names():
            if collection != 'negatives_clean':
                for record in db.get_collection(collection).find():
                    emails.append([collection] + [record['Text']])
    # find features for each email
    email_data = []
    for email_set in emails:
        email = email_set[1]
        fields = features[email_set[0]]
        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []
        # concatenate multi-word entities (keep first occurrence only)
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])
        # use entities to fill in fields
        matches = []
        for field in fields:
            # hoisted: the field's synsets are invariant across the
            # entity/word inner loops below, so look them up once per field
            field_synsets = wn.synsets(field[1])
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        word_synsets = wn.synsets(word)
                        if field_synsets and word_synsets:
                            # first synset of each — coarse but cheap proxy
                            segment = field_synsets[0].path_similarity(
                                word_synsets[0])
                            # path_similarity may return None (no path)
                            if segment:
                                dist += segment
                # NOTE(review): 0.1 threshold looks empirically tuned —
                # confirm before changing
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        email_data.append([email_set[0], email, matches])
    return email_data