def extract_all(use_random_forest):
    """Collect labelled emails and extract field-matching named entities.

    Gathers `[label, text]` email pairs either from the random-forest
    classifier (`rf_model`) or from every local DB collection, skipping the
    'negatives_clean' label in both cases. For each email it runs NLTK
    NE-chunking, deduplicates multi-word entities, and scores each entity
    against every template field via WordNet path similarity over the words
    of `describe(entity)`.

    Parameters:
        use_random_forest: truthy -> source emails from `rf_model()`;
            falsy -> read all records from the local database.

    Returns:
        list of `[label, email_text, matches]`, where `matches` is a list of
        `{field_name: [[score, entity_text], ...]}` dicts, each match list
        sorted by descending score.

    NOTE(review): relies on module-level `features`, `describe`, `utils`,
    `nltk`, and `wn` (WordNet) — assumed imported elsewhere in this file.
    """
    if use_random_forest:
        emails = rf_model()
        emails = [email for email in emails if email[0] != 'negatives_clean']
    else:
        # BUG FIX: `emails` was never initialized on this branch, so the
        # append below raised NameError whenever use_random_forest was falsy.
        emails = []
        db = utils.get_local_db()
        for collection in db.collection_names():
            if collection != 'negatives_clean':
                for record in db.get_collection(collection).find():
                    emails.append([collection] + [record['Text']])
    # find features for each email
    email_data = []
    for email_set in emails:
        email = email_set[1]
        fields = features[email_set[0]]
        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []
        # concatenate multi-word entities (keep first occurrence only)
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])
        # use entities to fill in fields
        matches = []
        for field in fields:
            # hoisted: the field's synsets are invariant across the
            # entity/word inner loops below, so look them up once per field
            field_synsets = wn.synsets(field[1])
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        word_synsets = wn.synsets(word)
                        if field_synsets and word_synsets:
                            # first synset of each — coarse but cheap proxy
                            segment = field_synsets[0].path_similarity(
                                word_synsets[0])
                            # path_similarity may return None (no path)
                            if segment:
                                dist += segment
                # NOTE(review): 0.1 threshold looks empirically tuned —
                # confirm before changing
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        email_data.append([email_set[0], email, matches])
    return email_data