# assumes rf_categorize(), features, and describe() are defined elsewhere in the pipeline
import nltk
from nltk.corpus import wordnet as wn


def extract_one(email):
    # use the random forest classifier to find the email category
    category = rf_categorize(email)
    if category != 'negatives_clean':
        fields = features[category]
        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []
        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])
        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            # rank candidate entities for this field by accumulated similarity
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        # return categorized email with ranked field guesses (similarity scores)
        return [category, email, matches]
    # emails categorized as clean negatives are skipped (implicitly returns None)
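
A minimal usage sketch, assuming the rest of the pipeline (rf_categorize, features, and describe) is already in place; the sample emails below are hypothetical:

# hypothetical driver: run extraction over a couple of raw email bodies
sample_emails = [
    "Meeting with John Smith at Acme Corp in New York on Friday.",
    "Your invoice from Globex Corporation is ready for review.",
]

for raw in sample_emails:
    result = extract_one(raw)
    if result is None:
        continue  # email was categorized as a clean negative
    category, email, matches = result
    print(category)
    for field_guess in matches:
        print(field_guess)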