Example source code for Python's ne_chunk()
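nltk.ne_chunk() takes a POS-tagged sentence and returns an nltk.Tree in which named entities appear as labelled subtrees (PERSON, ORGANIZATION, GPE, and so on). A minimal sketch of a typical call, assuming the NLTK data packages punkt, averaged_perceptron_tagger, maxent_ne_chunker, and words have already been downloaded:

import nltk

text = "Mark Zuckerberg founded Facebook in California."
tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))

# Entity subtrees carry labels such as PERSON, ORGANIZATION, or GPE;
# everything else stays a plain (word, tag) tuple under the root 'S' node.
for subtree in tree.subtrees(lambda t: t.label() != 'S'):
    print(subtree.label(), ' '.join(word for word, tag in subtree.leaves()))

The snippets below all build on this pattern.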

main.py (project: That-s-Fake, author: rajeevdesai)
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def ne_tagging(text):
    # Chunk the POS-tagged tokens; named entities come back as Tree subtrees.
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            # Inside an entity subtree: collect its tokens.
            current_chunk.append(" ".join(token for token, pos in i.leaves()))
        elif current_chunk:
            # A non-entity token ends the current run of entity chunks.
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
    # Flush an entity that ends the text.
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
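A usage sketch (the exact entities returned depend on the installed NLTK chunker model):

# Hypothetical call illustrating the expected shape of the result.
print(ne_tagging("Barack Obama met Angela Merkel in Berlin."))
# Typically something like: ['Barack Obama', 'Angela Merkel', 'Berlin']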
_names.py (project: memex-dossier-open, author: dossier)
def process(self, fc, context=None):
        text_source = self.config.get('text_source')
        if text_source and text_source in fc:
            text = fc[text_source]
        else:
            return fc
        names = defaultdict(StringCounter)
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label'):
                    label = chunk.label()
                    name = ' '.join(c[0] for c in chunk.leaves())
                    if not isinstance(name, unicode):
                        name = unicode(name, 'utf-8')
                    name = cleanse(name)
                    #print chunk.node, name
                    names[label][name] += 1
        for entity_type, name_counts in names.items():
            fc[entity_type] = name_counts
        return fc
RussianTextPreprocessing.py (project: keras-textgen, author: kenoma)
def get_continuous_chunks(self, text):
    # Group adjacent named-entity subtrees into continuous chunks.
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == nltk.Tree:
            current_chunk.append(" ".join(token for token, pos in i.leaves()))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
    # Flush an entity that ends the text.
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
generate_neighbor_pos.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def get_pos_tag(qind):
    q = index_q[qind]
    wl = str(q).lower().split()
    pos_l = nltk.pos_tag(wl)
    q1_pos = []
    for pos in pos_l:
        q1_pos.append(pos[1])
    return q1_pos

# def get_ner_tag(qind):
#     q = index_q[qind]
#     wl = str(q).lower().split()
#     ner_l = nltk.ne_chunk(wl)
#     q1_ner = []
#     for pos in ner_l:
#         q1_ner.append(pos[1])
#     return q1_ner
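The commented-out get_ner_tag above would fail: nltk.ne_chunk expects POS-tagged (word, tag) tuples, not a bare word list, and lowercasing also hurts the chunker, which relies on capitalization. A hedged sketch of a working variant, assuming index_q is the same question lookup used by get_pos_tag:

def get_ner_tag(qind):
    # Hypothetical corrected variant: tag before chunking, keep the original casing.
    q = index_q[qind]
    wl = nltk.word_tokenize(str(q))
    q1_ner = []
    for node in nltk.ne_chunk(nltk.pos_tag(wl)):
        if isinstance(node, nltk.Tree):
            q1_ner.append(node.label())
    return q1_ner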
qa.py (project: NLP_question_answering_system_project, author: Roshrini)
def whereRules(sentenceOriginal):
    score = 0
    sentence = sentenceOriginal.lower()

    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #         if type(chunk) is nltk.tree.Tree:
    #             if 'LOCATION' in chunk.label() or 'GPE' in chunk.label():
    #                 score += 10

    # RULE 2
    for word in LOCPREP:
        if word in sentence:
            score += 4

    # RULE 3
    for word in LOCATION:
        if word in sentence:
            score += 6

    return score

# WHEN RULES
newsname-match.py (project: newsname-match, author: bahadasx)
def performNameExtraction(text):
    # Returns a list of what NLTK defines as persons after processing the text passed into it.
    entity_names = []
    try:
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label') and chunk.label() == 'PERSON':
                    name_value = ' '.join(child[0] for child in chunk.leaves())
                    if name_value not in entity_names:
                        entity_names.append(name_value)
    except:
        print "Unexpected error:", sys.exc_info()[0]
    return entity_names
relextract.py (project: Price-Comparator, author: Thejas-1). The identical ne_chunked() demo also appears in relextract.py of the projects PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (SignalMedia), neighborhood_mood_aws (jarrellmark), hate-to-hugs (sdoran35), FancyWord (EastonLee), beepboop (nicolehe), kind2anki (prz3m), and but_sentiment (MixedEmotions).
def ne_chunked():
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
language_parser.py (project: cvscan, author: skcript)
def fetch_name(resume_text):
  tokenized_sentences = nltk.sent_tokenize(resume_text)
  for sentence in tokenized_sentences:
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')):
      if hasattr(chunk, 'label'):# and chunk.label() == 'PERSON':
        chunk = chunk[0]
      (name, tag) = chunk
      if tag == 'NOUN':
        return name

  return "Applicant name couldn't be processed"
ner_similarity.py (project: QuestionAnswerNLP, author: debjyoti385)
def extract_entities(text):
    result=dict()
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
        # chunk.draw()
        if(isinstance(chunk, nltk.tree.Tree)):
            for subtree in chunk.subtrees(filter=lambda t: (t.label() == 'PERSON' or t.label() == 'GPE' or t.label() == 'LOCATION')):
                for leave in subtree.leaves():
                    if leave[0].lower() not in irrelevant_loc_words:
                        result[leave[0].lower()]=subtree.label()
    # print result
    return result
language_processor.py (project: chitti, author: bhuvi8)
def find_named_entities(sent):
    tree = nltk.ne_chunk(sent)
    for st in tree.subtrees():
        if st.label() != 'S':
            logger.debug(st)
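find_named_entities expects an already POS-tagged sentence; a hypothetical call (logger configuration not shown):

tagged = nltk.pos_tag(nltk.word_tokenize("Google hired John Smith in London."))
find_named_entities(tagged)  # logs entity subtrees such as (ORGANIZATION Google/NNP) at DEBUG level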
entities.py (project: feature_engineering, author: webeng)
def extract(self, text, entity_description=False):
        # We need to clean the text in each method otherwise when we present it
        # to the user, it will have a different format
        text = self.remove_return_lines_and_quotes(text)
        sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]

        # This function is quite expensive
        sentences = [nltk.pos_tag(sent) for sent in sentences]

        entities_all = {} if entity_description else []

        #stop = stopwords.words('english')
        # more_stop_words = ['(' , ')', "'s" , ',', ':' , '<' , '>' , '.' , '-' , '&' ,'*','...' , 'therefore' , '.vs','hence']
        # stop = stopwords.words('english')
        # stop = stop + more_stop_words
        stop = ["a", "able", "about", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ah", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "auth", "available", "away", "awfully", "b", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly", "but", "by", "c", "ca", "came", "can", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "could", "couldnt", "d", "date", "did", "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "few", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "for", "former", "formerly", "forth", "found", "four", "from", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "had", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "how", "howbeit", "however", "hundred", "i", "id", "ie", "if", "i'll", "im", "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index", "information", "instead", "into", "invention", "inward", "is", "isn't", "it", "itd", "it'll", "its", "itself", "i've", "j", "just", "k", "keep keeps",
                "kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "mug", "must", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "ones", "only", "onto", "or", "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure"]

        for s in sentences:
            chunked = nltk.ne_chunk(s, binary=True)
            for n in chunked:
                if isinstance(n, nltk.tree.Tree):
                    if n.label() == 'NE':
                        entities_all = self.getEntity(n, stop, entities_all, entity_description)

        if entity_description:
            return entities_all
        else:
            return list(set(entities_all))
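Note that this extractor calls ne_chunk with binary=True, which collapses every entity type into the single label 'NE'; that is the label the loop above matches on. A small sketch of the difference:

tagged = nltk.pos_tag(nltk.word_tokenize("Apple hired Tim Cook in Cupertino."))
for node in nltk.ne_chunk(tagged, binary=True):
    if isinstance(node, nltk.tree.Tree):
        print(node.label(), node.leaves())   # every entity subtree is labelled 'NE'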
db_names.py (project: repeat-aft, author: ripeta). The identical extract_org() function also appears in state_data_sources.py of the same project.
def extract_org(sent):
    pos = pos_tag(nltk.tokenize.word_tokenize(sent))
    sentt = nltk.ne_chunk(pos, binary=False)
    org = []
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'GPE' or t.label() == 'ORGANIZATION'):
        for leave in subtree.leaves():
            org.append(leave)
    return org
PhraseMaker.py (project: Scaffold, author: christina-hammer)
def create_phrase(self, phrase_str): 

        tokenized_phrase = nltk.word_tokenize(phrase_str)
        tagged_phrase = nltk.pos_tag(tokenized_phrase)

        ne_chunk_tree = nltk.ne_chunk(tagged_phrase)
        #if (line_num in bluh):
            #print(str(line_num)+". "+str(ne_chunk_tree))

        merge_tokens = self._find_multi_token_nnp(ne_chunk_tree) 

        ne_chunk_list = self._merge_tokens_and_flatten(ne_chunk_tree, merge_tokens)        

        #if (line_num in bluh):
            #print(str(line_num)+". "+str(ne_chunk_list))        

        tokens = [] #list of tagged tuples
        for token in ne_chunk_list:
            if type(token) is nltk.tree.Tree:            
                tokens.append(self._tree_to_tuple(token))
            else:
                if (token[0] in self._keywords):                
                    token = (token[0], self._keywords[token[0]])
                tokens.append(token)

        #if (line_num in bluh):
            #print(str(line_num)+". "+str(tokens))  

        phrase = Phrase(tokens)    
        return phrase 

    #input: "ne_chunk_tree" - nltk tree of tuples and/or trees containing nltk tokens, "merge_tokens" - a list of int tuples
    #output: list of tuples/trees containing nltk tokens
    #purpose: merge tokens in ne_chunk_tree using index ranges listed in the merge_tokens input argument. Flatten ne_chunk_tree from an nltk tree to a list
language_parser.py (project: cvscan, author: skcript)
def fetch_all_organizations(resume_text):
  organizations = set()
  tokenized_sentences = nltk.sent_tokenize(resume_text)

  # Custom grammar with NLTK
  # NP - Noun Phrase
  # NN - Noun
  # NNP - Proper Noun
  # V - Verb
  # JJ - Adjective

  # In a sentence that contains NN NNP V NN NN JJ NN.
  # The noun-phrases fetched are:
  # NP: NN NNP
  # NP: NN NN
  # NP: NN

  # Ex, "Application Developer at Delta Force"
  # => ["Application Developer", "Delta Force"]

  grammar = r"""NP: {<NN|NNP>+}"""
  parser = nltk.RegexpParser(grammar)

  avoid_organizations = utilities.get_avoid_organizations()

  for sentence in tokenized_sentences:

    # tags all parts of speech in the tokenized sentences
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

    # then chunks with customize grammar
    # np_chunks are instances of class nltk.tree.Tree
    np_chunks = parser.parse(tagged_words)
    noun_phrases = []

    for np_chunk in np_chunks:
      if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
        # if np_chunk is of grammar 'NP' then create a space separated string of all leaves under the 'NP' tree
        noun_phrase = ""
        for (org, tag) in np_chunk.leaves():
          noun_phrase += org + ' '

        noun_phrases.append(noun_phrase.rstrip())

    # Using name entity chunker to get all the organizations
    chunks = nltk.ne_chunk(tagged_words)
    for chunk in chunks:
      if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
        (organization, tag) = chunk[0]

        # if organization is in the noun_phrase, it means that there is a high chance of noun_phrase containing the employer name
        # eg, Delta Force is added to organizations even if only Delta is recognized as an organization but Delta Force is a noun-phrase
        for noun_phrase in noun_phrases:
          if organization in noun_phrase and organization not in avoid_organizations:
            organizations.add(noun_phrase.capitalize())

  return organizations
extractor.py (project: MLAB_Intuit, author: rykard95)
def extract_all(use_random_forest):
    if use_random_forest:
        emails = rf_model()
        emails = [email for email in emails if email[0] != 'negatives_clean']
    else:
        emails = []  # collect (category, text) pairs from the local database
        db = utils.get_local_db()
        for collection in db.collection_names():
            if collection != 'negatives_clean':
                for record in db.get_collection(collection).find():
                    emails.append([collection] + [record['Text']])

    # find features for each email
    email_data = []
    for email_set in emails:
        email = email_set[1]
        fields = features[email_set[0]]

        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email =  nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []

        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])

        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        email_data.append([email_set[0], email, matches])
    return email_data
extractor.py (project: MLAB_Intuit, author: rykard95)
def extract_one(email):
    # use random-forest to find email category
    category = rf_categorize(email)
    if category != 'negatives_clean':
        fields = features[category]

        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email =  nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []

        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])

        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})

        # return categorized email with field guess probablities
        return [category, email, matches]
qa.py (project: NLP_question_answering_system_project, author: Roshrini)
def whoRules(question, sentenceOriginal):
    score = 0
    hasNameQuestion = False
    hasNameSentence = False
    hasnameSentence = False
    hasHumanSentence = False
    sentence = sentenceOriginal.lower()

    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #         if type(chunk) is nltk.tree.Tree:
    #             if 'PERSON' in chunk.label() or 'ORGANIZATION' in chunk.label():
    #                 score += 10

    for item in question:
        if item in NAME:
            hasNameQuestion = True
            #break

        if item in HUMAN and item in sentence:
            score += 10

    for item in sentence.split():  # iterate over words of the sentence
        if item in NAME:
            hasNameSentence = True
        if 'name' in item:
            hasnameSentence = True
        if item in HUMAN:
            hasHumanSentence = True

    # RULE 2
    if not hasNameQuestion and hasNameSentence:
        score += 6

    # RULE 3
    if not hasNameQuestion and hasnameSentence:
        score += 4

    # RULE 4
    if hasNameSentence or hasHumanSentence:
        score += 4

    return score


# WHAT RULES

