# Requires: from nltk import ne_chunk, pos_tag, word_tokenize; from nltk.tree import Tree
def ne_tagging(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == Tree:
            # collect the tokens of a named-entity subtree
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    if current_chunk:
        # flush an entity that ends the text
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
Example source code for Python's ne_chunk()
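Every snippet below follows the same pipeline: tokenize, POS-tag, then chunk with nltk.ne_chunk(). A minimal sketch, assuming the standard NLTK data packages have been downloaded:

import nltk

# One-time downloads used by word_tokenize, pos_tag and ne_chunk:
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

text = "Mark Zuckerberg founded Facebook in Cambridge."
tree = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
for node in tree:
    # Named entities come back as nltk.Tree subtrees labelled PERSON, ORGANIZATION, GPE, ...
    if isinstance(node, nltk.Tree):
        print(node.label(), " ".join(token for token, pos in node.leaves()))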
def process(self, fc, context=None):
    # Python 2 snippet; StringCounter, cleanse and the fc feature collection come from the surrounding project.
    text_source = self.config.get('text_source')
    if text_source and text_source in fc:
        text = fc[text_source]
    else:
        return fc
    names = defaultdict(StringCounter)
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                label = chunk.label()
                name = ' '.join(c[0] for c in chunk.leaves())
                if not isinstance(name, unicode):
                    name = unicode(name, 'utf-8')
                name = cleanse(name)
                #print chunk.node, name
                names[label][name] += 1
    for entity_type, name_counts in names.items():
        fc[entity_type] = name_counts
    return fc
def get_continuous_chunks(self, text):
    chunked = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for i in chunked:
        if type(i) == nltk.Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    if current_chunk:
        # flush an entity that ends the text
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)
    return continuous_chunk
Source file: generate_neighbor_pos.py
Project: kaggle-quora-solution-8th
Author: qqgeogor
def get_pos_tag(qind):
    q = index_q[qind]
    wl = str(q).lower().split()
    pos_l = nltk.pos_tag(wl)
    q1_pos = []
    for pos in pos_l:
        q1_pos.append(pos[1])
    return q1_pos
# def get_ner_tag(qind):
# q = index_q[qind]
# wl = str(q).lower().split()
# ner_l = nltk.ne_chunk(wl)
# q1_ner = []
# for pos in ner_l:
# q1_ner.append(pos[1])
# return q1_ner
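The commented-out get_ner_tag above passes a bare word list to nltk.ne_chunk, which expects POS-tagged (token, tag) pairs. A hedged sketch of how that helper might look if revived (index_q is the project's question lookup table, assumed here just as in get_pos_tag):

def get_ner_tag(qind):
    # Hypothetical revival of the commented-out helper above.
    q = index_q[qind]
    wl = str(q).lower().split()
    chunked = nltk.ne_chunk(nltk.pos_tag(wl))  # ne_chunk needs POS-tagged input
    q1_ner = []
    for node in chunked:
        if isinstance(node, nltk.Tree):
            # one label per token of the entity, mirroring get_pos_tag's output shape
            q1_ner.extend([node.label()] * len(node.leaves()))
        else:
            q1_ner.append('O')  # non-entity token
    return q1_ner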
def whereRules(sentenceOriginal):
    score = 0
    sentence = sentenceOriginal.lower()
    # RULE 1 (disabled): reward sentences containing LOCATION/GPE named entities
    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #     if type(chunk) is nltk.tree.Tree:
    #         if 'LOCATION' in chunk.label() or 'GPE' in chunk.label():
    #             score += 10
    # RULE 2
    for word in LOCPREP:
        if word in sentence:
            score += 4
    # RULE 3
    for word in LOCATION:
        if word in sentence:
            score += 6
    return score
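whereRules scores against two keyword lists, LOCPREP and LOCATION, defined elsewhere in the source file. The values below are purely illustrative placeholders, not the project's actual lists:

# Hypothetical keyword lists for illustration only; the real LOCPREP and
# LOCATION lists live elsewhere in the project.
LOCPREP = ['in', 'at', 'near', 'inside']
LOCATION = ['city', 'town', 'country', 'state', 'river', 'mountain']

print(whereRules("The meeting was held in the city of Boston."))  # scores 4 + 6 = 10 with these placeholder lists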
# WHEN RULES
def performNameExtraction(text):
    # Returns a list of what NLTK defines as persons after processing the text passed into it.
    try:
        entity_names = []
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label') and chunk.label:
                    if chunk.label() == 'PERSON':
                        name_value = ' '.join(child[0] for child in chunk.leaves())
                        if name_value not in entity_names:
                            entity_names.append(name_value)
    except:
        print("Unexpected error:", sys.exc_info()[0])
    return entity_names
def ne_chunked():
    # Demo from NLTK's relextract module; extract_rels and rtuple come from nltk.sem.relextract.
    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*')
    rels = []
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        sent = nltk.ne_chunk(sent)
        rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7)
        for rel in rels:
            print('{0:<5}{1}'.format(i, rtuple(rel)))
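A minimal sketch of the imports and data this demo relies on (the Penn Treebank sample must be downloaded first):

import re
import nltk
from nltk.sem.relextract import extract_rels, rtuple

# nltk.download('treebank')  # needed once for nltk.corpus.treebank
ne_chunked()  # prints PER/ORG role relations found in the first 1500 tagged sentences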
def fetch_name(resume_text):
    tokenized_sentences = nltk.sent_tokenize(resume_text)
    for sentence in tokenized_sentences:
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')):
            if hasattr(chunk, 'label'):  # and chunk.label() == 'PERSON':
                chunk = chunk[0]
                (name, tag) = chunk
                if tag == 'NOUN':
                    return name
    return "Applicant name couldn't be processed"
def extract_entities(text):
    result = dict()
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))):
        # chunk.draw()
        if isinstance(chunk, nltk.tree.Tree):
            for subtree in chunk.subtrees(filter=lambda t: (t.label() == 'PERSON' or t.label() == 'GPE' or t.label() == 'LOCATION')):
                for leaf in subtree.leaves():
                    if leaf[0].lower() not in irrelevant_loc_words:
                        result[leaf[0].lower()] = subtree.label()
    # print result
    return result
def find_named_entities(sent):
    tree = nltk.ne_chunk(sent)
    for st in tree.subtrees():
        if st.label() != 'S':
            logger.debug(st)
def extract(self, text, entity_description=False):
    # We need to clean the text in each method otherwise when we present it
    # to the user, it will have a different format
    text = self.remove_return_lines_and_quotes(text)
    sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
    # This function is quite expensive
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    entities_all = {} if entity_description else []
    # stop = stopwords.words('english')
    # more_stop_words = ['(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...', 'therefore', '.vs', 'hence']
    # stop = stopwords.words('english')
    # stop = stop + more_stop_words
stop = ["a", "able", "about", "above", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "added", "adj", "affected", "affecting", "affects", "after", "afterwards", "again", "against", "ah", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "apparently", "approximately", "are", "aren", "arent", "arise", "around", "as", "aside", "ask", "asking", "at", "auth", "available", "away", "awfully", "b", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "between", "beyond", "biol", "both", "brief", "briefly", "but", "by", "c", "ca", "came", "can", "cannot", "can't", "cause", "causes", "certain", "certainly", "co", "com", "come", "comes", "contain", "containing", "contains", "could", "couldnt", "d", "date", "did", "didn't", "different", "do", "does", "doesn't", "doing", "done", "don't", "down", "downwards", "due", "during", "e", "each", "ed", "edu", "effect", "eg", "eight", "eighty", "either", "else", "elsewhere", "end", "ending", "enough", "especially", "et", "et-al", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "except", "f", "far", "few", "ff", "fifth", "first", "five", "fix", "followed", "following", "follows", "for", "former", "formerly", "forth", "found", "four", "from", "further", "furthermore", "g", "gave", "get", "gets", "getting", "give", "given", "gives", "giving", "go", "goes", "gone", "got", "gotten", "h", "had", "happens", "hardly", "has", "hasn't", "have", "haven't", "having", "he", "hed", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "hes", "hi", "hid", "him", "himself", "his", "hither", "home", "how", "howbeit", "however", "hundred", "i", "id", "ie", "if", "i'll", "im", "immediate", "immediately", "importance", "important", "in", "inc", "indeed", "index", "information", "instead", "into", "invention", "inward", "is", "isn't", "it", "itd", "it'll", "its", "itself", "i've", "j", "just", "k", "keep keeps",
"kept", "kg", "km", "know", "known", "knows", "l", "largely", "last", "lately", "later", "latter", "latterly", "least", "less", "lest", "let", "lets", "like", "liked", "likely", "line", "little", "'ll", "look", "looking", "looks", "ltd", "m", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "million", "miss", "ml", "more", "moreover", "most", "mostly", "mr", "mrs", "much", "mug", "must", "my", "myself", "n", "na", "name", "namely", "nay", "nd", "near", "nearly", "necessarily", "necessary", "need", "needs", "neither", "never", "nevertheless", "new", "next", "nine", "ninety", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "now", "nowhere", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "omitted", "on", "once", "one", "ones", "only", "onto", "or", "ord", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "page", "pages", "part", "particular", "particularly", "past", "per", "perhaps", "placed", "please", "plus", "poorly", "possible", "possibly", "potentially", "pp", "predominantly", "present", "previously", "primarily", "probably", "promptly", "proud", "provides", "put", "q", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "respectively", "resulted", "resulting", "results", "right", "run", "s", "said", "same", "saw", "say", "saying", "says", "sec", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sent", "seven", "several", "shall", "she", "shed", "she'll", "shes", "should", "shouldn't", "show", "showed", "shown", "showns", "shows", "significant", "significantly", "similar", "similarly", "since", "six", "slightly", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure"]
    for s in sentences:
        chunked = nltk.ne_chunk(s, binary=True)
        for n in chunked:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NE':
                    entities_all = self.getEntity(n, stop, entities_all, entity_description)
    if entity_description:
        return entities_all
    else:
        return list(set(entities_all))
def extract_org(sent):
    pos = pos_tag(nltk.tokenize.word_tokenize(sent))
    sentt = nltk.ne_chunk(pos, binary=False)
    org = []
    for subtree in sentt.subtrees(filter=lambda t: t.label() == 'GPE' or t.label() == 'ORGANIZATION'):
        for leaf in subtree.leaves():
            org.append(leaf)
    return org
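A usage sketch for extract_org. Note that it returns a flat list of (token, POS-tag) leaves, so multi-word organizations come back as separate pairs:

from nltk import pos_tag  # extract_org also relies on nltk being imported

leaves = extract_org("Barclays hired two analysts from Stanford University.")
print(leaves)  # e.g. [('Barclays', 'NNP'), ('Stanford', 'NNP'), ('University', 'NNP')], depending on the chunker
print(" ".join(token for token, tag in leaves))  # crude way to glue the leaves back into one string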
def create_phrase(self, phrase_str):
    tokenized_phrase = nltk.word_tokenize(phrase_str)
    tagged_phrase = nltk.pos_tag(tokenized_phrase)
    ne_chunk_tree = nltk.ne_chunk(tagged_phrase)
    # if (line_num in bluh):
    #     print(str(line_num)+". "+str(ne_chunk_tree))
    merge_tokens = self._find_multi_token_nnp(ne_chunk_tree)
    ne_chunk_list = self._merge_tokens_and_flatten(ne_chunk_tree, merge_tokens)
    # if (line_num in bluh):
    #     print(str(line_num)+". "+str(ne_chunk_list))
    tokens = []  # list of tagged tuples
    for token in ne_chunk_list:
        if type(token) is nltk.tree.Tree:
            tokens.append(self._tree_to_tuple(token))
        else:
            if token[0] in self._keywords:
                token = (token[0], self._keywords[token[0]])
            tokens.append(token)
    # if (line_num in bluh):
    #     print(str(line_num)+". "+str(tokens))
    phrase = Phrase(tokens)
    return phrase

# input: "ne_chunk_tree" - nltk tree of tuples and/or trees containing nltk tokens, "merge_tokens" - a list of int tuples
# output: list of tuples/trees containing nltk tokens
# purpose: merge tokens in ne_chunk_tree using index ranges listed in merge_tokens input argument; flatten ne_chunk_tree from an nltk tree to a list
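The helper described by the comment above is not included in this listing. A hypothetical sketch of what _merge_tokens_and_flatten might look like, given the described input and output (the merge rule of joining tokens with spaces and keeping the first token's tag is an assumption):

def _merge_tokens_and_flatten(self, ne_chunk_tree, merge_tokens):
    # merge_tokens is assumed to hold (start, end) index ranges into ne_chunk_tree.
    merged = []
    skip_until = -1
    for idx, node in enumerate(ne_chunk_tree):
        if idx < skip_until:
            continue  # this token was already folded into a previous merge
        span = next(((s, e) for s, e in merge_tokens if s == idx), None)
        if span is None:
            merged.append(node)  # plain (token, tag) tuples and subtrees pass through unchanged
            continue
        start, end = span
        words = []
        for leaf in ne_chunk_tree[start:end]:
            if isinstance(leaf, tuple):
                words.append(leaf[0])
            else:  # a nested nltk.Tree: take its leaf tokens
                words.extend(tok for tok, tag in leaf.leaves())
        first = ne_chunk_tree[start]
        tag = first[1] if isinstance(first, tuple) else 'NNP'  # assumption: keep the first token's tag
        merged.append((" ".join(words), tag))  # collapse the whole range into one tagged tuple
        skip_until = end
    return merged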
def fetch_all_organizations(resume_text):
    organizations = set()
    tokenized_sentences = nltk.sent_tokenize(resume_text)
    # Custom grammar with NLTK
    # NP - Noun Phrase
    # NN - Noun
    # NNP - Proper Noun
    # V - Verb
    # JJ - Adjective
    # In a sentence that contains "NN NNP V NN NN JJ NN",
    # the noun-phrases fetched are:
    # NP: NN NNP
    # NP: NN NN
    # NP: NN
    # E.g. "Application Developer at Delta Force"
    # => ["Application Developer", "Delta Force"]
    grammar = r"""NP: {<NN|NNP>+}"""
    parser = nltk.RegexpParser(grammar)
    avoid_organizations = utilities.get_avoid_organizations()
    for sentence in tokenized_sentences:
        # tag all parts of speech in the tokenized sentence
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
        # then chunk with the custom grammar
        # np_chunks are instances of class nltk.tree.Tree
        np_chunks = parser.parse(tagged_words)
        noun_phrases = []
        for np_chunk in np_chunks:
            if isinstance(np_chunk, nltk.tree.Tree) and np_chunk.label() == 'NP':
                # if np_chunk is of grammar 'NP', create a space-separated string of all leaves under the 'NP' tree
                noun_phrase = ""
                for (org, tag) in np_chunk.leaves():
                    noun_phrase += org + ' '
                noun_phrases.append(noun_phrase.rstrip())
        # Use the named-entity chunker to get all the organizations
        chunks = nltk.ne_chunk(tagged_words)
        for chunk in chunks:
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
                (organization, tag) = chunk[0]
                # if the organization is in a noun_phrase, there is a high chance the noun_phrase contains the employer name
                # e.g. Delta Force is added to organizations even if only Delta is recognized as an organization, because Delta Force is a noun-phrase
                for noun_phrase in noun_phrases:
                    if organization in noun_phrase and organization not in avoid_organizations:
                        organizations.add(noun_phrase.capitalize())
    return organizations
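fetch_all_organizations depends on the project's utilities.get_avoid_organizations(). A self-contained sketch of the same cross-check idea (noun-phrase chunks widened around ORGANIZATION entities), with the avoid-list replaced by an empty placeholder:

import nltk

def organizations_in(sentence, avoid=()):
    # Return noun phrases that contain a token NLTK tags as ORGANIZATION.
    tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    parser = nltk.RegexpParser(r"NP: {<NN|NNP>+}")
    noun_phrases = [" ".join(tok for tok, tag in chunk.leaves())
                    for chunk in parser.parse(tagged)
                    if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'NP']
    found = set()
    for chunk in nltk.ne_chunk(tagged):
        if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION':
            org = chunk[0][0]  # first token of the recognized entity
            for np in noun_phrases:
                if org in np and org not in avoid:
                    found.add(np)  # widen to the surrounding noun phrase
    return found

print(organizations_in("Application Developer at Delta Force"))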
def extract_all(use_random_forest):
    if use_random_forest:
        emails = rf_model()
        emails = [email for email in emails if email[0] != 'negatives_clean']
    else:
        emails = []  # collect (collection, text) pairs from the local database
        db = utils.get_local_db()
        for collection in db.collection_names():
            if collection != 'negatives_clean':
                for record in db.get_collection(collection).find():
                    emails.append([collection] + [record['Text']])
    # find features for each email
    email_data = []
    for email_set in emails:
        email = email_set[1]
        fields = features[email_set[0]]
        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []
        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])
        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        email_data.append([email_set[0], email, matches])
    return email_data
def extract_one(email):
    # use random-forest to find email category
    category = rf_categorize(email)
    if category != 'negatives_clean':
        fields = features[category]
        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []
        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])
        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        # return categorized email with field guess probabilities
        return [category, email, matches]
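Both extractors score an entity against a target field with WordNet path similarity. The scoring loop, pulled out as a small hedged sketch (describe() is the project's own entity-description lookup, replaced here by an explicit string argument):

from nltk.corpus import wordnet as wn  # requires nltk.download('wordnet')

def semantic_match_score(field_word, description):
    # Sum WordNet path similarity between a field keyword and each word of an entity description.
    dist = 0.0
    for word in description.split():
        a = wn.synsets(field_word)
        b = wn.synsets(word)
        if a and b:
            segment = a[0].path_similarity(b[0])  # None when the two synsets share no path
            if segment:
                dist += segment
    return dist

# Mirrors the dist > 0.1 threshold used in extract_all / extract_one above.
print(semantic_match_score('company', 'a multinational technology corporation'))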
def whoRules(question, sentenceOriginal):
    score = 0
    hasNameQuestion = False
    hasNameSentence = False
    hasnameSentence = False
    hasHumanSentence = False
    sentence = sentenceOriginal.lower()
    # RULE 1 (disabled): reward sentences containing PERSON/ORGANIZATION named entities
    # for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentenceOriginal))):
    #     if type(chunk) is nltk.tree.Tree:
    #         if 'PERSON' in chunk.label() or 'ORGANIZATION' in chunk.label():
    #             score += 10
    for item in question:
        if item in NAME:
            hasNameQuestion = True
            # break
        if item in HUMAN and item in sentence:
            score += 10
    for item in sentence.split():  # check sentence tokens against the keyword lists
        if item in NAME:
            hasNameSentence = True
        if 'name' in item:
            hasnameSentence = True
        if item in HUMAN:
            hasHumanSentence = True
    # RULE 2
    if not hasNameQuestion and hasNameSentence:
        score += 6
    # RULE 3
    if not hasNameQuestion and hasnameSentence:
        score += 4
    # RULE 4
    if hasNameSentence or hasHumanSentence:
        score += 4
    return score
# WHAT RULES