def fetch_all_organizations(resume_text):
    """Collect likely organization names mentioned in ``resume_text``.

    The text is split into sentences and each sentence is POS-tagged, then
    examined in two passes:

    1. A regexp chunker groups every run of consecutive ``NN`` (noun) /
       ``NNP`` (proper noun) tokens into a noun phrase. For example,
       "Application Developer at Delta Force" is expected to yield the
       phrases "Application Developer" and "Delta Force".
    2. NLTK's named-entity chunker marks ``ORGANIZATION`` entities.

    For every ``ORGANIZATION`` entity, its *first* token is used as a search
    key: each noun phrase of the same sentence that contains that token (plain
    substring test) is recorded, unless the token is listed in the collection
    returned by ``utilities.get_avoid_organizations()``. This lets the full
    phrase "Delta Force" be picked up even when only "Delta" was recognized
    as an organization.

    Args:
        resume_text: Text of the resume to scan.

    Returns:
        A ``set`` of matching noun phrases, each normalised with
        ``str.capitalize()``.
    """
    organizations = set()
    tokenized_sentences = nltk.sent_tokenize(resume_text)

    # NP (noun phrase) = one or more consecutive nouns (NN) or proper nouns (NNP).
    grammar = r"""NP: {<NN|NNP>+}"""
    parser = nltk.RegexpParser(grammar)

    avoid_organizations = utilities.get_avoid_organizations()

    for sentence in tokenized_sentences:
        # Tag every token of the sentence with its part of speech.
        tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))

        # Chunk with the custom grammar; matched chunks are nltk.tree.Tree
        # subtrees labelled 'NP', everything else stays a plain (word, tag) tuple.
        noun_phrases = [
            " ".join(word for word, _tag in subtree.leaves())
            for subtree in parser.parse(tagged_words)
            if isinstance(subtree, nltk.tree.Tree) and subtree.label() == 'NP'
        ]
        if not noun_phrases:
            # Nothing could be added for this sentence, so skip the
            # (comparatively expensive) named-entity pass.
            continue

        # Use the named-entity chunker to find the organizations.
        for chunk in nltk.ne_chunk(tagged_words):
            if not (isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'ORGANIZATION'):
                continue

            # Only the first token of the entity serves as the search key.
            organization, _tag = chunk[0]
            if organization in avoid_organizations:
                # The avoid-list check does not depend on the noun phrase, so
                # it is done once per entity rather than once per phrase.
                continue

            # A noun phrase containing the organization token most likely holds
            # the full employer name.
            # NOTE(review): str.capitalize() upper-cases the first character and
            # lower-cases the rest, so "Delta Force" is stored as "Delta force".
            # Kept as-is because callers may rely on this normalisation - confirm.
            organizations.update(
                noun_phrase.capitalize()
                for noun_phrase in noun_phrases
                if organization in noun_phrase
            )

    return organizations
评论列表
文章目录