def noun_phrases(text, included_unnormalized=False):
    '''Applies normalization to the terms found by noun_phrases_as_tokens
    and joins on '_'.

    :param text: raw input text passed through to ``noun_phrases_as_tokens``.
    :param included_unnormalized: when True, also return the mapping from
        each normalized key to the list of original (unnormalized) phrase
        strings that produced it.
    :rtype: list of phrase strings with spaces replaced by ``_``; or, when
        ``included_unnormalized`` is True, a tuple of (that list, the
        normalized-key -> original-phrases mapping).
    '''
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()
    # Newer NLTK renamed PorterStemmer.stem_word to .stem; resolve whichever
    # exists once so normalization is not silently skipped on modern NLTK.
    stem = getattr(stemmer, 'stem_word', stemmer.stem)

    def normalize(word):
        '''Normalizes a word to lowercase, then stems and lemmatizes it.'''
        word = word.lower()
        try:
            word = stem(word)
            word = lemmatizer.lemmatize(word)
        except LookupError:
            # WordNet corpus data not downloaded: keep the lowercased word
            # as-is (best effort) rather than crashing the whole extraction.
            pass
        return word

    # normalized key -> list of the original surface forms that map to it
    normalizations = defaultdict(list)
    for terms in noun_phrases_as_tokens(text):
        key = u'_'.join(map(normalize, terms))
        normalizations[key].append(u' '.join(terms))
    if included_unnormalized:
        return list(normalizations.keys()), normalizations
    else:
        # Materialize a list so the documented :rtype: holds on Python 3,
        # where dict.keys() is a view object, not a list.
        return list(normalizations.keys())
# (removed stray scraped page-navigation text: "评论列表" / "文章目录",
#  i.e. "comment list" / "article table of contents" — not part of the code)