from nltk.stem import WordNetLemmatizer

def LemTokens(tokens):
    # Lemmatize each token with WordNet's morphological analyzer (default noun POS).
    lemmer = WordNetLemmatizer()
    return [lemmer.lemmatize(token) for token in tokens]
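A minimal usage sketch of the helper above, assuming the WordNet corpus has already been downloaded (e.g. via nltk.download('wordnet')):

tokens = "The cats were chasing mice".split()
print(LemTokens(tokens))
# With the default noun POS: ['The', 'cat', 'were', 'chasing', 'mouse']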
Python WordNetLemmatizer() class usage examples
def add_from_lemma_definitions(self, vocab, try_lower=False):
    """Add lemma definitions for non-lemmas.

    This covers the following scenario: suppose a dictionary was crawled,
    but only for word lemmas.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    added = 0
    for word in vocab.words:
        word_list = [word, word.lower()] if try_lower else [word]
        for word_to_lemma in word_list:
            try:
                for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                    lemma = lemmatizer.lemmatize(word_to_lemma, part_of_speech)
                    lemma_defs = self._data.get(lemma)
                    if lemma != word and lemma_defs:
                        # This can be quite slow, but this code is not used very often.
                        for def_ in lemma_defs:
                            if def_ not in self._data[word]:
                                added += 1
                                self._data[word].append(def_)
            except Exception:
                logger.error("lemmatizer crashed on {}".format(word))
                logger.error(traceback.format_exc())
    logger.info("Added {} new defs in add_from_lemma_definitions".format(added))
    self.save()
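The loop over all five WordNet POS tags matters because lemmatization is POS-sensitive; a small illustration, assuming the WordNet corpus is installed:

import nltk

lemmatizer = nltk.WordNetLemmatizer()
# The same surface form can map to different lemmas depending on the POS tag.
print(lemmatizer.lemmatize("better", "a"))   # 'good'   (adjective reading)
print(lemmatizer.lemmatize("better", "n"))   # 'better' (noun reading, unchanged)
print(lemmatizer.lemmatize("running", "v"))  # 'run'
print(lemmatizer.lemmatize("running", "n"))  # 'running'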
def crawl_lemmas(self, vocab):
    """Add WordNet lemmas as definitions."""
    lemmatizer = nltk.WordNetLemmatizer()
    for word in vocab.words:
        definitions = []
        try:
            for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                lemma = lemmatizer.lemmatize(word, part_of_speech)
                if lemma != word and [lemma] not in definitions:
                    definitions.append([lemma])
        except Exception:
            logger.error("lemmatizer crashed on {}".format(word))
        if definitions:
            self._data[word] = definitions
    self.save()
def __init__(self):
    self._wordnet = nltk.WordNetLemmatizer()
    self._cache = {}
def __init__(self):
    self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
    self.lemmatizer = nltk.WordNetLemmatizer()
    self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
    self.tokenizer = self.make_tokenizer()
    self.stopwords = nltk.corpus.stopwords.words('english')
    self.sent_tokenizer = None
def lemmatize_individual_text(tokens):
    '''
    Given a list of tokens, return a list of lemmatized strings.
    '''
    lemmatizer = nltk.WordNetLemmatizer()
    # Materialize the result so callers really get a list
    # (in Python 3, map() returns a lazy iterator).
    return list(map(lemmatizer.lemmatize, tokens))
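A short usage sketch; the snippet assumes nltk is imported at module level and the WordNet corpus is downloaded:

import nltk

lemmas = lemmatize_individual_text(["cars", "ran", "mice"])
print(lemmas)  # default noun POS: ['car', 'ran', 'mouse']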
Source: normalization.py, from the project natural-language-preprocessings by Hironsan.
def lemmatize_term(term, pos=None):
    if pos is None:
        # No POS supplied: infer one from the term's first WordNet synset, if any.
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            # The lemmatizer expects plain adjectives, not satellite adjectives.
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos)
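A usage sketch, assuming the enclosing module imports nltk and wordnet (as the snippet implies) and the WordNet corpus is installed:

import nltk
from nltk.corpus import wordnet  # the function above relies on these imports

print(lemmatize_term("running", pos="v"))  # explicit verb POS -> 'run'
print(lemmatize_term("geese"))             # POS inferred from synsets -> 'goose'
print(lemmatize_term("qwzx"))              # no synsets: returned unchanged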
def __init__(self, lower: bool = True, stemmer="port"):
    self.lower = lower
    self.stemmer = stemmer
    if stemmer == "port":
        self._stemmer = PorterStemmer()
        self._stem = self._stemmer.stem
    elif stemmer == "wordnet":
        self._stemmer = WordNetLemmatizer()
        self._stem = self._stemmer.lemmatize
    else:
        raise ValueError(stemmer)
    # Stemming is slow, so we cache words as we go.
    self.normalize_cache = {}
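The cache suggests a per-word memoized normalizer. A minimal sketch of how such a class might use it; the Normalizer class name and its normalize method are hypothetical illustrations, not the original project's code:

from nltk.stem import PorterStemmer, WordNetLemmatizer

class Normalizer:
    def __init__(self, lower: bool = True, stemmer="port"):
        self.lower = lower
        if stemmer == "port":
            self._stem = PorterStemmer().stem
        elif stemmer == "wordnet":
            self._stem = WordNetLemmatizer().lemmatize
        else:
            raise ValueError(stemmer)
        self.normalize_cache = {}  # word -> normalized form

    def normalize(self, word):
        # Hypothetical helper: memoize the (slow) stem/lemmatize call.
        if self.lower:
            word = word.lower()
        if word not in self.normalize_cache:
            self.normalize_cache[word] = self._stem(word)
        return self.normalize_cache[word]

print(Normalizer(stemmer="wordnet").normalize("Mice"))  # 'mouse'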
def __init__(self, require_unique_match, lemmatizer="word_net",
             empty_question_features=False, stop_words=None):
    self.lemmatizer = lemmatizer
    self.stop_words = stop_words
    self.empty_question_features = empty_question_features
    if lemmatizer == "word_net":
        self._lemmatizer = WordNetLemmatizer()
    else:
        raise ValueError()
    self._cache = {}
    self.require_unique_match = require_unique_match
def __init__(self, stopwords=None, punct=None,
             lower=True, strip=True):
    self.lower = lower
    self.strip = strip
    # self.stopwords = stopwords or set(sw.words('english'))
    self.punct = punct or set(string.punctuation)
    self.lemmatizer = WordNetLemmatizer()
def __init__(self, exclude_stopwords=False, lemmatize=True):
    try:
        import nltk
        _NLTK_DISABLED = False
    except ImportError:
        _NLTK_DISABLED = True
    self.vocas = []         # id to word
    self.token2id = dict()  # word to id
    self.docfreq = []       # id to document frequency
    self.exclude_stopwords = exclude_stopwords
    self.stopwords_list = []
    if exclude_stopwords:
        stopwords_list = []
        # Too strict:
        # with open(os.path.join(os.path.dirname(__file__), 'stopwords.txt'), "r") as _f:
        #     stopwords_list = _f.read().replace('\n', '').split()
        if not _NLTK_DISABLED:
            stopwords_list += nltk.corpus.stopwords.words('english')
        self.stopwords_list = set(stopwords_list)
    if lemmatize:
        if not _NLTK_DISABLED:
            self.wlemm = nltk.WordNetLemmatizer()
        else:
            print('Warning: no lemmatizer available!')
def __init__(self, stopwords=None):
    self.stopwords = set(stopwords or nltk.corpus.stopwords.words('english'))
    self.lemmatizer = nltk.WordNetLemmatizer()
def test_lemmatize_with_pos():
    text = "The restaurants nearby are better than the shops further away"
    words = nltk.word_tokenize(text)
    lemmatizer = nltk.WordNetLemmatizer()
    print(utility.lemmatize_with_pos(lemmatizer, words))
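The utility.lemmatize_with_pos helper is not shown here. A hypothetical sketch of what such a function typically does (POS-tag, map the Penn Treebank tag prefix to a WordNet tag, then lemmatize), under that assumption:

import nltk
from nltk.corpus import wordnet

def lemmatize_with_pos(lemmatizer, words):
    # Hypothetical reconstruction: tag each word, map the tag's first letter to a
    # WordNet POS, and fall back to noun when there is no mapping.
    # Requires the averaged perceptron tagger data to be downloaded.
    tag_map = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'R': wordnet.ADV, 'V': wordnet.VERB}
    tagged = nltk.pos_tag(words)
    return [lemmatizer.lemmatize(word, tag_map.get(tag[0], wordnet.NOUN))
            for word, tag in tagged]

# e.g. 'restaurants' -> 'restaurant', 'are' -> 'be'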
def noun_phrases(text, included_unnormalized=False):
    '''Applies normalization to the terms found by noun_phrases_as_tokens
    and joins them on '_'.

    :rtype: list of phrase strings with spaces replaced by ``_``.
    '''
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    def normalize(word):
        '''Lowercase the word, then stem and lemmatize it.'''
        word = word.lower()
        try:
            word = stemmer.stem(word)
            word = lemmatizer.lemmatize(word)
        except Exception:
            pass
        return word

    normalizations = defaultdict(list)
    for terms in noun_phrases_as_tokens(text):
        key = u'_'.join(map(normalize, terms))
        normalizations[key].append(u' '.join(terms))

    if included_unnormalized:
        return list(normalizations.keys()), normalizations
    else:
        return list(normalizations.keys())
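noun_phrases_as_tokens is defined elsewhere in that project, so the function above is not runnable on its own. A standalone look at just its inner stem-then-lemmatize normalization, assuming NLTK with the WordNet corpus installed:

import nltk

lemmatizer = nltk.WordNetLemmatizer()
stemmer = nltk.stem.porter.PorterStemmer()

for word in ["Restaurants", "Running"]:
    normalized = lemmatizer.lemmatize(stemmer.stem(word.lower()))
    print(word, "->", normalized)
# Porter stemming runs first, so the result can be a crude stem rather than a
# dictionary lemma: 'Restaurants' -> 'restaur', 'Running' -> 'run'.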
def __init__(self):
    NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
    self.normalizer = NltkNormalizer()
    self.lem = nltk.WordNetLemmatizer()
    self.tagger = nltk.PerceptronTagger()
    self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
def __init__(self):
    self.lemmatizer = WordNetLemmatizer()