Python class WordNetLemmatizer(): example source code

lemmatiser.py (project: LDA-REST, author: valentinarho)
def LemTokens(tokens):
    lemmer = WordNetLemmatizer()
    return [lemmer.lemmatize(token) for token in tokens]
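A minimal usage sketch for the helper above (assuming NLTK plus its wordnet and punkt data are installed; the sentence and variable names are illustrative):

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# One-time downloads if the corpora are missing:
# nltk.download('wordnet'); nltk.download('punkt')

tokens = word_tokenize("The cats were running across the fields")
print(LemTokens(tokens))
# Without a pos argument, lemmatize() treats every token as a noun, so plural
# nouns are reduced ("cats" -> "cat", "fields" -> "field") while verb forms
# such as "running" and "were" are left unchanged.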
retrieval.py (project: dict_based_learning, author: tombosc)
def add_from_lemma_definitions(self, vocab, try_lower=False):
        """Add lemma definitions for non-lemmas.

        This code covers the following scenario: suppose a dictionary has been
        crawled, but only for word lemmas.

        """
        lemmatizer = nltk.WordNetLemmatizer()
        added = 0
        for word in vocab.words:
            word_list = [word, word.lower()] if try_lower else [word]

            for word_to_lemma in word_list:
                try:
                    for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                        lemma = lemmatizer.lemmatize(word_to_lemma, part_of_speech)
                        lemma_defs = self._data.get(lemma)
                        if lemma != word and lemma_defs:
                            # This can be quite slow. But this code will not be used
                            # very often.
                            for def_ in lemma_defs:
                                if def_ not in self._data[word]:
                                    added += 1
                                    self._data[word].append(def_)
                except:
                    logger.error("lemmatizer crashed on {}".format(word))
                    logger.error(traceback.format_exc())
        logger.info("Added {} new defs in add_from_lemma_definitions".format(added))
        self.save()
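For reference, the part-of-speech codes looped over above are WordNet's tags for adjective ('a'), satellite adjective ('s'), adverb ('r'), noun ('n') and verb ('v'). A small standalone sketch (example words are illustrative) shows how the chosen POS changes the lemma:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('better', pos='a'))   # -> 'good'  (adjective)
print(lemmatizer.lemmatize('running', pos='v'))  # -> 'run'   (verb)
print(lemmatizer.lemmatize('geese', pos='n'))    # -> 'goose' (noun)
print(lemmatizer.lemmatize('geese', pos='v'))    # -> 'geese' (no verb lemma; returned unchanged)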
retrieval.py (project: dict_based_learning, author: tombosc)
def crawl_lemmas(self, vocab):
        """Add Wordnet lemmas as definitions."""
        lemmatizer = nltk.WordNetLemmatizer()
        for word in vocab.words:
            definitions = []
            try:
                for part_of_speech in ['a', 's', 'r', 'n', 'v']:
                    lemma = lemmatizer.lemmatize(word, part_of_speech)
                    if lemma != word and [lemma] not in definitions:
                        definitions.append([lemma])
            except:
                logger.error("lemmatizer crashed on {}".format(word))
            if definitions:
                self._data[word] = definitions
        self.save()
normalize.py (project: minke, author: DistrictDataLabs)
def __init__(self):
        self._wordnet = nltk.WordNetLemmatizer()
        self._cache   = {}
nltk_normalization.py (project: vec4ir, author: lgalke)
def __init__(self):
        self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
        self.tokenizer = self.make_tokenizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.sent_tokenizer = None
text_cleanup.py (project: Hacker_News_Article_Topics, author: reeddunkle)
def lemmatize_individual_text(tokens):
    '''
    Given a list of tokens, return a list of lemmatized strings.
    '''

    lemmatizer = nltk.WordNetLemmatizer()

    # list() ensures a list is returned on Python 3, where map() yields an iterator.
    return list(map(lemmatizer.lemmatize, tokens))
normalization.py (project: natural-language-preprocessings, author: Hironsan)
def lemmatize_term(term, pos=None):
    if pos is None:
        synsets = wordnet.synsets(term)
        if not synsets:
            return term
        pos = synsets[0].pos()
        if pos == wordnet.ADJ_SAT:
            pos = wordnet.ADJ
    return nltk.WordNetLemmatizer().lemmatize(term, pos=pos)
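A brief usage sketch for lemmatize_term (the example words are illustrative; it assumes the snippet's own imports, i.e. nltk and from nltk.corpus import wordnet, plus the WordNet data):

print(lemmatize_term('cars'))              # first synset is a noun -> 'car'
print(lemmatize_term('running', pos='v'))  # explicit POS skips the synset lookup -> 'run'
print(lemmatize_term('qwxzy'))             # no synsets -> returned unchanged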
text_utils.py (project: document-qa, author: allenai)
def __init__(self, lower: bool = True, stemmer="port"):
        self.lower = lower
        self.stemmer = stemmer
        if stemmer == "port":
            self._stemmer = PorterStemmer()
            self._stem = self._stemmer.stem
        elif stemmer == "wordnet":
            self._stemmer = WordNetLemmatizer()
            self._stem = self._stemmer.lemmatize
        else:
            raise ValueError(stemmer)
        # stemming is slow, so we cache words as we go
        self.normalize_cache = {}
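The project's actual normalization method is not part of this excerpt; a minimal sketch of the cache-then-stem pattern described by the comment above might look like the following (normalize below is hypothetical, not the document-qa implementation):

def normalize(self, word: str) -> str:
    # Hypothetical illustration of the caching idea noted above.
    key = word.lower() if self.lower else word
    cached = self.normalize_cache.get(key)
    if cached is None:
        cached = self._stem(key)  # PorterStemmer.stem or WordNetLemmatizer.lemmatize
        self.normalize_cache[key] = cached
    return cached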
text_features.py (project: document-qa, author: allenai)
def __init__(self, require_unique_match, lemmatizer="word_net",
                 empty_question_features=False, stop_words=None):
        self.lemmatizer = lemmatizer
        self.stop_words = stop_words
        self.empty_question_features = empty_question_features
        if lemmatizer == "word_net":
            self._lemmatizer = WordNetLemmatizer()
        else:
            raise ValueError()
        self._cache = {}
        self.require_unique_match = require_unique_match
NLTKPreprocessor.py (project: ai-chatbot-framework, author: alfredfrancis)
def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower = lower
        self.strip = strip
        #self.stopwords  = stopwords or set(sw.words('english'))
        self.punct = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
vocabulary.py (project: pymake, author: dtrckd)
def __init__(self, exclude_stopwords=False, lemmatize=True):

        try:
            import nltk
            _NLTK_DISABLED = False
        except ImportError:
            _NLTK_DISABLED = True

        self.vocas = []        # id to word
        self.token2id = dict() # word to id
        self.docfreq = []      # id to document frequency
        self.exclude_stopwords = exclude_stopwords

        self.stopwords_list = []
        if exclude_stopwords:
            stopwords_list = []  # local accumulator; without this, the += below raises NameError
            # Too strict:
            #with open (os.path.join(os.path.dirname(__file__), 'stopwords.txt'), "r") as _f:
            #    stopwords_list = _f.read().replace('\n', '').split()
            if not _NLTK_DISABLED:
                stopwords_list += nltk.corpus.stopwords.words('english')
            self.stopwords_list = set(stopwords_list)

        if lemmatize:
            if not _NLTK_DISABLED:
                self.wlemm = nltk.WordNetLemmatizer()
            else:
                print('Warning: no lemmatizer!')
learn.py (project: partisan-discourse, author: DistrictDataLabs)
def __init__(self, stopwords=None):
        self.stopwords  = set(stopwords or nltk.corpus.stopwords.words('english'))
        self.lemmatizer = nltk.WordNetLemmatizer()
tests.py (project: OpinionMining728, author: stasi009)
def test_lemmatize_with_pos():
    text = "The restaurants nearby are better than the shops further away"
    words = nltk.word_tokenize(text)
    lemmatizer = nltk.WordNetLemmatizer()
    print(utility.lemmatize_with_pos(lemmatizer, words))
sip.py (project: memex-dossier-open, author: dossier)
def noun_phrases(text, included_unnormalized=False):
    '''applies normalization to the terms found by noun_phrases_as_tokens
    and joins on '_'.

    :rtype: list of phrase strings with spaces replaced by ``_``.

    '''
    lemmatizer = nltk.WordNetLemmatizer()
    stemmer = nltk.stem.porter.PorterStemmer()

    def normalize(word):
        '''Normalise a word to lowercase, then stem and lemmatize it.'''
        word = word.lower()
        try:
            word = stemmer.stem(word)  # stem_word() was removed from newer NLTK releases
            word = lemmatizer.lemmatize(word)
        except:
            pass
        return word

    normalizations = defaultdict(list)
    for terms in noun_phrases_as_tokens(text):
        key = u'_'.join(map(normalize, terms))
        normalizations[key].append(u' '.join(terms))

    if included_unnormalized:
        return list(normalizations.keys()), normalizations
    else:
        return list(normalizations.keys())
synset_analysis.py (project: Quadflor, author: quadflor)
def __init__(self):
        NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
        self.normalizer = NltkNormalizer()
        self.lem = nltk.WordNetLemmatizer()
        self.tagger = nltk.PerceptronTagger()
        self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
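The class's lemmatization method is not included in this excerpt; a minimal sketch of how such a translation_dict is typically applied together with the tagger (lemmatize_tokens below is illustrative, not Quadflor's actual code):

def lemmatize_tokens(self, tokens):
    # Illustrative only: tag the tokens, map each Penn Treebank tag's first
    # letter to a WordNet POS via translation_dict (defaulting to noun),
    # then lemmatize with that POS.
    return [self.lem.lemmatize(token, self.translation_dict.get(tag[:1], wn.NOUN))
            for token, tag in self.tagger.tag(tokens)]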
nltk_normalization.py (project: Quadflor, author: quadflor)
def __init__(self):
        self.install_nltk_corpora('stopwords', 'wordnet', 'punkt')
        self.lemmatizer = nltk.WordNetLemmatizer()
        self.lemmatizer.lemmatize('')  # Force nltk lazy corpus loader to do something.
        self.tokenizer = self.make_tokenizer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.sent_tokenizer = None
processor.py (project: nltk-api, author: szyku)
def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

