import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocessing(text):
    # accept bytes or str input
    if isinstance(text, bytes):
        text = text.decode("utf-8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # lowercase first so that capitalized stopwords are also caught
    tokens = [word.lower() for word in tokens]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words shorter than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
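A minimal usage sketch (my own example, not from the original file; it assumes the NLTK 'punkt', 'stopwords', and 'wordnet' data have been downloaded):

# Hypothetical call; the exact output depends on the installed NLTK data.
print(preprocessing("The cats were chasing the mice in the gardens."))
# roughly: "cat chasing mouse garden"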
Example usages of Python's WordNetLemmatizer() class, collected from open-source projects
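Before the project snippets, a minimal reminder of the underlying NLTK API (my own sketch; it assumes the 'wordnet' corpus has been downloaded via nltk.download('wordnet')):

from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
print(wnl.lemmatize("corpora"))           # 'corpus' (default POS is noun)
print(wnl.lemmatize("running", pos="v"))  # 'run' (the pos argument changes the result)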
readdata.py — project: Natural-Language-Processing-Python-and-NLTK, author: PacktPublishing
def test_ranker(options):
lemmatizer = WordNetLemmatizer()
words, answers, candidate_lfs = load_lf_test(options.data_dir)
r = ranker.LogLinear(options.word_dim, options.embedding_file, options.stopwords_file)
assert(os.path.exists(options.ranker_model_dir))
r.load_model(options.ranker_model_dir)
result_file = os.path.join(options.result_dir, 'test')
rf = open(result_file, 'w')
print ('testing...')
for word, answer, lf in iter_lf_test(words, answers, candidate_lfs):
lemma = [lemmatizer.lemmatize(w) for w in word]
selected = r.test(word, lemma, lf)
write_file(rf, selected[0], answer, selected[1])
rf.close()
print (getResults(result_file))
generate_ngram_pos_link.py — project: kaggle-quora-solution-8th, author: qqgeogor
def getPOSLinks(text):
wordnet_lemmatizer = WordNetLemmatizer()
text = nltk.word_tokenize(text)
pos = nltk.pos_tag(text)
links = []
link = []
active = False
for w in pos:
part = w[1]
word = w[0]
if(not active and (part[:2] == "DT" or part == "WP" or part == "VB" or part == "IN")):
active = True
if(active):
link.append(wordnet_lemmatizer.lemmatize(word))
#extract main body
if(active and (part == "PRP" or part[:2] == "NN" or part == "." )):
active = False
links.append(" ".join(link))
link = []
return links
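A hypothetical call (not from the original project; it assumes nltk plus the 'punkt', 'averaged_perceptron_tagger', and 'wordnet' data, and that WordNetLemmatizer has been imported):

# Each returned "link" starts at a determiner/WH-pronoun/verb/preposition and
# ends at the first pronoun, noun, or sentence-final period that follows.
print(getPOSLinks("What is the name of the river ?"))
# roughly: ['What is the name', 'of the river']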
def wordnet_lemmatize(word, pos='n'):
global _nltk_wordnet_lemmatizer
try:
_nltk_wordnet_lemmatizer
except NameError:
_nltk_wordnet_lemmatizer = WordNetLemmatizer()
return _nltk_wordnet_lemmatizer.lemmatize(word, penn2morphy(pos))
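The penn2morphy helper is not shown in this snippet; a plausible stand-in (my assumption, not the original implementation) maps Penn Treebank tags onto the single-letter POS codes accepted by WordNetLemmatizer.lemmatize():

def penn2morphy(penntag, default='n'):
    # Assumed helper, not part of the original file: 'NN*' -> noun, 'JJ*' -> adjective,
    # 'VB*' -> verb, 'RB*' -> adverb; anything else falls back to the default POS.
    mapping = {'NN': 'n', 'JJ': 'a', 'VB': 'v', 'RB': 'r'}
    return mapping.get(penntag[:2], default)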
def split_ingr(x):
wnl=WordNetLemmatizer()
cleanlist=[]
lst = x.strip('[]').split(',')
cleanlist=[' '.join(wnl.lemmatize(word.lower()) for word in word_tokenize(re.sub('[^a-zA-Z]',' ',item))) for item in lst]
return cleanlist
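A hypothetical call (assumes re, nltk's word_tokenize, and WordNetLemmatizer are imported at module level); the function expects a stringified list of ingredient phrases, e.g. as read from a CSV cell:

print(split_ingr("['2 cups diced Tomatoes', '3 cloves garlic']"))
# roughly: ['cup diced tomato', 'clove garlic']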
#remove low-information words from ingredients, could use more
def __wordnet_lemmatizer(self):
    """Initializes WordNetLemmatizer.

    Returns:
        None; the lemmatizer is stored on self.lemmatizer.
    """
    self.lemmatizer = WordNetLemmatizer()
    # Call lemmatize once so NLTK's lazily loaded WordNet corpus is read up front
    _ = self.lemmatizer.lemmatize('start')
def __init__(self):
self.WN_TAGS = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
self.wnl = WordNetLemmatizer()
self.dictionary = enchant.Dict('en')
self.inflengine = inflect.engine()
def __init__(self):
self.WN_TAGS = {'J': 'a', 'N': 'n', 'R': 'r', 'V': 'v'}
self.wnl = WordNetLemmatizer()
self.dictionary = enchant.Dict('en')
self.lookup_table = {}
def __tokenizeWholeCorpora(self, pathToCorpora):
    print('Start tokenizing the corpora: %s' % pathToCorpora)
    punct = re.compile('[%s]' % re.escape(string.punctuation))
    wnl = WordNetLemmatizer()
    doc_count = 0
    train_set = []
    doc_mapping = {}
    link_mapping = {}
    for f in glob(pathToCorpora + '/*'):
        with open(f, 'r') as filereader:
            article = filereader.readlines()
        try:
            link = article[0]
            title = article[1]
            text = article[2].lower()
        except IndexError:
            continue
        # Skip documents shorter than min_length
        if len(text) < self.min_length:
            continue
        text = punct.sub("", text)          # remove all punctuation
        tokens = nltk.word_tokenize(text)   # tokenize the whole text
        # Lemmatize every word and keep it only if it is not a stopword
        train_set.append([wnl.lemmatize(word) for word in tokens if word not in self.stopword])
        # Build the doc and link mappings
        doc_mapping[doc_count] = title
        link_mapping[doc_count] = link
        doc_count += 1
        if doc_count % 10000 == 0:
            print('Have processed %i documents' % doc_count)
    print('Finished tokenizing the corpora: %s' % pathToCorpora)
    return doc_count, train_set, doc_mapping, link_mapping
def preprocess( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True, lemmatize = False ):
"""
Preprocess a list containing text documents stored as strings.
"""
token_pattern = re.compile(r"\b\w\w+\b", re.U)
if lemmatize:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
def normalize( x ):
x = x.lower()
if lemmatize:
return wnl.lemmatize(x)
return x
def custom_tokenizer( s ):
return [normalize(x) for x in token_pattern.findall(s) if (len(x) >= min_term_length and x[0].isalpha() ) ]
# Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
if apply_norm:
norm_function = "l2"
else:
norm_function = None
tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range)
X = tfidf.fit_transform(docs)
terms = []
# store the vocabulary map
v = tfidf.vocabulary_
for i in range(len(v)):
terms.append("")
for term in v.keys():
terms[ v[term] ] = term
return (X,terms)
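A hypothetical call (assumes `from sklearn.feature_extraction.text import TfidfVectorizer` plus the NLTK WordNet data); min_df is lowered because the toy corpus is tiny:

docs = ["Cats chase mice.", "Dogs chase cats.", "Mice eat cheese, cats eat mice."]
X, terms = preprocess(docs, stopwords=["the", "a", "of"], min_df=1, lemmatize=True)
print(X.shape)   # (3, number_of_distinct_terms)
print(terms)     # roughly ['cat', 'chase', 'cheese', 'dog', 'eat', 'mouse']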
def __init__(self):
self.model = WordNetLemmatizer()
moods_dictionary_creation.py — project: LyricsMoodClassifier, author: valeriaalampi
def get_lemma(word):
l = WordNetLemmatizer()
return l.lemmatize(word)
lyrics_tokenization.py — project: LyricsMoodClassifier, author: valeriaalampi
def simple_lemmatizing(tokens):
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(w) for w in tokens]
return lemmatized_tokens
def __lemmatize(self, lemma):
    """
    Internal method that applies nltk.stem.WordNetLemmatizer() to the (word, pos) pair lemma.
    """
    string, tag = lemma
    if tag in ('a', 'n', 'r', 'v'):
        wnl = WordNetLemmatizer()
        string = wnl.lemmatize(string, tag)
    return (string, tag)
######################################################################
# POSITIONING.
def tokenizer(document):
    """
    input: a string
    output: a list of strings
    Converts a string into tokens by performing the following steps:
    1. eliminates non-alphabetical characters
    2. converts to lower case
    3. splits into tokens
    4. lemmatizes each token using nltk.stem.WordNetLemmatizer (via the module-level `lemmatizer` callable)
    """
    text = re.sub('[^a-zA-Z]', ' ', document)
    tokens = text.lower().split()
    tokens = [lemmatizer(tkn) for tkn in tokens]
    return tokens
def __wn_lemmatize(self, lemma):
    """
    Lemmatize lemma using nltk.stem.WordNetLemmatizer(). Always
    returns a (string, pos) pair. Lemmatizes even when the tag
    isn't helpful, by ignoring it for stemming.
    """
    string, tag = lemma
    wnl = WordNetLemmatizer()
    if tag in ('a', 'n', 'r', 'v'):
        string = wnl.lemmatize(string, tag)
    else:
        string = wnl.lemmatize(string)
    return (string, tag)
remove_stopwords_nltk.py — project: review-classification, author: vishnupriyam
def clean_review(review,stopwords):
result = ""
lemmatizer = WordNetLemmatizer()
for word in review:
#converts the word to its lemma form
word = lemmatizer.lemmatize(word)
#adds the word to the resultant review only if its not a stopword
if word not in stopwords:
#removes all non-alphabet characters
word = re.sub('[^A-Za-z ]','',word)
if(len(word) != 0):
result += word+" "
return result
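A hypothetical call: `review` is expected to be an iterable of lowercase tokens and `stopwords` a collection of stopword strings.

print(clean_review(["the", "movies", "were", "amazing", "!"], {"the", "were"}))
# roughly: "movie amazing "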
def lemmatizer(text):
    '''Description: takes a string of descriptions and returns the string with every word lemmatized.
    Parameters: string of descriptions
    Output: string with all words lemmatized (e.g. "meeting" stays "meeting" as a noun but becomes "meet" as a verb)'''
    lemmatizer = WordNetLemmatizer()
    lis = str(text).split(" ")
    lemm_words = [lemmatizer.lemmatize(word) for word in lis]
    return " ".join(lemm_words)
def _lemma_(token):
    # Plain strings go straight to the stemmer (the original Python 2 code
    # also checked for `unicode`, which no longer exists in Python 3).
    if isinstance(token, str):
        return _stem_(token)
    from nltk.corpus import wordnet

    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    p = get_wordnet_pos(token.pos()[0][1])
    if p != wordnet.VERB:
        return _stem_(token[0])
    return wordnet_lemmatizer.lemmatize(token[0], pos=p)
def train_ranker(options):
lemmatizer = WordNetLemmatizer()
words, answers, good_lfs, bad_lfs = load_lf_train(options.data_dir)
r = ranker.LogLinear(options.word_dim, options.embedding_file, options.stopwords_file)
trainer = optimizers[options.optimizer](r.model)
sents = 0
total_loss = 0.0
train_size = len(words)
i = 0
for epoch in range(options.epochs):
for word, answer, good_lf, bad_lf in iter_lf_train(words, answers, good_lfs, bad_lfs):
if len(good_lf) == 0:
continue
lemma = [lemmatizer.lemmatize(w) for w in word]
loss = r.train(word, lemma, good_lf, bad_lf)
sents += 1
if loss is not None:
total_loss += loss.scalar_value()
loss.backward()
trainer.update()
e = float(i) / train_size
if i % options.print_every == 0:
print('epoch {}: loss per sentence: {}'.format(e, total_loss / sents))
sents = 0
total_loss = 0.0
i += 1
print ('saving model...')
save_as = '%s/epoch%03d.ranker' % (options.result_dir, epoch)
r.save_model(save_as)
def find_match_word(hash_content, wordlist):
split_words = []
while len(hash_content) !=0:
#return the index of the matched word
word, index = check_match(hash_content,wordlist)
split_words.append(word)
#remove the matched words from the original tokens
hash_content = hash_content[len(hash_content)*(-1):index]
return split_words
#use WordNetLemmatizer to lemmatize the word
def text_clean(filename):
    '''
    Input: file path of a script.
    Output: list of all words in the script, lowercased and lemmatized.
    '''
    wnl = WordNetLemmatizer()
    with open(filename, 'r', encoding='utf-8', errors='ignore') as infile:
        word_list = [word for line in infile for word in line.split()]
    lemma_list = [wnl.lemmatize(word.lower()) for word in word_list]
    return lemma_list
def preprocess(raw):
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize lemmatizer
    lemma = WordNetLemmatizer()
    # Create English stop word list
    en_stop = get_stop_words('en')
    # Decode Wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # Clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    # Remove stop words from tokens
    tokens = [i for i in tokens if i not in en_stop]
    # Lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # Remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # Drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return tokens
def preprocess_imageclef(raw):
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize lemmatizer
    lemma = WordNetLemmatizer()
    # Create English stop word list
    en_stop = get_stop_words('en')
    # Decode Wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # Clean and tokenize document string
    text = text.lower()
    tokens = tokenizer.tokenize(text)
    # Remove stop words from tokens
    tokens = [i for i in tokens if i not in en_stop]
    # Lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # Remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # Drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return (tokens, text)
def preprocess_wikidata(raw):
    # Initialize tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # Initialize lemmatizer
    lemma = WordNetLemmatizer()
    # Create English stop word list
    en_stop = get_stop_words('en')
    # Decode Wiki markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)
    # Clean and tokenize the document string, keeping only the part before the first '../img/' link
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)
    # Remove stop words from tokens
    tokens = [i for i in tokens if i not in en_stop]
    # Lemmatize tokens
    tokens = [lemma.lemmatize(i) for i in tokens]
    # Remove non-alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]
    # Drop tokens shorter than three characters
    tokens = [i for i in tokens if len(i) > 2]
    return (tokens, text)
Chapter 05_KNN n Naive Bayes.py — project: Statistics-for-Machine-Learning, author: PacktPublishing
def preprocessing(text):
text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
tokens = [word for sent in nltk.sent_tokenize(text2) for word in
nltk.word_tokenize(sent)]
tokens = [word.lower() for word in tokens]
stopwds = stopwords.words('english')
tokens = [token for token in tokens if token not in stopwds]
tokens = [word for word in tokens if len(word)>=3]
stemmer = PorterStemmer()
tokens = [stemmer.stem(word) for word in tokens]
tagged_corpus = pos_tag(tokens)
Noun_tags = ['NN','NNP','NNPS','NNS']
Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
lemmatizer = WordNetLemmatizer()
def prat_lemmatize(token,tag):
if tag in Noun_tags:
return lemmatizer.lemmatize(token,'n')
elif tag in Verb_tags:
return lemmatizer.lemmatize(token,'v')
else:
return lemmatizer.lemmatize(token,'n')
pre_proc_text = " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])
return pre_proc_text
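A hypothetical call (assumes nltk, string, stopwords, pos_tag, PorterStemmer, and WordNetLemmatizer are imported as in the rest of the chapter, and that the relevant NLTK data are downloaded):

print(preprocessing("The children were playing games in the park!"))
# roughly: "child play game park" (exact output depends on the POS tagger)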
def clean_terms(terms, stopwords=None, lemmatize=None, stem=None, only_N_J=None):
    if stopwords is not None:
        terms = [t for t in terms if t not in stopwords]
    if only_N_J is not None:  # keep only terms whose POS tag is in the module-level `tags` collection
        tagged = nltk.pos_tag(terms)
        terms = [t for t, pos in tagged if pos in tags]
    if lemmatize is not None:
        lem = WordNetLemmatizer()
        terms = [lem.lemmatize(t) for t in terms]
    if stem is not None:
        stem = PorterStemmer()
        terms = [stem.stem(t) for t in terms]
    return terms
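A hypothetical call (assumes nltk, WordNetLemmatizer, and PorterStemmer are imported; the only_N_J branch additionally relies on a module-level `tags` collection that is not shown here):

print(clean_terms(["studies", "showed", "better", "results"], lemmatize=True))
# roughly: ['study', 'showed', 'better', 'result']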
def __init__(self):
"""
Initialize members:
question_dist - generalized-question distribution of the assigned extraction
location.
"""
self.question_dist = defaultdict(lambda : defaultdict(lambda : 0))
self.lmtzr = WordNetLemmatizer()
def lemmatize(text):
lemmatizer = WordNetLemmatizer()
return ' '.join(lemmatizer.lemmatize(word) for word in text.split())
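A hypothetical call; with the default (noun) POS, only noun inflections change:

print(lemmatize("the cats were chasing mice"))
# roughly: "the cat were chasing mouse"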