Example source code for the Python class PorterStemmer()

snowball.py (project: kind2anki, author: prz3m)
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
snowball.py (project: but_sentiment, author: MixedEmotions)
def __init__(self, ignore_stopwords=False):
        _LanguageSpecificStemmer.__init__(self, ignore_stopwords)
        porter.PorterStemmer.__init__(self)
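Both occurrences above are the same NLTK snowball wrapper, whose __init__ delegates to the classic porter.PorterStemmer. A minimal usage sketch through the public SnowballStemmer interface (assuming NLTK is installed):

from nltk.stem.snowball import SnowballStemmer

# "porter" selects the classic Porter algorithm that the __init__ above wraps
stemmer = SnowballStemmer("porter", ignore_stopwords=False)
print(stemmer.stem("running"))  # -> "run"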
Vectorizer.py (project: TextClassification, author: AlgorTroy)
def bag_of_words(list_of_strings, remove_puncs=True, remove_digits=True, remove_alnums=True):

    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()

    # empty bag of words
    bag_of_words = []

    # Iterate for string
    for string in tqdm(list_of_strings):
        string_tokens = custom_tokenizer(string, remove_puncs=remove_puncs, get_unique=True)

        bag_of_words.extend(string_tokens)

    if remove_alnums:
        bag_of_words = [bag for bag in bag_of_words if bag.isalpha()]
    elif remove_digits:
        bag_of_words = [bag for bag in bag_of_words if (not isNumber(bag))]

    bag_of_words.sort()

    # Stem and Lemmatize the data
    bag_of_words_stemmed = []

    for word in bag_of_words:
        try:
            bag_of_words_stemmed.append(porter.stem(lmtz.lemmatize(word)))
        except Exception:
            # fall back to the raw token if lemmatizing or stemming fails
            bag_of_words_stemmed.append(word)

    bag_of_words = list(bag_of_words_stemmed)

    # Remove stop words
    stop = set(stopwords.words('english'))
    print('Removing Stop words...')
    bag_of_words = [bag.strip().lower() for bag in bag_of_words if (bag.strip().lower() not in stop)]

    bow_counter = Counter(bag_of_words)
    bow_counter = OrderedDict(sorted(bow_counter.items()))

    return bow_counter
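The function above depends on project helpers that are not shown here (custom_tokenizer, isNumber, tqdm). A self-contained sketch of the core normalisation and counting step, with a plain str.split() standing in for the missing tokenizer (hypothetical) and requiring the NLTK stopwords and wordnet corpora:

from collections import Counter, OrderedDict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

porter = PorterStemmer()
lmtz = WordNetLemmatizer()
stop = set(stopwords.words('english'))

docs = ["Cats are running faster than dogs", "The dog runs fast"]
words = [w.lower() for doc in docs for w in doc.split() if w.isalpha()]

# lemmatize then stem each non-stop-word token
# (the snippet above filters stop words after stemming instead)
normalised = [porter.stem(lmtz.lemmatize(w)) for w in words if w not in stop]
print(OrderedDict(sorted(Counter(normalised).items())))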
lda_model_calculator.py (project: moviegeek, author: practical-recommender-systems)
def build_lda_model(self, data, docs, n_topics=5):

        texts = []
        tokenizer = RegexpTokenizer(r'\w+')
        for d in data:
            raw = d.lower()

            tokens = tokenizer.tokenize(raw)

            stopped_tokens = self.remove_stopwords(tokens)

            stemmed_tokens = stopped_tokens
            #stemmer = PorterStemmer()
            #stemmed_tokens = [stemmer.stem(token) for token in stopped_tokens]

            texts.append(stemmed_tokens)

        dictionary = corpora.Dictionary(texts)

        corpus = [dictionary.doc2bow(text) for text in texts]

        lda_model = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                                 num_topics=n_topics)

        index = similarities.MatrixSimilarity(corpus)

        self.save_lda_model(lda_model, corpus, dictionary, index)
        self.save_similarities(index, docs)

        return dictionary, texts, lda_model
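The method wires gensim together end to end (corpora.Dictionary, doc2bow, LdaModel, MatrixSimilarity). A minimal self-contained sketch of the same flow, with toy documents and n_topics=2 chosen only for illustration (assuming gensim is installed):

from gensim import corpora, models

docs = [["action", "movie", "hero"],
        ["romantic", "movie", "love"],
        ["hero", "love", "story"]]

dictionary = corpora.Dictionary(docs)               # token -> integer id
corpus = [dictionary.doc2bow(doc) for doc in docs]  # bag-of-words vectors

lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
for topic_id, topic in lda.print_topics():
    print(topic_id, topic)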
keywords.py (project: feature_engineering, author: webeng)
def extract_bigrams(self, text):

        text = self.remove_return_lines_and_quotes(text)
        bigrams = []

        st = PorterStemmer()
        stop = stopwords.words('english')

        more_stop_words = [
            '(', ')', "'s", ',', ':', '<', '>', '.', '-', '&', '*', '...']
        stop = stopwords.words('english')
        stop = stop + more_stop_words

        # note: stem() is applied to the whole text string here; tokenization happens on the next line
        tokens = st.stem(text)
        tokens = nltk.word_tokenize(tokens.lower())
        tokens = [i for i in tokens if i not in stop]
        tokens = [word for word in tokens if len(word) > 2]

        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        finder.apply_freq_filter(2)
        top_bigrams = finder.nbest(bigram_measures.pmi, 1000)

        for bg in top_bigrams:
            bg = " ".join(bg)
            tag = nltk.pos_tag([bg])[0]

            if tag[1] not in ['VBG', 'RB', 'VB', 'VBD', 'VBN', 'VBP', 'VBZ', 'PRP', 'IN', 'DT', 'CC', 'PRP$']:
                bigrams.append(tag[0])

        return bigrams
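The core of the method is the PMI-ranked collocation search. A minimal sketch of just that step (assuming NLTK and its punkt tokenizer data are installed; the sample text is illustrative):

import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

text = ("machine learning models need training data and "
        "good training data makes machine learning work")
tokens = nltk.word_tokenize(text.lower())

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)                    # keep bigrams seen at least twice
print(finder.nbest(bigram_measures.pmi, 10))   # e.g. [('machine', 'learning'), ('training', 'data')]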
build_wiki_classifier.py (project: wikipedia_classifier, author: LouisFoucard)
def stem_tokens(tokens, stemmer = PorterStemmer()):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
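A quick usage sketch with the default Porter stemmer (assuming NLTK is installed):

print(stem_tokens(["running", "flies", "easily"]))   # e.g. ['run', 'fli', 'easili']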
text_preprocessing.py (project: itunes, author: kaminem64)
def k_tokenizer(text):
    # drop non-ASCII characters, then remove hyphens (decode back to str so this works on Python 3)
    text = text.encode('ascii', errors='ignore').decode('ascii').replace('-', '')
    """ We should use a better way to remove non-english words """

    tokenizer = TweetTokenizer(preserve_case=False)
    tokens = tokenizer.tokenize(text)

    # stopset = set(stopwords.words('english'))
    # tokens = [word for word in tokens if not word in stopset]

    """ Synonyms using wordnet """

    mwe_tokenizer = MWETokenizer([('ios', '9'),])
    mwe_tokens = mwe_tokenizer.tokenize(tokens)

    """ We might want to tokenize by sentence and then tag each sentence and aggregate the results """

    """ train -> train_NN train_V"""
    tagged = nltk.pos_tag(mwe_tokens)

    def get_wordnet_pos(treebank_tag):

        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN # we preserve the original form of any unknown word

    wordnet_lemmatizer = WordNetLemmatizer()
    final_doc=[]
    for token, tag in tagged:
        word = tag + '_' + wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        final_doc.append(word)

    # porter = PorterStemmer()
    # final_doc=[]
    # for token in mwe_tokens:
    #     final_doc.append(porter.stem(token))

    return final_doc
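A usage sketch, assuming the imports the tokenizer relies on (nltk, TweetTokenizer, MWETokenizer, WordNetLemmatizer, wordnet) and the required NLTK data are available; the sample review text is illustrative:

# Each token comes back as '<POS tag>_<lemma>', e.g. 'NN_app', 'VBD_crash'.
print(k_tokenizer(u"The app crashed after updating to ios 9"))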
DataHandler.py (project: TextClassification, author: AlgorTroy)
def get_encoded_vector(list_of_words, new_string):

    porter = PorterStemmer()
    lmtz = WordNetLemmatizer()

    if 'START_SEQ' not in list_of_words:
        list_of_words.append('START_SEQ')

    if 'UNKNOWN_WORDS' not in list_of_words:
        list_of_words.append('UNKNOWN_WORDS')

    if 'END_SEQ' not in list_of_words:
        list_of_words.append('END_SEQ')

    tokens = text_to_word_sequence(new_string, lower=True, split=" ")

    # Stem and Lemmatize the data
    token_stemmed = []

    for token in tokens:
        try:
            token_stemmed.append(porter.stem(lmtz.lemmatize(token)))
        except Exception:
            # fall back to the raw token if lemmatizing or stemming fails
            token_stemmed.append(token)

    tokens = list(token_stemmed)

    out = []

    all_unknown_words = True

    for token in tokens:
        if token in list_of_words:
            all_unknown_words = False
            out.append(list_of_words.index(token))
        else:
            out.append(list_of_words.index('UNKNOWN_WORDS'))
    if all_unknown_words:
        print('Sentence not recognised:', new_string)

    out = [list_of_words.index('START_SEQ')] + out + [list_of_words.index('END_SEQ')]
    return out
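A minimal usage sketch; the vocabulary and sentence are illustrative, and text_to_word_sequence is assumed to come from keras.preprocessing.text as in the original module:

vocab = ['i', 'like', 'movi']              # stemmed vocabulary built elsewhere
encoded = get_encoded_vector(vocab, "I like movies")
print(encoded)   # index of START_SEQ, one index per token (or UNKNOWN_WORDS), index of END_SEQ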

