Python 类 LancasterStemmer() 的实例源码

knock72.py 文件源码 项目:100knock2017 作者: tmu-nlp 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def preprocessor_data(data, ids, test=0):
    """Turn labelled text lines into stemmed token lists plus integer labels.

    Each line is lowercased and split on whitespace. The first token is
    parsed as the integer label; the remaining tokens are the words. Words
    found in the English stopword list are dropped and the rest are reduced
    with the Lancaster stemmer.

    Args:
        data: Iterable of strings, one labelled example per line.
        ids: Mapping indexed with every kept stem when ``test == 0``.
        test: When 0 (the default), each kept stem is looked up in ``ids``;
            any other value skips that lookup.

    Returns:
        A ``(data_in_preprocessed, labels)`` tuple: a list of stem lists and
        a parallel list of integer labels. Blank lines contribute to neither.

    Raises:
        ValueError: If the first token of a non-blank line is not an integer.
    """
    stopwords_set = set(stopwords.words('english'))
    stemmer = stem.LancasterStemmer()

    data_in_preprocessed = []
    labels = []

    for line in data:
        # str.lower() returns a new string; its result must be used or the
        # stopword check below never sees capitalised words in lowercase.
        tokens = line.lower().split()
        if not tokens:
            # A blank line carries no label; skip it rather than fail on
            # tokens[0].
            continue
        label, words = tokens[0], tokens[1:]
        labels.append(int(label))

        words_preprocessed = []
        for word in words:
            if word in stopwords_set:
                continue
            lemmatized = stemmer.stem(word)
            if test == 0:
                # Bare lookup kept from the original; presumably ``ids`` is a
                # defaultdict-like mapping that registers the stem on first
                # access -- confirm against the caller.
                ids[lemmatized]
            words_preprocessed.append(lemmatized)
        data_in_preprocessed.append(words_preprocessed)

    return data_in_preprocessed, labels
knock72.py 文件源码 项目:100knock2016 作者: tmu-nlp 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def getFeature(word_list):
    """Count Lancaster stems of the non-stopword entries in ``word_list``.

    Args:
        word_list: Iterable of word strings.

    Returns:
        A plain dict mapping each stem to the number of times it occurred.
    """
    lancaster = stem.LancasterStemmer()
    counts = {}
    for token in word_list:
        if isStopWords(token):
            continue
        key = lancaster.stem(token)
        counts[key] = counts.get(key, 0) + 1
    return counts
sentiment.py 文件源码 项目:fake_news 作者: bmassman 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def stem_text(text):
    """Return ``text`` with every token replaced by its Lancaster stem.

    The text is tokenized with ``tokenize_text`` and the resulting stems are
    joined back together with single spaces.
    """
    from nltk.stem import LancasterStemmer
    stemmer = LancasterStemmer()
    return ' '.join(stemmer.stem(tok) for tok in tokenize_text(text))
knock72.py 文件源码 项目:100knock2017 作者: tmu-nlp 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def preprocessor_words(words):
    """Drop English stopwords from ``words`` and stem what remains.

    Args:
        words: Iterable of word strings.

    Returns:
        A list of Lancaster stems, in input order, for every word that is
        not in the English stopword list.
    """
    english_stopwords = set(stopwords.words('english'))
    lancaster = stem.LancasterStemmer()
    return [
        lancaster.stem(token)
        for token in words
        if token not in english_stopwords
    ]


问题


面经


文章

微信
公众号

扫码关注公众号