tfidf_feature.py 文件源码-python代码片段

tfidf_feature.py 文件源码

python

阅读 113 收藏 0 点赞 0 评论 0

项目：TextClassification 作者: mosu027 项目源码文件源码

def tfidf_feature(xtrain, xtest, stopwords_path):
    """
    Build dense TF-IDF feature matrices for a train/test split.

    Each document is expected to be a sequence of string tokens; the tokens
    are joined with single spaces before being handed to the vectorizer.

    The vocabulary (terms with document frequency >= 5, stop words removed)
    and the IDF weights are learned from ``xtrain`` only and then applied
    unchanged to ``xtest``, so both matrices share the same columns and the
    same weighting and no test-set statistics leak into the features.

    :param xtrain: iterable of tokenised training documents.
    :param xtest: iterable of tokenised test documents.
    :param stopwords_path: path to a UTF-8 text file with one stop word
        per line.
    :return: tuple ``(tfidf_train, tfidf_test)`` of dense arrays as produced
        by ``.toarray()`` on the transformed sparse matrices.
    """
    train_docs = [" ".join(tokens) for tokens in xtrain]
    test_docs = [" ".join(tokens) for tokens in xtest]

    # Use a context manager so the stop-word file is always closed.
    with codecs.open(stopwords_path, 'r', encoding='utf-8') as stopword_file:
        # strip() also removes "\r" from CRLF files; blank lines are skipped
        # because an empty string can never match a token.
        stopwords = [line.strip() for line in stopword_file if line.strip()]

    vectorizer = CountVectorizer(analyzer='word', stop_words=stopwords, min_df=5)
    count_train = vectorizer.fit_transform(train_docs)
    # Reuse the fitted vectorizer: same vocabulary and column order as train.
    count_test = vectorizer.transform(test_docs)

    transformer = TfidfTransformer()
    tfidf_train = transformer.fit_transform(count_train)
    # Apply the IDF learned on the training set; do NOT refit on test data.
    tfidf_test = transformer.transform(count_test)

    return tfidf_train.toarray(), tfidf_test.toarray()