python类TfidfTransformer()的实例源码

pipelines.py 文件源码 项目:magic 作者: pan-webis-de 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def word_bigrams():
    """Build the named word-bigram feature pipeline.

    Text is cleaned by ``TextCleaner``, counted with a ``CountVectorizer``
    restricted to 2-grams, re-weighted by a ``TfidfTransformer`` using
    sublinear term-frequency scaling, and finally passed through
    ``Normalizer``.

    Returns:
        A ``(name, pipeline)`` tuple whose name is ``'word_bigrams'``.
    """
    cleaner_options = {
        'lowercase': True,
        'filter_urls': True,
        'filter_mentions': True,
        'filter_hashtags': True,
        'alphabetic': True,
        'strip_accents': True,
        'filter_rt': True,
    }
    cleaner = TextCleaner(**cleaner_options)
    steps = [
        ('vect', CountVectorizer(preprocessor=cleaner, ngram_range=(2, 2))),
        ('tfidf', TfidfTransformer(sublinear_tf=True)),
        ('scale', Normalizer()),
    ]
    return ('word_bigrams', Pipeline(steps))
pipelines.py 文件源码 项目:magic 作者: pan-webis-de 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def char_ngrams():
    """Build the named character 4-gram feature pipeline.

    Text is cleaned by ``TextCleaner`` (case is preserved), counted with a
    ``CountVectorizer`` using the ``'char_wb'`` analyzer on 4-grams,
    re-weighted by a ``TfidfTransformer`` with sublinear term-frequency
    scaling, and finally passed through ``Normalizer``.

    Returns:
        A ``(name, pipeline)`` tuple whose name is ``'char_ngrams'``.
    """
    cleaner = TextCleaner(filter_urls=True,
                          filter_mentions=True,
                          filter_hashtags=True,
                          lowercase=False)
    counter = CountVectorizer(min_df=1,
                              preprocessor=cleaner,
                              analyzer='char_wb',
                              ngram_range=(4, 4))
    steps = [
        ('vect', counter),
        ('tfidf', TfidfTransformer(sublinear_tf=True)),
        ('scale', Normalizer()),
    ]
    return ('char_ngrams', Pipeline(steps))
Tfidf_count.py 文件源码 项目:Graduation-design 作者: Baichenjia 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def TFIDF_result():
    """Rank the terms of one corpus row by TF-IDF and save the top 100.

    The corpus is the list returned by ``read_handel_list()`` with the value
    of ``read_test_list()`` appended as the last document. TF-IDF weights are
    computed over the whole corpus; the terms of row 30 are filtered, sorted
    by descending weight, and at most the first 100 are written to
    ``result_tfidf.txt`` as ``"<term> <weight rounded to 10 places>"`` lines.

    Returns:
        list of ``(term, weight)`` tuples, best first, at most 100 entries.
    """
    str_handel_list = read_handel_list()
    str_test = read_test_list()

    # Work on a copy so the list returned by read_handel_list() is not mutated.
    corpus = str_handel_list[:]
    corpus.append(str_test)
    print("TF-IDF corpus building success...")

    # Term counts first, then TF-IDF re-weighting of the count matrix.
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()
    # weight[i][j] is the TF-IDF weight of term j in document i.
    weight = tfidf.toarray()
    print("TF-IDF score is calcuated success...")

    # NOTE(review): row 30 is hard-coded; presumably read_handel_list() returns
    # exactly 30 documents so that row 30 is the appended test document --
    # confirm against the caller.
    # NOTE(review): the two string literals below look mis-encoded in this copy
    # of the source; verify them against the original file.
    results = []
    for j, term in enumerate(word):
        if term == '??' or term == '??' or len(term) == 1:
            # Skip the excluded words and all single-character terms.
            continue
        results.append((term, weight[30][j]))
    sorted_results = sorted(results, key=lambda result: result[1], reverse=True)

    # Slicing (instead of indexing 0..99) avoids an IndexError when fewer than
    # 100 terms survive the filter.
    tfidf_results = [(term, score) for term, score in sorted_results[:100]]

    # The context manager closes the file even if a write fails.
    with open("f://emotion/mysite/Label_extract/result_tfidf.txt", 'w+') as fp_tfidf_result:
        for term, score in tfidf_results:
            fp_tfidf_result.write(term + ' ' + str(round(score, 10)))
            fp_tfidf_result.write('\n')
    return tfidf_results
TermDocMatrix.py 文件源码 项目:scattertext 作者: JasonKessler 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def _fit_tfidf_model(self, category, clf):
        """Fit ``clf`` on the TF-IDF weighted version of ``self._X``.

        The target is obtained by turning ``category`` into a mask and then
        converting that mask with ``_get_continuous_version_boolean_y``.
        ``clf`` is fitted in place; nothing is returned.
        """
        category_mask = self._get_mask_from_category(category)
        target = self._get_continuous_version_boolean_y(category_mask)
        weighted_matrix = TfidfTransformer().fit_transform(self._X)
        clf.fit(weighted_matrix, target)
topic_model.py 文件源码 项目:Trendster 作者: rawanhassunah 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def fit_tfidf(count_vector):
    '''
    Fit a ``TfidfTransformer`` with IDF weighting disabled on a count vector.

    Returns the result of ``fit`` (the fitted transformer), not a
    transformed matrix.
    '''
    transformer = TfidfTransformer(use_idf=False)
    return transformer.fit(count_vector)
training_classifier.py 文件源码 项目:Trendster 作者: rawanhassunah 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def fit_tfidf(count_vector):
    '''
    Fit a term-frequency-only transformer (``use_idf=False``) on a count
    vector.

    Returns the result of ``fit`` (the fitted transformer); call its
    ``transform`` method to obtain the TF matrix.
    '''
    return TfidfTransformer(use_idf=False).fit(count_vector)
classifier.py 文件源码 项目:Trendster 作者: rawanhassunah 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def fit_tfidf(count_vector):
    '''
    Fit a ``TfidfTransformer`` configured with ``use_idf=False`` on
    ``count_vector`` and return the result of ``fit``.
    '''
    return TfidfTransformer(use_idf=False).fit(count_vector)
categorize.py 文件源码 项目:IAAT 作者: rfrugte 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def train_sgdc(training_list):
    """Train a count -> TF-IDF -> SGD classifier pipeline.

    Each item of ``training_list`` is indexed as ``item[0]`` (the text) and
    ``item[1]`` (its category).

    Returns:
        The fitted ``Pipeline``.
    """
    # Materialise the pairs once so a one-shot iterable is only consumed once.
    samples = [(item[0], item[1]) for item in training_list]
    texts = [text for text, _ in samples]
    categories = [category for _, category in samples]

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              n_iter=5, random_state=42)),
    ]
    text_clf = Pipeline(steps)
    text_clf.fit(texts, categories)
    return text_clf
baselines.py 文件源码 项目:context_predictive_words 作者: Cogitans 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def parseToBOW():
    """Build bag-of-words and TF-IDF matrices and pickle them to disk.

    Reads the texts from the first element of the object pickled in
    ``OUTFILE``, vectorizes them with ``CountVectorizer(min_df=1)``, and
    writes the count matrix to ``DATASET_PATH + "BOW.p"`` and its TF-IDF
    transform to ``DATASET_PATH + "BOW_TDIDF.p"``. Returns nothing.
    """
    # NOTE(review): pickle.load can execute arbitrary code; OUTFILE must come
    # from a trusted source.
    # Context managers guarantee every handle is closed, including the input
    # file, which was previously never closed.
    with open(OUTFILE, 'rb') as source:
        texts = pickle.load(source)[0]

    vectorizer = CountVectorizer(min_df=1)
    tdm = vectorizer.fit_transform(texts)
    transformer = TfidfTransformer()
    tdidf = transformer.fit_transform(tdm)

    with open(DATASET_PATH + "BOW.p", "wb") as f:
        pickle.dump(tdm, f)
    with open(DATASET_PATH + "BOW_TDIDF.p", "wb") as f:
        pickle.dump(tdidf, f)
feature_extractors.py 文件源码 项目:text-analytics-with-python 作者: dipanjanS 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def tfidf_transformer(bow_matrix):
    """Fit an L2-normalised, smoothed TF-IDF model on ``bow_matrix``.

    Returns:
        A ``(transformer, tfidf_matrix)`` tuple: the fitted
        ``TfidfTransformer`` and the transformed version of ``bow_matrix``.
    """
    settings = {'norm': 'l2', 'smooth_idf': True, 'use_idf': True}
    model = TfidfTransformer(**settings)
    weighted = model.fit_transform(bow_matrix)
    return model, weighted
utils.py 文件源码 项目:event-cui-transfer 作者: mit-ddig 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def transformTFIDF(X_train_all, X_test_all):
    """Apply TF-IDF weighting to train and test bag-of-events data.

    The transformer (L2 norm, IDF enabled, sublinear TF, smoothed IDF) is
    fitted on the training data only and then applied to both sets.

    Arguments
    ---------
    X_train_all: pandas DataFrame
    X_test_all: pandas DataFrame

    Returns
    -------
    X_train_t: CSR matrix
    X_test_t: CSR matrix
    """
    weighter = TfidfTransformer(norm='l2',
                                use_idf=True,
                                sublinear_tf=True,
                                smooth_idf=True)
    # Both inputs are converted to CSR before being handed to the transformer.
    train_counts = scipy.sparse.csr_matrix(X_train_all)
    test_counts = scipy.sparse.csr_matrix(X_test_all)

    # IDF statistics come from the training split only.
    weighter.fit(train_counts)
    train_weighted = weighter.transform(train_counts)
    test_weighted = weighter.transform(test_counts)
    return train_weighted, test_weighted
classifier.py 文件源码 项目:django_text_classifier 作者: django-text-classifier 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def get_pipeline(name):
    """Train a count -> TF-IDF -> multinomial naive Bayes pipeline.

    The training data are the ``body`` (text) and ``target`` (label) columns
    of every ``TrainingSet`` row whose ``classifier`` equals ``name``.

    Returns:
        The fitted ``Pipeline``.
    """
    # Fetch both columns in a single query so each text stays paired with its
    # own label; two independent unordered queries are not guaranteed to
    # return rows in the same order.
    rows = list(
        TrainingSet.objects.filter(classifier=name).values_list('body',
                                                                'target')
    )
    x = [body for body, _ in rows]
    y = [target for _, target in rows]

    pipeline = Pipeline([
         ('vector', CountVectorizer()),
         ('transform', TfidfTransformer()),
         ('bayes', MultinomialNB())
    ])

    pipeline.fit(x, y)

    return pipeline
sentiment_rf.py 文件源码 项目:OpinionMining728 作者: stasi009 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def test_one_rf():
    """Train a TF-IDF + random-forest pipeline and print its evaluation.

    Loads the training set from ``sentidata_train_raw.pkl``, fits the
    pipeline, then prints the out-of-bag score, the classification report and
    confusion matrix on the training data, and the accuracy and
    classification report on ``sentidata_test_raw.pkl``. Returns nothing.
    """
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print("training data loaded")
    print_label_frequency(ytrain_raw)

    ############# create the pipeline
    pipeline = Pipeline([
        # The identity analyzer hands each document to the vectorizer
        # unchanged -- presumably documents are already token lists; confirm.
        ('vect', CountVectorizer(analyzer=lambda x: x, max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1, verbose=1, class_weight='balanced')),
    ])

    ############# train
    pipeline.fit(Xtrain_raw, ytrain_raw)

    ############# check result
    rf = pipeline.steps[-1][1]
    # Previously a bare expression whose value was silently discarded.
    print("OOB score: %s" % rf.oob_score_)

    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print(classification_report(y_true=ytrain_raw, y_pred=ytrain_predict))
    print(confusion_matrix(y_true=ytrain_raw, y_pred=ytrain_predict))

    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    # Previously computed and thrown away; report it alongside the rest.
    print("test accuracy: %s" % accuracy_score(y_true=ytest_raw, y_pred=ytest_predict))
    print(classification_report(y_true=ytest_raw, y_pred=ytest_predict))
example_2.py 文件源码 项目:nlp 作者: lhyxcxy 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def kmeans(class_num):
    """
    Cluster the sentences returned by ``loadFile`` with k-means on TF-IDF
    features.

    :param class_num: number of clusters
    :return: a list of ``class_num`` lists; list ``k`` holds the sentences
             assigned to cluster ``k``
    """
    sentences_words, sentences = loadFile()

    # Term counts, then TF-IDF weights, as a dense array where
    # weight[i][j] is the weight of term j in document i.
    counts = CountVectorizer().fit_transform(sentences_words)
    weight = TfidfTransformer().fit_transform(counts).toarray()

    clf = KMeans(n_clusters=class_num)
    clf.fit(weight)

    class_list = [[] for _ in range(class_num)]
    print(clf.labels_)
    # clf.labels_[i] is the cluster index assigned to document i.
    for position, class_label in enumerate(clf.labels_):
        class_list[class_label].append(sentences[position])
    return class_list
idf_trainer.py 文件源码 项目:LLString 作者: mitll 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def __init__(self,min_df=2,norm="l2"):
        """Create the vectorizer/transformer pair and empty IDF state.

        Args:
            min_df: forwarded to ``CountVectorizer(min_df=...)``.
            norm: forwarded to ``TfidfTransformer(norm=...)``.
        """
        self.cv = CountVectorizer(min_df=min_df)
        # Pass ``norm`` by keyword: TfidfTransformer's parameters are
        # keyword-only in recent scikit-learn releases, and the keyword form
        # works on older releases as well.
        self.tfidf = TfidfTransformer(norm=norm)

        self.LOG_IDF = None
        self.CORPUS_VOCAB = None
        self.OOV_IDF_VAL = 0 #min idf value to assign for out-of-vocabulary terms

        self.IDF_MODEL = dict()
softtfidf.py 文件源码 项目:LLString 作者: mitll 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def compute_query_idf(self,corpus):
        """ Compute IDF from s and t in case you have no externally computed IDF to use

        Fits a ``CountVectorizer`` on ``corpus``, fits a ``TfidfTransformer``
        on the resulting count matrix, and stores the learned ``idf_`` vector
        in ``self.LOG_IDF`` and the vocabulary in ``self.CORPUS_VOCAB``.
        Returns nothing.
        """
        cv = CountVectorizer(min_df = 0.0)
        # fit_transform already returns the count matrix; reusing it avoids a
        # second pass over the corpus (which would also yield an empty matrix
        # for a one-shot iterable).
        freq_term_matrix = cv.fit_transform(corpus)
        self.logger.debug(cv.vocabulary_)
        tfidf = TfidfTransformer(norm="l2")
        tfidf.fit(freq_term_matrix)
        log_idf = tfidf.idf_
        self.LOG_IDF = log_idf
        self.CORPUS_VOCAB = cv.vocabulary_
evaluate_jac.py 文件源码 项目:vae_sparse 作者: rahulk90 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def getTF(dataset):
    """Return the ``idf_`` vector learned from ``dataset['train']``.

    Despite the function name, the value returned is the fitted
    transformer's IDF weights, not term frequencies.
    """
    fitted = TfidfTransformer(norm=None).fit(dataset['train'])
    return fitted.idf_
evaluate_init_final.py 文件源码 项目:vae_sparse 作者: rahulk90 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def getTF(dataset):
    """Fit an un-normalised ``TfidfTransformer`` on ``dataset['train']`` and
    return its ``idf_`` attribute (IDF weights, despite the function name).
    """
    training_counts = dataset['train']
    return TfidfTransformer(norm=None).fit(training_counts).idf_
window_sklearn.py 文件源码 项目:MorphoBabushka 作者: nvanva 项目源码 文件源码 阅读 120 收藏 0 点赞 0 评论 0
def tfidf_pipeline(df, ngram_range, lowercase, binary, min_df=2, max_df=1.0, caps_features=False, pos_features=False, clf=None):
    """Build a mapper -> TF-IDF -> classifier pipeline.

    Args:
        df, ngram_range, lowercase, binary, min_df, max_df, caps_features,
        pos_features: forwarded unchanged to ``mapper``.
        clf: final estimator of the pipeline. When omitted, a new
            ``LinearSVC()`` is created for this call.

    Returns:
        An unfitted ``Pipeline`` with steps ``'mapper'``, ``'scaler'`` and
        ``'clf'``.
    """
    # A default of ``LinearSVC()`` in the signature would be created once at
    # definition time and shared by every pipeline built with the default, so
    # fitting one pipeline would overwrite the classifier of the others.
    if clf is None:
        clf = LinearSVC()
    return Pipeline([
        ('mapper', mapper(df, ngram_range, lowercase, binary, min_df, max_df, caps_features, pos_features)),
        ('scaler', TfidfTransformer()),
        ('clf', clf),
    ])
loadFile.py 文件源码 项目:DRM 作者: JohnZhengHub 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def file2mat(filename):
    """Load review records from ``filename`` and build their TF-IDF matrix.

    Each record returned by ``load`` is indexed by the ``'review'`` key to
    obtain its text. The texts are counted as unigrams and re-weighted with
    TF-IDF.

    Returns:
        A ``(tfidf, aspect_label, rating_label)`` tuple, where the labels are
        whatever ``collect_aspect_label`` and ``collect_rating_label`` return
        for the loaded data.
    """
    tfidf_model = TfidfTransformer()
    counter = CountVectorizer(min_df=1, ngram_range=(1, 1))

    data = load(filename)
    reviews = []
    for record in data:
        reviews.append(record['review'])

    tfidf = tfidf_model.fit_transform(counter.fit_transform(reviews))

    aspect_label = collect_aspect_label(data)
    rating_label = collect_rating_label(data)
    return tfidf, aspect_label, rating_label

# ??wordVec ????? ?????????


问题


面经


文章

微信
公众号

扫码关注公众号