Usage examples of the Python class TfidfTransformer(), collected from open-source projects

papyrus_summary_extraction_tool.py (project: Papyrus--simple-but-effective-text-summarization-tool, author: RebeccaMerrett)
def function_2(text):
    paragraphs = text.split('\n\n')
    count_vect = CountVectorizer()
    bow_matrix = count_vect.fit_transform(paragraphs)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix) #term frequency/inverse doc frequency applied
    similarity_graph = normalized_matrix * normalized_matrix.T #cosine similarity between paragraphs (rows are L2-normalized)
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph) #TextRank applied
    ranked = sorted(((scores[i], s) for i, s in enumerate(paragraphs)), reverse=True) #sorts paragraphs from highest to lowest score
    ten_percent = int(round(10.0 / 100.0 * len(ranked)))
    ten_percent_high_scores = ranked[0:ten_percent]
    summary = [x[1] for x in ten_percent_high_scores] #keeps the top-scoring 10% of paragraphs without disturbing their rank order
    return "\n\n".join(summary)

#Text taken from the user's uploaded PDF or URL, cleaned and formatted.
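A quick usage sketch (assuming the imports the snippet relies on; the sample text is synthetic):

import networkx as nx
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

paragraphs = ["Paragraph %d talks mostly about topic %d." % (i, i % 3) for i in range(20)]
print(function_2("\n\n".join(paragraphs)))  # with 20 paragraphs, the summary keeps the 2 top-ranked ones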
WGGraph.py (project: AbTextSumm, author: StevenLOL)
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    docs = []
    for sent, sim in generatedSentences:
        docs.append(sent)
    docs.extend(originalSentences)

    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    simindices = []
    for i in xrange(len(generatedSentences)):
        # cosine similarity of generated sentence i against every original sentence
        simGeneratedScores = linear_kernel(normalized[i], normalized[len(generatedSentences):]).flatten()
        if max(simGeneratedScores) >= threshold:
            simindices.append(i)

    # drop generated sentences that are too similar to an original sentence
    finalGen = [sentence for k, sentence in enumerate(generatedSentences) if k not in simindices]
    return finalGen
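Because TfidfTransformer L2-normalizes rows by default, linear_kernel on its output equals cosine similarity; a quick standalone check (synthetic counts):

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

X = TfidfTransformer().fit_transform(np.array([[3, 0, 1], [2, 0, 0], [0, 1, 4]]))
assert np.allclose(linear_kernel(X, X), cosine_similarity(X, X))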
twenty_news_group.py (project: DataScience-And-MachineLearning-Handbook-For-Coders, author: wxyyxc1992)
def extract_feature(self):
        """
        Extract features from the training documents.
        """

        # build the document-term count matrix
        self.train_dtm = self.count_vect.fit_transform(self.data['train'].data)

        # compute the (L2-normalized) term-frequency matrix
        tf_transformer = TfidfTransformer(use_idf=False)
        self.train_tf = tf_transformer.fit_transform(self.train_dtm)

        # compute the TF-IDF matrix; keep the fitted transformer so predict() can reuse it
        self.tfidf_transformer = TfidfTransformer().fit(self.train_dtm)
        self.train_tfidf = self.tfidf_transformer.transform(self.train_dtm)
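A small standalone check of the two transforms above (synthetic counts): use_idf=False yields L2-normalized term frequencies, while the default adds IDF weighting.

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

counts = np.array([[2, 1, 0], [0, 1, 1]])
print(TfidfTransformer(use_idf=False).fit_transform(counts).toarray())  # rows: counts scaled to unit L2 norm
print(TfidfTransformer().fit_transform(counts).toarray())               # rarer terms get up-weighted by idf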
test.py (project: Emotion-Identification, author: saopayne)
def feature(terms):
    dataMatrix = np.genfromtxt(finaltest, delimiter='|', dtype=None, skip_header=True)
    n = dataMatrix.size
    l = len(terms)
    occurence = np.zeros((n, l), dtype=np.int)
    d = 0
    for row in dataMatrix:
        temp = row[0].lower().decode('UTF-8').split(' ')
        for i in range(l):
            if terms[i] in temp:
                occurence[d][i] += 1
        d += 1
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(occurence)
    occurence = tfidf.toarray()
    np.savetxt('occurencetest.csv', occurence, delimiter=',')

    return occurence, dataMatrix
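The membership loop above sets each cell to at most 1 per row; up to tokenization differences, a vectorized equivalent (a sketch, not the project's code) is CountVectorizer with a fixed vocabulary:

from sklearn.feature_extraction.text import CountVectorizer

# binary=True reproduces the 0/1 "term present in this row" matrix built by the loop
vect = CountVectorizer(vocabulary=terms, binary=True)
occurence = vect.fit_transform(texts).toarray()  # texts: the lowercased row strings (hypothetical name)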
pipelines.py (project: magic, author: pan-webis-de)
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
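The (name, pipeline) tuples returned here and by the other pipeline helpers in this file are shaped to plug into a scikit-learn FeatureUnion; a minimal sketch of combining them (`documents` is a hypothetical list of raw texts):

from sklearn.pipeline import FeatureUnion

features = FeatureUnion([word_unigrams(),
                         avg_spelling_error(lang='en'),
                         punctuation_features()])
X = features.fit_transform(documents)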
result.py (project: Graduation-design, author: Baichenjia)
def Training_model():
    # load the word-count matrix built from the labeled posts
    f = open("f://emotion/mysite/weibo_emotion/emotion_file/data_count.txt")
    f.readline()   # skip the header line
    data = np.loadtxt(f)
    # load the class labels
    f1 = open("f://emotion/mysite/weibo_emotion/emotion_file/data_jixing.txt")
    leibie = np.loadtxt(f1)
    f.close()
    f1.close()

    # TF-IDF weighting
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(data)
    data1 = tfidf.toarray()

    # train the SVM classifier
    clf = svm.SVC()
    clf.fit(data1, leibie)    # training the svc model
    return clf
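Note that only clf is returned, so at prediction time the IDF weights fitted here would have to be reused rather than refit on new data; a sketch of the intended call pattern (variable names are hypothetical):

transformer = TfidfTransformer().fit(train_counts)  # learn IDF from training counts only
clf = svm.SVC().fit(transformer.transform(train_counts).toarray(), labels)
pred = clf.predict(transformer.transform(new_counts).toarray())  # same transformer for unseen posts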
test_termDocMatrixFactory.py (project: scattertext, author: JasonKessler)
def test_main(self):
        categories, documents = get_docs_categories()
        clean_function = lambda text: '' if text.startswith('[') else text
        entity_types = set(['GPE'])
        term_doc_mat = (
            TermDocMatrixFactory(
                category_text_iter=zip(categories, documents),
                clean_function=clean_function,
                nlp=_testing_nlp,
                feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
            ).build()
        )
        clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
        fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                           clean_function=clean_function,
                           feats_from_spacy_doc=FeatsFromSpacyDoc(
                               entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
        tfidf = TfidfTransformer(norm='l1')
        X = tfidf.fit_transform(term_doc_mat._X)
        clf.fit(X, term_doc_mat._y)
        X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
        pred = clf.predict(tfidf.transform(X_to_predict))
        dec = clf.decision_function(X_to_predict)
TermDocMatrix.py (project: scattertext, author: JasonKessler)
def get_logistic_regression_coefs_l2(self, category,
                                         clf=RidgeClassifierCV()):
        ''' Computes l2-penalized logistic regression score.
        Parameters
        ----------
        category : str
            category name to score
        Returns
        -------
            (coefficient array, accuracy, majority class baseline accuracy)
        '''
        from sklearn.cross_validation import cross_val_predict
        y = self._get_mask_from_category(category)
        X = TfidfTransformer().fit_transform(self._X)
        clf.fit(X, y)
        y_hat = cross_val_predict(clf, X, y)
        acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
        return clf.coef_[0], acc, baseline
TermDocMatrix.py (project: scattertext, author: JasonKessler)
def get_logistic_regression_coefs_l1(self, category,
                                         clf=LassoCV(alphas=[0.1, 0.001],
                                                     max_iter=10000,
                                                     n_jobs=-1)):
        ''' Computes l1-penalized logistic regression score.
        Parameters
        ----------
        category : str
            category name to score

        Returns
        -------
            (coefficient array, accuracy, majority class baseline accuracy)
        '''
        from sklearn.cross_validation import cross_val_predict
        y = self._get_mask_from_category(category)
        y_continuous = self._get_continuous_version_boolean_y(y)
        # X = TfidfTransformer().fit_transform(self._X)  # optional TF-IDF weighting, disabled here
        X = self._X

        clf.fit(X, y_continuous)
        y_hat = (cross_val_predict(clf, X, y_continuous) > 0)
        acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
        return clf.coef_, acc, baseline
example_1.py (project: nlp, author: lhyxcxy)
def getTFIDF():
    """
    Compute the TF-IDF weight matrix for the segmented corpus.

    :return: (weight, textList)
    """
    corpus, textList = getFenCiWords()
    vectorizer = CountVectorizer()    # builds the term-count matrix: a[i][j] is the count of word j in document i
    transformer = TfidfTransformer()  # computes TF-IDF weights from the counts
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # first fit_transform builds the counts, the second applies TF-IDF
    word = vectorizer.get_feature_names()  # every feature word in the bag-of-words model
    weight = tfidf.toarray()  # weight[i][j] is the TF-IDF weight of word j in document i
    print "%d documents, %d words" % (len(weight), len(word))
    return weight, textList
    # to inspect individual weights, iterate over documents and words:
    # for i in range(len(weight)):
    #     for j in range(len(word)):
    #         print word[j], weight[i][j]
scikitre.py (project: IBRel, author: lasigeBioTM)
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
        super(ScikitRE, self).__init__()
        self.modelname = relationtype + "_" + modelname
        self.relationtype = relationtype
        self.pairtype = relationtype
        self.corpus = corpus
        self.pairs = []
        self.features = []
        self.labels = []
        self.pred = []
        self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
        self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
        self.generate_data(corpus, modelname, relationtype)
        self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
                                  #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                                  #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                                  #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                                  #('clf', SGDClassifier())
                                  #('clf', svm.NuSVC(nu=0.01 ))
                                   #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                                  ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                                  #('clf', DummyClassifier(strategy="constant", constant=True))
                                 ])
tfidf_feature.py (project: TextClassification, author: mosu027)
def tfidf_feature(xtrain, xtest, stopwords_path):
    """
    tf-idf feature
    """
    xtrain = [" ".join(word) for word in xtrain]
    xtest = [" ".join(word) for word in xtest]
    stopwords = codecs.open(stopwords_path, 'r', encoding='utf-8').readlines()
    stopwords = [word.strip("\n") for word in stopwords]
    vectorizer_train = CountVectorizer(analyzer='word', stop_words=stopwords, min_df=5)
    count_train = vectorizer_train.fit_transform(xtrain)
    vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
    count_test = vectorizer_test.fit_transform(xtest)

    transformer = TfidfTransformer()
    tfidf_train = transformer.fit(count_train).transform(count_train)
    tfidf_test = transformer.transform(count_test)  # reuse the IDF weights fitted on the training counts

    return tfidf_train.toarray(), tfidf_test.toarray()
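The CountVectorizer + TfidfTransformer pair can be collapsed into scikit-learn's TfidfVectorizer, which fits vocabulary and IDF in one object; a sketch of the equivalent train/test handling:

from sklearn.feature_extraction.text import TfidfVectorizer

vec = TfidfVectorizer(analyzer='word', stop_words=stopwords, min_df=5)
tfidf_train = vec.fit_transform(xtrain)  # learns vocabulary and IDF from the training texts
tfidf_test = vec.transform(xtest)        # applies the same vocabulary and IDF to the test texts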
category.py (project: feature_engineering, author: webeng)
def getModels(self):
        with open(self.data_path + '/categories.pkl', 'rb') as f:
            categories = cPickle.load(f)

        with open(self.data_path + '/category_map.pkl', 'rb') as f:
            category_map = cPickle.load(f)

        with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f:
            clf = cPickle.load(f)

        with open(self.data_path + '/count_vect.pkl', 'rb') as f:
            count_vect = cPickle.load(f)

        with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f:
            tfidf_transformer = cPickle.load(f)

        with open(self.data_path + '/tree.pkl', 'rb') as f:
            tree = cPickle.load(f)

        return categories, category_map, clf, count_vect, tfidf_transformer, tree
centroid_w2v.py (project: text-summarizer, author: gaetangate)
def get_topic_idf(self, sentences):
        vectorizer = CountVectorizer()
        sent_word_matrix = vectorizer.fit_transform(sentences)

        transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
        tfidf = transformer.fit_transform(sent_word_matrix)
        tfidf = tfidf.toarray()

        centroid_vector = tfidf.sum(0)
        centroid_vector = np.divide(centroid_vector, centroid_vector.max())
        # print(centroid_vector.max())

        feature_names = vectorizer.get_feature_names()
        word_list = []
        for i in range(centroid_vector.shape[0]):
            if centroid_vector[i] > self.topic_threshold:
                # print(feature_names[i], centroid_vector[i])
                word_list.append(feature_names[i])

        return word_list
test_text.py (project: Parallel-SGD, author: angadgill)
def test_tf_idf_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # this is robust to features with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())
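With smooth_idf=True, scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1 and then L2-normalizes each row; a standalone check of that formula against the transformer:

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

X = np.array([[1, 1, 1], [1, 1, 0], [1, 0, 0]])
df = (X > 0).sum(axis=0)                             # document frequency of each term
idf = np.log((1.0 + X.shape[0]) / (1.0 + df)) + 1.0  # smoothed idf
manual = X * idf
manual = manual / np.linalg.norm(manual, axis=1, keepdims=True)  # l2 row normalization
auto = TfidfTransformer(smooth_idf=True, norm='l2').fit_transform(X).toarray()
assert np.allclose(manual, auto)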
Vectorizer.py (project: kindred, author: jakelever)
def _vectorize(self,corpus,fit):
        assert isinstance(corpus,kindred.Corpus)

        matrices = []
        for feature in self.chosenFeatures:
            assert feature in self.featureInfo.keys()
            featureFunction = self.featureInfo[feature]['func']
            never_tfidf = self.featureInfo[feature]['never_tfidf']
            data = featureFunction(corpus)
            notEmpty = any( len(d)>0 for d in data )
            if fit:
                if notEmpty:
                    self.dictVectorizers[feature] = DictVectorizer()
                    if self.tfidf and not never_tfidf:
                        self.tfidfTransformers[feature] = TfidfTransformer()
                        intermediate = self.dictVectorizers[feature].fit_transform(data)
                        matrices.append(self.tfidfTransformers[feature].fit_transform(intermediate))
                    else:
                        matrices.append(self.dictVectorizers[feature].fit_transform(data))
            else:
                if feature in self.dictVectorizers:
                    if self.tfidf and not never_tfidf:
                        intermediate = self.dictVectorizers[feature].transform(data)
                        matrices.append(self.tfidfTransformers[feature].transform(intermediate))
                    else:
                        matrices.append(self.dictVectorizers[feature].transform(data))

        mergedMatrix = hstack(matrices)
        return mergedMatrix
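hstack here is scipy.sparse.hstack, which concatenates the per-feature matrices column-wise; a minimal illustration with two DictVectorizers (synthetic data):

from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer

m1 = DictVectorizer().fit_transform([{'a': 1}, {'b': 2}])  # shape (2, 2)
m2 = DictVectorizer().fit_transform([{'x': 3}, {'y': 4}])  # shape (2, 2)
print(hstack([m1, m2]).shape)  # (2, 4): one row per instance, columns from both feature sets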
text_classifier.py (project: textar, author: datosgobar)
def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
        """Definido en la declaracion de la clase.

        Attributes:
            texts (list of str): Textos a clasificar.
            ids (list of str): Identificadores únicos para cada texto (debe
                tener la misma longitud que `texts`).
            vocabulary (list): Opcional. Vocabulario a tener en cuenta para la
                vectorización de los textos. Default: usa todas las palabras
                presentes en los textos, salvo los ES_stopwords.txt.
            encoding (str): Codificación de los textos en `texts` y en `ids`.
        """
        this_dir, this_filename = os.path.split(__file__)
        es_stopwords = pd.read_csv(os.path.join(this_dir, 'ES_stopwords.txt'),
                                   header=None, encoding='utf-8')
        es_stopwords = list(np.squeeze(es_stopwords.values))
        self._check_id_length(ids)
        self.vectorizer = CountVectorizer(
            input='content', encoding=encoding, decode_error='strict',
            strip_accents='ascii', lowercase=True, preprocessor=None,
            tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
            analyzer='word', max_df=0.8, min_df=1, max_features=None,
            vocabulary=vocabulary, binary=False)

        self.transformer = TfidfTransformer()
        self.ids = None  # Keeps an ordered list of text ids.
        self.term_mat = None  # Matrix of term counts per text.
        self.tfidf_mat = None  # Matrix of term relevance (TF-IDF).
        self.reload_texts(texts, ids)
sklearn_ex1.py (project: base_function, author: Rockyzsu)
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print len(news.data)
    # print len(news.target)

    # print '*'*10
    # print news.data[0]
    # print '*'*10
    # print news.target[0]
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print x.shape
    # print x[:2]
    print x[:10,:10].toarray()
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print x_tfidf[:10,:10].toarray()


    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain,ytest =train_test_split(x,news.target,test_size = 0.3,random_state=233)

    tf_Xtrain, tf_Xtest, tf_ytrain,tf_ytest =train_test_split(x_tfidf,news.target,test_size = 0.3,random_state=233)


    from sklearn.naive_bayes import MultinomialNB
    mnb =MultinomialNB()
    tf_mnb = MultinomialNB()

    mnb.fit(Xtrain,ytrain)
    tf_mnb.fit(tf_Xtrain,tf_ytrain)
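    # a natural completion (a sketch, not in the original): compare held-out
    # accuracy of the raw-count model and the TF-IDF model
    print(mnb.score(Xtest, ytest))            # accuracy on raw counts
    print(tf_mnb.score(tf_Xtest, tf_ytest))   # accuracy on TF-IDF features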
classifier_svm.py (project: text-classification, author: cahya-wirawan)
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
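Loading the persisted pipeline back for prediction (a usage sketch; docs is a hypothetical list of raw texts):

import joblib

clf = joblib.load(filename + ".pkl")  # the same filename passed to fit()
predicted = clf.predict(docs)         # the pipeline vectorizes, TF-IDF-weights, and classifies in one call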
classifier_bayesian.py (project: text-classification, author: cahya-wirawan)
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', MultinomialNB())
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
features.py (project: AlphaPy, author: ScottFreeLLC)
def cvectorize(f, c, n):
    r"""Use the Count Vectorizer and TF-IDF Transformer.

    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe containing the column ``c``.
    c : str
        Name of the text column in the dataframe ``f``.
    n : int
        The maximum n-gram size (n-grams of length 1 to ``n`` are used).

    Returns
    -------
    new_features : sparse matrix
        The transformed features.

    References
    ----------
    To use count vectorization and TF-IDF, you can find more
    information here [TFE]_.

    .. [TFE] http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

    """
    fc = f[c]
    fc.fillna(BSEP, inplace=True)
    cvect = CountVectorizer(ngram_range=[1, n], analyzer='char')
    cfeat = cvect.fit_transform(fc)
    tfidf_transformer = TfidfTransformer()
    new_features = tfidf_transformer.fit_transform(cfeat).toarray()
    return new_features


#
# Function apply_treatment
#
base.py (project: vec4ir, author: lgalke)
def __init__(self, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False, **kwargs):
        self.tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                      smooth_idf=smooth_idf,
                                      sublinear_tf=sublinear_tf)

        # override defaults since we need the counts here
        self.verbose = kwargs.get('verbose', 0)

        binary = kwargs.pop('binary', False)
        dtype = kwargs.pop('dtype', np.int64)

        # pass remaining args to countvectorizer
        self._init_params(name="TFIDF", binary=binary, dtype=dtype, **kwargs)
twenty_news_group.py (project: DataScience-And-MachineLearning-Handbook-For-Coders, author: wxyyxc1992)
def predict(self, docs):
        """
        Predict the categories of new documents.
        """

        X_new_counts = self.count_vect.transform(docs)

        # reuse the TF-IDF transformer fitted on the training data in extract_feature()
        X_new_tfidf = self.tfidf_transformer.transform(X_new_counts)

        return self.clf.predict(X_new_tfidf)
reuters.py (project: MachineLearningProject, author: ymynem)
def normalize(counts):
    transformer = TfidfTransformer(smooth_idf=True)
    return transformer.fit_transform(counts).toarray()
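Usage sketch with synthetic counts:

counts = [[3, 0, 1], [2, 0, 0], [3, 0, 0]]
print(normalize(counts).shape)  # (3, 3): dense array of L2-normalized TF-IDF rows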
semeval_regression.py (project: semeval2016-task4, author: aesuli)
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()

    data = read_semeval_regression(args.input, encoding='windows-1252')

    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])

    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])

    test = read_test_data(args.test, encoding='windows-1252')

    regressor = pipeline.fit(data[0], data[1])

    y = regressor.predict(test[2])

    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
bayes.py (project: opentc, author: cahya-wirawan)
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', MultinomialNB())
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
svm.py (project: opentc, author: cahya-wirawan)
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9)
train.py (project: Emotion-Identification, author: saopayne)
def feature():
    global termcount
    dataMatrix = np.genfromtxt(finaltrial, delimiter='|', dtype=None, skip_header=True)
    terms = []
    n = dataMatrix.size
    for row in dataMatrix:
        row[0] = row[0].lower().decode('UTF-8')
        temp = row[0].decode('UTF-8').replace(' ', '+')
        temp = (get.urlopen("http://localhost:5095/parser?sentence=" + temp).read()).decode('UTF-8')
        terms.extend([x.split('/')[0] for x in temp.split(' ') if
                      x.split('/')[1] == 'JJ' or x.split('/')[1].startswith('VB')])
        tfidf(temp)
    s = sum(list(termcount.values()))
    termcount = {x: (y * 100 / s) for x, y in zip(termcount.keys(), termcount.values())}
    # terms.extend([x for x in termcount.keys()])
    terms = list(set(terms))
    stop = open('stop.csv', 'r').read().splitlines()
    terms = [x for x in terms if x not in stop]
    l = len(terms)
    occurence = np.zeros((n, l), dtype=np.int)
    d = 0
    for row in dataMatrix:
        temp = row[0].decode('UTF-8').split(' ')
        for i in range(l):
            if terms[i] in temp:
                occurence[d][i] += 1
        d += 1
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(occurence)
    occurence = tfidf.toarray()

    np.savetxt('occurence.csv', occurence, delimiter=',')
    return occurence, dataMatrix, terms
pipelines.py (project: magic, author: pan-webis-de)
def avg_spelling_error(lang=None):
    pipeline = Pipeline([('feature', SpellingError(language=lang)),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('avg_spelling_error', pipeline)
pipelines.py (project: magic, author: pan-webis-de)
def punctuation_features():
    pipeline = Pipeline([('feature', PunctuationFeatures()),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('punctuation_features', pipeline)

