Example source code for the Python class TruncatedSVD()
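The snippets below are collected from open-source GitHub projects and show scikit-learn's TruncatedSVD in context. TruncatedSVD performs linear dimensionality reduction via truncated singular value decomposition and, unlike PCA, does not center the data, so it works directly on large sparse matrices (which is why it is the standard transformer for LSA). Most snippets assume a module-level import such as from sklearn.decomposition import TruncatedSVD. As a baseline, here is a minimal self-contained sketch of canonical usage (illustrative, not taken from any of the projects below):

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "the dog sat on the log",
        "cats chase dogs"]
X = TfidfVectorizer().fit_transform(docs)      # sparse (3, n_terms) matrix
svd = TruncatedSVD(n_components=2, random_state=0)
X_reduced = svd.fit_transform(X)               # dense (3, 2) array
print(svd.explained_variance_ratio_.sum())     # fraction of variance retained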

ClasteringCalculator.py (project: TextStageProcessor, author: mhyhre)
def make_aa_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'affinity_propagation/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        aa_clusterizator = AffinityPropagation(damping=self.aa_damping,
                                               max_iter=self.aa_max_iter,
                                               convergence_iter=self.aa_no_change_stop)

        predict_result = aa_clusterizator.fit_predict(X)
        self.signals.PrintInfo.emit('\nDocuments by cluster:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
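Both clustering methods in this file share the same preprocessing chain: vectorize the texts, reduce them to 2 components with TruncatedSVD (so the result can be drawn by draw_clusters_plot), and L2-normalize so that Euclidean distances behave like cosine similarity. A condensed standalone sketch of that shared pattern (input_texts here is illustrative):

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

input_texts = ["first sample document",
               "second sample document",
               "an unrelated text about clustering"]
X = CountVectorizer().fit_transform(input_texts)
lsa = make_pipeline(TruncatedSVD(n_components=2), Normalizer(copy=False))
X2 = lsa.fit_transform(X)   # shape (3, 2), unit-length rows, ready for clustering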
ClasteringCalculator.py (project: TextStageProcessor, author: mhyhre)
def make_birch_clustering(self, short_filenames, input_texts):

        output_dir = self.output_dir + 'birch/'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        if self.need_tf_idf:
            self.signals.PrintInfo.emit("?????? TF-IDF...")
            idf_filename = output_dir + 'tf_idf.csv'
            msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
            self.signals.PrintInfo.emit(msg)

        vectorizer = CountVectorizer()
        X = vectorizer.fit_transform(input_texts)

        svd = TruncatedSVD(2)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        X = lsa.fit_transform(X)

        birch = Birch(threshold=self.birch_threshold,
                      branching_factor=self.birch_branching_factor,
                      n_clusters=self.birch_clusters_count)

        predict_result = birch.fit_predict(X)
        self.signals.PrintInfo.emit('\nDocuments by cluster:\n')

        clasters_output = ''
        for claster_index in range(max(predict_result) + 1):
            clasters_output += ('Cluster ' + str(claster_index) + ':\n')
            for predict, document in zip(predict_result, short_filenames):
                if predict == claster_index:
                    clasters_output += ('  ' + str(document) + '\n')
            clasters_output += '\n'
        self.signals.PrintInfo.emit(clasters_output)
        self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
        writeStringToFile(clasters_output, output_dir + 'clusters.txt')

        self.draw_clusters_plot(X, predict_result, short_filenames)
__init__.py (project: mlprojects-py, author: srinathperera)
def doPCA(X, output_columns_count):
    # Reduce X to output_columns_count dimensions with truncated SVD (LSA) and
    # L2-normalize; unlike true PCA, TruncatedSVD does not center the data.
    svd = TruncatedSVD(output_columns_count)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    X = lsa.fit_transform(X)
    return X
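Despite its name, doPCA is really LSA: skipping the mean-centering step of true PCA is exactly what lets TruncatedSVD run on sparse matrices without densifying them. A hypothetical call, assuming the module-level imports the snippet relies on (TruncatedSVD, Normalizer, make_pipeline) and an illustrative sparse input:

from scipy.sparse import rand as sparse_rand

X_counts = sparse_rand(100, 5000, density=0.01, format='csr', random_state=0)
X_reduced = doPCA(X_counts, 50)   # dense (100, 50) array with unit-length rows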
dimensionality_reduction.py (project: eezzy, author: 3Blades)
def SVD_results(data, n_comps=None):
    # 'SVD' is presumably an import alias (e.g. for sklearn's TruncatedSVD).
    # Caution: TruncatedSVD itself exposes no reconstruction_err_ attribute
    # (sklearn's NMF does), so this line depends on what 'SVD' refers to here.
    svd = SVD(n_components=n_comps)
    model = svd.fit(data)
    out_data = {'model': model, 'reconstruction error': svd.reconstruction_err_}
    return 'SVD', out_data
tfidf2.py (project: subjectClassify_py, author: haohhxx)
def SVD_Vec(matData, dimension):
    svd = TruncatedSVD(n_components=dimension)
    newData = svd.fit_transform(matData)
    return newData
tfidf.py (project: subjectClassify_py, author: haohhxx)
def SVD_Vec(matData, dimension):
    svd = TruncatedSVD(n_components=dimension)
    newData = svd.fit_transform(matData)
    return newData
tfidf_u.py (project: subjectClassify_py, author: haohhxx)
def SVD_Vec(matData, dimension):
    svd = TruncatedSVD(n_components=dimension)
    newData = svd.fit_transform(matData)
    return newData
classification.py (project: DocumentClassification, author: bahmanh)
def featuresByLSA(features,ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa
learn.py (project: partisan-discourse, author: DistrictDataLabs)
def construct_pipeline(classifier):
    """
    This function creates a feature extraction pipeline that accepts data
    from a CorpusLoader and appends the classification model to the end of
    the pipeline, returning a newly constructed Pipeline object that is
    ready to be fit and trained!
    """

    return Pipeline([
        # Create a Feature Union of Text Stats and Bag of Words
        ('union', FeatureUnion(
            transformer_list = [

                # Pipeline for pulling document structure features
                ('stats', Pipeline([
                    ('stats', TextStats()),
                    ('vect', DictVectorizer()),
                ])),

                # Pipeline for creating a bag of words TF-IDF vector
                ('bow', Pipeline([
                    ('tokens', TextNormalizer()),
                    ('tfidf',  TfidfVectorizer(
                        tokenizer=identity, preprocessor=None, lowercase=False
                    )),
                    ('best', TruncatedSVD(n_components=1000)),
                ])),

            ],

            # weight components in feature union
            transformer_weights = {
                'stats': 0.15,
                'bow': 0.85,
            },
        )),

        # Append the estimator to the end of the pipeline
        ('classifier', classifier),
    ])
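A hypothetical way to use this factory (TextStats, TextNormalizer, and identity are the repo's own helpers, and train_docs/train_labels are assumed names, so this is a usage sketch rather than runnable standalone code):

from sklearn.linear_model import LogisticRegression

model = construct_pipeline(LogisticRegression())
model.fit(train_docs, train_labels)     # assumed corpus and labels
predictions = model.predict(test_docs)  # assumed held-out documents

Note that TruncatedSVD(n_components=1000) here compresses the TF-IDF output before it is weighted against the document-structure features and passed to the classifier.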
feature_vector_space.py (project: Kaggle_HomeDepot, author: ChenglongChen)
def transform(self):
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        X = tfidf.fit_transform(self.obs_corpus)
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)
feature_vector_space.py (project: Kaggle_HomeDepot, author: ChenglongChen)
def transform(self):
        tfidf = self._init_char_ngram_tfidf(self.ngram)
        X = tfidf.fit_transform(self.obs_corpus)
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)


# ------------------------ Cooccurrence LSA -------------------------------
# 1st in CrowdFlower
feature_vector_space.py (project: Kaggle_HomeDepot, author: ChenglongChen)
def transform(self):
        ## tfidf
        tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        X_target = tfidf.fit_transform(self.target_corpus)
        X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        X_svd = svd.fit_transform(X_tfidf)
        return X_svd
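A standalone sketch of the cooccurrence-LSA idea above (the corpora are illustrative stand-ins for query/product pairs): the two TF-IDF views are stacked column-wise so that a single SVD captures co-occurrence structure across both.

import scipy.sparse
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

obs_corpus = ["red running shoes", "wireless optical mouse", "steel garden shovel"]
target_corpus = ["running shoe for women", "mouse wireless usb", "shovel for gardening"]
tfidf = TfidfVectorizer(ngram_range=(1, 2))
X_obs = tfidf.fit_transform(obs_corpus)        # fit_transform is called twice, as in
X_target = tfidf.fit_transform(target_corpus)  # the original: each view gets its own vocabulary
X_all = scipy.sparse.hstack([X_obs, X_target]).tocsr()
X_svd = TruncatedSVD(n_components=2, random_state=0).fit_transform(X_all)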


# -------------------------------- TSNE ------------------------------------------
# 2nd in CrowdFlower (preprocessing_mikhail.py)
generate_svd_20_feature.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def svd(train,test,dims=20,it=15,file_name='tf_idf',path='data/'):
    svd=TruncatedSVD(n_iter=it,random_state=1123,n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train),path+'train_svd_'+str(dims)+'_'+file_name+'.pkl')
    pd.to_pickle(svd.transform(test),path+'test_svd_'+str(dims)+'_'+file_name+'.pkl')
    return 'Success'


generate_svd_100_feature.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def svd(train,test,dims=100,it=15,file_name='tf_idf',path='data/'):
    svd=TruncatedSVD(n_iter=it,random_state=1123,n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train),path+'train_svd_'+str(dims)+'_'+file_name+'.pkl')
    pd.to_pickle(svd.transform(test),path+'test_svd_'+str(dims)+'_'+file_name+'.pkl')
    return 'Success'
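This helper is identical to the one in generate_svd_20_feature.py above except for the default dims. A hypothetical call, assuming pandas is imported as pd at module level and the data/ directory exists (inputs here are illustrative):

from sklearn.feature_extraction.text import TfidfVectorizer

texts_train = ["how do i learn python",
               "what is machine learning",
               "best python machine learning library"]
texts_test = ["learn machine learning with python"]
tfidf = TfidfVectorizer().fit(texts_train)
svd(tfidf.transform(texts_train), tfidf.transform(texts_test),
    dims=2, file_name='tf_idf', path='data/')
# writes data/train_svd_2_tf_idf.pkl and data/test_svd_2_tf_idf.pkl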


build_datasets.py (project: bnp, author: mpearmain)
def buildKB16(n_comp = 200, seed_value = 123):
    ## data
    # read the training/test data  
    print('Importing Data')
    xtrain = pd.read_csv('../input/xtrain_kb6099.csv')
    xtest = pd.read_csv('../input/xtest_kb6099.csv')

    # separate 
    id_train = xtrain.ID; xtrain.drop('ID', axis = 1, inplace = True)
    ytrain = xtrain.target; xtrain.drop('target', axis = 1, inplace = True)
    id_test = xtest.ID; xtest.drop('ID', axis = 1, inplace = True)

    # fit SVD
    svd = TruncatedSVD(n_components=n_comp, n_iter=5, random_state=seed_value)
    svd.fit(xtrain)
    xtrain = svd.transform(xtrain)
    xtest = svd.transform(xtest)

    ## store the results
    # add indices etc
    xtrain = pd.DataFrame(xtrain)
    xtrain['ID'] = id_train
    xtrain['target'] = ytrain

    xtest = pd.DataFrame(xtest)
    xtest['ID'] = id_test

    # save the files
    xtrain.to_csv('../input/xtrain_kb16c' + str(n_comp) + '.csv', index=False, header=True)
    xtest.to_csv('../input/xtest_kb16c' + str(n_comp) + '.csv', index=False, header=True)

    return
AE_ts_model.py (project: AE_ts, author: RobRomijnders)
def plot_z_run(z_run, label):
    from sklearn.decomposition import TruncatedSVD
    f1, ax1 = plt.subplots(2, 1)

    PCA_model = TruncatedSVD(n_components=3).fit(z_run)
    z_run_reduced = PCA_model.transform(z_run)
    # three components are fitted, but only the first two are plotted
    ax1[0].scatter(z_run_reduced[:, 0], z_run_reduced[:, 1], c=label, marker='*', linewidths=0)
    ax1[0].set_title('PCA on z_run')
    from sklearn.manifold import TSNE
    tSNE_model = TSNE(verbose=2, perplexity=80, min_grad_norm=1E-12, n_iter=3000)
    z_run_tsne = tSNE_model.fit_transform(z_run)
    ax1[1].scatter(z_run_tsne[:, 0], z_run_tsne[:, 1], c=label, marker='*', linewidths=0)
    ax1[1].set_title('tSNE on z_run')
    return
cv_gp.py (project: jamespy_py3, author: jskDr)
def cv_gp_kernel(self, kernel, n, cv=5):
        X = self.X
        y = self.y
        Xn = TruncatedSVD(n).fit_transform(X)
        scores = cross_val_score(GaussianProcessClassifier(kernel=kernel), Xn, y, cv=cv)
        return scores
nlp.py (project: Informed-Finance-Canary, author: Darthone)
def tfidf(corpus, corpusKeys, use_dict=False):
    #TODO clean this up
    #discard any stop words - saves on processing
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    stopset.extend([str(x) for x in range(9999)])
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2,3))

    #matrix of input set
    X = (vectorizer.fit_transform(corpus)).toarray()
    size_matrix = X.shape[0]
    # note: this TruncatedSVD model is created here but never fitted or used below
    lsa = TruncatedSVD(n_components=size_matrix, n_iter=100)
    terms = vectorizer.get_feature_names()
    records = []
    if use_dict:
        records = {}

    for i, comp in enumerate(X):
        termsInComp = zip(terms, comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]

        #List with all the terms gathered from the tfidf vectorizer
        termList = [term[0] + '.' for term in sortedTerms]

        # List with Article ID and list of tfidf terms
        if use_dict:
            records[corpusKeys[i]] = ((vader(corpusKeys[i], termList), termList))
        else:
            records.append((vader(corpusKeys[i], termList), termList))
    return records
tfidf_vader.py (project: Informed-Finance-Canary, author: Darthone)
def tfidf():
    qry = (StockArticle.select(Article.id, Article.title, Article.content, Article.date, Stock.id.alias('stock_id'), Stock.ticker, StockArticle).join(Stock, on=(StockArticle.stock_id == Stock.id)).join(Article, on=(StockArticle.article_id == Article.id)).where((Stock.ticker == 'GM.N') | (Stock.ticker == 'TGT.N') | (Stock.ticker == 'UAA') | (Stock.ticker == 'UAA.N'), Article.date > '2015-01-01').naive())
    corpusDict = {article.article_id: article.content for article in qry}
    corpus = list(corpusDict.values())
    corpusKeys = list(corpusDict.keys())  # materialized so corpusKeys[i] below works on Python 3

    #discard any stop words - saves on processing
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    for i in range(9999):
        stopset.append(str(i))
    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2,3))

    #matrix of input set
    X = vectorizer.fit_transform(corpus)
    X = X.toarray()
    size_matrix = X.shape[0]
    # note: this TruncatedSVD model is created but never fitted (the fit call is commented out)
    lsa = TruncatedSVD(n_components=size_matrix, n_iter=100)
    #lsa.fit(X)
    terms = vectorizer.get_feature_names()
    tfidfList = []
    for i, comp in enumerate(X):
        termsInComp = zip(terms,comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]

        #List with all the terms gathered from the tfidf vectorizer
        termList = [term[0] + '.' for term in sortedTerms]

        # List with Article ID and list of tfidf terms
        tfidfList = [corpusKeys[i], termList]

        vader(tfidfList)
d_truncated_SVD.py (project: Machine-Learning, author: zjuzpz)
def getSVD(data):
    svd = TruncatedSVD(n_components=50, n_iter=5)
    matrix = solution(data)
    svd_matrix = svd.fit_transform(matrix)
    return svd_matrix
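Across these snippets n_components is picked by hand (2, 20, 50, 100, 200, 1000). A standalone sketch for choosing it empirically from the cumulative explained variance (threshold and data are illustrative):

import numpy as np
from scipy.sparse import rand as sparse_rand
from sklearn.decomposition import TruncatedSVD

X = sparse_rand(200, 1000, density=0.05, format='csr', random_state=0)
svd = TruncatedSVD(n_components=100, random_state=0).fit(X)
cumulative = np.cumsum(svd.explained_variance_ratio_)
k = int(np.searchsorted(cumulative, 0.80)) + 1   # smallest k keeping ~80% of variance
k = min(k, len(cumulative))                      # guard: threshold may never be reached
print(k, cumulative[k - 1])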

