def make_aa_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'affinity_propagation/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Calculating TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    aa_clusterizator = AffinityPropagation(damping=self.aa_damping,
                                           max_iter=self.aa_max_iter,
                                           convergence_iter=self.aa_no_change_stop)

    predict_result = aa_clusterizator.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')

    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)
    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))

    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def make_birch_clustering(self, short_filenames, input_texts):
    output_dir = self.output_dir + 'birch/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Calculating TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    birch = Birch(threshold=self.birch_threshold,
                  branching_factor=self.birch_branching_factor,
                  n_clusters=self.birch_clusters_count)

    predict_result = birch.fit_predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')

    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)
    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))

    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
def doPCA(X, output_columns_count):
    # Reduce X with truncated SVD (LSA) and L2-normalize the reduced rows
    svd = TruncatedSVD(output_columns_count)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)
    return X
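# Usage sketch for doPCA (the corpus and variable names below are illustrative, not from
# the original project): unlike PCA, TruncatedSVD accepts the sparse document-term matrix
# directly, and the Normalizer in the pipeline L2-normalizes each reduced row.
from sklearn.feature_extraction.text import CountVectorizer

example_docs = ["the cat sat on the mat",
                "dogs and cats play together",
                "matrix factorization for text mining"]
X_counts = CountVectorizer().fit_transform(example_docs)   # sparse (3, n_terms) matrix
X_reduced = doPCA(X_counts, 2)                             # dense array of shape (3, 2)
print(X_reduced.shape)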
def SVD_results(data, n_comps=None):
    svd = SVD(n_components=n_comps)
    model = svd.fit(data)
    # TruncatedSVD has no reconstruction_err_ attribute; report explained variance instead
    out_data = {'model': model,
                'explained variance': model.explained_variance_ratio_.sum()}
    return 'SVD', out_data
def SVD_Vec(matData, dimension):
    svd = TruncatedSVD(n_components=dimension)
    newData = svd.fit_transform(matData)
    return newData
def featuresByLSA(features, ncomponents=100):
    svd = TruncatedSVD(n_components=ncomponents)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    dtm_lsa = lsa.fit_transform(features)
    return dtm_lsa
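# Usage sketch for featuresByLSA (assumption: the names `texts`, `X_tfidf` and `sims`
# are illustrative only). Because the pipeline ends with an L2 Normalizer, dot products
# between rows of the returned matrix are cosine similarities between documents.
from sklearn.feature_extraction.text import TfidfVectorizer

texts = ["truncated svd for latent semantic analysis",
         "latent semantic analysis relies on svd",
         "gradient boosting works well on tabular data"]
X_tfidf = TfidfVectorizer().fit_transform(texts)
dtm_lsa = featuresByLSA(X_tfidf, ncomponents=2)
sims = dtm_lsa @ dtm_lsa.T          # pairwise cosine similarities, shape (3, 3)
print(sims.round(2))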
def construct_pipeline(classifier):
    """
    This function creates a feature extraction pipeline that accepts data
    from a CorpusLoader and appends the classification model to the end of
    the pipeline, returning a newly constructed Pipeline object that is
    ready to be fit and trained!
    """

    return Pipeline([
        # Create a Feature Union of Text Stats and Bag of Words
        ('union', FeatureUnion(
            transformer_list=[

                # Pipeline for pulling document structure features
                ('stats', Pipeline([
                    ('stats', TextStats()),
                    ('vect', DictVectorizer()),
                ])),

                # Pipeline for creating a bag of words TF-IDF vector
                ('bow', Pipeline([
                    ('tokens', TextNormalizer()),
                    ('tfidf', TfidfVectorizer(
                        tokenizer=identity, preprocessor=None, lowercase=False
                    )),
                    ('best', TruncatedSVD(n_components=1000)),
                ])),

            ],

            # weight components in feature union
            transformer_weights={
                'stats': 0.15,
                'bow': 0.85,
            },
        )),

        # Append the estimator to the end of the pipeline
        ('classifier', classifier),
    ])
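# Usage sketch for construct_pipeline, following the docstring above. `docs` and `labels`
# are hypothetical placeholders for data coming out of the CorpusLoader, and TextStats /
# TextNormalizer are project-specific transformers assumed to be importable; any sklearn
# classifier can be passed in.
from sklearn.linear_model import LogisticRegression

model = construct_pipeline(LogisticRegression(max_iter=1000))
model.fit(docs, labels)                 # docs: documents, labels: target classes
predictions = model.predict(docs)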
def transform(self):
    tfidf = self._init_word_ngram_tfidf(self.ngram)
    X = tfidf.fit_transform(self.obs_corpus)
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
    return svd.fit_transform(X)
def transform(self):
    tfidf = self._init_char_ngram_tfidf(self.ngram)
    X = tfidf.fit_transform(self.obs_corpus)
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
    return svd.fit_transform(X)
# ------------------------ Cooccurrence LSA -------------------------------
# 1st in CrowdFlower
def transform(self):
    ## tfidf
    tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
    X_obs = tfidf.fit_transform(self.obs_corpus)
    X_target = tfidf.fit_transform(self.target_corpus)
    X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
    ## svd
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
    X_svd = svd.fit_transform(X_tfidf)
    return X_svd
# -------------------------------- TSNE ------------------------------------------
# 2nd in CrowdFlower (preprocessing_mikhail.py)
# Source file: generate_svd_20_feature.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def svd(train, test, dims=20, it=15, file_name='tf_idf', path='data/'):
    svd = TruncatedSVD(n_iter=it, random_state=1123, n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train), path + 'train_svd_' + str(dims) + '_' + file_name + '.pkl')
    pd.to_pickle(svd.transform(test), path + 'test_svd_' + str(dims) + '_' + file_name + '.pkl')
    return 'Success'
# In[3]:
# Source file: generate_svd_100_feature.py (project: kaggle-quora-solution-8th, author: qqgeogor)
def svd(train, test, dims=100, it=15, file_name='tf_idf', path='data/'):
    svd = TruncatedSVD(n_iter=it, random_state=1123, n_components=dims)
    svd.fit(train)
    pd.to_pickle(svd.transform(train), path + 'train_svd_' + str(dims) + '_' + file_name + '.pkl')
    pd.to_pickle(svd.transform(test), path + 'test_svd_' + str(dims) + '_' + file_name + '.pkl')
    return 'Success'
# In[12]:
def buildKB16(n_comp=200, seed_value=123):
    ## data
    # read the training/test data
    print('Importing Data')
    xtrain = pd.read_csv('../input/xtrain_kb6099.csv')
    xtest = pd.read_csv('../input/xtest_kb6099.csv')

    # separate id and target columns
    id_train = xtrain.ID; xtrain.drop('ID', axis=1, inplace=True)
    ytrain = xtrain.target; xtrain.drop('target', axis=1, inplace=True)
    id_test = xtest.ID; xtest.drop('ID', axis=1, inplace=True)

    # fit SVD
    svd = TruncatedSVD(n_components=n_comp, n_iter=5, random_state=seed_value)
    svd.fit(xtrain)
    xtrain = svd.transform(xtrain)
    xtest = svd.transform(xtest)

    ## store the results
    # add indices etc
    xtrain = pd.DataFrame(xtrain)
    xtrain['ID'] = id_train
    xtrain['target'] = ytrain

    xtest = pd.DataFrame(xtest)
    xtest['ID'] = id_test

    # save the files
    xtrain.to_csv('../input/xtrain_kb16c' + str(n_comp) + '.csv', index=False, header=True)
    xtest.to_csv('../input/xtest_kb16c' + str(n_comp) + '.csv', index=False, header=True)
    return
def plot_z_run(z_run, label):
    from sklearn.decomposition import TruncatedSVD
    from sklearn.manifold import TSNE

    f1, ax1 = plt.subplots(2, 1)

    # PCA-style projection via TruncatedSVD
    PCA_model = TruncatedSVD(n_components=3).fit(z_run)
    z_run_reduced = PCA_model.transform(z_run)
    ax1[0].scatter(z_run_reduced[:, 0], z_run_reduced[:, 1], c=label, marker='*', linewidths=0)
    ax1[0].set_title('PCA on z_run')

    # t-SNE embedding
    tSNE_model = TSNE(verbose=2, perplexity=80, min_grad_norm=1E-12, n_iter=3000)
    z_run_tsne = tSNE_model.fit_transform(z_run)
    ax1[1].scatter(z_run_tsne[:, 0], z_run_tsne[:, 1], c=label, marker='*', linewidths=0)
    ax1[1].set_title('tSNE on z_run')
    return
def cv_gp_kernel(self, kernel, n, cv=5):
    X = self.X
    y = self.y
    Xn = TruncatedSVD(n).fit_transform(X)
    cv = cross_val_score(GaussianProcessClassifier(kernel=kernel), Xn, y, cv=cv)
    return cv
def tfidf(corpus, corpusKeys, use_dict=False):
    #TODO clean this up
    # discard any stop words - saves on processing
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    stopset.extend([str(x) for x in range(9999)])

    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2, 3))

    # matrix of input set
    X = (vectorizer.fit_transform(corpus)).toarray()
    size_matrix = X.shape[0]
    lsa = TruncatedSVD(n_components=size_matrix, n_iter=100)

    terms = vectorizer.get_feature_names()

    records = []
    if use_dict:
        records = {}

    for i, comp in enumerate(X):
        termsInComp = zip(terms, comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:10]

        # list with all the terms gathered from the tfidf vectorizer
        termList = [term[0] + '.' for term in sortedTerms]

        # record keyed by article ID with its list of tfidf terms
        if use_dict:
            records[corpusKeys[i]] = (vader(corpusKeys[i], termList), termList)
        else:
            records.append((vader(corpusKeys[i], termList), termList))
    return records
def tfidf():
    qry = (StockArticle.select(Article.id, Article.title, Article.content, Article.date,
                               Stock.id.alias('stock_id'), Stock.ticker, StockArticle)
           .join(Stock, on=(StockArticle.stock_id == Stock.id))
           .join(Article, on=(StockArticle.article_id == Article.id))
           .where((Stock.ticker == 'GM.N') | (Stock.ticker == 'TGT.N') |
                  (Stock.ticker == 'UAA') | (Stock.ticker == 'UAA.N'),
                  Article.date > '2015-01-01')
           .naive())

    corpusDict = {article.article_id: article.content for article in qry}
    # materialize keys/values as lists so they can be indexed below
    corpus = list(corpusDict.values())
    corpusKeys = list(corpusDict.keys())

    # discard any stop words - saves on processing
    stopset = list(stopwords.words('english'))
    stopset.append('000')
    for i in range(9999):
        stopset.append(str(i))

    vectorizer = TfidfVectorizer(stop_words=stopset, use_idf=True, ngram_range=(2, 3))

    # matrix of input set
    X = vectorizer.fit_transform(corpus)
    X = X.toarray()
    size_matrix = X.shape[0]
    lsa = TruncatedSVD(n_components=size_matrix, n_iter=100)
    #lsa.fit(X)

    terms = vectorizer.get_feature_names()

    tfidfList = []
    for i, comp in enumerate(X):
        termsInComp = zip(terms, comp)
        sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:10]

        # list with all the terms gathered from the tfidf vectorizer
        termList = [term[0] + '.' for term in sortedTerms]

        # article ID paired with its list of tfidf terms
        tfidfList = [corpusKeys[i], termList]
        vader(tfidfList)
def getSVD(data):
    svd = TruncatedSVD(n_components=50, n_iter=5)
    matrix = solution(data)
    svd_matrix = svd.fit_transform(matrix)
    return svd_matrix