def print_topic_summary(self, df, topic_num, num_words=20):
    '''
    Print a summary of a topic from NMF clustering.
    INPUT:
    df: pandas DataFrame that NMF clustering was run on
    topic_num: index of topic from clustering
    num_words: top n words to print in summary
    '''
    num_reviews = self.labels[:, topic_num].sum()
    if not num_reviews:
        return None
    print('Summary of Topic {}:'.format(topic_num))
    print('Number of reviews in topic: {}'.format(num_reviews))
    print('Top {} words in topic:'.format(num_words))
    print(self.top_words_by_topic(num_words, topic_num))
def new(n_feature=128):
vectorizer = CountVectorizer(
encoding='utf-8',
ngram_range=(1,1), # Unigram only
max_features=n_feature,
binary=True
)
# Fill the gap (missing expected tags)
# ---
# Hypothesis: Some tags are somehow related so
# we smoothen the missing values with matrix factorisation.
smoother = NMF(n_components=n_feature)
# Binarise the vector's individual values
binariser = Binarizer(copy=True)
# Count vectoriser => NMF as smoother => Binariser
print(colored('Taghasher model created','yellow'))
return [vectorizer,smoother,binariser]
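A minimal usage sketch for the three-stage pipeline returned above; `docs`, a list of tag strings, is an assumption:

# Hedged sketch: `docs` (a list of tag strings) is an assumption.
vectorizer, smoother, binariser = new(n_feature=128)
V = vectorizer.fit_transform(docs).toarray()      # binary tag indicators
W = smoother.fit_transform(V)                     # factorise V ~ W @ H
V_smoothed = W.dot(smoother.components_)          # reconstruct to fill gaps
V_binary = binariser.fit_transform(V_smoothed)    # back to 0/1 values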
def test_nmf(eng):
t = linspace(0, 10, 100)
s1 = 1 + absolute(sin(t))
s2 = 1 + square(cos(2*t))
h = c_[s1, s2].T
w = array([[1, 0], [0, 1], [1, 1]])
x = dot(w, h)
x = fromarray(x, engine=eng)
from sklearn.decomposition import NMF as skNMF
nmf = skNMF(n_components=2, random_state=0)
w1 = nmf.fit_transform(x.toarray())
h1 = nmf.components_
xhat1 = dot(w1, h1)
w2, h2 = NMF(k=2, seed=0).fit(x)
xhat2 = dot(w2, h2)
    tol = 1e-1
assert allclose(xhat1, xhat2, atol=tol)
def run_nmf(alignment, num_clusters, alpha, mixing):
model = NMF(n_components = num_clusters, init = 'nndsvd', alpha = alpha, l1_ratio = mixing, verbose = 0)
    return model.fit_transform(alignment)
def fit_nmf(self, df):
'''
Function to run NMF clustering on dataframe
INPUT:
df: pandas Dataframe containing 'lemmatized_text' column for TF-IDF
'''
self.optimize_nmf(df)
self.nmf = NMF(n_components=self.optimum_topics, alpha=self.nmf_alpha,
l1_ratio=self.nmf_l1_ratio, random_state=self.random_state).fit(self.tfidf_matrix)
self.W_matrix = self.nmf.transform(self.tfidf_matrix)
sums = self.W_matrix.sum(axis=1)
self.W_pct = self.W_matrix / sums[:, None]
self.labels = self.W_pct >= 0.20
print "Reconstruction Error: {}".format(self.nmf.reconstruction_err_)
def plot_topic(self, topic_idx):
'''
Function to plot a wordcloud based on a topic
INPUT:
topic_idx: index of topic from NMF clustering
'''
    title = input('Enter a title for this plot: ')
num_reviews = self.labels[:, topic_idx].sum()
word_freq = self.topic_word_frequency(topic_idx)
wc = WordCloud(width=2000, height=1000, max_words=150,
background_color='white')
wc.fit_words(word_freq)
fig = plt.figure(figsize=(16, 8))
ax = fig.add_subplot(111)
ax.set_title('Topic {}: {}\nNumber of Reviews in Topic: {}'.format(
topic_idx, title, num_reviews), fontsize=24)
ax.axis('off')
ax.imshow(wc)
name = 'topic_' + str(topic_idx) + '.png'
if self.pro_or_con == 'pro':
img_path = os.path.join('images', 'positive')
else:
img_path = os.path.join('images', 'negative')
plt.savefig(os.path.join(img_path, name))
plt.show()
def visualize_topics(self, df):
'''
Function to cycle through all topics and print summary and plot cloud
INPUT:
df: pandas DataFrame (source for NMF text)
'''
for i in range(self.optimum_topics):
self.print_topic_summary(df, i)
self.plot_topic(i)
        print()
def build_model(self, baskets, use_probabilities=False):
# print 'build V'
self.__buildV(baskets, use_probabilities)
# print 'density', 1.0 * len(self.V.nonzero()[0]) / (self.V.shape[0] * self.V.shape[1])
sknmf = SKNMF(n_components=self.n_factor, init='random', solver='cd', tol=self.tol, max_iter=self.max_iter,
alpha=self.alpha, l1_ratio=self.l1_ratio, beta=self.beta)
self.W = sknmf.fit_transform(self.V)
self.H = sknmf.components_
self.R = np.dot(self.W, self.H)
self.__state = 'built'
return self
def apply( self, X, k = 2, init_W = None, init_H = None ):
"""
Apply NMF to the specified document-term matrix X.
"""
self.W = None
self.H = None
random_seed = np.random.randint( 1, 100000 )
if not (init_W is None or init_H is None):
model = decomposition.NMF( init="custom", n_components=k, max_iter=self.max_iters, random_state = random_seed )
self.W = model.fit_transform( X, W=init_W, H=init_H )
else:
model = decomposition.NMF( init=self.init_strategy, n_components=k, max_iter=self.max_iters, random_state = random_seed )
self.W = model.fit_transform( X )
self.H = model.components_
def rank_terms( self, topic_index, top = -1 ):
"""
Return the top ranked terms for the specified topic, generated during the last NMF run.
"""
if self.H is None:
raise ValueError("No results for previous run available")
# NB: reverse
top_indices = np.argsort( self.H[topic_index,:] )[::-1]
# truncate if necessary
if top < 1 or top > len(top_indices):
return top_indices
return top_indices[0:top]
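A hedged usage sketch for apply() and rank_terms(); the instance name `model`, the document-term matrix `X`, and the vocabulary list `terms` are assumptions:

model.apply(X, k=5)
for topic_index in range(5):
    top = model.rank_terms(topic_index, top=10)
    print("Topic {}: {}".format(topic_index, ", ".join(terms[i] for i in top)))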
def plot_nmf(data, analyse = True, n_components = 2):
"""Perform NMF and plot overview of the results"""
if analyse:
nmf = sd.NMF(n_components=n_components, init = 'nndsvdar', random_state = 0, solver = 'cd')
Y = nmf.fit_transform(data)
else:
Y = data;
nmf = None;
if n_components is None:
n_components = 3;
if n_components == 1:
plt.subplot(1,3,1);
plt.plot(Y);
elif n_components == 2:
plt.subplot(1,3,1);
plt.scatter(Y[:,0], Y[:,1], c = range(len(Y[:,0])), cmap = plt.cm.Spectral);
else:
ax = plt.gcf().add_subplot(1,3,1, projection = '3d');
ax.scatter(Y[:, 0], Y[:, 1], Y[:,2], c = range(len(Y[:,0])), cmap=plt.cm.Spectral)
plt.title("nmf")
if nmf is not None:
feat = nmf.components_;
plt.subplot(1,3,2);
plt.imshow(feat, interpolation = 'none', aspect = 'auto', cmap = 'viridis')
plt.colorbar(pad = 0.01,fraction = 0.01)
plt.title('features');
plt.subplot(1,3,3);
plt.imshow(Y, interpolation = 'none', aspect = 'auto', cmap = 'viridis')
plt.colorbar(pad = 0.01,fraction = 0.01)
plt.title('amplitudes');
plt.tight_layout();
def finetune(Y, cin, nIter=5):
    """Fine tuning of components within greedyROI using rank-1 NMF:
    alternate non-negative updates of the spatial footprint and the
    temporal trace.
    """
    for _ in range(nIter):
        a = np.maximum(np.dot(Y, cin), 0)
        a = a / np.sqrt(np.sum(a**2))
        cin = np.sum(Y * a[..., np.newaxis], tuple(np.arange(Y.ndim - 1)))
    return a, cin
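A small worked example for the routine above; the (height, width, frames) shape is an assumption:

import numpy as np
Y = np.random.rand(10, 10, 50)      # toy movie patch: 10x10 pixels, 50 frames
cin = np.random.rand(50)            # initial temporal trace
a, c = finetune(Y, cin, nIter=5)    # a: (10, 10) footprint, c: (50,) trace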
def NMF_feature_extraction(text_lst, n_samples, n_features, n_topics, n_top_words):
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(text_lst)
    print("Fitting the NMF model with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
    print("\nTopics in NMF model:")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)
    print("*************end NMF****************")
def NMF_results(data, n_comps=None):
nmf = NMF(n_components=n_comps)
model = nmf.fit(data)
out_data = {'model' : model, 'reconstruction error': nmf.reconstruction_err_ }
return 'NMF', out_data
def nmf(X, n_components=None):
'''
Non Negative Matrix Factorization.
Outputs the weights (W) matrix and the components.
'''
model = NMF(n_components)
W = model.fit_transform(X)
components = model.components_
return W, components
def nmf(X, n_components=None):
model = NMF(n_components)
W = model.fit_transform(X)
components = model.components_
return W, components
def build_analyzer(self):
analyzer = super(TfidfVectorizer, self).build_analyzer()
return lambda doc: (no_plural_stemmer(w) for w in analyzer(doc))
# We use a few heuristics to filter out useless terms early on: the posts
# are stripped of headers, footers and quoted replies; common English
# words, words occurring in only one document, and words occurring in at
# least 95% of the documents are removed.
# Use tf-idf features for NMF.
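A minimal sketch of the filtering described above, assuming the standard 20-newsgroups loader (the stemming analyzer subclass is omitted for brevity):

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

posts = fetch_20newsgroups(remove=('headers', 'footers', 'quotes')).data
vectorizer = TfidfVectorizer(max_df=0.95,         # drop terms in >=95% of docs
                             min_df=2,            # drop terms in a single doc
                             stop_words='english')
tfidf = vectorizer.fit_transform(posts)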
def nmf_accuracy():
tdm = pickle.load(open(DATASET_PATH + "BOW.p", "rb"))
true_labels = pickle.load(open(OUTFILE_STANCE, "rb"))[0]
print("I'm NNMF-ing!")
NNMF = NMF(max_iter=50, n_components=100)
tdm_reshaped = NNMF.fit_transform(tdm)
print("I'm clustering!")
cluster_kmeans(tdm_reshaped, true_labels)
def build_nmf(X, k=5):
mod = NMF(n_components=k)
W = mod.fit_transform(X)
H = mod.components_
return W, H
def build_nmf_all(X, k=5):
scaler = MinMaxScaler()
X_sca = scaler.fit_transform(X)
nmfModel = NMF(n_components=k)
W = nmfModel.fit_transform(X_sca)
H = nmfModel.components_
    print('NMF done!')
# plot_heatmap(H.T, k=k)
labelsNMF = W.argmax(axis=1)
return W, H, labelsNMF, nmfModel
def nmf_test(df):
    X = df.drop(['Year', 'zipcode'], axis=1).values
    scaler = MinMaxScaler()
    X_sca = scaler.fit_transform(X)
    scores = []
    for k in range(2, 11):
        model = NMF(n_components=k)
        W = model.fit_transform(X_sca)
        labels = W.argmax(axis=1)
        score = silhouette_score(X_sca, labels)
        scores.append(score)
    plt.plot(range(2, 11), scores, 'b*-')
    plt.show()
# Source: generate_NMF_6_feature.py, project kaggle-quora-solution-8th (author: qqgeogor)
def svd(train, test, dims=6, it=15, file_name='tf_idf', path='data/'):
    # NB: despite its name, this helper factorises with NMF, not SVD.
    svd = NMF(random_state=1123, n_components=dims)
    svd.fit(train)
    # print svd.transform(train).shape
    pd.to_pickle(svd.transform(train), path + 'train_NMF_' + str(dims) + '_' + file_name + '.pkl')
    pd.to_pickle(svd.transform(test), path + 'test_NMF_' + str(dims) + '_' + file_name + '.pkl')
    return 'Success'
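With the default arguments, the pickles written above can be read back with pandas (paths follow directly from the defaults):

train_nmf = pd.read_pickle('data/train_NMF_6_tf_idf.pkl')
test_nmf = pd.read_pickle('data/test_NMF_6_tf_idf.pkl')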
def check_transformer_data_not_an_array(name, Transformer):
X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
random_state=0, n_features=2, cluster_std=0.1)
X = StandardScaler().fit_transform(X)
# We need to make sure that we have non negative data, for things
# like NMF
X -= X.min() - .1
this_X = NotAnArray(X)
this_y = NotAnArray(np.asarray(y))
_check_transformer(name, Transformer, this_X, this_y)
def check_classifiers_classes(name, Classifier):
X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
X, y = shuffle(X, y, random_state=7)
X = StandardScaler().fit_transform(X)
# We need to make sure that we have non negative data, for things
# like NMF
X -= X.min() - .1
y_names = np.array(["one", "two", "three"])[y]
for y_names in [y_names, y_names.astype('O')]:
if name in ["LabelPropagation", "LabelSpreading"]:
# TODO some complication with -1 label
y_ = y
else:
y_ = y_names
classes = np.unique(y_)
# catch deprecation warnings
with warnings.catch_warnings(record=True):
classifier = Classifier()
if name == 'BernoulliNB':
classifier.set_params(binarize=X.mean())
set_testing_parameters(classifier)
set_random_state(classifier)
# fit
classifier.fit(X, y_)
y_pred = classifier.predict(X)
# training set performance
assert_array_equal(np.unique(y_), np.unique(y_pred))
if np.any(classifier.classes_ != classes):
print("Unexpected classes_ attribute for %r: "
"expected %s, got %s" %
(classifier, classes, classifier.classes_))
def _fit_local(self, data):
from sklearn.decomposition import NMF
nmf = NMF(n_components=self.k, tol=self.tol, max_iter=self.max_iter, random_state=self.seed)
w = nmf.fit_transform(data)
    return w, nmf.components_
def optimize_nmf(self, df):
'''
Function to optimize the number of topics used in NMF clustering.
INPUT:
df: pandas Dataframe containing 'lemmatized_text' column for TF-IDF
'''
self.fit_tfidf(df)
if not self.optimum_topics:
avg_cosine_sim = []
pbar = ProgressBar()
for i in pbar(self.num_topics):
cosine_sim = []
self.nmf = NMF(n_components=i,
alpha=self.nmf_alpha,
l1_ratio=self.nmf_l1_ratio,
random_state=self.random_state).fit(self.tfidf_matrix)
err = self.nmf.reconstruction_err_
self.H_matrix = self.nmf.components_
if i == 1:
avg_cosine_sim.append(1)
else:
                idx_arr = np.arange(i)
                for combo in combinations(idx_arr, 2):
                    # topics are the rows of H (shape: n_topics x n_terms)
                    vect_1 = self.H_matrix[int(combo[0]), :].reshape(1, -1)
                    vect_2 = self.H_matrix[int(combo[1]), :].reshape(1, -1)
                    sim = cosine_similarity(vect_1, vect_2)[0, 0]
                    cosine_sim.append(sim)
                avg_cosine_sim.append(np.mean(cosine_sim))
self.reconstruction_err_array.append(err)
fig = plt.figure(figsize=(16, 8))
ax_1 = fig.add_subplot(211)
ax_1.plot(self.num_topics, self.reconstruction_err_array)
ax_1.set_title("Reconstruction Error vs Number of Topics")
ax_1.set_xlabel("Number of Topics")
ax_1.set_ylabel("Reconstruction Error")
ax_2 = fig.add_subplot(212)
ax_2.plot(self.num_topics, avg_cosine_sim)
ax_2.set_title("Avg Cosine Similarity Between Topics")
ax_2.set_xlabel("Number of Topics")
ax_2.set_ylabel("Avg Cosine Similarity")
plt.tight_layout()
if self.pro_or_con == 'pro':
img_path = os.path.join('images', 'positive')
else:
img_path = os.path.join('images', 'negative')
plt.savefig(os.path.join(img_path, "nmf_metrics.png"))
plt.show()
        self.optimum_topics = int(input("Desired topics from graph: "))
def main():
r_pre = "[your file path]/all_purpose"
f_path = "[your file path]/all_purpose_export.txt"
    p1 = r_pre + r"\.csv\t\d+\t(.*?)(\t\d+){6}"
    p2 = r"(.*?)O\s*\t(.*?)"
extracted_combo_dct = {}
stemmed_extracted_combo_dct = {}
extracted_combo_lst = []
stemmed_extracted_combo_lst = []
n_top_words = 3
n_topics = 20
n_features = 50
f = open(f_path)
for l in f:
r1 = re.search(p1, l)
m1 = ' '.join(r1.group(1).split('\t'))
r2 = re.search(p2, l)
        if r2 is None:
            print(l)
            break  # used to add missing " O"
m2 = ' '.join([e for e in l.split(r2.group(1))[1].split('O')[1].split('\t') if e != ' ']).split('\n')[0]
extracted_combo_dct.setdefault(m1, 0)
stemmed_extracted_combo_dct.setdefault(m2, 0)
extracted_combo_dct[m1] += 1
stemmed_extracted_combo_dct[m2] += 1
extracted_combo_lst.append(m1)
stemmed_extracted_combo_lst.append(m2)
sort_dct_by_value(extracted_combo_dct)
sort_dct_by_value(stemmed_extracted_combo_dct)
n_samples = len(extracted_combo_lst)
n_stemmed_samples = len(stemmed_extracted_combo_lst)
# using NMF feature extraction
NMF_feature_extraction(extracted_combo_lst, n_samples, n_features, n_topics, n_top_words)
NMF_feature_extraction(stemmed_extracted_combo_lst, n_stemmed_samples, n_features, n_topics, n_top_words)
# using LDA feature extraction
LDA_feature_extraction(extracted_combo_lst, n_samples, n_features, n_topics, n_top_words)
LDA_feature_extraction(stemmed_extracted_combo_lst, n_stemmed_samples, n_features, n_topics, n_top_words)
def evaluate(self, matrix):
"""
Args:
matrix (2d array): this is the matrix of documents and tokens
where the number of topics needs to be determined, this has worked
with compressed sparse row matrices before
Returns
topic_count (int): this is the number of topics that IPNMF was
able to pick up heuristically
"""
if self.noise_pct == 'auto':
self._pareto_corpus_content(matrix, .8)
if self.step == 'auto':
self._determine_auto_step_size(matrix)
if self.pnmf_verbose:
print('initializing evaluation...')
self.corpus_count = matrix.shape[0]
self.rich_content = int(self.corpus_count * (1-self.noise_pct))
self.noise_content = self.corpus_count - self.rich_content
topic_array = np.arange(self.start, self.max_steps * self.step +
self.start, self.step)
for topic_count in topic_array:
if self.pnmf_verbose:
print('extracting {} topics...'.format(topic_count))
self.topic_count = topic_count
nmf = NMF(n_components=self.topic_count, init=self.init,
solver=self.solver, tol=self.tol, max_iter=self.max_iter,
random_state=self.random_state, alpha=self.alpha,
l1_ratio=self.l1_ratio, verbose=self.verbose,
shuffle=self.shuffle, nls_max_iter=self.nls_max_iter,
sparseness=self.sparseness, beta=self.beta,
eta=self.eta)
W = nmf.fit_transform(matrix)
self.nmf = nmf
self.topic_labels = np.apply_along_axis(func1d=np.argmax,
axis=1, arr=W)
self.topic_summary = Counter(self.topic_labels)
if self._stopping_condition():
if self.pnmf_verbose:
print('heuristic topic count is {}'
.format(self.topic_count - self.step))
self.topic_count = self.topic_count - self.step
nmf = NMF(n_components=self.topic_count, init=self.init,
solver=self.solver, tol=self.tol,
max_iter=self.max_iter,
random_state=self.random_state, alpha=self.alpha,
l1_ratio=self.l1_ratio, verbose=self.verbose,
shuffle=self.shuffle,
nls_max_iter=self.nls_max_iter,
sparseness=self.sparseness, beta=self.beta,
eta=self.eta)
nmf.fit(matrix)
self.nmf = self.previous_nmf
return self.topic_count
else:
self.previous_nmf = nmf
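A hedged driver for evaluate(); the instance name `ipnmf` and the CSR matrix `dtm` are assumptions, since only the method contract is shown above:

n_topics = ipnmf.evaluate(dtm)
print('heuristic topic count: {}'.format(n_topics))
print(ipnmf.topic_summary)    # Counter of documents per topic label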
def set_testing_parameters(estimator):
# set parameters to speed up some estimators and
# avoid deprecated behaviour
params = estimator.get_params()
if ("n_iter" in params
and estimator.__class__.__name__ != "TSNE"):
estimator.set_params(n_iter=5)
if "max_iter" in params:
warnings.simplefilter("ignore", ConvergenceWarning)
if estimator.max_iter is not None:
estimator.set_params(max_iter=min(5, estimator.max_iter))
# LinearSVR
if estimator.__class__.__name__ == 'LinearSVR':
estimator.set_params(max_iter=20)
# NMF
if estimator.__class__.__name__ == 'NMF':
estimator.set_params(max_iter=100)
# MLP
if estimator.__class__.__name__ in ['MLPClassifier', 'MLPRegressor']:
estimator.set_params(max_iter=100)
if "n_resampling" in params:
# randomized lasso
estimator.set_params(n_resampling=5)
if "n_estimators" in params:
# especially gradient boosting with default 100
estimator.set_params(n_estimators=min(5, estimator.n_estimators))
if "max_trials" in params:
# RANSAC
estimator.set_params(max_trials=10)
if "n_init" in params:
# K-Means
estimator.set_params(n_init=2)
if "decision_function_shape" in params:
# SVC
estimator.set_params(decision_function_shape='ovo')
if estimator.__class__.__name__ == "SelectFdr":
# be tolerant of noisy datasets (not actually speed)
estimator.set_params(alpha=.5)
if estimator.__class__.__name__ == "TheilSenRegressor":
estimator.max_subpopulation = 100
if isinstance(estimator, BaseRandomProjection):
# Due to the jl lemma and often very few samples, the number
# of components of the random matrix projection will be probably
# greater than the number of features.
# So we impose a smaller number (avoid "auto" mode)
estimator.set_params(n_components=1)
if isinstance(estimator, SelectKBest):
# SelectKBest has a default of k=10
# which is more feature than we have in most case.
estimator.set_params(k=1)
if isinstance(estimator, NMF):
if not isinstance(estimator, ProjectedGradientNMF):
estimator.set_params(solver='cd')