def evaluate(self, matrix):
    """
    Heuristically determine how many topics the corpus contains.

    NMF models are fit with an increasing number of components
    (``start``, ``start + step``, ... for at most ``max_steps``
    candidates).  After each fit every row of ``matrix`` is labelled
    with its highest-weighted component and ``_stopping_condition`` is
    consulted.  As soon as it fires, the model from the previous
    (smaller) candidate is kept and its topic count is returned.

    Besides the return value, this updates ``corpus_count``,
    ``rich_content``, ``noise_content``, ``topic_count``, ``nmf``,
    ``previous_nmf``, ``topic_labels`` and ``topic_summary`` on the
    instance.

    Args:
        matrix (2d array): this is the matrix of documents and tokens
            where the number of topics needs to be determined, this has
            worked with compressed sparse row matrices before

    Returns:
        topic_count (int): this is the number of topics that IPNMF was
            able to pick up heuristically.  If the stopping condition
            never fires, the largest topic count that was evaluated is
            returned; if there were no candidates at all
            (e.g. ``max_steps == 0``) the result is ``None``.
    """
    # The helpers presumably replace the 'auto' placeholders with
    # numeric values; the arithmetic below requires numbers.
    if self.noise_pct == 'auto':
        self._pareto_corpus_content(matrix, .8)
    if self.step == 'auto':
        self._determine_auto_step_size(matrix)
    if self.pnmf_verbose:
        print('initializing evaluation...')

    self.corpus_count = matrix.shape[0]
    self.rich_content = int(self.corpus_count * (1 - self.noise_pct))
    self.noise_content = self.corpus_count - self.rich_content

    topic_array = np.arange(self.start,
                            self.max_steps * self.step + self.start,
                            self.step)

    # Model accepted on the previous iteration of *this* call.  Tracked
    # locally so a stale ``self.previous_nmf`` from an earlier call can
    # never be mistaken for it.
    prior_model = None

    for topic_count in topic_array:
        if self.pnmf_verbose:
            print('extracting {} topics...'.format(topic_count))
        self.topic_count = topic_count
        nmf = NMF(n_components=self.topic_count, init=self.init,
                  solver=self.solver, tol=self.tol,
                  max_iter=self.max_iter,
                  random_state=self.random_state, alpha=self.alpha,
                  l1_ratio=self.l1_ratio, verbose=self.verbose,
                  shuffle=self.shuffle, nls_max_iter=self.nls_max_iter,
                  sparseness=self.sparseness, beta=self.beta,
                  eta=self.eta)
        W = nmf.fit_transform(matrix)
        self.nmf = nmf
        # Label each row with the index of its largest component
        # (vectorised; same result as a per-row argmax).
        self.topic_labels = np.argmax(W, axis=1)
        self.topic_summary = Counter(self.topic_labels)

        if not self._stopping_condition():
            # Candidate accepted: remember it and try a larger model.
            prior_model = nmf
            self.previous_nmf = nmf
            continue

        if prior_model is None:
            # The very first candidate already trips the heuristic.
            # Nothing smaller was evaluated, so keep this model rather
            # than reporting ``start - step`` (possibly <= 0) with no
            # fitted model behind it.
            if self.pnmf_verbose:
                print('heuristic topic count is {}'
                      .format(self.topic_count))
            return self.topic_count

        # Overshot by one step: fall back to the previous candidate.
        # Its fitted model is already in hand, so no refit is needed.
        self.topic_count = self.topic_count - self.step
        if self.pnmf_verbose:
            print('heuristic topic count is {}'
                  .format(self.topic_count))
        self.nmf = prior_model
        # NOTE(review): topic_labels / topic_summary still describe the
        # rejected (larger) model here, as in the original code --
        # confirm whether callers expect them to match ``self.nmf``.
        return self.topic_count

    if prior_model is None:
        # Empty candidate schedule: nothing was evaluated.
        return None
    # Step budget exhausted without the heuristic firing; the last
    # (largest) model is in ``self.nmf``.
    return self.topic_count
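# (Leftover web-page navigation labels, not part of the program:
# "Comment list", "Article table of contents".)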