def _init_svd(self, dictionary, definitions, n_components=50):
    """Build a tf-idf weighted term-document matrix and reduce its rank.

    One row is filled per entry of ``definitions`` by passing
    ``dictionary[defn]`` to ``self.compute_freq_vec``; the result is
    transposed so that rows are terms and columns are documents. Every term
    row is then scaled by ``log(len(dictionary) / k)``, where ``k`` is the
    number of stored entries in that row (rows with no entries are left
    untouched), and the weighted matrix is factorised with
    ``randomized_svd``.

    Args:
        dictionary: Container indexed by the entries of ``definitions``.
            Its length fixes the number of documents (matrix columns) and
            the numerator of the idf weight.
        definitions: Iterable of keys into ``dictionary``; the i-th key
            fills document ``i``. NOTE(review): presumably the keys of
            ``dictionary`` in a fixed order -- confirm against the caller.
        n_components: Number of components requested from
            ``randomized_svd``. Defaults to 50, the previously hard-coded
            value.

    Side effects:
        Sets ``self.td_matrix`` (CSR, terms x documents), ``self.u``,
        ``self.s``, ``self.vt`` and ``self.doc_matrix`` (one row per
        document, ``self.vt`` transposed with each column scaled by the
        matching entry of ``self.s``). Progress messages are printed to
        stdout.
    """
    n_docs = len(dictionary)

    # LIL format supports cheap row-by-row assignment while building.
    # NOTE(review): assumes compute_freq_vec returns a vector of length
    # self.n_terms -- confirm.
    self.td_matrix = lil_matrix((n_docs, self.n_terms))
    for i, defn in enumerate(definitions):
        if i % 100 == 0:
            print("Building term-document matrix: {} / {}".format(i, n_docs), end="\r")
        self.td_matrix[i, :] = self.compute_freq_vec(dictionary[defn])
    # Switch to terms x documents in CSR so that each term is one row.
    self.td_matrix = self.td_matrix.transpose().tocsr()
    print()

    # Stored entries per term row; this is the document frequency used by
    # the idf weight (same count as getnnz() on each row slice).
    row_nnz = np.diff(self.td_matrix.indptr)
    idf = np.zeros(row_nnz.shape)
    present = row_nnz > 0
    idf[present] = np.log(float(n_docs) / row_nnz[present])
    # CSR keeps each row's values contiguously in ``data``, so repeating
    # every row weight by that row's entry count lines the weights up with
    # ``data`` and scales all rows in one vectorised step instead of one
    # sparse slice-assignment per term.
    self.td_matrix.data *= np.repeat(idf, row_nnz)
    print("Applying td-idf: {} / {}".format(self.n_terms, self.n_terms))

    print("Performing rank reduction...")
    self.u, self.s, self.vt = randomized_svd(
        self.td_matrix, n_components, transpose=False
    )
    # Equivalent to (diag(s) @ vt).T without materialising the dense
    # diagonal matrix: broadcasting scales column j of vt.T by s[j].
    self.doc_matrix = self.vt.T * self.s
评论列表
文章目录