def fit(self, X, y=None):
    """Learn the document length and document frequency vector
    (if necessary).

    The input is passed through ``check_array`` (with ``copy=self.copy``)
    and the following attributes are then set on the estimator:

    * ``dl_`` -- result of ``_document_length(X)``.
    * ``df_`` -- result of ``_document_frequency(X)`` when the document
      part of ``self.weighting`` is found in ``'stp'`` or
      ``self.compute_df`` is truthy; ``None`` otherwise.
    * ``du_`` -- per-row number of stored entries for CSR input,
      otherwise the per-row count of entries that are not equal to zero.
    * ``_n_features`` -- number of columns of ``X``.

    When the normalization part of ``self.weighting`` ends with ``'p'``
    and ``self.norm_pivot`` is ``None``, the pivot is obtained from
    ``_smart_tfidf`` and stored in ``self.norm_pivot``.

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts
    y : not used by this method

    Returns
    -------
    self
    """
    X = check_array(X, ['csr'], copy=self.copy)
    # Only the document and normalization parts of the notation are
    # consulted here; the term part is discarded.
    _, doc_scheme, norm_scheme = _validate_smart_notation(self.weighting)

    self.dl_ = _document_length(X)

    needs_df = doc_scheme in 'stp' or self.compute_df
    self.df_ = _document_frequency(X) if needs_df else None

    if not sp.isspmatrix_csr(X):
        # Row width minus the number of zero entries in each row.
        self.du_ = X.shape[-1] - (X == 0).sum(axis=1)
    else:
        # Consecutive differences of indptr give the stored entries per row.
        self.du_ = np.diff(X.indptr)
    self._n_features = X.shape[1]

    # The sample count is only forwarded together with a document
    # frequency vector.
    df_n_samples = None if self.df_ is None else len(self.dl_)

    if not norm_scheme.endswith('p') or self.norm_pivot is not None:
        # Either no pivoted normalization, or the pivot was provided.
        return self

    # Need to compute the pivot if it's not provided
    _, pivot = _smart_tfidf(
        X,
        self.weighting,
        self.df_,
        df_n_samples,
        norm_alpha=self.norm_alpha,
        norm_pivot=self.norm_pivot,
        return_pivot=True,
    )
    self.norm_pivot = pivot
    return self
评论列表
文章目录