def fit_transform(self, X, y=None):
    """Apply document term weighting and normalization on text features.

    Validates ``X``, stores per-document statistics on the instance, and
    returns the matrix produced by ``_smart_tfidf`` for the weighting
    scheme configured in ``self.weighting``.

    Parameters
    ----------
    X : sparse matrix, [n_samples, n_features]
        a matrix of term/token counts
    y : ignored
        Never read by this method.

    Returns
    -------
    X : matrix
        The first value returned by ``_smart_tfidf`` (presumably the
        weighted and normalized counts, with the same layout as the
        input -- confirm against ``_smart_tfidf``).

    Notes
    -----
    The following attributes are set as a side effect:

    - ``dl_`` : result of ``_document_length(X)``.
    - ``df_`` : result of ``_document_frequency(X)``, or ``None`` when
      neither the document scheme nor ``self.compute_df`` asks for it.
    - ``du_`` : per-row count of stored entries (CSR input) or of
      non-zero entries (any other input).
    - ``_n_features`` : ``X.shape[1]``.
    - ``norm_pivot`` : overwritten with the pivot returned by
      ``_smart_tfidf``.
    """
    X = check_array(X, ['csr'], copy=self.copy)
    # Only the document-weighting component is needed here; the call is
    # still made with a full 3-way unpack so the notation gets validated.
    _, scheme_d, _ = _validate_smart_notation(self.weighting)

    self.dl_ = _document_length(X)

    # Document frequencies are computed only when the document scheme is
    # one of 's', 't', 'p', 'd' or when explicitly requested.
    if scheme_d in 'stpd' or self.compute_df:
        self.df_ = _document_frequency(X)
    else:
        self.df_ = None

    if sp.isspmatrix_csr(X):
        # Consecutive differences of indptr give the number of stored
        # entries in each row.
        self.du_ = np.diff(X.indptr)
    else:
        # Number of columns minus the number of zeros in each row.
        self.du_ = X.shape[-1] - (X == 0).sum(axis=1)
    self._n_features = X.shape[1]

    # The sample count is passed along only when document frequencies
    # are available.
    df_n_samples = len(self.dl_) if self.df_ is not None else None

    # NOTE(review): this replaces the user-supplied ``norm_pivot`` with the
    # value returned by ``_smart_tfidf`` -- confirm this is intentional.
    X, self.norm_pivot = _smart_tfidf(X, self.weighting, self.df_,
                                      df_n_samples,
                                      norm_alpha=self.norm_alpha,
                                      norm_pivot=self.norm_pivot,
                                      return_pivot=True)
    return X
评论列表
文章目录