# topic_models.py source file - Python code snippet

def fit(self, X, y, hier):
        """
        Fit the model by Gibbs sampling.

        Runs ``gibbs_sampler_blhslda`` on the corpus and stores its four
        results on the instance as ``theta`` (topic distributions per
        document), ``phi`` (term distributions per topic), ``eta``
        (regression coefficients) and ``loglikelihoods``.

        Parameters
        ----------
        X : array-like, shape = (n_docs, n_terms)
            The document-term matrix. It must expose ``.shape`` and
            ``.sum()``; it is kept on the instance as ``doc_term_matrix``.

        y : array-like, shape = (n_docs, n_labels)
            Response value of every document for every label. Converted to
            a contiguous ``np.intc`` array before it is handed to the
            sampler.

        hier : 1D array-like, size = n_labels
            Label hierarchy given as a parent list: position ``i`` holds the
            parent of label ``i``, with -1 marking the root. Converted to a
            contiguous ``np.intc`` array before it is handed to the sampler.
        """

        # Corpus statistics are stored on the instance before sampling.
        self.doc_term_matrix = X
        corpus_shape = X.shape
        self.n_docs, self.n_terms = corpus_shape
        self.n_tokens = X.sum()

        docs_index, terms_index = self._create_lookups(X)

        # Positional arguments for the sampler, in the order it expects.
        sampler_args = (
            self.n_iter,
            self.n_report_iter,
            self.n_topics,
            self.n_docs,
            self.n_terms,
            self.n_tokens,
            self.alpha,
            self.beta,
            self.mu,
            self.nu2,
            self.b,
            docs_index,
            terms_index,
            np.ascontiguousarray(y, dtype=np.intc),
            np.ascontiguousarray(hier, dtype=np.intc),
            self.seed,
        )

        # iterate
        sampled = gibbs_sampler_blhslda(*sampler_args)
        self.theta, self.phi, self.eta, self.loglikelihoods = sampled