tools.py 文件源码-python代码片段

tools.py 文件源码

python

阅读 45 收藏 0 点赞 0 评论 0

项目：document_classification 作者: scotthlee 项目源码文件源码

def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000, method='counts', binary=True, sparse=False):
        """Vectorize a text column and store the results on the instance.

        Fits a bag-of-words vectorizer on ``df[x_name]`` and saves the
        resulting document-term matrix, the fitted vocabulary, and
        (optionally) the target values as instance attributes.

        Parameters
        ----------
        df : table-like object indexable by column name
            Source data; stored unchanged as ``self.data``.
            NOTE(review): presumably a pandas DataFrame -- confirm with callers.
        x_name : column key
            Column of ``df`` holding the text documents.
        y_name : column key, optional
            Column of ``df`` holding the targets. When omitted, ``self.y``
            is left untouched.
        ngrams : int
            Upper bound of the n-gram range; features span ``(1, ngrams)``.
        max_features : int
            Passed to the vectorizer as ``max_features``.
        method : {'counts', 'tfidf'}
            ``'counts'`` uses ``CountVectorizer``; ``'tfidf'`` uses
            ``TfidfVectorizer``.
        binary : bool
            Forwarded to ``CountVectorizer`` only; it has no effect when
            ``method='tfidf'``.
        sparse : bool
            If True, ``self.X`` is a ``csr_matrix``; otherwise a dense array.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported names.

        Sets ``self.vocabulary_``, ``self.data``, ``self.X`` and, when
        ``y_name`` is given, ``self.y``. Returns None.
        """
        #choosing the particular flavor of vectorizer
        if method == 'counts':
            vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace', binary=binary)
        elif method == 'tfidf':
            vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace')
        else:
            # Fail loudly here instead of hitting an unbound `vectorizer` below.
            raise ValueError("method must be 'counts' or 'tfidf', got %r" % (method,))

        #fitting the vectorizer; fit_transform already yields a sparse matrix
        full_fit = vectorizer.fit_transform(df[x_name])
        self.vocabulary_ = vectorizer.vocabulary_

        #passing the attributes up to the class instance
        self.data = df
        if sparse:
            # Keep the result sparse; densifying first and converting back
            # would allocate the full dense matrix for no benefit.
            self.X = csr_matrix(full_fit)
        else:
            self.X = full_fit.toarray()
        if y_name is not None:
            self.y = np.array(df[y_name])
        return

    #splits the data into training and test sets; either called from process()
    #or on its own when your text is already vectorized and divided into x and y