def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000, method='counts', binary=True, sparse=False):
    """Vectorize a text column of ``df`` and store the result on the instance.

    Parameters
    ----------
    df
        Column-indexable table; only ``df[x_name]`` and ``df[y_name]`` are
        read here (presumably a pandas DataFrame -- confirm against callers).
    x_name
        Key of the column holding the raw text documents.
    y_name
        Optional key of the target column. When given, ``self.y`` is set to
        ``np.array(df[y_name])``; when ``None``, ``self.y`` is left untouched.
    ngrams
        Upper bound of the n-gram range; the vectorizer uses ``(1, ngrams)``.
    max_features
        Forwarded to the vectorizer's ``max_features`` argument.
    method
        ``'counts'`` for ``CountVectorizer`` or ``'tfidf'`` for
        ``TfidfVectorizer``.
    binary
        Forwarded to ``CountVectorizer`` only; it has no effect when
        ``method == 'tfidf'``.
    sparse
        If true, ``self.X`` is a ``csr_matrix``; otherwise a dense array.

    Side effects
    ------------
    Sets ``self.vocabulary_``, ``self.data``, ``self.X`` and, when ``y_name``
    is given, ``self.y``. Returns ``None``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'counts'`` nor ``'tfidf'``.
    """
    # choosing the particular flavor of vectorizer
    if method == 'counts':
        vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace', binary=binary)
    elif method == 'tfidf':
        vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace')
    else:
        # Fail early with a clear message instead of hitting an unbound
        # ``vectorizer`` a few lines further down.
        raise ValueError("method must be 'counts' or 'tfidf', got %r" % (method,))

    # fitting the vectorizer; fit_transform yields a sparse document-term matrix
    full_fit = vectorizer.fit_transform(df[x_name])

    # passing the attributes up to the class instance
    self.vocabulary_ = vectorizer.vocabulary_
    self.data = df
    if sparse:
        # Stay sparse end to end: a dense round-trip would allocate the full
        # (documents x features) array only to compress it again.
        self.X = csr_matrix(full_fit)
    else:
        self.X = full_fit.toarray()
    if y_name is not None:
        self.y = np.array(df[y_name])
    return
#splits the data into training and test sets; either called from process()
#or on its own when your text is already vectorized and divided into x and y
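# [web-page residue, not part of the code] Comment list
# [web-page residue, not part of the code] Article table of contents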