def voc_count_bag(self):
    """Build a term-count bag from the pickled training set and persist it.

    Loads the pickled data set found at ``wordbag_path + trainset_name``,
    copies its ``target_name``, ``label`` and ``filenames`` onto
    ``self.vocabulary_count_bag``, vectorizes ``content`` with
    ``CountVectorizer`` and pickles the resulting bag to
    ``wordbag_path + vocabulary_count_bag_name``.

    Returns:
        None. If ``wordbag_path``, ``vocabulary_count_bag_name`` or
        ``stopword_path`` is an empty string, a message is printed and the
        method returns without reading or writing anything.
    """
    # NOTE(review): trainset_name is used below but is not part of this
    # check - confirm whether it should be validated as well.
    required_settings = (
        self.wordbag_path,
        self.vocabulary_count_bag_name,
        self.stopword_path,
    )
    if any(setting == "" for setting in required_settings):
        print("wordbag_path(????????) or vocabulary_count_bag_name(?????????) or stopword_path(??????) can not be empty.")
        return

    # SECURITY: pickle.load executes arbitrary code embedded in the file;
    # only point wordbag_path at data produced by this project.
    # The context manager closes the handle even if unpickling raises.
    with open(self.wordbag_path + self.trainset_name, 'rb') as trainset_file:
        self.data_set = pickle.load(trainset_file)

    # Carry the data set's metadata over to the bag that will be saved.
    bag = self.vocabulary_count_bag
    bag.target_name = self.data_set.target_name
    bag.label = self.data_set.label
    bag.filenames = self.data_set.filenames

    corpus = self.data_set.content
    stopword_list = self.getstopword(self.stopword_path)

    # Count term occurrences per document, skipping the stop words.
    vectorizer = CountVectorizer(stop_words=stopword_list, max_df=500,
                                 min_df=1, max_features=10000)
    count_matrix = vectorizer.fit_transform(corpus)
    bag.vcm = count_matrix
    # Per-term totals over all documents. toarray() builds a dense copy of
    # the matrix first, which can be memory-hungry for a large corpus.
    bag.vcm_sum = count_matrix.toarray().sum(axis=0)
    bag.vocabulary = vectorizer.get_feature_names()

    if not os.path.exists(self.wordbag_path):
        os.makedirs(self.wordbag_path)

    # The context manager closes (and flushes) the handle even if
    # pickling raises part-way through.
    with open(self.wordbag_path + self.vocabulary_count_bag_name, 'wb') as bag_file:
        pickle.dump(self.vocabulary_count_bag, bag_file)

    print("????????vocabulary_count_bag???wordbag_path???????vocabulary_count_bag_name??????")
    print("#######################################")
#???????
textprocess.py 文件源码
python
阅读 21
收藏 0
点赞 0
评论 0
评论列表
文章目录