textprocess.py 文件源码

python
阅读 21 收藏 0 点赞 0 评论 0

项目:scik-learn-learn-Chinese-text-classider 作者: chapzq77 项目源码 文件源码
def voc_count_bag(self):
        """Build a term-count bag from the pickled training set and persist it.

        Loads the pickled data set found at ``wordbag_path + trainset_name``,
        copies its ``target_name``, ``label`` and ``filenames`` onto
        ``self.vocabulary_count_bag``, vectorizes ``data_set.content`` with
        ``CountVectorizer`` and stores on the bag:

        * ``vcm``        -- the matrix returned by ``fit_transform``
        * ``vcm_sum``    -- per-term totals (column sums of ``vcm``)
        * ``vocabulary`` -- the feature names reported by the vectorizer

        The populated bag is then pickled to
        ``wordbag_path + vocabulary_count_bag_name``.

        Returns ``None``. If ``wordbag_path``, ``vocabulary_count_bag_name``
        or ``stopword_path`` is the empty string, a message is printed and the
        method returns without touching any file.
        """
        if (self.wordbag_path == "" or self.vocabulary_count_bag_name == "" or self.stopword_path == ""):
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("wordbag_path(????????) or vocabulary_count_bag_name(?????????) or stopword_path(??????) can not be empty.")
            return

        # `with` guarantees the handle is closed even if unpickling raises.
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only point wordbag_path at data this project produced itself.
        with open(self.wordbag_path + self.trainset_name, 'rb') as file_obj:
            self.data_set = pickle.load(file_obj)

        # Carry the data set's metadata over to the count bag.
        self.vocabulary_count_bag.target_name = self.data_set.target_name
        self.vocabulary_count_bag.label = self.data_set.label
        self.vocabulary_count_bag.filenames = self.data_set.filenames

        corpus = self.data_set.content
        stopword_list = self.getstopword(self.stopword_path)

        # Integer max_df/min_df are absolute document counts: terms found in
        # more than 500 documents are dropped, and the vocabulary is capped at
        # the 10000 most frequent terms.
        vectorizer = CountVectorizer(stop_words=stopword_list, max_df=500, min_df=1, max_features=10000)
        y = vectorizer.fit_transform(corpus)
        self.vocabulary_count_bag.vcm = y
        # NOTE(review): toarray() materializes a dense matrix just to sum its
        # columns; kept as-is so vcm_sum stays a 1-D ndarray.
        self.vocabulary_count_bag.vcm_sum = y.toarray().sum(axis=0)
        self.vocabulary_count_bag.vocabulary = vectorizer.get_feature_names()

        # Make sure the output directory exists before writing the bag.
        if not os.path.exists(self.wordbag_path):
            os.makedirs(self.wordbag_path)
        with open(self.wordbag_path + self.vocabulary_count_bag_name, 'wb') as file_obj1:
            pickle.dump(self.vocabulary_count_bag, file_obj1)

        print("????????vocabulary_count_bag???wordbag_path???????vocabulary_count_bag_name??????")
        print("#######################################")

    #???????
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号