def getDatas(dataset_dir_name):
    """Load a labelled text corpus, split it, and tokenize the training documents.

    The corpus is read with ``load_files(dataset_dir_name)`` and its ``data`` /
    ``target`` attributes are split into train and test parts with
    ``train_test_split`` (20% test, fixed ``random_state=0`` so the split is
    reproducible across runs).

    Args:
        dataset_dir_name: Directory passed straight to ``load_files``.

    Returns:
        A 6-tuple, in this order:
        ``(vectorizer, doc_str_list_train, doc_str_list_test,
        doc_class_list_train, doc_class_list_test, doc_terms_list_train)``

        * ``vectorizer`` -- a ``CountVectorizer(binary=True,
          decode_error='ignore')``; it is only constructed here, not fitted.
        * ``doc_str_list_*`` -- the raw documents of each split.
        * ``doc_class_list_*`` -- the class labels of each split.
        * ``doc_terms_list_train`` -- ``getChList(doc)`` for every training
          document.
    """
    corpus = load_files(dataset_dir_name)
    (doc_str_list_train, doc_str_list_test,
     doc_class_list_train, doc_class_list_test) = train_test_split(
        corpus.data, corpus.target, test_size=0.2, random_state=0)

    # Returned to the caller unfitted; fitting is left to the caller.
    vectorizer = CountVectorizer(binary=True, decode_error=u'ignore')

    # Term list for each training document.
    # NOTE(review): the previous version also built a tokenizer from the
    # vectorizer and tokenized the test split, but never used or returned
    # either result. Both were dropped as dead work, on the assumption that
    # getChList has no side effects -- confirm against its definition.
    doc_terms_list_train = [getChList(doc_str) for doc_str in doc_str_list_train]

    return (vectorizer, doc_str_list_train, doc_str_list_test,
            doc_class_list_train, doc_class_list_test, doc_terms_list_train)
categorizing.py 文件源码
python
阅读 27
收藏 0
点赞 0
评论 0
评论列表
文章目录