def ensemble_classify():
    """Train and evaluate an AdaBoost classifier on TF-IDF tweet features.

    Steps, in order:
      1. Load labels and labelled tweets via ``get_labels()`` and
         ``get_labelled_tweets()``.
      2. Vectorise the tweets with a word-level TF-IDF model
         (unigrams and bigrams).
      3. Split the vectors/labels into train and test partitions.
      4. Fit an ``AdaBoostClassifier`` and predict on the test partition.
      5. Write the predictions to ``data/tfidf_ada.csv``, hand the fitted
         model to ``save_model`` and pass binarised predictions/labels to
         ``generate_eval_metrics``.

    Takes no arguments and returns nothing; all results are produced as
    side effects of the helpers it calls.
    """
    label_list = get_labels()
    tweet_list = get_labelled_tweets()

    # Vectorise using TF-IDF.
    vectoriser = TfidfVectorizer(
        min_df=3,                  # drop terms seen in fewer than 3 tweets
        max_features=None,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',   # a token is one or more word characters
        ngram_range=(1, 2),        # unigrams and bigrams
        # These three parameters are documented as booleans; recent
        # scikit-learn releases reject integer stand-ins such as ``1``.
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=True,
    )

    # Learn the vocabulary/IDF weights and transform in a single pass.
    # NOTE(review): the vectoriser is fitted on *all* tweets before the
    # split, so test tweets influence the features — confirm acceptable.
    vectorised_tweet_list = vectoriser.fit_transform(tweet_list)

    # NOTE(review): test_size=0.8 holds out 80% of the samples for testing
    # and trains on only 20% — confirm this is intended.
    train_vector, test_vector, train_labels, test_labels = train_test_split(
        vectorised_tweet_list,
        label_list,
        test_size=0.8,
        random_state=42,
    )

    n_estimators = 10  # number of weak learners
    model = AdaBoostClassifier(n_estimators=n_estimators)
    ada_classifier = model.fit(train_vector, train_labels)
    result = ada_classifier.predict(test_vector)

    # Output the predictions as a single comma-separated record.
    create_directory('data')
    result.tofile("data/tfidf_ada.csv", sep=',')
    save_model(ada_classifier, 'tfidf_ada')

    # Evaluation: binarise predictions and ground truth against the
    # ``class_list`` defined outside this function (presumably the full
    # set of class labels — verify at its definition).
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(test_labels, classes=class_list)
    generate_eval_metrics(binarise_result, 'tfidf_ada', binarise_labels)
ensemble_classifier.py 文件源码
python
阅读 22
收藏 0
点赞 0
评论 0
评论列表
文章目录