def get_train_data(corpus, count=None, **kwargs):
    """Build feature/label sequences from a corpus and split them.

    For each document the raw text and the word list are read, labels are
    derived with ``text2labels(text, words)`` and features with
    ``text2features(text)``. Documents for which any of these steps raises
    are skipped (best-effort), but each skip is logged so that failures are
    no longer invisible.

    Args:
        corpus: Object exposing ``iter_documents()``; every yielded document
            must provide ``raw()`` and ``words()``.
        count: Optional maximum number of documents to consume. Any falsy
            value (``None`` or ``0``) means "no limit".
        **kwargs: Forwarded unchanged to ``train_test_split``.

    Returns:
        Whatever ``train_test_split(X, y, **kwargs)`` returns.
        NOTE(review): presumably scikit-learn's ``train_test_split`` --
        confirm against the file's imports.
    """
    # Local import: the file's import block is outside this view.
    import logging

    logger = logging.getLogger(__name__)

    documents = corpus.iter_documents()
    if count:
        documents = islice(documents, count)

    X = []
    y = []
    skipped = 0
    for document in tqdm(documents):
        try:
            text = document.raw()
            words = document.words()
            labels = text2labels(text, words)
            features = list(text2features(text))
        except Exception:
            # Deliberately best-effort: one bad document must not abort the
            # whole run, but record it (with traceback) instead of
            # swallowing the error silently.
            skipped += 1
            logger.warning(
                "Skipping document that failed preprocessing", exc_info=True
            )
            continue
        # Append only after both values were computed so X and y stay aligned.
        X.append(features)
        y.append(labels)

    if skipped:
        logger.warning(
            "Skipped %d document(s); %d kept for training", skipped, len(X)
        )
    return train_test_split(X, y, **kwargs)
# Web-page residue (scraped navigation labels, not code):
# "Comment list" / "Article table of contents"