def get_pos_train_data(corpus, count=None, **kwargs):
    """Collect per-sentence POS features and labels from ``corpus`` and split them.

    Every document produced by ``corpus.iter_documents()`` is walked sentence
    by sentence via ``document.iter_tagged_sents()``. Each sentence is iterated
    as ``(token, tags)`` pairs, where ``tags`` is a comma-separated string.

    Args:
        corpus: Object exposing ``iter_documents()``.
        count: Optional cap on the number of documents consumed. Any falsy
            value (``None`` or ``0``) means no cap is applied.
        **kwargs: Forwarded unchanged to ``train_test_split``.

    Returns:
        Whatever ``train_test_split(X, y, **kwargs)`` returns, where ``X`` holds
        one ``sent2posfeatures(tokens)`` result per sentence and ``y`` holds the
        matching list of labels.
    """
    doc_stream = corpus.iter_documents()
    if count:
        # Truncate lazily so the underlying iterator is never materialised.
        doc_stream = islice(doc_stream, count)

    features = []
    targets = []
    for doc in tqdm(doc_stream):
        for tagged_sent in doc.iter_tagged_sents():
            # TODO: only the first comma-separated tag is used as the label;
            # any remaining tags are ignored.
            pairs = [
                (word, tag_string.split(',')[0])
                for word, tag_string in tagged_sent
            ]
            words = [word for word, _ in pairs]
            first_tags = [tag for _, tag in pairs]

            features.append(sent2posfeatures(words))
            targets.append(first_tags)

    return train_test_split(features, targets, **kwargs)
评论列表
文章目录