def generate_dic(corpus_dir='CoNLL-2003', words_dict_path='words.dict'):
    """Build word and label dictionaries from the train/valid/test splits.

    Loads ``train.txt``, ``valid.txt`` and ``test.txt`` from *corpus_dir*,
    builds one ``corpora.Dictionary`` over the words of all three splits and
    another over their labels, and prints size statistics along the way.
    Only the word dictionary is written to disk.

    Args:
        corpus_dir: Directory holding the three split files. The default
            reproduces the original hard-coded ``CoNLL-2003/...`` paths.
        words_dict_path: Destination passed to ``Dictionary.save`` for the
            word dictionary.

    Returns:
        None. Results are reported via ``print`` and the saved file.
    """
    # Plain '/' concatenation keeps the default paths byte-identical to the
    # strings previously passed to load_corpus.
    train_sents = load_corpus(corpus_dir + '/train.txt')
    valid_sents = load_corpus(corpus_dir + '/valid.txt')
    test_sents = load_corpus(corpus_dir + '/test.txt')

    # --- words ---
    # assumes get_sent returns the token sequence of one sentence -- TODO confirm
    train_words = [get_sent(sent) for sent in train_sents]
    print("train size: " + str(len(train_sents)))
    valid_words = [get_sent(sent) for sent in valid_sents]
    print("valid size: " + str(len(valid_sents)))
    test_words = [get_sent(sent) for sent in test_sents]
    print("test size: " + str(len(test_sents)))

    all_words = train_words + valid_words + test_words
    lengths = [len(text) for text in all_words]
    print("all data: " + str(len(lengths)))
    print_lengths(lengths)

    dic_words = corpora.Dictionary(all_words)
    dic_words.save(words_dict_path)
    print(len(dic_words))

    # --- labels ---
    # Separate names are used for the label lists, so there is no need to
    # clear and rebind the word lists as the earlier version did.
    train_labels = [get_label(sent) for sent in train_sents]
    valid_labels = [get_label(sent) for sent in valid_sents]
    test_labels = [get_label(sent) for sent in test_sents]
    all_labels = train_labels + valid_labels + test_labels

    dic_labels = corpora.Dictionary(all_labels)
    # Only the mapped value is printed; the key is not needed.
    for _, label in dic_labels.items():
        print(label)
    print(len(dic_labels))
    # NOTE(review): unlike dic_words, dic_labels is never saved -- confirm
    # whether that is intentional.
# Comment list
# Table of contents