load_data.py 文件源码

python
阅读 31 收藏 0 点赞 0 评论 0

项目:sequence-labeling 作者: BUAAQingYuan 项目源码 文件源码
def generate_dic():
    """Build word and label dictionaries from the CoNLL-2003 corpus files.

    Loads the train/valid/test splits with ``load_corpus``, converts every
    sentence with ``get_sent``, reports split sizes and length statistics
    (via ``print_lengths``), then builds a ``corpora.Dictionary`` over all
    converted sentences and saves it to ``words.dict``.

    A second dictionary is built from ``get_label`` applied to the same
    sentences; its entries and size are printed.

    NOTE(review): the label dictionary is only printed, never saved --
    confirm whether that is intentional.

    Returns:
        None. Results are written to stdout and to ``words.dict``.
    """
    # Load every split up front, in the same order as before
    # (train, valid, test), so any side effects of load_corpus keep
    # their original ordering.
    splits = (
        ("train", load_corpus('CoNLL-2003/train.txt')),
        ("valid", load_corpus('CoNLL-2003/valid.txt')),
        ("test", load_corpus('CoNLL-2003/test.txt')),
    )

    # words
    all_words = []
    for split_name, sents in splits:
        all_words.extend([get_sent(sent) for sent in sents])
        print(split_name + " size: " + str(len(sents)))

    lengths = [len(text) for text in all_words]
    print("all data: " + str(len(lengths)))
    print_lengths(lengths)

    dic_words = corpora.Dictionary(all_words)
    dic_words.save('words.dict')
    print(len(dic_words))

    # label
    # The word lists are not reused, so there is no need to clear them
    # first; the label sequences are built in a fresh list.
    all_labels = [
        get_label(sent)
        for _, sents in splits
        for sent in sents
    ]
    dic_labels = corpora.Dictionary(all_labels)
    for _, label in dic_labels.items():
        print(label)
    print(len(dic_labels))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号