export_dataset.py 文件源码-python代码片段

export_dataset.py 文件源码

python

阅读 42 收藏 0 点赞 0 评论 0

项目：seq2seq-keyphrase 作者: memray 项目源码文件源码

def export_ke20k_train_maui(target_dir=None):
    """
    Export KE20k records as ``<id>.txt`` / ``<id>.key`` file pairs.

    Despite the "train" in the name, this just uses the validation dataset:
    it loads the pickled validation records whose path is built from the
    keyphrase config, and for every record (numbered from 0) writes

    * ``<id>.txt`` -- the title, a newline, then the abstract;
    * ``<id>.key`` -- one ``phrase<TAB>1`` line per ';'-separated keyword.

    Characters that are not in ``string.printable`` are removed from the
    title, abstract and keyword fields before anything is written.

    :param target_dir: directory the files are written into. Defaults to the
        original hard-coded ke20k/train baseline directory. The directory
        must already exist.
    :return: None
    """
    import os
    import string

    import emolga

    config = keyphrase.config.setup_keyphrase_all()   # load settings.
    if target_dir is None:
        target_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/ke20k/train/'

    printable = set(string.printable)

    def keep_printable(text):
        # Use ''.join(...) rather than filter(): on Python 3 filter() returns
        # a lazy iterator instead of a string, which breaks the string
        # concatenation and .split() calls below.
        return ''.join(ch for ch in text if ch in printable)

    record_path = (config['path'] + '/dataset/keyphrase/'
                   + config['data_process_name'] + 'validation_record_'
                   + str(config['validation_size']) + '.pkl')
    validation_records = emolga.dataset.build_dataset.deserialize_from_file(record_path)

    for r_id, r in enumerate(validation_records):
        print(r_id)  # progress indicator

        title = keep_printable(r['title'])
        abstract = keep_printable(r['abstract'])
        keyword = keep_printable(r['keyword'])

        # os.path.join copes with target_dir given with or without a trailing slash.
        base_path = os.path.join(target_dir, str(r_id))

        with open(base_path + '.txt', 'w') as textfile:
            textfile.write(title + '\n' + abstract)

        with open(base_path + '.key', 'w') as phrasefile:
            for p in keyword.split(';'):
                # Skip blank entries (empty keyword field, trailing ';') so no
                # phrase-less "\t1" lines end up in the key file.
                if not p.strip():
                    continue
                phrasefile.write('%s\t1\n' % p)