def export_ke20k_train_maui():
    '''
    Export the ke20k validation records as per-record ``.txt``/``.key`` files.

    Just use the validation dataset: despite the "train" in the function
    name and in the target directory, the records are read from the
    ``validation_record_<validation_size>.pkl`` file located through the
    keyphrase config.

    For every record (numbered from 0 in iteration order):

    * non-printable characters are stripped from its ``title``,
      ``abstract`` and ``keyword`` fields (the record is updated in place);
    * ``<id>.txt`` receives the title, a newline, then the abstract;
    * ``<id>.key`` receives one line per ``;``-separated keyword, written
      as ``<keyword>\\t1``.

    :return: None
    '''
    config = keyphrase.config.setup_keyphrase_all()  # load settings.
    target_dir = '/Users/memray/Project/seq2seq-keyphrase/dataset/keyphrase/baseline-data/maui/ke20k/train/'

    import emolga
    import string

    printable = set(string.printable)

    record_path = (config['path'] + '/dataset/keyphrase/'
                   + config['data_process_name']
                   + 'validation_record_' + str(config['validation_size'])
                   + '.pkl')
    validation_records = emolga.dataset.build_dataset.deserialize_from_file(record_path)

    for r_id, r in enumerate(validation_records):
        print(r_id)

        # Rebuild each field with ''.join() rather than filter(): on
        # Python 3 filter() returns a lazy iterator, which supports neither
        # the string concatenation nor the .split() used below.
        for field in ('title', 'abstract', 'keyword'):
            r[field] = ''.join(ch for ch in r[field] if ch in printable)

        with open(target_dir + str(r_id) + '.txt', 'w') as textfile:
            textfile.write(r['title'] + '\n' + r['abstract'])

        with open(target_dir + str(r_id) + '.key', 'w') as phrasefile:
            for p in r['keyword'].split(';'):
                # NOTE(review): the constant "1" second column is presumably
                # the count/weight expected by the Maui key format -- confirm.
                phrasefile.write('%s\t1\n' % p)
# 评论列表 (page navigation residue: "comment list")
# 文章目录 (page navigation residue: "table of contents")