def process_poetry(self, data_dir='/media/pony/DLdigest/data/languageModel/chinese-poetry/json'):
    """Convert poem JSON files into sharded "text,pinyin" line files.

    Every entry in ``data_dir`` whose name starts with ``'poet'`` is loaded
    as JSON and iterated as a collection of poems. For each poem the
    ``'paragraphs'`` strings are joined, newlines are removed, the text is
    converted with ``HanziConv.toSimplified`` and passed through
    ``filter_punctuation``. The result is split on single spaces, and every
    fragment longer than one character (after stripping) is written as
    ``<fragment>,<space-separated pinyin>`` on its own line.

    Output goes to ``<self.save_dir>/poem/<k>.txt`` with ``k`` starting at 1.
    A new file is started after every 400000 written lines. Files are
    opened in append mode, so existing content is kept.

    Args:
        data_dir: Directory containing the poem JSON files.
    """
    save_dir = os.path.join(self.save_dir, 'poem')
    check_path_exists(save_dir)

    lines_per_shard = 400000
    count = 0            # number of lines written so far, across all shards
    shard_index = None   # 1-based index of the shard currently open
    shard_file = None

    try:
        for entry in os.scandir(data_dir):
            if not entry.name.startswith('poet'):
                continue

            # The corpus is Chinese text: be explicit about the encoding
            # instead of depending on the platform default.
            with open(entry.path, 'r', encoding='utf-8') as json_file:
                poems = json.load(json_file)

            for p in poems:
                paras = HanziConv.toSimplified(''.join(p['paragraphs']).replace('\n', ''))
                # NOTE(review): presumably punctuation is turned into spaces
                # here, since the result is split on ' ' below; confirm
                # against filter_punctuation.
                paras = filter_punctuation(paras)

                for para in paras.split(' '):
                    if len(para.strip()) <= 1:
                        continue

                    # assumes pinyin() yields one single-item list per
                    # character, so flatten() gives a flat sequence of
                    # syllables; TODO confirm.
                    pys = ' '.join(np.array(pinyin(para)).flatten())

                    # Reopen only when crossing a shard boundary rather
                    # than once per written line.
                    wanted = count // lines_per_shard + 1
                    if wanted != shard_index:
                        if shard_file is not None:
                            shard_file.close()
                        shard_path = os.path.join(save_dir, str(wanted) + '.txt')
                        shard_file = open(shard_path, 'a', encoding='utf-8')
                        shard_index = wanted

                    shard_file.write(para + ',' + pys + '\n')
                    count += 1
    finally:
        # Flush and release the last shard even if processing failed midway.
        if shard_file is not None:
            shard_file.close()
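# 评论列表 (comment list) -- web-page navigation residue, not part of the program
# 文章目录 (table of contents) -- web-page navigation residue, not part of the program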