def _count_lines(doc):
    """Return the number of text lines in *doc*.

    A final newline terminates the last line rather than starting a new,
    empty one, so "a\\nb" and "a\\nb\\n" both count as two lines and the
    empty string counts as zero.
    """
    if not doc:
        return 0
    return doc.count("\n") + (0 if doc.endswith("\n") else 1)


def _newline_terminated(doc):
    """Return *doc* with a final newline added if a non-empty doc lacks one."""
    if doc and not doc.endswith("\n"):
        return doc + "\n"
    return doc


def _read_documents(path_groups):
    """Read every file listed in *path_groups* and return their contents.

    *path_groups* is an iterable of iterables of file paths; the result is a
    flat list of file contents in iteration order.
    """
    docs = []
    for group in path_groups:
        for path in group:
            # Explicit encoding so the result does not depend on the locale.
            with open(path, encoding="utf-8") as f:
                docs.append(f.read())
    return docs


def write_combined_file(lang_code, all_lang_paths, all_en_paths):
    """Append line-aligned document pairs to two combined training files.

    Every file in ``all_lang_paths`` is paired, in order, with the file at
    the same position in ``all_en_paths``. A pair is kept only when both
    documents have the same number of lines; kept documents are appended to
    ``train_<lang_code>_en.<lang_code>.txt`` and ``train_<lang_code>_en.en.txt``
    inside the directory ``output_dir + lang_code``.

    Args:
        lang_code: String used in the directory name and both file names.
        all_lang_paths: Iterable of iterables of file paths for the
            ``lang_code`` side.
        all_en_paths: Iterable of iterables of file paths for the ``en``
            side, in the same order as ``all_lang_paths``.

    Notes:
        * Relies on the module-level ``output_dir``, which is concatenated
          directly with ``lang_code`` (so it is expected to end with a path
          separator -- NOTE(review): confirm against where it is defined).
        * Output files are opened in append mode, so repeated calls keep
          adding to the same files.
        * If one side has more files than the other, the surplus files are
          read but never written (``zip`` stops at the shorter side).
    """
    src_combined_filename = "train" + "_" + lang_code + "_en." + lang_code + ".txt"
    tgt_combined_filename = "train" + "_" + lang_code + "_en.en" + ".txt"

    # Plain concatenation is kept on purpose: it reproduces the exact
    # directory the original code wrote to.
    target_dir = output_dir + lang_code
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs(target_dir, exist_ok=True)

    lang_docs = _read_documents(all_lang_paths)
    en_docs = _read_documents(all_en_paths)

    # Compare real line counts; len(doc.split("\n")) would treat a trailing
    # newline as an extra line and mis-judge pairs that differ only in that.
    aligned = [
        (src_doc, tgt_doc)
        for src_doc, tgt_doc in zip(lang_docs, en_docs)
        if _count_lines(src_doc) == _count_lines(tgt_doc)
    ]
    if not aligned:
        # Nothing to append: do not create empty output files.
        return

    src_path = target_dir + "/" + src_combined_filename
    tgt_path = target_dir + "/" + tgt_combined_filename
    # Open each output once instead of once per document pair.
    with open(src_path, "a", encoding="utf-8") as src_out, \
            open(tgt_path, "a", encoding="utf-8") as tgt_out:
        for src_doc, tgt_doc in aligned:
            # Terminate each document so its last line cannot fuse with the
            # first line of the next document appended after it.
            src_out.write(_newline_terminated(src_doc))
            tgt_out.write(_newline_terminated(tgt_doc))
# Comment list
# Article table of contents