# -*- coding: utf-8 -*-
import os
import json

import jieba
from tqdm import tqdm

# Project-level helpers (rel_ext_dir, cache_dir, Print, nb_lines_of,
# DatasetFinder, split_sentences, gen_dataset) are assumed to be imported
# from the project's own modules.


def gen_dataset_from_baike():
    doc_path = os.path.join(rel_ext_dir, 'sample_baike_doc.json')
    out_path = os.path.join(rel_ext_dir, 'data/raw_dataset.txt')
    name2fb_path = os.path.join(cache_dir, 'DatasetFinder.name2fb.cache')
    fb_ttls_path = os.path.join(cache_dir, 'DatasetFinder.fb_ttls.cache')

    # Restore the DatasetFinder (entity name -> Freebase mapping) from its caches.
    finder = DatasetFinder.load_from_cache(name2fb_path, fb_ttls_path)

    Print('load userdict')
    jieba.load_userdict(os.path.join(rel_ext_dir, 'trimmed_baike_dict.txt'))

    Print('gen dataset from [%s]' % doc_path)
    outf = open(out_path, 'w')
    for line in tqdm(open(doc_path), total=nb_lines_of(doc_path)):
        # Each input line: <baike_url>\t<json-encoded list of paragraphs>.
        p = line.split('\t')
        baike_url = p[0].decode('utf-8')
        paragraphs = json.loads(p[1])
        for paragraph in paragraphs:
            sentences = split_sentences(paragraph)
            for sentence in sentences:
                # gen_dataset returns the labeled cases found in the sentence
                # together with its word segmentation.
                cases, words = gen_dataset(sentence, finder)
                if len(cases) > 0:
                    out_obj = {
                        'words': "#".join(words),
                        'cases': map(str, cases),
                    }
                    outf.write("%s\t%s\n" % (baike_url, json.dumps(out_obj, ensure_ascii=False)))
    outf.close()
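For reference, each output line in raw_dataset.txt pairs the baike URL with a small JSON object, so the file can be read back with a few lines of code. Below is a minimal sketch (the helper load_raw_dataset is hypothetical, not part of the project) that parses one line at a time, assuming the UTF-8 output produced above.

def load_raw_dataset(path):
    # Each line: <baike_url>\t<json object with 'words' and 'cases'>.
    for line in open(path):
        baike_url, payload = line.rstrip('\n').split('\t', 1)
        obj = json.loads(payload)
        words = obj['words'].split('#')   # recover the segmented words
        cases = obj['cases']              # string-serialized cases
        yield baike_url, words, cases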