def parse_and_save():
    """Run the spaCy pipeline over wiki section texts and write packed sentences.

    Loads the ``'en'`` spaCy model, reads records from ``WikiReader(wikidump)``,
    feeds the text of every section through ``en.pipe`` and, for each
    resulting document that is long enough, packs every sentence with
    ``Preprocessor.pack`` and writes it through a ``FilePoolWriter``.
    The vocabulary is saved with ``save_vocab`` periodically and once more
    after all documents have been processed.  Finally an interactive
    IPython shell is opened.

    Relies on module-level names not defined in this function:
    ``spacy``, ``WikiReader``, ``wikidump``, ``cpu_count``, ``Preprocessor``,
    ``FilePoolWriter``, ``wikidoc_dir``, ``wikidoc_fn_template``, ``tqdm``
    and ``save_vocab``.
    """
    en = spacy.load('en')
    reader = WikiReader(wikidump)
    records = reader.records()

    def section_texts_flat(records):
        """Yield ``section['text']`` for every section of every record.

        ``records`` must be an iterator (it is advanced with ``next``).
        An ``OSError`` raised while fetching a record is printed and the
        loop tries the next record instead of aborting.
        """
        while True:
            try:
                record = next(records)
            except StopIteration:
                # Input exhausted.  Must be handled explicitly: a
                # StopIteration escaping a generator body is converted
                # into RuntimeError (PEP 479, default since Python 3.7),
                # which would crash the whole run at the very end.
                return
            except OSError as e:
                print('error: %s' % e)
            else:
                for section in record['sections']:
                    yield section['text']

    # Batched, multi-threaded processing of the text stream.  The plain
    # single-document alternative would be:
    #     pipe = (en(txt) for txt in section_texts_flat(records))
    pipe = en.pipe(section_texts_flat(records),
                   n_threads=cpu_count(),
                   batch_size=1000)
    preproc = Preprocessor(en.vocab)
    with FilePoolWriter(wikidoc_dir, wikidoc_fn_template) as f:
        for i, doc in enumerate(tqdm.tqdm(pipe)):
            # Skip very short texts.
            # NOTE(review): `_py_tokens` is a private Doc attribute;
            # presumably it has one entry per token, in which case
            # `len(doc)` is the public equivalent -- TODO confirm.
            if len(doc._py_tokens) <= 7:
                continue
            for sent in doc.sents:
                packed = preproc.pack(sent)
                f.write(packed)
            # Periodic checkpoint of the vocabulary.  Documents skipped
            # above never reach this line, so a checkpoint is missed when
            # the 10000th document happens to be a short one.
            if i % 10000 == 0:
                print('i=%s, saving vocab' % i)
                save_vocab(en.vocab)
    # Final save once the whole stream has been consumed.
    save_vocab(en.vocab)
    # Opens an interactive shell after processing; kept from the original
    # (looks like a debugging aid -- remove for unattended runs).
    import IPython
    IPython.embed()
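# (Web-page residue from the site this snippet was copied from:
#  "Comment list" / "Article table of contents" -- not part of the program.)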