def run_nlp(txt_dir, spacy_dir, nlp=None):
    """
    Process text files in directory txt_dir with Spacy NLP pipeline and
    serialize analyses to directory spacy_dir.

    Parameters
    ----------
    txt_dir : str
        Directory containing the input ``*.txt`` files.
    spacy_dir : str
        Output directory for the serialized ``.spacy`` analyses;
        created if it does not exist.
    nlp : spacy language pipeline, optional
        Preloaded pipeline to reuse. If None, ``spacy.load('en')`` is
        called once before processing.
    """
    if nlp is None:
        # NOTE(review): the 'en' shortcut was removed in spaCy 3 —
        # confirm this project pins spaCy 2.x, or switch to an explicit
        # model name such as 'en_core_web_sm'.
        nlp = spacy.load('en')
    makedirs(spacy_dir, exist_ok=True)
    for txt_fname in sorted_glob(join(txt_dir, '*.txt')):
        print('reading ' + txt_fname)
        # Use a context manager so the file handle is closed
        # deterministically (the original leaked it).
        with open(txt_fname) as f:
            text = f.read()
        # Spacy considers '\n' as a separate token.
        # That causes problems when writing tokens in column format,
        # so we strip the final '\n'.
        doc = nlp(text.rstrip('\n'))
        spacy_fname = join(spacy_dir,
                           splitext(basename(txt_fname))[0] + '.spacy')
        write_doc(spacy_fname, doc)