def run_nlp(txt_dir, spacy_dir, nlp=None):
    """
    Process text files in directory txt_dir with Spacy NLP pipeline and
    serialize analyses to directory spacy_dir.

    Parameters
    ----------
    txt_dir : str
        Directory containing the input ``*.txt`` files.
    spacy_dir : str
        Output directory for the serialized ``.spacy`` analyses;
        created if it does not exist.
    nlp : spacy language pipeline, optional
        Preloaded pipeline to reuse. If None, ``spacy.load('en')`` is
        called once before processing.
    """
    if nlp is None:
        # NOTE(review): the 'en' shortcut was removed in spaCy 3 —
        # confirm this project pins spaCy 2.x, or switch to an explicit
        # model name such as 'en_core_web_sm'.
        nlp = spacy.load('en')
    makedirs(spacy_dir, exist_ok=True)
    for txt_fname in sorted_glob(join(txt_dir, '*.txt')):
        print('reading ' + txt_fname)
        # Use a context manager so the file handle is closed
        # deterministically (the original leaked it).
        with open(txt_fname) as f:
            text = f.read()
        # Spacy considers '\n' as a separate token.
        # That causes problems when writing tokens in column format,
        # so we strip the final '\n'.
        doc = nlp(text.rstrip('\n'))
        spacy_fname = join(spacy_dir,
                           splitext(basename(txt_fname))[0] + '.spacy')
        write_doc(spacy_fname, doc)