import spacy
from os import makedirs
from os.path import basename, join, splitext

# sorted_glob and write_doc are helper functions defined elsewhere in the
# original project (not shown on this page).


def run_nlp(txt_dir, spacy_dir, nlp=None):
    """
    Process text files in directory txt_dir with Spacy NLP pipeline and
    serialize analyses to directory spacy_dir
    """
    if not nlp:
        nlp = spacy.load('en')

    makedirs(spacy_dir, exist_ok=True)

    for txt_fname in sorted_glob(join(txt_dir, '*.txt')):
        print('reading ' + txt_fname)
        text = open(txt_fname).read()
        # Spacy considers '\n' as a separate token.
        # That causes problems when writing tokens in column format,
        # so we strip the final '\n'.
        doc = nlp(text.rstrip('\n'))
        spacy_fname = join(spacy_dir,
                           splitext(basename(txt_fname))[0] + '.spacy')
        write_doc(spacy_fname, doc)
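The two helpers used above are not included in the snippet. A minimal sketch of what they might look like (the names come from the snippet, but the bodies below are assumptions; the original project may serialize docs differently):

from glob import glob

def sorted_glob(pattern):
    # Assumption: deterministic ordering via a plain sorted glob.
    return sorted(glob(pattern))

def write_doc(fname, doc):
    # Assumption: serialize the spaCy Doc as bytes; Doc.to_bytes() is a
    # standard spaCy API, but the original format may differ.
    with open(fname, 'wb') as f:
        f.write(doc.to_bytes())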
def __call__(self, text):
    words = text.split(' ')
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
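This __call__ is the body of a custom whitespace tokenizer; the enclosing class is not part of the snippet. A self-contained sketch of how such a tokenizer is typically plugged into a pipeline (the class name WhitespaceTokenizer is assumed here, following the pattern from the spaCy documentation; spaCy v2+ is assumed for spacy.blank):

import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.blank('en')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought.")
print([t.text for t in doc])
# ["What's", 'happened', 'to', 'me?', 'he', 'thought.']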
import numpy as np


def map_chars_to_tokens(doc):
    """
    Creates a mapping from input characters to corresponding input tokens.
    For instance, given the input:

        Nuclear theory ...
        |||||||||||||||
        012345678911111...
                  01234

    it returns an array of size equal to the number of input chars plus one,
    which looks like this:

        000000011111112...

    This means that the first 7 chars map to the first token ("Nuclear"),
    the next 7 chars (including the initial whitespace) map to the second
    token ("theory") and so on.
    """
    n_chars = len(doc.text_with_ws)
    char2token = np.zeros(n_chars + 1, 'int')
    start_char = 0

    for token in doc:
        end_char = token.idx + len(token)
        char2token[start_char:end_char] = token.i
        start_char = end_char

    # The position just past the last character maps to a virtual token
    # index one beyond the last token.
    char2token[-1] = char2token[-2] + 1
    return char2token
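A quick way to sanity-check the mapping, using a blank English pipeline (only the tokenizer is needed, so no model has to be installed; spaCy v2+ is assumed for spacy.blank):

import spacy

nlp = spacy.blank('en')
doc = nlp('Nuclear theory')
char2token = map_chars_to_tokens(doc)
print(char2token)
# [0 0 0 0 0 0 0 1 1 1 1 1 1 1 2]
print(doc[char2token[8]])   # char offset 8 falls inside "theory"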
def _original_string(self, tokens, offsets):
    """
    Recreate string with original char offsets
    :param tokens: sequence of token strings
    :param offsets: character start offset of each token in the original string
    :return: reconstructed string, with whitespace padding between tokens
    """
    s = ""
    for t, i in zip(tokens, offsets):
        # pad with spaces up to the token's original start offset
        diff = i - len(s)
        if diff:
            s += ' ' * diff
        s += t
    return s
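As an illustration (the inputs below are made up), tokens whose offsets leave a gap are separated by the corresponding amount of whitespace. Since self is unused, the function can be exercised directly with None in its place:

tokens = ['Nuclear', 'theory']
offsets = [0, 9]
print(repr(_original_string(None, tokens, offsets)))
# 'Nuclear  theory'   <- two spaces restored between the tokens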