import spacy
from os import makedirs
from os.path import basename, join, splitext

# sorted_glob and write_doc are helper functions defined elsewhere in the
# original project (not shown on this page).


def run_nlp(txt_dir, spacy_dir, nlp=None):
    """
    Process text files in directory txt_dir with Spacy NLP pipeline and
    serialize analyses to directory spacy_dir
    """
    if not nlp:
        nlp = spacy.load('en')

    makedirs(spacy_dir, exist_ok=True)

    for txt_fname in sorted_glob(join(txt_dir, '*.txt')):
        print('reading ' + txt_fname)
        text = open(txt_fname).read()
        # Spacy considers '\n' as a separate token.
        # That causes problems when writing tokens in column format,
        # so we strip the final '\n'.
        doc = nlp(text.rstrip('\n'))
        spacy_fname = join(spacy_dir,
                           splitext(basename(txt_fname))[0] + '.spacy')
        write_doc(spacy_fname, doc)
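The two helpers used above are not included in the snippet. A minimal sketch of what they might look like (the names come from the snippet, but the bodies below are assumptions; the original project may serialize docs differently):

from glob import glob

def sorted_glob(pattern):
    # Assumption: deterministic ordering via a plain sorted glob.
    return sorted(glob(pattern))

def write_doc(fname, doc):
    # Assumption: serialize the spaCy Doc as bytes; Doc.to_bytes() is a
    # standard spaCy API, but the original format may differ.
    with open(fname, 'wb') as f:
        f.write(doc.to_bytes())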
def __call__(self, text):
    words = text.split(' ')
    # All tokens 'own' a subsequent space character in this tokenizer
    spaces = [True] * len(words)
    return Doc(self.vocab, words=words, spaces=spaces)
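This __call__ is the body of a custom whitespace tokenizer; the enclosing class is not part of the snippet. A self-contained sketch of how such a tokenizer is typically plugged into a pipeline (the class name WhitespaceTokenizer is assumed here, following the pattern from the spaCy documentation; spaCy v2+ is assumed for spacy.blank):

import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(' ')
        # All tokens 'own' a subsequent space character in this tokenizer
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp = spacy.blank('en')
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought.")
print([t.text for t in doc])
# ["What's", 'happened', 'to', 'me?', 'he', 'thought.']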
import numpy as np


def map_chars_to_tokens(doc):
    """
    Creates a mapping from input characters to corresponding input tokens.
    For instance, given the input:

        Nuclear theory ...
        |||||||||||||||
        012345678911111...
                  01234

    it returns an array of size equal to the number of input chars plus one,
    which looks like this:

        000000011111112...

    This means that the first 7 chars map to the first token ("Nuclear"),
    the next 7 chars (including the initial whitespace) map to the second
    token ("theory") and so on.
    """
    n_chars = len(doc.text_with_ws)
    char2token = np.zeros(n_chars + 1, 'int')
    start_char = 0

    for token in doc:
        end_char = token.idx + len(token)
        char2token[start_char:end_char] = token.i
        start_char = end_char

    # The position just past the last character maps to a virtual token
    # index one beyond the last token.
    char2token[-1] = char2token[-2] + 1
    return char2token
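A quick way to sanity-check the mapping, using a blank English pipeline (only the tokenizer is needed, so no model has to be installed; spaCy v2+ is assumed for spacy.blank):

import spacy

nlp = spacy.blank('en')
doc = nlp('Nuclear theory')
char2token = map_chars_to_tokens(doc)
print(char2token)
# [0 0 0 0 0 0 0 1 1 1 1 1 1 1 2]
print(doc[char2token[8]])   # char offset 8 falls inside "theory"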
def _original_string(self, tokens, offsets):
    """
    Recreate string with original char offsets
    :param tokens: sequence of token strings
    :param offsets: character start offset of each token in the original string
    :return: reconstructed string, with whitespace padding between tokens
    """
    s = ""
    for t, i in zip(tokens, offsets):
        # pad with spaces up to the token's original start offset
        diff = i - len(s)
        if diff:
            s += ' ' * diff
        s += t
    return s
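As an illustration (the inputs below are made up), tokens whose offsets leave a gap are separated by the corresponding amount of whitespace. Since self is unused, the function can be exercised directly with None in its place:

tokens = ['Nuclear', 'theory']
offsets = [0, 9]
print(repr(_original_string(None, tokens, offsets)))
# 'Nuclear  theory'   <- two spaces restored between the tokens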