def preprocess(self, x):
    """Turn one raw example into its preprocessed form for this field.

    The steps run in this order:

    1. Under Python 2 only, a byte ``str`` is decoded to Unicode using
       UTF-8.
    2. If ``self.sequential`` is true and the value is Unicode text,
       trailing newline characters are stripped and the text is handed
       to ``self.tokenize``.
    3. If ``self.lower`` is true, the value is lowercased by wrapping
       the text type's ``lower`` in a ``Pipeline`` and calling it.
    4. If ``self.preprocessing`` is set, the value is passed through it
       and that result is returned; otherwise the value is returned
       as is.
    """
    # Only a Python 2 byte string matches: it is a string type but not
    # the Unicode text type. On Python 3 the check short-circuits.
    needs_decoding = (
        six.PY2
        and isinstance(x, six.string_types)
        and not isinstance(x, six.text_type)
    )
    if needs_decoding:
        def decode_utf8(raw):
            return six.text_type(raw, encoding='utf-8')

        x = Pipeline(decode_utf8)(x)

    # Values that are not text at this point skip tokenization.
    if self.sequential and isinstance(x, six.text_type):
        stripped = x.rstrip('\n')
        x = self.tokenize(stripped)

    if self.lower:
        lowercase = Pipeline(six.text_type.lower)
        x = lowercase(x)

    # The user-supplied hook, when present, always runs last.
    if self.preprocessing is None:
        return x
    return self.preprocessing(x)
评论列表
文章目录