def tokenize(self, text):
"""
tokenize text into a list of Token objects
:param text: text to be tokenized (might contains several sentences)
:type text: str
:return: List of Token objects
:rtype: list(Token)
"""
tokens = []
if self.tokenizer_type == "SpaceTokenizer":
operator = RegexpTokenizer('\w+|\$[\d\.]+|\S+')
for counter, span in enumerate(operator.span_tokenize(text)):
new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
tokens.append(new_token)
elif self.tokenizer_type == "NLTKWhiteSpaceTokenizer":
operator = WhitespaceTokenizer()
for counter, span in enumerate(operator.span_tokenize(text)):
new_token = Token(counter, text[span[0]:span[1]], span[0], span[1])
tokens.append(new_token)
elif self.tokenizer_type == "PTBTokenizer":
ptb_tokens = word_tokenize(text)
counter = 0
for token, span in self._penn_treebank_tokens_with_spans(text, ptb_tokens):
new_token = Token(counter, token, span[0], span[1])
counter += 1
tokens.append(new_token)
return tokens
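
For reference, the span bookkeeping this method relies on can be seen in isolation with a small, self-contained example (the sample sentence and variable names below are illustrative only): span_tokenize yields character offsets rather than strings, so the surface form of each token is always recoverable by slicing the original text.

from nltk.tokenize import WhitespaceTokenizer

text = "The fee is $9.99, payable today."
operator = WhitespaceTokenizer()

# span_tokenize yields (start, end) character offsets,
# so each token's text is recovered by slicing the input
for counter, (start, end) in enumerate(operator.span_tokenize(text)):
    print(counter, text[start:end], start, end)

# 0 The 0 3
# 1 fee 4 7
# 2 is 8 10
# 3 $9.99, 11 17
# 4 payable 18 25
# 5 today. 26 32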
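
The PTB branch relies on a helper, _penn_treebank_tokens_with_spans, that is not shown above. As a rough, hypothetical sketch of what such a helper has to accomplish, NLTK's align_tokens can map an already-tokenized list of Penn Treebank tokens back to character offsets, provided word_tokenize has not rewritten any characters (straight double quotes, for example, become `` and '' and would need extra handling). This is only an illustration, not the project's actual implementation.

from nltk import word_tokenize
from nltk.tokenize.util import align_tokens

# word_tokenize needs the NLTK "punkt" sentence model to be installed
text = "She paid $9.99, didn't she?"
ptb_tokens = word_tokenize(text)

# align_tokens searches for each token left-to-right in the original string
# and returns (start, end) character offsets in the same order as the tokens
spans = align_tokens(ptb_tokens, text)
for counter, (token, span) in enumerate(zip(ptb_tokens, spans)):
    print(counter, token, span)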