def __compile_tokenize_pattern(self):
    """
    Build and compile the regular expression used by self.tokenize(),
    storing the compiled pattern in self.tokenize_pattern.

    The final expression is an alternation of several component pattern
    strings (documented above self.__init__()). The word component is
    chosen conditionally: either the pattern that also captures
    hyphen-broken words as single tokens, or the plain "whole word"
    pattern.
    """
    # Choose the word pattern. When remove_hyphen_breaks is set we match
    # hyphen-broken words as single tokens; otherwise use the simpler
    # pattern that has no "hyphen break" handling.
    if self.remove_hyphen_breaks:
        word_pattern_str = self._pattern_str_word_with_hyphen_breaks
    else:
        word_pattern_str = self._pattern_str_word
    # Alternation order determines match priority ("words" first, etc.),
    # so keep this sequence stable.
    component_patterns = (
        word_pattern_str,
        self._pattern_str_entity,
        self._pattern_str_remnant,
        self._pattern_str_whitespace,
        self._pattern_str_newline,
    )
    final_tokenize_pattern_str = r"|".join(component_patterns)
    # The component pattern strings contain literal whitespace for
    # readability, so re.VERBOSE is required when compiling; re.I keeps
    # matching case-insensitive.
    self.tokenize_pattern = re.compile(
        final_tokenize_pattern_str, re.I | re.VERBOSE
    )
# NOTE(review): removed stray scraped-page residue ("评论列表" / "文章目录",
# i.e. "comment list" / "article table of contents") — it was not valid Python.