def __compile_tokenize_pattern(self):
    """
    Build and compile the regular expression used by self.tokenize(),
    storing the compiled pattern in self.tokenize_pattern.

    The final expression is an alternation of several component pattern
    strings (documented above self.__init__()). The word component is
    chosen conditionally: either the pattern that also captures
    hyphen-broken words as single tokens, or the plain "whole word"
    pattern.
    """
    # Choose the word pattern. When remove_hyphen_breaks is set we match
    # hyphen-broken words as single tokens; otherwise use the simpler
    # pattern that has no "hyphen break" handling.
    if self.remove_hyphen_breaks:
        word_pattern_str = self._pattern_str_word_with_hyphen_breaks
    else:
        word_pattern_str = self._pattern_str_word
    # Alternation order determines match priority ("words" first, etc.),
    # so keep this sequence stable.
    component_patterns = (
        word_pattern_str,
        self._pattern_str_entity,
        self._pattern_str_remnant,
        self._pattern_str_whitespace,
        self._pattern_str_newline,
    )
    final_tokenize_pattern_str = r"|".join(component_patterns)
    # The component pattern strings contain literal whitespace for
    # readability, so re.VERBOSE is required when compiling; re.I keeps
    # matching case-insensitive.
    self.tokenize_pattern = re.compile(
        final_tokenize_pattern_str, re.I | re.VERBOSE
    )
# NOTE(review): removed stray scraped-page residue ("评论列表" / "文章目录",
# i.e. "comment list" / "article table of contents") — it was not valid Python.