text_tools.py 文件源码-python代码片段

text_tools.py 文件源码

python

阅读 21 收藏 0 点赞 0 评论 0

def process_text(self, text):
    """Split ``text`` into tokens, clean them, and count them.

    Tokens are extracted with ``self.regexp`` (falling back to
    ``r"\\w[\\w']+"`` when it is ``None``). A trailing ``'s`` is stripped
    from each token (case-insensitively) and tokens consisting solely of
    digits are dropped.

    NOTE(review): no stopword filtering is performed in this method; if it
    is required it has to happen elsewhere -- confirm against callers.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    The result of ``unigrams_and_bigrams(words, self.normalize_plurals)``
    when ``self.collocations`` is truthy, otherwise the first element of
    the pair returned by ``process_tokens(words, self.normalize_plurals)``.
    Presumably a word -> count mapping; verify against those helpers.
    """
    # Python 2 needs re.UNICODE for ``\w`` to match non-ASCII characters in
    # unicode input. Compare the numeric major version: a string comparison
    # such as ``sys.version < '3'`` is lexical and would misorder "10.0".
    # ``unicode`` is only evaluated on Python 2 thanks to short-circuiting.
    if sys.version_info[0] < 3 and isinstance(text, unicode):
        flags = UNICODE
    else:
        flags = 0

    regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
    words = findall(regexp, text, flags)

    # Strip a trailing possessive 's ("Bob's" -> "Bob"), any letter case.
    words = [word[:-2] if word.lower().endswith("'s") else word
             for word in words]
    # Drop tokens that are purely numeric.
    words = [word for word in words if not word.isdigit()]

    if self.collocations:
        word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
    else:
        word_counts, _ = process_tokens(words, self.normalize_plurals)

    return word_counts