def _extract_tokens(self, file_text):
    """Extract tokens from a file's text and count their occurrences.

    This method is designed specifically so that it can be overridden
    easily while maintaining _get_file_tokens and _get_dir_tokens.

    Args:
        file_text: The text to tokenize. It is passed unchanged to
            ``wordpunct_tokenize`` (presumably a ``str`` -- confirm
            against callers).

    Returns:
        A ``collections.Counter`` mapping each token to the number of
        times it appears in ``file_text``.
    """
    # Simple word-and-punctuation tokenization of the text.
    tokens = wordpunct_tokenize(file_text)
    # Counter tallies an iterable directly, so no manual counting loop
    # is needed; the resulting counts are identical.
    return collections.Counter(tokens)
评论列表
文章目录