# Module-level imports required by this method
import collections
import re

from nltk.tokenize import wordpunct_tokenize


def _extract_tokens(self, file_text):
    """Extract tokens from a file and return a Counter of token frequencies."""
    token_dict = collections.Counter()
    # Match and remove the <doc id ...> opening tags and </doc> closing tags
    regex = re.compile(r'(<doc id.*>|</doc>)')
    data = regex.sub('', file_text)
    # Tokenize on word and punctuation boundaries, then count each token
    tokens = wordpunct_tokenize(data)
    for token in tokens:
        token_dict[token] += 1
    return token_dict
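As a quick sanity check, the method can be exercised directly; since self is never referenced inside the body, passing None is enough for illustration. The sample document and expected output below are illustrative assumptions, not taken from the original code.

# Hypothetical usage sketch; the sample <doc> snippet is an assumption
sample = '<doc id="1" url="https://example.org" title="Example">\nHello, world! Hello.\n</doc>\n'
counts = _extract_tokens(None, sample)  # self is unused, so None suffices here
print(counts.most_common(1))            # [('Hello', 2)]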