def _extract_tokens(self, file_text):
    """Collect the distinct tokens found in the text of a Babel file.

    Only text that sits on the line right after a bracketed ``[N.N]``
    marker (digits on either side of the dot are optional) is considered.
    Each such line is split with ``wordpunct_tokenize``.  Tokens are
    de-duplicated across all captured lines, so every key of the returned
    ``collections.Counter`` has a count of exactly 1, no matter how many
    times the token occurs in ``file_text``.
    """
    # Capture the rest of the line that follows each "[N.N]" marker;
    # "." does not match a newline, so one line is captured per marker.
    marker_pattern = re.compile(r'\[\d*\.\d*\]\n(.*)')
    distinct_tokens = {
        piece
        for captured_line in marker_pattern.findall(file_text)
        for piece in wordpunct_tokenize(captured_line)
    }
    # Counting the elements of a set gives each token a count of one.
    return collections.Counter(distinct_tokens)
评论列表
文章目录