def normalize(self, s):
    '''
    Normalize text.

    The input is stripped and lower-cased, then run through the steps
    enabled on this instance, in this order:

    1. ``to_ascii``: pass the text through ``unidecode``.
    2. ``rejoin_lines``: where a word character plus hyphen is followed
       by a line break, drop the break and the whitespace around it
       (the hyphen itself is kept).
    3. ``remove_hyphens``: drop hyphen runs sitting between two letters.
    4. ``remove_specials``: turn runs of "special" characters (anything
       that is neither a word character nor whitespace, plus the
       underscore) into a space, in three passes described inline.
       A lone special between two digits, e.g. the dot in ``1.5``, is
       not matched by any of the passes and survives.
    5. ``subs``: apply every ``(pattern, replacement)`` pair with
       ``re.sub``, in iteration order.
    6. ``_stemmer``: if truthy, replace every run of letters/hyphens by
       the result of ``self._stemmer.stem`` on that run.

    Finally, whitespace runs are collapsed to one space and the result
    is stripped.

    All regular expressions are applied with the module-level
    ``_RE_FLAGS``.

    :param s: the text to normalize.
    :return: the normalized text.
    '''
    s = s.strip().lower()

    if self.to_ascii:
        s = unidecode(s)

    if self.rejoin_lines:
        s = re.sub(r'(\w-)\s*\n\s*', r'\1', s, flags=_RE_FLAGS)

    if self.remove_hyphens:
        s = re.sub(r'([^\W\d_])-+(?=[^\W\d_])', r'\1', s, flags=_RE_FLAGS)

    if self.remove_specials:
        # Pass 1: specials whose neighbours on both sides are non-digits
        # (or a ^/$ boundary); the preceding character is kept.
        s = re.sub(r'(\D|^)([^\w\s]|_)+(?=\D|$)', r'\1 ', s,
                   flags=_RE_FLAGS)
        # Pass 2: specials trailing a word character and followed by
        # whitespace; the word character is kept.
        s = re.sub(r'(\w)([^\w\s]|_)+\s+', r'\1 ', s, flags=_RE_FLAGS)
        # Pass 3: specials preceded by whitespace and followed by a word
        # character. The replacement is a plain space: the only group in
        # this pattern holds a special character, so writing it back
        # (as a ``\1`` backreference would) leaves the character in the
        # output, e.g. "a -5" -> "a- 5" instead of "a 5".
        s = re.sub(r'\s+(?:[^\w\s]|_)+(?=\w)', ' ', s, flags=_RE_FLAGS)

    for pattern, replacement in self.subs:
        s = re.sub(pattern, replacement, s, flags=_RE_FLAGS)

    if self._stemmer:
        stem = self._stemmer.stem
        s = re.sub(r'([^\W\d_]|-)+', lambda m: stem(m.group()), s,
                   flags=_RE_FLAGS)

    s = re.sub(r'\s+', ' ', s, flags=_RE_FLAGS)
    return s.strip()
评论列表
文章目录