__init__.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:geoextract 作者: stadt-karlsruhe 项目源码 文件源码
def normalize(self, s):
        '''
        Normalize text.

        The text is stripped and lower-cased, then passed through a
        fixed sequence of steps. The optional steps are switched on by
        the attributes of ``self`` named below:

        - ``to_ascii``: transliterate the text using ``unidecode``.
        - ``rejoin_lines``: re-join words that were split by a hyphen
          at a line break (the hyphen itself is kept).
        - ``remove_hyphens``: drop hyphens that stand between two
          letters.
        - ``remove_specials``: replace special characters (anything
          that is neither alphanumeric nor whitespace, and also ``_``)
          by a space. Specials sandwiched directly between digits
          (e.g. the dot in ``1.5``) are kept.
        - ``subs``: iterable of ``(pattern, replacement)`` pairs, each
          applied with ``re.sub`` in iteration order.
        - ``_stemmer``: if truthy, every run of letters and hyphens is
          replaced by the result of ``_stemmer.stem``.

        Finally, runs of whitespace are collapsed to a single space and
        the result is stripped again.

        :param s: The text to normalize.
        :return: The normalized text.
        '''
        s = s.strip().lower()
        if self.to_ascii:
            s = unidecode(s)
        if self.rejoin_lines:
            # "foo-\n  bar" -> "foo-bar"; the hyphen is kept here and
            # only removed by the ``remove_hyphens`` step, if enabled.
            s = re.sub(r'(\w-)\s*\n\s*', r'\1', s, flags=_RE_FLAGS)
        if self.remove_hyphens:
            # Only hyphens between two letters are dropped; the
            # look-ahead keeps the following letter available for the
            # next match ("a-b-c" -> "abc").
            s = re.sub(r'([^\W\d_])-+(?=[^\W\d_])', r'\1', s, flags=_RE_FLAGS)
        if self.remove_specials:
            # Specials that have no digit on either side.
            s = re.sub(r'(\D|^)([^\w\s]|_)+(?=\D|$)', r'\1 ', s,
                       flags=_RE_FLAGS)
            # Specials trailing a word and followed by whitespace
            # ("5. " -> "5 ").
            s = re.sub(r'(\w)([^\w\s]|_)+\s+', r'\1 ', s, flags=_RE_FLAGS)
            # Specials leading a word and preceded by whitespace
            # (" #5" -> " 5"). The word character is only looked at,
            # not consumed, so the replacement is a plain space.
            # Re-inserting group 1 here would put the special character
            # back, because group 1 is the special itself.
            s = re.sub(r'\s+([^\w\s]|_)+(?=\w)', ' ', s, flags=_RE_FLAGS)
        for pattern, replacement in self.subs:
            s = re.sub(pattern, replacement, s, flags=_RE_FLAGS)
        if self._stemmer:
            def stem(match):
                return self._stemmer.stem(match.group())

            # Stem each run of letters/hyphens; digits and other
            # characters are left untouched.
            s = re.sub(r'([^\W\d_]|-)+', stem, s, flags=_RE_FLAGS)
        # Earlier steps may have introduced extra spaces.
        s = re.sub(r'\s+', ' ', s, flags=_RE_FLAGS)
        return s.strip()
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号