def __init__(self, encoding='utf-8', normalize=True):
    # type: (Text, bool) -> None
    """
    Store the decoding/normalization settings for this filter and
    prepare the regex used to strip non-printable characters.

    :param encoding:
        Used to decode non-unicode values.

    :param normalize:
        Whether to normalize the resulting value:
            - Convert to NFC form.
            - Remove non-printable characters.
            - Convert all line endings to unix-style ('\\n').
    """
    super(Unicode, self).__init__()

    self.encoding = encoding
    self.normalize = normalize

    #
    # Compile the regex that we will use to remove non-
    # printables from the resulting unicode.
    # http://www.regular-expressions.info/unicode.html#category
    #
    # Note: using a double negative so that we can exclude
    # newlines, which are technically considered control chars.
    # http://stackoverflow.com/a/3469155
    #
    # The pattern is compiled unconditionally (not only when
    # ``normalize`` is true) so that ``self.npr`` always exists.
    # Otherwise an instance created with ``normalize=False`` would
    # raise AttributeError as soon as ``normalize`` is switched on
    # later or ``npr`` is accessed directly.
    #
    self.npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)
评论列表
文章目录