def str_flags_to_int(str_flags):
    """Convert a string of regex flag letters into an ``re`` flag bitmask.

    Recognised letters are ``i`` (IGNORECASE), ``l`` (LOCALE),
    ``m`` (MULTILINE), ``s`` (DOTALL), ``u`` (UNICODE) and ``x`` (VERBOSE).
    Any other content is ignored.

    :param str_flags: container of flag letters, typically a string such
        as ``"im"``.
    :returns: the OR-ed combination of the matching ``re`` flags, or ``0``
        when no known letter is present.
    """
    letter_flags = (
        ("i", re.IGNORECASE),
        ("l", re.LOCALE),
        ("m", re.MULTILINE),
        ("s", re.DOTALL),
        ("u", re.UNICODE),
        ("x", re.VERBOSE),
    )
    flags = 0
    for letter, flag in letter_flags:
        if letter in str_flags:
            flags |= flag
    return flags
# Example source code using Python's re.UNICODE (original page heading: "python类UNICODE的实例源码")
def _encode_regex(name, value, dummy0, dummy1):
    """Encode a python regex or bson.regex.Regex.

    The result is the byte ``0x0B``, followed by ``name``, the pattern
    passed through ``_make_c_string_check``, and a NUL-terminated run of
    flag letters emitted in the fixed order ``i``, ``l``, ``m``, ``s``,
    ``u``, ``x``.

    ``dummy0`` and ``dummy1`` are accepted but not used.
    """
    flags = value.flags
    head = b"\x0B" + name + _make_c_string_check(value.pattern)

    # Python 2 common case: no flags at all.
    if flags == 0:
        return head + b"\x00"
    # Python 3 common case: only the UNICODE flag.
    if flags == re.UNICODE:
        return head + b"u\x00"

    letter_for_flag = (
        (re.IGNORECASE, b"i"),
        (re.LOCALE, b"l"),
        (re.MULTILINE, b"m"),
        (re.DOTALL, b"s"),
        (re.UNICODE, b"u"),
        (re.VERBOSE, b"x"),
    )
    letters = b""
    for flag, letter in letter_for_flag:
        if flags & flag:
            letters += letter
    return head + letters + b"\x00"
def __init__(self, **kwargs):
    """Construct a TINYTEXT.

    Every keyword argument is passed straight through to the parent
    constructor.

    :param charset: Optional column-level character set for this string
      value. Wins over the 'ascii' and 'unicode' short-hands.

    :param collation: Optional column-level collation for this string
      value. Wins over the 'binary' short-hand.

    :param ascii: Short-hand for the ``latin1`` character set; emits
      ASCII in the schema. Defaults to False.

    :param unicode: Short-hand for the ``ucs2`` character set; emits
      UNICODE in the schema. Defaults to False.

    :param national: Optional. When true, the server's configured
      national character set is used.

    :param binary: Short-hand that selects the binary collation type
      matching the column's character set; emits BINARY in the schema.
      Only the collation of character data changes, not the type of
      data stored. Defaults to False.

    """
    super(TINYTEXT, self).__init__(**kwargs)
def __init__(self, **kwargs):
    """Construct a MEDIUMTEXT.

    Every keyword argument is passed straight through to the parent
    constructor.

    :param charset: Optional column-level character set for this string
      value. Wins over the 'ascii' and 'unicode' short-hands.

    :param collation: Optional column-level collation for this string
      value. Wins over the 'binary' short-hand.

    :param ascii: Short-hand for the ``latin1`` character set; emits
      ASCII in the schema. Defaults to False.

    :param unicode: Short-hand for the ``ucs2`` character set; emits
      UNICODE in the schema. Defaults to False.

    :param national: Optional. When true, the server's configured
      national character set is used.

    :param binary: Short-hand that selects the binary collation type
      matching the column's character set; emits BINARY in the schema.
      Only the collation of character data changes, not the type of
      data stored. Defaults to False.

    """
    super(MEDIUMTEXT, self).__init__(**kwargs)
def __init__(self, **kwargs):
    """Construct a LONGTEXT.

    Every keyword argument is passed straight through to the parent
    constructor.

    :param charset: Optional column-level character set for this string
      value. Wins over the 'ascii' and 'unicode' short-hands.

    :param collation: Optional column-level collation for this string
      value. Wins over the 'binary' short-hand.

    :param ascii: Short-hand for the ``latin1`` character set; emits
      ASCII in the schema. Defaults to False.

    :param unicode: Short-hand for the ``ucs2`` character set; emits
      UNICODE in the schema. Defaults to False.

    :param national: Optional. When true, the server's configured
      national character set is used.

    :param binary: Short-hand that selects the binary collation type
      matching the column's character set; emits BINARY in the schema.
      Only the collation of character data changes, not the type of
      data stored. Defaults to False.

    """
    super(LONGTEXT, self).__init__(**kwargs)
def __init__(self, length=None, **kwargs):
    """Construct a VARCHAR.

    ``length`` and every other keyword argument are passed straight
    through to the parent constructor.

    :param charset: Optional column-level character set for this string
      value. Wins over the 'ascii' and 'unicode' short-hands.

    :param collation: Optional column-level collation for this string
      value. Wins over the 'binary' short-hand.

    :param ascii: Short-hand for the ``latin1`` character set; emits
      ASCII in the schema. Defaults to False.

    :param unicode: Short-hand for the ``ucs2`` character set; emits
      UNICODE in the schema. Defaults to False.

    :param national: Optional. When true, the server's configured
      national character set is used.

    :param binary: Short-hand that selects the binary collation type
      matching the column's character set; emits BINARY in the schema.
      Only the collation of character data changes, not the type of
      data stored. Defaults to False.

    """
    super(VARCHAR, self).__init__(length=length, **kwargs)
def expandvars(path):
    """Expand environment variable references inside a path.

    Like :func:`python:os.path.expandvars` but supports unicode under Windows
    + Python 2 and always returns a `fsnative`.

    Three reference forms are substituted, in this order: ``$NAME``,
    ``%NAME%`` (only when ``os.name == "nt"``) and ``${NAME}``.

    Args:
        path (pathlike): A path to expand
    Returns:
        `fsnative`
    """

    def lookup(found):
        # Variables missing from the environment are left as written.
        return environ.get(found.group(1), found.group(0))

    dollar_name = re.compile(r"\$(\w+)", flags=re.UNICODE)
    expanded = dollar_name.sub(lookup, path2fsn(path))
    if os.name == "nt":
        expanded = re.sub(r"%([^%]+)%", lookup, expanded)
    expanded = re.sub(r"\$\{([^\}]+)\}", lookup, expanded)
    return expanded
def safe_filename(text, max_length=200):
    """Sanitizes filenames for many operating systems.

    Underscores are turned into spaces and ``:`` into `` -``. Control
    characters 0x00-0x1F and a fixed set of punctuation characters are then
    removed, and the result is handed to ``truncate``.

    :param text: The unsanitized pending filename.
    :param max_length: Accepted for interface compatibility.
        NOTE(review): this value is not forwarded to ``truncate``, whose
        signature is not visible here -- confirm and pass it on if supported.
    :returns: whatever ``truncate`` returns for the sanitized text.
    """
    # Tidy up ugly formatted filenames.
    text = text.replace('_', ' ')
    text = text.replace(':', ' -')

    # NTFS forbids filenames containing characters in range 0-31 (0x00-0x1F).
    # range() excludes its stop value, so 32 is required to cover 0x1F.
    ntfs = [chr(i) for i in range(0, 32)]

    # Removing these SHOULD make most filename safe for a wide range of
    # operating systems.
    paranoid = ['"', '#', '$', '%', "'", '*', ',', '.', '/', ':',
                ';', '<', '>', '?', '\\', '^', '|', '~']

    # Each character is escaped on its own. Joining pre-escaped fragments
    # let a lone backslash swallow the following "|" separator, which left
    # a standalone "^" in the output.
    blacklist = re.compile(
        '|'.join(re.escape(char) for char in ntfs + paranoid), re.UNICODE)
    filename = blacklist.sub('', text)
    return truncate(filename)
def str_flags_to_int(str_flags):
    """Build an ``re`` flag bitmask from single-letter flag codes.

    The letters ``i``, ``l``, ``m``, ``s``, ``u`` and ``x`` select
    IGNORECASE, LOCALE, MULTILINE, DOTALL, UNICODE and VERBOSE. Anything
    else in ``str_flags`` has no effect.

    :param str_flags: string (or other container) of flag letters.
    :returns: the combined flags, ``0`` if none of the letters occur.
    """
    flag_for = {
        "i": re.IGNORECASE,
        "l": re.LOCALE,
        "m": re.MULTILINE,
        "s": re.DOTALL,
        "u": re.UNICODE,
        "x": re.VERBOSE,
    }
    flags = 0
    for letter in "ilmsux":
        if letter in str_flags:
            flags |= flag_for[letter]
    return flags
def str_flags_to_int(str_flags):
    """Map regex flag letters to the corresponding ``re`` module flags.

    ``i`` -> IGNORECASE, ``l`` -> LOCALE, ``m`` -> MULTILINE,
    ``s`` -> DOTALL, ``u`` -> UNICODE, ``x`` -> VERBOSE. Unknown letters
    contribute nothing.

    :param str_flags: string (or other container) of flag letters.
    :returns: bitwise OR of the selected flags; ``0`` when nothing matches.
    """
    letters = "ilmsux"
    values = (re.IGNORECASE, re.LOCALE, re.MULTILINE,
              re.DOTALL, re.UNICODE, re.VERBOSE)
    flags = 0
    for letter, value in zip(letters, values):
        if letter in str_flags:
            flags |= value
    return flags
def str_flags_to_int(str_flags):
    """Translate flag letters into a combined ``re`` flags value.

    Each of ``i``, ``l``, ``m``, ``s``, ``u``, ``x`` found in ``str_flags``
    switches on IGNORECASE, LOCALE, MULTILINE, DOTALL, UNICODE or VERBOSE
    respectively; letters that are absent add ``0``.

    :param str_flags: string (or other container) of flag letters.
    :returns: the OR-ed flags, ``0`` when none of the letters are present.
    """
    flags = 0
    flags |= re.IGNORECASE if "i" in str_flags else 0
    flags |= re.LOCALE if "l" in str_flags else 0
    flags |= re.MULTILINE if "m" in str_flags else 0
    flags |= re.DOTALL if "s" in str_flags else 0
    flags |= re.UNICODE if "u" in str_flags else 0
    flags |= re.VERBOSE if "x" in str_flags else 0
    return flags
def str_flags_to_int(str_flags):
    """Turn single-letter regex flag codes into an ``re`` flags value.

    Supported codes: ``i`` (IGNORECASE), ``l`` (LOCALE), ``m`` (MULTILINE),
    ``s`` (DOTALL), ``u`` (UNICODE), ``x`` (VERBOSE). Other content is
    ignored.

    :param str_flags: string (or other container) of flag letters.
    :returns: the selected flags OR-ed together, or ``0`` if none apply.
    """
    code_to_attr = (
        ("i", "IGNORECASE"),
        ("l", "LOCALE"),
        ("m", "MULTILINE"),
        ("s", "DOTALL"),
        ("u", "UNICODE"),
        ("x", "VERBOSE"),
    )
    flags = 0
    for code, attr in code_to_attr:
        if code in str_flags:
            # The flag is looked up on ``re`` only when its letter is present.
            flags |= getattr(re, attr)
    return flags
def __init__(self, pattern, markdown_instance=None):
    """
    Create an instance of an inline pattern.

    Keyword arguments:

    * pattern: A regular expression that matches a pattern
    * markdown_instance: optional object; when truthy it is stored as
      ``self.markdown``

    The pattern is wrapped as ``^(.*?)<pattern>(.*)$`` and compiled with
    the DOTALL and UNICODE flags into ``self.compiled_re``.
    """
    self.pattern = pattern
    wrapped_source = "^(.*?)%s(.*)$" % pattern
    regex_flags = re.DOTALL | re.UNICODE
    self.compiled_re = re.compile(wrapped_source, regex_flags)

    # Api for Markdown to pass safe_mode into instance
    self.safe_mode = False
    if markdown_instance:
        self.markdown = markdown_instance
def regex(self):
    """
    Return the compiled regular expression for the currently activated
    language-code.

    Compiled patterns are cached in ``self._regex_dict`` keyed by language
    code. ``ImproperlyConfigured`` is raised when the pattern does not
    compile.
    """
    language_code = get_language()
    if language_code in self._regex_dict:
        return self._regex_dict[language_code]

    if isinstance(self._regex, six.string_types):
        source = self._regex
    else:
        source = force_text(self._regex)

    try:
        compiled = re.compile(source, re.UNICODE)
    except re.error as e:
        raise ImproperlyConfigured(
            '"%s" is not a valid regular expression: %s' %
            (source, six.text_type(e)))

    self._regex_dict[language_code] = compiled
    return self._regex_dict[language_code]
def _do_match(self, text):
    """Report whether ``self.pattern`` occurs in ``text``.

    With ``self.use_regex`` set, the pattern is searched as a regular
    expression (UNICODE, plus IGNORECASE unless ``self.case_sensitive``).
    Otherwise a plain substring test is done, lower-casing both sides when
    the match is case-insensitive.

    Any exception raised on the regex path is logged and re-raised as
    ``self.BadPatternException``.
    """
    if not self.use_regex:
        needle = self.pattern
        if not self.case_sensitive:
            needle = needle.lower()
            text = text.lower()
        return needle in text

    try:
        if self.case_sensitive:
            flags = re.UNICODE
        else:
            flags = re.UNICODE | re.IGNORECASE
        hits = re.findall(self.pattern, text, flags=flags)
        return bool(hits)
    except Exception as ex:
        logger.warning('Regular expression match failed', exc_info=True)
        raise self.BadPatternException(str(ex))
def obtainGroups(webcontent, groupNum):
    """Read up to ``groupNum`` synonym groups from ``webcontent``.

    For each group the reader is advanced to a line containing
    ``synonyms-list-group``. The next line supplies the meaning, two lines
    are skipped, and the following comma-separated line supplies the
    synonyms, taken from the text of its ``<a href=...>`` elements.

    :param webcontent: file-like object with a ``readline()`` method that
        yields text lines and an empty string at end of input.
    :param groupNum: number of groups to look for.
    :returns: list of ``[meaning, [synonym, ...]]`` entries. Fewer than
        ``groupNum`` entries are returned if the input ends early or a
        group has no recognisable "Meaning" line.
    """
    synonym_list = []
    for _ in range(groupNum):
        # Advance to the line that opens the next group. readline() returns
        # an empty string at end of input, so stop there instead of looping
        # forever when the marker never shows up.
        line = webcontent.readline()
        while line and not re.search("synonyms-list-group", line, re.UNICODE):
            line = webcontent.readline()
        if not line:
            break

        meaning_match = re.search(
            "Meaning: <b>([^<]+)</b>", webcontent.readline(), re.UNICODE)
        if meaning_match is None:
            # Malformed group header: skip it rather than crash on None.
            continue
        meaning = meaning_match.group(1)

        webcontent.readline()  # </div> line
        webcontent.readline()  # synonyms-list_content line

        subSynList = []
        for wordContainer in webcontent.readline().split(','):
            potential_synonym = re.search(
                "<a href=[^>]+>([^<]+)</a>", wordContainer, re.UNICODE)
            if potential_synonym:
                subSynList.append(potential_synonym.group(1))
        synonym_list.append([meaning, subSynList])
    return synonym_list
def _parser(webcontent):
    """Scan ``webcontent`` for the result summary and parse the synonyms.

    Lines are read until one matches ``Found <n> synonym... <m> group``, in
    which case ``obtainGroups`` is called for ``m`` groups, or until a
    ``<div class="no-results">`` line or the end of input is reached, in
    which case an empty list is returned.

    :param webcontent: file-like object with ``readline()`` and
        ``close()``; reading starts at its current position.
    :returns: the list produced by ``obtainGroups``, or ``[]`` when the
        page reports no results or contains no summary line.

    ``webcontent`` is always closed before returning, including when an
    exception is raised.
    """
    try:
        while True:
            line_curr = webcontent.readline()
            if not line_curr:
                # End of input with neither marker seen. The previous
                # position check never advanced, so this case used to
                # loop forever.
                return []
            found = re.search(
                "Found ([0-9]+) synonym[ a-z]+([0-9]+) group",
                line_curr, re.UNICODE)
            if found:
                groupNum = int(found.group(2))
                return obtainGroups(webcontent, groupNum)
            if re.search("<div class=\"no-results\">", line_curr, re.UNICODE):
                return []
    finally:
        webcontent.close()
def __init__(self, **kwargs):
    """Construct a TINYTEXT.

    Every keyword argument is passed straight through to the parent
    constructor.

    :param charset: Optional column-level character set for this string
      value. Wins over the 'ascii' and 'unicode' short-hands.

    :param collation: Optional column-level collation for this string
      value. Wins over the 'binary' short-hand.

    :param ascii: Short-hand for the ``latin1`` character set; emits
      ASCII in the schema. Defaults to False.

    :param unicode: Short-hand for the ``ucs2`` character set; emits
      UNICODE in the schema. Defaults to False.

    :param national: Optional. When true, the server's configured
      national character set is used.

    :param binary: Short-hand that selects the binary collation type
      matching the column's character set; emits BINARY in the schema.
      Only the collation of character data changes, not the type of
      data stored. Defaults to False.

    """
    super(TINYTEXT, self).__init__(**kwargs)
def __init__(self, **kwargs):
    """Construct a MEDIUMTEXT.

    Every keyword argument is passed straight through to the parent
    constructor.

    :param charset: Optional column-level character set for this string
      value. Wins over the 'ascii' and 'unicode' short-hands.

    :param collation: Optional column-level collation for this string
      value. Wins over the 'binary' short-hand.

    :param ascii: Short-hand for the ``latin1`` character set; emits
      ASCII in the schema. Defaults to False.

    :param unicode: Short-hand for the ``ucs2`` character set; emits
      UNICODE in the schema. Defaults to False.

    :param national: Optional. When true, the server's configured
      national character set is used.

    :param binary: Short-hand that selects the binary collation type
      matching the column's character set; emits BINARY in the schema.
      Only the collation of character data changes, not the type of
      data stored. Defaults to False.

    """
    super(MEDIUMTEXT, self).__init__(**kwargs)
def __init__(self, **kwargs):
    """Construct a LONGTEXT.

    Every keyword argument is passed straight through to the parent
    constructor.

    :param charset: Optional column-level character set for this string
      value. Wins over the 'ascii' and 'unicode' short-hands.

    :param collation: Optional column-level collation for this string
      value. Wins over the 'binary' short-hand.

    :param ascii: Short-hand for the ``latin1`` character set; emits
      ASCII in the schema. Defaults to False.

    :param unicode: Short-hand for the ``ucs2`` character set; emits
      UNICODE in the schema. Defaults to False.

    :param national: Optional. When true, the server's configured
      national character set is used.

    :param binary: Short-hand that selects the binary collation type
      matching the column's character set; emits BINARY in the schema.
      Only the collation of character data changes, not the type of
      data stored. Defaults to False.

    """
    super(LONGTEXT, self).__init__(**kwargs)
def __init__(self, length=None, **kwargs):
    """Construct a VARCHAR.

    ``length`` and every other keyword argument are passed straight
    through to the parent constructor.

    :param charset: Optional column-level character set for this string
      value. Wins over the 'ascii' and 'unicode' short-hands.

    :param collation: Optional column-level collation for this string
      value. Wins over the 'binary' short-hand.

    :param ascii: Short-hand for the ``latin1`` character set; emits
      ASCII in the schema. Defaults to False.

    :param unicode: Short-hand for the ``ucs2`` character set; emits
      UNICODE in the schema. Defaults to False.

    :param national: Optional. When true, the server's configured
      national character set is used.

    :param binary: Short-hand that selects the binary collation type
      matching the column's character set; emits BINARY in the schema.
      Only the collation of character data changes, not the type of
      data stored. Defaults to False.

    """
    super(VARCHAR, self).__init__(length=length, **kwargs)
def _fun_into(x):
    """Add up the two numbers that follow the two text markers in ``x``.

    For each marker, the text between the first occurrence of the marker
    and the next unit suffix is converted with ``float``; a marker that
    does not occur contributes ``0``.

    :param x: text to scan. Under Python 2 only ``unicode`` input is
        parsed; any other type yields ``0``.
    :returns: the sum of the two extracted numbers.
    :raises ValueError: if a captured span is not a valid float literal.
    """
    if not ct.PY3 and not isinstance(x, unicode):
        return 0

    # NOTE(review): the original non-ASCII literals reached this file as
    # '??' / '??' / '?', which is not even a valid regex ("nothing to
    # repeat"). They are reconstructed here as U+8F6C U+589E and
    # U+9001 U+80A1, each terminated by U+80A1 -- confirm against the
    # upstream source. Escapes keep this file ASCII-only, so no source
    # encoding declaration is required.
    unit_suffix = u'\u80a1'
    markers = (u'\u8f6c\u589e', u'\u9001\u80a1')

    # Both searches run before any conversion, as in the original.
    hits_per_marker = [
        re.findall(u'%s(.*?)%s' % (marker, unit_suffix), x, re.UNICODE)
        for marker in markers
    ]
    res1, res2 = [float(hits[0]) if hits else 0 for hits in hits_per_marker]
    return res1 + res2
def __init__(self, expression, error_message='Invalid expression',
             strict=False, search=False, extract=False,
             is_unicode=False):
    """Compile ``expression`` and store the validator settings.

    :param expression: regular expression source.
    :param error_message: stored as ``self.error_message``.
    :param strict: when true, the expression is anchored at both ends
        (``^(...)`` and ``(...)$`` are added unless already present).
    :param search: when false, the expression is anchored at the start
        (``^(...)`` is added unless it already begins with ``^``).
    :param extract: stored as ``self.extract``.
    :param is_unicode: when true, a non-unicode expression is decoded
        from UTF-8 and compiled with ``re.UNICODE``.
    """
    must_anchor_start = strict or not search
    if must_anchor_start and not expression.startswith('^'):
        expression = '^(%s)' % expression
    if strict and not expression.endswith('$'):
        expression = '(%s)$' % expression

    if not is_unicode:
        self.regex = re.compile(expression)
    else:
        if not isinstance(expression, unicodeT):
            expression = expression.decode('utf8')
        self.regex = re.compile(expression, re.UNICODE)

    self.error_message = error_message
    self.extract = extract
    # Outside Python 2 the validator is always flagged as unicode.
    self.is_unicode = is_unicode or (not(PY2))
def transpyler_lexer_factory(transpyler):
    """
    Build and return a Pygments lexer class for the given transpyler.

    The class is created dynamically as a subclass of ``Python3Lexer``.
    Its name, aliases, filenames and mimetypes come from attributes of
    ``transpyler`` and its tokens from ``make_transpyled_tokens``.
    """

    def analyse_text(text):
        return shebang_matches(text, r'pythonw?3(\.\d)?')

    class_name = transpyler.pygments_class_name
    namespace = {
        'analyse_text': analyse_text,
        'name': transpyler.name,
        'aliases': [transpyler.display_name],
        'filenames': transpyler.file_extensions,
        'mimetypes': transpyler.mimetypes,
        'flags': re.MULTILINE | re.UNICODE,
        'uni_name': "[%s][%s]*" % (uni.xid_start, uni.xid_continue),
        'tokens': make_transpyled_tokens(transpyler),
    }
    return type(class_name, (Python3Lexer,), namespace)
def build_regexp(definition, compile=True):
    """
    Assemble a regular expression from `definition` and return it.

    :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
        where "parts" is a list of regular expressions and/or regular
        expression definitions to be joined into an or-group.

    Nested definitions (parts that are tuples) are expanded recursively
    into their pattern text. The result is a pattern compiled with
    ``re.UNICODE`` when `compile` is true, otherwise the pattern string.
    """
    name, prefix, suffix, parts = definition
    alternatives = [
        build_regexp(part, None) if type(part) is tuple else part
        for part in parts
    ]
    regexp = '%s(?P<%s>%s)%s' % (prefix, name, '|'.join(alternatives), suffix)
    if not compile:
        return regexp
    return re.compile(regexp, re.UNICODE)
def test_pass_precompiled_regex(self):
    """
    A precompiled regex may be handed to the Filter in place of a
    string pattern.
    """
    # Compiling the pattern ourselves lets us add the ``IGNORECASE`` flag.
    # The caller is then also responsible for including the ``UNICODE``
    # flag in the compiled regex.
    # noinspection SpellCheckingInspection
    flags = re.IGNORECASE | re.UNICODE
    compiled = re.compile(r'\btest\b', flags)

    outcome = self._filter('test march of the TEST penguins', pattern=compiled)
    self.assertFilterPasses(outcome, ['test', 'TEST'])
def test_pass_regex_library_support(self):
    """
    Patterns precompiled with the ``regex`` library are accepted by the
    Regex Filter as well.
    """
    # Roughly, "Hi there!" in Burmese.
    greeting = '\u101f\u102d\u102f\u1004\u103a\u1038'

    # :py:func:`regex.compile` adds the ``UNICODE`` flag on its own when
    # the pattern is a unicode.
    compiled = regex.compile(r'\w+')

    outcome = self._filter(greeting, pattern=compiled)
    self.assertFilterPasses(outcome, [greeting])
def test_pass_precompiled_regex(self):
    """
    A precompiled regex may be handed to the Filter in place of a
    string pattern.
    """
    # Compiling the pattern ourselves lets us add the ``IGNORECASE`` flag.
    # The caller is then also responsible for including the ``UNICODE``
    # flag in the compiled regex.
    # noinspection SpellCheckingInspection
    flags = re.IGNORECASE | re.UNICODE
    compiled = re.compile(r'\btest\b', flags)

    outcome = self._filter('test march of the TEST penguins', pattern=compiled)
    self.assertFilterPasses(outcome, ['', ' march of the ', ' penguins'])
def test_pass_regex_library_support(self):
    """
    Patterns precompiled with the ``regex`` library are accepted by the
    Regex Filter as well.
    """
    # Roughly, "Hi there!" in Burmese.
    greeting = '\u101f\u102d\u102f\u1004\u103a\u1038!'

    # :py:func:`regex.compile` adds the ``UNICODE`` flag on its own when
    # the pattern is a unicode.
    compiled = regex.compile(r'\w+')

    outcome = self._filter(greeting, pattern=compiled)
    self.assertFilterPasses(outcome, ['', '!'])
def build_regexp(definition, compile=True):
    """
    Construct a regular expression from `definition`, optionally compiled.

    :Parameter: `definition`: a 4-tuple (group name, prefix, suffix, parts),
        where "parts" is a list of regular expressions and/or regular
        expression definitions to be joined into an or-group.

    Parts that are tuples are treated as nested definitions and expanded
    recursively to pattern text. With a true `compile` the pattern is
    compiled using ``re.UNICODE``; otherwise the pattern string is returned.
    """
    name, prefix, suffix, parts = definition

    def as_text(part):
        # A nested definition is rendered uncompiled so it can be joined.
        if type(part) is tuple:
            return build_regexp(part, None)
        return part

    fields = {
        'prefix': prefix,
        'name': name,
        'or_group': '|'.join(as_text(part) for part in parts),
        'suffix': suffix,
    }
    regexp = '%(prefix)s(?P<%(name)s>%(or_group)s)%(suffix)s' % fields
    return re.compile(regexp, re.UNICODE) if compile else regexp