def __init__(self, **kwargs):
"""Construct a TINYTEXT.
:param charset: Optional, a column-level character set for this string
value. Takes precedence to 'ascii' or 'unicode' short-hand.
:param collation: Optional, a column-level collation for this string
value. Takes precedence to 'binary' short-hand.
:param ascii: Defaults to False: short-hand for the ``latin1``
character set, generates ASCII in schema.
:param unicode: Defaults to False: short-hand for the ``ucs2``
character set, generates UNICODE in schema.
:param national: Optional. If true, use the server's configured
national character set.
:param binary: Defaults to False: short-hand, pick the binary
collation type that matches the column's character set. Generates
BINARY in schema. This does not affect the type of data stored,
only the collation of character data.
"""
super(TINYTEXT, self).__init__(**kwargs)
python类UNICODE的实例源码
def __init__(self, **kwargs):
"""Construct a MEDIUMTEXT.
:param charset: Optional, a column-level character set for this string
value. Takes precedence to 'ascii' or 'unicode' short-hand.
:param collation: Optional, a column-level collation for this string
value. Takes precedence to 'binary' short-hand.
:param ascii: Defaults to False: short-hand for the ``latin1``
character set, generates ASCII in schema.
:param unicode: Defaults to False: short-hand for the ``ucs2``
character set, generates UNICODE in schema.
:param national: Optional. If true, use the server's configured
national character set.
:param binary: Defaults to False: short-hand, pick the binary
collation type that matches the column's character set. Generates
BINARY in schema. This does not affect the type of data stored,
only the collation of character data.
"""
super(MEDIUMTEXT, self).__init__(**kwargs)
def __init__(self, **kwargs):
"""Construct a LONGTEXT.
:param charset: Optional, a column-level character set for this string
value. Takes precedence to 'ascii' or 'unicode' short-hand.
:param collation: Optional, a column-level collation for this string
value. Takes precedence to 'binary' short-hand.
:param ascii: Defaults to False: short-hand for the ``latin1``
character set, generates ASCII in schema.
:param unicode: Defaults to False: short-hand for the ``ucs2``
character set, generates UNICODE in schema.
:param national: Optional. If true, use the server's configured
national character set.
:param binary: Defaults to False: short-hand, pick the binary
collation type that matches the column's character set. Generates
BINARY in schema. This does not affect the type of data stored,
only the collation of character data.
"""
super(LONGTEXT, self).__init__(**kwargs)
def __init__(self, length=None, **kwargs):
"""Construct a VARCHAR.
:param charset: Optional, a column-level character set for this string
value. Takes precedence to 'ascii' or 'unicode' short-hand.
:param collation: Optional, a column-level collation for this string
value. Takes precedence to 'binary' short-hand.
:param ascii: Defaults to False: short-hand for the ``latin1``
character set, generates ASCII in schema.
:param unicode: Defaults to False: short-hand for the ``ucs2``
character set, generates UNICODE in schema.
:param national: Optional. If true, use the server's configured
national character set.
:param binary: Defaults to False: short-hand, pick the binary
collation type that matches the column's character set. Generates
BINARY in schema. This does not affect the type of data stored,
only the collation of character data.
"""
super(VARCHAR, self).__init__(length=length, **kwargs)
def __init__(self, pattern, markdown_instance=None):
"""
Create an instant of an inline pattern.
Keyword arguments:
* pattern: A regular expression that matches a pattern
"""
self.pattern = pattern
self.compiled_re = re.compile("^(.*?)%s(.*)$" % pattern,
re.DOTALL | re.UNICODE)
# Api for Markdown to pass safe_mode into instance
self.safe_mode = False
if markdown_instance:
self.markdown = markdown_instance
def __init__(self, expression, error_message='Invalid expression',
strict=False, search=False, extract=False,
is_unicode=False):
if strict or not search:
if not expression.startswith('^'):
expression = '^(%s)' % expression
if strict:
if not expression.endswith('$'):
expression = '(%s)$' % expression
if is_unicode:
if not isinstance(expression, unicode):
expression = expression.decode('utf8')
self.regex = re.compile(expression, re.UNICODE)
else:
self.regex = re.compile(expression)
self.error_message = error_message
self.extract = extract
self.is_unicode = is_unicode
def __init__(self, **kwargs):
"""Construct a TINYTEXT.
:param charset: Optional, a column-level character set for this string
value. Takes precedence to 'ascii' or 'unicode' short-hand.
:param collation: Optional, a column-level collation for this string
value. Takes precedence to 'binary' short-hand.
:param ascii: Defaults to False: short-hand for the ``latin1``
character set, generates ASCII in schema.
:param unicode: Defaults to False: short-hand for the ``ucs2``
character set, generates UNICODE in schema.
:param national: Optional. If true, use the server's configured
national character set.
:param binary: Defaults to False: short-hand, pick the binary
collation type that matches the column's character set. Generates
BINARY in schema. This does not affect the type of data stored,
only the collation of character data.
"""
super(TINYTEXT, self).__init__(**kwargs)
def __init__(self, **kwargs):
"""Construct a MEDIUMTEXT.
:param charset: Optional, a column-level character set for this string
value. Takes precedence to 'ascii' or 'unicode' short-hand.
:param collation: Optional, a column-level collation for this string
value. Takes precedence to 'binary' short-hand.
:param ascii: Defaults to False: short-hand for the ``latin1``
character set, generates ASCII in schema.
:param unicode: Defaults to False: short-hand for the ``ucs2``
character set, generates UNICODE in schema.
:param national: Optional. If true, use the server's configured
national character set.
:param binary: Defaults to False: short-hand, pick the binary
collation type that matches the column's character set. Generates
BINARY in schema. This does not affect the type of data stored,
only the collation of character data.
"""
super(MEDIUMTEXT, self).__init__(**kwargs)
def __init__(self, **kwargs):
"""Construct a LONGTEXT.
:param charset: Optional, a column-level character set for this string
value. Takes precedence to 'ascii' or 'unicode' short-hand.
:param collation: Optional, a column-level collation for this string
value. Takes precedence to 'binary' short-hand.
:param ascii: Defaults to False: short-hand for the ``latin1``
character set, generates ASCII in schema.
:param unicode: Defaults to False: short-hand for the ``ucs2``
character set, generates UNICODE in schema.
:param national: Optional. If true, use the server's configured
national character set.
:param binary: Defaults to False: short-hand, pick the binary
collation type that matches the column's character set. Generates
BINARY in schema. This does not affect the type of data stored,
only the collation of character data.
"""
super(LONGTEXT, self).__init__(**kwargs)
def __init__(self, length=None, **kwargs):
"""Construct a VARCHAR.
:param charset: Optional, a column-level character set for this string
value. Takes precedence to 'ascii' or 'unicode' short-hand.
:param collation: Optional, a column-level collation for this string
value. Takes precedence to 'binary' short-hand.
:param ascii: Defaults to False: short-hand for the ``latin1``
character set, generates ASCII in schema.
:param unicode: Defaults to False: short-hand for the ``ucs2``
character set, generates UNICODE in schema.
:param national: Optional. If true, use the server's configured
national character set.
:param binary: Defaults to False: short-hand, pick the binary
collation type that matches the column's character set. Generates
BINARY in schema. This does not affect the type of data stored,
only the collation of character data.
"""
super(VARCHAR, self).__init__(length=length, **kwargs)
def valid_name(name):
r"""Determine whether or not the given string is a valid
alphanumeric string.
Parameters
----------
name : str
An alphanumeric identifier.
Returns
-------
result : bool
``True`` if the name is valid, else ``False``.
Examples
--------
>>> valid_name('alpha') # True
>>> valid_name('!alpha') # False
"""
identifier = re.compile(r"^[^\d\W]\w*\Z", re.UNICODE)
result = re.match(identifier, name)
return result is not None
def __init__(self, expression, error_message='invalid expression',
strict=False, search=False, extract=False,
unicode=False):
if strict or not search:
if not expression.startswith('^'):
expression = '^(%s)' % expression
if strict:
if not expression.endswith('$'):
expression = '(%s)$' % expression
if unicode:
if not isinstance(expression,unicode):
expression = expression.decode('utf8')
self.regex = re.compile(expression,re.UNICODE)
else:
self.regex = re.compile(expression)
self.error_message = error_message
self.extract = extract
self.unicode = unicode
def parse_pocket_export(html_file):
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
html_file.seek(0)
pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
for line in html_file:
# example line
# <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
match = pattern.search(line)
if match:
fixed_url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
time = datetime.fromtimestamp(float(match.group(2)))
info = {
'url': fixed_url,
'domain': domain(fixed_url),
'base_url': base_url(fixed_url),
'timestamp': str(time.timestamp()),
'tags': match.group(3),
'title': match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '') or base_url(fixed_url),
'sources': [html_file.name],
}
info['type'] = get_link_type(info)
yield info
def _apply_search_backrefs(pattern, flags=0):
"""Apply the search backrefs to the search pattern."""
if isinstance(pattern, (compat.string_type, compat.binary_type)):
re_verbose = bool(VERBOSE & flags)
re_unicode = None
if compat.PY3 and bool(ASCII & flags):
re_unicode = False
elif bool(UNICODE & flags):
re_unicode = True
pattern = SearchTemplate(pattern, re_verbose, re_unicode).apply()
elif isinstance(pattern, RE_TYPE):
if flags:
raise ValueError("Cannot process flags argument with a compiled pattern!")
else:
raise TypeError("Not a string or compiled pattern!")
return pattern
def _compile(expr):
return re.compile(expr, re.UNICODE)
# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
def __init__(self):
#load affix information
# pass
self.prefix_letters = stem_const.DEFAULT_PREFIX_LETTERS
self.suffix_letters = stem_const.DEFAULT_SUFFIX_LETTERS
self.infix_letters = stem_const.DEFAULT_INFIX_LETTERS
self.max_prefix_length = stem_const.DEFAULT_MAX_PREFIX
self.max_suffix_length = stem_const.DEFAULT_MAX_SUFFIX
self.min_stem_length = stem_const.DEFAULT_MIN_STEM
self.joker = stem_const.DEFAULT_JOKER
self.prefix_list = stem_const.DEFAULT_PREFIX_LIST
self.suffix_list = stem_const.DEFAULT_SUFFIX_LIST
self.word = u""
self.unvocalized = u""
self.normalized = u""
self.starword = u""
self.root = u""
self.left = 0
self.right = 0
self.segment_list = []
#token pattern
# letters and harakat
self.token_pat = re.compile(r"[^\w\u064b-\u0652']+", re.UNICODE)
self.prefixes_tree = self._create_prefix_tree(self.prefix_list)
self.suffixes_tree = self._create_suffix_tree(self.suffix_list)
######################################################################
#{ Attribut Functions
######################################################################
def compile(self, rules):
mapping = {}
patterns = []
for rule in rules:
name = 'rule_{id}'.format(id=id(rule))
pattern = r'(?P<{name}>{pattern})'.format(
name=name,
pattern=rule.pattern
)
mapping[name] = rule
patterns.append(pattern)
pattern = '|'.join(patterns)
regexp = re.compile(pattern, re.UNICODE | re.IGNORECASE)
return regexp, mapping
def from_native(cls, regex):
"""Convert a Python regular expression into a ``Regex`` instance.
Note that in Python 3, a regular expression compiled from a
:class:`str` has the ``re.UNICODE`` flag set. If it is undesirable
to store this flag in a BSON regular expression, unset it first::
>>> pattern = re.compile('.*')
>>> regex = Regex.from_native(pattern)
>>> regex.flags ^= re.UNICODE
>>> db.collection.insert({'pattern': regex})
:Parameters:
- `regex`: A regular expression object from ``re.compile()``.
.. warning::
Python regular expressions use a different syntax and different
set of flags than MongoDB, which uses `PCRE`_. A regular
expression retrieved from the server may not compile in
Python, or may match a different set of strings in Python than
when used in a MongoDB query.
.. _PCRE: http://www.pcre.org/
"""
if not isinstance(regex, RE_TYPE):
raise TypeError(
"regex must be a compiled regular expression, not %s"
% type(regex))
return Regex(regex.pattern, regex.flags)
def __init__(self, length=None, **kw):
"""Construct a TEXT.
:param length: Optional, if provided the server may optimize storage
by substituting the smallest TEXT type sufficient to store
``length`` characters.
:param charset: Optional, a column-level character set for this string
value. Takes precedence to 'ascii' or 'unicode' short-hand.
:param collation: Optional, a column-level collation for this string
value. Takes precedence to 'binary' short-hand.
:param ascii: Defaults to False: short-hand for the ``latin1``
character set, generates ASCII in schema.
:param unicode: Defaults to False: short-hand for the ``ucs2``
character set, generates UNICODE in schema.
:param national: Optional. If true, use the server's configured
national character set.
:param binary: Defaults to False: short-hand, pick the binary
collation type that matches the column's character set. Generates
BINARY in schema. This does not affect the type of data stored,
only the collation of character data.
"""
super(TEXT, self).__init__(length=length, **kw)