def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8 encoded bytestring.
    Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'
    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result)
def remove_nonprinting_characters(input, encoding='utf-8'):
    input_was_unicode = True
    if isinstance(input, basestring):
        if not isinstance(input, unicode):
            input_was_unicode = False
    unicode_input = to_unicode_or_bust(input)
    # see http://www.fileformat.info/info/unicode/category/index.htm
    char_classes_to_remove = ["C", "M", "Z"]
    response = u''.join(c for c in unicode_input if unicodedata.category(c)[0] not in char_classes_to_remove)
    if not input_was_unicode:
        response = response.encode(encoding)
    return response
# getting a "decoding Unicode is not supported" error in this function?
# might need to reinstall libaries as per
# http://stackoverflow.com/questions/17092849/flask-login-typeerror-decoding-unicode-is-not-supported
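# `to_unicode_or_bust` is not shown above. A plausible minimal stand-in (assumption: it
# simply decodes byte strings and passes text through) plus a Python 3 sketch of the
# same category-based filtering:
import unicodedata

def to_unicode_or_bust(obj, encoding='utf-8'):
    if isinstance(obj, bytes):
        return obj.decode(encoding)
    return obj

def strip_cmz(text):
    """Drop characters whose major category is C (control), M (mark) or Z (separator)."""
    text = to_unicode_or_bust(text)
    return ''.join(c for c in text if unicodedata.category(c)[0] not in ('C', 'M', 'Z'))

print(repr(strip_cmz('tab\tand\u200bzero-width space')))  # 'tabandzero-widthspace'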
def _is_safe_url(url, host):
    # Chrome considers any URL with more than two slashes to be absolute, but
    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
    if url.startswith('///'):
        return False
    url_info = urlparse(url)
    # Forbid URLs like http:///example.com - with a scheme, but without a hostname.
    # In that URL, example.com is not the hostname but, a path component. However,
    # Chrome will still consider example.com to be the hostname, so we must not
    # allow this syntax.
    if not url_info.netloc and url_info.scheme:
        return False
    # Forbid URLs that start with control characters. Some browsers (like
    # Chrome) ignore quite a few control characters at the start of a
    # URL and might consider the URL as scheme relative.
    if unicodedata.category(url[0])[0] == 'C':
        return False
    return ((not url_info.netloc or url_info.netloc == host) and
            (not url_info.scheme or url_info.scheme in ['http', 'https']))
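# Usage sketch (assumes `from urllib.parse import urlparse` and `import unicodedata`
# are in scope, as the snippet implies): only relative, same-host http(s) targets pass.
print(_is_safe_url('/accounts/profile/', 'example.com'))        # True  (relative path)
print(_is_safe_url('https://example.com/next', 'example.com'))  # True  (same host)
print(_is_safe_url('https://evil.com/next', 'example.com'))     # False (foreign host)
print(_is_safe_url('\x08//evil.com', 'example.com'))            # False (leading control char)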
def XetexBody(self):
    data = ''
    prevcode = 0
    for code in sorted(self.font.chars):
        try:
            uniname = unicodedata.name(unichr(code))
        except ValueError:
            uniname = ''
        if code - prevcode > 1:
            gaps = len([x for x in range(prevcode + 1, code)
                        if unicodedata.category(unichr(x))[0] != 'C'])
            if gaps:
                data += ('\\rowcolor{missing}\\multicolumn{3}{|c|}'
                         '{\\small %d visible characters not mapped to glyphs} \\\\\n') % (gaps)
        prevcode = code
        data += ('\\texttt{%04X} & {\\customfont\\symbol{%d}} &'
                 '{\\small %s}\\\\\n') % (code, code, uniname)
    return data
def weekday_portuguese_to_english(string):
    string = string.lower()
    string = string.strip()
    string = string.replace("-", " ")
    string = ''.join((c for c in unicodedata.normalize('NFD', string)
                      if unicodedata.category(c) != 'Mn'))
    string = string.replace(",", " ")
    string = string.split(" ")[0]
    if string in [u"dom", u"domingo"]:
        return "Sunday"
    elif string in [u"seg", u"segunda", u"segunda-feira"]:
        return "Monday"
    elif string in [u"ter", u"terca", u"terça", u"terca-feira", u"terça-feira"]:
        return "Tuesday"
    elif string in [u"qua", u"quarta", u"quarta-feira"]:
        return "Wednesday"
    elif string in [u"qui", u"quinta", u"quinta-feira"]:
        return "Thursday"
    elif string in [u"sex", u"sexta", u"sexta-feira"]:
        return "Friday"
    elif string in [u"sab", u"sáb", u"sabado", u"sábado"]:
        return "Saturday"
def push(self, evt):
    trace("[input] pushed {!r}", evt.data)
    key = evt.data
    d = self.k.get(key)
    if isinstance(d, dict):
        trace("[input] transition")
        self.stack.append(key)
        self.k = d
    else:
        if d is None:
            trace("[input] invalid")
            if self.stack or len(key) > 1 or unicodedata.category(key) == 'C':
                self.results.append(
                    (self.invalid_cls, self.stack + [key]))
            else:
                # small optimization:
                self.k[key] = self.character_cls
                self.results.append(
                    (self.character_cls, [key]))
        else:
            trace("[input] matched {}", d)
            self.results.append((d, self.stack + [key]))
        self.stack = []
        self.k = self.ck
def get_cc(nunichar):
    """Computes CharCase for a Unicode character.

    This function computes the CharCase of a Unicode character.

    Args:
        nunichar: A Unicode character whose casing is to be computed.

    Returns:
        The CharCase for the input character.
    """
    catstr = unicodedata.category(nunichar)
    if catstr == "Ll":
        return CharCase.lower
    elif catstr == "Lu":
        return CharCase.upper
    else:
        return CharCase.dc
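# `CharCase` is defined elsewhere in the original project. A plausible stand-in enum
# (assumption) so get_cc above can be exercised directly:
import enum
import unicodedata

class CharCase(enum.Enum):
    lower = 0   # category 'Ll'
    upper = 1   # category 'Lu'
    dc = 2      # "don't care": any other category

print(get_cc('a'))  # CharCase.lower
print(get_cc('Ä'))  # CharCase.upper
print(get_cc('1'))  # CharCase.dc (category 'Nd')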
def __init__(self, emoji):
    self.raw = emoji
    if isinstance(emoji, str):
        self.id = 0
        self.unicode = emoji
        self.custom = False
        self.managed = False
        self.name = [unicodedata.name(ch) for ch in emoji]
        self.category = [unicodedata.category(ch) for ch in emoji]
        self.roles = []
        self.guild = None
    else:
        self.id = emoji.id
        self.unicode = ''
        self.custom = True
        self.managed = getattr(emoji, 'managed', None)
        self.name = [emoji.name]
        self.category = ['custom']
        self.roles = getattr(emoji, 'roles', None)
        self.guild = getattr(emoji, 'guild', None)
def read_identifier(self):
    self.j = self.i + 1
    while unicodedata.category(self.data[self.j]) in self.IDENT_PART_CATEGORIES:
        self.j += 1
    ident = self.data[self.i:self.j]
    if ident in Keyword.VALUES:
        token_type = Keyword
        if ident in BasicType.VALUES:
            token_type = BasicType
        elif ident in Modifier.VALUES:
            token_type = Modifier
    elif ident in Boolean.VALUES:
        token_type = Boolean
    elif ident == 'null':
        token_type = Null
    else:
        token_type = Identifier
    return token_type
def tokenize(text, splits='COPZ'):
    token = []
    if PY3:
        for c in str(text, 'utf-8'):
            if category(c)[0] in splits:
                if len(token):
                    yield u''.join(token)
                    token = []
            else:
                token.append(c)
    else:
        for c in unicode(text):
            if category(c)[0] in splits:
                if len(token):
                    yield u''.join(token)
                    token = []
            else:
                token.append(c)
    if len(token):
        yield u''.join(token)
def _consume_alpha_utf8(self, text, offset):
    """Consume a sequence of utf8 bytes forming an alphabetic character."""
    incr = 2
    u = ""
    while not u and incr <= 4:
        try:
            try:
                # In the common case this will be a string
                u = text[offset:offset+incr].decode("utf8")
            except AttributeError:
                # Looks like it was e.g. a mutable char array.
                try:
                    s = text[offset:offset+incr].tostring()
                except AttributeError:
                    s = "".join([c for c in text[offset:offset+incr]])
                u = s.decode("utf8")
        except UnicodeDecodeError:
            incr += 1
    if not u:
        return 0
    if u.isalpha():
        return incr
    if unicodedata.category(u)[0] == "M":
        return incr
    return 0
def _consume_alpha_u(self, text, offset):
    """Consume an alphabetic character from the given unicode string.

    Given a unicode string and the current offset, this method returns
    the number of characters occupied by the next alphabetic character
    in the string. Trailing combining characters are consumed as a
    single letter.
    """
    assert offset < len(text)
    incr = 0
    if text[offset].isalpha():
        incr = 1
        while offset + incr < len(text):
            if unicodedata.category(text[offset+incr])[0] != "M":
                break
            incr += 1
    return incr
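# Sketch of the behaviour described in the docstring above: a base letter plus any
# trailing combining marks (major category 'M') is consumed as one unit. A standalone
# version of the same logic (a free function, not the original method):
import unicodedata

def consume_alpha(text, offset):
    incr = 0
    if text[offset].isalpha():
        incr = 1
        while (offset + incr < len(text)
               and unicodedata.category(text[offset + incr])[0] == 'M'):
            incr += 1
    return incr

s = 'e\u0301clair'             # 'e' followed by COMBINING ACUTE ACCENT
print(consume_alpha(s, 0))     # 2 -> the combining accent travels with its base letter
print(consume_alpha('42', 0))  # 0 -> not alphabetic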
def value_for(self, association):
    value = str(self.prefix)
    try:
        if self.prop:
            val = str(eval("association." + self.prop))
        else:
            val = str(self.value)
        if 'accents' in self.options:
            val = ''.join((c for c in unicodedata.normalize('NFD', val) if unicodedata.category(c) != 'Mn'))
        if 'caps' in self.options:
            val = val.upper()
        return value + val
    except AttributeError:
        return ''
def _escape_text(self, s):
    """Escape text

    In addition to escaping text, unicode characters are replaced with a
    span that will display the glyph using CSS. This is to ensure that the
    text has a consistent width.
    """
    tpl = ('<span class="u"><span class="g">&#x{0:x};</span>'
           '<span class="ns">{1}</span></span>')
    out = ''
    for c in s:
        w = utils.str_width(c)
        if unicodedata.category(c) in ('Co', 'Cn', 'So'):
            out += tpl.format(ord(c), ' ')
        elif w > 1 or ord(c) > 255:
            out += tpl.format(ord(c), ' ' * w)
        else:
            out += escape(c)
    return out
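# Sketch of the category test used above (the width check needs `utils.str_width`, which
# is not shown): private-use ('Co'), unassigned ('Cn') and other-symbol ('So') characters,
# as well as anything outside Latin-1, take the width-preserving span path.
import unicodedata

for ch in ('a', '€', '\U0001F600', '\ue000'):
    wrapped = unicodedata.category(ch) in ('Co', 'Cn', 'So') or ord(ch) > 255
    print(repr(ch), unicodedata.category(ch), wrapped)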
def _splitByControlCharacters(self, val):
    # extract non-control characters
    output = []
    s = ''
    for c in unicode(val):
        if unicodedata.category(c)[0] == 'C':
            if len(s) > 0:
                # start a new string if we found a control character
                output.append(str(s))
                s = ''
        else:
            s += c
    # clean up any left over string
    if len(s) > 0:
        output.append(str(s))
    # return extracted strings
    return output
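# Usage sketch (Python 3: `unicode(val)` above becomes `str(val)`): the text is split
# wherever a character in major category 'C' (control) appears, and the non-empty
# pieces are returned.
import unicodedata

def split_by_control_chars(val):
    output, s = [], ''
    for c in str(val):
        if unicodedata.category(c)[0] == 'C':
            if s:
                output.append(s)
                s = ''
        else:
            s += c
    if s:
        output.append(s)
    return output

print(split_by_control_chars('abc\x00def\r\nghi'))  # ['abc', 'def', 'ghi']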
def calibrate(self):
    data = (u'a', u'1', u' ', u'\u1234', u'\uFFFF')
    len_data = len(data)
    digit = unicodedata.digit
    numeric = unicodedata.numeric
    decimal = unicodedata.decimal
    category = unicodedata.category
    bidirectional = unicodedata.bidirectional
    decomposition = unicodedata.decomposition
    mirrored = unicodedata.mirrored
    combining = unicodedata.combining
    for i in xrange(self.rounds):
        c = data[i % len_data]
def _test_matching_pattern(self, pattern, isvalidchar, unicode=False):
    r = unicode_regex(pattern) if unicode else ascii_regex(pattern)
    codepoints = six.moves.range(0, sys.maxunicode + 1) \
        if unicode else six.moves.range(1, 128)
    for c in [six.unichr(x) for x in codepoints]:
        if isvalidchar(c):
            assert r.match(c), (
                '"%s" supposed to match "%s" (%r, category "%s"), '
                'but it doesnt' % (pattern, c, c, unicodedata.category(c))
            )
        else:
            assert not r.match(c), (
                '"%s" supposed not to match "%s" (%r, category "%s"), '
                'but it does' % (pattern, c, c, unicodedata.category(c))
            )
def domains(self):
    with app.app_context():
        if not r.table_list().contains('domains').run(db.conn):
            log.info("Table domains not found, creating...")
            r.table_create('domains', primary_key="id").run(db.conn)
            r.table('domains').index_create("status").run(db.conn)
            r.table('domains').index_wait("status").run(db.conn)
            r.table('domains').index_create("hyp_started").run(db.conn)
            r.table('domains').index_wait("hyp_started").run(db.conn)
            r.table('domains').index_create("user").run(db.conn)
            r.table('domains').index_wait("user").run(db.conn)
            r.table('domains').index_create("group").run(db.conn)
            r.table('domains').index_wait("group").run(db.conn)
            r.table('domains').index_create("category").run(db.conn)
            r.table('domains').index_wait("category").run(db.conn)
            r.table('domains').index_create("kind").run(db.conn)
            r.table('domains').index_wait("kind").run(db.conn)
    return True
def lstrip_token(token):
    '''Strips some characters from the left side of a token

    Characters which have a type listed in CATEGORIES_TO_STRIP_FROM_TOKENS
    are stripped from the left side of a token.

    The stripped token is returned.

    :param token: The token where characters may be stripped from
    :type token: String
    :rtype: String

    Examples:

    >>> lstrip_token(".'foo'.")
    "foo'."
    '''
    token = token.lstrip()
    while (len(token) > 0
           and
           unicodedata.category(token[0]) in CATEGORIES_TO_STRIP_FROM_TOKENS):
        token = token[1:]
    return token
def rstrip_token(token):
    '''Strips some characters from the right side of a token

    Characters which have a type listed in CATEGORIES_TO_STRIP_FROM_TOKENS
    are stripped from the right side of a token.

    The stripped token is returned.

    :param token: The token where characters may be stripped from
    :type token: String
    :rtype: String

    Examples:

    >>> rstrip_token(".'foo'.")
    ".'foo"
    '''
    token = token.rstrip()
    while (len(token) > 0
           and
           unicodedata.category(token[-1]) in CATEGORIES_TO_STRIP_FROM_TOKENS):
        token = token[0:-1]
    return token
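# `CATEGORIES_TO_STRIP_FROM_TOKENS` is defined elsewhere in the original module; judging
# by the doctests it lists punctuation categories. A stand-in (assumption) so the two
# helpers above run on their own:
import unicodedata

CATEGORIES_TO_STRIP_FROM_TOKENS = ('Po', 'Pi', 'Pf', 'Ps', 'Pe', 'Pc', 'Pd')

print(repr(lstrip_token(".'foo'.")))                 # "foo'."
print(repr(rstrip_token(".'foo'.")))                 # ".'foo"
print(repr(rstrip_token(lstrip_token(".'foo'."))))   # 'foo'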
def remove_accents(text):
    '''Removes accents from the text

    Returns the text with all accents removed

    Using “from unidecode import unidecode” is more
    sophisticated, but I am not sure whether I can require
    “unidecode”.

    :param text: The text to change
    :type text: string
    :rtype: string

    Examples:

    >>> remove_accents('Ångstrøm')
    'Angstrom'

    >>> remove_accents('ÅÆæŒœĳøßẞü')
    'AAEaeOEoeijossSSu'
    '''
    return ''.join([
        x for x in unicodedata.normalize('NFKD', text)
        if unicodedata.category(x) != 'Mn']).translate(TRANS_TABLE)
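# `TRANS_TABLE` is defined elsewhere in the original module; it handles letters that NFKD
# cannot decompose (ø, æ, œ, ß, ...). A partial stand-in (assumption) that is sufficient
# for the doctests above:
TRANS_TABLE = {
    ord('ß'): 'ss', ord('ẞ'): 'SS',
    ord('ø'): 'o',  ord('Ø'): 'O',
    ord('æ'): 'ae', ord('Æ'): 'AE',
    ord('œ'): 'oe', ord('Œ'): 'OE',
}

print(remove_accents('Ångstrøm'))    # Angstrom
print(remove_accents('ÅÆæŒœĳøßẞü'))  # AAEaeOEoeijossSSu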
def contains_letter(text):
    '''Returns whether “text” contains a “letter” type character

    :param text: The text to check
    :type text: string
    :rtype: boolean

    Examples:

    >>> contains_letter('Hi!')
    True

    >>> contains_letter(':-)')
    False
    '''
    for char in text:
        category = unicodedata.category(char)
        if category in ('Ll', 'Lu', 'Lo',):
            return True
    return False
def _is_safe_url(url, allowed_hosts, require_https=False):
    # Chrome considers any URL with more than two slashes to be absolute, but
    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
    if url.startswith('///'):
        return False
    url_info = urlparse(url)
    # Forbid URLs like http:///example.com - with a scheme, but without a hostname.
    # In that URL, example.com is not the hostname but, a path component. However,
    # Chrome will still consider example.com to be the hostname, so we must not
    # allow this syntax.
    if not url_info.netloc and url_info.scheme:
        return False
    # Forbid URLs that start with control characters. Some browsers (like
    # Chrome) ignore quite a few control characters at the start of a
    # URL and might consider the URL as scheme relative.
    if unicodedata.category(url[0])[0] == 'C':
        return False
    scheme = url_info.scheme
    # Consider URLs without a scheme (e.g. //example.com/p) to be http.
    if not url_info.scheme and url_info.netloc:
        scheme = 'http'
    valid_schemes = ['https'] if require_https else ['http', 'https']
    return ((not url_info.netloc or url_info.netloc in allowed_hosts) and
            (not scheme or scheme in valid_schemes))
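# Usage sketch for the newer signature (assumes `from urllib.parse import urlparse` and
# `import unicodedata` are in scope): `allowed_hosts` is a collection of acceptable hosts
# and `require_https=True` rejects plain-http targets.
allowed = {'example.com', 'www.example.com'}
print(_is_safe_url('/dashboard/', allowed))                                    # True
print(_is_safe_url('//www.example.com/next', allowed))                         # True (scheme-relative, treated as http)
print(_is_safe_url('http://example.com/next', allowed, require_https=True))    # False
print(_is_safe_url('https://example.com/next', allowed, require_https=True))   # True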
def capitulos(item):
    logger.info()
    itemlist = []
    data = item.extra
    thumbnail = scrapertools.get_match(data, 'background-image:url\(\'([^"]+)\'')
    thumbnail = re.sub(r"w185", "original", thumbnail)
    patron = '<a href="([^"]+)".*?<br\/><i>(.*?)<\/i>'
    matches = re.compile(patron, re.DOTALL).findall(data)
    for url, capitulo in matches:
        capitulo = re.sub(r"Cap.*?tulo", "", capitulo)
        capitulo = "[COLOR floralwhite][B]" + capitulo + "[/B][/COLOR]"
        if capitulo == item.extra.split("|")[4]:
            continue
        if not ".jpg" in item.extra.split("|")[2]:
            fanart = item.show.split("|")[0]
        else:
            fanart = item.extra.split("|")[2]
        itemlist.append(Item(channel=item.channel, title=capitulo, action="findvideos", url=url, thumbnail=thumbnail, extra="fv2" + "|" + item.extra.split("|")[3], show=item.show, category=item.category, fanart=fanart, folder=True))
    return itemlist
def findvideos(item):
    logger.info()
    itemlist = []
    temp = item.fulltitle.split("|")[0]
    epi = item.fulltitle.split("|")[1]
    url_temp = "http://api.themoviedb.org/3/tv/" + item.show.split("|")[5] + "/season/" + temp + "/images?api_key=" + api_key + ""
    data = httptools.downloadpage(url_temp).data
    data = re.sub(r"\n|\r|\t|\s{2}| ", "", data)
    patron = '{"id".*?"file_path":"(.*?)","height"'
    matches = re.compile(patron, re.DOTALL).findall(data)
    if len(matches) == 0:
        thumbnail = item.thumbnail
    for thumtemp in matches:
        thumbnail = "https://image.tmdb.org/t/p/original" + thumtemp
    title = item.show.split("|")[3] + " " + temp + "x" + epi
    title = "[COLOR lightgreen]" + title + "[/COLOR]"
    itemlist.append(Item(channel=item.channel, title=title, action="play", url=item.url, server="torrent", thumbnail=item.show.split("|")[4], extra=item.extra, show=item.show, fanart=item.show.split("|")[0], fulltitle=title, folder=False))
    extra = item.extra + "|" + temp + "|" + epi
    title_info = " Info"
    title_info = "[COLOR darkseagreen]" + title_info + "[/COLOR]"
    itemlist.append(Item(channel=item.channel, action="info_capitulos", title=title_info, url=item.url, thumbnail=thumbnail, fanart=item.show.split("|")[1], extra=extra, show=item.show, category=item.category, folder=False))
    return itemlist
def play(item):
    logger.info()
    itemlist = servertools.find_video_items(data=item.url)
    data = scrapertools.cache_page(item.url)
    listavideos = servertools.findvideos(data)
    for video in listavideos:
        videotitle = scrapertools.unescape(video[0])
        url = item.url
        server = video[2]
        # xbmctools.addnewvideo( item.channel , "play" , category , server , , url , thumbnail , plot )
        itemlist.append(Item(channel=item.channel, action="play", server=server, title="Trailer - " + videotitle, url=url, thumbnail=item.thumbnail, plot=item.plot, fulltitle=item.title, fanart="http://s23.postimg.org/84vkeq863/movietrailers.jpg", folder=False))
    return itemlist