def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
python类LOCALE的实例源码
def _encode_regex(name, value, dummy0, dummy1):
"""Encode a python regex or bson.regex.Regex."""
flags = value.flags
# Python 2 common case
if flags == 0:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"\x00"
# Python 3 common case
elif flags == re.UNICODE:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"u\x00"
else:
sflags = b""
if flags & re.IGNORECASE:
sflags += b"i"
if flags & re.LOCALE:
sflags += b"l"
if flags & re.MULTILINE:
sflags += b"m"
if flags & re.DOTALL:
sflags += b"s"
if flags & re.UNICODE:
sflags += b"u"
if flags & re.VERBOSE:
sflags += b"x"
sflags += b"\x00"
return b"\x0B" + name + _make_c_string_check(value.pattern) + sflags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def _encode_regex(name, value, dummy0, dummy1):
"""Encode a python regex or bson.regex.Regex."""
flags = value.flags
# Python 2 common case
if flags == 0:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"\x00"
# Python 3 common case
elif flags == re.UNICODE:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"u\x00"
else:
sflags = b""
if flags & re.IGNORECASE:
sflags += b"i"
if flags & re.LOCALE:
sflags += b"l"
if flags & re.MULTILINE:
sflags += b"m"
if flags & re.DOTALL:
sflags += b"s"
if flags & re.UNICODE:
sflags += b"u"
if flags & re.VERBOSE:
sflags += b"x"
sflags += b"\x00"
return b"\x0B" + name + _make_c_string_check(value.pattern) + sflags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def _encode_regex(name, value, dummy0, dummy1):
"""Encode a python regex or bson.regex.Regex."""
flags = value.flags
# Python 2 common case
if flags == 0:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"\x00"
# Python 3 common case
elif flags == re.UNICODE:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"u\x00"
else:
sflags = b""
if flags & re.IGNORECASE:
sflags += b"i"
if flags & re.LOCALE:
sflags += b"l"
if flags & re.MULTILINE:
sflags += b"m"
if flags & re.DOTALL:
sflags += b"s"
if flags & re.UNICODE:
sflags += b"u"
if flags & re.VERBOSE:
sflags += b"x"
sflags += b"\x00"
return b"\x0B" + name + _make_c_string_check(value.pattern) + sflags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def _encode_regex(name, value, dummy0, dummy1):
"""Encode a python regex or bson.regex.Regex."""
flags = value.flags
# Python 2 common case
if flags == 0:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"\x00"
# Python 3 common case
elif flags == re.UNICODE:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"u\x00"
else:
sflags = b""
if flags & re.IGNORECASE:
sflags += b"i"
if flags & re.LOCALE:
sflags += b"l"
if flags & re.MULTILINE:
sflags += b"m"
if flags & re.DOTALL:
sflags += b"s"
if flags & re.UNICODE:
sflags += b"u"
if flags & re.VERBOSE:
sflags += b"x"
sflags += b"\x00"
return b"\x0B" + name + _make_c_string_check(value.pattern) + sflags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def _encode_regex(name, value, dummy0, dummy1):
"""Encode a python regex or bson.regex.Regex."""
flags = value.flags
# Python 2 common case
if flags == 0:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"\x00"
# Python 3 common case
elif flags == re.UNICODE:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"u\x00"
else:
sflags = b""
if flags & re.IGNORECASE:
sflags += b"i"
if flags & re.LOCALE:
sflags += b"l"
if flags & re.MULTILINE:
sflags += b"m"
if flags & re.DOTALL:
sflags += b"s"
if flags & re.UNICODE:
sflags += b"u"
if flags & re.VERBOSE:
sflags += b"x"
sflags += b"\x00"
return b"\x0B" + name + _make_c_string_check(value.pattern) + sflags
def findReplaceFlags(self, tokens):
"""Map letters in |tokens| to re flags."""
flags = re.MULTILINE
if 'i' in tokens:
flags |= re.IGNORECASE
if 'l' in tokens:
# Affects \w, \W, \b, \B.
flags |= re.LOCALE
if 'm' in tokens:
# Affects ^, $.
flags |= re.MULTILINE
if 's' in tokens:
# Affects ..
flags |= re.DOTALL
if 'x' in tokens:
# Affects whitespace and # comments.
flags |= re.VERBOSE
if 'u' in tokens:
# Affects \w, \W, \b, \B.
flags |= re.UNICODE
if 0:
tokens = re.sub('[ilmsxu]', '', tokens)
if len(tokens):
self.setMessage('unknown regex flags '+tokens)
return flags
generate_stem_pos_tag.py 文件源码
项目:kaggle-quora-solution-8th
作者: qqgeogor
项目源码
文件源码
阅读 31
收藏 0
点赞 0
评论 0
def pos_tag_text(line,
token_pattern=token_pattern,
exclude_stopword=stopwords,
encode_digit=False):
token_pattern = re.compile(token_pattern, flags = re.UNICODE | re.LOCALE)
for name in ["question1", "question2"]:
l = line[name]
## tokenize
tokens = [x.lower() for x in token_pattern.findall(l)]
## stem
#tokens=l.lower().split()
#print tokens
tokens = stem_tokens(tokens, english_stemmer)
line[name+'_stem']=' '.join(tokens)
#print tokens
if exclude_stopword:
tokens = [x for x in tokens if x not in stopwords]
tags = pos_tag(tokens)
tags_list = [t for w,t in tags]
tags_str = " ".join(tags_list)
#print tags_str
line[name+'_pos_tag'] = tags_str
return line[[ u'question1_stem', u'question1_pos_tag', u'question2_stem',
u'question2_pos_tag']]
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def _encode_regex(name, value, dummy0, dummy1):
"""Encode a python regex or bson.regex.Regex."""
flags = value.flags
# Python 2 common case
if flags == 0:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"\x00"
# Python 3 common case
elif flags == re.UNICODE:
return b"\x0B" + name + _make_c_string_check(value.pattern) + b"u\x00"
else:
sflags = b""
if flags & re.IGNORECASE:
sflags += b"i"
if flags & re.LOCALE:
sflags += b"l"
if flags & re.MULTILINE:
sflags += b"m"
if flags & re.DOTALL:
sflags += b"s"
if flags & re.UNICODE:
sflags += b"u"
if flags & re.VERBOSE:
sflags += b"x"
sflags += b"\x00"
return b"\x0B" + name + _make_c_string_check(value.pattern) + sflags
def str_flags_to_int(str_flags):
flags = 0
if "i" in str_flags:
flags |= re.IGNORECASE
if "l" in str_flags:
flags |= re.LOCALE
if "m" in str_flags:
flags |= re.MULTILINE
if "s" in str_flags:
flags |= re.DOTALL
if "u" in str_flags:
flags |= re.UNICODE
if "x" in str_flags:
flags |= re.VERBOSE
return flags
def get_flags(self):
flags = 0
for option in self.options:
if option == 'i':
flags |= re.IGNORECASE
elif option == 'l':
flags |= re.LOCALE
elif option == 'm':
flags |= re.MULTILINE
elif option == 's':
flags |= re.DOTALL
elif option == 'u':
flags |= re.UNICODE
return flags
def processLine(line):
titlep=re.compile('^# (\\w+.+$)', re.LOCALE) #group 1
quotep1=re.compile('^\\t> (.+$)', re.LOCALE) #group 1
quotep2=re.compile('^\\t (.+$)', re.LOCALE) #group 1
notep1=re.compile('^\\t- (.+$)', re.LOCALE) #group 1
notep2=re.compile('^\\t (.+$)', re.LOCALE) #group 1
citep=re.compile('^\\t\\t- (@\\w+)') #group1
tagsp=re.compile('^\\t\\t(- Tags: | {8})(.+$)') #group2
ctimep=re.compile('^\\t\\t- Ctime: (.+$)') #group1
patterns={'title': [titlep,1],\
'quote1': [quotep1,1],\
'quote2': [quotep2,1],\
'note1': [notep1,1],\
'note2': [notep2,1],\
'cite': [citep,1],\
'tags': [tagsp,2],\
'ctime': [ctimep,1]\
}
for kk,vv in patterns.items():
m=vv[0].match(line)
if m:
return kk,m.group(vv[1]).encode('utf8')
return None,''
#------------Organize by tags and save------------
def flags(key):
flag = 0
if 'a' in key:
flag += re.ASCII
if 'i' in key:
flag += re.IGNORECASE
if 'l' in key:
flag += re.LOCALE
if 'm' in key:
flag += re.MULTILINE
if 's' in key:
flag += re.DOTALL
if 'x' in key:
flag += re.VERBOSE
return flag
def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
'/foo/' -> re.compile('foo')
'/foo/i' -> re.compile('foo', re.I)
"""
if s.startswith('/') and s.rfind('/') != 0:
# Parse it: /PATTERN/FLAGS
idx = s.rfind('/')
pattern, flags_str = s[1:idx], s[idx+1:]
flag_from_char = {
"i": re.IGNORECASE,
"l": re.LOCALE,
"s": re.DOTALL,
"m": re.MULTILINE,
"u": re.UNICODE,
}
flags = 0
for char in flags_str:
try:
flags |= flag_from_char[char]
except KeyError:
raise ValueError("unsupported regex flag: '%s' in '%s' "
"(must be one of '%s')"
% (char, s, ''.join(list(flag_from_char.keys()))))
return re.compile(s[1:idx], flags)
else: # not an encoded regex
return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)
def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
'/foo/' -> re.compile('foo')
'/foo/i' -> re.compile('foo', re.I)
"""
if s.startswith('/') and s.rfind('/') != 0:
# Parse it: /PATTERN/FLAGS
idx = s.rfind('/')
pattern, flags_str = s[1:idx], s[idx+1:]
flag_from_char = {
"i": re.IGNORECASE,
"l": re.LOCALE,
"s": re.DOTALL,
"m": re.MULTILINE,
"u": re.UNICODE,
}
flags = 0
for char in flags_str:
try:
flags |= flag_from_char[char]
except KeyError:
raise ValueError("unsupported regex flag: '%s' in '%s' "
"(must be one of '%s')"
% (char, s, ''.join(list(flag_from_char.keys()))))
return re.compile(s[1:idx], flags)
else: # not an encoded regex
return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)
def is_valid_blacklist_pattern(cls, pattern):
"""
If value contains at least five characters (not spaces), consider this a valid pattern
:param pattern: The regex pattern to check
:return: True if pattern is valid
"""
matches = re.findall('[\S]+', pattern, re.LOCALE)
if len(matches) < 5:
return False
if any(re.search(pattern, t, re.LOCALE) for t in ['', ' ', 'JUST SOME TEST', "\n"]):
return False
return True
def is_blacklisted(cls, value):
return any(re.search(p, value, re.LOCALE) for p in cls.read_items() if p.strip())
def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
'/foo/' -> re.compile('foo')
'/foo/i' -> re.compile('foo', re.I)
"""
if s.startswith('/') and s.rfind('/') != 0:
# Parse it: /PATTERN/FLAGS
idx = s.rfind('/')
pattern, flags_str = s[1:idx], s[idx+1:]
flag_from_char = {
"i": re.IGNORECASE,
"l": re.LOCALE,
"s": re.DOTALL,
"m": re.MULTILINE,
"u": re.UNICODE,
}
flags = 0
for char in flags_str:
try:
flags |= flag_from_char[char]
except KeyError:
raise ValueError("unsupported regex flag: '%s' in '%s' "
"(must be one of '%s')"
% (char, s, ''.join(list(flag_from_char.keys()))))
return re.compile(s[1:idx], flags)
else: # not an encoded regex
return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)
def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
'/foo/' -> re.compile('foo')
'/foo/i' -> re.compile('foo', re.I)
"""
if s.startswith('/') and s.rfind('/') != 0:
# Parse it: /PATTERN/FLAGS
idx = s.rfind('/')
pattern, flags_str = s[1:idx], s[idx+1:]
flag_from_char = {
"i": re.IGNORECASE,
"l": re.LOCALE,
"s": re.DOTALL,
"m": re.MULTILINE,
"u": re.UNICODE,
}
flags = 0
for char in flags_str:
try:
flags |= flag_from_char[char]
except KeyError:
raise ValueError("unsupported regex flag: '%s' in '%s' "
"(must be one of '%s')"
% (char, s, ''.join(list(flag_from_char.keys()))))
return re.compile(s[1:idx], flags)
else: # not an encoded regex
return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)
def _regex_from_encoded_pattern(s):
"""'foo' -> re.compile(re.escape('foo'))
'/foo/' -> re.compile('foo')
'/foo/i' -> re.compile('foo', re.I)
"""
if s.startswith('/') and s.rfind('/') != 0:
# Parse it: /PATTERN/FLAGS
idx = s.rfind('/')
pattern, flags_str = s[1:idx], s[idx+1:]
flag_from_char = {
"i": re.IGNORECASE,
"l": re.LOCALE,
"s": re.DOTALL,
"m": re.MULTILINE,
"u": re.UNICODE,
}
flags = 0
for char in flags_str:
try:
flags |= flag_from_char[char]
except KeyError:
raise ValueError("unsupported regex flag: '%s' in '%s' "
"(must be one of '%s')"
% (char, s, ''.join(list(flag_from_char.keys()))))
return re.compile(s[1:idx], flags)
else: # not an encoded regex
return re.compile(re.escape(s))
# Recipe: dedent (0.1.2)