def detect_test_from_requests(self, stream_mode=False):
    """Report whether the received data carries this test's httpwookiee marker.

    The marker has the form ``httpw=--<id>--``.  In stream mode the text
    form of ``self.stream`` is searched; otherwise the raw first line of
    the first entry in ``self.requests`` is decoded as UTF-8 and searched.

    Returns ``True`` when a marker is found and its id equals
    ``self.test_id``, ``False`` otherwise.
    """
    if stream_mode:
        # six is only needed for the stream branch, hence the local import.
        import six
        haystack = six.text_type(self.stream)
    else:
        haystack = self.requests[0].first_line.raw.decode('utf8')
    marker = re.match(r'.*httpw=--(.*)--.*', haystack, re.S)
    if not marker:
        return False
    return True if self.test_id == marker.group(1) else False
# Example source code for Python class S
def parse_one_page(html):
    """Yield one dict per article ``<li>`` entry matched in *html*.

    Each dict has the keys ``author``, ``link``, ``title``, ``abstract``,
    ``read-num``, ``comment-num``, ``like-num`` and ``money-num``.  The
    link is prefixed with the jianshu host; the abstract, read count and
    comment count are stripped of surrounding whitespace, the remaining
    fields are yielded exactly as captured.
    """
    entry_re = re.compile(
        '<li.*?blue-link".*?>(.*?)</a>.*?title".*?href="(.*?)">(.*?)</a>.*?abstract">(.*?)</p>.*?ic-list-read">.*?'
        +'</i>(.*?)</a>.*?ic-list-comments.*?</i>(.*?)</a>.*?ic-list-like.*?</i>(.*?)</span>.*?ic-list-money.*?</i>(.*?)</span>.*?</li>',
        re.S)
    for (author, href, title, abstract,
         reads, comments, likes, money) in entry_re.findall(html):
        yield {
            'author': author,
            'link': "http://www.jianshu.com" + href,
            'title': title,
            'abstract': abstract.strip(),
            'read-num': reads.strip(),
            'comment-num': comments.strip(),
            'like-num': likes,
            'money-num': money,
        }
def _do_code_blocks(self, text):
    """Rewrite indented Markdown code blocks in *text*.

    A code block is a run of lines that each start with a tab or with
    ``self.tab_width`` spaces, preceded by a blank line or the start of
    the document.  Every such run is replaced by whatever
    ``self._code_block_sub`` returns for the match; the rewritten text is
    returned.
    """
    width = self.tab_width
    # Verbose-mode pattern; the text is kept exactly as it was.
    block_pattern = r'''
(?:\n\n|\A\n?)
( # $1 = the code block -- one or more lines, starting with a space/tab
(?:
(?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
.*\n+
)+
)
((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
# Lookahead to make sure this block isn't already in a code block.
# Needed when syntax highlighting is being used.
(?![^<]*\</code\>)
''' % (width, width)
    finder = re.compile(block_pattern, re.M | re.X)
    return finder.sub(self._code_block_sub, text)
def test_blankpage(self) -> None:
    """Check that a freshly built document has the expected blank-page layout.

    The HTML returned by ``self.doc.build()`` is passed through
    ``remove_wdom_id`` and must match, from its start: a doctype, a
    ``<head>`` with a UTF-8 meta tag, a ``W-DOM`` title and an optional
    script, then a ``<body>`` that begins with a script element.
    """
    # Raw literals: ``\s`` in a plain string is an invalid escape sequence
    # and triggers a warning on current Python versions.  The compiled
    # pattern is unchanged.
    _re = re.compile(
        r'\s*<!DOCTYPE html>'
        r'\s*<html>'
        r'\s*<head>'
        r'\s*<meta charset="utf-8">'
        r'\s*<title>'
        r'\s*W-DOM'
        r'\s*</title>'
        r'(\s*<script type="text/javascript">.*?</script>)?'
        r'\s*</head>'
        r'\s*<body>'
        r'\s*<script type="text/javascript">'
        r'.*?</script>'
        r'\s*</body>'
        r'.*</html>',
        re.S
    )
    html = self.doc.build()
    self.assertIsNotNone(_re.match(remove_wdom_id(html)))
def _do_code_blocks(self, text):
    """Substitute every indented Markdown code block found in *text*.

    Lines qualify when they begin with a tab or ``self.tab_width`` spaces
    and the run follows a blank line or the start of the document.  Each
    run is handed to ``self._code_block_sub`` as a match object and
    replaced by its return value.
    """
    indent = self.tab_width
    # Verbose-mode pattern; the text is kept exactly as it was.
    source = r'''
(?:\n\n|\A\n?)
( # $1 = the code block -- one or more lines, starting with a space/tab
(?:
(?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces
.*\n+
)+
)
((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
# Lookahead to make sure this block isn't already in a code block.
# Needed when syntax highlighting is being used.
(?![^<]*\</code\>)
''' % (indent, indent)
    return re.compile(source, re.M | re.X).sub(self._code_block_sub, text)
def detach_signature(txt):
    """Split a PGP clearsigned message into ``(payload, signature)``.

    Line endings are normalised to ``\\n`` before matching.  On a match the
    payload is returned with leading ``"- "`` dash-escapes removed and its
    line endings converted to CRLF, together with the text found between
    the signature BEGIN and END lines.  When *txt* does not have the
    expected shape, ``(txt, None)`` is returned with *txt* untouched.
    """
    # See RFC 4880, section 7
    # cf. debian.deb822.Deb822.split_gpg_and_payload (which doesn't handle dash
    # escaping and doesn't verify that the input is well-formed)
    normalized = re.sub(r'\r\n?', '\n', txt)
    clearsigned = re.match(
        r'^\s*-----BEGIN PGP SIGNED MESSAGE-----\n'
        r'(?:[^\n]+\n)*'
        r'\n'
        r'(.*)\n'
        r'-----BEGIN PGP SIGNATURE-----\n'
        r'(.*)\n'
        r'-----END PGP SIGNATURE-----\s*$',
        normalized, flags=re.S)
    if not clearsigned:
        return (txt, None)
    ### TODO: Also return the armor headers?
    payload, signature = clearsigned.groups()
    unescaped = re.sub('^- ', '', payload, flags=re.M)
    return (unescaped.replace('\n', '\r\n'), signature)
def split_arg_string(string):
    """Split an argument string into a list of individual arguments.

    Tokens are single-quoted runs, double-quoted runs, or runs of
    non-whitespace.  A token that starts and ends with the same quote
    character has the quotes removed and its backslash escapes
    interpreted.  Each token is converted back to the type of *string*
    where that is possible.
    """
    token_re = (r"('([^'\\]*(?:\\.[^'\\]*)*)'"
                r'|"([^"\\]*(?:\\.[^"\\]*)*)"'
                r'|\S+)\s*')
    pieces = []
    for token in re.finditer(token_re, string, re.S):
        piece = token.group().strip()
        edge = piece[:1]
        if edge == piece[-1:] and edge in '"\'':
            # Drop the surrounding quotes, then resolve backslash escapes.
            inner = piece[1:-1].encode('ascii', 'backslashreplace')
            piece = inner.decode('unicode-escape')
        try:
            piece = type(string)(piece)
        except UnicodeError:
            # Keep the decoded value when it cannot be coerced back.
            pass
        pieces.append(piece)
    return pieces
def compile_rules(environment):
    """Build the list of ``(kind, regex)`` start rules for *environment*.

    One rule is produced for each of the comment, block and variable start
    strings, plus one each for the line statement and line comment
    prefixes when they are not ``None``.  Rules are ordered by descending
    delimiter length, with ties broken by descending rule name and then
    regex text.
    """
    esc = re.escape
    weighted = []
    for kind, delimiter in (
        ('comment', environment.comment_start_string),
        ('block', environment.block_start_string),
        ('variable', environment.variable_start_string),
    ):
        weighted.append((len(delimiter), kind, esc(delimiter)))

    statement_prefix = environment.line_statement_prefix
    if statement_prefix is not None:
        weighted.append((len(statement_prefix), 'linestatement',
                         r'^[ \t\v]*' + esc(statement_prefix)))

    comment_prefix = environment.line_comment_prefix
    if comment_prefix is not None:
        weighted.append((len(comment_prefix), 'linecomment',
                         r'(?:^|(?<=\S))[^\S\r\n]*' + esc(comment_prefix)))

    # The length is only a sort key; it is dropped from the result.
    weighted.sort(reverse=True)
    return [(kind, regex) for _, kind, regex in weighted]
def handlehtml(url):
    """Fetch *url* and return one yinyuetai video URL found in the response.

    The page is requested with a desktop browser User-Agent and its body
    is scanned for video URLs that are followed by ``&br``.

    Returns the third match when at least three are found, otherwise the
    last match available.  Returns ``None``, after printing a failure
    message, when the request fails or no URL is found.
    """
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    failure_message = 'Reading vodeolist failed!'
    try:
        request = urllib2.Request(url, None, headers)
        response = urllib2.urlopen(request)
        try:
            data = response.read()
        finally:
            # Release the connection even when the read fails.
            response.close()
        reg = re.compile(r"http://\w*?\.yinyuetai\.com/uploads/videos/common/.*?(?=&br)", re.S)
        findlist = reg.findall(data)
    except Exception:
        # Best-effort lookup: report and give up, but let KeyboardInterrupt
        # and SystemExit propagate (a bare ``except`` swallowed them).
        print(failure_message)
        return None
    if not findlist:
        # Previously reached through a swallowed IndexError.
        print(failure_message)
        return None
    # HC(432p) HD(540p) HE(720p)
    # presumably matches appear in that order, so later is better -- verify
    return findlist[min(len(findlist), 3) - 1]
def splitpasswd(user):
    '''Split ``user:passwd`` at the first colon, like urllib.splitpasswd().

    Returns ``(user, passwd)`` when *user* contains a colon, otherwise
    ``(user, None)``.  Everything after the first colon, including further
    colons and newlines, belongs to the password.
    '''
    name, colon, password = user.partition(':')
    if colon:
        return name, password
    return user, None
def now_date(self):
    """Return the current local time formatted as ``YYYYMMDDHHMMSS``."""
    return datetime.datetime.now().strftime("%Y%m%d%H%M%S")
# ???????
def splitpasswd(user):
    '''Stand-in for urllib.splitpasswd(), which six does not provide.

    Returns ``(user, passwd)`` split at the first colon, or
    ``(user, None)`` when *user* has no colon.
    '''
    found = re.match('^([^:]*):(.*)$', user, re.S)
    if found is None:
        return user, None
    return found.group(1), found.group(2)
def splitpasswd(user):
    '''Replacement for urllib.splitpasswd(), which six lacks.

    Splits *user* at its first colon and returns ``(user, passwd)``;
    returns ``(user, None)`` when there is no colon to split on.
    '''
    credentials = re.compile('^([^:]*):(.*)$', re.S).match(user)
    return credentials.groups() if credentials else (user, None)
def parse_attr(self):
    """Fill attribute fields from two paragraphs selected via ``self.html``.

    The text of the ``col-xs-6`` paragraph is searched for ``type_``,
    ``nation``, ``language``, ``duration``, ``date`` and ``douban``; the
    text of the wider paragraph is searched for ``actor`` and
    ``director``.  An attribute is only assigned when its pattern matches,
    and receives the first capture group unchanged.

    NOTE(review): the pattern literals consist mostly of ``?`` characters,
    which looks like non-ASCII text lost to an encoding error.  As written
    they start with a bare quantifier and ``re`` rejects them -- restore
    the literals from the upstream source.
    """
    summary = self.html("p[class='col-xs-6']").text()
    summary_fields = (
        ('type_', '??? (.+) ??'),
        ('nation', '??? (.+) ??'),
        ('language', '??? (.+) ??'),
        ('duration', '??? (.+) ????'),
        ('date', '????? (.+) ????'),
        ('douban', '????? (.+) ??'),
    )
    for attribute, pattern in summary_fields:
        hit = re.search(pattern, summary)
        if hit:
            setattr(self, attribute, hit.group(1))

    people = self.html("p[class='col-lg-7 col-md-7 col-sm-7 col-xs-12']").text()
    # Only the actor search uses re.S.
    cast = re.search('??? (.+) ??', people, flags=re.S)
    if cast:
        self.actor = cast.group(1)
    directed_by = re.search('??? (.+)$', people)
    if directed_by:
        self.director = directed_by.group(1)
def _parse_name(self, information):
    """Search *information* and store the result in ``self.name``.

    Three patterns are tried in order; the first that matches supplies
    capture group 1, which is stripped and assigned.  Nothing is assigned
    when none match.

    NOTE(review): the pattern literals consist mostly of ``?`` characters,
    which looks like non-ASCII text lost to an encoding error.  As written
    ``re`` rejects them -- restore the literals from the upstream source.
    """
    attempts = (
        ('?? ???(.+?)?', 0),
        ('?????(.+?)?', re.S),
        ('?? ????(.+?)?', 0),
    )
    for pattern, flags in attempts:
        hit = re.search(pattern, information, flags=flags)
        if hit:
            self.name = hit.group(1).strip()
            return
def _parse_original_name(self, information):
    """Search *information* and store the result in ``self.original_name``.

    Three patterns are tried in order, the last two with ``re.S``.  The
    first match supplies capture group 1, which is stripped and assigned;
    nothing is assigned when none match.

    NOTE(review): the pattern literals consist mostly of ``?`` characters,
    which looks like non-ASCII text lost to an encoding error.  As written
    ``re`` rejects them -- restore the literals from the upstream source.
    """
    hit = None
    for pattern, flags in (
        ('?? ???(.+?)?', 0),
        ('?????(.+?)?', re.S),
        ('?? ????(.+?)?', re.S),
    ):
        hit = re.search(pattern, information, flags=flags)
        if hit:
            break
    if hit:
        self.original_name = hit.group(1).strip()
def _parse_nation(self, information):
    """Search *information* and store the result in ``self.nation``.

    Four patterns are tried in order, all but the first with ``re.S``.
    The first match supplies capture group 1, which is stripped and
    assigned; nothing is assigned when none match.

    NOTE(review): the pattern literals consist mostly of ``?`` characters,
    which looks like non-ASCII text lost to an encoding error.  As written
    ``re`` rejects them -- restore the literals from the upstream source.
    """
    attempts = [
        ('?? ???(.+?)?', 0),
        ('?????(.+?)?', re.S),
        ('?????(.+?)?', re.S),
        ('?? ????(.+?)?', re.S),
    ]
    for pattern, flags in attempts:
        located = re.search(pattern, information, flags=flags)
        if located:
            self.nation = located.group(1).strip()
            break
def _parse_type(self, information):
    """Search *information* and store the result in ``self.type_``.

    Three patterns are tried in order, the last two with ``re.S``.  The
    first match supplies capture group 1, which is stripped and assigned;
    nothing is assigned when none match.

    NOTE(review): the pattern literals consist mostly of ``?`` characters,
    which looks like non-ASCII text lost to an encoding error.  As written
    ``re`` rejects them -- restore the literals from the upstream source.
    """
    fallbacks = (
        ('?? ???(.+?)?', 0),
        ('?????(.+?)?', re.S),
        ('?? ????(.+?)?', re.S),
    )
    for pattern, flags in fallbacks:
        outcome = re.search(pattern, information, flags=flags)
        if not outcome:
            continue
        self.type_ = outcome.group(1).strip()
        return
def _parse_imdb(self, information):
    """Search *information* for an IMDB field and store it in ``self.imdb``.

    Four spelling variants of the label are tried in order, all but the
    first with ``re.S``.  The first match supplies capture group 1, which
    is stripped and assigned; nothing is assigned when none match.

    NOTE(review): apart from the ``IMDB``/``IMDb`` text, the pattern
    literals consist of ``?`` characters, which looks like non-ASCII text
    lost to an encoding error.  As written ``re`` rejects them -- restore
    the literals from the upstream source.
    """
    variants = (
        ('?IMDB ? ??(.+?)?', 0),
        ('?IMDB??(.+?)?', re.S),
        ('?IMDb??(.+?)?', re.S),
        ('?IMDB????(.+?)?', re.S),
    )
    for pattern, flags in variants:
        rating = re.search(pattern, information, flags=flags)
        if rating:
            self.imdb = rating.group(1).strip()
            return
def _parse_douban(self, information):
    """Search *information* and store the result in ``self.douban``.

    A single ``re.S`` search is made; on a match, capture group 1 is
    stripped and assigned.  Nothing is assigned otherwise.

    NOTE(review): the pattern literal consists mostly of ``?`` characters,
    which looks like non-ASCII text lost to an encoding error.  As written
    ``re`` rejects it -- restore the literal from the upstream source.
    """
    score = re.search('?????(.+?)?', information, flags=re.S)
    if not score:
        return
    self.douban = score.group(1).strip()