def __init__(self):
super(EmuApi, self).__init__()
self.service = 'EmuParadise'
self.base_url = 'https://www.emuparadise.me'
self.referrer = None
self._parser = HTMLParser.HTMLParser()
self.endpoints = ENDPOINTS
self.response = self.get_response()
    self.search_regex = (r'<div class="roms">'
                         r'<a .*?href="(.*?)">(.*?)</a>.*?'
                         r'<a href="/roms/roms\.php\?sysid=(\d+)".*?class="sysname">'
                         r'(.*?)</a>.*?<b>Size:</b> (.*?) .*?</div>')
self.download_url = 'http://direct.emuparadise.me/roms/get-download.php?gid={download_id}' \
'&token={token}' \
'&mirror_available=true'
self.requires_arguments = True
self.token = '211217baa2d87c57b360b9a673a12cfd'
Example source code for the Python HTMLParser() class
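All of the snippets collected here target the Python 2 HTMLParser module; on Python 3 the class lives in html.parser, and from 3.4 onward plain unescaping is done with html.unescape. A minimal compatibility shim (illustrative only, not taken from any of the projects quoted below) might look like this:

# Illustrative compatibility helper, not part of any project quoted on this page.
try:
    # Python 2: module and class share the name HTMLParser
    from HTMLParser import HTMLParser
    unescape = HTMLParser().unescape
except ImportError:
    # Python 3.4+: unescape lives in the html module
    from html import unescape

# unescape('&amp;') returns '&' on either interpreter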
def getXKCDImageTitle ( html ):
comicBlock = find_last_between( html, 'div id="comic"', "</div>")
if not comicBlock:
return None
imageTitle = find_last_between( comicBlock, "alt=", ">" )
    # Drop the srcset= attribute if it is present
imageTitle = imageTitle.split('srcset=')[0]
h = HTMLParser()
imageTitle = h.unescape(imageTitle)
imageTitle = imageTitle.replace('"', '').strip()
imageTitle = imageTitle.replace('/', '').strip()
return imageTitle
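A rough usage sketch for getXKCDImageTitle; the sample HTML is made up and the exact result depends on how find_last_between (defined elsewhere in that project) slices its arguments:

# Hypothetical input; the real caller passes the full xkcd page source.
sample = '<div id="comic"><img src="//imgs.xkcd.com/comics/x.png" alt="Some &quot;Title&quot;" /></div>'
title = getXKCDImageTitle(sample)
# With the usual find_last_between semantics this yields: Some Title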
# Garfield Minus Garfield Methods
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
real_name = int(name.lstrip('X'), 16)
else:
real_name = int(name)
try:
data = unichr(real_name)
    except (ValueError, OverflowError) as e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
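For context, the workaround above resolves numeric character references by hand. A standalone sketch of the same conversion, outside any parser class (Python 2 spelling):

# Illustrative only: '&#x2019;' reaches handle_charref as 'x2019', '&#8217;' as '8217'.
def charref_to_text(name):
    if name.lower().startswith('x'):
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)
    try:
        return unichr(codepoint)
    except (ValueError, OverflowError):
        return u"\N{REPLACEMENT CHARACTER}"

# charref_to_text('x2019') == charref_to_text('8217') == u'\u2019'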
def feed(self, markup):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
    except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
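The warning above is why most projects pass an explicit parser name to Beautiful Soup rather than relying on the HTMLParser fallback; a short illustration (assumes bs4 is installed, lxml optional):

# Sketch: naming the tree builder explicitly when constructing a soup.
from bs4 import BeautifulSoup

markup = "<p>One<p>Two"                        # sloppy HTML
soup = BeautifulSoup(markup, "html.parser")    # stdlib fallback parser
# soup = BeautifulSoup(markup, "lxml")         # preferred when lxml is available
print(soup.find_all("p"))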
def get_steps(protocol_id):
"""
Get steps of a protocol
:param protocol_id: int, protocol id
:return: list, list of unresolved steps
"""
step_list = []
steps = Protocol.objects.filter(parent=protocol_id).order_by('step_order')
html_parser = HTMLParser()
workspace_path = settings['env']['workspace']
for index, step in enumerate(steps):
# priority for self-compiled tool
        software_path = os.path.join(workspace_path, str(step.user_id), 'bin',
                                     str(step.software))
if os.path.exists(software_path) and os.path.isfile(software_path):
step.software = software_path
step_list.append({
'id': index,
'parameter': html_parser.unescape(str(step.software).rstrip() + " " + str(step.parameter)),
'specify_output': step.specify_output,
'hash': step.hash,
})
return step_list
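The unescape call matters because step parameters are stored HTML-escaped; a small illustration of what it undoes (the stored value below is invented):

# Hypothetical stored parameter; get_steps() turns it back into a shell-ready string.
from HTMLParser import HTMLParser    # Python 2; use html.unescape on Python 3.4+

stored = 'samtools view -q 30 -o &quot;out.bam&quot; &amp;&amp; echo done'
print(HTMLParser().unescape(stored))
# samtools view -q 30 -o "out.bam" && echo done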
def twitter_url(match, bot=None):
# Find the tweet ID from the URL
tweet_id = match.group(1)
# Get the tweet using the tweepy API
api = get_api(bot)
if not api:
return
try:
tweet = api.get_status(tweet_id)
user = tweet.user
except tweepy.error.TweepError:
return
    # Format and return the text of the tweet
text = " ".join(tweet.text.split())
if user.verified:
prefix = u"\u2713"
else:
prefix = ""
time = timesince.timesince(tweet.created_at, datetime.utcnow())
h = HTMLParser()
return u"{}@\x02{}\x02 ({}): {} ({} ago)".format(prefix, user.screen_name, user.name, h.unescape(text), time)
def insert_to(project_url, destination, find_what, indent=0):
url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
response = urllib2.urlopen(url)
if response.getcode() == 200:
with open(destination, 'r') as dest:
dest_contents = dest.readlines()
lines = ''.join(dest_contents)
content = HTMLParser().unescape(response.read())
if content.replace(' ', '') in lines.replace(' ', ''):
print_out('IGNORED', destination)
return
generated = []
for line in dest_contents:
generated.append(line)
if line.lower().find(find_what.lower()) >= 0:
spaces = len(line) - len(line.lstrip())
for l in content.split('\n'):
if l:
generated.append('%s%s\n' % (' ' * (spaces + indent), l))
with open(destination, 'w') as dest:
for line in generated:
dest.write(line)
print_out('INSERT', destination)
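A hedged usage sketch for insert_to; the project URL, destination file, and anchor text below are placeholders, and print_out is assumed to be the project's own logging helper:

# Hypothetical call: fetch <project_url>magic/app.yaml and splice its unescaped
# contents into the local app.yaml just below the line containing 'libraries:'.
insert_to('https://example.com/project/', 'app.yaml', 'libraries:', indent=2)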
def process(keyword,page):
url='https://www.google.com/search?q=%s&start=%s&num=100'%(keyword,page*100)
urlinfos=[]
#urlinfo1={"url":"http://www.baidu.com/link?url=966OdUyxuwFJoAYx_XGYq7_FiVLcej4qEA3Q84e-lLAtLPRGGHA6tsNFNsTN9zka&wd=&eqid=a64931cc000026c3000000035994fd9e","title":"python Django?? ?????????????????..._???","info":'? W3School,???????????????? jQuery ??? jQuery ??jQuery ???? ?W3School,???????? jQuery ????????????? jQuery...'}
page = ct.crawlerTool.getPage(url)
#print page
#print url
segments = ct.crawlerTool.getXpath('//div[@class="g"]',page)
#print segments
for segment in segments:
#print segment
try:
            urlinfo = {}
            urlinfo['url'] = ct.crawlerTool.getXpath('//h3/a/@href', segment)[0]
            urlinfo['title'] = ct.crawlerTool.getXpath('//h3/a/text()', segment)[0]
            # info: the result snippet text shown under the title
            urlinfo['info'] = HTMLParser().unescape(
                ct.crawlerTool.extractorText(ct.crawlerTool.getXpath('//div[@class="s"]', segment)))
            # print urlinfo['url'], urlinfo['title'], urlinfo['info']
urlinfos.append(urlinfo)
except:
print('error')
traceback.print_exc()
return {"urlinfos":urlinfos}
def ParseGTestXML(xml_content):
"""Parse gtest XML result."""
results = []
html = HTMLParser.HTMLParser()
# TODO(jbudorick): Unclear how this handles crashes.
testsuites = xml.etree.ElementTree.fromstring(xml_content)
for testsuite in testsuites:
suite_name = testsuite.attrib['name']
for testcase in testsuite:
case_name = testcase.attrib['name']
result_type = base_test_result.ResultType.PASS
log = []
for failure in testcase:
result_type = base_test_result.ResultType.FAIL
log.append(html.unescape(failure.attrib['message']))
results.append(base_test_result.BaseTestResult(
'%s.%s' % (suite_name, TestNameWithoutDisabledPrefix(case_name)),
result_type,
int(float(testcase.attrib['time']) * 1000),
log=('\n'.join(log) if log else '')))
return results
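A minimal input sketch for ParseGTestXML; the XML below mimics gtest's report format, and base_test_result comes from the surrounding test framework:

# Hypothetical gtest XML report; times are in seconds, messages may contain entities.
sample_xml = """<testsuites>
  <testsuite name="FooTest">
    <testcase name="Passes" time="0.004" />
    <testcase name="Fails" time="0.010">
      <failure message="Expected &quot;a&quot; == &quot;b&quot;" />
    </testcase>
  </testsuite>
</testsuites>"""

for result in ParseGTestXML(sample_xml):
    print(result)    # BaseTestResult objects for FooTest.Passes and FooTest.Fails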
def unescape_html(html_):
"""
Replace HTML entities (e.g. `£`) in a string.
:param html_: The escaped HTML.
    :return: The input string with entities replaced.
"""
# http://stackoverflow.com/a/2360639
if sys.version_info.major == 2: # 2.7
# noinspection PyUnresolvedReferences,PyCompatibility
from HTMLParser import HTMLParser
return HTMLParser().unescape(html_)
if sys.version_info.minor == 3: # 3.3
# noinspection PyCompatibility
from html.parser import HTMLParser
# noinspection PyDeprecation
return HTMLParser().unescape(html_)
# 3.4+
# noinspection PyCompatibility
import html
return html.unescape(html_)
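For example (same result on 2.7, 3.3 and 3.4+):

print(unescape_html('&quot;Fish &amp; Chips&quot;'))
# "Fish & Chips"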
def feeds(page_url):
"""Search the given URL for possible feeds, returning a list of them."""
# If the URL is a feed, there's no need to scan it for links.
if is_feed(page_url):
return [page_url]
data = fetch_url(page_url)
parser = FeedFinder(page_url)
try:
parser.feed(data)
except HTMLParser.HTMLParseError:
pass
found = parser.urls()
# Return only feeds that feedparser can understand.
return [feed for feed in found if is_feed(feed)]
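FeedFinder is defined elsewhere in that module; a plausible sketch of it, as an HTMLParser subclass that collects <link rel="alternate"> feed URLs (an assumption, not the original implementation):

# Sketch only: one way FeedFinder could be built on the Python 2 HTMLParser module.
import HTMLParser
import urlparse

FEED_TYPES = ('application/rss+xml', 'application/atom+xml')

class FeedFinder(HTMLParser.HTMLParser):
    def __init__(self, base_url):
        HTMLParser.HTMLParser.__init__(self)
        self.base_url = base_url
        self._urls = []

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if tag == 'link' and attrs.get('type') in FEED_TYPES and attrs.get('href'):
            self._urls.append(urlparse.urljoin(self.base_url, attrs['href']))

    def urls(self):
        return self._urls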
def _provider_auth(self, url, qs, username, password, html):
url += '?sid=0'
# prepare auth
r = self.session.post(url + '&id=tve&option=credential', proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
# authenticate
post_data = {
'option': 'credential',
'urlRedirect': url,
'Ecom_User_ID': username,
'Ecom_Password': password,
}
r1 = self.session.post(url, data=post_data, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
r2 = self.session.get(url, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
try:
html_parser = HTMLParser.HTMLParser()
redirurl = re.findall(r'<form method=\"POST\" enctype=\"application/x-www-form-urlencoded\" action=\"(.*)\">', r2.text)[0]
argsre = dict([(match.group(1), html_parser.unescape(match.group(2))) for match in re.finditer(r'<input type=\"hidden\" name=\"(\w+)\" value=\"([^\"]+)\"/>', r2.text)])
return self.session.post(redirurl, data=argsre, proxies=self.proxy, headers={'Accept-Encoding': 'gzip'})
    except Exception:
raise Exception('Invalid user name or password.')
def get_url(domain,port,timeout):
url_list = []
    if port == 443:
surl = 'https://' + domain
else:
surl = 'http://' + domain
res = urllib2.urlopen(surl, timeout=timeout)
html = res.read()
root_url = res.geturl()
m = re.findall("<(?:img|link|script)[^>]*?(?:src|href)=('|\")(.*?)\\1", html, re.I)
if m:
for url in m:
ParseResult = urlparse.urlparse(url[1])
if ParseResult.netloc and ParseResult.scheme:
if domain == ParseResult.hostname:
url_list.append(HTMLParser.HTMLParser().unescape(url[1]))
elif not ParseResult.netloc and not ParseResult.scheme:
url_list.append(HTMLParser.HTMLParser().unescape(urlparse.urljoin(root_url, url[1])))
return list(set(url_list))
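Usage sketch for get_url (the host is a placeholder); only same-host or relative asset URLs are returned, unescaped and resolved against the final URL after redirects:

# Hypothetical call: collect asset links from the front page of example.com over HTTP.
for u in get_url('example.com', 80, timeout=10):
    print(u)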
def ParseGTestXML(xml_content):
"""Parse gtest XML result."""
results = []
html = HTMLParser.HTMLParser()
# TODO(jbudorick): Unclear how this handles crashes.
testsuites = xml.etree.ElementTree.fromstring(xml_content)
for testsuite in testsuites:
suite_name = testsuite.attrib['name']
for testcase in testsuite:
case_name = testcase.attrib['name']
result_type = base_test_result.ResultType.PASS
log = []
for failure in testcase:
result_type = base_test_result.ResultType.FAIL
log.append(html.unescape(failure.attrib['message']))
results.append(base_test_result.BaseTestResult(
'%s.%s' % (suite_name, case_name),
result_type,
int(float(testcase.attrib['time']) * 1000),
log=('\n'.join(log) if log else '')))
return results
def test_cdata_with_closing_tags(self):
# see issue #13358
# make sure that HTMLParser calls handle_data only once for each CDATA.
# The normal event collector normalizes the events in get_events,
# so we override it to return the original list of events.
class Collector(EventCollector):
def get_events(self):
return self.events
content = """<!-- not a comment --> ¬-an-entity-ref;
<a href="" /> </p><p> & <span></span></style>
'</script' + '>' </html> </head> </scripter>!"""
for element in [' script', 'script ', ' script ',
'\nscript', 'script\n', '\nscript\n']:
s = u'<script>{content}</{element}>'.format(element=element,
content=content)
self._run_check(s, [("starttag", "script", []),
("data", content),
("endtag", "script")],
collector=Collector)
def lrc2dict(lrc):
time_stamps = re.findall(r'\[[^\]]+\]', lrc)
html_parser = HTMLParser.HTMLParser()
if time_stamps:
lyric = lrc
for tplus in time_stamps:
lyric = lyric.replace(tplus, '').replace('\r', '').replace('\n', '').replace('????','').replace('???','').replace('?????','').replace('???','').replace('??','').replace('??','').replace('??','').replace('??','')
lyric = lyric.replace('???', '').replace('??', '').replace('????', '').replace('???', '').replace('??', '').replace('???', '')
# tplus: [02:31.79]
# t 02:31.79
# print lyric
print html_parser.unescape(lyric)
return html_parser.unescape(lyric)
else:
return ''
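A usage sketch for lrc2dict with an invented two-line LRC fragment; the time tags are stripped and HTML entities in the remaining text are unescaped:

# Hypothetical LRC input.
sample_lrc = u"[00:01.00]Rock &amp; Roll \n[00:05.50]all night long"
lyric = lrc2dict(sample_lrc)
# lyric == u'Rock & Roll all night long' (time tags removed, '&amp;' unescaped)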