def html(self):
    """
    Return the object stored in ``self._html``.
    """
    stored = self._html
    return stored
# Example source code using Python's html()
def export(self):
    """
    Write the rendered document and the files it references to disk.

    The output directory is ``<self._export_dir>/<self._id>`` and is created
    when missing. The page returned by ``self._process`` is stored there as
    UTF-8 encoded ``index.html``; every ``(url, file)`` pair returned next to
    it is handed to ``self._download_url`` with a destination inside that
    directory.
    """
    target_dir = os.path.join(self._export_dir, self._id)
    os.makedirs(target_dir, exist_ok=True)

    page, assets = self._process(root="..")

    index_path = os.path.join(target_dir, 'index.html')
    with open(index_path, 'wb+') as index_file:
        index_file.write(page.encode('utf-8'))

    for source_url, file_name in assets:
        destination = os.path.join(target_dir, file_name)
        self._download_url(source_url, destination)
def _process(self, root='..'):
    """
    Prepare the document tree and render it through the theme.

    Google redirect links are rewritten with ``process_link``, image sources
    are pointed at local ``.jpg`` files named after the MD5 of their URL, and
    the table of contents, heading anchors, image wrappers and video
    replacements are applied. The ``<body>`` is then turned into a ``<div>``
    and serialized into ``self._content``.

    :param root: value forwarded to ``process_link`` and to the template.
    :return: tuple ``(rendered page, files)`` where ``files`` is a list of
        ``(url, file name)`` pairs for the images found in the document.
    """
    self._clean_html()
    self._annotate()

    files = []
    for element, attr, url, _ in self._html.iterlinks():
        tag = element.tag
        if tag == 'a' and attr == 'href' and url.startswith('https://www.google.com/url'):
            element.set('href', process_link(url, root=root))
            continue
        if tag == 'img' and attr == 'src':
            filetitle = hashlib.md5(url.encode()).hexdigest() + '.jpg'
            # Go up to the top level so the path also works when the
            # document is used as an appliance
            element.set('src', '../' + self._id + '/' + filetitle)
            files.append((url, filetitle))

    self._toc = self._get_toc()
    self._add_anchors()
    self._wrap_images()
    self._replace_youtube_videos()

    # Wrap the original body; fall back to an empty one when there is none
    try:
        body = self._html.xpath('//body')[0]
    except IndexError:
        body = lxml.html.Element('body')
    body.tag = 'div'
    body.attrib.pop('style', None)
    self._content = lxml.etree.tostring(body, pretty_print=True, method="html")

    rendered = self._theme.render(
        self._template + '.html',
        document=self,
        root=root,
        config=self._config,
        appliances=self._appliances,
    )
    return rendered, files
def _wrap_images(self):
    """
    Wrap images in a link that opens in a new tab (target blank).

    Each ``<img>`` element is retagged as ``<a target="_blank">`` whose
    ``href`` is the former ``src``, and a fresh ``<img>`` with that source is
    appended inside it. The ``style`` attribute is dropped. Images without a
    ``src`` attribute have nothing to link to and stay plain ``<img>``
    elements.
    """
    # Snapshot the matches first: the loop retags elements and appends new
    # <img> children, which must not feed back into a live tree iterator.
    for element in list(self._html.iter('img')):
        element.attrib.pop('style', None)
        source = element.attrib.pop('src', None)
        if source is None:
            # Attribute values must be strings; assigning None would raise.
            continue
        element.attrib['href'] = source
        element.attrib['target'] = '_blank'
        element.tag = 'a'
        image = lxml.html.builder.IMG()
        image.attrib['src'] = source
        element.append(image)
def _add_anchors(self):
    """
    Turn the text of h1, h2 and h3 headings into links to themselves.

    Only headings that have no child element and carry an ``id`` attribute
    are changed: their text moves into a new ``<a href="#<id>">`` child.
    """
    for heading in self._html.iter('h1', 'h2', 'h3'):
        if len(heading) != 0:
            continue
        anchor_id = heading.attrib.get('id')
        if anchor_id is None:
            continue
        link = lxml.html.builder.A()
        link.attrib['href'] = "#" + anchor_id
        link.text = heading.text
        heading.text = None
        heading.append(link)
def main():
    """
    Run a test: export a tiny document into a temporary directory and print
    the generated index.html.
    """
    import tempfile

    markup = "<html><body style=\"test\"><h1>Hello</h1></body></html>"
    with tempfile.TemporaryDirectory() as workdir:
        document = DriveDocument("42", "test", markup, editable_by_anyone=True)
        # NOTE(review): the export() visible in this file takes no argument;
        # confirm this call matches the real signature.
        document.export(workdir)
        index_path = os.path.join(workdir, "42", "index.html")
        with open(index_path) as index_file:
            print(index_file.read())
def get_keylist(self, search_url, i):
    """
    Fetch ``search_url`` and pass every article link found in its news box
    to ``self.get_content`` together with ``i``.
    """
    # NOTE: verify=False turns off TLS certificate verification for this request.
    response = requests.get(search_url, headers=self.headers, verify=False)
    tree = etree.HTML(response.content)
    # Collect the href of each headline link
    links = tree.xpath('//div[@class="news-box"]/ul/li/div[@class="txt-box"]/h3/a/@href')
    for link in links:
        self.get_content(link, i)
def removeFile(self):
    """
    Delete the daily picture and HTML directories when they exist.
    """
    # (path checked for existence, path removed)
    targets = (
        ('/home/wwwroot/laravel/public/img/daily/', r'/home/wwwroot/laravel/public/img/daily'),  # pic
        ('/home/wwwroot/url/daily/', r'/home/wwwroot/url/daily'),  # html
    )
    for probe, directory in targets:
        if os.path.exists(probe):
            shutil.rmtree(directory)
# ???????????
def get_list(self, search_url):
    """
    Fetch ``search_url`` and pass every article link found in its news box
    to ``self.get_content``.
    """
    # NOTE: verify=False turns off TLS certificate verification for this request.
    response = requests.get(search_url, headers=self.headers, verify=False)
    tree = etree.HTML(response.content)
    # Collect the href of each headline link
    links = tree.xpath('//div[@class="news-box"]/ul/li/div[@class="txt-box"]/h3/a/@href')
    for link in links:
        self.get_content(link)
# ?????????????
def get_list(self, search_url):
    """
    Download the search page and hand each headline link in its news box to
    ``self.get_content``.
    """
    # NOTE: verify=False turns off TLS certificate verification for this request.
    page = requests.get(search_url, headers=self.headers, verify=False).content
    document = etree.HTML(page)
    # Collect the href of each headline link
    hrefs = document.xpath('//div[@class="news-box"]/ul/li/div[@class="txt-box"]/h3/a/@href')
    for href in hrefs:
        self.get_content(href)
# ?????????????
def reserve(self, match):
    """
    Select a match on the reservation page and report whether it is taken.

    Posts the match id to ``self.baseUrl + '&act=new'``, then scans the text
    and link values of the table rows that carry an ``id`` for the entry
    whose date, home team and guest team equal those of ``match``.

    :param match: dict with the keys ``match_id``, ``match_date``
        (formatted ``'%Y-%m-%d %H:%M'``), ``home`` and ``guest``. When the
        entry is found the key ``mafo_id`` is added in place.
    :return: ``(match, photographer)`` when somebody is already entered,
        ``(match, None)`` when the match can still be reserved, and ``None``
        when no matching entry is found.
    """
    payload = {'match_selected': match['match_id'],
               'match_verein_id': '',
               'as_values_match_verein_id': '',
               'check_match': match['match_id']}
    r = self.driver.request("POST", self.baseUrl + '&act=new', data=payload)
    doc = lxml.html.fromstring(r.content)
    path_match = "/html/body//table//tr[@id]/*//text() | " \
                 "/html/body//table//tr[@id]/*//@href"
    raw = doc.xpath(path_match)

    # 2017-06-05 -> 05.06.17
    parsed = datetime.datetime.strptime(match['match_date'], '%Y-%m-%d %H:%M')
    date = parsed.strftime('%d.%m.%y %H:%M')

    # ---- raw snippet -----
    # 0 06.06.17 18:30 Uhr
    # 1 Relegation
    # 2 TSV Landsberg
    # 3 - TSV Bogen
    # 4 index.php?page=fotograf_spiele&mafo_id=43704&act=del
    # 5 Bereits jemand eingetragen:
    # 6 http://www.fupa.net/fupaner/abc-def-3
    # 7 abc def
    # ...
    for i, d in enumerate(raw):
        if date not in d:
            continue
        if not (match['home'] in raw[i + 2] and match['guest'] in raw[i + 3]):
            continue

        url = raw[i + 4]
        # The mafo_id is the value of the second query parameter of the link
        match['mafo_id'] = url.split("?")[1].split("&")[1].split("=")[1]
        try:
            if 'Bereits jemand eingetragen' in raw[i + 5]:
                # already reserved
                return match, raw[i + 7]  # Photographer
        except IndexError:
            # The entry is the last one in the list: nothing follows it,
            # so nobody is entered yet.
            pass
        # match can be reserved
        # NOTE(review): the original indentation was lost; this return is
        # assumed to belong to the matched-entry branch — confirm.
        return match, None

    return None
def parse_html(html_file):
    """
    Parse HTML with lxml's HTML parser after converting it to Unicode
    using Beautiful Soup's UnicodeDammit class.

    :param html_file: markup handed to ``UnicodeDammit`` as-is.
    :return: the result of ``lxml.html.fromstring`` on the decoded markup.
    :raises ValueError: if the character encoding cannot be detected or the
        decoded markup is empty.

    Errors raised by ``lxml.html.fromstring`` itself propagate unchanged.
    """
    converted = UnicodeDammit(html_file, smart_quotes_to="html",
                              is_html=True)
    markup = converted.unicode_markup
    # UnicodeDammit leaves unicode_markup as None when none of the candidate
    # encodings could decode the input; an empty string means empty input.
    if markup is None:
        raise ValueError("could not detect character encoding")
    if not markup:
        raise ValueError("no HTML provided")
    return lxml.html.fromstring(markup)
def test_parse_fragments_fromstring(self):
    """
    fragments_fromstring() with a custom parser keeps the frameset and its
    frame child.
    """
    markup = """<frameset>
<frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
</frameset>"""
    html_parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)

    fragments = lxml.html.fragments_fromstring(markup, parser=html_parser)

    self.assertEqual(len(fragments), 1)
    frameset = fragments[0]
    self.assertEqual(frameset.tag, "frameset")
    first_child = frameset[0]
    self.assertEqual(first_child.tag, 'frame')
def test_parse_fromstring(self):
    """
    fromstring() with a custom parser keeps the html > frameset > frame
    nesting.
    """
    markup = """<html><frameset>
<frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
</frameset></html>"""
    html_parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)

    root = lxml.html.fromstring(markup, parser=html_parser)

    self.assertEqual(root.tag, 'html')
    self.assertEqual(len(root), 1)
    frameset = root[0]
    self.assertEqual(len(frameset), 1)
    frame = frameset[0]
    self.assertEqual(frame.tag, 'frame')
def test_allow_tags(self):
    """
    A Cleaner restricted to table, tr and td leaves the expected number of
    elements in the cleaned tree.
    """
    markup = """
<html>
<head>
</head>
<body>
<p>some text</p>
<table>
<tr>
<td>hello</td><td>world</td>
</tr>
<tr>
<td>hello</td><td>world</td>
</tr>
</table>
<img>
</body>
</html>
"""
    tree = lxml.html.document_fromstring(markup)
    tag_cleaner = Cleaner(
        remove_unknown_tags=False,
        allow_tags=['table', 'tr', 'td'])

    cleaned = tag_cleaner.clean_html(tree)

    # presumably 12 source elements, 5 of them outside allow_tags, plus one
    # remaining root element — TODO confirm
    expected_count = 12 - 5 + 1
    element_count = sum(1 for _ in cleaned.iter())
    self.assertEqual(expected_count, element_count)
def test_safe_attrs_included(self):
    """
    Adding 'style' to safe_attrs makes the Cleaner keep the style attribute.
    """
    markup = """<p><span style="color: #00ffff;">Cyan</span></p>"""
    allowed = set(lxml.html.defs.safe_attrs) | {'style'}
    attr_cleaner = Cleaner(safe_attrs_only=True, safe_attrs=allowed)

    cleaned = attr_cleaner.clean_html(markup)

    self.assertEqual(markup, cleaned)
def test_safe_attrs_excluded(self):
    """
    With an empty safe_attrs set the Cleaner strips the style attribute.
    """
    markup = """<p><span style="color: #00ffff;">Cyan</span></p>"""
    stripped = """<p><span>Cyan</span></p>"""
    attr_cleaner = Cleaner(safe_attrs_only=True, safe_attrs=set())

    cleaned = attr_cleaner.clean_html(markup)

    self.assertEqual(stripped, cleaned)
def submit(self, probNum, path=".", language=None):
    """
    Submit the solution file for problem ``probNum`` to the UVa judge.

    :param probNum: problem number; used to locate the file and sent as
        ``localid``.
    :param path: location passed to ``UvaSession.find_file``.
    :param language: key of ``UvaSession.language_handler``; when ``None``
        the language is derived from the file name with
        ``UvaSession.find_language``.
    :return: whatever ``self.check_result`` returns for the new submission,
        or ``None`` when no language number could be determined.
    """
    file_path, filename = UvaSession.find_file(probNum, path)
    # The context manager closes the handle on every path, including the
    # early return below.
    with open(file_path) as source_file:
        if language is None:
            language_number = UvaSession.find_language(filename)
        else:
            language_number = UvaSession.language_handler[language]
        if language_number is None:
            return None
        code = source_file.read()

    payload = {
        "localid": probNum,
        "code": code,
        "language": language_number,
        "codeupl": "",
        "problemid": "",
        "category": "",
        "submit": "Submit",
    }
    updated_headers = {
        "Referer": UvaSession.UVA_HOST + "index.php?option=com_onlinejudge&Itemid=25",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Host": "uva.onlinejudge.org",
        "Origin": UvaSession.UVA_HOST,
    }
    resp = self.uva_session.post(UvaSession.SUBMIT_PATH, data=payload, headers=updated_headers)
    # The submission id is taken as everything after "ID" plus one separator
    # character in the URL the POST ended up on.
    submission_id = resp.url[resp.url.find('ID') + 3:]
    return self.check_result(submission_id, probNum)
def login(self, username="", password=""):
    """
    Log in to CodeChef and report whether it worked.

    The hidden form fields of the start page are posted back together with
    the credentials. While the response lands on the session-limit page, its
    form is resubmitted. The login counts as successful when ``username``
    is found as text in the final page.

    :return: ``True`` when logged in, ``False`` otherwise (also stored in
        ``self.logged_in``).
    """
    # logging in without credentials
    self.username = username
    base_url = CodechefSession.codechef_url
    limit_url = base_url + '/session/limit'

    start_page = lxml.html.fromstring(self.codechef_session.get(base_url).text)
    payload = {}
    for field in start_page.xpath(r'//form//input[@type="hidden"]'):
        key = field.attrib["name"]
        payload[key] = field.attrib["value"]
    payload.update({'name': username, 'pass': password, 'op': 'Login'})
    response = self.codechef_session.post(base_url, data=payload)

    # removing extra sessions using simple scraping and form handling
    while response.url == limit_url:
        limit_page = lxml.html.fromstring(response.text)
        payload = {}
        # Walk the inputs back to front so that, for repeated names, the
        # first input in the page wins.
        for field in reversed(limit_page.xpath(r'//form//input')):
            key = field.attrib["name"]
            payload[key] = field.attrib["value"]
        response = self.codechef_session.post(limit_url, data=payload)

    soup = bs(response.content, 'lxml')
    self.logged_in = bool(soup.find(text=username))
    if self.logged_in:
        self.username = username
    return self.logged_in
def submit(self, question_code, path=".", language=None):
    """
    Upload a solution file for ``question_code`` to CodeChef.

    :param question_code: problem code; also used to find the running
        contest the problem belongs to, if any.
    :param path: path of the source file to upload (used as given).
    :param language: key of ``CodechefSession.language_handler``.
    :return: tuple ``(submission id, result of self.check_result)``.
    """
    # If several running contests list the problem, the last one wins.
    contest = ""
    for running in self.info_present_contests():
        if question_code in CodechefSession.ques_in_contest(running['contest_name']):
            contest = '/' + running['contest_name']

    file_path = path
    lang = CodechefSession.language_handler[language]

    response = self.codechef_session.get(
        self.codechef_url + contest + '/submit/' + question_code
    )
    html_page = lxml.html.fromstring(response.text)
    hidden_inputs = html_page.xpath(r'//form//input[@type="hidden"]')
    payload = {i.attrib['name']: i.attrib['value'] for i in hidden_inputs}
    payload['language'] = lang
    payload['problem_code'] = question_code
    payload['op'] = 'Submit'

    # Keep the upload inside a context manager so the file handle is closed
    # once the request has been sent, even if the POST raises.
    with open(file_path) as source_file:
        response = self.codechef_session.post(
            CodechefSession.codechef_url + contest + '/submit/' + question_code,
            data=payload,
            files={"files[sourcefile]": source_file},
        )

    # The submission id is the last path segment of the final URL.
    sub_id = response.url.split('/')[-1]
    return sub_id, self.check_result(sub_id, question_code)