def lxml_test():
    url = "http://www.caixunzz.com"
    req = urllib2.Request(url=url)
    resp = urllib2.urlopen(req)
    #print resp.read()
    '''
    parse_body = html.fromstring(resp.read())
    href = parse_body.xpath('//a[@class="label"]/@href')
    print href
    #not working from above: resp.read() can only be consumed once,
    #so a second read() further down returns an empty string
    '''
    tree = etree.HTML(resp.read())
    href = tree.xpath('//a[@class="label"]/@href')
    #print href.tag
    for i in href:
        #print html.tostring(i)
        #print type(i)
        print i
    print type(href)
    #not working yet
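
The snippet above is Python 2 (urllib2, print statements). For comparison, a minimal Python 3 sketch of the working path, under the same assumption about the page's markup:

from urllib.request import urlopen
from lxml import etree

resp = urlopen("http://www.caixunzz.com")
tree = etree.HTML(resp.read())  # read the response exactly once
for href in tree.xpath('//a[@class="label"]/@href'):
    print(href)  # an @href XPath yields plain strings, not elements
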
Python tostring() example source code
def process_item(self, item, spider):
    if item is not None:
        doc = item['content']
        if not isinstance(doc, (str, bytes)):
            if isinstance(doc, HtmlElement):
                item['content'] = tostring(doc,
                                           encoding='UTF-8',
                                           pretty_print=True,
                                           method='html')
                item['encoding'] = 'UTF-8'
            else:
                raise Exception((
                    'Error in store pipeline unsupported doc type[{}]'
                ).format(doc.__class__.__name__))
        item_ = dict(item)
        item_['lang'] = get_article_lang(item)
        item_['spider'] = spider._id
        item_['source'] = spider.title
        item_['category'] = get_category(item_)
        if not is_exists_article(item_):
            save_article(item_)
    return item
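
For context: with a byte encoding such as 'UTF-8', lxml's tostring returns bytes, which is presumably why the pipeline records item['encoding'] alongside the serialized content. A minimal sketch:

from lxml import html
from lxml.etree import tostring

doc = html.fromstring('<div><p>hi</p></div>')
out = tostring(doc, encoding='UTF-8', pretty_print=True, method='html')
print(type(out))  # <class 'bytes'>; use encoding='unicode' to get str instead
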
def setUp(self):
    super(TestViewSaving, self).setUp()
    self.arch = h.DIV(
        h.DIV(
            h.H3("Column 1"),
            h.UL(
                h.LI("Item 1"),
                h.LI("Item 2"),
                h.LI("Item 3"))),
        h.DIV(
            h.H3("Column 2"),
            h.UL(
                h.LI("Item 1"),
                h.LI(h.SPAN("My Company", attrs(model='res.company', id=1, field='name', type='char'))),
                h.LI(h.SPAN("+00 00 000 00 0 000", attrs(model='res.company', id=1, field='phone', type='char')))
            ))
    )
    self.view_id = self.registry('ir.ui.view').create(self.cr, self.uid, {
        'name': "Test View",
        'type': 'qweb',
        'arch': ET.tostring(self.arch, encoding='utf-8').decode('utf-8')
    })
def ingest(self, file_path):
    """Ingestor implementation."""
    file_size = self.result.size or os.path.getsize(file_path)
    if file_size > self.MAX_SIZE:
        raise ProcessingException("XML file is too large.")
    try:
        doc = etree.parse(file_path)
    except (ParserError, ParseError):
        raise ProcessingException("XML could not be parsed.")
    text = self.extract_html_text(doc.getroot())
    transform = etree.XSLT(self.XSLT)
    html_doc = transform(doc)
    html_body = html.tostring(html_doc,
                              encoding='unicode',
                              pretty_print=True)
    self.result.flag(self.result.FLAG_HTML)
    self.result.emit_html_body(html_body, text)
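
A self-contained sketch of the etree.XSLT pattern used above; the stylesheet and element names here are made up for illustration, not the ingestor's real XSLT:

from lxml import etree, html

XSLT_SRC = b'''<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
  <xsl:template match="/">
    <html><body>
      <xsl:for-each select="//item"><p><xsl:value-of select="."/></p></xsl:for-each>
    </body></html>
  </xsl:template>
</xsl:stylesheet>'''

doc = etree.fromstring(b'<root><item>a</item><item>b</item></root>')
transform = etree.XSLT(etree.fromstring(XSLT_SRC))
result = transform(doc)  # an XSLT result tree
# encoding='unicode' makes tostring return str rather than bytes
print(html.tostring(result, encoding='unicode', pretty_print=True))
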
def WriteHTML(self, testcaseinfo):
    self.CreateHtmlFile()
    f = open(self.reportfile, "r")
    htmlcontent = f.read()
    f.close()
    #tree = mytree.fromstring(str(htmlcontent))
    # str.encode() returns a new object; the original discarded the result,
    # so assign it (lxml's fromstring accepts bytes)
    htmlcontent = htmlcontent.encode('utf-8')
    tree = html.fromstring(htmlcontent)
    tableElem = tree.find(".//table")
    if testcaseinfo.result == "Failed":
        mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime, testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
    else:
        mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime, testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
    tableElem.append(mytree.HTML(str(mytablerow)))
    f = open(self.reportfile, "w")
    #html.tostring
    newContent = repr(html.tostring(tree, method="html", with_tail=False))
    newContent = newContent.replace(r"\n", "").replace(r"\t", "").replace("b'", "")
    newContent = newContent[:len(newContent) - 1]  # drop the trailing quote left by repr()
    f.write(newContent)
    f.close()
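
The repr round-trip above works, but it is brittle (it relies on slicing off a trailing quote). A sketch of a more direct serialization of the same tree, assuming the report file is UTF-8:

with open(self.reportfile, "w", encoding="utf-8") as f:
    # encoding='unicode' returns str, so no repr()/replace()/slice cleanup is needed
    f.write(html.tostring(tree, method="html", encoding="unicode", with_tail=False))
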
def __init__(self, file_name, user_id):
    with open(file_name, 'r') as self.opened_file:
        # Instapaper doesn't close <li> tags, which caused infinite recursion
        # when feeding the file to BeautifulSoup directly. The lxml round-trip
        # below closes the <li> tags first.
        self.html = html.document_fromstring(self.opened_file.read())
        self.html = html.tostring(self.html)
    self.soup = BeautifulSoup4(self.html)
    self.user = user_id
    self.urls = dict()
    self.check_duplicates = dict()
    self.check_duplicates_query = Bookmark.query.filter(Bookmark.user == self.user,
                                                        Bookmark.deleted == False).all()
    for bmark in self.check_duplicates_query:
        self.check_duplicates[bmark.main_url] = bmark
    self.tags_dict = dict()
    self.tags_set = set()
    self.valid_url = re.compile(
        r'^(?:[a-z0-9\.\-]*)://'
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}(?<!-)\.?)|'
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'
        r'(?::\d+)?'
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
def _sanitize_html_frags(html_value, valid_tags, valid_attributes):
    fragments = html.fragments_fromstring(html_value)
    for f in fragments:
        if isinstance(f, html.HtmlElement):
            _sanitize_html_rec(f, valid_tags, valid_attributes)
            if f.tag in valid_tags:
                _clean_attributes(f, valid_attributes)
                yield html.tostring(f, encoding="unicode")
            else:
                if f.text:
                    yield f.text
                for sub in f:
                    yield html.tostring(sub, encoding="unicode")
                if f.tail:
                    yield f.tail
                if f.tag in ('p', 'br'):
                    yield '\n'
        else:
            yield f
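
A hedged usage sketch of the generator above, joining its output back into a sanitized string; the whitelists are illustrative, and _sanitize_html_rec / _clean_attributes are assumed to come from the same module:

VALID_TAGS = {'p', 'a', 'em', 'strong'}   # illustrative whitelist
VALID_ATTRS = {'href'}

dirty = '<p onclick="evil()">hi <em>there</em></p>'
clean = ''.join(_sanitize_html_frags(dirty, VALID_TAGS, VALID_ATTRS))
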
def totext(self, elem):
    # NB: encoding=unicode is Python 2; on Python 3 pass encoding=str (or 'unicode')
    return self.tostring(elem, encoding=unicode, method='text').strip()
def parse_results_page(self, root):  # {{{
    from lxml.html import tostring
    matches = []

    def title_ok(title):
        title = title.lower()
        bad = ['bulk pack', '[audiobook]', '[audio cd]', '(a book companion)', '( slipcase with door )', ': free sampler']
        for x in bad:
            if x in title:
                return False
        # if title and title[0] in '[{' and re.search(r'\(\s*author\s*\)', title) is not None:
        #     # Bad entries in the catalog
        #     return False
        return True

    for a in root.xpath(r'//li[starts-with(@class, "line")]//a[@href and contains(@name, "itemlist-picture")]'):
        # title = a.get('title')
        # if title_ok(title):
        url = a.get('href')
        if url.startswith('/'):
            url = 'http://product.dangdang.com/%s' % (url)
        matches.append(url)
    # Keep only the top 5 matches: results are sorted by relevance,
    # so lower entries are unlikely to be relevant
    return matches[:5]
    # }}}
def url_trim(html):
    """Trims anchor texts that are longer than 70 chars."""
    fragment = fromstring(html)
    for el, attrib_, link_, pos_ in fragment.iterlinks():
        new_link_text = trim_url(el.text_content())
        el.text = new_link_text
    return mark_safe(tostring(fragment, encoding=unicode))
def lxml_case3():
    text = '''
    <div>
        <ul>
            <li class="item-0"><a href="link1.html">first item</a></li>
            <li class="item-1"><a href="link2.html">second item</a></li>
            <li class="item-inactive"><a href="link3.html">third item<span>Hello world</span></a></li>
            <li class="item-1"><a href="link4.html">fourth item</a></li>
            <li class="item-0"><a href="link5.html">fifth item</a>
            <li class="de-item-0"><a href="link5.html">fifth item</a>
        </ul>
    </div>
    '''
    tree = etree.HTML(text)
    html_s = etree.tostring(tree)
    #print html_s
    #print tree.xpath('//li//span/text()')[0]
    '''
    reg_case = tree.xpath('//*[starts-with(@class,"item")]')
    for i in reg_case:
        print i.xpath('.//a/@href')
    '''
    # re:match needs the EXSLT regular-expressions namespace declared,
    # otherwise lxml raises XPathEvalError
    result = tree.xpath(r'//*[re:match(@class, "item-0")]',
                        namespaces={'re': 'http://exslt.org/regular-expressions'})
    print result
    for i in result[0]:
        print i.xpath('.//a/@href')
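
For reference, a self-contained sketch of EXSLT regular expressions in lxml; this is standard lxml behaviour, independent of the snippet above:

from lxml import etree

NS = {'re': 'http://exslt.org/regular-expressions'}
root = etree.fromstring('<r><a class="item-0"/><a class="item-10"/><a class="other"/></r>')
# re:test() returns a boolean: true when the attribute matches the pattern
hits = root.xpath(r'//a[re:test(@class, "^item-\d+$")]', namespaces=NS)
print([a.get('class') for a in hits])  # ['item-0', 'item-10']
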
def test_body(self):
    html = '''<body><p>test</p></body>'''
    res = b'''<html><body><p>test</p></body></html>'''
    tree = self.soupparser.fromstring(html)
    self.assertEqual(tostring(tree), res)
def test_head_body(self):
    # <html> tag missing; the parser should fix that
    html = '<head><title>test</title></head><body><p>test</p></body>'
    res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
    tree = self.soupparser.fromstring(html)
    self.assertEqual(tostring(tree), res)
def test_wrap_html(self):
    # <head> outside <html>; the parser should fix that
    html = '<head><title>title</test></head><html><body/></html>'
    res = b'<html><head><title>title</title></head><body></body></html>'
    tree = self.soupparser.fromstring(html)
    self.assertEqual(tostring(tree), res)
def test_comment_pi(self):
    html = '''<!-- comment -->
    <?test asdf?>
    <head><title>test</title></head><body><p>test</p></body>
    <!-- another comment -->'''
    res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
    tree = self.soupparser.fromstring(html).getroottree()
    self.assertEqual(tostring(tree, method='html'), res)
def test_doctype1(self):
    # Test document type declarations, comments, and PIs outside the root
    html = \
        '''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''
    res = \
        b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
    tree = self.soupparser.fromstring(html).getroottree()
    self.assertEqual(tree.docinfo.public_id, "-//W3C//DTD HTML 4.01//EN")
    self.assertEqual(tostring(tree), res)
def test_doctype_html5(self):
    # HTML5 doctype declaration
    html = b'<!DOCTYPE html>\n<html lang="en"></html>'
    tree = self.soupparser.fromstring(html).getroottree()
    self.assertTrue(tree.docinfo.public_id is None)
    self.assertEqual(tostring(tree), html)
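
The tests above exercise lxml.html.soupparser, which parses through BeautifulSoup and returns an lxml tree. A minimal standalone round-trip, assuming BeautifulSoup is installed:

from lxml.html import soupparser, tostring

tree = soupparser.fromstring('<body><p>test</p></body>')
print(tostring(tree))  # b'<html><body><p>test</p></body></html>' -- the missing wrapper is repaired
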
def get_language(self, s_intervention, p, i_lang, new_paragraphs):
    language = p.xpath(
        './/span[@class="italic"][text()[re:test(.,"^[\xad\s\.—–\-?,\(]*({})[\xad\s\.—–\-?,\)]*")]]'.format(
            '|'.join(self.langs)),
        namespaces=self.ns)
    if len(language) > 0 and not self.explanations_of_vote.match(language[0].text):
        lang = re.match(
            r'.*({}).*'.format('|'.join(self.langs)),
            language[0].text)
        output = lang.group(1)
        for l in language:
            l.drop_tree()
    else:
        p = html.tostring(p, with_tail=True, encoding='utf-8').decode('utf-8')
        lang_in_text = re.search(
            r'\(({})\)'.format('|'.join(self.langs)),
            p)
        if lang_in_text is not None:
            output = lang_in_text.group(1)
            p = re.sub(r'\(({})\) *'.format('|'.join(self.langs)), r'', p)
        else:
            if len(new_paragraphs) == 0:
                if 'role' in s_intervention.keys():
                    president_pattern = '|'.join(self.loc['president'])
                    if re.match(r'{}\Z'.format(president_pattern), s_intervention['role']):
                        output = 'unknown'
                    else:
                        output = self.language.upper() if i_lang is None else i_lang
                else:
                    output = self.language.upper() if i_lang is None else i_lang
            else:
                output = new_paragraphs[-1]['language']
        p = html.fromstring(p)
    return output, p
def serialize(self, infile, root):
    ofile_name = os.path.splitext(os.path.basename(infile))[0]
    ofile_path = os.path.join(self.outdir, ofile_name + '.xml')
    xml = etree.tostring(
        root,
        encoding='utf-8',
        xml_declaration=True,
        pretty_print=True).decode('utf-8')
    with open(ofile_path, mode='w', encoding='utf-8') as ofile:
        ofile.write(xml)
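
Note that etree.tostring returns bytes for any byte encoding such as 'utf-8' (hence the explicit .decode before writing to a text-mode file), while encoding='unicode' returns str but cannot carry an XML declaration:

from lxml import etree

root = etree.fromstring('<root><a/></root>')
as_bytes = etree.tostring(root, encoding='utf-8', xml_declaration=True)  # bytes
as_text = etree.tostring(root, encoding='unicode')                       # str, no declaration allowed
print(type(as_bytes), type(as_text))
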
def get_name(self, tree):
    name = tree.xpath('//li[@class="mep_name"]')[0]
    name = self.rm_a.clean_html(name)
    name = html.tostring(name).decode('utf-8')
    name = re.sub(r'[\t\n]', r'', name)
    name = name.split('<br>')
    name = [html.fromstring(x).text_content() for x in name]
    name = ' '.join(name)
    return name
def convert_html_to_telegraph_format(html_string, clean_html=True, output_format="json_string"):
    if clean_html:
        html_string = clean_article_html(html_string)
        body = preprocess_fragments(
            _fragments_from_string(html_string)
        )
        if body is not None:
            desc = [x for x in body.iterdescendants()]
            for tag in desc:
                preprocess_media_tags(tag)
            move_to_top(body)
            post_process(body)
    else:
        fragments = _fragments_from_string(html_string)
        body = fragments[0].getparent() if len(fragments) else None
    content = []
    if body is not None:
        content = [_recursive_convert(x) for x in body.iterchildren()]
    if output_format == 'json_string':
        return json.dumps(content, ensure_ascii=False)
    elif output_format == 'python_list':
        return content
    elif output_format == 'html_string':
        return html.tostring(body, encoding='unicode')
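
A hedged usage sketch, assuming the helper functions above live in the same module (the code resembles an HTML-to-Telegraph converter, where the output is a list of Telegraph node dicts):

page_json = convert_html_to_telegraph_format('<p>Hello <b>world</b></p>',
                                             output_format='json_string')
print(page_json)  # a JSON array of node dicts, e.g. [{"tag": "p", "children": [...]}]
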
def from_html(self, cr, uid, model, field, element, context=None):
    content = []
    if element.text:
        content.append(element.text)
    content.extend(html.tostring(child)
                   for child in element.iterchildren(tag=etree.Element))
    return '\n'.join(content)
def test_save(self):
    Company = self.registry('res.company')
    View = self.registry('ir.ui.view')
    replacement = ET.tostring(h.DIV(
        h.H3("Column 2"),
        h.UL(
            h.LI("wob wob wob"),
            h.LI(h.SPAN("Acme Corporation", attrs(model='res.company', id=1, field='name', expression="bob", type='char'))),
            h.LI(h.SPAN("+12 3456789", attrs(model='res.company', id=1, field='phone', expression="edmund", type='char'))),
        )
    ), encoding='utf-8')
    View.save(self.cr, self.uid, res_id=self.view_id, value=replacement,
              xpath='/div/div[2]')
    company = Company.browse(self.cr, self.uid, 1)
    self.assertEqual(company.name, "Acme Corporation")
    self.assertEqual(company.phone, "+12 3456789")
    self.eq(
        ET.fromstring(View.browse(self.cr, self.uid, self.view_id).arch.encode('utf-8')),
        h.DIV(
            h.DIV(
                h.H3("Column 1"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI("Item 2"),
                    h.LI("Item 3"))),
            h.DIV(
                h.H3("Column 2"),
                h.UL(
                    h.LI("wob wob wob"),
                    h.LI(h.SPAN({'t-field': "bob"})),
                    h.LI(h.SPAN({'t-field': "edmund"}))
                ))
        )
    )
def test_save_only_embedded(self):
    Company = self.registry('res.company')
    company_id = 1
    Company.write(self.cr, self.uid, company_id, {'name': "Foo Corporation"})
    node = html.tostring(h.SPAN(
        "Acme Corporation",
        attrs(model='res.company', id=company_id, field="name", expression='bob', type='char')))
    self.registry('ir.ui.view').save(self.cr, self.uid, res_id=company_id, value=node)
    company = Company.browse(self.cr, self.uid, company_id)
    self.assertEqual(company.name, "Acme Corporation")
def test_field_tail(self):
    View = self.registry('ir.ui.view')
    replacement = ET.tostring(
        h.LI(h.SPAN("+12 3456789", attrs(
                model='res.company', id=1, type='char',
                field='phone', expression="edmund")),
             "whop whop"
        ), encoding="utf-8")
    View.save(self.cr, self.uid, res_id=self.view_id, value=replacement,
              xpath='/div/div[2]/ul/li[3]')
    self.eq(
        ET.fromstring(View.browse(self.cr, self.uid, self.view_id).arch.encode('utf-8')),
        h.DIV(
            h.DIV(
                h.H3("Column 1"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI("Item 2"),
                    h.LI("Item 3"))),
            h.DIV(
                h.H3("Column 2"),
                h.UL(
                    h.LI("Item 1"),
                    h.LI(h.SPAN("My Company", attrs(model='res.company', id=1, field='name', type='char'))),
                    h.LI(h.SPAN({'t-field': "edmund"}), "whop whop"),
                ))
        )
    )
def modify_html(content, prop='_content'):
    html_string = getattr(content, prop)
    html_tree = html.fromstring(html_string)
    yield html_tree
    # after the caller has mutated the tree, serialize it back
    html_string = html.tostring(html_tree, encoding='unicode')
    # restore {...} and |...| placeholders that ended up percent-encoded in URLs
    html_string = re.sub(r'%7B(\w+)%7D', r'{\1}', html_string)
    html_string = re.sub(r'%7C(\w+)%7C', r'|\1|', html_string)
    setattr(content, prop, html_string)
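
Because modify_html yields once and then serializes the mutated tree, it reads as a context manager body; a usage sketch assuming the excerpt is wrapped with contextlib.contextmanager at its definition site (the decorator is not shown above), with a hypothetical `article` object carrying a _content attribute:

from contextlib import contextmanager

modify = contextmanager(modify_html)
with modify(article) as tree:
    for a in tree.findall('.//a'):
        a.set('rel', 'noopener')  # mutate the tree in place
# on exit, the tree is serialized back into article._content
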
def get_content(self, page, meta):
    if not page.is_html:
        return page.content
    check_path = self.config.data.get('check_path')
    if check_path is not None:
        if page.doc.find(check_path) is None:
            log.info("Failed XML path check: %r", page.url)
            return None
    for meta_el in ['title', 'author', 'date']:
        path = self.config.data.get('%s_path' % meta_el)
        if path is not None and page.doc.findtext(path):
            meta[meta_el] = page.doc.findtext(path)
    if 'date' in meta:
        try:
            date = meta.pop('date')
            date = parse(date)
            if 'dates' not in meta:
                meta['dates'] = []
            meta['dates'].append(date.isoformat())
        except Exception as ex:
            log.exception(ex)
    body = page.doc
    if self.config.data.get('body_path') is not None:
        body = page.doc.find(self.config.data.get('body_path'))
    for path in self.config.data.get('remove_paths', []):
        for el in body.findall(path):
            el.drop_tree()
    return html.tostring(body)
def parse_movie_details(self, response):
    html_root = html.fromstring(response.content,
                                base_url=response.base_url)
    movie_info = dict()
    movie_info['??'] = self.xpath_first(html_root,
                                        '//div[@id="content"]'
                                        '/h1/span[1]/text()').strip()
    try:
        # to pure text
        soup = BeautifulSoup(html.tostring(
            self.xpath_first(html_root,
                             '//div[@id="info"]')), 'html')
    except TypeError:
        return None
    else:
        for line in soup.get_text().splitlines():
            try:
                left, *right = line.split(':')
            except AttributeError:
                pass
            else:
                key = left.strip()
                value = ''.join(x.strip() for x in right)
                if key and value:
                    movie_info[key] = value
        yield movie_info
def test_convert_spans(self):
    expected = '''
    <p>
      <em><strong>
        foobar
        <em>
          lala
          <strong>
            yum
          </strong>
        </em>
        <span>
          hey hey
        </span>
        <strong>
          uh oh
        </strong>
        <span>
          yes
        </span>
      </strong></em>
    </p>
    '''
    # `html` here is a module-level input fixture defined elsewhere in the test file
    h = fromstring(html)
    for span in h.findall('.//span'):
        html2md.convert_span(span)
    result = tostring(h).decode('utf-8')
    # whitespace is stripped before comparison, so the formatting above is free-form
    results = [x.replace('\n', '').replace(' ', '') for x in [result, expected]]
    print('=========')
    print(results[0])
    print('=========')
    print(results[1])
    self.assertEqual(results[0], results[1])