def html_to_markdown(html):
"""convert html to markdown.
this will try and convert span styling
to the proper tags as well.
e.g. `<span style='font-weight:bold;'>foo</span>`
will become `<strong>foo</strong>`.
"""
h = fromstring(html)
clean_highlighted_code(h)
for span in h.findall('.//span') + h.findall('.//font'):
convert_span(span)
html = tostring(h).decode('utf-8')
    # html2text has no rule for <mark>, so map it to == highlight syntax
    # up front. Not ideal, but it works in a pinch.
    html = html.replace('<mark>', '==')
    html = html.replace('</mark>', '==')
md = to_md(html)
    # html2text sometimes returns a ton of extra whitespace:
    # clean up lines containing only whitespace or stray */_ markers,
    # then condense runs of 3 or more line breaks.
md = re.sub(r'\n([\s\*_]+)\n', '\n\n', md)
md = re.sub(r'\n{3,}', '\n\n', md)
return md
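
# A minimal usage sketch for html_to_markdown, assuming the helper names used
# above (fromstring, clean_highlighted_code, convert_span, to_md) resolve in
# this module:
#
#   html_to_markdown("<p><span style='font-weight:bold;'>foo</span> "
#                    "and <mark>bar</mark></p>")
#   -> '**foo** and ==bar=='  (modulo html2text's surrounding whitespace)
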
def rewrite_links(raw_html, rewrite_func):
"""
Take an HTML input string, rewrite links according
to the `rewrite_func`, return the rewritten HTML string.
"""
html = fromstring(raw_html)
html.rewrite_links(rewrite_func)
return tostring(html)
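
# Hedged usage sketch: lxml's HtmlElement.rewrite_links() invokes the supplied
# callable for every link-like attribute and substitutes its return value.
# The base URL below is illustrative only; note that tostring() returns bytes.
def _absolutize(link):
    return 'https://example.invalid/' + link.lstrip('/')

# rewrite_links('<a href="/docs">docs</a>', _absolutize)
# -> b'<a href="https://example.invalid/docs">docs</a>'
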
def view_source(self):
for line in html.tostring(self.lxml_html, pretty_print=True, encoding='unicode').split('\n'):
print(line)
def get_html_for(self, locator):
"""Returns the HTML of the element (including its own tags) targeted by the given `locator`
:param locator: An instance of :class:`XPath` or a string containing an XPath expression.
"""
xpath = six.text_type(locator)
element = self.xpath(xpath)[0]
return html.tostring(element, encoding='unicode')
def get_inner_html_for(self, locator):
"""Returns the HTML of the children of the element targeted by the given `locator` (excluding the
element's own tags).
:param locator: An instance of :class:`XPath` or a string containing an XPath expression.
"""
xpath = six.text_type(locator)
element = self.xpath(xpath)[0]
        return ''.join(html.tostring(child, encoding='unicode') for child in element)
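
# A standalone sketch of the outer-vs-inner distinction, using lxml directly
# rather than the browser wrapper above:
#
#   from lxml import html
#   el = html.fromstring('<div id="x"><b>hi</b> there</div>')
#   html.tostring(el, encoding='unicode')
#   -> '<div id="x"><b>hi</b> there</div>'            (get_html_for)
#   ''.join(html.tostring(c, encoding='unicode') for c in el)
#   -> '<b>hi</b> there'                              (get_inner_html_for)
#
# Note that neither variant emits el.text, so any leading text directly inside
# the element is dropped by get_inner_html_for.
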
def WriteHTML(self, testcaseinfo):
self.CreateHtmlFile()
f = open(self.reportfile, "r")
htmlcontent = f.read()
f.close()
# tree = mytree.fromstring(str(htmlcontent))
htmlcontent.encode('utf-8')
tree = html.fromstring(htmlcontent)
tableElem = tree.find(".//table")
if testcaseinfo.result == "Failed":
mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#FF0000\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime,
testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
elif testcaseinfo.result == "Pass":
mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td bgcolor=\"#00FF00\">{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime,
testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
else:
mytablerow = "<tr><td>{0}</td><td>{1}</td><td>{2}</td><td>{3}</td><td>{4}</td><td>{5}</td><td>{6}</td><td>{7}</td></tr>".format(
testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, testcaseinfo.result, testcaseinfo.starttime,
testcaseinfo.endtime, testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
tableElem.append(mytree.HTML(str(mytablerow)))
f = open(self.reportfile, "w")
# html.tostring
newContent = repr(html.tostring(tree, method="html", with_tail=False))
newContent = newContent.replace(r"\n", "").replace(r"\t", "").replace('b\'', "")
newContent = newContent[:len(newContent) - 1]
f.write(newContent)
f.close()
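
# Minimal standalone sketch of the row-append pattern used in WriteHTML;
# etree.XML is used so the <tr> fragment parses exactly as written:
from lxml import etree as _etree
from lxml import html as _lhtml

_page = _lhtml.fromstring(
    '<html><body><table><tr><th>id</th></tr></table></body></html>')
_page.find('.//table').append(_etree.XML('<tr><td>42</td></tr>'))
# _lhtml.tostring(_page, method="html", with_tail=False).decode('utf-8')
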
def process_content(jsonBody,item_dict):
entry = json.loads(jsonBody)
content=Xhtml.fromstring(entry['body'])
    # Extract the author byline and bio from the article body.
try:
author=content.xpath('//span[@class="author"]/text()')[0].strip()
except IndexError:
author = ''
try:
bio=content.xpath('//span[@class="bio"]/text()')[0].strip()
except IndexError:
bio=''
item_dict['author'] = author + bio
coverelement = Element('img')
coverelement.set('src', item_dict['cover'])
content.insert(0, coverelement)
item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    # Log the scraped fields for debugging.
    print("++++\tGet zhihu items\t++++")
    print(item_dict['cover'])
    print(item_dict['created'])
    print(item_dict['title'])
    print(item_dict['author'])
    print(item_dict['link'])
return item_dict
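
# Standalone sketch of the cover-insertion step above, assuming only lxml;
# the URL is a placeholder:
from lxml.html import Element as _Element, fromstring as _fromstring

_body = _fromstring('<div><p>text</p></div>')
_cover = _Element('img')
_cover.set('src', 'https://example.invalid/cover.jpg')
_body.insert(0, _cover)
# tostring(_body, encoding='unicode')
# -> '<div><img src="https://example.invalid/cover.jpg"><p>text</p></div>'
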
def process_content(html,item_dict):
root = Xhtml.fromstring(html)
    # Locate the article body; bail out if it is missing.
try:
content = root.xpath('//*[@class="article-content"]')[0]
except IndexError:
return ''
    # Pick a cover image and sanitize inline images.
item_dict['cover'] = None
imgs = root.xpath('//img[@src]')
if imgs:
for img in imgs:
src=img.attrib['src'].strip()
if (not item_dict['cover']) and src[-3:].lower() in ['jpg','png','gif'] :
item_dict['cover']='http:'+src
                # Prepend the cover image to the article content.
coverelement = Element('img')
coverelement.set('src', item_dict['cover'])
content.insert(0, coverelement)
elif src[:22]=="data:image/png;base64,":
img.set("src","")
else:
pass
item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    # Log the scraped fields for debugging.
    print("++++\tGet jaq items\t++++")
    print(item_dict['cover'])
    print(item_dict['created'])
    print(item_dict['title'])
    print(item_dict['desc'])
    print(item_dict['link'])
return item_dict
def __init__(self, failure):
traceback = html.Element("pre")
traceback.text = failure.getTraceback()
super(StartFailedPage, self).__init__(
status=int(SERVICE_UNAVAILABLE), brief="MAAS failed to start",
detail=html.tostring(traceback, encoding=str))
def adapt_html(
html_text, extra_metadata, click_tracking=True, open_tracking=True,
configuration=None, **kwargs):
"""Changes an HTML string by replacing links (<a href...>) with tracking
links and by adding a 1x1 transparent pixel just before the closing body
tag.
:param html_text: The HTML to change (unicode or bytestring).
:param extra_metadata: A dict that can be json-encoded and that will
be encoded in the tracking link.
:param click_tracking: If links (<a href...>) must be changed.
:param open_tracking: If a transparent pixel must be added before the
closing body tag.
:param configuration: An optional Configuration instance.
:param kwargs: Optional configuration parameters. If provided with a
Configuration instance, the kwargs parameters will override the
Configuration parameters.
"""
configuration = get_configuration(configuration, kwargs)
tree = html.fromstring(html_text)
if click_tracking:
_replace_links(tree, extra_metadata, configuration)
if open_tracking:
_add_tracking_pixel(tree, extra_metadata, configuration)
new_html_text = html.tostring(tree)
return new_html_text.decode("utf-8")
def _clean_html(html_value, cleaner):
fragments = html.fragments_fromstring(html_value)
for f in fragments:
if isinstance(f, html.HtmlElement):
cleaner(f)
yield html.tostring(f, encoding="unicode")
else:
yield f
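
# A runnable sketch of the generator above, with a trivial cleaner that
# strips event-handler attributes (the cleaner itself is illustrative):
def _strip_onclick(element):
    for el in element.iter():
        el.attrib.pop('onclick', None)

# ''.join(_clean_html('hello <p onclick="x()">world</p>', _strip_onclick))
# -> 'hello <p>world</p>'
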
def markdown(target, image=False):
fragment = _markdown_fragment(target, image)
return html.tostring(fragment, encoding=unicode)[5:-6] # <div>...</div>
def parse_details_page(url, log, timeout, browser):
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.chardet import xml_to_unicode
import html5lib
from lxml.html import tostring
try:
raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
except Exception as e:
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
log.error('URL malformed: %r'%url)
return
attr = getattr(e, 'args', [None])
attr = attr if attr else [None]
if isinstance(attr[0], socket.timeout):
msg = 'Amazon timed out. Try again later.'
log.error(msg)
else:
msg = 'Failed to make details query: %r'%url
log.exception(msg)
return
    oraw = raw
raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
if '<title>404 - ' in raw:
log.error('URL malformed: %r'%url)
return
try:
root = html5lib.parse(raw, treebuilder='lxml',
namespaceHTMLElements=False)
    except Exception:
msg = 'Failed to parse amazon details page: %r'%url
log.exception(msg)
return
errmsg = root.xpath('//*[@id="errorMessage"]')
if errmsg:
msg = 'Failed to parse amazon details page: %r'%url
        msg += tostring(errmsg[0], method='text', encoding=unicode).strip()
log.error(msg)
return
from css_selectors import Select
selector = Select(root)
return oraw, root, selector
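
# Standalone sketch of the html5lib-to-lxml parse used above: the lxml
# treebuilder returns a tree whose nodes support xpath() directly.
#
#   import html5lib
#   root = html5lib.parse('<p>hi</p>', treebuilder='lxml',
#                         namespaceHTMLElements=False)
#   root.xpath('//p/text()')  -> ['hi']
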
def _render_comments(self, desc):
from calibre.library.comments import sanitize_comments_html
import html5lib
        # html5lib parses noscript as CDATA
        desc = html5lib.parseFragment(
            '<div>%s</div>' % (self.totext(desc).replace('textarea', 'div')),
            treebuilder='lxml', namespaceHTMLElements=False)[0]
        # The "????" keywords below are garbled (originally Chinese) section
        # headings, preserved as-is.
        matches = desc.xpath(
            'descendant::*[contains(text(), "????")'
            ' or contains(text(), "????") or contains(text(), "????")'
            ' or contains(text(), "????") or contains(text(), "????")]'
            '/../*[self::p or self::div or self::span]')
if matches:
if len(matches)>1:
desc = matches[-1]
for item in matches:
content_len = len(self.totext(item))
if content_len > 50 and content_len < 200:
desc = item
break
for c in desc.xpath('descendant::noscript'):
c.getparent().remove(c)
for c in desc.xpath('descendant::*[@class="seeAll" or'
' @class="emptyClear" or @id="collapsePS" or'
' @id="expandPS"]'):
c.getparent().remove(c)
        # Flatten links: drop hrefs and demote <a> to <span>.
for a in desc.xpath('descendant::a[@href]'):
del a.attrib['href']
a.tag = 'span'
desc = self.tostring(desc, method='text', encoding=unicode).strip()
# Encoding bug in Amazon data U+fffd (replacement char)
# in some examples it is present in place of '
desc = desc.replace('\ufffd', "'")
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Collapse whitespace
desc = re.sub('\n+', '\n', desc)
desc = re.sub(' +', ' ', desc)
# Remove the notice about text referring to out of print editions
desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
return sanitize_comments_html(desc)
def parse_series(self, root):
ans = (None, None)
# This is found on the paperback/hardback pages for books on amazon.com
series = root.xpath('//div[@data-feature-name="seriesTitle"]')
if series:
series = series[0]
spans = series.xpath('./span')
if spans:
raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
                m = re.search(r'\s+([0-9.]+)$', raw.strip())
if m is not None:
series_index = float(m.group(1))
s = series.xpath('./a[@id="series-page-link"]')
if s:
series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
if series:
ans = (series, series_index)
# This is found on Kindle edition pages on amazon.com
if ans == (None, None):
for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
text = (span.text or '').strip()
            m = re.match(r'Book\s+([0-9.]+)', text)
if m is not None:
series_index = float(m.group(1))
a = span.xpath('./a[@href]')
if a:
series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
if series:
ans = (series, series_index)
if ans == (None, None):
desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
if desc:
raw = self.tostring(desc[0], method='text', encoding=unicode)
raw = re.sub(r'\s+', ' ', raw)
match = self.series_pat.search(raw)
if match is not None:
s, i = match.group('series'), float(match.group('index'))
if s:
ans = (s, i)
if ans[0]:
ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
return ans
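
# Sketch of the trailing-index extraction used in parse_series; the sample
# title is made up:
import re as _re

_m = _re.search(r'\s+([0-9.]+)$', 'The Dark Tower 4')
# _m.group(1) == '4'; float(_m.group(1)) == 4.0 becomes the series index.
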
def update_zendesk_article_html(self):
        '''
        Rewrite the HTML of Zendesk articles so that anchor tags point at the
        new Zendesk articles instead of the old UserVoice articles.
        '''
        print("**UPDATING HTML to switch anchor hrefs to zendesk")
url = '{}/api/v2/help_center/categories/{}/articles.json'.format(self.zendesk_url, self.zendesk_destination_category_id)
articles = []
while url:
response = requests.get(url, headers=self.headers, auth=self.credentials)
if response.status_code != 200:
                print('FAILED to get article list with error {}'.format(response.status_code))
exit()
data = response.json()
for article in data['articles']:
articles.append(article)
url = data['next_page']
print "UPDATING HTML for {} articles".format(len(articles))
for article in articles:
url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
response = requests.get(url, headers=self.headers, auth=self.credentials)
if response.status_code != 200:
print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
exit()
html_doc = fromstring(article['body'])
for anchor_tag in html_doc.cssselect('a'):
if not anchor_tag.get('href'):
continue
                number_from_string_regex = re.search(r'(\d+)', anchor_tag.get('href'))
if not number_from_string_regex:
continue
uv_id = int(number_from_string_regex.group(0))
if uv_id in self.uvid_to_zdid:
url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, self.uvid_to_zdid[uv_id])
response = requests.get(url, headers=self.headers, auth=self.credentials)
if response.status_code != 200:
print('FAILED to get article {} with error {}'.format(self.uvid_to_zdid[uv_id], response.status_code))
exit()
new_url = response.json()['article']['html_url']
                    try:
                        print('CHANGING {} to {}'.format(anchor_tag.get('href'), new_url))
                    except Exception as e:
                        # Printing can fail on non-ASCII hrefs under some
                        # console encodings; the rewrite still proceeds.
                        print("print failed: {}".format(e))
                    anchor_tag.set('href', new_url)
                    info = {
                        'body': tostring(html_doc, encoding='unicode')
                    }
payload = json.dumps({'article': info})
url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
response = requests.put(url, data=payload, headers=self.headers, auth=self.credentials)
if response.status_code != 200:
print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
exit()
                else:
                    print("SKIPPING this href {}".format(anchor_tag.get('href')))
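
# Isolated, runnable sketch of the link-rewrite core above, without the
# Zendesk API; the mapping and URLs are made up for illustration:
import re as _re
from lxml.html import fromstring as _fromstring

_uvid_to_zdid = {123: 456}
_doc = _fromstring('<p><a href="https://uservoice.example/kb/123-old">old</a></p>')
for _a in _doc.cssselect('a'):
    _m = _re.search(r'(\d+)', _a.get('href') or '')
    if _m and int(_m.group(1)) in _uvid_to_zdid:
        _a.set('href', 'https://zendesk.example/articles/{}'.format(
            _uvid_to_zdid[int(_m.group(1))]))
# tostring(_doc, encoding='unicode')
# -> '<p><a href="https://zendesk.example/articles/456">old</a></p>'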