def _render_comments(self, desc):
    """Turn a description node into a cleaned, sanitized comments string.

    The text of ``desc`` is re-parsed with html5lib, a likely description
    element is picked from the parsed tree, boilerplate nodes and link
    targets are removed, the result is serialized and then cleaned with a
    series of regex substitutions.

    :param desc: object accepted by ``self.totext`` (presumably an lxml
        element -- confirm against the caller).
    :return: the result of ``sanitize_comments_html`` on the cleaned text.
    """
    from calibre.library.comments import sanitize_comments_html
    import html5lib

    # html5lib parsed noscript as CDATA
    # Every occurrence of "textarea" in the extracted text is rewritten to
    # "div" before re-parsing (presumably so content inside a <textarea>
    # is parsed as markup rather than raw text -- confirm).
    markup = '<div>%s</div>' % self.totext(desc).replace('textarea', 'div')
    # [0] takes the first top-level node of the fragment, expected to be
    # the wrapper <div> added above.
    desc = html5lib.parseFragment(
        markup, treebuilder='lxml', namespaceHTMLElements=False)[0]

    # Find elements whose text contains one of the heading keywords, then
    # collect the p/div/span children of each such element's parent.
    # NOTE(review): the five "????" literals are identical and look like
    # non-ASCII keywords lost to an encoding error; they are kept verbatim
    # here and should be restored from the original source.
    matches = desc.xpath(
        'descendant::*[contains(text(), "????") '
        'or contains(text(), "????") or contains(text(), "????") '
        'or contains(text(), "????") or contains(text(), "????")]'
        '/../*[self::p or self::div or self::span]')

    # NOTE(review): this nesting was reconstructed from a copy of the code
    # that had lost its indentation; the loop is assumed to belong to the
    # "more than one match" branch -- confirm against the original.
    if len(matches) > 1:
        # Default to the last candidate, but prefer the first one whose
        # text length is strictly between 50 and 200 characters.
        desc = matches[-1]
        for item in matches:
            if 50 < len(self.totext(item)) < 200:
                desc = item
                break

    for c in desc.xpath('descendant::noscript'):
        c.getparent().remove(c)
    for c in desc.xpath('descendant::*[@class="seeAll" or'
                        ' @class="emptyClear" or @id="collapsePS" or'
                        ' @id="expandPS"]'):
        c.getparent().remove(c)

    # Neutralise links: drop the target and turn the anchor into a span.
    for a in desc.xpath('descendant::a[@href]'):
        del a.attrib['href']
        a.tag = 'span'

    # NOTE(review): ``unicode`` is the Python 2 builtin and does not exist
    # on Python 3 unless this module defines it. Also, with method='text'
    # the tag-oriented substitutions below only fire on literal "<...>"
    # text (assuming self.tostring is lxml's tostring) -- confirm that
    # 'text' rather than 'html' is intended.
    desc = self.tostring(desc, method='text', encoding=unicode).strip()

    # Encoding bug in Amazon data U+fffd (replacement char)
    # in some examples it is present in place of '
    # The u prefix keeps this a real U+FFFD character even when the module
    # does not enable unicode_literals on Python 2.
    desc = desc.replace(u'\ufffd', "'")
    # remove all attributes from tags
    desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
    # Collapse whitespace
    desc = re.sub('\n+', '\n', desc)
    desc = re.sub(' +', ' ', desc)
    # Remove the notice about text referring to out of print editions
    desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
    # Remove comments
    desc = re.sub(r'(?s)<!--.*?-->', '', desc)
    return sanitize_comments_html(desc)
评论列表
文章目录