__init__.py 文件源码-python代码片段

def _render_comments(self, desc):
        """Turn a description node into sanitized comment text.

        The text of ``desc`` (obtained through ``self.totext``) is wrapped in
        a ``<div>`` and re-parsed with html5lib.  When the marker XPath below
        matches more than one element, the tree is narrowed to one of the
        matches.  ``noscript`` elements and the expand/collapse helper
        elements are removed, links are turned into plain ``span`` elements,
        and the result is serialized with ``self.tostring`` and cleaned up
        with a series of substitutions.

        ``desc`` is whatever ``self.totext`` accepts -- presumably an lxml
        element; confirm against the caller.

        Returns the result of ``sanitize_comments_html`` applied to the
        cleaned-up text.
        """
        from calibre.library.comments import sanitize_comments_html
        import html5lib

        # html5lib parsed noscript as CDATA
        markup = '<div>%s</div>' % (self.totext(desc).replace('textarea', 'div'))
        root = html5lib.parseFragment(markup, treebuilder='lxml',
                                      namespaceHTMLElements=False)[0]

        # NOTE(review): the "????" literals look like marker text damaged by
        # an encoding round-trip; they are kept exactly as found -- confirm
        # the intended strings against the upstream source.
        candidates = root.xpath('descendant::*[contains(text(), "????") \
            or contains(text(), "????") or contains(text(), "????") \
            or contains(text(), "????") or contains(text(), "????")]/../*[self::p or self::div or self::span]')

        # Only narrow the tree when there is a real choice to make: take the
        # first candidate whose text is between 50 and 200 characters
        # (exclusive), falling back to the last candidate.
        if len(candidates) > 1:
            root = next(
                (node for node in candidates
                 if 50 < len(self.totext(node)) < 200),
                candidates[-1])

        # Drop noscript elements first, then the expand/collapse helpers.
        unwanted = (
            'descendant::noscript',
            'descendant::*[@class="seeAll" or'
            ' @class="emptyClear" or @id="collapsePS" or'
            ' @id="expandPS"]',
        )
        for expression in unwanted:
            for node in root.xpath(expression):
                node.getparent().remove(node)

        # Neutralise links: keep the element, lose the target.
        for link in root.xpath('descendant::a[@href]'):
            link.tag = 'span'
            del link.attrib['href']

        text = self.tostring(root, method='text', encoding=unicode).strip()
        # Encoding bug in Amazon data U+fffd (replacement char)
        # in some examples it is present in place of '
        text = text.replace('\ufffd', "'")

        substitutions = (
            # remove all attributes from tags
            (r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>'),
            # Collapse whitespace
            ('\n+', '\n'),
            (' +', ' '),
            # Remove the notice about text referring to out of print editions
            (r'(?s)<em>--This text ref.*?</em>', ''),
            # Remove comments
            (r'(?s)<!--.*?-->', ''),
        )
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        return sanitize_comments_html(text)