python类tostring()的实例源码

html2md.py 文件源码 项目:nom 作者: frnsys 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def html_to_markdown(html):
    """Convert an HTML string to markdown.

    Inline span/font styling is first normalized to semantic tags
    (e.g. `<span style='font-weight:bold;'>foo</span>` becomes
    `<strong>foo</strong>`) so the markdown converter picks it up.
    """
    tree = fromstring(html)

    clean_highlighted_code(tree)
    for el in tree.findall('.//span') + tree.findall('.//font'):
        convert_span(el)

    rendered = tostring(tree).decode('utf-8')

    # <mark> has no markdown equivalent; map it to `==` highlight syntax.
    rendered = rendered.replace('<mark>', '==').replace('</mark>', '==')

    md = to_md(rendered)

    # html2text can emit whitespace-only lines and long blank-line runs;
    # blank out the former and collapse the latter to a single break.
    md = re.sub(r'\n([\s\*_]+)\n', '\n\n', md)
    md = re.sub(r'\n{3,}', '\n\n', md)

    return md
parsers.py 文件源码 项目:nom 作者: frnsys 项目源码 文件源码 阅读 35 收藏 0 点赞 0 评论 0
def rewrite_links(raw_html, rewrite_func):
    """Parse `raw_html`, rewrite each link through `rewrite_func`,
    and return the re-serialized HTML."""
    doc = fromstring(raw_html)
    doc.rewrite_links(rewrite_func)
    return tostring(doc)
tools.py 文件源码 项目:reahl 作者: reahl 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def view_source(self):
        """Print the pretty-printed HTML of the current page, one line at a time."""
        source = html.tostring(self.lxml_html, pretty_print=True, encoding='unicode')
        for line in source.split('\n'):
            print(line)
tools.py 文件源码 项目:reahl 作者: reahl 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def get_html_for(self, locator):
        """Returns the HTML of the element (including its own tags) targeted by the given `locator`

           :param locator: An instance of :class:`XPath` or a string containing an XPath expression.
        """
        expression = six.text_type(locator)
        matched = self.xpath(expression)
        return html.tostring(matched[0], encoding='unicode')
tools.py 文件源码 项目:reahl 作者: reahl 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def get_inner_html_for(self, locator):
        """Returns the HTML of the children of the element targeted by the given `locator` (excluding the
           element's own tags).

           :param locator: An instance of :class:`XPath` or a string containing an XPath expression.
        """
        expression = six.text_type(locator)
        target = self.xpath(expression)[0]
        parts = [html.tostring(child, encoding='unicode') for child in target.getchildren()]
        return ''.join(parts)
TestReport.py 文件源码 项目:spider 作者: luanxiangming 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def WriteHTML(self, testcaseinfo):
        """Append one result row for `testcaseinfo` to the HTML report table.

        Reads the existing report file, locates its first <table>, appends
        a <tr> whose result cell is colored by outcome (red for "Failed",
        green for "Pass"), and writes the document back.

        :param testcaseinfo: object with id, name, owner, result,
            starttime, endtime, secondsDuration and errorinfo attributes.
        """
        self.CreateHtmlFile()

        # Context managers replace the unmanaged open/close pairs; the
        # original also called htmlcontent.encode('utf-8') and discarded
        # the result — a no-op that has been removed.
        with open(self.reportfile, "r") as f:
            htmlcontent = f.read()

        tree = html.fromstring(htmlcontent)
        tableElem = tree.find(".//table")

        # Only the result cell's background differs between outcomes.
        if testcaseinfo.result == "Failed":
            result_cell = "<td bgcolor=\"#FF0000\">{0}</td>".format(testcaseinfo.result)
        elif testcaseinfo.result == "Pass":
            result_cell = "<td bgcolor=\"#00FF00\">{0}</td>".format(testcaseinfo.result)
        else:
            result_cell = "<td>{0}</td>".format(testcaseinfo.result)
        mytablerow = ("<tr><td>{0}</td><td>{1}</td><td>{2}</td>{3}<td>{4}</td>"
                      "<td>{5}</td><td>{6}</td><td>{7}</td></tr>").format(
            testcaseinfo.id, testcaseinfo.name, testcaseinfo.owner, result_cell,
            testcaseinfo.starttime, testcaseinfo.endtime,
            testcaseinfo.secondsDuration, testcaseinfo.errorinfo)
        tableElem.append(mytree.HTML(str(mytablerow)))

        # Decode the serialized bytes directly instead of the previous
        # repr() round-trip, which left literal "\n"/"\t" escape sequences
        # in the output and truncated the final character.
        newContent = html.tostring(tree, method="html", with_tail=False).decode("utf-8")
        newContent = newContent.replace("\n", "").replace("\t", "")

        with open(self.reportfile, "w") as f:
            f.write(newContent)
zhihu.py 文件源码 项目:all2rss 作者: Sendarg 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def process_content(jsonBody,item_dict):
    entry = json.loads(jsonBody)
    content=Xhtml.fromstring(entry['body'])
    # get author
    # print item_dict['json_url']
    try:
        author=content.xpath('//span[@class="author"]/text()')[0].strip()
    except IndexError:
        author = ''
    try:
        bio=content.xpath('//span[@class="bio"]/text()')[0].strip()
    except IndexError:
        bio=''
    item_dict['author'] = author + bio

    coverelement = Element('img')
    coverelement.set('src', item_dict['cover'])
    content.insert(0, coverelement)

    item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    #
    print "++++\tGet zhihu items\t++++"
    print item_dict['cover']
    print item_dict['created']
    print item_dict['title']
    print item_dict['author']
    print item_dict['link']
    return item_dict
jaq.py 文件源码 项目:all2rss 作者: Sendarg 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def process_content(html,item_dict):
    """Fill `item_dict` from a jaq article page: choose a cover image,
    strip inline base64 images, and serialize the article content.

    :param html: raw HTML of the article page.
    :param item_dict: dict mutated in place ('cover', 'content').
    :return: the updated item_dict, or '' when no article body exists.
    """
    root = Xhtml.fromstring(html)
    # Locate the article body; bail out if the page has no content node.
    try:
        content = root.xpath('//*[@class="article-content"]')[0]
    except IndexError:
        return ''
    #
    item_dict['cover'] = None
    imgs = root.xpath('//img[@src]')
    if imgs:
        for img in imgs:
            src=img.attrib['src'].strip()
            # First jpg/png/gif image becomes the cover; src is assumed
            # to be protocol-relative ('//...') — TODO confirm.
            if (not item_dict['cover']) and  src[-3:].lower() in ['jpg','png','gif'] :
                item_dict['cover']='http:'+src
                # Prepend the cover image to the article content.
                coverelement = Element('img')
                coverelement.set('src', item_dict['cover'])
                content.insert(0, coverelement)
            elif src[:22]=="data:image/png;base64,":
                # Blank out inline base64 PNGs.
                img.set("src","")
            else:
                pass


    item_dict['content'] = Xhtml.tostring(content, encoding='unicode')
    #
    print "++++\tGet jaq items\t++++"
    print item_dict['cover']
    print item_dict['created']
    print item_dict['title']
    print item_dict['desc']
    print item_dict['link']
    return item_dict
webapp.py 文件源码 项目:maas 作者: maas 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def __init__(self, failure):
        """Build an error page that embeds `failure`'s traceback in a <pre> tag."""
        pre = html.Element("pre")
        pre.text = failure.getTraceback()
        detail = html.tostring(pre, encoding=str)
        super(StartFailedPage, self).__init__(
            status=int(SERVICE_UNAVAILABLE), brief="MAAS failed to start",
            detail=detail)
html.py 文件源码 项目:pytracking 作者: resulto 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def adapt_html(
        html_text, extra_metadata, click_tracking=True, open_tracking=True,
        configuration=None, **kwargs):
    """Rewrite an HTML string for email tracking.

    Replaces links (<a href...>) with tracking links when
    `click_tracking` is set, and inserts a 1x1 transparent pixel just
    before the closing body tag when `open_tracking` is set.

    :param html_text: The HTML to change (unicode or bytestring).
    :param extra_metadata: A json-encodable dict embedded in the
        tracking link.
    :param click_tracking: If links (<a href...>) must be changed.
    :param open_tracking: If a transparent pixel must be added before
        the closing body tag.
    :param configuration: An optional Configuration instance.
    :param kwargs: Optional configuration parameters that override the
        Configuration instance, when both are provided.
    """
    configuration = get_configuration(configuration, kwargs)
    tree = html.fromstring(html_text)

    if click_tracking:
        _replace_links(tree, extra_metadata, configuration)
    if open_tracking:
        _add_tracking_pixel(tree, extra_metadata, configuration)

    return html.tostring(tree).decode("utf-8")
clean_input.py 文件源码 项目:idealoom 作者: conversence 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def _clean_html(html_value, cleaner):
    """Yield the pieces of `html_value`: element fragments are run
    through `cleaner` and re-serialized; bare text fragments pass
    through unchanged."""
    for fragment in html.fragments_fromstring(html_value):
        if not isinstance(fragment, html.HtmlElement):
            yield fragment
        else:
            cleaner(fragment)
            yield html.tostring(fragment, encoding="unicode")
text.py 文件源码 项目:weasyl 作者: Weasyl 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def markdown(target, image=False):
    """Render `target` as HTML, stripping the wrapping <div>...</div>
    that the fragment serializer adds."""
    rendered = html.tostring(_markdown_fragment(target, image), encoding=unicode)
    return rendered[5:-6]
__init__.py 文件源码 项目:calibre_dangdang 作者: qunxyz 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def parse_details_page(url, log, timeout, browser):
    """Fetch and parse a gb18030-encoded book details page.

    :return: (original_raw_html, lxml_root, css_selector) on success, or
        None when the fetch fails, the page is a 404/error page, or
        parsing fails. All failures are logged, never raised.
    """
    from calibre.utils.cleantext import clean_ascii_chars
    from calibre.ebooks.chardet import xml_to_unicode
    import html5lib
    from lxml.html import tostring
    try:
        raw = browser.open_novisit(url, timeout=timeout).read().decode('gb18030').strip()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and \
                        e.getcode() == 404:
            log.error('URL malformed: %r'%url)
            return
        attr = getattr(e, 'args', [None])
        attr = attr if attr else [None]
        if isinstance(attr[0], socket.timeout):
            msg = 'Amazon timed out. Try again later.'
            log.error(msg)
        else:
            msg = 'Failed to make details query: %r'%url
            log.exception(msg)
        return

    # Keep the untouched page text for the caller; the dead `raw = raw`
    # statement that followed has been removed.
    oraw = raw
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    if '<title>404 - ' in raw:
        log.error('URL malformed: %r'%url)
        return

    try:
        root = html5lib.parse(raw, treebuilder='lxml',
                              namespaceHTMLElements=False)
    except:
        msg = 'Failed to parse amazon details page: %r'%url
        log.exception(msg)
        return

    errmsg = root.xpath('//*[@id="errorMessage"]')
    if errmsg:
        msg = 'Failed to parse amazon details page: %r'%url
        # xpath() returns a list; tostring() needs the element itself,
        # so serialize the first match rather than the list.
        msg += tostring(errmsg[0], method='text', encoding=unicode).strip()
        log.error(msg)
        return

    from css_selectors import Select
    selector = Select(root)
    return oraw, root, selector
__init__.py 文件源码 项目:calibre_dangdang 作者: qunxyz 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def _render_comments(self, desc):
        """Reduce a raw description node to sanitized comment HTML.

        Re-parses the description text, tries to pick the most relevant
        sub-block, strips noscript/see-all chrome and link targets, then
        normalizes the text and returns sanitize_comments_html()'s result.

        :param desc: lxml node holding the raw description markup.
        """
        from calibre.library.comments import sanitize_comments_html
        import html5lib
        # html5lib parsed noscript as CDATA

        desc = html5lib.parseFragment('<div>%s</div>' % (self.totext(desc).replace('textarea', 'div')), \
                                      treebuilder='lxml', namespaceHTMLElements=False)[0]
        # NOTE(review): the "????" literals below are mojibake from a
        # non-UTF-8 source — presumably Chinese section headings; confirm
        # against the original repository before trusting these matches.
        matches = desc.xpath('descendant::*[contains(text(), "????") \
            or contains(text(), "????") or contains(text(), "????") \
            or contains(text(), "????") or contains(text(), "????")]/../*[self::p or self::div or self::span]')

        if matches:
            if len(matches)>1:
                # Prefer the first medium-length (50-200 chars) match;
                # otherwise fall back to the last match.
                desc = matches[-1]
                for item in matches:
                    content_len = len(self.totext(item))
                    if content_len > 50 and content_len < 200:
                        desc = item
                        break

        for c in desc.xpath('descendant::noscript'):
            c.getparent().remove(c)
        for c in desc.xpath('descendant::*[@class="seeAll" or'
                            ' @class="emptyClear" or @id="collapsePS" or'
                            ' @id="expandPS"]'):
            c.getparent().remove(c)
        # Demote links to plain spans so the comment HTML has no targets.
        for a in desc.xpath('descendant::a[@href]'):
            del a.attrib['href']
            a.tag = 'span'
        desc = self.tostring(desc, method='text', encoding=unicode).strip()
        # return desc
        # Encoding bug in Amazon data U+fffd (replacement char)
        # in some examples it is present in place of '
        desc = desc.replace('\ufffd', "'")
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Collapse whitespace
        desc = re.sub('\n+', '\n', desc)
        desc = re.sub(' +', ' ', desc)
        # Remove the notice about text referring to out of print editions
        desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        return sanitize_comments_html(desc)
__init__.py 文件源码 项目:calibre_dangdang 作者: qunxyz 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def parse_series(self, root):
        """Extract (series_name, series_index) from a parsed product page.

        Three page layouts are tried in order; the first that matches
        wins: the paperback/hardback "seriesTitle" block, the Kindle
        "aboutEbooksSection" list, and finally the free-text buying block.

        :param root: lxml root of the details page.
        :return: (series, index) tuple, or (None, None) when not found.
        """
        ans = (None, None)

        # This is found on the paperback/hardback pages for books on amazon.com
        series = root.xpath('//div[@data-feature-name="seriesTitle"]')
        if series:
            series = series[0]
            spans = series.xpath('./span')
            if spans:
                # The span's trailing number is the series index.
                raw = self.tostring(spans[0], encoding=unicode, method='text', with_tail=False).strip()
                m = re.search('\s+([0-9.]+)$', raw.strip())
                if m is not None:
                    series_index = float(m.group(1))
                    s = series.xpath('./a[@id="series-page-link"]')
                    if s:
                        series = self.tostring(s[0], encoding=unicode, method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)
        # This is found on Kindle edition pages on amazon.com
        if ans == (None, None):
            for span in root.xpath('//div[@id="aboutEbooksSection"]//li/span'):
                text = (span.text or '').strip()
                m = re.match('Book\s+([0-9.]+)', text)
                if m is not None:
                    series_index = float(m.group(1))
                    a = span.xpath('./a[@href]')
                    if a:
                        series = self.tostring(a[0], encoding=unicode, method='text', with_tail=False).strip()
                        if series:
                            ans = (series, series_index)
        if ans == (None, None):
            # Last resort: run series_pat over the buying block's text.
            desc = root.xpath('//div[@id="ps-content"]/div[@class="buying"]')
            if desc:
                raw = self.tostring(desc[0], method='text', encoding=unicode)
                raw = re.sub(r'\s+', ' ', raw)
                match = self.series_pat.search(raw)
                if match is not None:
                    s, i = match.group('series'), float(match.group('index'))
                    if s:
                        ans = (s, i)
        if ans[0]:
            # Strip trailing "... Series" / "(... Series)" decorations.
            ans = (re.sub(r'\s+Series$', '', ans[0]).strip(), ans[1])
            ans = (re.sub(r'\(.+?\s+Series\)$', '', ans[0]).strip(), ans[1])
        return ans
transfer_knowledgebase.py 文件源码 项目:zendesk-utils 作者: trailbehind 项目源码 文件源码 阅读 45 收藏 0 点赞 0 评论 0
def update_zendesk_article_html(self):
    '''
    Rewrite the HTML body of every zendesk article in the destination
    category so anchor hrefs point at the migrated zendesk articles
    instead of the old uservoice articles.

    Pages through the category's article list, then for each article
    maps any numeric uservoice id found in an href through
    self.uvid_to_zdid and PUTs the updated body back. Any non-200 API
    response terminates the whole process via exit().
    '''
    print "**UPDATING HTML to switch anchor hrefs to zendesk"
    url = '{}/api/v2/help_center/categories/{}/articles.json'.format(self.zendesk_url, self.zendesk_destination_category_id)

    # Collect every article in the category, following 'next_page' links.
    articles = []
    while url:
      response = requests.get(url, headers=self.headers, auth=self.credentials)
      if response.status_code != 200:
        print('FAILED to get get article list with error {}'.format(response.status_code))
        exit()
      data = response.json()
      for article in data['articles']:
        articles.append(article)
      url = data['next_page']

    print "UPDATING HTML for {} articles".format(len(articles))
    for article in articles:
      url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
      response = requests.get(url, headers=self.headers, auth=self.credentials)
      if response.status_code != 200:
        print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
        exit()
      html_doc = fromstring(article['body'])
      for anchor_tag in html_doc.cssselect('a'):
        if not anchor_tag.get('href'):
          continue
        # The first run of digits in the href is taken as the uservoice id.
        number_from_string_regex = re.search('(\d+)', anchor_tag.get('href'))
        if not number_from_string_regex:
          continue
        uv_id = int(number_from_string_regex.group(0))
        if uv_id in self.uvid_to_zdid:
          # Fetch the migrated article to learn its public html_url.
          url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, self.uvid_to_zdid[uv_id])
          response = requests.get(url, headers=self.headers, auth=self.credentials)
          if response.status_code != 200:
            print('FAILED to get article {} with error {}'.format(self.uvid_to_zdid[uv_id], response.status_code))
            exit()
          new_url = response.json()['article']['html_url']
          try:
            print('CHANGING {} to {}'.format(anchor_tag.get('href'), new_url))
          except:
            e = sys.exc_info()[0]
            print "lxml parsing error {}".format(e)
          anchor_tag.set('href', new_url)
          # NOTE(review): the body is PUT back once per rewritten anchor
          # rather than once per article — correct but redundant requests.
          info = {
            'body': tostring(html_doc)
          }
          payload = json.dumps({'article': info})
          url = "{}/api/v2/help_center/articles/{}.json".format(self.zendesk_url, article['id'])
          response = requests.put(url, data=payload, headers=self.headers, auth=self.credentials)
          if response.status_code != 200:
            print('FAILED to update HTML for article {} with error {}'.format(article['id'], response.status_code))
            exit()
        else:
          print "SKIPPING this href {}".format(anchor_tag.get('href'))


问题


面经


文章

微信
公众号

扫码关注公众号