python类html2text()的实例源码-面圈网

scrape_util.py 文件源码项目：pietsmiet_xposter 作者: PietsmietApp 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def format_text(feed):
    """
    Render a feed entry as a markdown post body.

    The HTML description is converted to markdown, preceded by a link to the
    original article and followed by a footer line with the source-code and
    community-app links.

    :param feed: Feed to format; its ``desc`` and ``link`` attributes are read
    :return: formatted text
    """
    body = html2text.html2text(feed.desc)
    article_url = feed.link

    header = '[Link zum PietSmiet.de-Artikel](' + article_url + ')\n\n'
    footer = (
        '\n\n--- \n[Code](https://github.com/PietsmietApp/pietsmiet_xposter) | '
        '*Auch als Push-Benachrichtigung in der [Community App für Pietsmiet]('
        'https://play.google.com/store/apps/details?id=de.pscom.pietsmiet&referrer=utm_source%3Dreddit'
        '%26utm_medium%3Duploadplan)* '
    )

    return header + body + footer

email_service.py 文件源码项目：nflpool 作者: prcutler 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def send_email(to_address, subject, html_body):
        """
        Send an HTML email together with a plain-text rendering of it.

        The message is handed to the SMTP server only when the service is not
        in debug mode. Every exception is caught and printed, so callers never
        see a failure.

        :param to_address: value used for the message's ``To`` field
        :param subject: subject line
        :param html_body: HTML content; also converted by html2text for ``Body``
        """
        try:
            server = EmailService.create_smtp_server()
            msg = mailer.Message(
                From=EmailService.__from_address,
                To=to_address,
                charset='utf-8')
            msg.Subject = subject
            msg.Html = html_body
            msg.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(msg)
        except Exception as err:
            print("Error sending mail: {}".format(err))

calendarChecker.py 文件源码项目：matterbot-calendarBot 作者: mharrend 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def checkCalendarForUpcomingEvents():
    """
    Checks calendar for upcoming events and posts them to Mattermost.

    The agenda between now and now + ``calendarSettings['TimespanToCheck']``
    minutes is requested through ``showAgenda``. Each returned item is sent
    to the channel named after each of its categories; when a send fails, the
    event text plus the exception's ``__doc__`` goes to the default channel.
    """
    stamp_format = "%d.%m.%Y %H:%M"
    window_start = datetime.datetime.now()
    window_end = window_start + datetime.timedelta(minutes = calendarSettings['TimespanToCheck'])
    successful, events = showAgenda('', window_start.strftime(stamp_format), window_end.strftime(stamp_format), True)
    if not successful:
        return

    def clock_time(moment):
        # Render a timestamp as HH:MM in the Europe/Copenhagen zone.
        return moment.astimezone(EWSTimeZone.timezone('Europe/Copenhagen')).strftime('%H:%M')

    for event in events:
        eventContent = '### **{0}**\nTime: {1} - {2} (KIT time)\nDetails: {3}Location: {4}\n\n'.format(
            event.subject,
            clock_time(event.start),
            clock_time(event.end),
            html2text.html2text(event.body),
            event.location)
        for channel_name in event.categories:
            try:
                mattermostHook.send(eventContent, channel=channel_name)
            except Exception as error:
                fallback = eventContent + '\n Error occured: \n {0} \n'.format(error.__doc__)
                mattermostHook.send(fallback, channel=mattermostSettings['DefaultChannel'])

dl_shakespeare.py 文件源码项目：Fakespeare 作者: RuthAngus 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def dl_scripts():
    """
    Download every play linked from ``BASE_URL`` into ``plays/<title>.txt``.

    For each selected anchor on the index page the play's ``full.html`` is
    fetched, converted with html2text (after turning ``blockquote`` tags into
    ``p``), trimmed so that it starts at the first ``### ACT I`` heading and
    written to disk as UTF-8.

    :raises ValueError: if a converted play contains no ``### ACT I`` marker
    """
    index_page = requests.get(BASE_URL)
    tree = BeautifulSoup(index_page.text, "html.parser")
    os.makedirs("plays", exist_ok=True)
    # Skip the first two and last seven anchors of the index page
    # (presumably navigation / non-play links -- TODO confirm against the site).
    for a in tree.find_all("a")[2:-7]:
        link = a.get("href").split("/")[0]
        title = a.text.strip().replace(" ", "_").replace("\n", "_")
        fn = "plays/" + title + ".txt"

        play_page = requests.get(BASE_URL + "/" + link + "/full.html")
        body = html2text(play_page.text.replace("blockquote", "p"))
        body = body[body.index("### ACT I"):]
        # Explicit encoding: the platform default may not be able to encode
        # every character html2text produces.
        with open(fn, "w", encoding="utf-8") as f:
            f.write(body)

email_service.py 文件源码项目：cookiecutter-pyramid-talk-python-starter 作者: mikeckennedy 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def send_email(to_address, subject, html_body):
        """
        Send an HTML email together with a plain-text rendering of it.

        The message is handed to the SMTP server only when the service is not
        in debug mode. Every exception is caught and printed, so callers never
        see a failure.

        :param to_address: value used for the message's ``To`` field
        :param subject: subject line
        :param html_body: HTML content; also converted by html2text for ``Body``
        """
        try:
            server = EmailService.create_smtp_server()
            msg = mailer.Message(
                From=EmailService.__from_address,
                To=to_address,
                charset='utf-8')
            msg.Subject = subject
            msg.Html = html_body
            msg.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(msg)
        except Exception as err:
            print("Error sending mail: {}".format(err))

email_service.py 文件源码项目：cookiecutter-course 作者: mikeckennedy 项目源码文件源码阅读 22 收藏 0 点赞 0 评论 0

def send_email(to_address, subject, html_body):
        """
        Send an HTML email together with a plain-text rendering of it.

        The message is handed to the SMTP server only when the service is not
        in debug mode. Every exception is caught and printed, so callers never
        see a failure.

        :param to_address: value used for the message's ``To`` field
        :param subject: subject line
        :param html_body: HTML content; also converted by html2text for ``Body``
        """
        try:
            server = EmailService.create_smtp_server()
            msg = mailer.Message(
                From=EmailService.__from_address,
                To=to_address,
                charset='utf-8')
            msg.Subject = subject
            msg.Html = html_body
            msg.Body = html2text.html2text(html_body)

            if EmailService.__is_debug_mode:
                print("Skipping send, email is in dev mode.")
            else:
                print("Sending message (live!)")
                server.send(msg)
        except Exception as err:
            print("Error sending mail: {}".format(err))

utils.py 文件源码项目：accounts-srv 作者: openpermissions 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def build_mime_text(recipients, subject, message):
    """
    Wraps an HTML message in a multipart/alternative MIME container
    :param recipients: array of email addresses to send email to
    :param subject: subject of email
    :param message: body of email (HTML); a plain-text version is derived from it
    :return MIMEMultipart object (delivered by raising ``Return``)
    """

    # Plain-text alternative first, the HTML original second.
    alternatives = [
        MIMEText(html2text.html2text(message), 'plain'),
        MIMEText(message, 'html'),
    ]

    # Container with the envelope headers.
    envelope = MIMEMultipart('alternative')
    envelope['Subject'] = subject
    envelope['From'] = options.smtp_from
    envelope['To'] = ",".join(recipients)
    for part in alternatives:
        envelope.attach(part)

    raise Return(envelope)

mail.py 文件源码项目：lowfat 作者: softwaresaved 项目源码文件源码阅读 22 收藏 0 点赞 0 评论 0

def staff_reminder(request):  # pylint: disable=invalid-name
    """
    Mail staff a reminder about ``request``, if reminders are enabled.

    The email body comes from the FlatPage stored under
    ``/email/template/<request type>/staff/reminder/``; it is rendered as a
    template and sent both as HTML and as html2text-converted plain text.
    Does nothing when ``config.STAFF_EMAIL_REMINDER`` is falsy.
    """
    if not config.STAFF_EMAIL_REMINDER:
        return

    kind = type(request).__name__.lower()
    reminder_url = "/email/template/{}/staff/reminder/".format(kind)

    values = {
        kind: request,
        "protocol": "https",
        "site": Site.objects.get(id=SITE_ID),
        "FELLOWS_MANAGEMENT_EMAIL": config.FELLOWS_MANAGEMENT_EMAIL,
    }

    page = FlatPage.objects.get(url=reminder_url)
    html = Template(page.content).render(Context(values))
    mail_staffs(
        page.title,
        html2text(html),
        html_message=html,
        fail_silently=False
    )

editor.py 文件源码项目：txt2evernote 作者: Xunius 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def ENMLtoText(contentENML):
        """
        Convert ENML note content to text.

        The input is decoded from UTF-8 and parsed; list-item paragraphs and
        certain ``<br>`` tags inside list items are simplified, ``en-todo``
        elements become ``[x]`` / ``[ ]`` markers, and the result is passed
        through html2text. Trailing spaces before newlines are dropped and
        newlines become ``os.linesep``. Returns the text encoded as UTF-8.
        """
        soup = BeautifulSoup(contentENML.decode('utf-8'))

        # Replace a paragraph that sits directly in a list item by its first child.
        for paragraph in soup.select('li > p'):
            paragraph.replace_with( paragraph.contents[0] )

        # Remove a <br> in a list item when the node two siblings further on
        # is missing/empty or reports an 'li' via find().
        for line_break in soup.select('li > br'):
            if not line_break.next_sibling:
                continue
            second_next = line_break.next_sibling.next_sibling
            if not second_next or second_next.find('li'):
                line_break.extract()

        Editor.checklistInENMLtoSoup(soup)

        # Checked todos first; whatever en-todo remains afterwards is unchecked.
        for todo in soup.findAll('en-todo', checked='true'):
            todo.replace_with('[x]')

        for todo in soup.findAll('en-todo'):
            todo.replace_with('[ ]')

        markup = str(soup).decode('utf-8')
        content = re.sub(r' *\n', os.linesep, html2text.html2text(markup, '', 0))

        return content.encode('utf-8')

cli.py 文件源码项目：FRG-Crowdsourcing 作者: 97amarnathk 项目源码文件源码阅读 22 收藏 0 点赞 0 评论 0

def markdown_db_migrate():
    '''Convert the long description of every existing "app" record from
    HTML to Markdown, updating each row that has a non-empty description.'''
    with app.app_context():
        select_all = 'SELECT id, long_description FROM "app";'
        rows = db.engine.execute(select_all).fetchall()
        for row in rows:
            if not row.long_description:
                # Nothing to convert for this record.
                continue
            converted = html2text(row.long_description)
            update_stmt = text('''
                           UPDATE app SET long_description=:long_description
                           WHERE id=:id''')
            db.engine.execute(update_stmt, long_description = converted, id = row.id)

myanimelist.py 文件源码项目：kitsuchan-2 作者: n303p4 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def _handle_anime(entry):
    """Build a Discord embed from one anime ``entry``.

    The title, URL, a fixed set of fields and the html2text-converted
    synopsis are all taken from the ``.string`` of ``entry``'s attributes.
    """
    card = discord.Embed(title=entry.title.string)
    card.url = BASE_URL_MYANIMELIST.format("anime", entry.id.string)

    # (field label, attribute of ``entry`` whose ``.string`` is the value)
    field_spec = (
        ("ID", "id"),
        ("Synonyms", "synonyms"),
        ("Episodes", "episodes"),
        ("Score", "score"),
        ("Type", "type"),
        ("Status", "status"),
        ("Start date", "start_date"),
        ("End date", "end_date"),
    )
    for label, attribute in field_spec:
        card.add_field(name=label, value=getattr(entry, attribute).string)

    card.description = html2text.html2text(entry.synopsis.string)
    return card

myanimelist.py 文件源码项目：kitsuchan-2 作者: n303p4 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def _handle_manga(entry):
    """Build a Discord embed from one manga ``entry``.

    The title, URL, a fixed set of fields and the html2text-converted
    synopsis are all taken from the ``.string`` of ``entry``'s attributes.
    """
    card = discord.Embed(title=entry.title.string)
    card.url = BASE_URL_MYANIMELIST.format("manga", entry.id.string)

    # (field label, attribute of ``entry`` whose ``.string`` is the value)
    field_spec = (
        ("ID", "id"),
        ("Synonyms", "synonyms"),
        ("Chapters", "chapters"),
        ("Volumes", "volumes"),
        ("Score", "score"),
        ("Type", "type"),
        ("Status", "status"),
        ("Start date", "start_date"),
        ("End date", "end_date"),
    )
    for label, attribute in field_spec:
        card.add_field(name=label, value=getattr(entry, attribute).string)

    card.description = html2text.html2text(entry.synopsis.string)
    return card

utils.py 文件源码项目：stormtrooper 作者: CompileInc 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def send_html_email(to_addr, **kwargs):
    """
    Render and send a templated email as HTML plus a plain-text version.

    :param to_addr: passed to ``send_mail`` as ``recipient_list``
    :param kwargs: must contain ``data_dict`` (template context),
        ``subject_template`` and ``email_template``

    The subject is prefixed with ``settings.EMAIL_TAG`` and stripped of
    newlines. Sending uses ``fail_silently=True``.
    """
    context = kwargs['data_dict']
    subject_tpl = kwargs['subject_template']
    body_tpl = kwargs['email_template']

    prefix = settings.EMAIL_TAG
    subject_line = remove_newlines(render_to_string(subject_tpl, context))
    subject = "{} {}".format(prefix, subject_line)

    html_body = render_to_string(body_tpl, context)
    plain_body = html2text.html2text(html_body)

    send_mail(
        subject=subject,
        message=plain_body,
        from_email=settings.DEFAULT_FROM_EMAIL,
        recipient_list=to_addr,
        fail_silently=True,
        html_message=html_body,
    )

import.py 文件源码项目：utils 作者: ReCodEx 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def load_active_text(soup):
    """
    Return ``(id, text)`` for the first ``text`` element with ``active=1``.

    The element's ``content`` is re-parsed as HTML, the parent of every link
    found inside a ``code`` element is unwrapped, and the result is converted
    with html2text.
    """
    active = soup.select("text[active=1]")[0]
    raw_html = active.find("content").get_text()
    document = BeautifulSoup(raw_html, "lxml")

    for anchor in document.select("code a"):
        anchor.parent.unwrap()

    text_id = active["id"]
    return text_id, html2text(str(document))

import.py 文件源码项目：utils 作者: ReCodEx 项目源码文件源码阅读 19 收藏 0 点赞 0 评论 0

def add_localization(language, exercise_id, config_path):
    """
    Append a localized text, read as HTML from stdin, to an exercise.

    :param language: locale stored with the new text
    :param exercise_id: exercise to fetch and update through the API
    :param config_path: config file relative to the current directory;
        ``import-config.yml`` is used when this is falsy
    """
    config_file = Path.cwd() / (config_path or "import-config.yml")
    settings = Config.load(config_file)
    client = ApiClient(settings.api_url, settings.api_token)

    exercise = client.get_exercise(exercise_id)
    localized_texts = exercise["localizedTexts"]
    localized_texts.append({
        "locale": language,
        "text": html2text(sys.stdin.read())
    })

    client.update_exercise(exercise_id, exercise)

website_language_extraction.py 文件源码项目：wikiwhere 作者: mkrnr 项目源码文件源码阅读 22 收藏 0 点赞 0 评论 0

def get_website_languages(self,json_data):
        """
        Detect the language of every URL referenced in ``json_data``.

        :param json_data: mapping whose values are iterables of URLs
            (iterated as ``json_data[article]``)
        :return: dict mapping each successfully processed URL to the value
            returned by ``detect``; URLs that time out or fail are omitted

        Each URL is processed at most once. A 10 second ``signal.alarm`` is
        armed per URL; the installed handler is expected to raise
        ``TimeoutException`` (handler is set up outside this function).
        """
        url_language_dictionary = {}

        for article in json_data:
            for url in json_data[article]:
                if url in url_language_dictionary:
                    continue

                # start a timeout counter
                signal.alarm(10)

                try:
                    response = urllib.urlopen(url)
                    try:
                        encoding = response.headers.getparam('charset')
                        # Read the body exactly once: a second read() on the
                        # same response returns an empty string.
                        raw_html = response.read()
                    finally:
                        response.close()

                    if encoding is None:
                        encoding = chardet.detect(raw_html)['encoding']

                    encoded_html = unicode(raw_html, encoding, errors='replace')

                    # Round-trip through markdown to strip scripts/styling
                    # before extracting the visible text.
                    markup_text = html2text.html2text(encoded_html)
                    html_from_markup = markdown(markup_text)
                    text = ''.join(BeautifulSoup(html_from_markup,"lxml").findAll(text=True))

                    url_language_dictionary[url] = detect(text)
                except TimeoutException:
                    print("timeout for: " + url)
                except Exception as exception:
                    print("Continue after " + exception.__class__.__name__ + " for URL: " + url)
                finally:
                    # Cancel the pending alarm so it cannot fire after this
                    # URL is done (or after the function has returned).
                    signal.alarm(0)

        return url_language_dictionary

scrape_interface.py 文件源码项目：localdocindex 作者: stcioc 项目源码文件源码阅读 22 收藏 0 点赞 0 评论 0

def converthtml2text(html):
    """
    Flatten HTML to text with markdown tokens and redundant whitespace removed.

    Sets ``BODY_WIDTH``, ``IGNORE_ANCHORS`` and ``IGNORE_IMAGES`` on the
    html2text module before converting (whether these module attributes are
    honoured depends on the installed html2text version -- TODO confirm).
    """
    # build the flat text
    html2text.BODY_WIDTH = 0
    html2text.IGNORE_ANCHORS = True
    html2text.IGNORE_IMAGES = True

    flat = html2text.html2text(html)

    # remove the markup tokens left in the converted text
    for token in ("|", "**", "# ", "* * *"):
        flat = flat.replace(token, "")

    # collapse runs of spaces, then trim spaces around line breaks
    while "  " in flat:
        flat = flat.replace("  ", " ")
    flat = flat.replace("\n ", "\n").replace(" \n", "\n")

    # remove empty lines
    while "\n\n" in flat:
        flat = flat.replace("\n\n", "\n")
    return flat


# Sends a PDF file to OCR.
# The text file is stored in the folder targetpath.
# Returns the path of the output txt file.
# Uses the ABBYY FineReader Hot Folder.
# If the text file already exists (previously OCRed), the file is not OCRed again.
# Can be replaced with another method if necessary.
# Returns a tuple:
# 1st element - operation code (ERROR, CREATED, EXISTS)
# 2nd element - error message or OCR file path

tools.py 文件源码项目：LocalNote 作者: littlecodersh 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def html2text(s):
    """Remove ``en-media`` tags from ``s`` and convert the rest with ``h2t``."""
    without_media = re.sub('</*en-media[^>]*?>', '', s)
    return h2t(without_media)

book.py 文件源码项目：JARVIS-on-Messenger 作者: swapagarwal 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def process(input, entities):
    """
    Look up a book on Goodreads by title and build a reply message.

    :param input: original user input; echoed back as ``output['input']``
    :param entities: parsed entities; the title is read from
        ``entities['book'][0]['value']``
    :return: dict with ``success`` set to True plus ``input``/``output`` on
        success, or ``success`` False plus ``error_msg`` when anything fails
    """
    output = {}
    try:
        book_title = entities['book'][0]['value']

        # Responses are cached in 'book_cache' (sqlite) for 86400 seconds.
        with requests_cache.enabled('book_cache', backend='sqlite', expire_after=86400):
            # Let requests build the query string so that characters such as
            # spaces, '&' or '#' in the title are encoded instead of
            # corrupting the URL.
            response = requests.get(
                'https://www.goodreads.com/book/title.xml',
                params=[('key', GOODREADS_ACCESS_TOKEN), ('title', book_title)])
            data = ElementTree.fromstring(response.content)

        book_node = data.find('book')
        author = book_node.find('authors').find('author').find('name').text
        title = book_node.find('title').text
        description = html2text(book_node.find('description').text)
        average_rating = book_node.find('average_rating').text
        link = book_node.find('link').text
        goodreads_attribution = '- Powered by Goodreads'

        template = TextTemplate()
        template.set_text('Title: ' + title + '\nAuthor: ' + author + '\nDescription: ' + description)
        template.set_post_text('\nAverage Rating: ' + average_rating + ' / 5' + '\n' + goodreads_attribution)

        text = template.get_text()
        template = ButtonTemplate(text)
        template.add_web_url('Goodreads Link', link)

        output['input'] = input
        output['output'] = template.get_message()
        output['success'] = True
    except Exception:
        # Deliberately broad: any lookup/parsing failure becomes a friendly
        # reply. KeyboardInterrupt/SystemExit are no longer swallowed.
        error_message = 'I couldn\'t find any book matching your query.'
        error_message += '\nPlease ask me something else, like:'
        error_message += '\n  - book timeline'
        error_message += '\n  - harry potter book plot'
        error_message += '\n  - little women book rating'
        output['error_msg'] = TextTemplate(error_message).get_message()
        output['success'] = False
    return output

items.py 文件源码项目：dancedeets-monorepo 作者: mikelambert 项目源码文件源码阅读 33 收藏 0 点赞 0 评论 0

def format_text(html):
    """
    Convert HTML to plain text.

    The HTML is converted with html2text (no line wrapping), double newlines
    are halved, HTML entities are unescaped, markdown is stripped, trailing
    spaces before newlines are removed and triple newlines are reduced to two.
    """
    markdown = html2text.html2text(html, bodywidth=0)
    markdown = markdown.replace('\n\n', '\n')
    unescaped = HTMLParser.HTMLParser().unescape(markdown)
    plain = strip_markdown.strip(unescaped)
    plain = re.sub(' +\n', '\n', plain).strip()
    return re.sub('\n\n\n', '\n\n', plain)

controllers.py 文件源码项目：ai-chatbot-framework 作者: alfredfrancis 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def sentenceTokenize():
    """Tokenize the posted ``sentences`` form field (HTML converted to text
    first) and return the result as a plain-text response."""
    plain = html2text.html2text(request.form['sentences'])
    tokens = nlp.sentenceTokenize(plain)
    return buildResponse.sentPlainText(tokens)

controllers.py 文件源码项目：ai-chatbot-framework 作者: alfredfrancis 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def posTagAndLabel():
    """POS-tag and label the posted ``sentences`` form field (HTML converted
    to text first) and return the result as a JSON response."""
    raw = request.form['sentences']
    tagged = nlp.posTagAndLabel(html2text.html2text(raw))
    return buildResponse.buildJson(tagged)

__init__.py 文件源码项目：django-happymailer 作者: barbuza 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def send(self, force=False):
        """
        Render and send this mail as HTML with a plain-text alternative.

        Nothing is sent when the template is disabled, unless ``force`` is
        truthy. The subject is rendered as a Django template with
        ``self.variables``; the body comes from ``self.compile()``.
        """
        if not (self.enabled or force):
            return
        rendered_subject = DjangoTemplate(self.subject).render(Context(self.variables))
        subject = six.text_type(rendered_subject)
        html = self.compile()
        text = html2text.html2text(html)
        self._send(
            subject,
            text,
            settings.HAPPYMAILER_FROM,
            recipient_list=self.recipients(),
            html_message=html,
            fail_silently=False,
        )

speak.py 文件源码项目：pythonista-scripts 作者: khilnani 项目源码文件源码阅读 19 收藏 0 点赞 0 评论 0

def main():
    """
    Speak the shared/clipboard text, or the page behind a URL found in it.

    When running as an app extension the shared text and URL are used,
    otherwise the clipboard. If no URL was supplied, the first URL found in
    the text is used. A URL is downloaded and converted with html2text before
    being spoken; without a URL the text itself is spoken.

    Returns True when the URL could not be reached, otherwise None.
    """
    speech.stop()
    if not appex.is_running_extension():
        console.hud_alert('Reading clipboard')
        text = clipboard.get()
        url = None
    else:
        text = appex.get_text()
        url = appex.get_url()

    if url is None:
        try:
            url = [ mgroups[0] for mgroups in GRUBER_URLINTEXT_PAT.findall(text) ][0]
        except (IndexError, TypeError):
            # No URL in the text (IndexError) or no text at all (TypeError):
            # fall through and read the text itself.
            pass

    if url is not None:
        console.hud_alert('Reading: ' + url)
        try:
            r = requests.get(
            url=url,
            headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))})
        except requests.ConnectionError as e:
            console.alert('Unable to connect to url.')
            return True
        # r.text is already decoded text; calling .decode() on it fails for
        # non-ASCII pages (and does not exist on Python 3 strings).
        text = html2text.html2text(r.text)
    else:
        console.hud_alert('Reading text: ' + str(text))

    if text:
        speech.say(text)
        # Block until the user confirms, then silence the speech.
        console.alert('Done?', hide_cancel_button=True, button1='OK')
        speech.stop()
    else:
        console.hud_alert('No text found.')

url2md.py 文件源码项目：pythonista-scripts 作者: khilnani 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def main():
    """
    Convert the page behind a URL to markdown and copy it to the clipboard.

    As an app extension the shared URL (or the first URL in the shared text)
    is used; otherwise the first URL in the clipboard is offered in an input
    dialog, defaulting to ``http://`` when none is found. After copying, the
    user may open Evernote with a new note from the clipboard.

    Returns True when aborted (dialog cancelled, no URL, request failed),
    otherwise None.
    """
    def first_url(text):
        # First URL found in text, or None when there is none.
        matches = GRUBER_URLINTEXT_PAT.findall(text or '')
        return matches[0][0] if matches else None

    if appex.is_running_extension():
        url = appex.get_url()
        if url is None:
            url = first_url(appex.get_text())
        if url is None:
            # Previously crashed with IndexError here.
            console.hud_alert('No URL found.', 'error')
            return True
    else:
        url = first_url(clipboard.get().strip())
        if url is None or "http" not in url:
            url = "http://"
        try:
            url = console.input_alert("URL", "", url)
        except KeyboardInterrupt:
            # Dialog cancelled by the user.
            return True

    console.hud_alert('URL: %s' % url)

    try:
        r = requests.get(
            url=url,
            headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))}
        )
    except Exception as e:
        # The original raised a plain string here, which is itself a
        # TypeError; report the failure and abort instead.
        console.hud_alert('Request failed: %s' % e, 'error')
        return True

    # r.text is already decoded text; no further .decode() is needed.
    rendered_content = html2text.html2text(r.text)
    clipboard.set(rendered_content)

    launch_e = console.alert('Markdown copied to clipboard. Launch Evernote?', button1='Yes', button2='No', hide_cancel_button=True)
    if launch_e == 1:
        _eurl = "evernote://x-callback-url/new-note?type=clipboard&title=DRAFT&text="
        app=UIApplication.sharedApplication()
        eurl=nsurl(_eurl)
        app.openURL_(eurl)
    appex.finish()

gamefaqs_dl.py 文件源码项目：Pythonista_scripts 作者: wizardofozzie 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def main():
    """
    Download the printable version of a gamefaqs guide as a text file.

    The URL comes from the app extension or the clipboard; if the clipboard
    content does not match ``RE_URL`` the user is asked for one. When the URL
    matches, ``<url>?print=1`` is fetched, converted with html2text and saved
    as UTF-8 under ``~/Documents``.
    """
    import io

    if appex.is_running_extension():
        url = appex.get_url()
    else:
        url = clipboard.get().strip()
        if not RE_URL.match(url):
            try:
                url = console.input_alert("Enter gamefaqs URL", "", "https://www.gamefaqs.com/")
            except KeyboardInterrupt:
                sys.exit(0)

    newurl = "{0}?print=1".format(url)
    #baseurl = http://www.gamefaqs.com/ps3/959558-fallout-new-vegas/faqs/61226
    if RE_URL.match(url):
        r = requests.get(
                         url=newurl, 
                         headers={"User-agent": "Mozilla/5.0{0:06}".format(random.randrange(999999))}
                         )
        # r.text is already decoded text; no further .decode() is needed.
        rendered_content = html2text.html2text(r.text)
        # File name: the path segment after the platform, before "/faqs".
        filename = url.partition("gamefaqs.com/")[-1].partition("/")[-1].partition("/faqs")[0]+".txt"
        filepath = os.path.join(os.path.expanduser("~/Documents"), filename)

        # Explicit encoding so non-ASCII guide text can always be written.
        with io.open(filepath, "w", encoding="utf-8") as fo:
            fo.write(rendered_content)

        console.hud_alert('Success! Saved {0}'.format(filename), "success")

tfidf.py 文件源码项目：bookmark_analysis 作者: tarwn 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def get_site_text(url):
    """Fetch ``url`` and return its body converted by html2text.

    An HTTP error status raises via ``raise_for_status``.
    """
    response = requests.get(url)
    response.raise_for_status()
    return html2text.html2text(response.text)

# 2: Score each word for an individual page against the full set of pages

rake.py 文件源码项目：bookmark_analysis 作者: tarwn 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def get_site_text(url):
    """Fetch ``url`` and return its body converted by html2text.

    An HTTP error status raises via ``raise_for_status``.
    """
    response = requests.get(url)
    response.raise_for_status()
    return html2text.html2text(response.text)

#2: Import stopwords from an external file

models.py 文件源码项目：Kiwi 作者: kiwitcms 项目源码文件源码阅读 17 收藏 0 点赞 0 评论 0

def get_plain_text(self):
        """Return a ``PlainText`` holding the html2text-converted,
        right-stripped ``action``, ``effect``, ``setup`` and ``breakdown``."""
        def as_plain(markup):
            # HTML -> text, without trailing whitespace.
            return html2text(smart_str(markup)).rstrip()

        converted = {
            field: as_plain(getattr(self, field))
            for field in ('action', 'effect', 'setup', 'breakdown')
        }
        return PlainText(**converted)

book.py 文件源码项目：Rero 作者: voqz 项目源码文件源码阅读 30 收藏 0 点赞 0 评论 0

def process(input, entities):
    """
    Look up a book on Goodreads by title and build a reply message.

    :param input: original user input; echoed back as ``output['input']``
    :param entities: parsed entities; the title is read from
        ``entities['book'][0]['value']``
    :return: dict with ``success`` set to True plus ``input``/``output`` on
        success, or ``success`` False plus ``error_msg`` when anything fails
    """
    output = {}
    try:
        book_title = entities['book'][0]['value']

        # Responses are cached in 'book_cache' (sqlite) for 86400 seconds.
        with requests_cache.enabled('book_cache', backend='sqlite', expire_after=86400):
            # Let requests build the query string so that characters such as
            # spaces, '&' or '#' in the title are encoded instead of
            # corrupting the URL.
            response = requests.get(
                'https://www.goodreads.com/book/title.xml',
                params=[('key', GOODREADS_ACCESS_TOKEN), ('title', book_title)])
            data = ElementTree.fromstring(response.content)

        book_node = data.find('book')
        author = book_node.find('authors').find('author').find('name').text
        title = book_node.find('title').text
        description = html2text(book_node.find('description').text)
        average_rating = book_node.find('average_rating').text
        link = book_node.find('link').text
        goodreads_attribution = '- Powered by Goodreads'

        template = TextTemplate()
        template.set_text('Title: ' + title + '\nAuthor: ' + author + '\nDescription: ' + description)
        template.set_post_text('\nAverage Rating: ' + average_rating + ' / 5' + '\n' + goodreads_attribution)

        text = template.get_text()
        template = ButtonTemplate(text)
        template.add_web_url('Goodreads Link', link)

        output['input'] = input
        output['output'] = template.get_message()
        output['success'] = True
    except Exception:
        # Deliberately broad: any lookup/parsing failure becomes a friendly
        # reply. KeyboardInterrupt/SystemExit are no longer swallowed.
        error_message = 'I couldn\'t find any book matching your query.'
        error_message += '\nPlease ask me something else, like:'
        error_message += '\n  - book timeline'
        error_message += '\n  - harry potter book plot'
        error_message += '\n  - little women book rating'
        output['error_msg'] = TextTemplate(error_message).get_message()
        output['success'] = False
    return output