# scrape_interface.py - Python source code snippet

def converthtml2text(html):
    """Convert an HTML string into flat, compact plain text.

    The HTML is rendered with ``html2text.html2text`` and the result is then
    cleaned up in three passes:

    1. The literal tokens ``"|"``, ``"**"``, ``"# "`` and ``"* * *"`` are
       removed wherever they occur (they are treated as leftover markup
       produced by html2text).
    2. Runs of spaces are collapsed to one space, and a single space that
       directly follows or precedes a newline is dropped.
    3. Runs of newlines are collapsed to one newline, so no empty lines
       remain.

    Side effect: the module-level attributes ``BODY_WIDTH``,
    ``IGNORE_ANCHORS`` and ``IGNORE_IMAGES`` of ``html2text`` are overwritten
    on every call.

    :param html: HTML markup to convert.
    :return: the cleaned-up text.
    """
    # Converter switches are set on the html2text module itself.
    # NOTE(review): presumably these disable line wrapping and drop links
    # and images from the output - confirm against the installed html2text.
    html2text.IGNORE_IMAGES = True
    html2text.IGNORE_ANCHORS = True
    html2text.BODY_WIDTH = 0

    text = html2text.html2text(html)

    # Strip markup tokens; the order matters because "**" must be removed
    # before the spaced-out "* * *" rule token is looked for.
    for marker in ("|", "**", "# ", "* * *"):
        text = text.replace(marker, "")

    # Collapse any run of spaces down to a single space.
    while "  " in text:
        text = text.replace("  ", " ")

    # Trim the one space that may remain at the start or end of a line.
    text = text.replace("\n ", "\n").replace(" \n", "\n")

    # Collapse runs of newlines, which removes empty lines.
    while "\n\n" in text:
        text = text.replace("\n\n", "\n")

    return text


# Sends a PDF file to OCR.
# The resulting text file is stored in the folder targetpath.
# The path of the output txt file is returned (as the 2nd element of the tuple below).
# Uses an ABBYY FineReader Hot Folder.
# If the text file already exists (the PDF was OCR'd previously), it is not OCR'd again.
# Can be replaced with another OCR method if necessary.
# Returns a tuple:
# 1st element - operation code (ERROR, CREATED, EXISTS)
# 2nd element - error message or OCR file path