def converthtml2text(html):
    """Convert an HTML string to flat, compact plain text.

    The HTML is rendered with ``html2text.html2text`` and the result is then
    cleaned up: selected markup tokens are stripped, runs of spaces are
    collapsed to a single space, spaces adjacent to a newline are dropped,
    and consecutive newlines are collapsed so no empty lines remain.

    Args:
        html: HTML markup, passed unchanged to ``html2text.html2text``.

    Returns:
        The cleaned-up text produced from ``html``.
    """
    # html2text is configured through module-level settings, so these
    # assignments stay in effect for any later use of the module.
    # NOTE(review): presumably 0 disables line wrapping and the two flags
    # suppress links and images - confirm against the installed html2text.
    html2text.BODY_WIDTH = 0
    html2text.IGNORE_ANCHORS = True
    html2text.IGNORE_IMAGES = True
    outstr = html2text.html2text(html)

    # Strip the markup tokens html2text leaves in its output (per the
    # original author: bold/italic markers, header prefixes and horizontal
    # rules). The order is significant: "**" must go before "* * *".
    # NOTE: these are plain substring removals, so the same character
    # sequences are also removed from ordinary text (e.g. "C# is").
    for markup in ("|", "**", "# ", "* * *"):
        outstr = outstr.replace(markup, "")

    # Collapse runs of spaces. One replace() pass only halves a run, so
    # repeat until no double space is left.
    while "  " in outstr:
        outstr = outstr.replace("  ", " ")

    # After collapsing, at most one space can touch a newline on each side.
    outstr = outstr.replace("\n ", "\n")
    outstr = outstr.replace(" \n", "\n")

    # Remove empty lines by collapsing consecutive newlines.
    while "\n\n" in outstr:
        outstr = outstr.replace("\n\n", "\n")

    return outstr
# Sends a PDF file to OCR.
# The resulting text file is stored in the folder targetpath.
# Returns the path of the output txt file (see the tuple description below).
# Uses the ABBYY FineReader Hot Folder.
# If the text file already exists (from a previous OCR run), the PDF is not OCRed again.
# Can be replaced with another OCR method if necessary.
# Returns a tuple:
# 1st element - operation code (ERROR, CREATED, EXISTS)
# 2nd element - error message or OCR file path
评论列表
文章目录