__init__.py 文件源码

python
阅读 116 收藏 0 点赞 0 评论 0

项目:doc2text 作者: jlsutherland 项目源码 文件源码
def read(self, path):
        """Load the document at *path* and turn it into ``Page`` objects.

        The file's name, base name, extension, directory and guessed MIME
        type are recorded on the instance. Then:

        * a PDF is split page by page; every page is rendered to a PNG at a
          density of "300" via PythonMagick, loaded with ``cv2.imread`` in
          mode 0 and appended to ``self.pages``; ``self.prepared`` is set
          once every page succeeded;
        * a file whose MIME type is in ``acceptable_mime`` is handled as a
          single-page document;
        * anything else raises ``FileNotAcceptedException``.

        Args:
            path: Location of the PDF or image file to read.

        Raises:
            FileNotAcceptedException: The guessed MIME type is neither PDF
                nor listed in ``acceptable_mime``.
            Exception: Any error raised while converting PDF pages is stored
                in ``self.error`` and re-raised unchanged.
        """
        self.filename = os.path.basename(path)
        self.file_basename, self.file_extension = os.path.splitext(self.filename)
        self.path = path
        self.mime_type = mimetypes.guess_type(path)
        self.file_basepath = os.path.dirname(path)

        # If the file is a pdf, split the pdf and prep the pages.
        if self.mime_type[0] == "application/pdf":
            # Scratch files reused for every page; kept under separate names
            # so the ``path`` argument is not shadowed.
            page_pdf_path = 'temp.pdf'
            page_png_path = 'temp.png'
            # The reader pulls pages from the open stream inside the loop, so
            # the handle has to stay open until the last page is processed;
            # ``with`` guarantees it is closed afterwards, even on error.
            with open(self.path, 'rb') as pdf_file:
                pdf_reader = pyPdf.PdfFileReader(pdf_file)
                self.num_pages = pdf_reader.numPages
                try:
                    for i in xrange(self.num_pages):
                        try:
                            output = pyPdf.PdfFileWriter()
                            output.addPage(pdf_reader.getPage(i))
                            with open(page_pdf_path, 'wb') as f:
                                output.write(f)
                            im = PythonMagick.Image()
                            im.density("300")
                            im.read(page_pdf_path)
                            im.write(page_png_path)
                            orig_im = cv2.imread(page_png_path, 0)
                            page = Page(orig_im, i, self.lang)
                            self.pages.append(page)
                        finally:
                            # Never leave scratch files behind, including
                            # when a conversion step failed midway.
                            for leftover in (page_pdf_path, page_png_path):
                                if os.path.exists(leftover):
                                    os.remove(leftover)
                    self.prepared = True
                except Exception as e:
                    self.error = e
                    raise

        # If the file is an image, think of it as a 1-page pdf.
        elif self.mime_type[0] in acceptable_mime:
            self.num_pages = 1
            temp_path = os.path.normpath(os.path.join(
                self.file_basepath, self.file_basename + '_temp.png'
            ))
            try:
                im = PythonMagick.Image()
                im.density("300")
                im.read(path)
                im.write(temp_path)
                orig_im = cv2.imread(temp_path, 0)
            finally:
                # Remove the intermediate PNG whether or not conversion worked.
                if os.path.exists(temp_path):
                    os.remove(temp_path)
            # Pass the language along exactly as the PDF branch does, so a
            # single image gets the same settings as a PDF page.
            page = Page(orig_im, 0, self.lang)
            self.pages.append(page)

        # Otherwise, out of luck.
        else:
            print(self.mime_type[0])
            raise FileNotAcceptedException
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号