pdf.py 文件源码

python
阅读 29 收藏 0 点赞 0 评论 0

项目:ingestors 作者: alephdata 项目源码 文件源码
def extract_metadata(self, file_path):
    """Collect metadata from the PDF stored at ``file_path``.

    The file is opened in binary mode and parsed non-strictly with
    ``PdfFileReader``. Values from the document-info dictionary and
    from the XMP packet are forwarded to ``self.update``; the subject
    is appended to ``self.result.keywords`` and language values are
    added to ``self.result.languages``. Either source is skipped when
    the reader returns ``None`` for it.
    """
    # (target field, attribute on the document-info object), in call
    # order. 'generator' is offered the creator first and then the
    # producer; how the two are reconciled is decided by self.update.
    info_fields = (
        ('title', 'title'),
        ('author', 'author'),
        ('generator', 'creator'),
        ('generator', 'producer'),
    )
    # (target field, attribute on the XMP object), applied after the
    # per-language titles have been handled.
    xmp_fields = (
        ('generator', 'pdf_producer'),
        ('created_at', 'xmp_createDate'),
        ('modified_at', 'xmp_modifyDate'),
    )

    with open(file_path, 'rb') as stream:
        reader = PdfFileReader(stream, strict=False)

        info = reader.getDocumentInfo()
        if info is not None:
            for field, attr in info_fields:
                self.update(field, getattr(info, attr))
            if info.subject:
                self.result.keywords.append(info.subject)

        packet = reader.getXmpMetadata()
        if packet is not None:
            self.update('id', packet.xmpmm_documentId)
            # Each key of dc_title is recorded as a language, and each
            # value is offered as a title.
            for language, title in packet.dc_title.items():
                self.update('title', title)
                self.result.languages.append(language)
            for field, attr in xmp_fields:
                self.update(field, getattr(packet, attr))
            self.result.languages.extend(packet.dc_language)

    # Debugging aid (disabled):
    # from pprint import pprint; pprint(self.result.to_dict())
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号