python类PdfFileReader()的实例源码

downloadfiles.py 文件源码 项目:RastLeak 作者: n4xh4ck5 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info metadata and record its key producers.

    Every metadata key/value pair is printed. Values stored under
    ``/Author``, ``/Producer`` and ``/Creator`` are added, without
    duplicates, to ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. Those three lists, and ``metadata_files`` to
    which they are appended at the end, are not defined in this function and
    must exist in an enclosing scope.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # A context manager closes the handle even if parsing fails; the loop
    # stays inside it because metadata values are read through the reader.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() can return None when the PDF carries no info
        # dictionary; treat that as "no metadata" instead of crashing.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting also copes with values that are not strings.
            print(' - %s:%s' % (meta, value))
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files receives the
    # same three list objects again each time - confirm that is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
day.py 文件源码 项目:gthnk 作者: iandennismiller 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def render_pdf(self):
        """Combine the first page of every entry in ``self.pages`` into one PDF.

        Entries whose ``extension`` is ``"pdf"`` are read directly; any other
        entry is opened as an image, converted to RGB and saved as a PDF
        before its first page is taken.

        Returns:
            The bytes of the assembled PDF document.
        """
        writer = PdfFileWriter()
        for page in self.pages:
            if page.extension != "pdf":
                # Images have to be turned into a one-page PDF first.
                source = BytesIO()
                picture = Image.open(BytesIO(page.binary))
                picture.convert("RGB").save(source, format="pdf")
            else:
                # Already a PDF: read it as it is.
                source = BytesIO(page.binary)
            writer.addPage(PdfFileReader(source).getPage(0))

        assembled = BytesIO()
        writer.write(assembled)
        return assembled.getvalue()
pdf2png.py 文件源码 项目:CSE371Project 作者: muhakh 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def pdf_page_to_png(src_pdf, pagenum = 0, resolution = 72,):
    '''
    Returns specified PDF page as wand.image.Image png.
    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :returns: a new wand.image.Image holding the page in png format.
    '''
    # Copy the requested page into a one-page PDF held in memory.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() does not modify the image in place: it returns a new,
    # converted image. The original code discarded that result and returned
    # the un-converted image, so return the converted one and release the
    # intermediate PDF-backed image.
    with Image(file = pdf_bytes, resolution = resolution) as pdf_img:
        return pdf_img.convert("png")

# Example of converting exam.pdf located in the same directory
# convert('exam')   # NOTE : default resolution is 72 dpi
tilecharbox.py 文件源码 项目:handfontgen 作者: nixeneko 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def outputpapertemplate(self, dest, listchar, output=None):
        """Render template pages for ``listchar`` and write or return them.

        One page is produced per ``self.outputtemplateonepage(listchar)``
        call for as long as ``listchar`` is truthy, so that helper is
        expected to consume entries from the list.

        Args:
            dest: ``None`` to get the writer back, a ``str`` file path to
                write to (missing parent directories are created), or any
                other object, which is handed to ``output.write`` as a
                stream.
            listchar: Characters still to be placed on template pages.
            output: Writer to add pages to; a new ``PyPDF2.PdfFileWriter``
                is created when ``None``.

        Returns:
            The writer when ``dest`` is ``None``, otherwise ``None``.
        """
        # Identity checks: comparing to None with ==/!= depends on the
        # operand's __eq__ and is not the idiomatic test.
        if output is None:
            output = PyPDF2.PdfFileWriter()

        while listchar:
            iopage = self.outputtemplateonepage(listchar)
            page = PyPDF2.PdfFileReader(iopage)
            output.addPage(page.getPage(0))

        if dest is None:
            return output

        if isinstance(dest, str): # when dest is a file path
            destdir = os.path.dirname(dest)
            if destdir != '' and not os.path.isdir(destdir):
                os.makedirs(destdir)
            with open(dest, "wb") as w:
                output.write(w)
        else: # when dest is io.IOBase
            output.write(dest)
marisol.py 文件源码 项目:Marisol 作者: wikkiewikkie 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def apply(self):
        """
        Draw every requested overlay and redaction, then merge the result
        onto the page.

        Bates overlays get ``self.number`` as their text before being drawn;
        generic text overlays are drawn unchanged; other overlay types are
        skipped. The finished canvas is read back as a PDF and its first
        page is merged into ``self.page``.

        Returns:
            bool: always True
        """
        canvas = self.canvas

        for item in self.document.overlays.values():
            if isinstance(item, BatesOverlay):
                item.text = self.number
            elif not isinstance(item, GenericTextOverlay):
                # Neither of the supported overlay kinds: nothing to draw.
                continue
            item.apply(canvas)

        for item in self.redactions:
            item.apply(canvas)

        canvas.showPage()
        canvas.save()

        # Rewind the canvas buffer so it can be parsed as a PDF.
        self.canvas_file.seek(0)
        self.page.mergePage(PdfFileReader(self.canvas_file).getPage(0))
        return True
parsers.py 文件源码 项目:pentestly 作者: praetorian-inc 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def pdf_parser(s):
    """Extract the author from the metadata of an in-memory PDF.

    Args:
        s: Raw PDF content as a string; surrounding whitespace is stripped
            before parsing.

    Returns:
        dict: ``{'author': <author>}``, or an empty dict when decrypting
        with an empty password raises ``NotImplementedError`` or the
        document has no info dictionary.
    """
    s = s.strip()
    # Warnings are routed to os.devnull to suppress them. The sink must stay
    # open for as long as the reader is used: closing it right after
    # construction makes any later warning write to a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # getDocumentInfo() can return None when there is no metadata.
        if meta is None:
            return {}
        return {
            'author': meta.author,
        }
test_pdf.py 文件源码 项目:callisto-core 作者: project-callisto 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def test_report_pdf(self):
        """The generated report PDF contains the reporter line and answers."""
        self.client_post_report_creation()
        rendered = report_delivery.report_as_pdf(
            report=self.report,
            data=mock_report_data,
            recipient=None,
        )
        reader = PyPDF2.PdfFileReader(BytesIO(rendered))

        # Each snippet must appear in the text of the first page.
        expected_snippets = (
            "Reported by: testing_12",
            'food options',
            'vegetables',
            'apples: red',
            'eat it now???',
        )
        for snippet in expected_snippets:
            self.assertIn(snippet, reader.getPage(0).extractText())
report_xml.py 文件源码 项目:odoo-report 作者: vertelab 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def create(self, cr, uid, ids, data, context=None):
        """Render the selected records through Scribus and merge the PDFs.

        Each record read from ``self.model`` is rendered to an SLA document,
        converted to PDF by an external ``scribus-ng`` run under
        ``xvfb-run``, and appended to a single merged PDF.

        Args:
            cr: Database cursor; ``cr.dbname`` selects the registry.
            uid: Id of the user performing the read.
            ids: Ids of the records to render.
            data: Report data; an optional ``'template'`` entry overrides
                ``self.template``.
            context: Not used by this method.

        Returns:
            tuple: ``(content, 'sla')`` with the rendered SLA document when
            ``self.report_type`` is ``'scribus_sla'`` (only the first record
            is rendered), otherwise ``(pdf_bytes, 'pdf')``.

        Raises:
            MissingError: If Scribus produced no output, or an empty file,
                for a record.
        """
        pool = registry(cr.dbname)
        merger = PdfFileMerger()
        outfiles = []
        # Handles of the per-record PDFs. They stay open until the merged
        # document has been written and are then closed explicitly; the
        # original file() handles were never closed.
        pdf_handles = []
        try:
            for p in pool.get(self.model).read(cr,uid,ids):
                outfiles.append(self.newfilename())
                sla = self.render(cr,uid,p,data.get('template') or self.template)
                if self.report_type == 'scribus_sla':
                    # The raw SLA is the result; the reserved PDF is unused.
                    os.unlink(outfiles[-1])
                    return (sla.read(),'sla')
                # NOTE(review): the command is built by string interpolation
                # and run through a shell; paths containing spaces or shell
                # metacharacters would break it. Consider subprocess with an
                # argument list.
                command = "xvfb-run -a scribus-ng -ns -g %s -py %s -pa -o %s" % (sla.name,os.path.join(get_module_path('report_scribus'), 'scribus.py'),outfiles[-1])
                _logger.info(command)
                os.system(command)
                sla.close()
                if not os.path.exists(outfiles[-1]) or os.stat(outfiles[-1]).st_size == 0:
                    raise MissingError('There are something wrong with the template or scribus installation')
                pdf_handle = open(outfiles[-1], 'rb')
                pdf_handles.append(pdf_handle)
                merger.append(PdfFileReader(pdf_handle))
            outfile = tempfile.NamedTemporaryFile(mode='w+b',suffix='.pdf')
            merger.write(outfile.name)
        finally:
            for pdf_handle in pdf_handles:
                pdf_handle.close()
        for filename in outfiles:
            os.unlink(filename)
        # The merged PDF was written via the file name, so rewind and read
        # it back through the temporary file object.
        outfile.seek(0)
        pdf = outfile.read()
        outfile.close()
        return (pdf,'pdf')
pipes.py 文件源码 项目:HackathonOAB 作者: Marlysson 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def run(self):
        """Extract the text of every page and hand it to the next pipe.

        The PDF at ``self.caminho_arquivo`` is read page by page. Each page's
        text is UTF-8 encoded, the pieces are joined with single spaces and
        passed, together with the file path, to a new ``GravarConvertido``
        instance, which is stored as ``self.next_pipe`` and run.
        """
        conteudos = []

        # Close the input file once all pages are read; the original left
        # the handle open.
        with open(self.caminho_arquivo, 'rb') as objeto_pdf:
            reader = PyPDF2.PdfFileReader(objeto_pdf)

            for num in range(reader.numPages):
                texto = reader.getPage(num).extractText()
                conteudos.append(texto.encode("utf-8"))

        # NOTE(review): joining encoded values with a str separator only
        # works where encode() returns str (Python 2) - confirm the target
        # interpreter.
        # The instance is built via object.__new__ plus an explicit __init__
        # call, as in the original; presumably this bypasses a custom
        # __new__ on GravarConvertido - verify before simplifying.
        instancia = object.__new__(GravarConvertido)
        instancia.__init__(self.caminho_arquivo," ".join(conteudos))

        self.next_pipe = instancia
        self.next_pipe.run()
pdf.py 文件源码 项目:refextract 作者: inspirehep 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def _destinations_in_two_columns(pdf, destinations, cutoff=3):
    """
    Heuristically decide whether the named destinations form two columns.

    Only destinations whose ``_destination_position`` result has ``1`` at
    index 1 are considered (the would-be second column); the value at index
    3 of that result is used as the destination's x position.

    @param pdf: a PdfFileReader object
    @param destinations: iterable of pairs whose second item is the
        destination
    @param cutoff: number of considered destinations that must share an x
        position for the function to return True
    @return: True as soon as 'cutoff' destinations share an x position,
        False otherwise
    """
    occurrences = {}
    for _, dest in destinations:
        if _destination_position(pdf, dest)[1] != 1:
            continue
        xpos = _destination_position(pdf, dest)[3]
        occurrences[xpos] = occurrences.get(xpos, 0) + 1
        if occurrences[xpos] >= cutoff:
            return True
    return False
pdf.py 文件源码 项目:osp-scraper 作者: opensyllabus 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def extract_links(self, response):
        """Yield the URI of every link annotation in the response's PDF.

        The body of ``response`` is parsed as a PDF. For each annotation on
        each page that has an ``/A`` entry containing ``/URI``, a
        ``(uri, uri)`` pair is yielded. Byte-string URIs are decoded as
        UTF-8 with NUL characters removed.
        """
        reader = pyPdf.PdfFileReader(BytesIO(response.body))
        page_count = reader.getNumPages()

        for index in range(page_count):
            for annotation in reader.getPage(index).get('/Annots', []):
                action = annotation.getObject().get('/A')
                if not action or '/URI' not in action:
                    continue

                uri = action['/URI']
                if isinstance(uri, pyPdf.generic.ByteStringObject):
                    uri = uri.decode("utf-8").replace("\x00", "")
                yield (uri, uri)
pdf.py 文件源码 项目:ingestors 作者: alephdata 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def extract_metadata(self, file_path):
        """Copy document-info and XMP metadata from a PDF into the result.

        Title, author, creator and producer from the document info are
        passed to ``self.update``; a non-empty subject is added to the
        result's keywords. When XMP metadata exists, its document id,
        titles (with their languages), producer, creation and modification
        dates and languages are recorded as well.
        """
        with open(file_path, 'rb') as handle:
            reader = PdfFileReader(handle, strict=False)

            info = reader.getDocumentInfo()
            if info is not None:
                self.update('title', info.title)
                self.update('author', info.author)
                # Both creator and producer feed the 'generator' field.
                self.update('generator', info.creator)
                self.update('generator', info.producer)
                subject = info.subject
                if subject:
                    self.result.keywords.append(subject)

            xmp_info = reader.getXmpMetadata()
            if xmp_info is not None:
                self.update('id', xmp_info.xmpmm_documentId)
                # dc_title maps a language to the title in that language.
                for language, text in xmp_info.dc_title.items():
                    self.update('title', text)
                    self.result.languages.append(language)
                self.update('generator', xmp_info.pdf_producer)
                self.update('created_at', xmp_info.xmp_createDate)
                self.update('modified_at', xmp_info.xmp_modifyDate)
                self.result.languages.extend(xmp_info.dc_language)

        # from pprint import pprint
        # pprint(self.result.to_dict())
satData.py 文件源码 项目:FA-IR_Ranking 作者: MilkaLichtblau 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def __loadSATPDF(self, filename):
        """
        Loads the SAT PDF file, drops everything outside the score table and
        collects the table entries of all pages.

        Parameters
        ----------
        filename : path of the SAT score PDF

        Return
        ------
        All numbers from the SAT table in a string array
        """
        print("loading SAT score pdf")

        # Markers handed to __getTableContent together with each page's
        # text. They must match the extracted text exactly, including the
        # garbled character in the footer.
        tableHeader = "Total \nMale Female \nScore \nNumber Percentile Number Percentile Number Percentile "
        tableFooter = "De˜nitions of statistical terms are provided online at research."
        tableContents = []

        # Close the file once every page has been processed; the original
        # never closed the handle.
        with open(filename, "rb") as pdfFile:
            pdf = pypdf.PdfFileReader(pdfFile)

            for page in pdf.pages:
                content = page.extractText()
                tableContents += self.__getTableContent(content, tableHeader, tableFooter)
                # NOTE(review): the original condition was
                # `"Number" and "Mean" and "S.D." in tableContents`; the two
                # bare literals are always truthy, so only "S.D." was ever
                # tested. That effective behavior is kept here - confirm
                # whether all three tokens were meant to be required.
                if "S.D." in tableContents:
                    tableContents = tableContents[:tableContents.index("S.D.") - 2]

        return tableContents
merged_pdf.py 文件源码 项目:python_for_linux_system_administration 作者: lalor 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def main():
    """Merge the PDFs found under ``~lmx/`` into ``merge-pdfs.pdf``.

    The first PDF is appended in full. Every following PDF is appended with
    the page range ``(1, <number of pages>)``. Exits with a message when no
    PDF is found.
    """
    pdf_paths = get_all_pdf_files(os.path.expanduser('~lmx/'))
    if not pdf_paths:
        raise SystemExit('No pdf file found!')

    merger = PyPDF2.PdfFileMerger()

    for position, path in enumerate(pdf_paths):
        with open(path, 'rb') as handle:
            if position == 0:
                # The first document is taken as it is.
                merger.append(handle)
            else:
                page_total = PyPDF2.PdfFileReader(handle).getNumPages()
                merger.append(fileobj=handle, pages=(1, page_total))

    with open('merge-pdfs.pdf', 'wb') as target:
        merger.write(target)
RastLeak_1_2.py 文件源码 项目:RastLeak 作者: n4xh4ck5 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info metadata and record its key producers.

    Every metadata key/value pair is printed. Values stored under
    ``/Author``, ``/Producer`` and ``/Creator`` are added, without
    duplicates, to ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. Those three lists, and ``metadata_files`` to
    which they are appended at the end, are not defined in this function and
    must exist in an enclosing scope.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # A context manager closes the handle even if parsing fails; the loop
    # stays inside it because metadata values are read through the reader.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() can return None when the PDF carries no info
        # dictionary; treat that as "no metadata" instead of crashing.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting also copes with values that are not strings.
            print(' - %s:%s' % (meta, value))
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files receives the
    # same three list objects again each time - confirm that is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
    #print metadata_files
####### FUNCTION AnalyzeMetadata doc ######
RastLeak_1_3.py 文件源码 项目:RastLeak 作者: n4xh4ck5 项目源码 文件源码 阅读 35 收藏 0 点赞 0 评论 0
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info metadata and record its key producers.

    Every metadata key/value pair is printed. Values stored under
    ``/Author``, ``/Producer`` and ``/Creator`` are added, without
    duplicates, to ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. Those three lists, and ``metadata_files`` to
    which they are appended at the end, are not defined in this function and
    must exist in an enclosing scope.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # A context manager closes the handle even if parsing fails; the loop
    # stays inside it because metadata values are read through the reader.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() can return None when the PDF carries no info
        # dictionary; treat that as "no metadata" instead of crashing.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting also copes with values that are not strings.
            print(' - %s:%s' % (meta, value))
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files receives the
    # same three list objects again each time - confirm that is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
    #print metadata_files
####### FUNCTION AnalyzeMetadata doc ######
rastleak_2_0.py 文件源码 项目:RastLeak 作者: n4xh4ck5 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info metadata and record its key producers.

    Every metadata key/value pair is printed. Values stored under
    ``/Author``, ``/Producer`` and ``/Creator`` are added, without
    duplicates, to ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. Those three lists, and ``metadata_files`` to
    which they are appended at the end, are not defined in this function and
    must exist in an enclosing scope.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # A context manager closes the handle even if parsing fails; the loop
    # stays inside it because metadata values are read through the reader.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() can return None when the PDF carries no info
        # dictionary; treat that as "no metadata" instead of crashing.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting also copes with values that are not strings.
            print(' - %s:%s' % (meta, value))
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files receives the
    # same three list objects again each time - confirm that is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
downloadfiles.py 文件源码 项目:RastLeak 作者: n4xh4ck5 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info metadata and record its key producers.

    Every metadata key/value pair is printed. Values stored under
    ``/Author``, ``/Producer`` and ``/Creator`` are added, without
    duplicates, to ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. Those three lists, and ``metadata_files`` to
    which they are appended at the end, are not defined in this function and
    must exist in an enclosing scope.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # A context manager closes the handle even if parsing fails; the loop
    # stays inside it because metadata values are read through the reader.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() can return None when the PDF carries no info
        # dictionary; treat that as "no metadata" instead of crashing.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting also copes with values that are not strings.
            print(' - %s:%s' % (meta, value))
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files receives the
    # same three list objects again each time - confirm that is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
RastLeak_1_2.py 文件源码 项目:RastLeak 作者: n4xh4ck5 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info metadata and record its key producers.

    Every metadata key/value pair is printed. Values stored under
    ``/Author``, ``/Producer`` and ``/Creator`` are added, without
    duplicates, to ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. Those three lists, and ``metadata_files`` to
    which they are appended at the end, are not defined in this function and
    must exist in an enclosing scope.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # A context manager closes the handle even if parsing fails; the loop
    # stays inside it because metadata values are read through the reader.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() can return None when the PDF carries no info
        # dictionary; treat that as "no metadata" instead of crashing.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting also copes with values that are not strings.
            print(' - %s:%s' % (meta, value))
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files receives the
    # same three list objects again each time - confirm that is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)
    #print metadata_files
####### FUNCTION AnalyzeMetadata doc ######
rastleak_1_4.py 文件源码 项目:RastLeak 作者: n4xh4ck5 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info metadata and record its key producers.

    Every metadata key/value pair is printed. Values stored under
    ``/Author``, ``/Producer`` and ``/Creator`` are added, without
    duplicates, to ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. Those three lists, and ``metadata_files`` to
    which they are appended at the end, are not defined in this function and
    must exist in an enclosing scope.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # A context manager closes the handle even if parsing fails; the loop
    # stays inside it because metadata values are read through the reader.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() can return None when the PDF carries no info
        # dictionary; treat that as "no metadata" instead of crashing.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting also copes with values that are not strings.
            print(' - %s:%s' % (meta, value))
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files receives the
    # same three list objects again each time - confirm that is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
rastleak_2_0.py 文件源码 项目:RastLeak 作者: n4xh4ck5 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def Analyze_Metadata_pdf(filename):
    """Print a PDF's document-info metadata and record its key producers.

    Every metadata key/value pair is printed. Values stored under
    ``/Author``, ``/Producer`` and ``/Creator`` are added, without
    duplicates, to ``meta_author_array``, ``meta_producer_array`` and
    ``meta_creator_array``. Those three lists, and ``metadata_files`` to
    which they are appended at the end, are not defined in this function and
    must exist in an enclosing scope.

    Args:
        filename: Path of the PDF file to inspect.
    """
    # A context manager closes the handle even if parsing fails; the loop
    # stays inside it because metadata values are read through the reader.
    with open(filename, 'rb') as pdf_stream:
        # getDocumentInfo() can return None when the PDF carries no info
        # dictionary; treat that as "no metadata" instead of crashing.
        metadata = PdfFileReader(pdf_stream).getDocumentInfo() or {}
        print(' - Document: ' + str(filename))
        for meta in metadata:
            value = metadata[meta]
            # %-formatting also copes with values that are not strings.
            print(' - %s:%s' % (meta, value))
            if meta == "/Author":
                if value not in meta_author_array:
                    meta_author_array.append(value)
            elif meta == "/Producer":
                if value not in meta_producer_array:
                    meta_producer_array.append(value)
            elif meta == "/Creator":
                if value not in meta_creator_array:
                    meta_creator_array.append(value)
    # Group the different arrays in one with all metadata.
    # NOTE(review): this runs on every call, so metadata_files receives the
    # same three list objects again each time - confirm that is intended.
    metadata_files.append(meta_author_array)
    metadata_files.append(meta_producer_array)
    metadata_files.append(meta_creator_array)

####### FUNCTION AnalyzeMetadata doc ######
cli_pdf_to_ppt.py 文件源码 项目:PDF-to-PPT 作者: vijayanandrp 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def pdf_splitter(self):
        """Split ``self.pdf_file`` into one-page PDFs and convert each one.

        The page count is stored in ``self.total_pages``. Page ``n``
        (1-based) is written next to the input as ``<name>_<n>.pdf`` and
        then passed to ``self.pdf_to_image``.
        """
        # Local import so the block does not rely on a module-level import.
        import os

        self.log.info('Called pdf_splitter')

        # Derive output names from the path without its extension.
        # str.replace('.pdf', ...) rewrote every '.pdf' occurrence in the
        # path and, when the extension was not exactly '.pdf', left the name
        # unchanged so the output overwrote the input.
        base_name = os.path.splitext(self.pdf_file)[0]

        # The input stays open while pages are read; both input and outputs
        # are closed even if writing or conversion fails.
        with open(self.pdf_file, 'rb') as source_stream:
            input_pdf = PdfFileReader(source_stream)
            self.total_pages = input_pdf.numPages

            for page_number in range(self.total_pages):
                output = PdfFileWriter()
                output.addPage(input_pdf.getPage(page_number))
                # new filename
                new_pdf = '%s_%s.pdf' % (base_name, page_number + 1)
                with open(new_pdf, 'wb') as file_stream:
                    output.write(file_stream)

                # calling pdf to image conversion
                self.pdf_to_image(new_pdf)
parsers.py 文件源码 项目:recon-ng 作者: Hehe-Zhc 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def pdf_parser(s):
    """Extract the document-info metadata of an in-memory PDF.

    Args:
        s: Raw PDF content as a string; surrounding whitespace is stripped
            before parsing.

    Returns:
        dict: Metadata values keyed by field name with the first character
        of each key (the leading ``/``) removed. Empty when decrypting with
        an empty password raises ``NotImplementedError`` or the document has
        no info dictionary.
    """
    s = s.strip()
    # Warnings are routed to os.devnull to suppress them. The sink must stay
    # open for as long as the reader is used: closing it right after
    # construction makes any later warning write to a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # getDocumentInfo() can return None when there is no metadata.
        if meta is None:
            return {}
        result = {}
        for key in meta.keys():
            result[key[1:]] = meta.get(key)
        return result
image.py 文件源码 项目:knowledge-repo 作者: airbnb 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def pdf_page_to_png(src_pdf, pagenum=0, resolution=154):
    """
    Returns specified PDF page as wand.image.Image png.
    :param PyPDF2.PdfFileReader src_pdf: PDF from which to take pages.
    :param int pagenum: Page number to take.
    :param int resolution: Resolution for resulting png in DPI.
    :returns: a new wand.image.Image holding the page in png format.
    """

    check_dependencies(__optional_dependencies__['pdf'])
    # Import libraries within this function so as to avoid import-time dependence
    import PyPDF2
    from wand.image import Image  # TODO: When we start using this again, document which system-level libraries are required.

    # Copy the requested page into a one-page PDF held in memory.
    dst_pdf = PyPDF2.PdfFileWriter()
    dst_pdf.addPage(src_pdf.getPage(pagenum))

    pdf_bytes = io.BytesIO()
    dst_pdf.write(pdf_bytes)
    pdf_bytes.seek(0)

    # Image.convert() does not modify the image in place: it returns a new,
    # converted image. The original code discarded that result and returned
    # the un-converted image, so return the converted one and release the
    # intermediate PDF-backed image.
    with Image(file=pdf_bytes, resolution=resolution) as pdf_img:
        return pdf_img.convert("png")
multipage2book.py 文件源码 项目:multipage_to_book_batch_converter 作者: uml-digitalinitiatives 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def count_pages(input_file):
    """Return the number of pages in a file.

    For paths matched by ``is_pdf`` the pages are counted by applying
    ``rxcountpages`` to the raw file content; when that finds nothing,
    PyPDF2 is asked for the page count instead. For every other file the
    last line of the ``identify`` command's output is used.

    Keyword arguments
    input_file -- the full path to the input file
    """
    if not is_pdf.match(input_file):
        identify_cmd = ['identify', '-ping', '-format', "%n\\n", input_file]
        output = do_system_call(identify_cmd, return_result=True)
        # The last line of the output is taken as the page count.
        return int(output.rstrip().split('\n')[-1])

    with open(input_file, 'rb') as handle:
        count = len(rxcountpages.findall(handle.read()))
    if count == 0:
        # The pattern found nothing: fall back to parsing the document.
        count = PyPDF2.PdfFileReader(input_file).getNumPages()
    return count
parsers.py 文件源码 项目:recon-ng 作者: captainhooligan 项目源码 文件源码 阅读 39 收藏 0 点赞 0 评论 0
def pdf_parser(s):
    """Extract the document-info metadata of an in-memory PDF.

    Args:
        s: Raw PDF content as a string; surrounding whitespace is stripped
            before parsing.

    Returns:
        dict: Metadata values keyed by field name with the first character
        of each key (the leading ``/``) removed. Empty when decrypting with
        an empty password raises ``NotImplementedError`` or the document has
        no info dictionary.
    """
    s = s.strip()
    # Warnings are routed to os.devnull to suppress them. The sink must stay
    # open for as long as the reader is used: closing it right after
    # construction makes any later warning write to a closed file.
    with open(os.devnull, 'w') as fp:
        pdf = PdfFileReader(StringIO(s), strict=False, warndest=fp)
        if pdf.isEncrypted:
            try:
                pdf.decrypt('')
            except NotImplementedError:
                return {}
        meta = pdf.getDocumentInfo()
        # getDocumentInfo() can return None when there is no metadata.
        if meta is None:
            return {}
        result = {}
        for key in meta.keys():
            result[key[1:]] = meta.get(key)
        return result
utils.py 文件源码 项目:open-syllabus-project 作者: davidmcclure 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def pdf_date(path):

    """
    Extract a date from PDF file metadata.

    The first run of digits in the ``/CreationDate`` entry is parsed as
    ``YYYYmmddHHMMSS``, which skips the ``D:`` prefix and drops any
    timezone suffix.

    Args:
        path (str): The file path.

    Returns:
        datetime: The created date.

    Raises:
        KeyError: If the metadata has no ``/CreationDate`` entry.
        ValueError: If the digits do not form a ``YYYYmmddHHMMSS`` stamp.
    """

    reader = PdfFileReader(path)

    # Get rid of `D:` prefix and timezone.
    stamp = reader.documentInfo['/CreationDate']
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    match = re.search(r'\d+', stamp)

    return datetime.strptime(
        match.group(),
        '%Y%m%d%H%M%S'
    )
article_authors.py 文件源码 项目:repeat-aft 作者: ripeta 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def extract(text, paper=None, logger=logger):
    """Read the author from the paper's PDF metadata.

    Args:
        text: Not used by this function.
        paper: Object whose ``_read_document()`` result is handed to
            ``PdfFileReader``.
        logger: Not used by this function.

    Returns:
        tuple: ``(author, author, "extracted", "pdf metadata")`` when the
        metadata has a non-empty author, otherwise ``None``. ``None`` is
        also returned when reading the document or its metadata fails.
    """

    # try using pypdf2/pdfminer
    try:
        pdf = paper._read_document()
        pdfReader = PdfFileReader(pdf)
        author = pdfReader.getDocumentInfo().author
    except Exception:
        # Best effort: any read/parse failure means "no author found".
        # Catching Exception instead of a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        # search for author or return None
        # Though currently there is no search function
        return None

    if not author:
        return None
    return (author, author, "extracted", "pdf metadata")
article_title.py 文件源码 项目:repeat-aft 作者: ripeta 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def extract(text, paper=None, logger=logger):
    """Read the title from the paper's PDF metadata.

    Args:
        text: Not used by this function.
        paper: Object whose ``_read_document()`` result is handed to
            ``PdfFileReader``.
        logger: Not used by this function.

    Returns:
        tuple: ``(title, title, "extracted", "pdf metadata")`` when the
        metadata has a non-empty title, otherwise ``None``. ``None`` is
        also returned when reading the document or its metadata fails.
    """

    # try using pypdf2/pdfminer
    try:
        pdf = paper._read_document()
        pdfReader = PdfFileReader(pdf)
        title = pdfReader.getDocumentInfo().title
    except Exception:
        # Best effort: any read/parse failure means "no title found".
        # Catching Exception instead of a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        # search for title or return None
        # Though currently there is no search function
        return None

    if not title:
        return None
    return (title, title, "extracted", "pdf metadata")
test_.py 文件源码 项目:ilovepdf 作者: sdelquin 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def test_split():
    """Splitting test.pdf by ranges 1-2 and 5-8 yields two PDFs of 2 and 4 pages."""
    i = ILovePdf(config.PUBLIC_KEY, config.SECRET_KEY)
    i.new_task("split")
    i.add_file("test.pdf")
    i.execute(ranges="1-2,5-8")
    i.download()
    try:
        # Context managers close the archive and both PDFs even when an
        # assertion fails; the original left the PDF handles open.
        with zipfile.ZipFile("out.zip", "r") as zip_ref:
            zip_ref.extractall("test_split")
        assert len(glob.glob("test_split/*.pdf")) == 2
        with open("test_split/test-1-2.pdf", "rb") as first, \
                open("test_split/test-5-8.pdf", "rb") as second:
            output_file1 = PdfFileReader(first)
            output_file2 = PdfFileReader(second)
            assert output_file1.getNumPages() == 2
            assert output_file2.getNumPages() == 4
    finally:
        # Always clean up so a failed run does not leave artefacts behind
        # for the next one.
        if os.path.exists("out.zip"):
            os.remove("out.zip")
        shutil.rmtree("test_split", ignore_errors=True)


问题


面经


文章

微信
公众号

扫码关注公众号