python类PdfFileReader()的实例源码

pdf.py 文件源码 项目:pdf-server 作者: nathanielove 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def get_pages(pdf_filename, from_, to):
    """Return a new PDF, serialized in memory, with a page range of a file.

    Page numbers are 1-based and the range is inclusive on both ends.  When
    ``to`` is smaller than ``from_`` it is raised to ``from_``, so exactly one
    page is extracted.

    :param pdf_filename: path of the source PDF document.
    :param from_: number of the first page to copy (1-based).
    :param to: number of the last page to copy (1-based, inclusive).
    :return: the value of the in-memory stream the new PDF was written to.
    """
    if to < from_:
        to = from_

    # The context manager closes the source file on every path, including
    # when a page lookup or the write raises.  As in the original ordering,
    # the file stays open until the output has been written.
    with open(pdf_filename, 'rb') as file:
        pdf = PdfFileReader(file)
        output = PdfFileWriter()

        # from_ is 1-based, getPage() is 0-based.
        for i in range(from_ - 1, to):
            output.addPage(pdf.getPage(i))

        stream = BytesIO()
        output.write(stream)
        return stream.getvalue()
metadata_extractor.py 文件源码 项目:Forensic-Tools 作者: MonroCoury 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def pdfMetaData(file_path, save=True):
    '''Get PDF document metadata, takes 2 arguments, file_path and save (boolean, default is True)

    Prints the document-info entries of the PDF followed by file-system
    timestamps and the owner user id taken from ``os.stat``.  When ``save``
    is true the same text is handed to ``saveResult`` under
    ``<file name>.txt``.

    The process is terminated through ``sys.exit`` when the document is
    encrypted and cannot be decrypted with an empty password, or when the
    document info cannot be iterated.
    '''
    # The context manager closes the handle on every path, including the
    # sys.exit() calls below (SystemExit unwinds through the with block).
    with open(file_path, "rb") as pdf_file:
        pdf_doc = PdfFileReader(pdf_file)

        if pdf_doc.isEncrypted:
            # Only the decrypt() call is guarded.  The original bare except
            # also caught the SystemExit raised by the sys.exit() inside the
            # try body and replaced its message with the "unsupported
            # algorithm" one.
            try:
                decrypt_result = pdf_doc.decrypt("")
            except Exception:
                sys.exit("target pdf document is encrypted with an unsupported algorithm... exiting...")
            if decrypt_result != 1:
                sys.exit("target pdf document is encrypted... exiting...")

        doc_info = pdf_doc.getDocumentInfo()
        stats = os.stat(file_path)
        now = dt.now()
        file_name = getFileName(file_path)
        metadata = "Time: %d/%d/%d %d : %d : %d. Found the following metadata for file %s:\n\n" % (
            now.year, now.month, now.day, now.hour, now.minute, now.second, file_name[:-4])
        try:
            for md in doc_info:
                # Keys are used without their first character.
                key = str(md[1:])
                metadata += key + " : " + pretifyPyPDF2Time(key, str(doc_info[md])) + "\n"
        except TypeError:
            sys.exit("Couldn't read document info! Make sure target is a valid pdf document...")

    metadata += "Last metadata mod Date: %s\nLast Mod Date: %s\nLast Access Date: %s\nOwner User ID: %s" % (
        dt.fromtimestamp(stats.st_ctime),
        dt.fromtimestamp(stats.st_mtime),
        dt.fromtimestamp(stats.st_atime),
        stats.st_uid)
    try:
        print(metadata)
    except UnicodeEncodeError:
        print("Console encoding can't decode the result. Enter chcp 65001 in the console and rerun the script.")

    if save:
        file_name = getFileName(file_path)
        tgt = file_name + ".txt"

        saveResult(tgt, metadata)
validators.py 文件源码 项目:Plamber 作者: OlegKlimenko 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def validate_pdf(value):
    """
    Check that an uploaded file can be opened as a PDF document.

    The whole content of ``value`` is read into memory and handed to
    ``PyPDF2.PdfFileReader``.

    :param value: The file object.
    :raises ValidationError: when PyPDF2 reports a ``PdfReadError``.
    """
    content = value.read()
    buffer = io.BytesIO(content)
    try:
        PyPDF2.PdfFileReader(buffer)
    except PyPDF2.utils.PdfReadError:
        raise ValidationError('Tried to upload not PDF as a book!')
pdf.py 文件源码 项目:document_clipper 作者: reclamador 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def slice(self, pdf_file_path, page_actions, final_pdf_path):
        """
        Create new pdf from a slice of pages of a PDF
        :param pdf_file_path: path of the source PDF document, from which a new PDF file will be created.
        :param page_actions: list of tuples, each tuple containing the page number and the rotation to
        be applied. The page number is non-zero indexed (first is page 1, and so on). A falsy rotation leaves
        the page untouched.
        :param final_pdf_path: path handed to ``self._write_to_pdf`` for the resulting document.
        :return: None. Writes the resulting PDF file into the provided path.
        :raises Exception: if ``page_actions`` is empty or contains a page number lower than 1 or greater than
        the number of pages of the source document.
        """
        # NOTE(review): the original docstring promised a clockwise rotation, but the code calls
        # rotateCounterClockwise() - confirm which direction callers expect.
        if not page_actions:
            raise Exception(u"Invalid page actions: at least one page action is required.")

        output = PdfFileWriter()
        with open(pdf_file_path, 'rb') as file_input:
            pdf_input = PdfFileReader(file_input, strict=False)

            # Check page actions correspond to valid input PDF pages.
            # A comprehension is used instead of zip(*page_actions)[0], which only works where zip()
            # returns a list (Python 2).
            input_num_pages = pdf_input.getNumPages()
            actions_page_numbers = [num_page for num_page, _rotation in page_actions]
            largest_page_num = max(actions_page_numbers)
            lowest_page_num = min(actions_page_numbers)

            if lowest_page_num < 1:
                raise Exception(u"Invalid page numbers range in actions: page numbers cannot be lower than 1.")

            # Page numbers are 1-based, so the last valid one equals the page count. The previous check
            # compared (largest - 1) and let page count + 1 through to getPage().
            if largest_page_num > input_num_pages:
                raise Exception(u"Invalid page numbers range in actions: page numbers cannot exceed the maximum numbers "
                                u"of pages of the source PDF document.")

            # Perform actual slicing + rotation
            for num_page, rotation in page_actions:
                page = pdf_input.getPage(num_page - 1)
                if rotation:
                    page = page.rotateCounterClockwise(rotation)
                output.addPage(page)
            # Written while the source file is still open.
            self._write_to_pdf(output, final_pdf_path)
ranker.py 文件源码 项目:Resume-Ranker 作者: sadmicrowave 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def parse_pdf_doc(self):
        """
        Open a pdf document filetype and parse contents to string variable
        for matching comparison.

        Reads the file at ``self.file`` and concatenates the extracted text
        of every page.

        :return: the concatenated text with surrounding whitespace stripped,
                 or None when that text is empty.
        """
        # open the file with read/binary privileges; the context manager
        # closes it even when text extraction raises
        with open(self.file, 'rb') as f:
            pdf = PyPDF2.PdfFileReader(f)
            # extraction happens inside the with block, while the file is open
            docText = ''.join(page.extractText() for page in pdf.pages)

        return docText.strip() or None
marisol.py 文件源码 项目:Marisol 作者: wikkiewikkie 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def __init__(self, file, prefix, fill, start, area):
        """
        Set up a document whose pages are to be numbered.

        The PDF content is copied into an in-memory buffer, a reader is
        created over it and one ``Page`` is built per page of the reader.

        Args:
            file (): PDF file associated with this document; either an object
                with a ``read()`` method or something ``open()`` accepts.
            prefix (str): Bates number prefix.
            fill (int): Length to zero-pad number to.
            start (int): Number to start with.
            area (Area): Area on the document where the number should be drawn
        """
        try:
            content = file.read()
            self.file = io.BytesIO(content)
        except AttributeError:
            # No usable read(): treat the argument as something to open.
            with open(file, "rb") as handle:
                self.file = io.BytesIO(handle.read())
        self.reader = PdfFileReader(self.file)

        self.prefix = prefix
        self.fill = fill
        self.start = copy.copy(start)
        self.area = area

        # Every area starts without an overlay; only the requested one gets
        # a Bates overlay.
        self.overlays = dict.fromkeys(Area)
        self.overlays[area] = BatesOverlay(None, self.area)

        self.index = 0

        # Pages are numbered consecutively, beginning at ``start``.
        self.pages = []
        for offset, pdf_page in enumerate(self.reader.pages):
            self.pages.append(
                Page(self, pdf_page, self.prefix, self.fill, self.start + offset))
api.py 文件源码 项目:pdfdir 作者: chroming 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def __init__(self, path):
        """Load the PDF at ``path`` into a new writer.

        All pages of the source are appended to ``self.writer``, and the
        document-info entries whose values are ``utils.string_type`` or
        ``utils.bytes_type`` instances are copied over as metadata.
        """
        self.path = path
        # NOTE(review): the handle opened here is never closed in this
        # method; presumably the writer still needs the stream later on -
        # confirm before changing.
        source = PdfFileReader(open(path, "rb"))
        self.writer = PdfFileWriter()
        self.writer.appendPagesFromReader(source)

        metadata = {}
        for key, value in source.getDocumentInfo().items():
            if isinstance(value, (utils.string_type, utils.bytes_type)):
                metadata[key] = value
        self.writer.addMetadata(metadata)
scriptie_scraper.py 文件源码 项目:THESIS_LIFEBOAT 作者: Jasper-Koops 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def reader(title):
    """Read the PDF at ``title`` and convert it to text.

    The extracted text of the pages is concatenated in page order.
    Extraction is best-effort: at the first page that cannot be fetched or
    extracted, reading stops and the text gathered so far is returned.

    :param title: path of the PDF file to read.
    :return: the concatenated page text (possibly empty).
    """
    parts = []
    # The context manager closes the file, which the original never did.
    with open(title, 'rb') as pdfFileObj:
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        # Iterate over the real page count instead of running past the end
        # and relying on the resulting exception to stop the loop.
        for x in range(int(pdfReader.numPages)):
            try:
                pageObj = pdfReader.getPage(x)
                parts.append(str(pageObj.extractText()))
            except Exception:
                # Keep the best-effort behaviour, but no longer swallow
                # KeyboardInterrupt / SystemExit as the bare except did.
                break
    return "".join(parts)
layout.py 文件源码 项目:stereo 作者: suda 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def generate_document(self, data):
        """Render one document from ``data`` and write it as a PDF file.

        Each class in ``self.fields`` is instantiated with the layout, the
        canvas and the value of ``data`` at the same position, then rendered
        onto a canvas of ``self.width`` x ``self.height``.  When
        ``self.template_file`` is set, the rendered page is merged onto the
        first page of that template.  The result is written to
        ``<self.output_dir>/<self.generate_filename(data)>.pdf``.

        :param data: indexable values, one per entry of ``self.fields``.
        :raises IndexError: presumably, when ``data`` has fewer entries than
            ``self.fields`` (see the TODO below).
        """
        # NOTE(review): the canvas output is PDF data; whether StringIO can
        # hold it depends on which StringIO this file imports - confirm.
        packet = StringIO()
        template_stream = None
        if self.template_file is not None:
            template_stream = open(self.template_file, 'rb')
        try:
            if template_stream is not None:
                template = PdfFileReader(template_stream)
            c = canvas.Canvas(packet, pagesize=(self.width, self.height))

            for i, field_cls in enumerate(self.fields):
                # TODO: Catch exception if there is less columns than fields
                field = field_cls(self, c, data[i])
                field.render()

            # Save canvas
            c.save()
            packet.seek(0)
            text = PdfFileReader(packet)
            output = PdfFileWriter()
            if template_stream is not None:
                # Merge text with base
                page = template.getPage(0)
                page.mergePage(text.getPage(0))
            else:
                page = text.getPage(0)
            output.addPage(page)

            # Save file; the context manager closes it even if write() fails
            filename = "%s/%s.pdf" % (self.output_dir, self.generate_filename(data))
            with open(filename, 'wb') as outputStream:
                output.write(outputStream)
        finally:
            # The template handle used to be leaked; it is closed only after
            # the output has been written.
            if template_stream is not None:
                template_stream.close()
copernicus.py 文件源码 项目:Copernicus 作者: Soroboruo 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def getPDFContent(path):
    """Return the text of the PDF at ``path`` with whitespace normalized.

    The extracted text of all pages is combined, and every run of
    whitespace (including the page separators) is collapsed into a single
    space; leading and trailing whitespace is removed.

    :param path: value handed to ``PyPDF2.PdfFileReader`` as the document
        to read.
    :return: the normalized text of the whole document.
    """
    # The original passed "rb" as the second positional argument.  That slot
    # is PdfFileReader's ``strict`` flag, not a file mode, so the truthy
    # string simply switched strict parsing on.  Spell that out explicitly.
    pdf = PyPDF2.PdfFileReader(path, strict=True)
    pages_text = [pdf.getPage(i).extractText() for i in range(pdf.getNumPages())]
    # split() without arguments drops all whitespace, so joining the pages
    # with "\n" yields the same result as appending "\n" after each page.
    return " ".join("\n".join(pages_text).split())
utils.py 文件源码 项目:caj2pdf 作者: JeziL 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def add_outlines(toc, filename, output):
    """Copy the PDF ``filename`` to ``output`` and attach an outline tree.

    ``build_outlines_btree`` is run on ``toc`` first.  All pages of the input
    are then copied into a new writer, an ``/Outlines`` dictionary plus one
    dictionary per ``toc`` entry are appended to the writer's objects, and the
    writer's root object is pointed at the outlines dictionary.

    :param toc: list of entries; each is indexed with ``"title"``, ``"page"``
        and ``"node"``.  ``title`` must provide ``decode("utf-8")``.
    :param filename: path of the source PDF.
    :param output: path the resulting PDF is written to.
    """
    build_outlines_btree(toc)
    pdf_out = PdfFileWriter()
    # The source file is kept open until the output has been written and is
    # closed by the context manager afterwards (it used to be leaked).
    with open(filename, 'rb') as pdf_in_file:
        pdf_in = PdfFileReader(pdf_in_file)
        for p in pdf_in.pages:
            pdf_out.addPage(p)
        toc_num = len(toc)
        # Object number the next _addObject() call is expected to receive;
        # the references below are created before the objects themselves.
        idoix = len(pdf_out._objects) + 1
        # idorefs[0] refers to the outlines dictionary, the remaining entries
        # to the outline items, in the order they are added further down.
        idorefs = [PDF.IndirectObject(x + idoix, 0, pdf_out) for x in range(toc_num + 1)]
        ol = PDF.DictionaryObject()
        ol.update({
            PDF.NameObject("/Type"): PDF.NameObject("/Outlines"),
            PDF.NameObject("/First"): idorefs[1],
            PDF.NameObject("/Last"): idorefs[-1],
            PDF.NameObject("/Count"): PDF.NumberObject(toc_num)
        })
        olitems = []
        for t in toc:
            oli = PDF.DictionaryObject()
            oli.update({
                PDF.NameObject("/Title"): PDF.TextStringObject(t["title"].decode("utf-8")),
                PDF.NameObject("/Dest"): make_dest(pdf_out, t["page"])
            })
            # Node accessor name -> PDF key; a link is only written when the
            # accessor returns a node.
            opt_keys = {"real_parent": "/Parent", "prev": "/Prev", "next": "/Next", "first": "/First", "last": "/Last"}
            for k, v in opt_keys.items():
                n = getattr(t["node"], k)()
                if n is not None:
                    oli.update({
                        PDF.NameObject(v): idorefs[n.index]
                    })
            olitems.append(oli)
        # Order matters: the outlines dictionary first, then the items, so the
        # objects line up with the references prepared in idorefs.
        pdf_out._addObject(ol)
        for i in olitems:
            pdf_out._addObject(i)
        pdf_out._root_object.update({
            PDF.NameObject("/Outlines"): idorefs[0]
        })
        # Closed by the context manager even when write() raises.
        with open(output, "wb") as outputFile:
            pdf_out.write(outputFile)
pypdfocr_pdf.py 文件源码 项目:pdf_liberty 作者: mplitnikas 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def iter_pdf_page(self, f):
        """Yield the pages of the PDF ``f`` one by one, in page order.

        ``f`` is handed to ``PdfFileReader`` unchanged.
        """
        pdf = PdfFileReader(f)
        page_count = pdf.getNumPages()
        for index in range(page_count):
            yield pdf.getPage(index)
pypdfocr_pdffiler.py 文件源码 项目:pdf_liberty 作者: mplitnikas 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def iter_pdf_page_text(self, filename):
        """Yield the text of each page of ``filename``, one item per page.

        For every page the extracted text has its newlines replaced by
        spaces and is then encoded to ASCII, dropping characters that cannot
        be encoded.  ``self.filename`` is set to ``filename`` and the page
        count is logged once iteration starts.
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        # Ask for the page count once and reuse it.
        num_pages = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s", num_pages, filename)
        for pgnum in range(num_pages):
            text = reader.getPage(pgnum).extractText()
            # Replace newlines before encoding: calling replace() with text
            # arguments on the encoded bytes raises TypeError on Python 3,
            # while the result on Python 2 is unchanged.
            text = text.replace('\n', ' ')
            yield text.encode('ascii', 'ignore')
pypdfocr_pdf.py 文件源码 项目:pdf_liberty 作者: mplitnikas 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def iter_pdf_page(self, f):
        """Yield the pages of the PDF ``f`` one by one, in page order.

        ``f`` is handed to ``PdfFileReader`` unchanged.
        """
        pdf = PdfFileReader(f)
        page_count = pdf.getNumPages()
        for index in range(page_count):
            yield pdf.getPage(index)
pypdfocr_pdffiler.py 文件源码 项目:pdf_liberty 作者: mplitnikas 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def iter_pdf_page_text(self, filename):
        """Yield the text of each page of ``filename``, one item per page.

        For every page the extracted text has its newlines replaced by
        spaces and is then encoded to ASCII, dropping characters that cannot
        be encoded.  ``self.filename`` is set to ``filename`` and the page
        count is logged once iteration starts.
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        # Ask for the page count once and reuse it.
        num_pages = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s", num_pages, filename)
        for pgnum in range(num_pages):
            text = reader.getPage(pgnum).extractText()
            # Replace newlines before encoding: calling replace() with text
            # arguments on the encoded bytes raises TypeError on Python 3,
            # while the result on Python 2 is unchanged.
            text = text.replace('\n', ' ')
            yield text.encode('ascii', 'ignore')
pypdfocr_pdffiler.py 文件源码 项目:pdf_liberty 作者: mplitnikas 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def iter_pdf_page_text(self, filename):
        """Yield the text of each page of ``filename``, one item per page.

        For every page the extracted text has its newlines replaced by
        spaces and is then encoded to ASCII, dropping characters that cannot
        be encoded.  ``self.filename`` is set to ``filename`` and the page
        count is logged once iteration starts.
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        # Ask for the page count once and reuse it.
        num_pages = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s", num_pages, filename)
        for pgnum in range(num_pages):
            text = reader.getPage(pgnum).extractText()
            # Replace newlines before encoding: calling replace() with text
            # arguments on the encoded bytes raises TypeError on Python 3,
            # while the result on Python 2 is unchanged.
            text = text.replace('\n', ' ')
            yield text.encode('ascii', 'ignore')
pypdfocr_pdf.py 文件源码 项目:pdf_liberty 作者: mplitnikas 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def iter_pdf_page(self, f):
        """Yield the pages of the PDF ``f`` one by one, in page order.

        ``f`` is handed to ``PdfFileReader`` unchanged.
        """
        pdf = PdfFileReader(f)
        page_count = pdf.getNumPages()
        for index in range(page_count):
            yield pdf.getPage(index)
pypdfocr_pdffiler.py 文件源码 项目:pdf_liberty 作者: mplitnikas 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def iter_pdf_page_text(self, filename):
        """Yield the text of each page of ``filename``, one item per page.

        For every page the extracted text has its newlines replaced by
        spaces and is then encoded to ASCII, dropping characters that cannot
        be encoded.  ``self.filename`` is set to ``filename`` and the page
        count is logged once iteration starts.
        """
        self.filename = filename
        reader = PdfFileReader(filename)
        # Ask for the page count once and reuse it.
        num_pages = reader.getNumPages()
        logging.info("pdf scanner found %d pages in %s", num_pages, filename)
        for pgnum in range(num_pages):
            text = reader.getPage(pgnum).extractText()
            # Replace newlines before encoding: calling replace() with text
            # arguments on the encoded bytes raises TypeError on Python 3,
            # while the result on Python 2 is unchanged.
            text = text.replace('\n', ' ')
            yield text.encode('ascii', 'ignore')
classifier.py 文件源码 项目:oabot 作者: dissemin 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def check_nb_pages(self, data):
        """
        Tell whether the PDF held in ``data`` has more than two pages.

        The page count is printed.  Returns False when reading the document
        raises ``PyPdfError``.
        """
        try:
            pdf = PyPDF2.PdfFileReader(StringIO(data))
            page_count = pdf.getNumPages()
            print("num pages: %d" % page_count)
        except PyPdfError:
            return False
        return page_count > 2
pdfcropper.py 文件源码 项目:krop 作者: gocarlos 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def loadFromStream(self, stream):
        """Create a non-strict ``PdfFileReader`` over ``stream`` and keep it as ``self.reader``."""
        pdf_reader = PdfFileReader(stream, strict=False)
        self.reader = pdf_reader


问题


面经


文章

微信
公众号

扫码关注公众号