# metadata_extractor.py - Python source code snippet

def pdfMetaData(file_path, save=True):
    '''Print the metadata of a PDF document and optionally save it to a text file.

    Args:
        file_path: path of the PDF document to inspect.
        save: when True (the default), the report is also passed to
            saveResult() under the name "<getFileName(file_path)>.txt".

    The report contains the entries of the PDF document-info dictionary
    followed by the os.stat() timestamps and owner user id of the file.

    Exits the process via sys.exit() when the document is encrypted with a
    non-empty password, when decryption raises (e.g. unsupported algorithm),
    or when the document info cannot be read.
    '''
    # The file stays open for the whole read; PyPDF2 may resolve document
    # objects lazily from the stream, so values are read inside this block.
    with open(file_path, "rb") as pdf_file:
        pdf_doc = PdfFileReader(pdf_file)

        if pdf_doc.isEncrypted:
            # Only the decrypt() call is guarded, and with `except Exception`:
            # a bare except would also swallow the SystemExit raised below and
            # report the wrong reason.
            try:
                decrypt_status = pdf_doc.decrypt("")
            except Exception:
                sys.exit("target pdf document is encrypted with an unsupported algorithm... exiting...")
            # decrypt() returns 0 on failure; 1 (user password) and
            # 2 (owner password) both mean the document is now readable.
            if decrypt_status == 0:
                sys.exit("target pdf document is encrypted... exiting...")

        doc_info = pdf_doc.getDocumentInfo()
        try:
            # Keys look like "/Author"; md[1:] drops the leading slash.
            info_lines = [
                str(md[1:]) + " : " + pretifyPyPDF2Time(str(md[1:]), str(doc_info[md])) + "\n"
                for md in doc_info
            ]
        except TypeError:
            # Raised e.g. when doc_info is None (no info dictionary to iterate).
            sys.exit("Couldn't read document info! Make sure target is a valid pdf document...")

    stats = os.stat(file_path)
    now = dt.now()
    file_name = getFileName(file_path)

    header = "Time: %d/%d/%d %d : %d : %d. Found the following metadata for file %s:\n\n" % (
        now.year, now.month, now.day, now.hour, now.minute, now.second, file_name[:-4])
    footer = "Last metadata mod Date: %s\nLast Mod Date: %s\nLast Access Date: %s\nOwner User ID: %s" % (
        dt.fromtimestamp(stats.st_ctime),
        dt.fromtimestamp(stats.st_mtime),
        dt.fromtimestamp(stats.st_atime),
        stats.st_uid)
    metadata = header + "".join(info_lines) + footer

    try:
        print(metadata)
    except UnicodeEncodeError:
        print("Console encoding can't decode the result. Enter chcp 65001 in the console and rerun the script.")

    if save:
        saveResult(file_name + ".txt", metadata)