使用Python实现PDF文档常用操作

PyPDF2

PyPDF2是作为PDF工具包构建的python库,它能够:

  • 提取文档信息(标题,作者,...)
  • 按页拆分文档
  • 逐页合并文档
  • 裁剪页面
  • 合并多个页面到一个页
  • 对pdf文档进行加密解密
  • 等等

安装PyPDF2,在命令行下执行命令:

pip install PyPDF2

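注意,这个模块的名字对大小写是敏感的,所以,确保y是小写的,其他字母都是大写的。另外,本文示例使用的 PdfFileReader、PdfFileWriter、getPage 等接口在 PyPDF2 3.0 中已被移除;如果运行时出现弃用(Deprecation)错误,请安装旧版本:pip install "PyPDF2<3.0"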

从pdf中提取文字

import PyPDF2

# Print the page count and the text of the first page of example.pdf.
# The file is opened in binary mode; the context manager closes it even if
# parsing or text extraction raises.
with open('example.pdf', 'rb') as pdfFile:
    pdfReader = PyPDF2.PdfFileReader(pdfFile)

    # Total number of pages in the document.
    print(pdfReader.numPages)

    # Page indices are zero-based, so 0 is the first page.
    page = pdfReader.getPage(0)

    print(page.extractText())

 

旋转pdf页

import PyPDF2

def PDFrotate(origFileName,newFileName,rotation):
    """Rotate every page of a PDF clockwise and save the result as a new file.

    Args:
        origFileName: Path of the PDF to read.
        newFileName: Path of the PDF to create (overwritten if it exists).
            It should differ from origFileName: the source is still being
            read while the output is written.
        rotation: Clockwise rotation in degrees, passed unchanged to
            ``rotateClockwise`` for each page (PyPDF2 documents that the
            angle must be a multiple of 90).

    Raises:
        OSError: If either file cannot be opened.
    """
    # Both files are managed by ``with`` so they are closed even when reading
    # or writing fails part-way through.
    with open(origFileName, 'rb') as pdfFile:
        pdfReader = PyPDF2.PdfFileReader(pdfFile)
        pdfWriter = PyPDF2.PdfFileWriter()

        for pageNum in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(pageNum)
            pageObj.rotateClockwise(rotation)
            pdfWriter.addPage(pageObj)

        # Write while the source is still open: the pages added above still
        # refer to data in the source stream.
        with open(newFileName, 'wb') as newFile:
            # The original called the undefined name ``pdfWrite`` here, which
            # raised NameError before anything was written.
            pdfWriter.write(newFile)

def main():
    """Rotate ``example.pdf`` by 270 degrees into ``rotated_example.pdf``."""
    PDFrotate(
        origFileName='example.pdf',
        newFileName='rotated_example.pdf',
        rotation=270,
    )


if __name__ == "__main__":
    main()

 

截取指定页数的PDF文档

from PyPDF2 import PdfFileReader, PdfFileWriter
import os  # used below to join the output folder and file name portably

# Interactive script: copy an inclusive, 1-based page range of a PDF into a
# new PDF file.

# Step 1: keep asking for the source PDF until one opens ("0" quits).
input_stream = None
while True:
    file_dir = input("请输入操作的PDF文件路径(0退出程序):")
    if file_dir == "0":
        exit()
    try:
        # The stream has to stay open until the output has been written,
        # because the pages copied later still refer to data in it.
        input_stream = open(file_dir, "rb")
        input_pdf = PdfFileReader(input_stream)
        pdf_pages = input_pdf.getNumPages()
    except OSError:
        # Do not leak the handle of a file that could not be used.
        if input_stream is not None:
            input_stream.close()
            input_stream = None
        print("PDF文件打开失败")
    else:
        print("读取PDF文件成功!共计" + str(pdf_pages) + "页")
        break

try:
    # Step 2: ask for the page range until it is valid for this document.
    while True:
        try:
            page_start = int(input("请输入截取起始页:"))
            page_end = int(input("请输入截取结束页:"))
        except ValueError:
            print("请输入十进制整数!")
            continue
        # Reject negative/zero pages and an end page beyond the document;
        # the former would silently index from the end, the latter would
        # crash in getPage().
        if page_start < 1 or page_end > pdf_pages or page_start > page_end:
            print("输入页码不正确!")
            continue
        print("即将开始截取" + str(page_start) + "---" + str(page_end) + "页。")
        break

    # Step 3: collect the requested pages (user input is 1-based, getPage
    # is 0-based).
    out_pdf = PdfFileWriter()
    for i in range(page_start - 1, page_end):
        out_pdf.addPage(input_pdf.getPage(i))

    # Step 4: ask where to save until the write succeeds.
    while True:
        # input() is deliberately outside the try block so that Ctrl-C or a
        # closed stdin ends the program instead of looping forever.
        out_dir = input("请输入PDF存储地址:")
        out_name = input("请输入PDF存储名称:") + ".pdf"
        print("正在写出PDF文件中...")
        try:
            # os.path.join inserts the separator the user may have left out.
            with open(os.path.join(out_dir, out_name), "wb") as outputStream:
                out_pdf.write(outputStream)  # write the extracted pages
        except Exception as err:
            # Top-level boundary of an interactive tool: report the cause
            # and let the user try a different destination.
            print("未知错误!", err)
        else:
            print("保存成功!")
            break
finally:
    input_stream.close()

 

合并PDF文档

import os
from PyPDF2 import PdfFileReader, PdfFileWriter

def GetFileName(dir_path):
    """Return the paths of all files found under *dir_path*, recursively.

    Each entry is the walked directory joined with the file name, so the
    paths are absolute only when *dir_path* itself is absolute.
    """
    collected = []
    for folder, _subdirs, names in os.walk(dir_path):
        for name in names:
            collected.append(os.path.join(folder, name))
    return collected
def MergePDF(dir_path, file_name):
    """Merge the PDF files found under *dir_path* into one PDF.

    Args:
        dir_path: Folder that is searched recursively for input files and
            that also receives the merged file.
        file_name: File name of the merged PDF, created inside *dir_path*.

    Only files whose name ends in ``.pdf`` (case-insensitive) are merged, and
    the output file itself is skipped so that re-running the merge does not
    read a file that is about to be truncated.
    """
    # Writer that accumulates the pages of all inputs.
    output = PdfFileWriter()
    outputPages = 0
    output_path = os.path.join(dir_path, file_name)
    output_key = os.path.normcase(os.path.abspath(output_path))

    # Use the sibling helper to list the files, then keep only real inputs.
    file_list = [
        path for path in GetFileName(dir_path)
        if path.lower().endswith('.pdf')
        and os.path.normcase(os.path.abspath(path)) != output_key
    ]

    # The input streams must stay open until the merged file is written
    # (the added pages still refer to them), so they are collected here and
    # closed together at the end.
    open_streams = []
    try:
        for pdf_file in file_list:
            print("文件:%s" % os.path.basename(pdf_file), end=' ')
            stream = open(pdf_file, "rb")
            open_streams.append(stream)
            reader = PdfFileReader(stream)
            # Number of pages in this source file.
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print("页数:%d" % pageCount)
            # Append every page of this file to the output.
            for iPage in range(pageCount):
                output.addPage(reader.getPage(iPage))
        print("\n合并后的总页数:%d" % outputPages)
        print("PDF文件正在合并,请稍等......")
        with open(output_path, "wb") as outputfile:
            # Note the direction: the writer writes into the file object
            # (output.write(f)), not the usual f.write(data).
            output.write(outputfile)
    finally:
        for stream in open_streams:
            stream.close()
    print("PDF文件合并完成")
if __name__ == '__main__':
    # Folder holding the PDFs to merge, and the name of the merged file that
    # MergePDF creates inside that same folder.
    dir_path, file_name = r'D:\PDF', "数据库系统原理与概论(合并版).pdf"
    MergePDF(dir_path=dir_path, file_name=file_name)

 

 

给PDF文档添加水印

from PyPDF2 import PdfFileReader, PdfFileWriter

def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Stamp the first page of a watermark PDF onto every page of a PDF.

    Args:
        pdf_file_in: Path of the PDF to watermark.
        pdf_file_mark: Path of the PDF whose first page is the watermark.
        pdf_file_out: Path of the PDF to create (overwritten if it exists).

    Raises:
        OSError: If any of the three files cannot be opened.
    """
    pdf_output = PdfFileWriter()
    # All three files are handled by ``with`` so they are closed (and the
    # output flushed) even if merging or writing fails.
    with open(pdf_file_in, 'rb') as input_stream, \
            open(pdf_file_mark, 'rb') as mark_stream:
        pdf_input = PdfFileReader(input_stream, strict=False)

        # The watermark is always page 0 of the mark file; fetch it once.
        watermark_page = PdfFileReader(mark_stream, strict=False).getPage(0)

        # Stamp the watermark onto each page in order.
        for i in range(pdf_input.getNumPages()):
            page = pdf_input.getPage(i)
            page.mergePage(watermark_page)
            page.compressContentStreams()  # compress the merged content
            pdf_output.addPage(page)

        # Write while the sources are still open: the pages refer to them.
        with open(pdf_file_out, 'wb') as output_stream:
            pdf_output.write(output_stream)

if __name__ == '__main__':
    # Source PDF, watermark PDF and destination PDF for the demo run.
    pdf_file_in, pdf_file_mark, pdf_file_out = (
        r'C:\Users\文文\Desktop\三级数据库\4DBAS功能设计与实施\DBAS功能设计与实施(精简版).pdf',
        r'C:\Users\文文\Desktop\三级数据库\water.pdf',
        r'C:\Users\文文\Desktop\三级数据库\4DBAS功能设计与实施\DBAS功能设计与实施(精简版)water.pdf',
    )
    add_watermark(
        pdf_file_in=pdf_file_in,
        pdf_file_mark=pdf_file_mark,
        pdf_file_out=pdf_file_out,
    )

UI界面版实现

from PySide2.QtWidgets import QApplication, QMessageBox
from PySide2.QtUiTools import QUiLoader
from PyPDF2 import PdfFileReader, PdfFileWriter
import tkinter as tk
from tkinter import filedialog

# Create a Tk root window and hide it right away with withdraw();
# presumably so the tkinter file dialogs used by the Stats class can be shown
# without an extra empty Tk window next to the Qt UI.
root = tk.Tk()
root.withdraw()
# Example usage of the tkinter dialogs (kept commented out):
# Folderpath = filedialog.askdirectory()  # returns the chosen folder
# Filepath = filedialog.askopenfilename()  # returns the chosen file
#
# print('Folderpath:', Folderpath)
# print('Filepath:', Filepath)


class Stats:
    """Controller for the "add watermark to PDF" window.

    Loads the window from a Qt Designer ``.ui`` file and wires its three
    buttons: choose the watermark PDF, choose the source PDF, and run the
    watermarking.
    """

    def __init__(self):
        # Build the window object dynamically from the UI definition file.
        # The widgets defined there become attributes of the window object,
        # e.g. self.ui.pushButton, self.ui.lineEdit.
        self.ui = QUiLoader().load(r'D:\Pyprojects\Qt\ui\PdfAddMark.ui')
        self.ui.pushButton.clicked.connect(self.PdfAddMark)
        self.ui.pushButton_2.clicked.connect(self.select_file1)
        self.ui.pushButton_3.clicked.connect(self.select_file2)

    @staticmethod
    def _is_pdf_path(path):
        """Return True if *path* ends in ``.pdf``, ignoring case."""
        return path.lower().endswith('.pdf')

    @staticmethod
    def _default_output_path(path):
        """Return *path* with its extension replaced by ``water.pdf``.

        Only a dot inside the file name counts as the extension separator,
        so dots in directory names are left alone.
        """
        stem, dot, ext = path.rpartition('.')
        if not dot or '/' in ext or '\\' in ext:
            # No extension in the file name itself: append to the full path.
            stem = path
        return stem + "water.pdf"

    def _pick_pdf(self):
        """Ask the user for a PDF file.

        Returns the chosen path, or None when the dialog was cancelled or a
        non-PDF file was chosen (an error box is shown in the latter case).
        """
        path = filedialog.askopenfilename()
        if not path:
            # Cancelled: leave whatever the user had selected before.
            return None
        if not self._is_pdf_path(path):
            QMessageBox.about(self.ui, "ERROR", "文件类型错误!")
            return None
        return path

    def select_file1(self):
        """Choose the watermark PDF and show its path in ``lineEdit``."""
        path = self._pick_pdf()
        if path is not None:
            self.ui.lineEdit.setText(path)

    def select_file2(self):
        """Choose the source PDF; fill ``lineEdit_2`` and a default output
        path in ``lineEdit_3``."""
        path = self._pick_pdf()
        if path is not None:
            self.ui.lineEdit_2.setText(path)
            self.ui.lineEdit_3.setText(self._default_output_path(path))

    def PdfAddMark(self):
        """Stamp the watermark onto every page of the source PDF.

        Reads the three paths from the line edits, writes the watermarked
        PDF, and reports the outcome in a message box.
        """
        mark_path = self.ui.lineEdit.text()
        from_path = self.ui.lineEdit_2.text()
        to_path = self.ui.lineEdit_3.text()
        if not (mark_path and from_path and to_path):
            QMessageBox.about(self.ui, "ERROR", "路径设置不完整!")
            return
        try:
            pdf_output = PdfFileWriter()
            # ``with`` closes all three files even when something fails.
            with open(from_path, 'rb') as input_stream, \
                    open(mark_path, 'rb') as mark_stream:
                pdf_input = PdfFileReader(input_stream, strict=False)

                # The watermark is page 0 of the mark file; fetch it once.
                watermark_page = PdfFileReader(
                    mark_stream, strict=False).getPage(0)

                # Stamp the watermark onto each page in order.
                for i in range(pdf_input.getNumPages()):
                    page = pdf_input.getPage(i)
                    page.mergePage(watermark_page)
                    page.compressContentStreams()  # compress merged content
                    pdf_output.addPage(page)

                # Write while the sources are still open: the pages refer
                # to them.
                with open(to_path, 'wb') as output_stream:
                    pdf_output.write(output_stream)
        except Exception as err:
            # UI boundary: report the failure (with its cause) instead of
            # crashing the slot. KeyboardInterrupt/SystemExit pass through.
            QMessageBox.about(self.ui, "ERROR", "未知错误!\n%s" % err)
        else:
            # Only claim success when nothing failed; the original showed
            # this box even after the error box.
            QMessageBox.about(self.ui, "SUCCESS", "操作成功!")


# Start-up sequence: create the Qt application object, build the main window
# (Stats loads it from the .ui file), show it, then run app.exec_().
app = QApplication([])
stats = Stats()
stats.ui.show()
app.exec_()
# PyInstaller packaging commands (the first one also passes -w):
# pyinstaller -F -w PdfAddMark.py -i C:\Users\文文\Desktop\Tioc-p.ico --hidden-import PySide2.QtXml
## pyinstaller -F PdfAddMark.py -i C:\Users\文文\Desktop\Tioc-p.ico

 

微信关注

WeChat

阅读剩余
THE END