使用Python实现PDF文档常用操作
PyPDF2
PyPDF2是作为PDF工具包构建的python库,它能够:
- 提取文档信息(标题,作者,...)
- 按页拆分文档
- 逐页合并文档
- 裁剪页面
- 合并多个页面到一个页
- 对pdf文档进行加密解密
- 等等
安装PyPDF2,在命令行下执行命令:
pip install PyPDF2
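注意:这个模块的名字对大小写敏感,请确保 y 是小写的,其他字母都是大写的。另外,本文示例使用的是 PyPDF2 3.0 之前的接口(PdfFileReader、PdfFileWriter、getPage、addPage 等),这些接口在 PyPDF2 3.0 中已被移除;如需直接运行下面的示例,请安装旧版本:pip install "PyPDF2<3.0"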
从pdf中提取文字
import PyPDF2
# Open the PDF in binary mode. The `with` block closes the file even when
# PyPDF2 raises while parsing, which the manual open()/close() pair did not.
with open('example.pdf', 'rb') as pdfFile:
    pdfReader = PyPDF2.PdfFileReader(pdfFile)
    # Report how many pages the document has.
    print(pdfReader.numPages)
    # Fetch the page at index 0 and print the text PyPDF2 extracts from it.
    page = pdfReader.getPage(0)
    print(page.extractText())
旋转pdf页
import PyPDF2
def PDFrotate(origFileName, newFileName, rotation):
    """Write a copy of a PDF in which every page is rotated clockwise.

    Args:
        origFileName: Path of the PDF to read.
        newFileName: Path of the PDF to create; opened with mode 'wb', so an
            existing file at that path is overwritten.
        rotation: Clockwise rotation in degrees, handed unchanged to
            ``rotateClockwise`` for each page (NOTE(review): PyPDF2 is
            expected to require a multiple of 90 - confirm against its docs).
    """
    with open(origFileName, 'rb') as pdfFile:
        pdfReader = PyPDF2.PdfFileReader(pdfFile)
        pdfWriter = PyPDF2.PdfFileWriter()
        for pageNum in range(pdfReader.numPages):
            pageObj = pdfReader.getPage(pageNum)
            pageObj.rotateClockwise(rotation)
            pdfWriter.addPage(pageObj)
        # Write while the source is still open, as the original ordering did
        # (it closed the source only after the write).
        with open(newFileName, 'wb') as newFile:
            # The original called the undefined name `pdfWrite` here, which
            # raised NameError before anything was written.
            pdfWriter.write(newFile)
def main():
    """Rotate example.pdf by 270 degrees clockwise into rotated_example.pdf."""
    PDFrotate('example.pdf', 'rotated_example.pdf', 270)


if __name__ == "__main__":
    main()
截取指定页数的PDF文档
import os
import sys

from PyPDF2 import PdfFileReader, PdfFileWriter

# Interactive script: copy a contiguous, 1-based, inclusive page range of one
# PDF into a new PDF file.

# Step 1: ask for the source PDF until one can be opened and parsed.
while True:
    file_dir = input("请输入操作的PDF文件路径(0退出程序):")
    if file_dir == "0":
        # sys.exit() instead of exit(): the latter is a helper injected by the
        # `site` module and is not guaranteed to exist (e.g. in frozen builds).
        sys.exit()
    try:
        input_stream = open(file_dir, "rb")
    except OSError:
        print("PDF文件打开失败")
        continue
    try:
        input_pdf = PdfFileReader(input_stream)
        pdf_pages = input_pdf.getNumPages()
    except Exception:
        # A file that opens but is not a readable PDF makes PyPDF2 raise its
        # own error types rather than OSError; treat that as a failed open
        # instead of crashing, and release the handle before asking again.
        input_stream.close()
        print("PDF文件打开失败")
        continue
    print("读取PDF文件成功!共计" + str(pdf_pages) + "页")
    break

# Step 2: ask for the page range until it is valid for this document.
while True:
    try:
        page_start = int(input("请输入截取起始页:"))
        page_end = int(input("请输入截取结束页:"))
    except ValueError:
        print("请输入十进制整数!")
        continue
    # Reject zero/negative pages, reversed ranges, and ranges that run past
    # the last page (the latter previously failed later inside getPage).
    if page_start < 1 or page_end > pdf_pages or page_start > page_end:
        print("输入页码不正确!")
        continue
    print("即将开始截取" + str(page_start) + "---" + str(page_end) + "页。")
    break

# Step 3: collect the selected pages; user input is 1-based, getPage is 0-based.
out_pdf = PdfFileWriter()
for i in range(page_start - 1, page_end):
    out_pdf.addPage(input_pdf.getPage(i))

# Step 4: ask for the destination until the file can be written.
try:
    while True:
        out_dir = input("请输入PDF存储地址:")
        out_name = input("请输入PDF存储名称:") + ".pdf"
        print("正在写出PDF文件中...")
        try:
            # os.path.join inserts the separator that plain string
            # concatenation omitted; `with` closes (and flushes) the output.
            with open(os.path.join(out_dir, out_name), "wb") as outputStream:
                out_pdf.write(outputStream)  # write the extracted pages
        except OSError:
            # Only path/permission problems are retried; a bare `except:`
            # here also swallowed Ctrl-C and made the loop impossible to quit.
            print("未知错误!")
        else:
            print("保存成功!")
            break
finally:
    # The source stays open until the write has finished, then is released.
    input_stream.close()
合并PDF文档
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def GetFileName(dir_path):
    """Return the paths of all files found under ``dir_path``, recursively.

    Each path is the walked directory joined with the file name, so the
    results are absolute only when ``dir_path`` itself is absolute.

    Args:
        dir_path: Root directory to walk.

    Returns:
        A list of file paths in sorted order. The order in which ``os.walk``
        reports entries is arbitrary, so sorting keeps the result (and any
        merge order derived from it) reproducible. The list is empty when the
        directory contains no files or does not exist.
    """
    file_list = [
        os.path.join(dirpath, filename)
        for dirpath, _dirs, files in os.walk(dir_path)
        for filename in files
    ]
    file_list.sort()
    return file_list
def MergePDF(dir_path, file_name):
    """Merge every PDF found under ``dir_path`` into one PDF file.

    The merged document is written to ``dir_path/file_name``. Progress (file
    name and page count per input, total page count) is printed to stdout.

    Args:
        dir_path: Folder searched recursively (via ``GetFileName``) for inputs;
            also the folder the result is written to.
        file_name: File name of the merged PDF.
    """
    # Writer that accumulates the pages of all inputs.
    output = PdfFileWriter()
    outputPages = 0

    target_path = os.path.join(dir_path, file_name)
    target_key = os.path.normcase(os.path.abspath(target_path))
    # Keep only PDF files and leave out the output file itself: it lives in the
    # same folder, so a second run would otherwise merge the previous result
    # back in, and any non-PDF file would make PdfFileReader fail.
    # Sorting makes the merge order reproducible.
    file_list = sorted(
        path
        for path in GetFileName(dir_path)
        if path.lower().endswith(".pdf")
        and os.path.normcase(os.path.abspath(path)) != target_key
    )

    # Inputs are kept open until the merged file has been written, then all
    # are closed, even if an error interrupts the merge.
    open_streams = []
    try:
        for pdf_file in file_list:
            # os.path.basename works with either path separator, unlike
            # splitting on a backslash.
            print("文件:%s" % os.path.basename(pdf_file), end=' ')
            stream = open(pdf_file, "rb")
            open_streams.append(stream)
            reader = PdfFileReader(stream)
            # Number of pages in this input.
            pageCount = reader.getNumPages()
            outputPages += pageCount
            print("页数:%d" % pageCount)
            # Append each page of this input to the output.
            for iPage in range(pageCount):
                output.addPage(reader.getPage(iPage))
        print("\n合并后的总页数:%d" % outputPages)
        print("PDF文件正在合并,请稍等......")
        with open(target_path, "wb") as outputfile:
            # Note the direction: the writer is given the file object, rather
            # than the file object being given the data.
            output.write(outputfile)
    finally:
        for stream in open_streams:
            stream.close()
    print("PDF文件合并完成")
# Script entry point: merge the PDFs of one hard-coded folder.
if __name__ == '__main__':
    # Folder that holds the PDF files to merge
    dir_path = r'D:\PDF'
    # File name of the merged result
    file_name = "数据库系统原理与概论(合并版).pdf"
    # print(GetFileName(dir_path))
    MergePDF(dir_path, file_name)
给PDF文档添加水印
from PyPDF2 import PdfFileReader, PdfFileWriter
def add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out):
    """Stamp a watermark onto every page of a PDF and save the result.

    Args:
        pdf_file_in: Path of the PDF to watermark.
        pdf_file_mark: Path of the watermark PDF; only its first page is used.
        pdf_file_out: Path of the watermarked PDF to create (opened with 'wb').
    """
    # All three files are managed by `with` so they are closed - and the
    # output flushed - even if PyPDF2 raises part-way through.
    with open(pdf_file_in, 'rb') as input_stream, \
            open(pdf_file_mark, 'rb') as mark_stream:
        pdf_input = PdfFileReader(input_stream, strict=False)
        # Look the watermark page up once instead of on every iteration.
        watermark_page = PdfFileReader(mark_stream, strict=False).getPage(0)
        pdf_output = PdfFileWriter()
        # Merge the watermark into each page of the source document.
        for i in range(pdf_input.getNumPages()):
            page = pdf_input.getPage(i)
            page.mergePage(watermark_page)
            page.compressContentStreams()  # compress the page content
            pdf_output.addPage(page)
        # Write while both inputs are still open.
        with open(pdf_file_out, 'wb') as output_stream:
            pdf_output.write(output_stream)
# Script entry point: watermark one document using hard-coded example paths.
if __name__ == '__main__':
    # Source PDF, watermarked output PDF, and the PDF whose first page is the
    # watermark; adjust these paths before running.
    pdf_file_in = r'C:\Users\文文\Desktop\三级数据库\4DBAS功能设计与实施\DBAS功能设计与实施(精简版).pdf'
    pdf_file_out = r'C:\Users\文文\Desktop\三级数据库\4DBAS功能设计与实施\DBAS功能设计与实施(精简版)water.pdf'
    pdf_file_mark = r'C:\Users\文文\Desktop\三级数据库\water.pdf'
    add_watermark(pdf_file_in, pdf_file_mark, pdf_file_out)
UI界面版实现
from PySide2.QtWidgets import QApplication, QMessageBox
from PySide2.QtUiTools import QUiLoader
from PyPDF2 import PdfFileReader, PdfFileWriter
import tkinter as tk
from tkinter import filedialog
# Create a Tk root window and hide it immediately; tkinter's file dialogs are
# used below, and withdrawing the root keeps an empty Tk window off the screen.
root = tk.Tk()
root.withdraw()
#
# Folderpath = filedialog.askdirectory()  # returns the selected folder
# Filepath = filedialog.askopenfilename()  # returns the selected file
#
# print('Folderpath:', Folderpath)
# print('Filepath:', Filepath)
class Stats:
    """Window controller for the PDF watermark tool.

    Loads the window from a Qt Designer ``.ui`` file and connects its three
    buttons: ``pushButton`` applies the watermark, ``pushButton_2`` picks the
    watermark PDF, ``pushButton_3`` picks the PDF to be watermarked.
    """

    def __init__(self):
        # Build the window object dynamically from the UI definition file.
        # The widgets defined there become attributes of the window object,
        # e.g. self.ui.pushButton, self.ui.lineEdit.
        self.ui = QUiLoader().load(r'D:\Pyprojects\Qt\ui\PdfAddMark.ui')
        self.ui.pushButton.clicked.connect(self.PdfAddMark)
        self.ui.pushButton_2.clicked.connect(self.select_file1)
        self.ui.pushButton_3.clicked.connect(self.select_file2)

    @staticmethod
    def _is_pdf(path):
        """Return True when ``path`` ends in ``.pdf``, ignoring letter case."""
        return path.lower().endswith('.pdf')

    @staticmethod
    def _default_output_path(path):
        """Return ``path`` with its ``.pdf`` suffix replaced by ``water.pdf``.

        Only the trailing extension is removed, so dots elsewhere in the path
        (for example in a directory name) do not truncate the result.
        """
        return path[:-len('.pdf')] + "water.pdf"

    def _ask_pdf_path(self):
        """Ask the user for a PDF; return its path, or "" if none was chosen.

        A non-PDF selection shows an error dialog and also yields "".
        """
        Filepath = filedialog.askopenfilename()  # path of the selected file
        if not Filepath:
            # Dialog cancelled: nothing selected, nothing to report.
            return ""
        if not self._is_pdf(Filepath):
            QMessageBox.about(self.ui, "ERROR", "文件类型错误!")
            return ""
        return Filepath

    def select_file1(self):
        """Let the user choose the watermark PDF and show it in ``lineEdit``."""
        path = self._ask_pdf_path()
        if path:
            self.ui.lineEdit.setText(path)

    def select_file2(self):
        """Let the user choose the source PDF and prefill the output path.

        The source goes into ``lineEdit_2`` and a suggested output path into
        ``lineEdit_3``. A cancelled dialog leaves both fields untouched
        (previously it wrote "water.pdf" into the output field).
        """
        path = self._ask_pdf_path()
        if path:
            self.ui.lineEdit_2.setText(path)
            self.ui.lineEdit_3.setText(self._default_output_path(path))

    def PdfAddMark(self):
        """Watermark the source PDF and report the outcome in a dialog.

        Reads the watermark, source and output paths from the three line
        edits, merges the first page of the watermark PDF into every page of
        the source, and writes the result to the output path.
        """
        mark_path = self.ui.lineEdit.text()
        from_path = self.ui.lineEdit_2.text()
        to_path = self.ui.lineEdit_3.text()
        # QMessageBox.about(self.ui, "测试", mark_path + '\n' + from_path + '\n' + to_path)
        if not (mark_path and from_path and to_path):
            QMessageBox.about(self.ui, "ERROR", "路径设置不完整!")
            return
        try:
            # `with` closes every file (and flushes the output) on all paths.
            with open(from_path, 'rb') as input_stream, \
                    open(mark_path, 'rb') as mark_stream:
                pdf_input = PdfFileReader(input_stream, strict=False)
                # Load the watermark page once, outside the page loop.
                watermark_page = PdfFileReader(mark_stream, strict=False).getPage(0)
                pdf_output = PdfFileWriter()
                # Stamp the watermark onto each page of the source.
                for i in range(pdf_input.getNumPages()):
                    page = pdf_input.getPage(i)
                    page.mergePage(watermark_page)
                    page.compressContentStreams()  # compress the page content
                    pdf_output.addPage(page)
                with open(to_path, 'wb') as output_stream:
                    pdf_output.write(output_stream)
        except Exception:
            # UI boundary: report any failure to the user. `Exception` rather
            # than a bare except, so KeyboardInterrupt/SystemExit still work.
            QMessageBox.about(self.ui, "ERROR", "未知错误!")
        else:
            # Only announce success when nothing failed; previously this
            # dialog was shown even right after the error dialog.
            QMessageBox.about(self.ui, "SUCCESS", "操作成功!")
# Create the Qt application, show the watermark window and run the event loop.
app = QApplication([])
stats = Stats()
stats.ui.show()
app.exec_()
# Packaging commands (PyInstaller), kept for reference:
# pyinstaller -F -w PdfAddMark.py -i C:\Users\文文\Desktop\Tioc-p.ico --hidden-import PySide2.QtXml
## pyinstaller -F PdfAddMark.py -i C:\Users\文文\Desktop\Tioc-p.ico
微信关注
阅读剩余
版权声明:
作者:理想
链接:https://www.imyjs.cn/archives/611
文章版权归作者所有,未经允许请勿转载。
THE END