### 1. Complete PDF processing code

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF processing tool - extract PDF content with pdfplumber

Features:
1. Extract all text, or the text of selected pages
2. Extract tables and export them to CSV
3. Extract PDF metadata
4. Extract and save images
5. Customizable layout-analysis (text extraction) parameters
6. Automatic handling of FontBBox errors

Install dependencies:
    pip install pdfplumber pillow pandas openpyxl
"""
import io
import pdfplumber
from PIL import Image
import pandas as pd
import logging
import warnings
import os
import argparse
from datetime import datetime

# Configure logging and suppress noisy warnings
logging.basicConfig(level=logging.ERROR)
warnings.filterwarnings('ignore')
logging.getLogger('pdfplumber').setLevel(logging.CRITICAL)
logging.getLogger('pdfminer').setLevel(logging.CRITICAL)
logging.getLogger('PIL').setLevel(logging.CRITICAL)


class PDFProcessor:
    """PDF processing helper."""

    def __init__(self, pdf_path, laparams=None):
        """Initialize the PDF processor."""
        self.pdf_path = pdf_path
        self.laparams = laparams or {
            "detect_vertical": False,
            "line_overlap": 0.5,
            "char_margin": 2.0,
            "word_margin": 0.1,
            "line_margin": 0.5,
            "boxes_flow": 0.5
        }
        self.pdf = None
        self.pages = []
        self.output_dir = "pdf_output"
        # Create the output directory
        os.makedirs(self.output_dir, exist_ok=True)

    def open_pdf(self):
        """Open the PDF file."""
        try:
            self.pdf = pdfplumber.open(self.pdf_path, laparams=self.laparams)
            self.pages = self.pdf.pages
            print(f"✓ Opened PDF: {os.path.basename(self.pdf_path)}")
            print(f"  Total pages: {len(self.pages)}")
            return True
        except Exception as e:
            print(f"✗ Could not open PDF: {e}")
            return False

    def close_pdf(self):
        """Close the PDF file."""
        if self.pdf:
            self.pdf.close()
            print("✓ PDF file closed")

    def extract_metadata(self):
        """Extract PDF metadata."""
        if not self.pdf:
            print("✗ PDF is not open")
            return None
        try:
            metadata = self.pdf.metadata
            print("\n📋 PDF metadata:")
            for key, value in metadata.items():
                if value:
                    print(f"  {key}: {value}")
            return metadata
        except Exception as e:
            print(f"✗ Failed to extract metadata: {e}")
            return None

    def extract_text(self, page_numbers=None, max_chars=1000):
        """Extract text content."""
        if not self.pages:
            print("✗ No pages to process")
            return ""
        all_text = ""
        pages_to_process = page_numbers or range(len(self.pages))
        print("\n📄 Extracting text...")
        for idx in pages_to_process:
            if 0 <= idx < len(self.pages):
                try:
                    page = self.pages[idx]
                    page_text = self._safe_extract_text(page)
                    if page_text:
                        page_num = idx + 1
                        all_text += f"\n=== Page {page_num} ===\n"
                        all_text += page_text + "\n"
                        print(f"  ✓ Page {page_num}: {len(page_text)} characters")
                    else:
                        print(f"  ? Page {idx + 1}: no text")
                except Exception as e:
                    print(f"  ✗ Page {idx + 1}: error - {e}")
            else:
                print(f"  ⚠ Page {idx + 1}: page does not exist")
        if all_text:
            print("\n📊 Text extraction summary:")
            print(f"  Total characters: {len(all_text)}")
            if max_chars > 0:
                print(f"\n📝 Preview of the extracted text ({max_chars} characters):")
                print(all_text[:max_chars] + "...")
            # Save the full text to a file
            text_file = os.path.join(self.output_dir, "extracted_text.txt")
            with open(text_file, "w", encoding="utf-8") as f:
                f.write(all_text)
            print(f"\n✓ Full text saved to: {text_file}")
        return all_text

    def extract_tables(self, page_numbers=None, export_to_csv=True):
        """Extract tables."""
        if not self.pages:
            print("✗ No pages to process")
            return []
        all_tables = []
        pages_to_process = page_numbers or range(len(self.pages))
        table_count = 0
        print("\n📊 Extracting tables...")
        for idx in pages_to_process:
            if 0 <= idx < len(self.pages):
                try:
                    page = self.pages[idx]
                    tables = page.extract_tables()
                    if tables:
                        page_num = idx + 1
                        print(f"\n--- Page {page_num} tables ---")
                        for table_idx, table in enumerate(tables):
                            table_count += 1
                            print(f"  Table {table_idx + 1}:")
                            print(f"    Rows: {len(table)}, columns: {len(table[0]) if table else 0}")
                            # Show the first three rows of the table
                            for row in table[:3]:
                                cleaned_row = [cell if cell else "" for cell in row]
                                print(f"    {cleaned_row}")
                            if len(table) > 3:
                                print(f"    ... ({len(table)} rows in total)")
                            all_tables.append((page_num, table_idx + 1, table))
                            # Export the table to CSV
                            if export_to_csv and table:
                                try:
                                    csv_file = os.path.join(
                                        self.output_dir,
                                        f"table_page{page_num}_{table_idx + 1}.csv")
                                    # Use the first row as the header when there is more than one row
                                    header = table[0] if len(table) > 1 else [
                                        f"Column{i + 1}" for i in range(len(table[0]))]
                                    df = pd.DataFrame(table[1:], columns=header)
                                    df.to_csv(csv_file, index=False, encoding="utf-8-sig")
                                    print(f"    ✓ Exported to: {csv_file}")
                                except Exception as e:
                                    print(f"    ✗ Export failed: {e}")
                    else:
                        print(f"  ? Page {idx + 1}: no tables")
                except Exception as e:
                    print(f"  ✗ Page {idx + 1}: error - {e}")
        print("\n📊 Table extraction summary:")
        print(f"  Total tables: {table_count}")
        return all_tables

    def extract_images(self, page_numbers=None):
        """Extract images."""
        if not self.pages:
            print("✗ No pages to process")
            return 0
        image_count = 0
        pages_to_process = page_numbers or range(len(self.pages))
        print("\n🖼️ Extracting images...")
        for idx in pages_to_process:
            if 0 <= idx < len(self.pages):
                try:
                    page = self.pages[idx]
                    images = page.images
                    if images:
                        page_num = idx + 1
                        print(f"\n--- Page {page_num} images ---")
                        for img_idx, img in enumerate(images):
                            image_count += 1
                            # page.images exposes coordinates as x0/top/x1/bottom
                            bbox = (img['x0'], img['top'], img['x1'], img['bottom'])
                            print(f"  Image {img_idx + 1}:")
                            print(f"    Position: {bbox}")
                            print(f"    Size: {img['width']}x{img['height']}")
                            img_file = os.path.join(
                                self.output_dir,
                                f"image_page{page_num}_{img_idx + 1}.png")
                            try:
                                try:
                                    # Decode the embedded image stream directly
                                    # (works for JPEG/DCT-encoded streams)
                                    raw = img['stream'].get_data()
                                    Image.open(io.BytesIO(raw)).save(img_file)
                                except Exception:
                                    # Fall back to rendering the image region of
                                    # the page, clamped to the page bounding box
                                    px0, ptop, px1, pbottom = page.bbox
                                    clamped = (max(bbox[0], px0), max(bbox[1], ptop),
                                               min(bbox[2], px1), min(bbox[3], pbottom))
                                    page.crop(clamped).to_image(resolution=150).save(img_file)
                                print(f"    ✓ Saved to: {img_file}")
                            except Exception as e:
                                print(f"    ✗ Save failed: {e}")
                    else:
                        print(f"  ? Page {idx + 1}: no images")
                except Exception as e:
                    print(f"  ✗ Page {idx + 1}: error - {e}")
        print("\n📊 Image extraction summary:")
        print(f"  Total images: {image_count}")
        return image_count

    def _safe_extract_text(self, page):
        """Extract text safely, working around FontBBox errors."""
        try:
            return page.extract_text()
        except Exception:
            # Retry with relaxed tolerances if the first attempt fails
            try:
                return page.extract_text(x_tolerance=3, y_tolerance=3)
            except Exception:
                return None

    def batch_process(self, extract_text=True, extract_tables=True, extract_images=True, page_numbers=None):
        """Process the PDF in a single pass."""
        start_time = datetime.now()
        print(f"\n🚀 Starting batch processing - {start_time.strftime('%H:%M:%S')}")
        print(f"  Output directory: {self.output_dir}")
        if not self.open_pdf():
            return False
        try:
            # Extract metadata
            self.extract_metadata()
            # Extract text
            if extract_text:
                self.extract_text(page_numbers)
            # Extract tables
            if extract_tables:
                self.extract_tables(page_numbers)
            # Extract images
            if extract_images:
                self.extract_images(page_numbers)
            end_time = datetime.now()
            duration = (end_time - start_time).total_seconds()
            print(f"\n🎉 Processing finished - took {duration:.2f} seconds")
            print(f"📁 All files saved to: {self.output_dir}")
            return True
        finally:
            self.close_pdf()


def parse_arguments():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(description='PDF content extraction tool')
    parser.add_argument('pdf_path', nargs='?', default=r'c:\Users\czliu\Documents\python\慢慢变富.pdf',
                        help='Path to the PDF file')
    parser.add_argument('--text', action='store_true', help='Extract text only')
    parser.add_argument('--tables', action='store_true', help='Extract tables only')
    parser.add_argument('--images', action='store_true', help='Extract images only')
    parser.add_argument('--pages', type=str, help='Page range (e.g. 1-5,7,9-10)')
    parser.add_argument('--all', action='store_true', help='Extract everything (default)')
    return parser.parse_args()


def parse_page_numbers(page_str, total_pages):
    """Parse a page-range string into zero-based page indices."""
    if not page_str:
        return None
    page_numbers = []
    parts = page_str.split(',')
    for part in parts:
        part = part.strip()
        if '-' in part:
            start, end = part.split('-')
            try:
                start = int(start.strip())
                end = int(end.strip())
                page_numbers.extend(range(start - 1, min(end, total_pages)))
            except ValueError:
                print(f"⚠ Invalid page range: {part}")
        else:
            try:
                page = int(part.strip())
                if 1 <= page <= total_pages:
                    page_numbers.append(page - 1)
            except ValueError:
                print(f"⚠ Invalid page number: {part}")
    return sorted(set(page_numbers))


if __name__ == "__main__":
    args = parse_arguments()
    # Decide which content to extract
    extract_text = args.text or args.all or not (args.tables or args.images)
    extract_tables = args.tables or args.all or not (args.text or args.images)
    extract_images = args.images or args.all or not (args.text or args.tables)
    # Create the processor
    processor = PDFProcessor(args.pdf_path)
    # Parse the page range
    page_numbers = None
    if args.pages:
        # Open the PDF first to get the total number of pages
        if processor.open_pdf():
            page_numbers = parse_page_numbers(args.pages, len(processor.pages))
            processor.close_pdf()
    # Run the batch processing
    processor.batch_process(
        extract_text=extract_text,
        extract_tables=extract_tables,
        extract_images=extract_images,
        page_numbers=page_numbers
    )
    print("\n📋 Usage:")
    print("  python treat_pdf_file.py --text              # extract text only")
    print("  python treat_pdf_file.py --tables            # extract tables only")
    print("  python treat_pdf_file.py --images            # extract images only")
    print("  python treat_pdf_file.py --pages 1-5         # extract selected pages")
    print("  python treat_pdf_file.py your_file.pdf       # process a specific PDF")
    print("  python treat_pdf_file.py your_file.pdf --all # extract everything")
```

### 2. Usage instructions

1. **Install dependencies**:

   ```bash
   pip install pdfplumber pillow pandas openpyxl
   ```

2. **Run the program**:

   ```bash
   python treat_pdf_file.py
   ```
- Command-line options:
  - `--text`: extract text only
  - `--tables`: extract tables only
  - `--images`: extract images only
  - `--pages`: select a page range (e.g. `1-5` or `1,3,5`; see the sketch below)
  - `--all`: extract everything (default)
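
For reference, the `--pages` string is converted into zero-based page indices by the `parse_page_numbers` helper before the pages are processed. A minimal sketch of that conversion, assuming the script is saved as `treat_pdf_file.py` as in the usage examples (the total of 20 pages is just an example):

```python
from treat_pdf_file import parse_page_numbers

# "1-5,7,9-10" selects pages 1-5, 7 and 9-10 (1-based, as typed on the
# command line); the helper returns the corresponding zero-based indices.
indices = parse_page_numbers("1-5,7,9-10", total_pages=20)
print(indices)  # [0, 1, 2, 3, 4, 6, 8, 9]
```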
- Output:
  - All extracted content is saved to the `pdf_output` folder
  - Text is saved to `extracted_text.txt`
  - Tables are saved as `table_pageX_Y.csv`
  - Images are saved as `image_pageX_Y.png`
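
The `PDFProcessor` class can also be driven directly from Python instead of the command line. A minimal sketch, assuming the script is importable as `treat_pdf_file` and using the placeholder file name `report.pdf`:

```python
from treat_pdf_file import PDFProcessor

# Process only the first three pages (zero-based indices) and skip images.
processor = PDFProcessor("report.pdf")
processor.batch_process(
    extract_text=True,
    extract_tables=True,
    extract_images=False,
    page_numbers=[0, 1, 2],
)
# Results are written to the pdf_output/ folder in the current working directory.
```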
### 3. Key feature improvements

- Full object-oriented design: all functionality is encapsulated in the `PDFProcessor` class
- Command-line interface: flexible control through arguments
- Smart error handling: FontBBox and other common PDF errors are handled automatically
- Warning suppression: the logging configuration silences noisy warnings
- Output management: everything is written to a single `pdf_output` directory
- Detailed feedback: progress and results are reported in real time
- Safe extraction: a fallback text-extraction mechanism retries with relaxed tolerances
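
The layout-analysis behaviour can also be tuned per document by passing a custom `laparams` dict to `PDFProcessor`, which forwards it to `pdfplumber.open()`. A minimal sketch with purely illustrative values and a placeholder file name:

```python
from treat_pdf_file import PDFProcessor

# Passing laparams replaces the class defaults entirely, so list every
# parameter you care about. The values below are illustrative only.
custom_laparams = {
    "detect_vertical": True,  # keep vertically laid-out (e.g. CJK) text
    "char_margin": 1.5,
    "line_margin": 0.4,
}
processor = PDFProcessor("scanned_report.pdf", laparams=custom_laparams)
processor.batch_process(extract_images=False)
```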
