GH/utils/pdf_extractor.py

52 lines
1.6 KiB
Python
Raw Normal View History

import fitz # PyMuPDF
import pandas as pd
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages concatenated in page order.

    Raises:
        Exception: If the document cannot be opened or read. The underlying
            error is attached as ``__cause__``.
    """
    try:
        # The context manager closes the document even when reading a page
        # fails; a trailing close() call would be skipped in that case.
        with fitz.open(pdf_path) as doc:
            # join() avoids re-copying the accumulated text on every page.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        raise Exception(f"PDF文本提取失败: {str(e)}") from e
def extract_tables_from_pdf(pdf_path):
    """Extract table data from every page of a PDF.

    Table detection relies on PyMuPDF's ``Page.find_tables``. On PyMuPDF
    versions that do not provide it, pages are skipped and the result is
    an empty list.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list with one entry per detected table, in page order. Each entry
        is the value returned by ``Table.extract()`` for that table.

    Raises:
        Exception: If the document cannot be opened or processed. The
            underlying error is attached as ``__cause__``.
    """
    try:
        tables = []
        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # Looked up dynamically so that an older PyMuPDF without
                # table support degrades to "no tables" instead of failing.
                find_tables = getattr(page, "find_tables", None)
                if find_tables is None:
                    continue
                tables.extend(
                    table.extract() for table in find_tables().tables
                )
        return tables
    except Exception as e:
        raise Exception(f"PDF表格提取失败: {str(e)}") from e
def pdf_to_excel(pdf_path, output_path):
    """Export the text of a PDF to an Excel sheet, one paragraph per row.

    The extracted text is split on blank lines (``'\\n\\n'``); each chunk is
    stripped and empty chunks are dropped. The sheet has a 1-based paragraph
    number column and a content column.

    Args:
        pdf_path: Location of the source PDF file.
        output_path: Destination handed to ``DataFrame.to_excel``.

    Returns:
        True once the sheet has been written.

    Raises:
        Exception: If text extraction or the Excel export fails. The
            underlying error is attached as ``__cause__``.
    """
    try:
        text = extract_text_from_pdf(pdf_path)
        # Strip each chunk once, then discard the ones that end up empty.
        stripped = (chunk.strip() for chunk in text.split('\n\n'))
        paragraphs = [chunk for chunk in stripped if chunk]
        df = pd.DataFrame({
            '段落编号': range(1, len(paragraphs) + 1),
            '内容': paragraphs,
        })
        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        raise Exception(f"PDF转Excel失败: {str(e)}") from e