52 lines
1.6 KiB
Python
52 lines
1.6 KiB
Python
|
|
import fitz # PyMuPDF
|
||
|
|
import pandas as pd
|
||
|
|
|
||
|
|
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        The ``get_text()`` output of all pages, concatenated in page order
        with no extra separator inserted between pages.

    Raises:
        Exception: If the document cannot be opened or a page cannot be
            read. The message starts with "PDF文本提取失败: " followed by the
            text of the underlying error, which is attached as ``__cause__``.
    """
    try:
        # The context manager closes the document even when reading a page
        # raises, so the file handle is never leaked on the error path.
        with fitz.open(pdf_path) as doc:
            # Build the result with a single join instead of repeated
            # string concatenation.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        raise Exception(f"PDF文本提取失败: {str(e)}") from e
|
||
|
|
|
||
|
|
def extract_tables_from_pdf(pdf_path):
    """Extract table data from a PDF (placeholder implementation).

    No table detection is implemented yet: each page's text is read, but
    nothing is derived from it, so the returned list is always empty.

    Args:
        pdf_path: Location of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A list intended to hold the extracted tables. Currently always ``[]``.

    Raises:
        Exception: If the document cannot be opened or a page cannot be
            read. The message starts with "PDF表格提取失败: " followed by the
            text of the underlying error, which is attached as ``__cause__``.
    """
    tables = []
    try:
        # The context manager closes the document even when reading a page
        # raises, so the file handle is never leaked on the error path.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # Simple stand-in for real table extraction: the page text is
                # still read so that unreadable pages surface as errors, but
                # the result is discarded.
                # TODO: add table detection and extraction logic here and
                # append the results to ``tables``.
                page.get_text("text")
    except Exception as e:
        raise Exception(f"PDF表格提取失败: {str(e)}") from e
    return tables
|
||
|
|
|
||
|
|
def pdf_to_excel(pdf_path, output_path):
    """Export the text content of a PDF to an Excel workbook.

    The extracted text is split on blank lines (``'\\n\\n'``); each non-empty,
    stripped chunk becomes one row with a 1-based paragraph number.

    Args:
        pdf_path: Location of the PDF file to read.
        output_path: Destination passed to ``DataFrame.to_excel``.

    Returns:
        ``True`` once the workbook has been written. Failures are reported
        by raising, never by returning ``False``.

    Raises:
        Exception: If text extraction or writing the workbook fails. The
            message starts with "PDF转Excel失败: " followed by the text of the
            underlying error, which is attached as ``__cause__``.
    """
    try:
        text = extract_text_from_pdf(pdf_path)

        # Split into paragraphs and drop chunks that are empty after stripping.
        paragraphs = [
            chunk.strip() for chunk in text.split('\n\n') if chunk.strip()
        ]

        # One row per paragraph; the column headers are part of the output
        # file and must stay exactly as they are.
        df = pd.DataFrame({
            '段落编号': range(1, len(paragraphs) + 1),
            '内容': paragraphs,
        })

        # index=False keeps the DataFrame index out of the sheet.
        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        raise Exception(f"PDF转Excel失败: {str(e)}") from e
|