import fitz  # PyMuPDF
import pandas as pd


def extract_text_from_pdf(pdf_path) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The text of all pages joined in page order, with no extra separator
        inserted between pages.

    Raises:
        Exception: If the document cannot be opened or a page cannot be read.
            The underlying error is attached as ``__cause__``.
    """
    try:
        # The context manager closes the document even when a page fails to
        # load, which the previous explicit close() call did not guarantee.
        with fitz.open(pdf_path) as doc:
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        raise Exception(f"PDF文本提取失败: {str(e)}") from e


def extract_tables_from_pdf(pdf_path) -> list:
    """Extract table data from a PDF (placeholder implementation).

    Table detection has not been implemented yet: every page is visited and
    its text is read, but nothing is appended to the result, so the returned
    list is currently always empty.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A list intended to hold the extracted tables; empty for now.

    Raises:
        Exception: If the document cannot be opened or a page cannot be read.
            The underlying error is attached as ``__cause__``.
    """
    try:
        tables = []
        with fitz.open(pdf_path) as doc:
            for page in doc:
                # The page text is still read (and discarded) so that a page
                # which cannot be parsed keeps surfacing as an error, as it
                # did before.
                # TODO: add real table detection/extraction here and append
                # the results to ``tables``.
                page.get_text("text")
        return tables
    except Exception as e:
        raise Exception(f"PDF表格提取失败: {str(e)}") from e


def pdf_to_excel(pdf_path, output_path) -> bool:
    """Export the text content of a PDF to an Excel workbook.

    The extracted text is split into paragraphs on blank lines (``"\\n\\n"``);
    whitespace-only paragraphs are dropped. Each remaining paragraph becomes
    one row with a 1-based paragraph number and its content.

    Args:
        pdf_path: Location of the source PDF file.
        output_path: Destination handed to ``DataFrame.to_excel``.

    Returns:
        ``True`` when the workbook has been written.

    Raises:
        Exception: If text extraction or the Excel export fails. The
            underlying error is attached as ``__cause__``.
    """
    try:
        text = extract_text_from_pdf(pdf_path)

        # Split on blank lines and discard empty fragments.
        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

        df = pd.DataFrame({
            '段落编号': range(1, len(paragraphs) + 1),
            '内容': paragraphs,
        })
        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        raise Exception(f"PDF转Excel失败: {str(e)}") from e