GH/utils/ocr_processor.py
AI Developer 2ec2c0a1ab feat: 完整的数据提取与转换器项目
- 添加MDF文件导出功能
- 集成阿里云OCR大模型识别
- 添加百度智能云AI照片评分
- 集成DeepSeek大模型创意文案生成
- 完善文档和配置管理
- 使用uv进行现代化依赖管理
- 添加完整的.gitignore配置
2026-01-08 20:25:49 +08:00

73 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pytesseract
from PIL import Image
import os
def extract_text_from_image(image_path, lang='chi_sim+eng', use_ai=False, ai_provider='aliyun'):
    """Extract text from an image via OCR.

    Args:
        image_path: Path of the image file to recognize.
        lang: Tesseract language spec; only used when ``use_ai`` is false.
        use_ai: When true, delegate to an AI OCR provider instead of
            running Tesseract.
        ai_provider: Name of the AI provider. Only ``'aliyun'`` is supported.

    Returns:
        On the Tesseract path, the recognized text with leading and trailing
        whitespace stripped. On the AI path, whatever
        ``extract_text_with_aliyun`` returns.

    Raises:
        Exception: On any failure. The message is prefixed with
            "图片文字识别失败: " and the original error is chained as
            ``__cause__``.
    """
    try:
        if use_ai:
            # OCR through an AI large-model provider.
            if ai_provider == 'aliyun':
                from .aliyun_ocr import extract_text_with_aliyun
                return extract_text_with_aliyun(image_path, 'general')
            raise Exception(f"不支持的AI提供商: {ai_provider}")

        # Classic Tesseract OCR.
        if os.name == 'nt':  # Windows
            # Only point pytesseract at the default install location when it
            # really exists, so a PATH-based or pre-configured installation
            # is not overridden by a dead path.
            default_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
            if os.path.exists(default_cmd):
                pytesseract.pytesseract.tesseract_cmd = default_cmd

        # The context manager closes the underlying file handle once OCR
        # has finished, even if recognition raises.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang)
        return text.strip()
    except Exception as e:
        # Keep the generic exception type callers already catch, but chain
        # the root cause so the original traceback is preserved.
        raise Exception(f"图片文字识别失败: {str(e)}") from e
def extract_text_with_ai(image_path, provider='aliyun', ocr_type='general', options=None):
    """Recognize text in an image using an AI OCR provider.

    Args:
        image_path: Path of the image file to recognize.
        provider: Name of the AI provider. Only ``'aliyun'`` is supported.
        ocr_type: Recognition type forwarded unchanged to the provider.
        options: Extra options forwarded unchanged to the provider.

    Returns:
        Whatever ``extract_text_with_aliyun`` returns.

    Raises:
        Exception: On any failure, including an unsupported ``provider``.
            The message is prefixed with "AI OCR识别失败: " and the original
            error is chained as ``__cause__``.
    """
    try:
        if provider == 'aliyun':
            # Imported lazily so the provider module is only loaded when used.
            from .aliyun_ocr import extract_text_with_aliyun
            return extract_text_with_aliyun(image_path, ocr_type, options)
        raise Exception(f"不支持的AI提供商: {provider}")
    except Exception as e:
        # Keep the generic exception type callers already catch, but chain
        # the root cause so the original traceback is preserved.
        raise Exception(f"AI OCR识别失败: {str(e)}") from e
def image_to_text_file(image_path, output_path):
    """Run OCR on an image and save the recognized text to a text file.

    Args:
        image_path: Path of the image file to recognize.
        output_path: Path of the UTF-8 text file to write.

    Returns:
        True when the file has been written.

    Raises:
        Exception: On any OCR or file-writing failure. The message is
            prefixed with "图片转文本文件失败: " and the original error is
            chained as ``__cause__``.
    """
    try:
        # Uses the default OCR settings of extract_text_from_image.
        text = extract_text_from_image(image_path)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        return True
    except Exception as e:
        # Keep the generic exception type callers already catch, but chain
        # the root cause so the original traceback is preserved.
        raise Exception(f"图片转文本文件失败: {str(e)}") from e
def image_to_excel(image_path, output_path):
    """Run OCR on an image and save the recognized lines to an Excel file.

    The recognized text is split on newlines; blank lines are dropped and
    every remaining line is stripped. The sheet has two columns: a 1-based
    line number ("行号") and the line content ("内容").

    Args:
        image_path: Path of the image file to recognize.
        output_path: Path of the Excel file to write.

    Returns:
        True when the file has been written.

    Raises:
        Exception: On any OCR, pandas, or file-writing failure. The message
            is prefixed with "图片转Excel失败: " and the original error is
            chained as ``__cause__``.
    """
    try:
        # Imported lazily so pandas is only required when this export is used.
        import pandas as pd

        # Uses the default OCR settings of extract_text_from_image.
        text = extract_text_from_image(image_path)

        # One row per non-empty line of recognized text.
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        df = pd.DataFrame({
            '行号': range(1, len(lines) + 1),
            '内容': lines,
        })
        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        # Keep the generic exception type callers already catch, but chain
        # the root cause so the original traceback is preserved.
        raise Exception(f"图片转Excel失败: {str(e)}") from e