- 添加MDF文件导出功能 - 集成阿里云OCR大模型识别 - 添加百度智能云AI照片评分 - 集成DeepSeek大模型创意文案生成 - 完善文档和配置管理 - 使用uv进行现代化依赖管理 - 添加完整的.gitignore配置
73 lines
2.5 KiB
Python
73 lines
2.5 KiB
Python
import pytesseract
|
||
from PIL import Image
|
||
import os
|
||
|
||
def extract_text_from_image(image_path, lang='chi_sim+eng', use_ai=False, ai_provider='aliyun'):
    """Extract text from an image via OCR.

    Args:
        image_path: Path of the image. It is handed to ``Image.open`` on the
            Tesseract path, or to ``extract_text_with_aliyun`` on the AI path.
        lang: Language spec forwarded to ``pytesseract.image_to_string``.
            Only used when ``use_ai`` is false.
        use_ai: When true, delegate recognition to the AI provider instead of
            running Tesseract locally.
        ai_provider: Name of the AI backend. Only ``'aliyun'`` is supported.

    Returns:
        On the Tesseract path, the recognised text with surrounding whitespace
        stripped. On the AI path, whatever ``extract_text_with_aliyun``
        returns for the ``'general'`` OCR type.

    Raises:
        Exception: On any failure (unsupported provider, unreadable image,
            OCR error). The message is prefixed with a fixed Chinese label and
            the original error is chained as ``__cause__``.
    """
    try:
        if use_ai:
            # OCR through an AI large-model service.
            if ai_provider != 'aliyun':
                raise ValueError(f"不支持的AI提供商: {ai_provider}")
            from .aliyun_ocr import extract_text_with_aliyun
            return extract_text_with_aliyun(image_path, 'general')

        # Classic Tesseract OCR.
        if os.name == 'nt':  # Windows
            # Point pytesseract at the default installer location, but only
            # when the binary is really there; otherwise keep whatever is
            # already configured (e.g. a tesseract found on PATH).
            windows_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
            if os.path.isfile(windows_cmd):
                pytesseract.pytesseract.tesseract_cmd = windows_cmd

        # The context manager closes the underlying file handle as soon as
        # recognition is done instead of leaving it to garbage collection.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang)

        return text.strip()
    except Exception as e:
        # Keep the original error reachable through __cause__.
        raise Exception(f"图片文字识别失败: {e}") from e
|
||
|
||
def extract_text_with_ai(image_path, provider='aliyun', ocr_type='general', options=None):
    """Recognise text in an image with an AI large-model OCR service.

    Args:
        image_path: Path of the image, forwarded unchanged to the provider
            helper.
        provider: Name of the AI backend. Only ``'aliyun'`` is supported.
        ocr_type: OCR mode forwarded to ``extract_text_with_aliyun``.
        options: Optional extra settings forwarded to
            ``extract_text_with_aliyun``; their schema is defined by that
            helper, not here.

    Returns:
        Whatever ``extract_text_with_aliyun`` returns.

    Raises:
        Exception: On any failure, including an unsupported ``provider``. The
            message is prefixed with a fixed label and the original error is
            chained as ``__cause__``.
    """
    try:
        if provider != 'aliyun':
            raise ValueError(f"不支持的AI提供商: {provider}")
        # Imported lazily so the Aliyun dependency is only needed when used.
        from .aliyun_ocr import extract_text_with_aliyun
        return extract_text_with_aliyun(image_path, ocr_type, options)
    except Exception as e:
        # Keep the original error reachable through __cause__.
        raise Exception(f"AI OCR识别失败: {e}") from e
|
||
|
||
def image_to_text_file(image_path, output_path):
    """Run OCR on an image and save the recognised text to a text file.

    Args:
        image_path: Path of the image, passed to ``extract_text_from_image``
            with that function's default settings.
        output_path: Destination file. It is opened in write mode with UTF-8
            encoding, so existing content is replaced.

    Returns:
        ``True`` once the text has been written.

    Raises:
        Exception: On any OCR or file-writing failure. The message is
            prefixed with a fixed label and the original error is chained as
            ``__cause__``.
    """
    try:
        text = extract_text_from_image(image_path)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)

        return True
    except Exception as e:
        # Keep the original error reachable through __cause__.
        raise Exception(f"图片转文本文件失败: {e}") from e
|
||
|
||
def image_to_excel(image_path, output_path):
    """Run OCR on an image and save the recognised lines to an Excel file.

    The recognised text is split on newlines; blank lines are dropped and the
    rest are trimmed. The sheet gets two columns: a 1-based line number and
    the line content.

    Args:
        image_path: Path of the image, passed to ``extract_text_from_image``
            with that function's default settings.
        output_path: Destination path handed to ``DataFrame.to_excel``.

    Returns:
        ``True`` once the workbook has been written.

    Raises:
        Exception: On any failure (pandas missing, OCR error, write error).
            The message is prefixed with a fixed label and the original error
            is chained as ``__cause__``.
    """
    try:
        # Imported lazily so pandas is only required for Excel export.
        import pandas as pd

        text = extract_text_from_image(image_path)

        # One entry per non-empty line, without surrounding whitespace.
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        df = pd.DataFrame({
            '行号': range(1, len(lines) + 1),
            '内容': lines,
        })

        # index=False keeps the DataFrame index out of the sheet; the explicit
        # line-number column already serves that purpose.
        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        # Keep the original error reachable through __cause__.
        raise Exception(f"图片转Excel失败: {e}") from e