# GH/utils/ocr_processor.py
#
# 73 lines
# 2.5 KiB
# Python
# Raw Normal View History

import pytesseract
from PIL import Image
import os
def extract_text_from_image(image_path, lang='chi_sim+eng', use_ai=False, ai_provider='aliyun'):
    """Extract text from an image via OCR.

    Args:
        image_path: Path of the image to recognise.
        lang: Tesseract language specification; only used when ``use_ai``
            is false.
        use_ai: When true, delegate to an AI OCR provider instead of
            Tesseract.
        ai_provider: Name of the AI provider. Only ``'aliyun'`` is
            supported.

    Returns:
        On the Tesseract path, the recognised text with leading/trailing
        whitespace stripped. On the AI path, whatever
        ``extract_text_with_aliyun(image_path, 'general')`` returns.

    Raises:
        Exception: If anything fails (unsupported provider, unreadable
            image, OCR error). The original error is attached as
            ``__cause__``.
    """
    try:
        if use_ai:
            # AI-model based OCR.
            if ai_provider != 'aliyun':
                raise Exception(f"不支持的AI提供商: {ai_provider}")
            # Imported lazily so the AI backend is only needed when requested.
            from .aliyun_ocr import extract_text_with_aliyun
            return extract_text_with_aliyun(image_path, 'general')

        # Classic Tesseract OCR.
        windows_tesseract = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
        # Only point pytesseract at the default Windows install location when
        # it actually exists; otherwise keep the current setting so a
        # tesseract binary found on PATH (or configured by the caller) works.
        if os.name == 'nt' and os.path.isfile(windows_tesseract):
            pytesseract.pytesseract.tesseract_cmd = windows_tesseract

        # Use the image as a context manager so the file handle is released
        # even if OCR raises.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang)
        return text.strip()
    except Exception as e:
        raise Exception(f"图片文字识别失败: {str(e)}") from e
def extract_text_with_ai(image_path, provider='aliyun', ocr_type='general', options=None):
    """Recognise text in an image using an AI OCR provider.

    Args:
        image_path: Path of the image to recognise.
        provider: Name of the AI provider. Only ``'aliyun'`` is supported.
        ocr_type: Recognition type, passed through to the provider.
        options: Optional provider-specific options, passed through as-is.

    Returns:
        Whatever ``extract_text_with_aliyun(image_path, ocr_type, options)``
        returns.

    Raises:
        Exception: If the provider is unsupported or recognition fails. The
            original error is attached as ``__cause__``.
    """
    try:
        if provider != 'aliyun':
            raise Exception(f"不支持的AI提供商: {provider}")
        # Imported lazily so the AI backend is only needed when requested.
        from .aliyun_ocr import extract_text_with_aliyun
        return extract_text_with_aliyun(image_path, ocr_type, options)
    except Exception as e:
        raise Exception(f"AI OCR识别失败: {str(e)}") from e
def image_to_text_file(image_path, output_path):
    """Run OCR on an image and save the recognised text to a text file.

    Args:
        image_path: Path of the image to recognise.
        output_path: Path of the UTF-8 text file to write (overwritten if it
            already exists).

    Returns:
        ``True`` when the file has been written.

    Raises:
        Exception: If OCR or writing the file fails. The original error is
            attached as ``__cause__``.
    """
    try:
        text = extract_text_from_image(image_path)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        return True
    except Exception as e:
        raise Exception(f"图片转文本文件失败: {str(e)}") from e
def image_to_excel(image_path, output_path):
    """Run OCR on an image and save the recognised lines to an Excel file.

    The sheet has two columns: ``行号`` (1-based line number) and ``内容``
    (line text). Blank lines in the OCR output are dropped and each kept
    line is stripped of surrounding whitespace.

    Args:
        image_path: Path of the image to recognise.
        output_path: Path of the Excel file to write.

    Returns:
        ``True`` when the file has been written.

    Raises:
        Exception: If pandas is unavailable, or OCR or writing the file
            fails. The original error is attached as ``__cause__``.
    """
    try:
        # Imported lazily so pandas is only required for Excel export.
        import pandas as pd

        text = extract_text_from_image(image_path)
        # Split into lines, discarding empty ones.
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        df = pd.DataFrame({
            '行号': range(1, len(lines) + 1),
            '内容': lines
        })
        df.to_excel(output_path, index=False)
        return True
    except Exception as e:
        raise Exception(f"图片转Excel失败: {str(e)}") from e