# GH/utils/ocr_processor.py

import pytesseract
from PIL import Image
import os

def extract_text_from_image(image_path, lang='chi_sim+eng', use_ai=False, ai_provider='aliyun'):
    """Extract text from an image via OCR.

    Args:
        image_path: Path of the image file to recognize.
        lang: Language spec passed to Tesseract; only used when ``use_ai``
            is false.
        use_ai: When true, delegate to an AI OCR provider instead of
            Tesseract.
        ai_provider: Name of the AI provider; only ``'aliyun'`` is supported.

    Returns:
        On the Tesseract path, the recognized text with leading/trailing
        whitespace stripped. On the AI path, whatever
        ``extract_text_with_aliyun(image_path, 'general')`` returns.

    Raises:
        Exception: On any failure (unsupported provider, unreadable image,
            OCR error). The message wraps the underlying error text and the
            underlying exception is chained as ``__cause__``.
    """
    try:
        if use_ai:
            # AI-backed OCR: reject unknown providers before importing anything.
            if ai_provider != 'aliyun':
                raise Exception(f"不支持的AI提供商: {ai_provider}")
            from .aliyun_ocr import extract_text_with_aliyun
            return extract_text_with_aliyun(image_path, 'general')

        # Classic Tesseract OCR.
        if os.name == 'nt':  # Windows
            # Point pytesseract at the default install location, but only if
            # the binary is actually there; otherwise keep the configured
            # command so an install reachable via PATH still works.
            default_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
            if os.path.isfile(default_cmd):
                pytesseract.pytesseract.tesseract_cmd = default_cmd

        # Use a context manager so the underlying image file is closed even
        # when recognition fails.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image, lang=lang)

        return text.strip()
    except Exception as e:
        raise Exception(f"图片文字识别失败: {str(e)}") from e

def extract_text_with_ai(image_path, provider='aliyun', ocr_type='general', options=None):
    """Recognize text in an image using an AI OCR provider.

    Args:
        image_path: Path of the image file to recognize.
        provider: Name of the AI provider; only ``'aliyun'`` is supported.
        ocr_type: Recognition type forwarded to the provider unchanged.
        options: Optional provider options forwarded unchanged
            (``None`` by default).

    Returns:
        Whatever ``extract_text_with_aliyun(image_path, ocr_type, options)``
        returns.

    Raises:
        Exception: On any failure, including an unsupported provider. The
            message wraps the underlying error text and the underlying
            exception is chained as ``__cause__``.
    """
    try:
        # Guard clause: fail fast on providers this module does not know.
        if provider != 'aliyun':
            raise Exception(f"不支持的AI提供商: {provider}")

        from .aliyun_ocr import extract_text_with_aliyun
        return extract_text_with_aliyun(image_path, ocr_type, options)
    except Exception as e:
        raise Exception(f"AI OCR识别失败: {str(e)}") from e

def image_to_text_file(image_path, output_path):
    """Run OCR on an image and save the recognized text to a text file.

    The text is obtained from ``extract_text_from_image`` with its default
    arguments and written to ``output_path`` as UTF-8, overwriting any
    existing file.

    Args:
        image_path: Path of the image file to recognize.
        output_path: Path of the text file to write.

    Returns:
        ``True`` when the file has been written.

    Raises:
        Exception: On any OCR or file-writing failure. The message wraps the
            underlying error text and the underlying exception is chained as
            ``__cause__``.
    """
    try:
        # OCR runs before the output file is opened, so a recognition failure
        # never leaves an empty or truncated file behind.
        text = extract_text_from_image(image_path)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
    except Exception as e:
        raise Exception(f"图片转文本文件失败: {str(e)}") from e

    return True

def image_to_excel(image_path, output_path):
    """Run OCR on an image and save the recognized lines to an Excel file.

    The text is obtained from ``extract_text_from_image`` with its default
    arguments. Each non-blank line (after stripping whitespace) becomes one
    row with two columns: ``行号`` (1-based line number) and ``内容``
    (line content).

    Args:
        image_path: Path of the image file to recognize.
        output_path: Path of the Excel file to write.

    Returns:
        ``True`` when the file has been written.

    Raises:
        Exception: On any failure (missing pandas, OCR error, write error).
            The message wraps the underlying error text and the underlying
            exception is chained as ``__cause__``.
    """
    try:
        # Imported lazily so pandas is only required by this function; an
        # ImportError is reported through the same wrapped exception.
        import pandas as pd

        text = extract_text_from_image(image_path)

        # Split into lines, dropping blank ones and surrounding whitespace.
        lines = [line.strip() for line in text.split('\n') if line.strip()]

        # One row per recognized line, numbered from 1.
        df = pd.DataFrame({
            '行号': range(1, len(lines) + 1),
            '内容': lines
        })

        df.to_excel(output_path, index=False)
    except Exception as e:
        raise Exception(f"图片转Excel失败: {str(e)}") from e

    return True