# GH/diagnose_ocr.py

#!/usr/bin/env python3
"""
OCR功能诊断脚本
检查Tesseract OCR的安装和配置状态
"""

import os
import sys
import tempfile
from pathlib import Path

def check_tesseract_installation():
    """Locate the Tesseract executable and report on stdout where it was found.

    A fixed list of Windows install locations is probed first; if none of
    them exists, the system PATH is searched for a ``tesseract`` command.

    Returns:
        The first matching path as a string, or ``None`` when Tesseract was
        found neither in the default locations nor on PATH.
    """
    import shutil

    print("🔍 检查Tesseract OCR安装状态...")

    # Default install locations, in probe order.
    install_candidates = (
        r"C:\Program Files\Tesseract-OCR\tesseract.exe",
        r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
        r"D:\Program Files\Tesseract-OCR\tesseract.exe",
        r"D:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
    )

    default_hit = next(
        (candidate for candidate in install_candidates if os.path.exists(candidate)),
        None,
    )
    if default_hit:
        print(f"✅ Tesseract找到: {default_hit}")
        return default_hit

    print("❌ Tesseract未找到在默认路径")

    # Fall back to whatever `tesseract` resolves to on the system PATH.
    on_path = shutil.which("tesseract")
    if not on_path:
        print("❌ Tesseract未在系统PATH中找到")
        return None

    print(f"✅ Tesseract在PATH中找到: {on_path}")
    return on_path

def check_python_dependencies():
    """Report on stdout whether each OCR-related Python package can be imported.

    ``pytesseract``, ``PIL`` and ``pandas`` are imported in that order. A
    successful import prints the package version (``PIL`` and ``pandas``) or
    a plain "installed" marker (``pytesseract``); an ``ImportError`` prints
    the error text instead. Nothing is returned.
    """
    print("\n🐍 检查Python依赖...")

    # (module name, whether its __version__ is shown on success)
    wanted = (
        ("pytesseract", False),
        ("PIL", True),
        ("pandas", True),
    )

    for dep, show_version in wanted:
        try:
            module = __import__(dep)
        except ImportError as e:
            print(f"❌ {dep}: 未安装 - {e}")
            continue

        status = module.__version__ if show_version else "已安装"
        print(f"✅ {dep}: {status}")
def create_test_image():
    """Render a PNG with Chinese and Latin sample text for the OCR test.

    A 400x200 white RGB canvas is drawn with four lines of black text and
    saved as ``ocr_test_image.png`` in the system temp directory. Progress
    and failures are reported on stdout.

    Returns:
        The path of the saved image, or ``None`` if the image could not be
        created (for example Pillow is not installed or the file cannot be
        written).
    """
    print("\n🖼️ 创建测试图片...")

    try:
        from PIL import Image, ImageDraw, ImageFont

        # Blank white canvas to draw on.
        img = Image.new('RGB', (400, 200), color='white')
        d = ImageDraw.Draw(img)

        # CJK-capable fonts come first: the sample text contains Chinese, and
        # Arial has no CJK glyphs, so picking it would render that line as
        # missing-glyph boxes and make the chi_sim OCR check meaningless.
        fonts_to_try = [
            "msyh.ttc",    # Microsoft YaHei
            "simhei.ttf",  # SimHei
            "C:\\Windows\\Fonts\\simhei.ttf",
            "arial.ttf",
            "Arial.ttf",
            "C:\\Windows\\Fonts\\arial.ttf",
        ]

        font = None
        for font_path in fonts_to_try:
            try:
                font = ImageFont.truetype(font_path, 24)
            except (OSError, ImportError):
                # OSError: the font file could not be read. ImportError is
                # presumably what Pillow raises without FreeType support.
                # Anything else (e.g. KeyboardInterrupt) is not swallowed.
                continue
            print(f"✅ 字体找到: {font_path}")
            break

        if font is None:
            print("⚠️ 未找到合适字体，使用默认字体")
            font = ImageFont.load_default()

        # Sample text: one Chinese line plus Latin letters and digits.
        text_lines = [
            "OCR测试文字",
            "Hello World",
            "1234567890",
            "ABCDEFGHIJKLMN",
        ]

        y_position = 30
        for line in text_lines:
            try:
                d.text((50, y_position), line, fill="black", font=font)
            except UnicodeEncodeError:
                # NOTE(review): Pillow's legacy bitmap default font appears to
                # be Latin-1 only; skip the line it cannot encode rather than
                # abandoning the whole image.
                print(f"⚠️ 当前字体无法绘制该行，已跳过: {line}")
            y_position += 40

        # Save under a fixed name in the temp directory.
        test_image_path = os.path.join(tempfile.gettempdir(), "ocr_test_image.png")
        img.save(test_image_path, "PNG")

        print(f"✅ 测试图片已创建: {test_image_path}")
        print(f"   图片大小: {os.path.getsize(test_image_path)} 字节")

        return test_image_path

    except Exception as e:
        # Diagnostic boundary: report any failure and let the caller continue.
        print(f"❌ 创建测试图片失败: {e}")
        return None

def test_ocr_functionality(image_path):
    """Run Tesseract on the given image in several languages and print the results.

    The Tesseract executable is located with ``check_tesseract_installation``
    and, if found, handed to ``pytesseract``. The image is then recognised
    with ``eng``, ``chi_sim`` and ``eng+chi_sim``; each language is tried
    independently so one failure does not stop the others. Finally the image
    mode is printed and the file's integrity is verified.

    Args:
        image_path: Path of the image to recognise. A falsy value or a path
            that does not exist is reported and the function returns early.

    Returns:
        None. All findings are printed to stdout; no exception escapes.
    """
    print("\n🔤 测试OCR识别功能...")

    if not image_path or not os.path.exists(image_path):
        print("❌ 测试图片不存在")
        return

    try:
        import pytesseract
        from PIL import Image

        # Point pytesseract at the located executable, if any.
        tesseract_path = check_tesseract_installation()
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

        # The context manager closes the image file when we are done.
        with Image.open(image_path) as image:
            print(f"✅ 图片格式: {image.format}, 大小: {image.size}")

            languages = ['eng', 'chi_sim', 'eng+chi_sim']

            for lang in languages:
                try:
                    print(f"\n   测试语言: {lang}")
                    text = pytesseract.image_to_string(image, lang=lang)

                    if text.strip():
                        print("   ✅ 识别成功:")
                        print(f"      {text.strip()}")
                    else:
                        print("   ⚠️ 识别无结果")

                except Exception as e:
                    # e.g. missing language pack or missing Tesseract binary.
                    print(f"   ❌ 语言 {lang} 识别失败: {e}")

            print("\n📊 图片数据检查:")
            print(f"   模式: {image.mode}")
            print(f"   通道: {image.mode}")

        # verify() must be called directly after opening a file and leaves the
        # image unusable afterwards, so it runs on a fresh, throwaway handle.
        # Verifying the handle already used for OCR reports a false failure.
        try:
            with Image.open(image_path) as fresh_image:
                fresh_image.verify()
            print("   ✅ 图片验证通过")
        except Exception as e:
            print(f"   ❌ 图片验证失败: {e}")

    except Exception as e:
        # Diagnostic boundary: covers missing packages and unreadable images.
        print(f"❌ OCR测试失败: {e}")

def check_system_environment():
    """Print basic facts about the runtime environment.

    Reports ``os.name``, the Python version string, the current working
    directory and the system temp directory, one per line. Returns nothing.
    """
    print("\n💻 检查系统环境...")

    # (label, value) pairs, printed in this order.
    facts = (
        ("操作系统", os.name),
        ("Python版本", sys.version),
        ("当前目录", os.getcwd()),
        ("TMP目录", tempfile.gettempdir()),
    )
    for label, value in facts:
        print(f"   {label}: {value}")

def main():
    """Run every diagnostic step in order, then print remediation advice.

    Steps: system environment, Python dependencies, Tesseract lookup, test
    image creation and, if an image was produced, the OCR test itself. The
    advice printed afterwards depends on whether the Tesseract lookup
    returned a path.
    """
    rule = "=" * 60

    # Shown when no Tesseract executable was located.
    install_guide = """
🔧 Tesseract OCR未安装，请按以下步骤安装：

1. 下载Tesseract OCR:
   - 官方地址: https://github.com/UB-Mannheim/tesseract/wiki
   - 选择Windows版本下载

2. 安装步骤:
   - 运行安装程序
   - 安装到默认路径: C:\\Program Files\\Tesseract-OCR\\
   - 安装时勾选"Add to PATH"选项
   - 安装中文语言包（可选但推荐）

3. 验证安装:
   - 重新启动命令行
   - 运行: tesseract --version
   - 应该显示版本信息
"""

    # Shown when Tesseract was located.
    troubleshooting = """
✅ Tesseract已安装，问题可能在于：

1. 图片格式问题
   - 确保上传的图片格式正确（PNG, JPG等）
   - 图片包含清晰可读的文字

2. 语言包问题
   - 确保安装了中文语言包（chi_sim）
   - 可以尝试只使用英文识别

3. 权限问题
   - 确保应用有权限访问临时文件
"""

    print(rule)
    print("OCR功能诊断工具")
    print(rule)

    check_system_environment()
    check_python_dependencies()
    tesseract_path = check_tesseract_installation()

    # The OCR test only makes sense if the sample image was created.
    test_image_path = create_test_image()
    if test_image_path:
        test_ocr_functionality(test_image_path)

    print("\n" + rule)
    print("💡 解决方案建议")
    print(rule)

    print(troubleshooting if tesseract_path else install_guide)

    print("\n🔄 临时解决方案:")
    print("   在应用中暂时禁用OCR功能，或使用在线OCR服务")

# Run the diagnostics only when executed as a script, not when imported.
if __name__ == "__main__":
    main()