#!/usr/bin/env python3
"""Diagnostic script for the OCR feature.

Checks the installation and configuration state of Tesseract OCR: reports the
system environment, verifies the Python dependencies, locates the Tesseract
binary, renders a small test image and runs OCR on it in several languages.
All findings are printed to stdout; nothing is raised to the caller.
"""

import importlib
import os
import shutil
import sys
import tempfile
from pathlib import Path

# Common Tesseract install locations (Windows), probed in this order.
DEFAULT_TESSERACT_PATHS = (
    r"C:\Program Files\Tesseract-OCR\tesseract.exe",
    r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
    r"D:\Program Files\Tesseract-OCR\tesseract.exe",
    r"D:\Program Files (x86)\Tesseract-OCR\tesseract.exe",
)

# (module name, whether to print ``__version__``), in reporting order.
_DEPENDENCIES = (
    ("pytesseract", False),
    ("PIL", True),
    ("pandas", True),
)

# Fonts to try for the test image. CJK-capable fonts come first so that the
# Chinese test line is drawn with real glyphs instead of missing-glyph boxes.
_FONT_CANDIDATES = (
    "simhei.ttf",  # SimHei
    "msyh.ttc",  # Microsoft YaHei
    "C:\\Windows\\Fonts\\simhei.ttf",
    "arial.ttf",
    "Arial.ttf",
    "C:\\Windows\\Fonts\\arial.ttf",
)

# Mixed Chinese / English / digit lines drawn onto the test image.
_TEST_TEXT_LINES = (
    "OCR测试文字",
    "Hello World",
    "1234567890",
    "ABCDEFGHIJKLMN",
)

# Advice printed by main() when no Tesseract binary could be located.
_ADVICE_NOT_INSTALLED = """
🔧 Tesseract OCR未安装,请按以下步骤安装:

1. 下载Tesseract OCR:
   - 官方地址: https://github.com/UB-Mannheim/tesseract/wiki
   - 选择Windows版本下载

2. 安装步骤:
   - 运行安装程序
   - 安装到默认路径: C:\\Program Files\\Tesseract-OCR\\
   - 安装时勾选"Add to PATH"选项
   - 安装中文语言包(可选但推荐)

3. 验证安装:
   - 重新启动命令行
   - 运行: tesseract --version
   - 应该显示版本信息
"""

# Advice printed by main() when Tesseract was found but OCR may still fail.
_ADVICE_INSTALLED = """
✅ Tesseract已安装,问题可能在于:

1. 图片格式问题
   - 确保上传的图片格式正确(PNG, JPG等)
   - 图片包含清晰可读的文字

2. 语言包问题
   - 确保安装了中文语言包(chi_sim)
   - 可以尝试只使用英文识别

3. 权限问题
   - 确保应用有权限访问临时文件
"""


def check_tesseract_installation(candidate_paths=None):
    """Locate the Tesseract executable and report the result on stdout.

    Args:
        candidate_paths: Optional iterable of file paths to probe, in order.
            Defaults to ``DEFAULT_TESSERACT_PATHS``.

    Returns:
        The first candidate that is an existing file; otherwise the
        ``tesseract`` command found on the system PATH; otherwise ``None``.
    """
    print("🔍 检查Tesseract OCR安装状态...")

    if candidate_paths is None:
        candidate_paths = DEFAULT_TESSERACT_PATHS

    for candidate in candidate_paths:
        # isfile (not exists): a directory of that name is not the binary.
        if os.path.isfile(candidate):
            print(f"✅ Tesseract找到: {candidate}")
            return candidate

    print("❌ Tesseract未找到在默认路径")

    # Fall back to whatever "tesseract" resolves to on the system PATH.
    on_path = shutil.which("tesseract")
    if on_path:
        print(f"✅ Tesseract在PATH中找到: {on_path}")
        return on_path

    print("❌ Tesseract未在系统PATH中找到")
    return None


def check_python_dependencies():
    """Try to import each OCR-related Python package and print its status.

    Prints the version for PIL and pandas, and "installed" for pytesseract.
    A package that fails to import is reported together with the import
    error message. Returns ``None``.
    """
    print("\n🐍 检查Python依赖...")

    for name, show_version in _DEPENDENCIES:
        try:
            module = importlib.import_module(name)
        except ImportError as e:
            print(f"❌ {name}: 未安装 - {e}")
            continue

        if show_version:
            print(f"✅ {name}: {module.__version__}")
        else:
            print(f"✅ {name}: 已安装")


def _load_test_font(image_font, size=24):
    """Return the first loadable font from ``_FONT_CANDIDATES``.

    ``image_font`` is the ``PIL.ImageFont`` module. Falls back to Pillow's
    built-in default font when none of the candidates can be loaded.
    """
    for candidate in _FONT_CANDIDATES:
        try:
            font = image_font.truetype(candidate, size)
        except (OSError, ImportError):
            # OSError: font file missing/unreadable. ImportError: Pillow was
            # built without FreeType. Either way, try the next candidate.
            continue
        print(f"✅ 字体找到: {candidate}")
        return font

    print("⚠️ 未找到合适字体,使用默认字体")
    return image_font.load_default()


def create_test_image():
    """Render a small PNG containing Chinese and English test text.

    The image is written to ``ocr_test_image.png`` in the system temporary
    directory (overwriting any previous one).

    Returns:
        The path of the written image as a string, or ``None`` when the image
        could not be created (for example because Pillow is not installed).
    """
    print("\n🖼️ 创建测试图片...")

    try:
        from PIL import Image, ImageDraw, ImageFont

        img = Image.new("RGB", (400, 200), color="white")
        draw = ImageDraw.Draw(img)
        font = _load_test_font(ImageFont)

        y_position = 30
        for line in _TEST_TEXT_LINES:
            try:
                draw.text((50, y_position), line, fill="black", font=font)
            except UnicodeEncodeError:
                # NOTE(review): the legacy bitmap default font appears to
                # accept only Latin-1 text; skip the line rather than abort
                # the whole image. Confirm against the installed Pillow.
                print(f"⚠️ 当前字体无法绘制文字,已跳过: {line}")
            y_position += 40

        test_image_path = str(Path(tempfile.gettempdir()) / "ocr_test_image.png")
        img.save(test_image_path, "PNG")

        print(f"✅ 测试图片已创建: {test_image_path}")
        print(f" 图片大小: {os.path.getsize(test_image_path)} 字节")
        return test_image_path

    except Exception as e:
        # Diagnostic boundary: report any failure instead of crashing.
        print(f"❌ 创建测试图片失败: {e}")
        return None


def test_ocr_functionality(image_path, tesseract_path=None):
    """Run OCR on ``image_path`` in several languages and print the results.

    Args:
        image_path: Path of the image to recognise. When falsy or not
            existing, a message is printed and the function returns.
        tesseract_path: Optional path of the Tesseract executable. When
            ``None`` it is detected via ``check_tesseract_installation()``.

    Returns:
        ``None``. All outcomes, including failures, are printed.
    """
    print("\n🔤 测试OCR识别功能...")

    if not image_path or not os.path.exists(image_path):
        print("❌ 测试图片不存在")
        return

    try:
        import pytesseract
        from PIL import Image

        # Point pytesseract at the located binary (if any was found).
        if tesseract_path is None:
            tesseract_path = check_tesseract_installation()
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path

        with Image.open(image_path) as image:
            print(f"✅ 图片格式: {image.format}, 大小: {image.size}")

            for lang in ("eng", "chi_sim", "eng+chi_sim"):
                try:
                    print(f"\n 测试语言: {lang}")
                    text = pytesseract.image_to_string(image, lang=lang)
                    if text.strip():
                        print(" ✅ 识别成功:")
                        print(f" {text.strip()}")
                    else:
                        print(" ⚠️ 识别无结果")
                except Exception as e:
                    # A missing language pack must not stop the other runs.
                    print(f" ❌ 语言 {lang} 识别失败: {e}")

            print("\n📊 图片数据检查:")
            print(f" 模式: {image.mode}")
            print(f" 通道: {image.mode}")

        # verify() has to run directly after open(), so use a fresh handle
        # instead of the one whose pixel data was already loaded for OCR.
        try:
            with Image.open(image_path) as probe:
                probe.verify()
            print(" ✅ 图片验证通过")
        except Exception as e:
            print(f" ❌ 图片验证失败: {e}")

    except Exception as e:
        # Diagnostic boundary: report any failure instead of crashing.
        print(f"❌ OCR测试失败: {e}")


# The "test_" prefix is part of this script's public name; tell pytest not to
# collect the function as a test case if this module is ever star-imported.
test_ocr_functionality.__test__ = False


def check_system_environment():
    """Print OS name, Python version, working directory and temp directory."""
    print("\n💻 检查系统环境...")
    print(f" 操作系统: {os.name}")
    print(f" Python版本: {sys.version}")
    print(f" 当前目录: {os.getcwd()}")
    print(f" TMP目录: {tempfile.gettempdir()}")


def main():
    """Run every diagnostic step in order and print remediation advice."""
    banner = "=" * 60
    print(banner)
    print("OCR功能诊断工具")
    print(banner)

    check_system_environment()
    check_python_dependencies()
    tesseract_path = check_tesseract_installation()

    test_image_path = create_test_image()
    if test_image_path:
        # Reuse the detected path so the detection output is not repeated.
        test_ocr_functionality(test_image_path, tesseract_path)

    print("\n" + banner)
    print("💡 解决方案建议")
    print(banner)

    print(_ADVICE_INSTALLED if tesseract_path else _ADVICE_NOT_INSTALLED)

    print("\n🔄 临时解决方案:")
    print(" 在应用中暂时禁用OCR功能,或使用在线OCR服务")


if __name__ == "__main__":
    main()