229 lines
8.9 KiB
Python
229 lines
8.9 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
"""
|
|||
|
|
阿里云OCR服务集成
|
|||
|
|
使用阿里云AI大模型进行图片文字识别
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
import base64
|
|||
|
|
import json
|
|||
|
|
import os
|
|||
|
|
from dotenv import load_dotenv
|
|||
|
|
from alibabacloud_ocr_api20210707.client import Client as ocr_api20210707Client
|
|||
|
|
from alibabacloud_tea_openapi import models as open_api_models
|
|||
|
|
from alibabacloud_ocr_api20210707 import models as ocr_api20210707_models
|
|||
|
|
from alibabacloud_tea_util import models as util_models
|
|||
|
|
from alibabacloud_tea_util.client import Client as UtilClient
|
|||
|
|
|
|||
|
|
# 加载环境变量
|
|||
|
|
load_dotenv()
|
|||
|
|
|
|||
|
|
class AliyunOCR:
|
|||
|
|
"""阿里云OCR服务类"""
|
|||
|
|
|
|||
|
|
def __init__(self, access_key_id=None, access_key_secret=None, endpoint=None):
|
|||
|
|
"""初始化阿里云OCR客户端"""
|
|||
|
|
self.access_key_id = access_key_id or os.getenv('ALIYUN_ACCESS_KEY_ID')
|
|||
|
|
self.access_key_secret = access_key_secret or os.getenv('ALIYUN_ACCESS_KEY_SECRET')
|
|||
|
|
self.endpoint = endpoint or os.getenv('ALIYUN_OCR_ENDPOINT', 'ocr-api.cn-hangzhou.aliyuncs.com')
|
|||
|
|
|
|||
|
|
if not self.access_key_id or not self.access_key_secret:
|
|||
|
|
raise Exception("阿里云AccessKey未配置,请在.env文件中设置ALIYUN_ACCESS_KEY_ID和ALIYUN_ACCESS_KEY_SECRET")
|
|||
|
|
|
|||
|
|
# 创建配置对象
|
|||
|
|
config = open_api_models.Config(
|
|||
|
|
access_key_id=self.access_key_id,
|
|||
|
|
access_key_secret=self.access_key_secret
|
|||
|
|
)
|
|||
|
|
config.endpoint = self.endpoint
|
|||
|
|
|
|||
|
|
# 创建客户端
|
|||
|
|
self.client = ocr_api20210707Client(config)
|
|||
|
|
|
|||
|
|
def recognize_general(self, image_path):
|
|||
|
|
"""通用文字识别"""
|
|||
|
|
try:
|
|||
|
|
# 读取图片并编码为base64
|
|||
|
|
with open(image_path, 'rb') as image_file:
|
|||
|
|
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|||
|
|
|
|||
|
|
# 创建请求
|
|||
|
|
recognize_general_request = ocr_api20210707_models.RecognizeGeneralRequest(
|
|||
|
|
image_url='', # 使用image_data,所以这里留空
|
|||
|
|
body=util_models.RuntimeOptions()
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
# 设置图片数据
|
|||
|
|
recognize_general_request.body = image_data
|
|||
|
|
|
|||
|
|
# 发送请求
|
|||
|
|
response = self.client.recognize_general(recognize_general_request)
|
|||
|
|
|
|||
|
|
# 解析响应
|
|||
|
|
if response.body.code == 200:
|
|||
|
|
result = json.loads(response.body.data)
|
|||
|
|
return self._extract_text(result)
|
|||
|
|
else:
|
|||
|
|
raise Exception(f"阿里云OCR识别失败: {response.body.message}")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
raise Exception(f"阿里云OCR识别错误: {str(e)}")
|
|||
|
|
|
|||
|
|
def recognize_advanced(self, image_path, options=None):
|
|||
|
|
"""高级文字识别(支持更多功能)"""
|
|||
|
|
try:
|
|||
|
|
# 读取图片并编码为base64
|
|||
|
|
with open(image_path, 'rb') as image_file:
|
|||
|
|
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|||
|
|
|
|||
|
|
# 创建请求
|
|||
|
|
recognize_advanced_request = ocr_api20210707_models.RecognizeAdvancedRequest(
|
|||
|
|
image_url='',
|
|||
|
|
body=util_models.RuntimeOptions()
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
# 设置图片数据
|
|||
|
|
recognize_advanced_request.body = image_data
|
|||
|
|
|
|||
|
|
# 设置高级选项
|
|||
|
|
if options:
|
|||
|
|
if 'output_char_info' in options:
|
|||
|
|
recognize_advanced_request.output_char_info = options['output_char_info']
|
|||
|
|
if 'output_table' in options:
|
|||
|
|
recognize_advanced_request.output_table = options['output_table']
|
|||
|
|
if 'need_rotate' in options:
|
|||
|
|
recognize_advanced_request.need_rotate = options['need_rotate']
|
|||
|
|
|
|||
|
|
# 发送请求
|
|||
|
|
response = self.client.recognize_advanced(recognize_advanced_request)
|
|||
|
|
|
|||
|
|
# 解析响应
|
|||
|
|
if response.body.code == 200:
|
|||
|
|
result = json.loads(response.body.data)
|
|||
|
|
return self._extract_text(result)
|
|||
|
|
else:
|
|||
|
|
raise Exception(f"阿里云高级OCR识别失败: {response.body.message}")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
raise Exception(f"阿里云高级OCR识别错误: {str(e)}")
|
|||
|
|
|
|||
|
|
def recognize_table(self, image_path):
|
|||
|
|
"""表格识别"""
|
|||
|
|
try:
|
|||
|
|
# 读取图片并编码为base64
|
|||
|
|
with open(image_path, 'rb') as image_file:
|
|||
|
|
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|||
|
|
|
|||
|
|
# 创建请求
|
|||
|
|
recognize_table_request = ocr_api20210707_models.RecognizeTableRequest(
|
|||
|
|
image_url='',
|
|||
|
|
body=util_models.RuntimeOptions()
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
# 设置图片数据
|
|||
|
|
recognize_table_request.body = image_data
|
|||
|
|
|
|||
|
|
# 发送请求
|
|||
|
|
response = self.client.recognize_table(recognize_table_request)
|
|||
|
|
|
|||
|
|
# 解析响应
|
|||
|
|
if response.body.code == 200:
|
|||
|
|
result = json.loads(response.body.data)
|
|||
|
|
return self._extract_table_data(result)
|
|||
|
|
else:
|
|||
|
|
raise Exception(f"阿里云表格识别失败: {response.body.message}")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
raise Exception(f"阿里云表格识别错误: {str(e)}")
|
|||
|
|
|
|||
|
|
def _extract_text(self, result):
|
|||
|
|
"""从OCR结果中提取文本"""
|
|||
|
|
text = ""
|
|||
|
|
|
|||
|
|
if 'content' in result:
|
|||
|
|
# 简单文本识别结果
|
|||
|
|
text = result['content']
|
|||
|
|
elif 'prism_wordsInfo' in result:
|
|||
|
|
# 结构化识别结果
|
|||
|
|
words_info = result['prism_wordsInfo']
|
|||
|
|
for word_info in words_info:
|
|||
|
|
if 'word' in word_info:
|
|||
|
|
text += word_info['word'] + "\n"
|
|||
|
|
elif 'prism_tablesInfo' in result:
|
|||
|
|
# 表格识别结果
|
|||
|
|
tables_info = result['prism_tablesInfo']
|
|||
|
|
for table_info in tables_info:
|
|||
|
|
if 'cellContents' in table_info:
|
|||
|
|
for cell in table_info['cellContents']:
|
|||
|
|
if 'word' in cell:
|
|||
|
|
text += cell['word'] + "\t"
|
|||
|
|
text += "\n"
|
|||
|
|
|
|||
|
|
return text.strip()
|
|||
|
|
|
|||
|
|
def _extract_table_data(self, result):
|
|||
|
|
"""提取表格数据"""
|
|||
|
|
table_data = []
|
|||
|
|
|
|||
|
|
if 'content' in result:
|
|||
|
|
# 直接返回内容
|
|||
|
|
return result['content']
|
|||
|
|
elif 'prism_tablesInfo' in result:
|
|||
|
|
# 结构化表格数据
|
|||
|
|
tables_info = result['prism_tablesInfo']
|
|||
|
|
for table_info in tables_info:
|
|||
|
|
table_rows = []
|
|||
|
|
if 'cellContents' in table_info:
|
|||
|
|
# 按行组织数据
|
|||
|
|
max_row = max([cell.get('row', 0) for cell in table_info['cellContents']]) + 1
|
|||
|
|
max_col = max([cell.get('col', 0) for cell in table_info['cellContents']]) + 1
|
|||
|
|
|
|||
|
|
# 创建空表格
|
|||
|
|
table = [['' for _ in range(max_col)] for _ in range(max_row)]
|
|||
|
|
|
|||
|
|
# 填充数据
|
|||
|
|
for cell in table_info['cellContents']:
|
|||
|
|
row = cell.get('row', 0)
|
|||
|
|
col = cell.get('col', 0)
|
|||
|
|
word = cell.get('word', '')
|
|||
|
|
if row < max_row and col < max_col:
|
|||
|
|
table[row][col] = word
|
|||
|
|
|
|||
|
|
# 转换为文本格式
|
|||
|
|
for row in table:
|
|||
|
|
table_rows.append('\t'.join(row))
|
|||
|
|
|
|||
|
|
table_data.append('\n'.join(table_rows))
|
|||
|
|
|
|||
|
|
return '\n\n'.join(table_data) if table_data else "未识别到表格数据"
|
|||
|
|
|
|||
|
|
def extract_text_with_aliyun(image_path, ocr_type='general', options=None):
|
|||
|
|
"""使用阿里云OCR提取图片文字"""
|
|||
|
|
try:
|
|||
|
|
ocr_client = AliyunOCR()
|
|||
|
|
|
|||
|
|
if ocr_type == 'general':
|
|||
|
|
return ocr_client.recognize_general(image_path)
|
|||
|
|
elif ocr_type == 'advanced':
|
|||
|
|
return ocr_client.recognize_advanced(image_path, options)
|
|||
|
|
elif ocr_type == 'table':
|
|||
|
|
return ocr_client.recognize_table(image_path)
|
|||
|
|
else:
|
|||
|
|
raise Exception(f"不支持的OCR类型: {ocr_type}")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
raise Exception(f"阿里云OCR识别失败: {str(e)}")
|
|||
|
|
|
|||
|
|
def check_aliyun_config():
|
|||
|
|
"""检查阿里云配置是否完整"""
|
|||
|
|
access_key_id = os.getenv('ALIYUN_ACCESS_KEY_ID')
|
|||
|
|
access_key_secret = os.getenv('ALIYUN_ACCESS_KEY_SECRET')
|
|||
|
|
|
|||
|
|
if not access_key_id or not access_key_secret:
|
|||
|
|
return False, "阿里云AccessKey未配置"
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
# 测试连接
|
|||
|
|
ocr_client = AliyunOCR()
|
|||
|
|
return True, "阿里云OCR配置正确"
|
|||
|
|
except Exception as e:
|
|||
|
|
return False, f"阿里云OCR配置错误: {str(e)}"
|