#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
llm_grade.py - Grade a report using an LLM.
"""

import json
import argparse
import os
import requests
import time
from typing import Dict, Any
def parse_args():
    """Build the CLI parser and return the parsed arguments.

    All five options are mandatory: the grading question, the answer file,
    the rubric file, and the two output paths (JSON result and Markdown
    summary).
    """
    ap = argparse.ArgumentParser(description='LLM Report Grading Script')
    ap.add_argument('--question', required=True, help='评分问题描述')
    ap.add_argument('--answer', required=True, help='待评分的答案文件')
    ap.add_argument('--rubric', required=True, help='评分标准文件')
    ap.add_argument('--out', required=True, help='输出评分结果文件')
    ap.add_argument('--summary', required=True, help='输出评分摘要文件')
    return ap.parse_args()
def load_file_content(file_path: str) -> str:
    """Return the full UTF-8 text content of *file_path*."""
    with open(file_path, encoding='utf-8') as fh:
        return fh.read()
def load_rubric(rubric_path: str) -> Dict[str, Any]:
    """Parse the JSON rubric file at *rubric_path* into a dictionary."""
    with open(rubric_path, encoding='utf-8') as fh:
        return json.load(fh)
def call_llm_api(prompt: str, max_retries: int = 3, timeout: int = 30) -> str:
    """POST *prompt* to the configured LLM endpoint and return its text reply.

    Configuration is read from the environment:
      LLM_API_KEY - optional bearer token (header only added when non-empty)
      LLM_API_URL - endpoint URL; defaults to a local Ollama server
      LLM_MODEL   - model name; defaults to 'llama3'

    Retries up to *max_retries* times with exponential backoff (1s, 2s, ...);
    the final requests exception is re-raised when every attempt fails.
    """
    api_key = os.environ.get('LLM_API_KEY', '')
    api_url = os.environ.get('LLM_API_URL', 'http://localhost:11434/api/generate')
    model = os.environ.get('LLM_MODEL', 'llama3')

    headers = {'Content-Type': 'application/json'}
    if api_key:
        headers['Authorization'] = f'Bearer {api_key}'

    payload = {
        'model': model,
        'prompt': prompt,
        'stream': False,
        'temperature': 0.3,
    }

    attempt = 0
    while True:
        try:
            resp = requests.post(api_url, json=payload, headers=headers, timeout=timeout)
            resp.raise_for_status()
            return resp.json().get('response', '')
        except requests.exceptions.RequestException as exc:
            print(f"⚠️ LLM API调用失败 (尝试 {attempt + 1}/{max_retries}): {exc}")
            attempt += 1
            if attempt >= max_retries:
                raise
            time.sleep(2 ** (attempt - 1))  # exponential backoff: 1s, 2s, ...
def generate_grading_prompt(question: str, answer: str, rubric: Dict[str, Any]) -> str:
    """Build the (Chinese) grading prompt sent to the LLM.

    Interpolates the question, the student's answer, and the JSON-serialized
    rubric into a fixed instruction template that asks the model to reply
    with a strict JSON grading object.

    :param question: description of what is being graded
    :param answer:   the student's submission text
    :param rubric:   rubric dict (serialized with ensure_ascii=False)
    :return: the complete prompt string
    """
    # BUGFIX: the literal braces of the example JSON template must be doubled
    # ({{ / }}) inside an f-string; the original single braces were a
    # SyntaxError (parsed as replacement fields).
    prompt = f"""你是一位专业的课程作业评分专家。请根据以下评分标准,对学生的作业进行客观、公正的评分。

## 评分问题
{question}

## 学生答案
{answer}

## 评分标准
{json.dumps(rubric, ensure_ascii=False, indent=2)}

## 评分要求
1. 严格按照评分标准进行评分,每个评分项给出具体得分
2. 详细说明每个评分项的得分理由
3. 给出总体评价和建议
4. 最终输出必须包含JSON格式的评分结果,格式如下:
```json
{{
  "total": 总分,
  "scores": {{
    "评分项1": 得分,
    "评分项2": 得分,
    ...
  }},
  "feedback": "详细的评分反馈和建议"
}}
```

请确保输出格式正确,只包含上述JSON格式内容,不要添加任何其他说明。"""

    return prompt
def parse_llm_response(response: str) -> Dict[str, Any]:
    """Extract the grading JSON from an LLM reply.

    Tries, in order: the contents of a fenced ```json block, then the raw
    response text. When neither parses, returns a zero-score placeholder so
    callers always receive the expected keys.
    """
    import re

    fenced = re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
    if fenced:
        try:
            return json.loads(fenced.group(1))
        except json.JSONDecodeError:
            print("⚠️ LLM响应中的JSON格式错误")

    # No usable fenced block - maybe the whole reply is bare JSON.
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        print("⚠️ LLM响应不是有效的JSON格式")

    # Both attempts failed: fall back to an empty grading result.
    return {
        'total': 0.0,
        'scores': {},
        'feedback': '评分失败:无法解析LLM响应',
    }
def generate_summary(grade_result: Dict[str, Any], rubric: Dict[str, Any]) -> str:
    """Render a Markdown grading report from *grade_result* and *rubric*.

    :param grade_result: dict with 'total', 'scores' and 'feedback'
    :param rubric: dict with 'criteria' (name -> full score) and optional
        'descriptions' (name -> text)
    :return: the Markdown report as a single string

    Robustness fix: keys missing from *grade_result* (possible when the LLM
    returned partial JSON that parse_llm_response accepted) fall back to
    zero scores / empty feedback instead of raising KeyError. Also builds
    the report via a list + join instead of quadratic string concatenation.
    """
    criteria = rubric.get('criteria', {})
    descriptions = rubric.get('descriptions', {})
    total = grade_result.get('total', 0.0)
    scores = grade_result.get('scores', {})
    feedback = grade_result.get('feedback', '')

    parts = ["# LLM评分报告\n\n"]

    parts.append("## 总体评价\n")
    parts.append(f"- 最终得分: {total:.2f}/{sum(criteria.values())}\n")
    parts.append(f"- 评分时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\n\n")

    parts.append("## 评分详情\n")
    parts.append("| 评分项 | 得分 | 满分 | 评分标准 |\n")
    parts.append("|-------|------|------|---------|\n")
    for criterion, full_score in criteria.items():
        score = scores.get(criterion, 0.0)
        parts.append(f"| {criterion} | {score:.2f} | {full_score} | {descriptions.get(criterion, '')} |\n")

    parts.append("\n")
    parts.append("## 详细反馈\n")
    parts.append(feedback + "\n")

    return "".join(parts)
def main():
    """Entry point: load inputs, grade via the LLM, write result and summary.

    On API failure an all-zero grading result is written so both output
    files always exist for downstream tooling.
    """
    args = parse_args()

    # Load the submission to grade.
    print(f"📁 加载待评分文件: {args.answer}")
    answer_text = load_file_content(args.answer)

    # Load the rubric.
    print(f"📋 加载评分标准: {args.rubric}")
    rubric_data = load_rubric(args.rubric)

    # Build the grading prompt.
    print("📝 生成评分提示词...")
    grading_prompt = generate_grading_prompt(args.question, answer_text, rubric_data)

    # Ask the LLM to grade; fall back to zeros on failure.
    print("🤖 调用LLM进行评分...")
    try:
        reply = call_llm_api(grading_prompt)
        print("✅ LLM API调用成功")
    except Exception as err:
        print(f"❌ LLM API调用失败: {err}")
        result = {
            'total': 0.0,
            'scores': {criterion: 0.0 for criterion in rubric_data.get('criteria', {})},
            'feedback': f'评分失败:LLM API调用错误 - {str(err)}'
        }
    else:
        print("📊 解析LLM评分结果...")
        result = parse_llm_response(reply)

    # Persist the raw grading result as JSON.
    print(f"💾 保存评分结果: {args.out}")
    with open(args.out, 'w', encoding='utf-8') as out_file:
        json.dump(result, out_file, ensure_ascii=False, indent=2)

    # Render and persist the Markdown summary.
    print(f"📝 生成评分摘要: {args.summary}")
    report = generate_summary(result, rubric_data)

    with open(args.summary, 'w', encoding='utf-8') as summary_file:
        summary_file.write(report)

    print(f"✅ 评分完成! 最终得分: {result['total']:.2f}")
# Run the grader only when executed as a script, not on import.
if __name__ == '__main__':
    main()