generated from Java-2025Fall/final-vibevault-template
211 lines
6.4 KiB
Python
211 lines
6.4 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
llm_grade.py - 使用LLM对报告进行评分
|
||
"""
|
||
|
||
import json
|
||
import argparse
|
||
import os
|
||
import requests
|
||
import time
|
||
from typing import Dict, Any
|
||
|
||
|
||
def parse_args():
    """Parse command-line options for the grading script.

    All five options are mandatory: the grading question, the answer
    file to grade, the rubric file, and the two output destinations
    (JSON result and Markdown summary).
    """
    cli = argparse.ArgumentParser(description='LLM Report Grading Script')
    # Declare the options table-style; every one of them is required.
    for flag, text in (
        ('--question', '评分问题描述'),
        ('--answer', '待评分的答案文件'),
        ('--rubric', '评分标准文件'),
        ('--out', '输出评分结果文件'),
        ('--summary', '输出评分摘要文件'),
    ):
        cli.add_argument(flag, required=True, help=text)
    return cli.parse_args()
|
||
|
||
|
||
def load_file_content(file_path: str) -> str:
    """Return the entire contents of *file_path*, decoded as UTF-8."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()
    return text
|
||
|
||
|
||
def load_rubric(rubric_path: str) -> Dict[str, Any]:
    """Deserialize and return the JSON grading rubric at *rubric_path*."""
    with open(rubric_path, 'r', encoding='utf-8') as handle:
        rubric = json.load(handle)
    return rubric
|
||
|
||
|
||
def call_llm_api(prompt: str, max_retries: int = 3, timeout: int = 30) -> str:
    """Send *prompt* to the configured LLM endpoint and return its reply text.

    Configuration comes from environment variables — LLM_API_URL,
    LLM_MODEL and LLM_API_KEY — with local-Ollama defaults when unset.
    Failed HTTP requests are retried with exponential backoff (1s, 2s, ...);
    after *max_retries* attempts the last exception is re-raised.
    """
    request_headers = {'Content-Type': 'application/json'}
    token = os.environ.get('LLM_API_KEY', '')
    if token:
        # Only attach auth when a key is configured (local servers need none).
        request_headers['Authorization'] = f'Bearer {token}'

    endpoint = os.environ.get('LLM_API_URL', 'http://localhost:11434/api/generate')
    body = {
        'model': os.environ.get('LLM_MODEL', 'llama3'),
        'prompt': prompt,
        'stream': False,
        'temperature': 0.3,
    }

    attempt = 0
    while True:
        try:
            reply = requests.post(endpoint, json=body, headers=request_headers, timeout=timeout)
            reply.raise_for_status()
            return reply.json().get('response', '')
        except requests.exceptions.RequestException as exc:
            print(f"⚠️ LLM API调用失败 (尝试 {attempt + 1}/{max_retries}): {exc}")
            attempt += 1
            if attempt >= max_retries:
                raise
            time.sleep(2 ** (attempt - 1))  # exponential backoff: 1s, 2s, ...
|
||
|
||
|
||
def generate_grading_prompt(question: str, answer: str, rubric: Dict[str, Any]) -> str:
    """Compose the Chinese-language grading prompt handed to the LLM.

    Embeds the grading question, the student's answer, and the rubric
    (pretty-printed JSON) into an instruction template that asks the
    model to reply with a single fenced-JSON grading object containing
    'total', 'scores' and 'feedback'.

    Bug fix: the literal JSON template inside the f-string must use
    doubled braces ({{ / }}) — with single braces Python parsed the
    template as replacement fields, which broke the function.
    """
    prompt = f"""你是一位专业的课程作业评分专家。请根据以下评分标准,对学生的作业进行客观、公正的评分。

## 评分问题
{question}

## 学生答案
{answer}

## 评分标准
{json.dumps(rubric, ensure_ascii=False, indent=2)}

## 评分要求
1. 严格按照评分标准进行评分,每个评分项给出具体得分
2. 详细说明每个评分项的得分理由
3. 给出总体评价和建议
4. 最终输出必须包含JSON格式的评分结果,格式如下:
```json
{{
    "total": 总分,
    "scores": {{
        "评分项1": 得分,
        "评分项2": 得分,
        ...
    }},
    "feedback": "详细的评分反馈和建议"
}}
```

请确保输出格式正确,只包含上述JSON格式内容,不要添加任何其他说明。"""

    return prompt
|
||
|
||
|
||
def parse_llm_response(response: str) -> Dict[str, Any]:
    """Extract the JSON grading object from a raw LLM reply.

    Tries, in order: the contents of a ```json fenced block, then the
    whole reply as JSON.  If neither parses, returns a zero-score
    placeholder result instead of raising.
    """
    import re

    # Models usually wrap their answer in a ```json fence — prefer that.
    fenced = re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
    if fenced is not None:
        try:
            return json.loads(fenced.group(1))
        except json.JSONDecodeError:
            print("⚠️ LLM响应中的JSON格式错误")

    # No usable fence: maybe the reply is bare JSON.
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        print("⚠️ LLM响应不是有效的JSON格式")

    # Last resort: a placeholder so the pipeline can keep going.
    return {
        'total': 0.0,
        'scores': {},
        'feedback': '评分失败:无法解析LLM响应',
    }
|
||
|
||
|
||
def generate_summary(grade_result: Dict[str, Any], rubric: Dict[str, Any]) -> str:
    """Render a Markdown grading report from *grade_result* and *rubric*.

    Parameters:
        grade_result: parsed LLM grading output. 'total', 'scores' and
            'feedback' may be missing when the LLM reply was valid JSON
            but not the expected shape — parse_llm_response returns
            whatever parses, so this function must not assume the keys.
        rubric: grading rubric with optional 'criteria' (name -> full
            score) and 'descriptions' (name -> criterion text).

    Returns the report as a Markdown string.

    Robustness fix: the original indexed grade_result['total'] /
    ['scores'] / ['feedback'] directly and raised KeyError on
    partially-filled results; now it falls back to zero / empty values.
    """
    criteria = rubric.get('criteria', {})
    descriptions = rubric.get('descriptions', {})
    # Safe defaults mirror parse_llm_response's placeholder result.
    total = grade_result.get('total', 0.0)
    scores = grade_result.get('scores', {})
    feedback = grade_result.get('feedback', '')

    summary = "# LLM评分报告\n\n"

    summary += "## 总体评价\n"
    summary += f"- 最终得分: {total:.2f}/{sum(criteria.values())}\n"
    summary += f"- 评分时间: {time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}\n\n"

    summary += "## 评分详情\n"
    summary += "| 评分项 | 得分 | 满分 | 评分标准 |\n"
    summary += "|-------|------|------|---------|\n"

    for criterion, full_score in criteria.items():
        score = scores.get(criterion, 0.0)
        summary += f"| {criterion} | {score:.2f} | {full_score} | {descriptions.get(criterion, '')} |\n"

    summary += "\n"
    summary += "## 详细反馈\n"
    summary += f"{feedback}\n"

    return summary
|
||
|
||
|
||
def main():
    """Entry point: grade one answer file with an LLM and write results.

    Workflow: parse CLI options -> load the answer text and rubric ->
    build the grading prompt -> query the LLM (falling back to an
    all-zero result on API failure) -> write the JSON result to --out
    and a Markdown summary to --summary.
    """
    options = parse_args()

    print(f"📁 加载待评分文件: {options.answer}")
    student_answer = load_file_content(options.answer)

    print(f"📋 加载评分标准: {options.rubric}")
    rubric = load_rubric(options.rubric)

    print("📝 生成评分提示词...")
    grading_prompt = generate_grading_prompt(options.question, student_answer, rubric)

    print("🤖 调用LLM进行评分...")
    try:
        raw_reply = call_llm_api(grading_prompt)
        print("✅ LLM API调用成功")
    except Exception as exc:
        print(f"❌ LLM API调用失败: {exc}")
        # Degrade gracefully: emit an all-zero result so the output
        # files are still produced for downstream tooling.
        result = {
            'total': 0.0,
            'scores': {name: 0.0 for name in rubric.get('criteria', {})},
            'feedback': f'评分失败:LLM API调用错误 - {str(exc)}',
        }
    else:
        print("📊 解析LLM评分结果...")
        result = parse_llm_response(raw_reply)

    print(f"💾 保存评分结果: {options.out}")
    with open(options.out, 'w', encoding='utf-8') as sink:
        json.dump(result, sink, ensure_ascii=False, indent=2)

    print(f"📝 生成评分摘要: {options.summary}")
    report = generate_summary(result, rubric)
    with open(options.summary, 'w', encoding='utf-8') as sink:
        sink.write(report)

    print(f"✅ 评分完成! 最终得分: {result['total']:.2f}")
|
||
|
||
|
||
# Run the grading pipeline only when executed directly as a script.
if __name__ == '__main__':
    main()
|