final-vibevault-template/.autograde/aggregate_llm_grades.py
2025-12-01 22:12:02 +08:00

122 lines
4.0 KiB
Python

#!/usr/bin/env python3
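"""
Aggregate multiple LLM grading results.

Reads the per-question grade JSON files given on the command line, then
writes one combined JSON report and a Markdown summary.
"""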
import json
import argparse
from pathlib import Path
def load_grade(filepath):
    """Load a single grade file and return its parsed JSON content.

    Loading is best-effort: any problem with one file is reported on stdout
    and signalled with ``None`` so that the caller can keep processing the
    remaining files.

    Args:
        filepath: Path of the JSON grade file to read (decoded as UTF-8).

    Returns:
        The decoded JSON value, or ``None`` when the file is missing,
        cannot be read, is not valid UTF-8, or is not valid JSON.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"Warning: {filepath} not found")
    except json.JSONDecodeError as e:
        print(f"Error parsing {filepath}: {e}")
    except (OSError, UnicodeDecodeError) as e:
        # Directory paths, permission problems and non-UTF-8 bytes must not
        # abort the whole aggregation either.
        print(f"Error reading {filepath}: {e}")
    return None
# Points assumed for a question whose grade file omits 'max_score'.
_DEFAULT_MAX_SCORE = 10


def _as_number(value, fallback):
    """Return value as a number, or fallback when it cannot be one."""
    if isinstance(value, (int, float)):
        return value
    try:
        return float(value)
    except (TypeError, ValueError):
        return fallback


def _score_of(grade):
    """Return the awarded score: 'total' (llm_grade.py) or legacy 'score'."""
    raw = grade.get('total')
    if raw is None:
        raw = grade.get('score', 0)
    return _as_number(raw, 0)


def _max_score_of(grade):
    """Return the maximum score of one question."""
    return _as_number(grade.get('max_score', _DEFAULT_MAX_SCORE),
                      _DEFAULT_MAX_SCORE)


def _needs_review(grade):
    """Tell whether a grade is flagged for manual review (either format)."""
    flags = grade.get('flags') or ()  # tolerate an explicit JSON null
    return 'need_review' in flags or bool(grade.get('need_review', False))


def _question_summary(index, grade):
    """Build the Markdown lines describing one graded question."""
    confidence = _as_number(grade.get('confidence', 1.0), 1.0)
    lines = [
        f'### SA{index}',
        f'- **得分**: {_score_of(grade):.2f} / {_max_score_of(grade):.1f}',
        f'- **置信度**: {confidence:.2f}',
    ]
    if _needs_review(grade):
        lines.append('- ⚠️ **需要人工审核**')
    # Per-criterion breakdown, when the grader provided one.
    if 'criteria' in grade:
        lines.append('- **分项**:')
        for criterion in grade['criteria'] or ():
            if not isinstance(criterion, dict):
                continue
            crit_id = criterion.get('id', '')
            crit_score = _as_number(criterion.get('score', 0), 0)
            crit_reason = criterion.get('reason', '')
            # Two leading spaces so the bullet nests under the item above.
            lines.append(f'  - {crit_id}: {crit_score:.1f} - {crit_reason}')
    lines.append('')
    return lines


def aggregate_grades(input_files, output_file, summary_file):
    """Aggregate several grade files into a JSON report and a Markdown summary.

    Files that cannot be loaded, are empty, or do not hold a JSON object are
    skipped. Missing parent directories of both outputs are created.

    Args:
        input_files: Iterable of paths to per-question grade JSON files.
        output_file: Path of the aggregated JSON report to write.
        summary_file: Path of the Markdown summary to write.

    Returns:
        None. The results are written to the two output files and a short
        recap is printed to stdout.
    """
    grades = []
    for input_file in input_files:
        grade = load_grade(input_file)
        if not grade:
            continue
        if not isinstance(grade, dict):
            print(f"Warning: {input_file} is not a JSON object, skipped")
            continue
        grades.append(grade)

    total_score = sum(_score_of(grade) for grade in grades)
    max_score = sum(_max_score_of(grade) for grade in grades)
    need_review = any(_needs_review(grade) for grade in grades)

    final_score = total_score if max_score > 0 else 0
    final_max_score = max_score

    result = {
        'total_score': final_score,
        'max_score': final_max_score,
        'questions': len(grades),
        'need_review': need_review,
        'details': grades
    }

    for target in (output_file, summary_file):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    summary_lines = [
        '# LLM 简答题评分汇总',
        '',
        f'**总分**: {final_score:.1f} / {final_max_score:.1f}',
        f'**题目数**: {len(grades)}',
        # Both branches used to be empty strings, leaving this line blank.
        f'**需要人工审核**: {"是" if need_review else "否"}',
        '',
        '## 各题详情',
        ''
    ]
    for i, grade in enumerate(grades, 1):
        summary_lines.extend(_question_summary(i, grade))

    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(summary_lines))

    print(f"✅ Aggregated {len(grades)} grades")
    print(f" Total: {final_score:.1f} / {final_max_score:.1f}")
    print(f" Output: {output_file}")
def main():
    """Parse the command-line options and run the aggregation.

    Requires ``--inputs`` (one or more grade JSON files), ``--out`` (the
    aggregated JSON file) and ``--summary`` (the Markdown summary file).
    """
    cli = argparse.ArgumentParser(description='Aggregate LLM grading results')
    cli.add_argument(
        '--inputs',
        nargs='+',
        required=True,
        help='Input grade JSON files',
    )
    # The two output options share the same shape: one required value each.
    for flag, help_text in (
        ('--out', 'Output aggregated JSON file'),
        ('--summary', 'Output summary Markdown file'),
    ):
        cli.add_argument(flag, required=True, help=help_text)

    options = cli.parse_args()
    aggregate_grades(options.inputs, options.out, options.summary)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()