final-vibevault-template/.autograde/llm_grade.py

#!/usr/bin/env python3
"""
LLM 简答题评分脚本
调用 LLM API按评分量表对简答题进行评分输出 JSON 格式结果
"""
import os
import json
import argparse
import requests
import sys
from dotenv import load_dotenv
# Load environment variables (from a .env file or the process environment)
load_dotenv()
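
# A local .env might contain (illustrative values; the variable names match
# the os.getenv calls in main() below):
#   LLM_API_KEY=<your key>
#   LLM_API_URL=https://api.deepseek.com/chat/completions
#   LLM_MODEL=deepseek-chat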


def read_file(path):
    """Read a file's contents; return an empty string if the path does not exist."""
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    return ""


def read_file_or_string(value):
    """
    If value is an existing file path, read and return the file's contents;
    otherwise return value itself as a string.
    """
    if os.path.exists(value):
        with open(value, 'r', encoding='utf-8') as f:
            return f.read()
    return value  # treat value as a literal string
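
# Illustrative behavior (the path below is hypothetical, not part of this repo):
#   read_file_or_string("questions/q1.txt")          -> contents of that file, if present
#   read_file_or_string("What is a TCP handshake?")  -> the string itself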


PROMPT_TEMPLATE = """You are a strict and consistent teaching assistant. Grade the student's short answer according to the rubric provided.
- Judge only against the rubric; do not extrapolate subjectively. Accept varied phrasings.
- Output no explanatory text; output only JSON containing:
{{
  "total": number (0-10, two decimal places),
  "criteria": [
    {{"id":"accuracy","score":0-3,"reason":"one concise sentence of key points"}},
    {{"id":"coverage","score":0-3,"reason":""}},
    {{"id":"clarity","score":0-3,"reason":""}}
  ],
  "flags": [],
  "confidence": number (0-1)
}}
If the answer is unrelated to the question, set total=0 and add the flag "need_review".
[Question]
<<<{question}>>>
[Rubric]
<<<{rubric}>>>
[Student answer]
<<<{answer}>>>
"""


def call_llm(url, key, model, prompt):
    """
    Call the LLM API.

    Parameters
    ----------
    url : str
        API endpoint.
    key : str
        API key.
    model : str
        Model name.
    prompt : str
        Prompt text.

    Returns
    -------
    dict
        The JSON result returned by the LLM.
    """
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": model,
        "temperature": 0,
        "top_p": 1,
        "messages": [{"role": "user", "content": prompt}],
        "response_format": {"type": "json_object"}
    }
    try:
        # Timeouts: 10 s to connect, 60 s to read
        response = requests.post(
            url,
            headers=headers,
            json=data,
            timeout=(10, 60)
        )
        response.raise_for_status()
        result = response.json()
        content = result.get("choices", [{}])[0].get("message", {}).get("content", "{}")
        return json.loads(content)
    except requests.exceptions.Timeout as e:
        print(f"LLM API request timeout: {e}", file=sys.stderr)
        raise
    except requests.exceptions.HTTPError as e:
        print(f"LLM API HTTP error: {e} (status: {response.status_code})", file=sys.stderr)
        raise
    except requests.exceptions.RequestException as e:
        print(f"LLM API request failed: {e}", file=sys.stderr)
        raise
    except json.JSONDecodeError as e:
        print(f"Failed to parse LLM response as JSON: {e}", file=sys.stderr)
        raise
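
# Minimal usage sketch (endpoint, key, and model mirror this script's own
# defaults; the "..." placeholders stand for real question/rubric/answer text):
#   result = call_llm(
#       "https://api.deepseek.com/chat/completions",
#       os.environ["LLM_API_KEY"],
#       "deepseek-chat",
#       PROMPT_TEMPLATE.format(question="...", rubric="...", answer="..."),
#   )
#   print(result.get("total"))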


def main():
    parser = argparse.ArgumentParser(description="Grade short answer questions using LLM")
    parser.add_argument("--question", required=True, help="Path to question file, or the question text itself")
    parser.add_argument("--answer", required=True, help="Path to answer file")
    parser.add_argument("--rubric", required=True, help="Path to rubric JSON file")
    parser.add_argument("--out", default="grade.json", help="Output JSON file")
    parser.add_argument("--summary", default="summary.md", help="Output summary markdown file")
    parser.add_argument("--model", default=os.getenv("LLM_MODEL", "deepseek-chat"))
    parser.add_argument("--api_url", default=os.getenv("LLM_API_URL", "https://api.deepseek.com/chat/completions"))
    parser.add_argument("--api_key", default=os.getenv("LLM_API_KEY", ""))
    args = parser.parse_args()
    # Validate required configuration
    if not args.api_key:
        print("Warning: LLM_API_KEY not set. LLM grading may fail.", file=sys.stderr)
    # Read inputs
    # question may be a file path or the question text itself
    question = read_file_or_string(args.question).strip()
    # answer and rubric must be file paths
    answer = read_file(args.answer).strip()
    rubric_text = read_file(args.rubric).strip()
    if not question or not answer:
        print("Warning: empty question or answer file", file=sys.stderr)
        resp = {
            "total": 0,
            "criteria": [],
            "flags": ["need_review", "empty_answer"],
            "confidence": 0.0
        }
    else:
        # Call the LLM
        try:
            prompt = PROMPT_TEMPLATE.format(
                question=question,
                rubric=rubric_text,
                answer=answer
            )
            resp = call_llm(args.api_url, args.api_key, args.model, prompt)
        except Exception as e:
            print(f"LLM grading failed: {e}", file=sys.stderr)
            resp = {
                "total": 0,
                "criteria": [],
                "flags": ["need_review", "llm_error"],
                "confidence": 0.0
            }
    # Auto-flag scores in the borderline band for human review
    try:
        rubric_data = json.loads(rubric_text)
        lo, hi = rubric_data.get("borderline_band", [None, None])
        total = float(resp.get("total", 0))
        flags = set(resp.get("flags", []))
        if lo is not None and hi is not None and lo <= total <= hi:
            flags.add("need_review")
        # Also flag low-confidence results for review
        confidence = resp.get("confidence", 1.0)
        if confidence < 0.7:
            flags.add("need_review")
        resp["flags"] = sorted(flags)
    except Exception:
        pass
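    # A rubric.json that exercises the band might look like (illustrative
    # values; only "borderline_band" and "max_score" are read by this script,
    # while the full rubric text is passed verbatim to the LLM in the prompt):
    #   {"max_score": 10, "borderline_band": [5.5, 6.5], "criteria": [...]}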
    # Save grade.json
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(resp, f, ensure_ascii=False, indent=2)
    # Generate summary.md
    try:
        rubric_data = json.loads(rubric_text)
        max_score = rubric_data.get("max_score", 10)
    except Exception:
        max_score = 10
    lines = [
        "# Short-answer grading",
        "",
        f"- **Total**: **{resp.get('total', 0):.2f} / {max_score}**",
        f"- **Confidence**: {resp.get('confidence', 0):.2f}",
        f"- **Flags**: {', '.join(resp.get('flags', [])) or 'none'}",
        "",
        "## Per-criterion scores"
    ]
    for criterion in resp.get("criteria", []):
        criterion_id = criterion.get("id", "")
        score = criterion.get("score", 0)
        reason = criterion.get("reason", "")
        lines.append(f"- **{criterion_id}**: {score}")
        if reason:
            lines.append(f"  - {reason}")
    with open(args.summary, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print(f"LLM grading complete: {resp.get('total', 0):.2f}/{max_score}")
if __name__ == "__main__":
main()
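
# Example invocation (file names are placeholders; --api_key can also come
# from the LLM_API_KEY environment variable):
#   python .autograde/llm_grade.py \
#       --question question.txt --answer answer.txt --rubric rubric.json \
#       --out grade.json --summary summary.md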