|
|
|
|
#!/usr/bin/env python3
"""
LLM short-answer grading script.

Calls an LLM API to grade a student's short-answer response against a
provided scoring rubric and writes the result in JSON format.
"""
|
|
|
|
|
|
|
|
|
|
|
|
import os
|
|
|
|
|
|
import json
|
|
|
|
|
|
import argparse
|
|
|
|
|
|
import requests
|
|
|
|
|
|
import sys
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
|
# Load environment variables (from a .env file or the process environment).
load_dotenv()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def read_file(path):
    """Return the contents of a UTF-8 text file, or "" if it does not exist.

    Parameters
    ----------
    path : str
        Path of the file to read.

    Returns
    -------
    str
        The file contents, or an empty string when *path* does not exist.
    """
    if os.path.exists(path):
        # Use a context manager so the handle is always closed; the original
        # open(...).read() leaked the file object.
        with open(path, 'r', encoding='utf-8') as f:
            return f.read()
    return ""
|
|
|
|
def read_file_or_string(value):
    """Resolve *value* to text: read it as a file if the path exists.

    If *value* is the path of an existing file, return the file's contents;
    otherwise return *value* itself as a literal string.

    Parameters
    ----------
    value : str
        A file path or a literal string.

    Returns
    -------
    str
        File contents, or *value* unchanged.
    """
    if os.path.exists(value):
        # Context manager closes the handle; the original leaked it.
        with open(value, 'r', encoding='utf-8') as f:
            return f.read()
    return value  # not a file on disk -- treat it as a literal string
|
|
|
|
PROMPT_TEMPLATE = """你是严格且一致的助教,按提供的评分量表为学生的简答题评分。
|
|
|
|
|
|
|
|
|
|
|
|
- 只依据量表,不做主观延伸;允许多样表述。
|
|
|
|
|
|
- 不输出任何解释性文本;只输出 JSON,包含:
|
|
|
|
|
|
{{
|
|
|
|
|
|
"total": number(0-10, 两位小数),
|
|
|
|
|
|
"criteria": [
|
|
|
|
|
|
{{"id":"accuracy","score":0-3,"reason":"要点式一句话"}},
|
|
|
|
|
|
{{"id":"coverage","score":0-3,"reason":""}},
|
|
|
|
|
|
{{"id":"clarity","score":0-3,"reason":""}}
|
|
|
|
|
|
],
|
|
|
|
|
|
"flags": [],
|
|
|
|
|
|
"confidence": number(0-1)
|
|
|
|
|
|
}}
|
|
|
|
|
|
如果答案与题目无关,total=0,并加 flag "need_review"。
|
|
|
|
|
|
|
|
|
|
|
|
【题目】
|
|
|
|
|
|
<<<{question}>>>
|
|
|
|
|
|
|
|
|
|
|
|
【评分量表】
|
|
|
|
|
|
<<<{rubric}>>>
|
|
|
|
|
|
|
|
|
|
|
|
【学生答案】
|
|
|
|
|
|
<<<{answer}>>>
|
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def call_llm(url, key, model, prompt):
    """Call an OpenAI-compatible chat-completions API and parse its JSON reply.

    Parameters
    ----------
    url : str
        API endpoint URL.
    key : str
        API key, sent as a Bearer token.
    model : str
        Model name.
    prompt : str
        Prompt text sent as a single user message.

    Returns
    -------
    dict
        The grading result parsed from the model's JSON reply.

    Raises
    ------
    requests.exceptions.RequestException
        On timeout, HTTP error status, or any other transport failure.
    json.JSONDecodeError
        If the model's reply content is not valid JSON.
    """
    headers = {
        "Authorization": f"Bearer {key}",
        "Content-Type": "application/json"
    }

    data = {
        "model": model,
        # temperature=0 / top_p=1 for deterministic, reproducible grading
        "temperature": 0,
        "top_p": 1,
        "messages": [{"role": "user", "content": prompt}],
        # Ask the server to force a JSON-object reply
        "response_format": {"type": "json_object"}
    }

    try:
        # Timeouts: 10 s to connect, 60 s to read
        response = requests.post(
            url,
            headers=headers,
            json=data,
            timeout=(10, 60)
        )
        response.raise_for_status()
        result = response.json()
        # Defensive extraction: an empty "choices" list would have raised
        # IndexError with the original chained .get("choices", [{}])[0] form
        # (the default only applies when the key is absent, not empty).
        choices = result.get("choices") or [{}]
        content = choices[0].get("message", {}).get("content", "{}")
        return json.loads(content)
    except requests.exceptions.Timeout as e:
        print(f"LLM API request timeout: {e}", file=sys.stderr)
        raise
    except requests.exceptions.HTTPError as e:
        # HTTPError is only raised by raise_for_status(), so `response`
        # is guaranteed to be bound here.
        print(f"LLM API HTTP error: {e} (status: {response.status_code})", file=sys.stderr)
        raise
    except requests.exceptions.RequestException as e:
        print(f"LLM API request failed: {e}", file=sys.stderr)
        raise
    except json.JSONDecodeError as e:
        print(f"Failed to parse LLM response as JSON: {e}", file=sys.stderr)
        raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _parse_args():
    """Build and parse the command-line arguments for the grader CLI."""
    parser = argparse.ArgumentParser(description="Grade short answer questions using LLM")
    parser.add_argument("--question", required=True, help="Path to question file")
    parser.add_argument("--answer", required=True, help="Path to answer file")
    parser.add_argument("--rubric", required=True, help="Path to rubric JSON file")
    parser.add_argument("--out", default="grade.json", help="Output JSON file")
    parser.add_argument("--summary", default="summary.md", help="Output summary markdown file")
    parser.add_argument("--model", default=os.getenv("LLM_MODEL", "deepseek-chat"))
    parser.add_argument("--api_url", default=os.getenv("LLM_API_URL", "https://api.deepseek.com/chat/completions"))
    parser.add_argument("--api_key", default=os.getenv("LLM_API_KEY", ""))
    return parser.parse_args()


def _error_response(error_flag):
    """Return a zero-score grading result flagged for human review."""
    return {
        "total": 0,
        "criteria": [],
        "flags": ["need_review", error_flag],
        "confidence": 0.0
    }


def _apply_review_flags(resp, rubric_data):
    """Flag *resp* for review on borderline totals or low confidence.

    Mutates resp["flags"] in place (deduplicated, sorted). Best-effort:
    malformed score/band/confidence values leave the flags unchanged.
    """
    try:
        lo, hi = rubric_data.get("borderline_band", [None, None])
        total = float(resp.get("total", 0))
        flags = set(resp.get("flags", []))
        # Totals inside the rubric's borderline band go to a human.
        if lo is not None and hi is not None and lo <= total <= hi:
            flags.add("need_review")
        # Low model confidence also triggers review.
        if resp.get("confidence", 1.0) < 0.7:
            flags.add("need_review")
        resp["flags"] = sorted(flags)
    except Exception:
        # Keep whatever flags the response already carried.
        pass


def _summary_lines(resp, max_score):
    """Render the grading result as the markdown lines of summary.md."""
    lines = [
        "# 简答题评分",
        "",
        f"- **总分**:**{resp.get('total', 0):.2f} / {max_score}**",
        f"- **置信度**:{resp.get('confidence', 0):.2f}",
        f"- **标记**:{', '.join(resp.get('flags', [])) or '无'}",
        "",
        "## 分项评分"
    ]
    for criterion in resp.get("criteria", []):
        criterion_id = criterion.get("id", "")
        score = criterion.get("score", 0)
        reason = criterion.get("reason", "")
        lines.append(f"- **{criterion_id}**: {score} 分")
        if reason:
            lines.append(f"  - {reason}")
    return lines


def main():
    """CLI entry point: read inputs, grade via the LLM, write grade.json and summary.md."""
    args = _parse_args()

    # Warn early when no API key is configured; the request will likely 401.
    if not args.api_key:
        print("Warning: LLM_API_KEY not set. LLM grading may fail.", file=sys.stderr)

    # question may be a file path or a literal question string;
    # answer and rubric must be file paths.
    question = read_file_or_string(args.question).strip()
    answer = read_file(args.answer).strip()
    rubric_text = read_file(args.rubric).strip()

    if not question or not answer:
        print("Warning: Empty question or answer file", file=sys.stderr)
        resp = _error_response("empty_answer")
    else:
        try:
            prompt = PROMPT_TEMPLATE.format(
                question=question,
                rubric=rubric_text,
                answer=answer
            )
            resp = call_llm(args.api_url, args.api_key, args.model, prompt)
        except Exception as e:
            print(f"LLM grading failed: {e}", file=sys.stderr)
            resp = _error_response("llm_error")

    # Recompute total from the per-criterion scores; the LLM's own total
    # is not trusted.
    criteria = resp.get("criteria", [])
    if criteria:
        resp["total"] = round(sum(float(c.get("score", 0)) for c in criteria), 2)

    # Parse the rubric ONCE. The original parsed it twice and, because the
    # borderline-band and low-confidence checks shared one try block with the
    # parse, an unparseable rubric silently disabled the low-confidence
    # review flag as well. An invalid/non-object rubric now degrades to {}.
    try:
        rubric_data = json.loads(rubric_text)
        if not isinstance(rubric_data, dict):
            rubric_data = {}
    except Exception:
        rubric_data = {}

    _apply_review_flags(resp, rubric_data)

    # Save grade.json
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(resp, f, ensure_ascii=False, indent=2)

    # Generate summary.md
    max_score = rubric_data.get("max_score", 10)
    with open(args.summary, "w", encoding="utf-8") as f:
        f.write("\n".join(_summary_lines(resp, max_score)))

    print(f"LLM grading complete: {resp.get('total', 0):.2f}/{max_score}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|