feat: QA self-check agent — 15 prompts, 5 judges, weighted scoring
This commit is contained in:
167
scripts/qa/scorer.py
Normal file
167
scripts/qa/scorer.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""Scorer: Weighted aggregate all judge scores → report"""
|
||||
import json, os
|
||||
from datetime import datetime
|
||||
import subprocess
|
||||
import numpy as np
|
||||
|
||||
|
||||
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that understands NumPy scalars and arrays.

    NumPy integers, floats and booleans are converted to the matching
    built-in Python type and arrays are converted to (nested) lists. Any
    other object is handed to ``json.JSONEncoder.default``.
    """

    def default(self, obj):
        """Return a JSON-serialisable stand-in for *obj*."""
        # Checked in order; the first matching NumPy scalar family wins.
        scalar_converters = (
            (np.integer, int),
            (np.floating, float),
            (np.bool_, bool),
        )
        for numpy_type, to_builtin in scalar_converters:
            if isinstance(obj, numpy_type):
                return to_builtin(obj)

        if isinstance(obj, np.ndarray):
            return obj.tolist()

        # Not a NumPy value: let the base class handle (or reject) it.
        return super().default(obj)
|
||||
|
||||
# Directory the report files are written to.
# NOTE(review): absolute, machine-specific path; confirm it is intended
# outside the original development host.
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"

# Relative weight of each judge, keyed by the judge's agent name.
# The listed weights sum to 1.0.
WEIGHTS = dict(
    Gemma4=0.35,
    PaliGemma=0.25,
    YOLO=0.15,
    MaskFormer=0.15,
    GroundingDINO=0.05,
    FaceNet=0.05,
)
|
||||
|
||||
def get_build_info():
    """Collect build metadata for the report header.

    The git lookup is best-effort: any failure to run git leaves the hash
    as ``"unknown"`` instead of raising.

    Returns:
        dict: ``build_git_hash`` (short hash of ``HEAD`` or ``"unknown"``),
        ``timestamp`` (local time as ``YYYY-MM-DD HH:MM:SS``) and
        ``version``.
    """
    git_hash = "unknown"
    try:
        completed = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True,
            text=True,
            timeout=5,
            # Run git from the parent of the directory holding this file.
            cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # git missing / not runnable, timed out, or produced undecodable
        # output. Narrower than a bare ``except`` so that KeyboardInterrupt
        # and SystemExit still propagate.
        pass
    else:
        # A failing git (e.g. not inside a repository) exits non-zero with
        # empty stdout; keep "unknown" rather than reporting an empty hash.
        short_hash = completed.stdout.strip()
        if completed.returncode == 0 and short_hash:
            git_hash = short_hash

    return {
        "build_git_hash": git_hash,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "version": "1.0.0",
    }
|
||||
|
||||
def compute_scores(judge_results):
    """Map each judge's agent name to its score.

    Args:
        judge_results: iterable of dicts, each with an ``"agent"`` key and
            an optional ``"score"`` key.

    Returns:
        dict: agent name -> score. A missing or ``None`` score is replaced
        by 50. If an agent appears more than once, its last entry wins.
    """
    fallback = 50  # default for non-numeric judges
    return {
        entry["agent"]: fallback if entry.get("score") is None else entry["score"]
        for entry in judge_results
    }
|
||||
|
||||
def aggregate(scores, weights=None, default_weight=0.1):
    """Return the weighted mean of the judge scores, rounded to an integer.

    Args:
        scores: mapping of agent name -> numeric score. Entries whose score
            is ``None`` are ignored.
        weights: mapping of agent name -> weight. Defaults to the
            module-level ``WEIGHTS`` table.
        default_weight: weight applied to agents missing from *weights*.

    Returns:
        int: ``round(sum(w * score) / sum(w))`` over the scored agents, or
        ``0`` when no agent contributes a positive total weight.
    """
    if weights is None:
        weights = WEIGHTS

    weighted = [
        (weights.get(agent, default_weight), score)
        for agent, score in scores.items()
        if score is not None
    ]
    total_weight = sum(weight for weight, _ in weighted)
    if total_weight <= 0:
        return 0

    # Dividing by the weight actually used renormalises the result when
    # only a subset of the judges produced a score.
    weighted_sum = sum(weight * score for weight, score in weighted)
    return round(weighted_sum / total_weight)
|
||||
|
||||
def _collect_scores(result):
    """Return ``{agent: score}`` for the judges of *result* that gave a score."""
    return {
        jr["agent"]: jr["score"]
        for jr in result.get("judge_results", [])
        if jr.get("score") is not None
    }


def _mean_score(values):
    """Return the rounded arithmetic mean of *values*, or 0 when empty."""
    return round(sum(values) / len(values)) if values else 0


def _table_cell(text, limit=80):
    """Return *text* flattened, truncated and escaped for a Markdown table cell."""
    # Newlines would end the table row and a raw "|" would start a new
    # column, so collapse whitespace first and escape pipes last.
    flat = " ".join(str(text or "").split())
    return flat[:limit].replace("|", "\\|")


def generate_report(all_results, file_uuid):
    """Generate qa_report.md + qa_report.json in ``OUTPUT_DIR``.

    Args:
        all_results: sequence of per-query result dicts. Each one has a
            ``"query"`` dict (keys ``"id"``, ``"type"``, ``"prompt"``), an
            optional ``"status"`` and an optional ``"judge_results"`` list
            whose items carry ``"agent"`` and optionally ``"score"`` and
            ``"reasoning"``.
        file_uuid: identifier printed in the report header and stored in
            the JSON output.

    Side effects:
        Creates ``OUTPUT_DIR`` if needed, writes both report files as UTF-8
        and prints their paths and the overall score.
    """
    build = get_build_info()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    report_path = os.path.join(OUTPUT_DIR, "qa_report.md")
    json_path = os.path.join(OUTPUT_DIR, "qa_report.json")

    # Aggregate each query once; the summary and the per-query section
    # both reuse these values.
    final_scores = [aggregate(_collect_scores(r)) for r in all_results]
    by_type = {}
    for result, final in zip(all_results, final_scores):
        by_type.setdefault(result["query"]["type"], []).append(final)
    overall = _mean_score(final_scores)

    lines = [
        "# QA Self-Check Report",
        "",
        f"**UUID**: `{file_uuid}`",
        f"**Build**: {build['build_git_hash']}",
        f"**Timestamp**: {build['timestamp']}",
        f"**Version**: {build['version']}",
        "",
        "---",
        "",
    ]

    # Summary table
    lines.append("## Summary")
    lines.append("")
    lines.append("| Metric | Score |")
    lines.append("|--------|:----:|")
    lines.append(f"| **Overall** | **{overall}/100** |")
    for qtype in ["identity", "scene", "object"]:
        type_scores = by_type.get(qtype, [])
        if type_scores:
            lines.append(
                f"| {qtype.capitalize()} queries | {_mean_score(type_scores)}/100 |"
            )
    lines.append("")

    # Per-query details
    lines.append("## Per-Query Details")
    lines.append("")
    for result, final in zip(all_results, final_scores):
        q = result["query"]
        lines.append(f"### {q['id']}: {q['prompt']}")
        lines.append("")
        lines.append(f"| Type: {q['type']} | Status: {result.get('status', 'ok')} |")
        lines.append("|-----------------|-------------------|")
        lines.append("")

        # Judges
        lines.append("| Judge | Score | Reasoning |")
        lines.append("|-------|:-----:|-----------|")
        for jr in result.get("judge_results", []):
            score = jr.get("score")
            shown = "-" if score is None else score
            # ``reasoning`` may be absent or None; both render as empty.
            reasoning = _table_cell(jr.get("reasoning"))
            lines.append(f"| {jr['agent']} | {shown} | {reasoning} |")
        lines.append(f"| **Weighted** | **{final}** | |")
        lines.append("")

    lines.append("---")
    lines.append(f"*Report generated by M5 QA Agent — {build['timestamp']}*")

    # Explicit UTF-8: the report contains non-ASCII text and the JSON is
    # dumped with ensure_ascii=False, so the platform default is unsafe.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    # JSON output
    json_output = {
        "build": build,
        "file_uuid": file_uuid,
        "overall_score": overall,
        "by_type": {t: _mean_score(s) for t, s in by_type.items() if s},
        "queries": all_results,
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_output, f, indent=2, ensure_ascii=False, cls=NumpyEncoder)

    print(f"\n Report: {report_path}")
    print(f" JSON: {json_path}")
    print(f" Overall score: {overall}/100")
|
||||
Reference in New Issue
Block a user