feat: QA self-check agent — 15 prompts, 5 judges, weighted scoring

2026-05-14 10:53:30 +08:00
parent 2b633174b9
commit f60a59b280
10 changed files with 875 additions and 0 deletions
--- a/scripts/qa/scorer.py
+++ b/scripts/qa/scorer.py
@@ -0,0 +1,167 @@
+"""Scorer: weighted aggregation of all judge scores into a report."""
+import json, os
+from datetime import datetime
+import subprocess
+import numpy as np
+
+
+class NumpyEncoder(json.JSONEncoder):
+    """JSON encoder that additionally understands NumPy scalars and arrays."""
+
+    def default(self, obj):
+        """Return a JSON-serializable stand-in for ``obj``.
+
+        NumPy arrays become nested lists; NumPy integer, floating and boolean
+        scalars become the matching Python builtin. Everything else is handed
+        to ``json.JSONEncoder.default``, which raises ``TypeError``.
+        """
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        conversions = (
+            (np.integer, int),
+            (np.floating, float),
+            (np.bool_, bool),
+        )
+        for numpy_type, builtin_type in conversions:
+            if isinstance(obj, numpy_type):
+                return builtin_type(obj)
+        return super().default(obj)
+
+# Directory that receives qa_report.md and qa_report.json. Set the
+# QA_OUTPUT_DIR environment variable to redirect the reports without editing
+# this file; when it is unset or empty the original path is used.
+OUTPUT_DIR = os.environ.get("QA_OUTPUT_DIR") or "/Users/accusys/momentry/output_dev"
+
+# Relative weight of each judge in the aggregate score (values sum to 1.0).
+WEIGHTS = {
+    "Gemma4": 0.35,
+    "PaliGemma": 0.25,
+    "YOLO": 0.15,
+    "MaskFormer": 0.15,
+    "GroundingDINO": 0.05,
+    "FaceNet": 0.05,
+}
+
+def get_build_info():
+    """Return build metadata for the report header.
+
+    Returns:
+        A dict with three keys: ``build_git_hash`` (the short HEAD hash
+        reported by ``git rev-parse``, or ``"unknown"`` when it cannot be
+        determined), ``timestamp`` (local time formatted as
+        ``YYYY-MM-DD HH:MM:SS``) and ``version``.
+    """
+    git_hash = "unknown"
+    # git is run from the directory one level above this file's directory.
+    repo_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    try:
+        proc = subprocess.run(
+            ["git", "rev-parse", "--short", "HEAD"],
+            capture_output=True, text=True, timeout=5,
+            cwd=repo_dir,
+        )
+    except (OSError, subprocess.SubprocessError):
+        # git missing, cwd unusable, or the call timed out: keep "unknown".
+        proc = None
+    if proc is not None and proc.returncode == 0:
+        # A failed git call (e.g. not a repository) prints nothing on stdout;
+        # report "unknown" rather than an empty hash.
+        git_hash = proc.stdout.strip() or "unknown"
+    return {
+        "build_git_hash": git_hash,
+        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "version": "1.0.0"
+    }
+
+def compute_scores(judge_results):
+    """Map each judge's agent name to a numeric score.
+
+    A judge whose ``score`` is missing or ``None`` is assigned 50. When the
+    same agent appears more than once, the last entry wins.
+    """
+    fallback = 50  # default for non-numeric judges
+
+    def numeric(entry):
+        raw = entry.get("score")
+        return fallback if raw is None else raw
+
+    return {entry["agent"]: numeric(entry) for entry in judge_results}
+
+def aggregate(scores):
+    """Return the weighted mean of ``scores``, rounded to an integer.
+
+    ``scores`` maps agent name to score. Each agent's weight comes from
+    WEIGHTS, with 0.1 for agents not listed there. Entries whose score is
+    ``None`` are ignored; 0 is returned when no weight was accumulated.
+    """
+    numerator = 0
+    denominator = 0
+    for judge, value in scores.items():
+        if value is None:
+            continue
+        weight = WEIGHTS.get(judge, 0.1)
+        numerator += weight * value
+        denominator += weight
+    if denominator > 0:
+        return round(numerator / denominator)
+    return 0
+
+def _judge_scores(result):
+    """Map agent name to score for one result, skipping judges with no score."""
+    return {
+        jr["agent"]: jr["score"]
+        for jr in result.get("judge_results", [])
+        if jr.get("score") is not None
+    }
+
+
+def _table_cell(text, limit=80):
+    """Truncate ``text`` and make it safe for a one-line Markdown table cell."""
+    if text is None:
+        return ""
+    cell = str(text)[:limit]
+    # Newlines end the table row and unescaped pipes add columns.
+    return cell.replace("\r", " ").replace("\n", " ").replace("|", "\\|")
+
+
+def generate_report(all_results, file_uuid):
+    """Generate qa_report.md + qa_report.json in OUTPUT_DIR.
+
+    Args:
+        all_results: list of result dicts. Each has a ``query`` dict with
+            ``id``, ``prompt`` and ``type``, and may have ``status`` and
+            ``judge_results`` (dicts with ``agent`` and optionally ``score``
+            and ``reasoning``).
+        file_uuid: identifier written into both reports.
+
+    Returns:
+        None. The two report files are written and their paths and the
+        overall score are printed.
+    """
+    build = get_build_info()
+    # The target directory may not exist yet on a fresh machine.
+    os.makedirs(OUTPUT_DIR, exist_ok=True)
+    report_path = os.path.join(OUTPUT_DIR, "qa_report.md")
+    json_path = os.path.join(OUTPUT_DIR, "qa_report.json")
+
+    # Score every query once; the summary and the per-query tables share it.
+    final_scores = [aggregate(_judge_scores(r)) for r in all_results]
+    by_type = {}
+    for r, final in zip(all_results, final_scores):
+        by_type.setdefault(r["query"]["type"], []).append(final)
+    overall = round(sum(final_scores) / len(final_scores)) if final_scores else 0
+
+    lines = [
+        "# QA Self-Check Report",
+        "",
+        f"**UUID**: `{file_uuid}`",
+        f"**Build**: {build['build_git_hash']}",
+        f"**Timestamp**: {build['timestamp']}",
+        f"**Version**: {build['version']}",
+        "",
+        "---",
+        "",
+    ]
+
+    # Summary table
+    lines.append("## Summary")
+    lines.append("")
+    lines.append("| Metric | Score |")
+    lines.append("|--------|:----:|")
+    lines.append(f"| **Overall** | **{overall}/100** |")
+    for qtype in ["identity", "scene", "object"]:
+        type_scores = by_type.get(qtype, [])
+        if type_scores:
+            avg = round(sum(type_scores) / len(type_scores))
+            lines.append(f"| {qtype.capitalize()} queries | {avg}/100 |")
+    lines.append("")
+
+    # Per-query details
+    lines.append("## Per-Query Details")
+    lines.append("")
+    for r, final in zip(all_results, final_scores):
+        q = r["query"]
+        lines.append(f"### {q['id']}: {q['prompt']}")
+        lines.append("")
+        lines.append(f"| Type: {q['type']} | Status: {r.get('status', 'ok')} |")
+        lines.append("|-----------------|-------------------|")
+        lines.append("")
+
+        # Judges
+        lines.append("| Judge | Score | Reasoning |")
+        lines.append("|-------|:-----:|-----------|")
+        for jr in r.get("judge_results", []):
+            s = jr.get("score")
+            if s is None:
+                s = "-"
+            reasoning = _table_cell(jr.get("reasoning"))
+            lines.append(f"| {jr['agent']} | {s} | {reasoning} |")
+        lines.append(f"| **Weighted** | **{final}** | |")
+        lines.append("")
+
+    lines.append("---")
+    lines.append(f"*Report generated by M5 QA Agent — {build['timestamp']}*")
+
+    # The report contains non-ASCII text, so the encoding is fixed to UTF-8
+    # instead of depending on the platform's locale default.
+    with open(report_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+
+    # JSON output
+    json_output = {
+        "build": build,
+        "file_uuid": file_uuid,
+        "overall_score": overall,
+        "by_type": {t: round(sum(s) / len(s)) for t, s in by_type.items() if s},
+        "queries": all_results
+    }
+    with open(json_path, "w", encoding="utf-8") as f:
+        json.dump(json_output, f, indent=2, ensure_ascii=False, cls=NumpyEncoder)
+
+    print(f"\n  Report: {report_path}")
+    print(f"  JSON:   {json_path}")
+    print(f"  Overall score: {overall}/100")