feat: QA self-check agent — 15 prompts, 5 judges, weighted scoring

2026-05-14 10:53:30 +08:00
parent 2b633174b9
commit f60a59b280
10 changed files with 875 additions and 0 deletions
--- a/scripts/qa/pipeline.py
+++ b/scripts/qa/pipeline.py
@@ -0,0 +1,160 @@
+#!/opt/homebrew/bin/python3.11
+"""
+M5 QA Self-Check Agent
+Usage: python3 pipeline.py --uuid aeed71342a899fe4b4c57b7d41bcb692
+"""
+import sys, os, argparse
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "judges"))
+
+from query_generator import generate
+from executor import execute
+from scorer import aggregate, generate_report
+
+# Import judges
+from judges import paligemma, gdino, maskformer, yolo, facenet, gemma4
+
+# Relative weight of each judge, keyed by the "agent" name that the judge
+# results in run_judges() carry. The six values add up to 1.00.
+# NOTE(review): nothing in this module reads JUDGE_WEIGHTS; presumably the
+# scoring step consumes it (or keeps its own copy) -- confirm against scorer.
+JUDGE_WEIGHTS = {
+    "PaliGemma": 0.25,
+    "Gemma4": 0.35,
+    "MaskFormer": 0.15,
+    "YOLO": 0.15,
+    "GroundingDINO": 0.05,  # run_judges() currently emits a fixed placeholder for this judge
+    "FaceNet": 0.05,
+}
+
+def _run_judge(qid, agent, call, error_label="Judge error"):
+    """Run one judge, print its progress line, and never let it raise.
+
+    Args:
+        qid: Query id; used only as the prefix of the progress line.
+        agent: Judge name shown in the progress line and stored under
+            ``"agent"`` in the placeholder result.
+        call: Zero-argument callable that performs the actual scoring.
+        error_label: Prefix of the ``"reasoning"`` text in the placeholder.
+
+    Returns:
+        A ``(result, succeeded)`` tuple. When ``call`` raises, ``result`` is a
+        placeholder dict with score 50 and the first 60 characters of the
+        error message, and ``succeeded`` is ``False``.
+    """
+    print(f"  [{qid}] {agent}...", end="", flush=True)
+    try:
+        outcome = call()
+    except Exception as e:  # deliberate best-effort: one judge must not abort the run
+        detail = str(e)[:60]
+        print(f" ERROR: {detail}")
+        placeholder = {
+            "agent": agent,
+            "score": 50,
+            "reasoning": f"{error_label}: {detail}",
+            "details": {},
+        }
+        return placeholder, False
+    print(" done")
+    return outcome, True
+
+
+def _first_frame_detections(yolo_result):
+    """Return the ``"found"`` entry of the first frame in a YOLO result.
+
+    Yields ``[]`` when the result is not a dict, has no ``details``/``frames``
+    entry, or the frame list is empty. Indexing ``[0]`` unconditionally raised
+    ``IndexError`` on an empty list and cost Gemma4 its whole evaluation.
+    """
+    if not isinstance(yolo_result, dict):
+        return []
+    per_frame = (yolo_result.get("details") or {}).get("frames") or [{}]
+    first = per_frame[0]
+    return first.get("found", []) if isinstance(first, dict) else []
+
+
+def run_judges(query, result, file_uuid):
+    """Run all judges on the extracted frames and prompt.
+
+    Args:
+        query: Mapping with at least ``"prompt"`` and ``"id"`` keys.
+        result: Execution result; its ``"frames"`` entry is handed to each judge.
+        file_uuid: Accepted for call-site compatibility; not used here.
+
+    Returns:
+        A list of judge result dicts in the fixed order PaliGemma, YOLO,
+        MaskFormer, GroundingDINO, FaceNet, Gemma4, or ``[]`` when there are
+        no frames. A judge that raises contributes a placeholder with score 50.
+    """
+    frames = result.get("frames", [])
+    prompt = query["prompt"]
+    qid = query["id"]
+
+    if not frames:
+        print(f"  [{qid}] No frames to judge")
+        return []
+
+    # PaliGemma runs first: its reasoning text is forwarded to Gemma4 below.
+    pg_result, _ = _run_judge(qid, "PaliGemma", lambda: paligemma.score(frames, prompt))
+    yo_result, yo_ok = _run_judge(qid, "YOLO", lambda: yolo.score(frames, prompt))
+    mf_result, mf_ok = _run_judge(qid, "MaskFormer", lambda: maskformer.score(frames, prompt))
+
+    # Grounding DINO is intentionally not executed (too slow per-video search);
+    # a fixed placeholder keeps one entry per judge in the result list.
+    gd_result = {
+        "agent": "GroundingDINO",
+        "score": 50,
+        "reasoning": "Skipped for performance",
+        "details": {},
+    }
+
+    # FaceNet failures now record the error text like every other judge.
+    fn_result, _ = _run_judge(qid, "FaceNet", lambda: facenet.score(frames, prompt))
+
+    results = [pg_result, yo_result, mf_result, gd_result, fn_result]
+
+    def _score_gemma4():
+        # Built inside the callable so a malformed upstream result is reported
+        # as a Gemma4 failure instead of escaping run_judges().
+        pali_text = next(
+            (r.get("reasoning", "") for r in results if r.get("agent") == "PaliGemma"),
+            "",
+        )
+        ctx = {
+            "paligemma": pali_text,
+            # MaskFormer/YOLO context is forwarded only when that judge succeeded.
+            "maskformer": mf_result.get("reasoning", "") if mf_ok else "",
+            "yolo": _first_frame_detections(yo_result) if yo_ok else [],
+        }
+        return gemma4.score(frames, prompt, context=ctx)
+
+    # Gemma4 runs last because it uses context from the other judges.
+    gm_result, _ = _run_judge(qid, "Gemma4", _score_gemma4, error_label="LLM error")
+    results.append(gm_result)
+
+    return results
+
+
+def main():
+    """Command-line entry point for the QA self-check run.
+
+    Parses the required ``--uuid`` argument and drives four phases in order:
+    query generation, query execution, judging, and report generation.
+    Progress is written to stdout; nothing is returned.
+    """
+    cli = argparse.ArgumentParser(description="QA Self-Check Agent")
+    cli.add_argument("--uuid", required=True, help="File UUID")
+    file_uuid = cli.parse_args().uuid
+
+    print("=== QA Self-Check Agent ===")
+    print(f"UUID: {file_uuid}")
+    print()
+
+    # Phase 1: build the test queries for this file and list them.
+    print("=== Phase 1: Generating queries ===")
+    queries = generate(file_uuid)
+    print(f"  Generated {len(queries)} queries:")
+    for query in queries:
+        summary = f"  {query['id']} [{query['type']:>7}] {query['prompt'][:60]}"
+        print(summary)
+    print()
+
+    # Phase 2: execute every query (API search + video download + frame extraction).
+    print("=== Phase 2: Executing queries ===")
+    results = [execute(query, file_uuid) for query in queries]
+    print()
+
+    # Phase 3: judge only the executions that succeeded and produced frames.
+    print("=== Phase 3: Running judges ===")
+    for entry in results:
+        if entry.get("status") == "ok" and entry.get("frames"):
+            entry["judge_results"] = run_judges(entry["query"], entry, file_uuid)
+        else:
+            print(f"  [{entry['query']['id']}] Skipped (no video/frames)")
+            entry["judge_results"] = []
+
+    # Phase 4: drop frame payloads, then hand everything to the report writer.
+    print()
+    print("=== Phase 4: Generating report ===")
+    for entry in results:
+        # Frames are treated as non-serializable data and removed before reporting.
+        entry.pop("frames", None)
+        for verdict in entry.get("judge_results", []):
+            details = verdict.get("details", {})
+            # Judge details may carry per-frame data too (image objects, per the
+            # original author's note); remove it as well.
+            if "frames" in details:
+                verdict["details"].pop("frames")
+    generate_report(results, file_uuid)
+    print()
+    print("=== Done ===")
+
+
+# Run the pipeline only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()