feat: QA self-check agent — 15 prompts, 5 judges, weighted scoring

2026-05-14 10:53:30 +08:00
parent 2b633174b9
commit f60a59b280
10 changed files with 875 additions and 0 deletions
--- a/scripts/qa/judges/gemma4.py
+++ b/scripts/qa/judges/gemma4.py
@@ -0,0 +1,53 @@
+"""Gemma4 judge: LLM-based evaluation comparing prompt with PaliGemma descriptions + YOLO + MaskFormer"""
+import json, urllib.request
+
+# Chat-completions endpoint that call_llm() POSTs its JSON request to.
+LLM_URL = "http://localhost:8082/v1/chat/completions"
+# Sent as the "model" field of every request built by call_llm().
+MODEL = "google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
+
+def call_llm(prompt):
+    data = json.dumps({
+        "model": MODEL,
+        "messages": [
+            {"role": "system", "content": "You are a video QA evaluator. Reply only with valid JSON."},
+            {"role": "user", "content": prompt}
+        ],
+        "temperature": 0.1,
+        "max_tokens": 200,
+        "stream": False
+    }).encode()
+    req = urllib.request.Request(LLM_URL, data=data, headers={"Content-Type": "application/json"})
+    resp = urllib.request.urlopen(req, timeout=120)
+    return json.loads(resp.read())["choices"][0]["message"]["content"]
+
+def score(frames, prompt, context=None):
+    """
+    context: dict with paligemma_desc, yolo_objects, maskformer_type, etc.
+    """
+    pali = context.get("paligemma", "No description")
+    mask = context.get("maskformer", "unknown")
+    yolo = context.get("yolo", [])
+    
+    llm_prompt = f"""You are a video QA evaluator.
+Expected query: "{prompt}"
+
+Video analysis:
+- PaliGemma description: {pali}
+- Scene type (MaskFormer): {', '.join(m[:80] for m in mask) if isinstance(mask, list) else mask}
+- YOLO objects detected: {yolo[:10]}
+
+Rate how well this video matches the expected query on a scale of 0-100.
+0 = completely unrelated, 100 = perfect match.
+Reply ONLY with JSON: {{"score": N, "reasoning": "brief one-line reason"}}"""
+    
+    response = call_llm(llm_prompt)
+    try:
+        parsed = json.loads(response)
+    except:
+        parsed = {"score": 50, "reasoning": "LLM parse error"}
+    
+    return {
+        "agent": "Gemma4",
+        "score": parsed.get("score", 50),
+        "reasoning": parsed.get("reasoning", response[:200]),
+        "details": {"raw_llm_output": response[:300]}
+    }