feat: QA self-check agent — 15 prompts, 5 judges, weighted scoring
This commit is contained in:
53
scripts/qa/judges/gemma4.py
Normal file
53
scripts/qa/judges/gemma4.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""Gemma4 judge: LLM-based evaluation comparing prompt with PaliGemma descriptions + YOLO + MaskFormer"""
|
||||
import json, urllib.request
|
||||
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
MODEL = "google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
|
||||
|
||||
def call_llm(prompt):
    """Send *prompt* to the chat-completions endpoint and return the reply text.

    Builds a two-message conversation (a fixed system instruction followed by
    *prompt* as the user turn), POSTs it as JSON to ``LLM_URL`` for ``MODEL``
    and returns the ``content`` of the first choice in the decoded response.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The value of ``choices[0]["message"]["content"]`` from the response.

    Raises:
        urllib.error.URLError: If the request fails (``urlopen`` may also
            raise other ``OSError`` subclasses, e.g. on timeout).
        json.JSONDecodeError: If the response body is not valid JSON.
        KeyError, IndexError: If the response lacks the expected structure.
    """
    payload = json.dumps({
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a video QA evaluator. Reply only with valid JSON."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.1,
        "max_tokens": 200,
        "stream": False,
    }).encode()
    req = urllib.request.Request(
        LLM_URL,
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    # The context manager closes the HTTP response (and its socket) even when
    # reading fails; the original left it open until garbage collection.
    with urllib.request.urlopen(req, timeout=120) as resp:
        body = resp.read()
    return json.loads(body)["choices"][0]["message"]["content"]
|
||||
|
||||
def _parse_verdict(response):
    """Return the JSON object contained in *response*, or ``None`` if absent.

    Tries the whole reply first, then the span from the first ``{`` to the
    last ``}`` so that replies wrapped in markdown fences or surrounded by
    extra prose are still understood. Only a JSON object (dict) is accepted.
    """
    candidates = [response]
    start, end = response.find("{"), response.rfind("}")
    if 0 <= start < end:
        candidates.append(response[start:end + 1])
    for text in candidates:
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def _coerce_score(value, default=50):
    """Return *value* as a number clamped to 0-100, or *default* if unusable."""
    if value is None or isinstance(value, bool):
        return default
    if isinstance(value, str):
        try:
            value = float(value.strip())
        except ValueError:
            return default
    # ``value != value`` is true only for NaN, which json.loads accepts.
    if not isinstance(value, (int, float)) or value != value:
        return default
    return max(0, min(100, value))


def score(frames, prompt, context=None):
    """Ask the LLM how well the analysed video matches *prompt*.

    Args:
        frames: Not used by this judge.
            NOTE(review): presumably kept so all judges share one signature.
        prompt: The expected query the video is compared against.
        context: Optional dict of upstream analysis results. The keys read
            here are ``"paligemma"`` (description, default
            ``"No description"``), ``"maskformer"`` (scene type, a string or a
            list of strings, default ``"unknown"``) and ``"yolo"`` (detected
            objects, default ``[]``). ``None`` is treated as an empty dict.

    Returns:
        A dict with ``"agent"`` (always ``"Gemma4"``), ``"score"`` (number in
        0-100; 50 when the reply cannot be parsed or carries no usable
        score), ``"reasoning"`` and ``"details"`` holding the first 300
        characters of the raw LLM output.

    Raises:
        Whatever ``call_llm`` raises (network or response-format errors);
        those are not swallowed here.
    """
    # The signature defaults context to None, so it must not be dereferenced blindly.
    context = context or {}
    pali = context.get("paligemma", "No description")
    mask = context.get("maskformer", "unknown")
    yolo = context.get("yolo", [])

    # Each list entry is capped at 80 characters; str() guards non-string entries.
    if isinstance(mask, list):
        scene_text = ', '.join(str(m)[:80] for m in mask)
    else:
        scene_text = mask
    # Only the first 10 detections are shown; non-sequences are passed through.
    yolo_text = yolo[:10] if isinstance(yolo, (list, tuple, str)) else yolo

    llm_prompt = f"""You are a video QA evaluator.
Expected query: "{prompt}"

Video analysis:
- PaliGemma description: {pali}
- Scene type (MaskFormer): {scene_text}
- YOLO objects detected: {yolo_text}

Rate how well this video matches the expected query on a scale of 0-100.
0 = completely unrelated, 100 = perfect match.
Reply ONLY with JSON: {{"score": N, "reasoning": "brief one-line reason"}}"""

    response = call_llm(llm_prompt)
    if not isinstance(response, str):
        # A null/non-text reply would otherwise break the slicing below.
        response = "" if response is None else str(response)

    parsed = _parse_verdict(response)
    if parsed is None:
        # Neutral fallback so one unreadable reply does not skew the aggregate.
        parsed = {"score": 50, "reasoning": "LLM parse error"}

    return {
        "agent": "Gemma4",
        "score": _coerce_score(parsed.get("score", 50)),
        "reasoning": parsed.get("reasoning", response[:200]),
        "details": {"raw_llm_output": response[:300]},
    }
|
||||
Reference in New Issue
Block a user