Files
momentry_core/scripts/qa/judges/gemma4.py

54 lines
1.9 KiB
Python

"""Gemma4 judge: LLM-based evaluation comparing prompt with PaliGemma descriptions + YOLO + MaskFormer"""
import json, urllib.request
# Chat-completions endpoint that call_llm() POSTs its JSON request to.
LLM_URL = "http://localhost:8082/v1/chat/completions"
# Model identifier sent as the "model" field of every request body.
MODEL = "google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
def call_llm(prompt):
    """Send *prompt* to the chat-completions endpoint and return the reply text.

    The request is a JSON body POSTed to ``LLM_URL`` containing ``MODEL``, a
    fixed system message, and *prompt* as the user message. Sampling is fixed
    at ``temperature=0.1`` and ``max_tokens=200``, with streaming disabled.

    Args:
        prompt: Text placed in the user message.

    Returns:
        The value of ``choices[0].message.content`` from the JSON response.

    Raises:
        urllib.error.URLError: If the endpoint cannot be reached or the
            request times out (120 seconds).
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError: If the response JSON lacks the expected
            ``choices[0].message.content`` structure.
    """
    payload = json.dumps({
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a video QA evaluator. Reply only with valid JSON."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.1,
        "max_tokens": 200,
        "stream": False,
    }).encode()
    # Supplying `data` makes urllib issue a POST.
    req = urllib.request.Request(LLM_URL, data=payload, headers={"Content-Type": "application/json"})
    # The context manager closes the HTTP response even when reading or
    # decoding fails, so the connection is never leaked.
    with urllib.request.urlopen(req, timeout=120) as resp:
        body = json.loads(resp.read())
    return body["choices"][0]["message"]["content"]
def score(frames, prompt, context=None):
    """Ask the LLM how well the analysed video matches *prompt*.

    Args:
        frames: Not used by this judge; accepted so the signature stays
            unchanged for existing callers.
        prompt: The expected query the video is rated against.
        context: Optional dict of upstream analysis results. The keys read
            are ``"paligemma"`` (description text), ``"maskformer"`` (a value
            or a list of values; list entries are truncated to 80 characters
            and comma-joined) and ``"yolo"`` (a sequence; only the first 10
            items are shown to the LLM). Missing keys fall back to
            placeholder values, and ``None`` is treated as an empty dict.

    Returns:
        A dict with ``"agent"``, ``"score"``, ``"reasoning"`` and
        ``"details"`` (holding the first 300 characters of the raw LLM
        output). If the reply is not a JSON object, ``score`` is 50 and
        ``reasoning`` is ``"LLM parse error"``.

    Raises:
        Whatever ``call_llm`` raises; transport errors are not caught here.
    """
    # The signature defaults context to None, so guard before calling .get().
    if context is None:
        context = {}
    pali = context.get("paligemma", "No description")
    mask = context.get("maskformer", "unknown")
    yolo = context.get("yolo", [])

    # str() keeps the 80-character truncation from crashing on non-string entries.
    if isinstance(mask, list):
        scene = ', '.join(str(m)[:80] for m in mask)
    else:
        scene = mask

    llm_prompt = f"""You are a video QA evaluator.
Expected query: "{prompt}"
Video analysis:
- PaliGemma description: {pali}
- Scene type (MaskFormer): {scene}
- YOLO objects detected: {yolo[:10]}
Rate how well this video matches the expected query on a scale of 0-100.
0 = completely unrelated, 100 = perfect match.
Reply with ONLY this JSON, no markdown, no explanation: {{"score": N, "reasoning": "brief one-line reason"}}"""
    response = call_llm(llm_prompt)

    # The message content may be null or non-text; normalise it so the
    # slicing below cannot raise.
    if response is None:
        response = ""
    elif not isinstance(response, str):
        response = str(response)

    fallback = {"score": 50, "reasoning": "LLM parse error"}
    try:
        parsed = json.loads(response)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; anything else propagates.
        parsed = fallback
    # Valid JSON that is not an object (e.g. a bare number or list) has no
    # .get(), so it is handled like a parse error.
    if not isinstance(parsed, dict):
        parsed = fallback

    return {
        "agent": "Gemma4",
        "score": parsed.get("score", 50),
        "reasoning": parsed.get("reasoning", response[:200]),
        "details": {"raw_llm_output": response[:300]},
    }