54 lines
1.9 KiB
Python
54 lines
1.9 KiB
Python
"""Gemma4 judge: LLM-based evaluation comparing prompt with PaliGemma descriptions + YOLO + MaskFormer"""
|
|
import json, urllib.request
|
|
|
|
# Chat-completions endpoint that call_llm sends its requests to.
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
|
# Model name placed in the "model" field of each request payload.
MODEL = "google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
|
|
|
|
def call_llm(prompt):
    """Send ``prompt`` to the chat-completions endpoint and return the reply text.

    Builds a two-message conversation (a fixed system instruction asking for
    JSON-only replies, followed by ``prompt`` as the user turn) and POSTs it
    as JSON to ``LLM_URL`` using the model named by ``MODEL``. The request is
    non-streaming, with temperature 0.1, at most 200 tokens and a 120 second
    timeout.

    Args:
        prompt: Text placed in the user message.

    Returns:
        The ``content`` field of the first choice's message in the response.

    Raises:
        urllib.error.URLError: If the request cannot be completed (a timeout
            may also surface as a timeout error from the socket layer).
        ValueError: If the response body is not valid JSON.
        KeyError, IndexError: If the response JSON lacks the
            ``choices[0].message.content`` path.
    """
    payload = json.dumps({
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a video QA evaluator. Reply only with valid JSON."},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.1,
        "max_tokens": 200,
        "stream": False,
    }).encode()
    req = urllib.request.Request(
        LLM_URL,
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    # Use the response as a context manager so the connection is closed
    # deterministically, even if reading or decoding the body raises.
    with urllib.request.urlopen(req, timeout=120) as resp:
        body = json.loads(resp.read())
    return body["choices"][0]["message"]["content"]
|
|
|
|
def score(frames, prompt, context=None):
    """Ask the LLM how well the analysed video matches ``prompt``.

    Args:
        frames: Not used by this function; retained so the signature is
            unchanged for callers.
        prompt: The expected query the video is judged against.
        context: Optional dict of upstream analysis results. The keys read
            are ``"paligemma"`` (description text), ``"maskformer"`` (a string,
            or a list of strings each truncated to 80 characters) and
            ``"yolo"`` (a sliceable sequence; only the first 10 entries are
            sent). Missing keys, or ``context=None``, fall back to
            placeholder values.

    Returns:
        dict with ``"agent"`` (always ``"Gemma4"``), ``"score"``,
        ``"reasoning"`` and ``"details"`` (holding the first 300 characters of
        the raw LLM output). When no JSON object can be recovered from the
        reply, the score is 50 and the reasoning is ``"LLM parse error"``.
    """
    # The default is None, so normalise before any .get() call.
    context = context or {}
    pali = context.get("paligemma", "No description")
    mask = context.get("maskformer", "unknown")
    yolo = context.get("yolo", [])

    scene = ", ".join(m[:80] for m in mask) if isinstance(mask, list) else mask

    llm_prompt = f"""You are a video QA evaluator.
Expected query: "{prompt}"

Video analysis:
- PaliGemma description: {pali}
- Scene type (MaskFormer): {scene}
- YOLO objects detected: {yolo[:10]}

Rate how well this video matches the expected query on a scale of 0-100.
0 = completely unrelated, 100 = perfect match.
Reply with ONLY this JSON, no markdown, no explanation: {{"score": N, "reasoning": "brief one-line reason"}}"""

    response = call_llm(llm_prompt)
    # A null/non-string reply must not break the slicing further down.
    raw = "" if response is None else str(response)

    # Try the reply verbatim first; if that fails, fall back to the outermost
    # {...} span so replies wrapped in markdown fences or surrounded by prose
    # still yield their JSON object.
    candidates = [raw]
    start, end = raw.find("{"), raw.rfind("}")
    if 0 <= start < end:
        candidates.append(raw[start:end + 1])

    parsed = None
    for text in candidates:
        try:
            obj = json.loads(text)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # Only a JSON object supports the key lookups below.
        if isinstance(obj, dict):
            parsed = obj
            break
    if parsed is None:
        parsed = {"score": 50, "reasoning": "LLM parse error"}

    return {
        "agent": "Gemma4",
        "score": parsed.get("score", 50),
        "reasoning": parsed.get("reasoning", raw[:200]),
        "details": {"raw_llm_output": raw[:300]},
    }
|