"""Gemma4 judge: LLM-based evaluation comparing prompt with PaliGemma descriptions + YOLO + MaskFormer""" import json, urllib.request LLM_URL = "http://localhost:8082/v1/chat/completions" MODEL = "google_gemma-4-26B-A4B-it-Q5_K_M.gguf" def call_llm(prompt): data = json.dumps({ "model": MODEL, "messages": [ {"role": "system", "content": "You are a video QA evaluator. Reply only with valid JSON."}, {"role": "user", "content": prompt} ], "temperature": 0.1, "max_tokens": 200, "stream": False }).encode() req = urllib.request.Request(LLM_URL, data=data, headers={"Content-Type": "application/json"}) resp = urllib.request.urlopen(req, timeout=120) return json.loads(resp.read())["choices"][0]["message"]["content"] def score(frames, prompt, context=None): """ context: dict with paligemma_desc, yolo_objects, maskformer_type, etc. """ pali = context.get("paligemma", "No description") mask = context.get("maskformer", "unknown") yolo = context.get("yolo", []) llm_prompt = f"""You are a video QA evaluator. Expected query: "{prompt}" Video analysis: - PaliGemma description: {pali} - Scene type (MaskFormer): {', '.join(m[:80] for m in mask) if isinstance(mask, list) else mask} - YOLO objects detected: {yolo[:10]} Rate how well this video matches the expected query on a scale of 0-100. 0 = completely unrelated, 100 = perfect match. Reply with ONLY this JSON, no markdown, no explanation: {{"score": N, "reasoning": "brief one-line reason"}}""" response = call_llm(llm_prompt) try: parsed = json.loads(response) except: parsed = {"score": 50, "reasoning": "LLM parse error"} return { "agent": "Gemma4", "score": parsed.get("score", 50), "reasoning": parsed.get("reasoning", response[:200]), "details": {"raw_llm_output": response[:300]} }