# momentry_core/scripts/qa/judges/gemma4.py

"""Gemma4 judge: LLM-based evaluation comparing prompt with PaliGemma descriptions + YOLO + MaskFormer"""
import json, urllib.request

# Chat-completions endpoint that call_llm() posts its requests to.
LLM_URL = "http://localhost:8082/v1/chat/completions"
# Model identifier sent in the "model" field of every request payload.
MODEL = "google_gemma-4-26B-A4B-it-Q5_K_M.gguf"

def call_llm(prompt):
    """Send ``prompt`` to the chat-completions endpoint and return the reply text.

    The request carries a fixed system message asking for JSON-only output,
    uses a temperature of 0.1, caps the reply at 200 tokens and disables
    streaming.

    Args:
        prompt: Text sent as the user message.

    Returns:
        The ``content`` field of the first choice's message in the response.

    Raises:
        Exceptions from ``urllib.request.urlopen`` (connection failures,
        HTTP errors, the 120 s timeout) and from decoding the response
        propagate unchanged, as do ``KeyError``/``IndexError`` when the
        response JSON lacks ``choices[0]["message"]["content"]``.
    """
    payload = json.dumps({
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "You are a video QA evaluator. Reply only with valid JSON."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.1,
        "max_tokens": 200,
        "stream": False
    }).encode()
    req = urllib.request.Request(LLM_URL, data=payload, headers={"Content-Type": "application/json"})
    # Use a context manager so the HTTP response (and its socket) is closed
    # deterministically, even if reading or decoding the body fails.
    with urllib.request.urlopen(req, timeout=120) as resp:
        body = json.loads(resp.read())
    return body["choices"][0]["message"]["content"]

def score(frames, prompt, context=None):
    """Ask the LLM how well the analysed video matches ``prompt``.

    Args:
        frames: Not used by this judge.
        prompt: The expected query the video is compared against.
        context: Optional dict of upstream analysis results. Keys read:
            ``"paligemma"`` (description; default ``"No description"``),
            ``"maskformer"`` (a value or a list of values; list entries are
            truncated to 80 characters and comma-joined; default
            ``"unknown"``) and ``"yolo"`` (sliceable collection, only the
            first 10 entries are shown; default ``[]``).
            ``None`` is treated as an empty dict.

    Returns:
        A dict with ``"agent"`` (always ``"Gemma4"``), ``"score"``,
        ``"reasoning"`` and ``"details"`` (the first 300 characters of the
        raw LLM reply). If the reply is not a JSON object, the score is 50
        and the reasoning is ``"LLM parse error"``.
    """
    # The signature advertises context=None, so tolerate it instead of
    # failing on None.get(...).
    context = context or {}
    pali = context.get("paligemma", "No description")
    mask = context.get("maskformer", "unknown")
    yolo = context.get("yolo", [])

    # Lists are rendered as a comma-separated line with each entry truncated;
    # str() keeps non-string entries from breaking the slice/join.
    if isinstance(mask, list):
        scene = ', '.join(str(m)[:80] for m in mask)
    else:
        scene = mask

    llm_prompt = f"""You are a video QA evaluator.
Expected query: "{prompt}"

Video analysis:
- PaliGemma description: {pali}
- Scene type (MaskFormer): {scene}
- YOLO objects detected: {yolo[:10]}

Rate how well this video matches the expected query on a scale of 0-100.
0 = completely unrelated, 100 = perfect match.
Reply with ONLY this JSON, no markdown, no explanation: {{"score": N, "reasoning": "brief one-line reason"}}"""

    response = call_llm(llm_prompt)
    # The reply is sliced below, so make sure it is a string.
    if not isinstance(response, str):
        response = "" if response is None else str(response)

    fallback = {"score": 50, "reasoning": "LLM parse error"}
    try:
        parsed = json.loads(response)
    except (ValueError, TypeError):
        # json.JSONDecodeError is a ValueError; anything else (e.g.
        # KeyboardInterrupt) must not be swallowed here.
        parsed = fallback
    else:
        # Valid JSON that is not an object (e.g. a bare number or a list)
        # has no .get(); treat it like an unparseable reply.
        if not isinstance(parsed, dict):
            parsed = fallback

    return {
        "agent": "Gemma4",
        "score": parsed.get("score", 50),
        "reasoning": parsed.get("reasoning", response[:200]),
        "details": {"raw_llm_output": response[:300]}
    }