# momentry_core/scripts/qa/scorer.py

"""Scorer: Weighted aggregate all judge scores → report"""
import json, os
from datetime import datetime
import subprocess
import numpy as np


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    # (numpy abstract scalar type, built-in constructor) pairs, tried in order.
    _SCALAR_CASTS = (
        (np.integer, int),
        (np.floating, float),
        (np.bool_, bool),
    )

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*.

        NumPy integers, floats and booleans become the matching Python
        scalar, ndarrays become nested lists; anything else is delegated to
        the base class, which raises TypeError.
        """
        for numpy_kind, cast in self._SCALAR_CASTS:
            if isinstance(obj, numpy_kind):
                return cast(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

# Directory that generate_report() writes qa_report.md and qa_report.json into.
# NOTE(review): absolute, machine-specific path — confirm it is valid on the
# host where this script runs.
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"

# Per-judge weights consumed by aggregate(). The values listed here sum to 1.0.
# Agents not listed fall back to a weight of 0.1 inside aggregate(), which
# divides by the total weight actually used, so the result stays a weighted mean.
WEIGHTS = {
    "Gemma4": 0.35,
    "PaliGemma": 0.25,
    "YOLO": 0.15,
    "MaskFormer": 0.15,
    "GroundingDINO": 0.05,
    "FaceNet": 0.05,
}

def get_build_info():
    """Return build metadata for the report header.

    Returns:
        A dict with three keys:
          * "build_git_hash": output of ``git rev-parse --short HEAD`` run in
            the parent of this file's directory, or "unknown" when git is
            missing, times out, exits non-zero or prints nothing.
          * "timestamp": local time formatted as ``%Y-%m-%d %H:%M:%S``.
          * "version": the fixed string "1.0.0".
    """
    git_hash = "unknown"
    try:
        proc = subprocess.run(
            ["git", "rev-parse", "--short", "HEAD"],
            capture_output=True, text=True, timeout=5,
            cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        )
    except (OSError, subprocess.SubprocessError):
        # Best effort only: git not installed, cwd unusable, or timeout.
        # Narrow on purpose so KeyboardInterrupt/SystemExit still propagate.
        pass
    else:
        # A non-zero exit (e.g. not a git repository) leaves stdout empty;
        # report that as "unknown" rather than an empty hash.
        short_hash = proc.stdout.strip()
        if proc.returncode == 0 and short_hash:
            git_hash = short_hash
    return {
        "build_git_hash": git_hash,
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "version": "1.0.0",
    }

def compute_scores(judge_results):
    """Map each judge's agent name to its score.

    A judge whose "score" is missing or None (a non-numeric judge) is given
    the neutral placeholder 50. When an agent appears more than once, the
    last entry wins.
    """
    placeholder = 50  # stand-in for judges that report no numeric score
    return {
        entry["agent"]: placeholder if entry.get("score") is None else entry["score"]
        for entry in judge_results
    }

def aggregate(scores, weights=None, default_weight=0.1):
    """Return the weighted mean of per-judge scores, rounded to an integer.

    Args:
        scores: Mapping of agent name -> numeric score. Entries whose score
            is None are skipped and do not contribute any weight.
        weights: Optional mapping of agent name -> weight. When omitted, the
            module-level WEIGHTS table is used.
        default_weight: Weight applied to agents absent from ``weights``.

    Returns:
        ``round(sum(w * score) / sum(w))`` over the scored agents, or 0 when
        the total weight is not positive (e.g. no scored agents).
    """
    if weights is None:
        weights = WEIGHTS
    weighted_sum = 0
    total_weight = 0
    for agent, score in scores.items():
        if score is None:
            continue
        w = weights.get(agent, default_weight)
        weighted_sum += w * score
        total_weight += w
    if total_weight <= 0:
        return 0
    # Dividing by the weight actually used keeps this a true weighted mean
    # even when only a subset of the judges produced a score.
    return round(weighted_sum / total_weight)

def _scored_judges(result):
    """Map agent name -> score for every judge in *result* that reported one."""
    return {
        jr["agent"]: jr["score"]
        for jr in result.get("judge_results", [])
        if jr.get("score") is not None
    }


def _table_cell(text, limit=80):
    """Clip *text* to *limit* characters and keep it on one Markdown table row."""
    clipped = ("" if text is None else str(text))[:limit]
    # Newlines end the row and a bare pipe starts a new column.
    return clipped.replace("\r", " ").replace("\n", " ").replace("|", "\\|")


def generate_report(all_results, file_uuid):
    """Generate qa_report.md + qa_report.json under OUTPUT_DIR.

    Args:
        all_results: Sequence of per-query result dicts. Each must carry
            ``r["query"]`` with "id", "prompt" and "type". "judge_results"
            (a list of dicts with "agent" and optional "score" and
            "reasoning") and "status" are optional.
        file_uuid: Identifier echoed into both reports.

    Side effects:
        Creates OUTPUT_DIR if it is missing, writes both files as UTF-8 and
        prints their paths plus the overall score. Returns None.
    """
    build = get_build_info()
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    report_path = os.path.join(OUTPUT_DIR, "qa_report.md")
    json_path = os.path.join(OUTPUT_DIR, "qa_report.json")

    # Aggregate each query exactly once; the summary and the per-query
    # section both read from these values. A query with no scored judge
    # aggregates to 0 and still counts towards the averages.
    final_scores = [aggregate(_scored_judges(r)) for r in all_results]
    by_type = {}
    for r, final in zip(all_results, final_scores):
        by_type.setdefault(r["query"]["type"], []).append(final)

    overall = round(sum(final_scores) / len(final_scores)) if final_scores else 0
    type_averages = {t: round(sum(v) / len(v)) for t, v in by_type.items() if v}

    lines = [
        "# QA Self-Check Report",
        "",
        f"**UUID**: `{file_uuid}`",
        f"**Build**: {build['build_git_hash']}",
        f"**Timestamp**: {build['timestamp']}",
        f"**Version**: {build['version']}",
        "",
        "---",
        "",
        "## Summary",
        "",
        "| Metric | Score |",
        "|--------|:----:|",
        f"| **Overall** | **{overall}/100** |",
    ]

    # Well-known types keep their fixed order; any other type found in the
    # results follows, so the Markdown summary matches the JSON "by_type".
    known_types = ["identity", "scene", "object"]
    ordered_types = [t for t in known_types if t in type_averages]
    ordered_types += [t for t in type_averages if t not in known_types]
    for qtype in ordered_types:
        label = str(qtype).capitalize()
        lines.append(f"| {label} queries | {type_averages[qtype]}/100 |")
    lines.append("")

    # Per-query details
    lines.append("## Per-Query Details")
    lines.append("")
    for r, final in zip(all_results, final_scores):
        q = r["query"]
        lines.append(f"### {q['id']}: {q['prompt']}")
        lines.append("")
        lines.append(f"| Type: {q['type']} | Status: {r.get('status', 'ok')} |")
        lines.append("|-----------------|-------------------|")
        lines.append("")

        # Judges
        lines.append("| Judge | Score | Reasoning |")
        lines.append("|-------|:-----:|-----------|")
        for jr in r.get("judge_results", []):
            score = jr.get("score")
            shown = "-" if score is None else score
            reasoning = _table_cell(jr.get("reasoning"))
            lines.append(f"| {jr['agent']} | {shown} | {reasoning} |")
        lines.append(f"| **Weighted** | **{final}** | |")
        lines.append("")

    lines.append("---")
    lines.append(f"*Report generated by M5 QA Agent — {build['timestamp']}*")

    # Explicit UTF-8: the report contains non-ASCII text and the JSON is
    # written with ensure_ascii=False, so the platform default is not safe.
    with open(report_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))

    # JSON output
    json_output = {
        "build": build,
        "file_uuid": file_uuid,
        "overall_score": overall,
        "by_type": type_averages,
        "queries": all_results,
    }
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(json_output, f, indent=2, ensure_ascii=False, cls=NumpyEncoder)

    print(f"\n  Report: {report_path}")
    print(f"  JSON:   {json_path}")
    print(f"  Overall score: {overall}/100")