# momentry_core/scripts/regenerate_parent_5w1h.py

#!/opt/homebrew/bin/python3.11
"""
Regenerate parent chunk summaries using 5W1H multi-dimensional structure via gemma4.

5W1H Structure:
- Who: Main characters/people involved
- What: Key actions/events
- When: Temporal context (sequence in story)
- Where: Location/setting
- Why: Motivation/conflict driving the scene
- How: Emotional tone/manner of events
"""

import json
import requests
import psycopg2
import psycopg2.extras

# Keyword arguments unpacked into psycopg2.connect() for every DB connection.
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
# Value matched against parent_chunks.uuid to select the rows to regenerate.
UUID = "384b0ff44aaaa1f1"
# Chat-completions endpoint that call_gemma4() POSTs its prompts to.
LLAMA_URL = "http://127.0.0.1:8081/v1/chat/completions"


def get_parent_with_children():
    """Fetch every parent chunk for ``UUID`` together with its child texts.

    Returns:
        A list of dict-like rows (``RealDictCursor``) ordered by
        ``scene_order``. Each row holds the selected ``parent_chunks``
        columns, the previous summary under ``old_summary``, and
        ``child_texts``: the children's ``text_content`` values ordered by
        the child's ``start_time``. Because the children are LEFT JOINed,
        a parent without children still produces a row whose
        ``child_texts`` contains a single NULL, so callers must filter out
        empty entries.

    Raises:
        psycopg2.Error: if connecting or running the query fails. The
            connection is closed before the error propagates.
    """
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # The cursor context manager closes the cursor even when the query
        # raises; the connection itself is released in the finally clause.
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(
                """
                SELECT pc.id, pc.scene_order, pc.start_time, pc.end_time,
                       pc.start_frame, pc.end_frame, pc.fps, pc.summary_text as old_summary,
                       pc.metadata,
                       ARRAY_AGG(c.text_content ORDER BY c.start_time) as child_texts
                FROM parent_chunks pc
                LEFT JOIN chunks c ON c.parent_chunk_id = pc.id::varchar
                WHERE pc.uuid = %s
                GROUP BY pc.id, pc.scene_order, pc.start_time, pc.end_time,
                         pc.start_frame, pc.end_frame, pc.fps, pc.summary_text, pc.metadata
                ORDER BY pc.scene_order
                """,
                (UUID,),
            )
            return cur.fetchall()
    finally:
        conn.close()


def call_gemma4(prompt, max_tokens=1500):
    """Send ``prompt`` to the llama-server chat endpoint and return the reply.

    Args:
        prompt: Text sent as the single ``user`` message.
        max_tokens: Generation limit forwarded as ``max_tokens``.

    Returns:
        The stripped ``choices[0].message.content`` string. Returns ``""``
        when the request fails, the server answers with a non-200 status,
        the body is not valid JSON, or the payload lacks a string content.
        Failures are reported on stdout and never raised, so a batch run
        can continue with the next scene.
    """
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.3,
        "min_p": 0.1,
    }

    try:
        resp = requests.post(LLAMA_URL, json=payload, timeout=180)
    except requests.RequestException as e:
        print(f"    ⚠️  llama-server error: {e}")
        return ""

    # Previously a non-200 answer fell through silently; surface it so an
    # overloaded or misconfigured server is visible in the run log.
    if resp.status_code != 200:
        print(f"    ⚠️  llama-server error: HTTP {resp.status_code}")
        return ""

    try:
        result = resp.json()
    except ValueError as e:  # body was not valid JSON
        print(f"    ⚠️  llama-server error: {e}")
        return ""

    try:
        choices = result.get("choices") or [{}]
        content = choices[0].get("message", {}).get("content", "")
    except (AttributeError, IndexError, KeyError, TypeError) as e:
        print(f"    ⚠️  llama-server error: unexpected response shape ({e})")
        return ""

    # ``content`` may legitimately be null in the JSON payload.
    if not isinstance(content, str):
        return ""
    return content.strip()


def generate_5w1h_summary(parent, scene_num, total_scenes=17):
    """Ask the model for a 5W1H analysis of one parent chunk.

    Args:
        parent: Row mapping with ``child_texts``, ``start_time`` and
            ``end_time``. Empty or missing child texts are ignored.
        scene_num: Scene number shown to the model in the prompt header.
        total_scenes: Total scene count shown in the prompt header.
            Defaults to 17, the value that used to be hard-coded.

    Returns:
        The decoded JSON object (a dict) from the model's reply, or
        ``None`` when the parent has no usable dialogue, the model returns
        nothing, or the reply contains no parseable JSON object.
    """
    texts = [t for t in (parent["child_texts"] or []) if t]
    if not texts:
        return None

    # Only the first and last three lines are sent to keep the prompt short.
    if len(texts) > 6:
        sample_texts = texts[:3] + ["..."] + texts[-3:]
    else:
        sample_texts = texts
    combined = "\n".join(sample_texts)[:1500]
    duration = parent["end_time"] - parent["start_time"]

    prompt = f"""You are a film scene analyst. Analyze this scene and provide 5W1H analysis.

Scene {scene_num}/{total_scenes} | {duration:.0f}s | {len(texts)} dialogue lines

Key dialogue:
{combined}

Respond with ONLY this JSON:
{{"summary_5lines":"...","who":"...","what":"...","when":"...","where":"...","why":"...","how":"...","characters":[],"tone":[],"key_events":[]}}
IMPORTANT: "summary_5lines" must be EXACTLY 5 lines describing the scene. Each line should be a complete sentence separated by \\n."""

    response = call_gemma4(prompt, max_tokens=2000)
    if not response:
        return None

    # The reply may wrap the JSON in prose or code fences, so take the span
    # from the first "{" to the last "}".
    start = response.find("{")
    end = response.rfind("}") + 1
    if start < 0 or end <= start:
        print("    ⚠️  no JSON object found in model response")
        return None

    try:
        analysis = json.loads(response[start:end])
    except ValueError as e:  # json.JSONDecodeError subclasses ValueError
        print(f"    ⚠️  could not parse model JSON: {e}")
        return None

    return analysis if isinstance(analysis, dict) else None


def update_parent_chunk(parent, analysis):
    """Write a 5W1H analysis onto one ``parent_chunks`` row.

    ``summary_text`` is set to the analysis' ``summary_5lines`` value and
    ``metadata`` is rewritten with ``auto_generated_by``, ``chunk_count``
    and a ``structured_summary`` object holding all 5W1H fields.

    Args:
        parent: Row mapping providing ``id``, ``metadata`` and
            ``child_texts``. It is not modified.
        analysis: Mapping produced by the model. Missing keys fall back to
            ``""`` (text fields) or ``[]`` (list fields).

    Returns:
        ``False`` when ``analysis`` is empty or ``None`` (nothing is
        written), otherwise ``True`` after the update is committed.

    Raises:
        psycopg2.Error: if connecting, updating or committing fails. The
            connection is closed before the error propagates.
    """
    if not analysis:
        return False

    # A null summary from the model becomes an empty string rather than
    # the literal text "None".
    summary = analysis.get("summary_5lines") or ""
    structured_text = f"{summary}"

    # Work on a copy so the caller's row is not mutated as a side effect.
    metadata = dict(parent["metadata"] or {})
    metadata["auto_generated_by"] = "gemma4"
    metadata["chunk_count"] = len(parent["child_texts"] or [])
    metadata["structured_summary"] = {
        "summary_5lines": analysis.get("summary_5lines", ""),
        "who": analysis.get("who", ""),
        "what": analysis.get("what", ""),
        "when": analysis.get("when", ""),
        "where": analysis.get("where", ""),
        "why": analysis.get("why", ""),
        "how": analysis.get("how", ""),
        "characters": analysis.get("characters", []),
        "tone": analysis.get("tone", []),
        "key_events": analysis.get("key_events", []),
    }

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                UPDATE parent_chunks
                SET summary_text = %s,
                    metadata = %s::jsonb
                WHERE id = %s
                """,
                (
                    structured_text,
                    json.dumps(metadata, ensure_ascii=False),
                    parent["id"],
                ),
            )
        conn.commit()
    finally:
        # Closing without a commit discards the pending transaction, so a
        # failed update leaves the row untouched.
        conn.close()
    return True


def _preview(value, limit):
    """Return ``value`` as console text cut to ``limit`` characters."""
    # Model output is untrusted: a field may be null, a number or a list,
    # none of which can be relied on to support string slicing.
    text = "N/A" if value is None else str(value)
    return text[:limit]


def main():
    """Regenerate and store 5W1H summaries for every parent chunk of ``UUID``.

    Loads the parent chunks, asks the model for an analysis of each one,
    prints a short preview, writes successful analyses back to the
    database and finally reports how many rows were updated.
    """
    print(f"🎬 Regenerating 5W1H summaries for {UUID}")
    print(f"   Using llama.cpp server at {LLAMA_URL}")
    print("=" * 70)

    parents = get_parent_with_children()
    print(f"📥 Found {len(parents)} parent chunks")

    success_count = 0
    for parent in parents:
        duration = parent["end_time"] - parent["start_time"]
        text_count = len(parent["child_texts"] or [])
        print(
            f"\n🎬 Scene {parent['scene_order']}: {parent['start_time']:.0f}s-{parent['end_time']:.0f}s ({duration:.0f}s, {text_count} chunks)"
        )
        if parent["old_summary"]:
            print(f"   Old: {parent['old_summary'][:80]}...")

        analysis = generate_5w1h_summary(parent, parent["scene_order"])
        if not analysis:
            print("   ❌ Failed to generate analysis")
            continue

        print(f"   ✅ Summary: {_preview(analysis.get('summary_5lines', 'N/A'), 100)}...")
        print(f"   👤 Who: {_preview(analysis.get('who', 'N/A'), 60)}")
        print(f"   📍 Where: {_preview(analysis.get('where', 'N/A'), 60)}")
        print(f"   💡 Why: {_preview(analysis.get('why', 'N/A'), 60)}")

        if update_parent_chunk(parent, analysis):
            success_count += 1

    print(f"\n{'=' * 70}")
    print(
        f"✅ Updated {success_count}/{len(parents)} parent chunks with 5W1H summaries"
    )

# Run the regeneration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()