#!/opt/homebrew/bin/python3.11
"""
Regenerate parent chunk summaries using 5W1H multi-dimensional structure via gemma4.

5W1H Structure:
- Who: Main characters/people involved
- What: Key actions/events
- When: Temporal context (sequence in story)
- Where: Location/setting
- Why: Motivation/conflict driving the scene
- How: Emotional tone/manner of events
"""

import json
from contextlib import closing

import psycopg2
import psycopg2.extras
import requests

DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
UUID = "384b0ff44aaaa1f1"
LLAMA_URL = "http://127.0.0.1:8081/v1/chat/completions"

# Keys copied from the model's JSON answer into metadata["structured_summary"].
# Text-like keys default to "" and list-like keys default to [] when missing.
_TEXT_FIELDS = ("summary_5lines", "who", "what", "when", "where", "why", "how")
_LIST_FIELDS = ("characters", "tone", "key_events")


def _as_text(value, default=""):
    """Coerce a model-provided JSON value to a string that is safe to slice/print.

    The model is asked for strings but may return null, a list of lines or a
    number; ``None`` maps to *default*, lists are joined with newlines and
    anything else goes through ``str()``.
    """
    if value is None:
        return default
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        return "\n".join(str(item) for item in value)
    return str(value)


def get_parent_with_children():
    """Get all parent chunks with their child chunk texts.

    Returns:
        A list of dict-like rows (``RealDictCursor``) for the parent chunks
        whose ``uuid`` equals ``UUID``, ordered by ``scene_order``. Each row
        carries the parent's timing/frame columns, ``old_summary``,
        ``metadata`` and ``child_texts`` (child ``text_content`` values
        ordered by child ``start_time``).

    Raises:
        psycopg2.Error: if connecting or querying fails. The connection and
            cursor are closed in that case as well.
    """
    # closing() guarantees conn.close() even when the query raises; psycopg2's
    # own "with conn:" only ends the transaction and leaves the connection open.
    with closing(psycopg2.connect(**DB_CONFIG)) as conn:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(
                """
                SELECT pc.id, pc.scene_order, pc.start_time, pc.end_time,
                       pc.start_frame, pc.end_frame, pc.fps,
                       pc.summary_text as old_summary, pc.metadata,
                       ARRAY_AGG(c.text_content ORDER BY c.start_time) as child_texts
                FROM parent_chunks pc
                LEFT JOIN chunks c ON c.parent_chunk_id = pc.id::varchar
                WHERE pc.uuid = %s
                GROUP BY pc.id, pc.scene_order, pc.start_time, pc.end_time,
                         pc.start_frame, pc.end_frame, pc.fps,
                         pc.summary_text, pc.metadata
                ORDER BY pc.scene_order
                """,
                (UUID,),
            )
            return cur.fetchall()


def call_gemma4(prompt, max_tokens=1500):
    """Call Gemma4 via llama-server OpenAI-compatible API.

    Args:
        prompt: User message sent as the only chat message.
        max_tokens: Completion token limit forwarded to the server.

    Returns:
        The stripped text of the first choice, or ``""`` when the request
        fails, the server answers with a non-200 status, or the response does
        not have the expected ``choices[0].message.content`` shape. This
        function never raises for transport or payload problems.
    """
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": 0.3,
        "min_p": 0.1,
    }
    try:
        resp = requests.post(LLAMA_URL, json=payload, timeout=180)
        if resp.status_code != 200:
            # Previously a non-200 answer was dropped without any diagnostic.
            print(f" [warn] llama-server returned HTTP {resp.status_code}")
            return ""
        result = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f" āš ļø llama-server error: {e}")
        return ""

    # Walk the payload defensively: an empty "choices" list or a null
    # "content" must yield "" rather than an exception.
    choices = result.get("choices") if isinstance(result, dict) else None
    first = choices[0] if isinstance(choices, list) and choices else None
    message = first.get("message") if isinstance(first, dict) else None
    content = message.get("content") if isinstance(message, dict) else None
    return content.strip() if isinstance(content, str) else ""


def _extract_json_object(response):
    """Parse the span from the first '{' to the last '}' of *response* as JSON.

    Returns the decoded dict, or ``None`` when no such span exists or it is
    not valid JSON.
    """
    start = response.find("{")
    end = response.rfind("}") + 1
    if start < 0 or end <= start:
        return None
    try:
        parsed = json.loads(response[start:end])
    except json.JSONDecodeError as e:
        print(f" [warn] model response was not valid JSON: {e}")
        return None
    return parsed if isinstance(parsed, dict) else None


def generate_5w1h_summary(parent, scene_num, total_scenes=17):
    """Generate 5W1H structured summary using gemma4.

    Args:
        parent: Row with ``child_texts``, ``start_time`` and ``end_time``.
        scene_num: Scene number shown to the model in the prompt.
        total_scenes: Total scene count shown to the model. Defaults to 17,
            the value that used to be hard-coded in the prompt.

    Returns:
        The dict decoded from the model's JSON answer, or ``None`` when the
        parent has no non-empty child text, the model returned nothing, or
        the answer could not be parsed.
    """
    # Drop None/empty entries (e.g. children without text).
    texts = [t for t in (parent["child_texts"] or []) if t]
    if not texts:
        return None

    # Use only first 3 and last 3 dialogue lines for context (much faster)
    sample_texts = texts[:3] + ["..."] + texts[-3:] if len(texts) > 6 else texts
    combined = "\n".join(sample_texts)[:1500]
    duration = parent["end_time"] - parent["start_time"]

    prompt = f"""You are a film scene analyst. Analyze this scene and provide 5W1H analysis.

Scene {scene_num}/{total_scenes} | {duration:.0f}s | {len(texts)} dialogue lines

Key dialogue:
{combined}

Respond with ONLY this JSON:
{{"summary_5lines":"...","who":"...","what":"...","when":"...","where":"...","why":"...","how":"...","characters":[],"tone":[],"key_events":[]}}

IMPORTANT: "summary_5lines" must be EXACTLY 5 lines describing the scene. Each line should be a complete sentence separated by \\n."""

    response = call_gemma4(prompt, max_tokens=2000)
    if not response:
        return None

    # Simple JSON extraction: find first { and last }
    return _extract_json_object(response)


def update_parent_chunk(parent, analysis):
    """Update parent chunk with 5W1H structured data.

    Writes ``summary_text`` (the ``summary_5lines`` value as text) and a
    ``metadata`` JSON document extended with ``auto_generated_by``,
    ``chunk_count`` and ``structured_summary`` for the row ``parent["id"]``.

    Args:
        parent: Row with ``id``, ``metadata`` and ``child_texts``.
        analysis: Dict returned by ``generate_5w1h_summary``; falsy means
            there is nothing to store.

    Returns:
        ``False`` when *analysis* is falsy, ``True`` after the UPDATE has
        been committed.

    Raises:
        psycopg2.Error: if connecting, updating or committing fails. The
            connection is closed in that case as well.
    """
    if not analysis:
        return False

    # Create structured summary text (5 lines); a list of lines from the
    # model is joined with newlines instead of being stored as a list repr.
    structured_text = _as_text(analysis.get("summary_5lines", ""))

    # Update metadata with full 5W1H structure. Work on a copy so the
    # caller's row is not modified as a side effect.
    metadata = dict(parent["metadata"]) if parent["metadata"] else {}
    metadata["auto_generated_by"] = "gemma4"
    metadata["chunk_count"] = len(parent["child_texts"] or [])
    structured = {key: analysis.get(key, "") for key in _TEXT_FIELDS}
    structured.update({key: analysis.get(key, []) for key in _LIST_FIELDS})
    metadata["structured_summary"] = structured

    with closing(psycopg2.connect(**DB_CONFIG)) as conn:
        with conn.cursor() as cur:
            cur.execute(
                """
                UPDATE parent_chunks
                SET summary_text = %s, metadata = %s::jsonb
                WHERE id = %s
                """,
                (
                    structured_text,
                    json.dumps(metadata, ensure_ascii=False),
                    parent["id"],
                ),
            )
        conn.commit()
    return True


def main():
    """Regenerate and store the 5W1H summary of every parent chunk of ``UUID``.

    Prints per-scene progress and a final count of updated parent chunks.
    """
    print(f"šŸŽ¬ Regenerating 5W1H summaries for {UUID}")
    print(f" Using llama.cpp server at {LLAMA_URL}")
    print("=" * 70)

    parents = get_parent_with_children()
    print(f"šŸ“„ Found {len(parents)} parent chunks")

    success_count = 0
    for parent in parents:
        duration = parent["end_time"] - parent["start_time"]
        text_count = len(parent["child_texts"] or [])
        print(
            f"\nšŸŽ¬ Scene {parent['scene_order']}: "
            f"{parent['start_time']:.0f}s-{parent['end_time']:.0f}s "
            f"({duration:.0f}s, {text_count} chunks)"
        )
        if parent["old_summary"]:
            print(f" Old: {parent['old_summary'][:80]}...")

        # Tell the model the real number of scenes rather than a fixed 17.
        analysis = generate_5w1h_summary(
            parent, parent["scene_order"], total_scenes=len(parents)
        )
        if not analysis:
            print(" āŒ Failed to generate analysis")
            continue

        # _as_text keeps the previews from raising when the model returned
        # null or a non-string value for a field.
        summary = _as_text(analysis.get("summary_5lines"), "N/A")
        print(f" āœ… Summary: {summary[:100]}...")
        print(f" šŸ‘¤ Who: {_as_text(analysis.get('who'), 'N/A')[:60]}")
        print(f" šŸ“ Where: {_as_text(analysis.get('where'), 'N/A')[:60]}")
        print(f" šŸ’” Why: {_as_text(analysis.get('why'), 'N/A')[:60]}")

        if update_parent_chunk(parent, analysis):
            success_count += 1

    print(f"\n{'=' * 70}")
    print(
        f"āœ… Updated {success_count}/{len(parents)} parent chunks with 5W1H summaries"
    )


if __name__ == "__main__":
    main()