#!/opt/homebrew/bin/python3.11
"""
Regenerate ALL parent chunks for 384b0ff44aaaa1f1 using gemma4.

Groups ASR sentence chunks into ~17 logical scenes (split at the largest
gaps in dialogue), asks the model for a one-sentence summary of each scene,
inserts one ``parent_chunks`` row per scene and links the child chunks to it.
"""

import json
import subprocess
from contextlib import closing

import psycopg2
import psycopg2.extras

DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
UUID = "384b0ff44aaaa1f1"
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL = "gemma4:latest"

# Target ~17 scenes across 6865s = ~400s per scene
# But use natural breaks (gaps in dialogue) to split
SCENE_TARGET_COUNT = 17

# Frame rate stored on a parent chunk when the first child chunk has no fps.
DEFAULT_FPS = 59.94


def get_chunks():
    """Return every 'sentence' chunk for UUID, ordered by start_time.

    Rows are dict-like (RealDictCursor), keyed by the selected column names.
    The connection and cursor are closed even if the query raises.
    """
    with closing(psycopg2.connect(**DB_CONFIG)) as conn, conn.cursor(
        cursor_factory=psycopg2.extras.RealDictCursor
    ) as cur:
        cur.execute(
            """
            SELECT id, chunk_id, start_time, end_time, start_frame, end_frame,
                   text_content, fps
            FROM chunks
            WHERE uuid = %s AND chunk_type = 'sentence'
            ORDER BY start_time
            """,
            (UUID,),
        )
        return cur.fetchall()


def call_gemma4(prompt, max_tokens=300):
    """Send *prompt* to the Ollama generate endpoint and return the reply.

    The request is made by shelling out to ``curl``. This is best-effort:
    any failure (curl missing, timeout, non-zero exit, unparsable JSON) is
    reported on stdout and an empty string is returned so the caller can
    fall back to a locally built summary.
    """
    payload = {
        "model": MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.3, "num_predict": max_tokens},
    }
    try:
        resp = subprocess.run(
            ["curl", "-s", OLLAMA_URL, "-d", json.dumps(payload)],
            capture_output=True,
            text=True,
            timeout=180,
        )
        if resp.returncode == 0:
            result = json.loads(resp.stdout)
            return result.get("response", "").strip()
        # Previously a failed curl invocation was swallowed without a trace.
        print(f" curl exited with status {resp.returncode}")
    except Exception as e:  # deliberate best-effort boundary; see docstring
        print(f" āš ļø Ollama error: {e}")
    return ""


def find_scene_boundaries(chunks, target_count=SCENE_TARGET_COUNT):
    """Split *chunks* into at most *target_count* scenes at the largest gaps.

    The gap before chunk ``i`` is ``chunks[i].start_time`` minus
    ``chunks[i - 1].end_time``. The ``target_count - 1`` largest gaps become
    scene boundaries; chunk order is preserved and every chunk lands in
    exactly one scene. Returns a list of lists of chunks (empty list for
    empty input). A *target_count* below 1 is treated as 1 (single scene).
    """
    if not chunks:
        return []

    # (index of the chunk that follows the gap, gap length)
    gaps = [
        (i, chunks[i]["start_time"] - chunks[i - 1]["end_time"])
        for i in range(1, len(chunks))
    ]

    # Largest gaps first. Clamp the count at zero: a negative slice bound
    # would otherwise select almost every gap instead of none.
    gaps.sort(key=lambda g: g[1], reverse=True)
    split_count = max(target_count - 1, 0)
    split_indices = sorted(index for index, _ in gaps[:split_count])

    scenes = []
    start = 0
    for split in split_indices:
        scenes.append(chunks[start:split])
        start = split
    scenes.append(chunks[start:])
    return scenes


def generate_summary(scene_chunks, scene_num):
    """Return a one-sentence summary for a scene.

    The non-empty ``text_content`` of the scene's chunks is joined, truncated
    to 3000 characters and sent to the model. If the scene has no dialogue a
    placeholder is returned; if the model returns nothing, the summary falls
    back to the first 80 characters of the first three dialogue lines.
    """
    texts = [c["text_content"] for c in scene_chunks if c["text_content"]]
    if not texts:
        return f"Scene {scene_num}: No dialogue"

    combined = " ".join(texts)[:3000]
    duration = scene_chunks[-1]["end_time"] - scene_chunks[0]["start_time"]

    prompt = f"""You are a professional film scene analyst. Given the following dialogue transcript from a movie scene, write a concise one-sentence English summary.

Duration: {duration:.0f} seconds

Dialogue:
{combined}

Provide ONLY the summary sentence, nothing else. Focus on plot events and character actions."""

    summary = call_gemma4(prompt, max_tokens=250)
    if not summary:
        # Fallback: use first few words of dialogue
        summary = f"Scene {scene_num}: {' '.join(texts[:3])[:80]}..."
    return summary


def insert_parent_chunks(scenes):
    """Insert one parent chunk per scene and link its child chunks to it.

    For each scene a summary is generated, a ``parent_chunks`` row is
    inserted, and the scene's ``chunks`` rows get ``parent_chunk_id`` set to
    the new row's id. Work is committed every five scenes and at the end.
    If an error occurs, the connection is closed without committing the
    current batch; batches committed earlier remain. Returns the number of
    parent chunks inserted.

    NOTE(review): nothing here deletes existing parent_chunks rows for UUID,
    so running the script again adds another set -- confirm that cleanup
    happens elsewhere before re-running.
    """
    inserted = 0
    with closing(psycopg2.connect(**DB_CONFIG)) as conn, conn.cursor() as cur:
        for i, scene_chunks in enumerate(scenes):
            first, last = scene_chunks[0], scene_chunks[-1]
            start_time = first["start_time"]
            end_time = last["end_time"]
            start_frame = int(first["start_frame"])
            end_frame = int(last["end_frame"])
            fps = float(first["fps"]) if first["fps"] else DEFAULT_FPS
            chunk_count = len(scene_chunks)

            print(
                f" Scene {i}: {start_time:.0f}s-{end_time:.0f}s ({chunk_count} chunks, {end_time - start_time:.0f}s)"
            )

            # Generate summary
            summary = generate_summary(scene_chunks, i)
            print(f" šŸ“ {summary[:100]}...")

            # Insert parent chunk
            cur.execute(
                """
                INSERT INTO parent_chunks (
                    uuid, scene_order, start_time, end_time,
                    start_frame, end_frame, fps,
                    summary_text, metadata, rule_3_markers, created_at
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, NOW())
                RETURNING id
                """,
                (
                    UUID,
                    i,
                    start_time,
                    end_time,
                    start_frame,
                    end_frame,
                    fps,
                    summary,
                    json.dumps(
                        {"auto_generated_by": "gemma4", "chunk_count": chunk_count}
                    ),
                    json.dumps({}),
                ),
            )
            parent_id = cur.fetchone()[0]

            # Update chunks with parent_chunk_id
            chunk_ids = [c["chunk_id"] for c in scene_chunks]
            cur.execute(
                """
                UPDATE chunks
                SET parent_chunk_id = %s::varchar
                WHERE uuid = %s AND chunk_id = ANY(%s)
                """,
                (str(parent_id), UUID, chunk_ids),
            )
            inserted += 1

            # Commit in batches of five so a late failure keeps earlier work.
            if i % 5 == 4 or i == len(scenes) - 1:
                conn.commit()
                print(f" āœ… Committed scenes 0-{i}")

        conn.commit()
    return inserted


def main():
    """Run the pipeline: fetch chunks, split into scenes, insert, verify."""
    print(f"šŸŽ¬ Regenerating parent chunks for {UUID}")
    print(f" Using model: {MODEL}")
    print("=" * 70)

    # Step 1: Get all chunks
    print("\nšŸ“„ Fetching ASR chunks...")
    chunks = get_chunks()
    print(f" Found {len(chunks)} sentence chunks")
    if chunks:
        print(f" Time range: 0-{chunks[-1]['end_time']:.0f}s")

    # Step 2: Find scene boundaries
    print(f"\nšŸ” Finding {SCENE_TARGET_COUNT} scene boundaries...")
    scenes = find_scene_boundaries(chunks, SCENE_TARGET_COUNT)
    print(f" Created {len(scenes)} scenes")
    for i, s in enumerate(scenes):
        print(
            f" Scene {i}: {s[0]['start_time']:.0f}s-{s[-1]['end_time']:.0f}s ({len(s)} chunks)"
        )

    # Step 3: Generate summaries and insert
    print("\nšŸ¤– Generating summaries with gemma4...")
    inserted = insert_parent_chunks(scenes)
    print(f"\n{'=' * 70}")
    print(f"āœ… Created {inserted} parent chunks")

    # Step 4: Verify
    print("\nšŸ“Š Verification:")
    with closing(psycopg2.connect(**DB_CONFIG)) as conn, conn.cursor() as cur:
        cur.execute("SELECT COUNT(*) FROM parent_chunks WHERE uuid = %s", (UUID,))
        print(f" parent_chunks: {cur.fetchone()[0]}")
        cur.execute(
            "SELECT COUNT(*) FROM chunks WHERE uuid = %s AND parent_chunk_id IS NULL AND chunk_type = 'sentence'",
            (UUID,),
        )
        print(f" orphan chunks: {cur.fetchone()[0]}")


if __name__ == "__main__":
    main()