#!/opt/homebrew/bin/python3.11
"""
Rebuild sentence and story chunk text_content and regenerate story summaries
using the new ASRX speaker assignments, then rebuild the Qdrant
momentry_dev_stories, sentence_story and sentence_summary collections.

Pipeline (everything is scoped to the single file identified by UUID):

  1. Load sentence chunks (and their old/new speaker names) from dev.chunks.
  2. Rewrite each sentence chunk's text_content as "[speaker] text".
  3. Rewrite each story chunk's text_content as the joined dialogue of its
     child sentences, formatted "(speaker) text".
  4. Ask the LLM for a short summary of every story, embed the summary and
     store it in dev.chunks.summary_text.
  5. Drop, recreate and repopulate the Qdrant collection momentry_dev_stories.
  6. Drop and recreate sentence_story and sentence_summary; only
     sentence_story is repopulated (sentence_summary is left empty, see
     the note in main()).
"""

import json
import re
import sys
import time
import urllib.request
from urllib.request import Request, urlopen

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"

LLM_MODEL = "google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
# Vector size used for every Qdrant collection created by this script.
EMBED_DIM = 768
# Minimum distance between a story's dialogue point id and its summary point
# id inside momentry_dev_stories (see build_story_points).
SUMMARY_ID_OFFSET = 228
# Seconds to wait for any single Qdrant HTTP call before giving up.
QDRANT_TIMEOUT = 60

# Matches everything after the first "]" on a line; used to strip the old
# "[Speaker] " prefix from a stored text_content value.
SPEAKER_PREFIX_RE = re.compile(r'\]\s*(.*)')


def call_llm(dialogue_text):
    """Return a ~50-word LLM summary of *dialogue_text*.

    Best-effort: any failure (network, HTTP status, unexpected response
    shape) is printed and an empty string is returned instead of raising.
    """
    prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
    body = json.dumps({
        "model": LLM_MODEL,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 100,
    }).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # Context manager so the HTTP response/socket is always released.
        with urlopen(req, timeout=120) as resp:
            payload = json.loads(resp.read())
        return payload["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f" LLM error: {e}")
        return ""


def call_embed(text):
    """Return the embedding vector for *text* as a list of EMBED_DIM floats.

    Best-effort: on any failure, or if the service returns a vector of the
    wrong length, the error is printed and an all-zero vector is returned.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        with urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
        embedding = payload["data"][0]["embedding"]
        # The collections are created with size=EMBED_DIM; a vector of any
        # other length would make Qdrant reject the batch containing it.
        if len(embedding) != EMBED_DIM:
            raise ValueError(
                f"expected {EMBED_DIM} dimensions, got {len(embedding)}"
            )
        return embedding
    except Exception as e:
        print(f" Embed error: {e}")
        return [0.0] * EMBED_DIM


def extract_raw_text(content, old_text):
    """Return the speaker-less text of a sentence chunk.

    Prefers content["data"]["text"]; when that is missing or empty, falls
    back to *old_text* with everything up to and including the first "]"
    removed (old format: "[Speaker] text"), or *old_text* unchanged when it
    contains no "]". Returns "" when neither source yields text.
    """
    raw = ""
    if content and isinstance(content, dict):
        data = content.get("data")
        # Guard against a null / non-object "data" value.
        if isinstance(data, dict):
            raw = data.get("text", "")
    if not raw and old_text:
        match = SPEAKER_PREFIX_RE.search(old_text)
        raw = match.group(1) if match else old_text
    return raw or ""


def child_index(child_chunk_id):
    """Return the integer after the last "_" of *child_chunk_id*, or None.

    None is returned when that trailing part is not an integer.
    """
    try:
        return int(str(child_chunk_id).rsplit("_", 1)[-1])
    except ValueError:
        return None


def build_story_dialogue(child_ids, sentences):
    """Join the child sentences of one story into a single dialogue string.

    *child_ids* is the story's child_chunk_ids value (may be None);
    *sentences* maps sentence chunk_index -> sentence record. Each sentence
    with text contributes "(speaker) text"; parts are joined with a space.
    Unknown or malformed child ids are skipped.
    """
    parts = []
    for child_id in child_ids or []:
        idx = child_index(child_id)
        if idx is None:
            print(f" Skipping malformed child chunk id: {child_id!r}")
            continue
        sentence = sentences.get(idx)
        if sentence is None:
            continue
        raw = extract_raw_text(sentence["content"], sentence["old_text"])
        if raw:
            parts.append(f'({sentence["new_name"]}) {raw}')
    return " ".join(parts)


def build_story_points(summaries):
    """Build the Qdrant points for momentry_dev_stories.

    Every story yields two points:
      * a dialogue point, id = chunk_index + 1, zero vector, payload with
        the first 500 characters of the dialogue;
      * a summary point, id = chunk_index + 1 + offset, vector = summary
        embedding, payload with the summary text.

    The offset is SUMMARY_ID_OFFSET, enlarged when a chunk_index is big
    enough that a fixed offset would make a summary id collide with a
    dialogue id (Qdrant upserts by id, so a collision silently overwrites).
    Returns all dialogue points followed by all summary points.
    """
    offset = SUMMARY_ID_OFFSET
    if summaries:
        offset = max(offset, max(s["chunk_index"] for s in summaries) + 1)

    dialogue_points = []
    summary_points = []
    for s in summaries:
        idx = s["chunk_index"]
        dialogue_points.append({
            "id": idx + 1,
            "vector": [0.0] * EMBED_DIM,
            "payload": {
                "chunk_id": s["chunk_id"],
                "file_uuid": UUID,
                "start_time": s["start_time"],
                "end_time": s["end_time"],
                "type": "story_dialogue",
                "text": s["dialogue"][:500],
            },
        })
        summary_points.append({
            "id": idx + 1 + offset,
            "vector": s["embedding"],
            "payload": {
                "chunk_id": s["chunk_id"],
                "file_uuid": UUID,
                "start_time": s["start_time"],
                "end_time": s["end_time"],
                "type": "story_summary",
                "summary": s["summary"],
            },
        })
    return dialogue_points + summary_points


def build_sentence_points(sentences):
    """Build the Qdrant points for sentence_story, ordered by chunk_index.

    Each point has id = chunk_index + 1, a zero vector and a payload whose
    "text" is "(speaker) text". start_time/end_time are written as 0
    because sentence timings are not loaded by this script.
    """
    points = []
    for idx in sorted(sentences):
        s = sentences[idx]
        # Same text extraction (including the old_text fallback) as the
        # sentence and story rebuild steps, so all three stay consistent.
        raw_text = extract_raw_text(s["content"], s["old_text"])
        points.append({
            "id": idx + 1,
            "vector": [0.0] * EMBED_DIM,
            "payload": {
                "chunk_id": f"{UUID}_{idx}",
                "file_uuid": UUID,
                "start_time": 0,
                "end_time": 0,
                "text": f'({s["new_name"]}) {raw_text}',
                "speaker_name": s["new_name"],
                "chunk_type": "sentence",
            },
        })
    return points


def qdrant_request(method, path, payload=None):
    """Send one request to Qdrant and discard the response body.

    *payload*, when given, is sent as a JSON body. Raises whatever urlopen
    raises (URLError/HTTPError, timeouts, ...).
    """
    data = None
    headers = {}
    if payload is not None:
        data = json.dumps(payload).encode()
        headers = {"Content-Type": "application/json"}
    req = Request(f"{QDRANT_URL}{path}", data=data, headers=headers, method=method)
    with urlopen(req, timeout=QDRANT_TIMEOUT) as resp:
        resp.read()


def recreate_collection(name, pause):
    """Drop (best-effort) and recreate a cosine-distance Qdrant collection.

    A failed delete is reported and ignored so the script still works when
    the collection does not exist yet; a failed create raises. *pause* is
    the number of seconds slept after each call.
    """
    try:
        qdrant_request("DELETE", f"/collections/{name}")
        time.sleep(pause)
    except Exception as e:
        print(f" Could not delete collection {name} (continuing): {e}")
    qdrant_request(
        "PUT",
        f"/collections/{name}",
        {"vectors": {"size": EMBED_DIM, "distance": "Cosine"}},
    )
    time.sleep(pause)


def upload_points(collection, points, batch_size, report_every,
                  error_label, progress_suffix=""):
    """Upsert *points* into *collection* in batches.

    A failing batch is reported and skipped (best-effort). Progress is
    printed for every *report_every*-th batch that succeeds.
    Returns the number of points uploaded successfully.
    """
    total = len(points)
    uploaded = 0
    for start in range(0, total, batch_size):
        batch = points[start:start + batch_size]
        try:
            qdrant_request(
                "PUT",
                f"/collections/{collection}/points?wait=true",
                {"points": batch},
            )
        except Exception as e:
            print(f" {error_label} {start}: {e}")
            continue
        uploaded += len(batch)
        if (start // batch_size) % report_every == 0:
            print(f" Uploaded {start + len(batch)}/{total}{progress_suffix}")
    return uploaded


def load_sentences(cur):
    """Load the sentence chunks of UUID.

    Returns (rows, sentences): the raw rows ordered by chunk_index and a
    dict chunk_index -> {"old_text", "new_name", "old_name", "content"}.
    The new speaker name falls back to the old one, then to "Unknown".
    """
    cur.execute("""
        SELECT chunk_index, text_content,
               metadata->>'new_speaker_name',
               metadata->>'speaker_name',
               content
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'sentence'
        ORDER BY chunk_index
    """, (UUID,))
    rows = cur.fetchall()
    sentences = {}
    for idx, old_text, new_name, old_name, content in rows:
        sentences[idx] = {
            "old_text": old_text or "",
            "new_name": new_name or old_name or "Unknown",
            "old_name": old_name or "Unknown",
            "content": content or {},
        }
    return rows, sentences


def rebuild_sentence_texts(cur, sentence_rows):
    """Rewrite every sentence chunk's text_content as "[speaker] text".

    Returns the number of rows processed. Does not commit.
    """
    updated = 0
    for idx, old_text, new_name, old_name, content in sentence_rows:
        speaker = new_name or old_name or "Unknown"
        raw_text = extract_raw_text(content, old_text)
        cur.execute("""
            UPDATE dev.chunks
            SET text_content = %s, updated_at = NOW()
            WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
        """, (f"[{speaker}] {raw_text}", UUID, idx))
        updated += 1
    return updated


def load_story_dialogues(cur, sentences):
    """Load the story chunks of UUID and build their dialogue text.

    Returns a list of (db_id, chunk_id, chunk_index, start_time, end_time,
    dialogue_text) tuples ordered by chunk_index.
    """
    cur.execute("""
        SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'story'
        ORDER BY chunk_index
    """, (UUID,))
    story_rows = cur.fetchall()
    print(f"Loaded {len(story_rows)} story chunks")
    return [
        (db_id, cid, idx, st, et, build_story_dialogue(child_ids, sentences))
        for db_id, cid, idx, child_ids, st, et in story_rows
    ]


def generate_summaries(cur, stories):
    """Summarise and embed every story and store the summary in the DB.

    Stories with fewer than 10 characters of dialogue get the placeholder
    summary "[no dialogue]" and a zero vector without calling any service.
    call_llm / call_embed never raise; on failure they yield "" and a zero
    vector respectively, which is what gets stored. Does not commit.
    Returns one dict per story with the data needed to build Qdrant points.
    """
    summaries = []
    total = len(stories)
    for pos, (db_id, cid, idx, st, et, dialogue_text) in enumerate(stories, start=1):
        if len(dialogue_text) < 10:
            summary = "[no dialogue]"
            embedding = [0.0] * EMBED_DIM
        else:
            print(f" [{pos}/{total}] {cid}: {len(dialogue_text)} chars", end="")
            # Only the first 3000 characters are sent to the LLM.
            summary = call_llm(dialogue_text[:3000])
            print(f" -> {len(summary)} chars")
            time.sleep(0.3)
            embedding = call_embed(summary)

        # Parameterized: the summary is free-form LLM output and must never
        # be spliced into the SQL text.
        cur.execute("""
            UPDATE dev.chunks
            SET summary_text = %s, updated_at = NOW()
            WHERE id = %s
        """, (summary, db_id))

        summaries.append({
            "db_id": db_id,
            "chunk_id": cid,
            "chunk_index": idx,
            "start_time": st,
            "end_time": et,
            "dialogue": dialogue_text,
            "summary": summary,
            "embedding": embedding,
        })
    return summaries


def main():
    """Run the full rebuild pipeline described in the module docstring."""
    # Imported here rather than at module level so the pure helpers above
    # can be imported (e.g. by tests) on machines without the PostgreSQL
    # driver; running the script behaves exactly as with a top-level import.
    import psycopg2

    print("=== Step 1: Load sentence chunks with new speaker info ===")
    conn = psycopg2.connect(DB_URL)
    try:
        with conn.cursor() as cur:
            sentence_rows, sentences = load_sentences(cur)
            print(f"Loaded {len(sentence_rows)} sentence chunks")

            print("\n=== Step 2: Rebuild sentence text_content ===")
            updated_sentences = rebuild_sentence_texts(cur, sentence_rows)
            conn.commit()
            print(f"Updated {updated_sentences} sentence chunks text_content")

            print("\n=== Step 3: Rebuild story chunk text_content ===")
            stories = load_story_dialogues(cur, sentences)
            print(f"Built {len(stories)} story dialogue texts")
            # Dialogue only at this point; summaries follow in step 4.
            for db_id, _cid, _idx, _st, _et, dialogue_text in stories:
                cur.execute("""
                    UPDATE dev.chunks
                    SET text_content = %s, updated_at = NOW()
                    WHERE id = %s
                """, (dialogue_text, db_id))
            conn.commit()
            print("Updated story chunk dialogue texts")

            print(f"\n=== Step 4: Generate LLM summaries (all {len(stories)} stories) ===")
            summaries = generate_summaries(cur, stories)
            conn.commit()
            print(f"\nGenerated {len(summaries)} summaries")
    finally:
        # Always release the connection; closing discards anything that
        # was not committed.
        conn.close()

    print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
    recreate_collection("momentry_dev_stories", 0.3)
    all_story_points = build_story_points(summaries)
    uploaded = upload_points(
        "momentry_dev_stories", all_story_points,
        batch_size=100, report_every=3, error_label="Batch",
    )
    print(f"Uploaded {uploaded} points to momentry_dev_stories")
    if uploaded != len(all_story_points):
        print(f" WARNING: {len(all_story_points) - uploaded} points failed to upload")

    print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
    # Per-sentence collections: sentence_story holds the dialogue template
    # payloads, sentence_summary is meant to hold per-sentence LLM summaries.
    for col_name in ["sentence_story", "sentence_summary"]:
        recreate_collection(col_name, 0.2)

    story_sentence_points = build_sentence_points(sentences)
    uploaded = upload_points(
        "sentence_story", story_sentence_points,
        batch_size=200, report_every=5,
        error_label="sentence_story batch", progress_suffix=" sentence_story",
    )
    print("Uploaded sentence_story points")
    if uploaded != len(story_sentence_points):
        print(f" WARNING: {len(story_sentence_points) - uploaded} points failed to upload")

    # NOTE: sentence_summary has been dropped and recreated above but is
    # left EMPTY. TODO: populate it once per-sentence summaries are generated.
    print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")

    print("\n=== Done ===")


if __name__ == "__main__":
    sys.exit(main())