# momentry_core/scripts/rebuild_story_content.py

#!/opt/homebrew/bin/python3.11
"""
Rebuild sentence and story chunk text_content and regenerate story summaries using the new ASRX speaker assignments.
Then rebuild the Qdrant momentry_dev_stories and sentence_story collections; sentence_summary is recreated empty (per-sentence summaries are not generated yet).
"""

import json, sys, time, urllib.request
from urllib.request import Request, urlopen
import psycopg2

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"

def call_llm(dialogue_text):
    """Ask the chat-completions endpoint at LLM_URL for a 50-word summary.

    Args:
        dialogue_text: Dialogue transcript, inserted verbatim into the prompt.

    Returns:
        The stripped ``content`` of the first choice's message, or ``""`` when
        the request fails or the response cannot be parsed. The error is
        printed rather than raised, so callers must treat ``""`` as a failure.
    """
    prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
    body = json.dumps({
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 100,
    }).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response even when reading or
        # JSON decoding fails, so connections are not leaked across calls.
        with urlopen(req, timeout=120) as resp:
            payload = json.loads(resp.read())
        return payload["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Deliberate best effort: one failed summary must not abort the run.
        print(f"    LLM error: {e}")
        return ""

def call_embed(text):
    """Fetch an embedding vector for *text* from the endpoint at EMBED_URL.

    Args:
        text: The string to embed; sent as the ``input`` field of the request.

    Returns:
        The ``embedding`` list of the first ``data`` entry in the response, or
        a 768-element zero vector when the request fails or the response
        cannot be parsed. The error is printed rather than raised.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response even when reading or
        # JSON decoding fails, so connections are not leaked across calls.
        with urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
        return payload["data"][0]["embedding"]
    except Exception as e:
        # Deliberate best effort: fall back to a zero vector of the size the
        # Qdrant collections in this script are created with (768).
        print(f"    Embed error: {e}")
        return [0.0] * 768

print("=== Step 1: Load sentence chunks with new speaker info ===")
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()

# Fetch every sentence chunk of the target file, ordered by chunk_index.
# Columns: chunk_index, text_content, metadata.new_speaker_name,
# metadata.speaker_name, content.
cur.execute("""
    SELECT chunk_index, text_content, metadata->>'new_speaker_name',
           metadata->>'speaker_name', content
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'sentence'
    ORDER BY chunk_index
""", (UUID,))
sentence_rows = cur.fetchall()
print(f"Loaded {len(sentence_rows)} sentence chunks")

# Lookup keyed by chunk_index. The new speaker name wins over the old one,
# and "Unknown" is used when neither is set; NULL columns become ""/{}.
sentences = {
    chunk_idx: {
        "old_text": prev_text or "",
        "new_name": asrx_name or prev_name or "Unknown",
        "old_name": prev_name or "Unknown",
        "content": payload or {},
    }
    for chunk_idx, prev_text, asrx_name, prev_name, payload in sentence_rows
}

# Rebuild sentence text_content with new speaker names
print("\n=== Step 2: Rebuild sentence text_content ===")
import re  # imported once here instead of on every loop iteration

# Matches only a *leading* "[Speaker]" tag and captures everything after it.
# Anchored (used with .match) so a "]" later in the sentence does not truncate
# the text, and DOTALL so multi-line text is not cut at the first newline.
_SPEAKER_TAG_RE = re.compile(r"\s*\[[^\]]*\]\s*(.*)", re.DOTALL)

updated_sentences = 0
for r in sentence_rows:
    idx, old_text, new_name, old_name, content = r
    new_name = new_name or old_name or "Unknown"

    # Prefer the untagged text stored under content["data"]["text"].
    # A missing/null "data" or "text" yields "" instead of raising or
    # leaking the literal string "None" into text_content.
    raw_text = ""
    if content and isinstance(content, dict):
        data = content.get("data")
        if isinstance(data, dict):
            raw_text = data.get("text") or ""
    if not raw_text and old_text:
        # Fall back to the previous text_content, which uses the
        # "[Speaker] text" format; strip the leading tag if present.
        m = _SPEAKER_TAG_RE.match(old_text)
        raw_text = m.group(1) if m else old_text

    new_text = f"[{new_name}] {raw_text}"

    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
    """, (new_text, UUID, idx))
    # Counts UPDATE statements issued, one per loaded sentence row.
    updated_sentences += 1

conn.commit()
print(f"Updated {updated_sentences} sentence chunks text_content")

print("\n=== Step 3: Rebuild story chunk text_content ===")
import re  # imported once here instead of inside the nested loop

# Matches only a *leading* "[Speaker]" tag and captures everything after it.
# Anchored (used with .match) so a "]" later in the sentence does not truncate
# the text, and DOTALL so multi-line text is not cut at the first newline.
_SPEAKER_TAG_RE = re.compile(r"\s*\[[^\]]*\]\s*(.*)", re.DOTALL)

cur.execute("""
    SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time,
           text_content, summary_text
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'story'
    ORDER BY chunk_index
""", (UUID,))
story_rows = cur.fetchall()
print(f"Loaded {len(story_rows)} story chunks")

# Build child text per story chunk.
# Each entry: (db_id, chunk_id, chunk_index, start_time, end_time,
#              dialogue_text, old_summary).
story_dialogue_texts = []
for r in story_rows:
    db_id, cid, idx, child_ids, st, et, old_text, old_summary = r

    dialogue_parts = []
    for child_cid in (child_ids or []):
        # The integer after the last "_" of a child id is used as the
        # sentence chunk_index; children not loaded in Step 1 are skipped.
        child_idx = int(child_cid.rsplit("_", 1)[-1])
        s = sentences.get(child_idx)
        if s is None:
            continue

        # Prefer content["data"]["text"]; tolerate a missing/null value.
        raw = ""
        if s["content"] and isinstance(s["content"], dict):
            data = s["content"].get("data")
            if isinstance(data, dict):
                raw = data.get("text") or ""
        if not raw:
            # Fall back to the sentence's previous "[Speaker] text" value.
            m = _SPEAKER_TAG_RE.match(s["old_text"])
            raw = m.group(1) if m else s["old_text"]
        if raw:
            dialogue_parts.append(f'({s["new_name"]}) {raw}')

    dialogue_text = " ".join(dialogue_parts)
    story_dialogue_texts.append((db_id, cid, idx, st, et, dialogue_text, old_summary))

print(f"Built {len(story_dialogue_texts)} story dialogue texts")

# Update DB with new text_content (dialogue only, not summary yet)
for item in story_dialogue_texts:
    db_id, cid, idx, st, et, dialogue_text, old_summary = item
    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE id = %s
    """, (dialogue_text, db_id))

conn.commit()
print("Updated story chunk dialogue texts")

# Report the real number of stories rather than a hard-coded count.
print(f"\n=== Step 4: Generate LLM summaries (all {len(story_dialogue_texts)} stories) ===")
# One dict per story: db_id, chunk_id, chunk_index, start_time, end_time,
# dialogue, summary, embedding.
summaries = []
for i, item in enumerate(story_dialogue_texts):
    db_id, cid, idx, st, et, dialogue_text, old_summary = item

    if len(dialogue_text) < 10:
        # Too little dialogue to summarise; store a marker and a zero vector.
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        print(f"  [{i+1}/{len(story_dialogue_texts)}] {cid}: {len(dialogue_text)} chars", end="")
        try:
            # Only the first 3000 characters of dialogue are sent to the LLM.
            summary = call_llm(dialogue_text[:3000])
            print(f" -> {len(summary)} chars")
            if summary:
                time.sleep(0.3)
                embedding = call_embed(summary)
            else:
                # call_llm reports its own failures by returning "", so the
                # except branch below never sees them; record the failure the
                # same way and skip the pointless embedding of an empty string.
                summary = "[error]"
                embedding = [0.0] * 768
        except Exception as e:
            print(f" ERROR: {e}")
            summary = "[error]"
            embedding = [0.0] * 768

    # Update DB. Parameters are bound by the driver instead of being spliced
    # into the SQL text, so quotes/backslashes in LLM output cannot break or
    # alter the statement.
    cur.execute("""
        UPDATE dev.chunks
        SET summary_text = %s, updated_at = NOW()
        WHERE id = %s
    """, (summary, db_id))

    summaries.append({
        "db_id": db_id,
        "chunk_id": cid,
        "chunk_index": idx,
        "start_time": st,
        "end_time": et,
        "dialogue": dialogue_text,
        "summary": summary,
        "embedding": embedding,
    })

conn.commit()
print(f"\nGenerated {len(summaries)} summaries")

print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
# Delete existing collection (best effort: failure is reported, not fatal).
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories", method="DELETE")
try:
    urlopen(req).close()
    time.sleep(0.3)
except Exception as e:
    print(f"  Delete momentry_dev_stories skipped: {e}")

# Recreate with 768-dimensional cosine vectors; a failure here aborts the run.
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories",
    data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
    headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req).close()
time.sleep(0.3)

# Point IDs: a dialogue point uses chunk_index + 1; the matching summary point
# uses the same value shifted past the largest dialogue ID. Deriving the shift
# from the data keeps the two ID ranges disjoint for any number of stories
# (for chunk_index 0..227 it is 228, i.e. the previously hard-coded value).
summary_id_offset = max((s["chunk_index"] for s in summaries), default=-1) + 1

dialogue_points = []
summary_points = []
for s in summaries:
    idx = s["chunk_index"]
    dialogue_points.append({
        "id": idx + 1,
        # Dialogue points carry no real embedding, only a payload.
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_dialogue",
            "text": s["dialogue"][:500],
        }
    })
    summary_points.append({
        "id": idx + 1 + summary_id_offset,
        "vector": s["embedding"],
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_summary",
            "summary": s["summary"],
        }
    })

all_story_points = dialogue_points + summary_points

batch_size = 100
uploaded_story_points = 0  # points in batches that were accepted
for start in range(0, len(all_story_points), batch_size):
    batch = all_story_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories/points?wait=true",
        data=json.dumps({"points": batch}).encode(),
        headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req).close()
        uploaded_story_points += len(batch)
    except Exception as e:
        # Best effort: report the failed batch and keep going.
        print(f"  Batch {start}: {e}")
    if (start // batch_size) % 3 == 0:
        print(f"  Uploaded {start + len(batch)}/{len(all_story_points)}")

# Report only the points whose batch upload succeeded.
print(f"Uploaded {uploaded_story_points} points to momentry_dev_stories")

print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
import re  # needed for the text_content fallback below

# Matches only a *leading* "[Speaker]" tag and captures everything after it.
_SPEAKER_TAG_RE = re.compile(r"\s*\[[^\]]*\]\s*(.*)", re.DOTALL)

# These are the per-sentence template + summary collections (768D each):
# sentence_story holds template payloads, sentence_summary is meant to hold
# LLM summary payloads.
# NOTE(review): both collections are dropped and recreated here, but only
# sentence_story is repopulated below, so sentence_summary ends up empty.
# Confirm that wiping it is intended.
for col_name in ["sentence_story", "sentence_summary"]:
    req = Request(f"{QDRANT_URL}/collections/{col_name}", method="DELETE")
    try:
        urlopen(req).close()
        time.sleep(0.2)
    except Exception as e:
        # Best effort: report and continue.
        print(f"  Delete {col_name} skipped: {e}")

    req = Request(f"{QDRANT_URL}/collections/{col_name}",
        data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
        headers={"Content-Type": "application/json"}, method="PUT")
    urlopen(req).close()
    time.sleep(0.2)

# Build points for sentence_story and sentence_summary
story_sentence_points = []
summary_sentence_points = []  # placeholder; never populated by this script
for idx in sorted(sentences.keys()):
    s = sentences[idx]

    # Prefer content["data"]["text"]; tolerate a missing/null value.
    raw_text = ""
    if s["content"] and isinstance(s["content"], dict):
        data = s["content"].get("data")
        if isinstance(data, dict):
            raw_text = data.get("text") or ""
    if not raw_text and s["old_text"]:
        # Same fallback the story dialogue uses: take the previous
        # "[Speaker] text" value and strip its leading tag.
        m = _SPEAKER_TAG_RE.match(s["old_text"])
        raw_text = m.group(1) if m else s["old_text"]

    dialog_line = f'({s["new_name"]}) {raw_text}'

    story_sentence_points.append({
        "id": idx + 1,
        # Template points carry no real embedding, only a payload.
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": f"{UUID}_{idx}",
            "file_uuid": UUID,
            # NOTE(review): sentence timings are not loaded by the Step 1
            # query, so these are placeholder zeros.
            "start_time": 0,
            "end_time": 0,
            "text": dialog_line,
            "speaker_name": s["new_name"],
            "chunk_type": "sentence",
        }
    })

# Upload sentence_story (dialogue template)
batch_size = 200
total_sentence_points = len(story_sentence_points)
for start in range(0, total_sentence_points, batch_size):
    batch = story_sentence_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
        data=json.dumps({"points": batch}).encode(),
        headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req).close()
    except Exception as e:
        # Best effort: report the failed batch and keep going.
        print(f"  sentence_story batch {start}: {e}")
    if (start // batch_size) % 5 == 0:
        # Show progress against the real point count, not a fixed total.
        print(f"  Uploaded {start + len(batch)}/{total_sentence_points} sentence_story")

print("Uploaded sentence_story points")

# sentence_summary will be populated when we generate per-sentence summaries
# For now, mark as TODO
print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")

cur.close()
conn.close()
print("\n=== Done ===")