momentry_core/scripts/generate_sentence_summaries.py

#!/opt/homebrew/bin/python3.11
"""
Generate sentence-level summaries using parent story context.
Each sentence gets an LLM summary informed by the parent chunk scene overview.
"""

import json, time, sys, os
from urllib.request import Request, urlopen
import psycopg2

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"

CHECKPOINT = f"/tmp/sentence_summaries_{UUID}.json"

def call_llm(prompt):
    """Request a short completion from the chat-completions endpoint.

    Sends ``prompt`` as a single user message to ``LLM_URL`` with
    temperature 0.1 and a limit of 80 tokens, then returns the text of the
    first choice.

    Args:
        prompt: Full prompt text sent as the user message.

    Returns:
        The reply with surrounding whitespace stripped, or ``""`` when the
        request fails or the response does not have the expected shape.
    """
    body = json.dumps({
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 80,
    }).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response even if parsing fails.
        with urlopen(req, timeout=30) as resp:
            data = json.loads(resp.read())
        return data["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Best-effort by design: the caller falls back to the raw utterance
        # on "". Report the cause instead of discarding it.
        print(f"  LLM call failed: {e!r}", file=sys.stderr)
        return ""

def call_embed(text):
    """Fetch an embedding vector for ``text`` from the embeddings endpoint.

    Posts ``{"input": text}`` to ``EMBED_URL`` and returns the ``embedding``
    field of the first item in the response's ``data`` list.

    Args:
        text: The string to embed.

    Returns:
        The embedding as decoded from the JSON response, or ``None`` when
        the request fails or the response does not have the expected shape.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response even if parsing fails.
        with urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
        return payload["data"][0]["embedding"]
    except Exception as e:
        # Best-effort by design: the caller substitutes a placeholder vector
        # on None. Report the cause instead of discarding it.
        print(f"  Embedding call failed: {e!r}", file=sys.stderr)
        return None

print("=== Step 1: Build sentence→parent mapping ===")
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()

    # Get all story chunks with their child_chunk_ids
    cur.execute("""
        SELECT chunk_index, summary_text, child_chunk_ids
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'story'
        ORDER BY chunk_index
    """, (UUID,))
    stories = cur.fetchall()
    print(f"Loaded {len(stories)} story chunks")

    # Get all sentence chunks, keyed by chunk_index
    cur.execute("""
        SELECT chunk_index, text_content, metadata->>'new_speaker_name' as speaker
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'sentence'
        ORDER BY chunk_index
    """, (UUID,))
    all_sentences = {
        row[0]: {"text": row[1], "speaker": row[2]} for row in cur.fetchall()
    }
    print(f"Loaded {len(all_sentences)} sentence chunks")
finally:
    # Everything needed is in memory now. Close the connection even if a
    # query raised, and do not hold it open during the file I/O below.
    conn.close()

# Build: sentence_index → (parent_summary, sentence_text, speaker)
sentence_map = {}
for _story_idx, summary_text, child_ids in stories:
    if not child_ids:
        continue
    for cid in child_ids:
        # The sentence index is the integer after the last underscore of the
        # child id. A malformed id raises ValueError and aborts the run.
        child_idx = int(cid.rsplit("_", 1)[-1])
        sentence = all_sentences.get(child_idx)
        if sentence is None:
            # The child id does not refer to a loaded sentence chunk.
            continue
        sentence_map[child_idx] = {
            "parent_summary": summary_text or "",
            "sentence_text": sentence["text"] or "",
            "speaker": sentence["speaker"] or "Unknown",
        }

# Load checkpoint if exists: indices recorded as finished by an earlier run
completed = set()
if os.path.exists(CHECKPOINT):
    with open(CHECKPOINT) as f:
        old = json.load(f)
    completed = set(old.get("completed", []))
    print(f"Loaded checkpoint: {len(completed)} already completed")

print("\n=== Step 2: Generate summaries ===")
results = []  # rows generated during THIS run only
errors = 0  # LLM / embedding failures that fell back to a placeholder vector
sorted_indices = sorted(sentence_map.keys())
embed_dim = 768  # length of the all-zero placeholder vector

# Rows persisted by an earlier run. They must be carried forward on every
# checkpoint write; writing only this run's rows would drop them while their
# indices stayed marked as completed.
prior_results = []
if os.path.exists(CHECKPOINT):
    with open(CHECKPOINT) as f:
        prior_results = json.load(f).get("results", [])

# Treat an index as done only if its row is actually stored. Indices marked
# completed without a stored row are regenerated.
done = completed & {r["index"] for r in prior_results}
kept_results = [r for r in prior_results if r["index"] in done]


def _save_checkpoint():
    """Atomically write prior + new rows so an interrupted run can resume."""
    payload = {
        "completed": sorted(done | {r["index"] for r in results}),
        "results": kept_results + results,
    }
    tmp_path = CHECKPOINT + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump(payload, f)
    # Rename over the old file so a crash mid-write cannot truncate it.
    os.replace(tmp_path, CHECKPOINT)


for i, idx in enumerate(sorted_indices):
    if idx in done:
        continue

    info = sentence_map[idx]
    parent_summary = info["parent_summary"]
    sent_text = info["sentence_text"]
    speaker = info["speaker"]

    # Default outcome: keep the raw utterance as the summary and use the
    # placeholder vector. Overridden below when the LLM and embedder succeed.
    summary = sent_text or ""
    embedding = None
    if parent_summary and sent_text:
        prompt = f"Context: {parent_summary}\nUtterance: {sent_text}\n\nIn one short sentence, explain what the speaker communicates with this line within the context above."
        llm_summary = call_llm(prompt)
        if llm_summary:
            summary = llm_summary
            embedding = call_embed(summary)
        if embedding is None:
            # Either the LLM returned nothing or the embedding call failed.
            errors += 1
        time.sleep(0.15)
    if embedding is None:
        embedding = [0.0] * embed_dim

    results.append({
        "index": idx,
        "chunk_id": f"{UUID}_{idx}",
        "speaker_name": speaker,
        "utterance": sent_text,
        "summary": summary,
        "embedding": embedding,
    })

    if (i + 1) % 50 == 0:
        print(f"  [{i+1}/{len(sorted_indices)}] idx={idx} summary_len={len(summary)} errs={errors}")
        _save_checkpoint()

# Persist the tail of the run that did not land on a 50-item boundary.
if results:
    _save_checkpoint()

print(f"Generated {len(results)} summaries, {errors} errors")

# Combine this run's rows with whatever the checkpoint file holds.
# Without a checkpoint file, only this run's rows are used.
all_results = results
if os.path.exists(CHECKPOINT):
    # Read through a context manager so the file handle is closed promptly.
    with open(CHECKPOINT) as f:
        cp = json.load(f)
    all_results = cp.get("results", [])
    # Checkpointed rows win; add only rows whose index is not stored yet.
    existing = {r["index"] for r in all_results}
    all_results.extend(r for r in results if r["index"] not in existing)
    all_results.sort(key=lambda x: x["index"])

print(f"\nTotal summaries: {len(all_results)}")

print("\n=== Step 3: Update Qdrant sentence_summary ===")
collection_url = f"{QDRANT_URL}/collections/sentence_summary"
json_headers = {"Content-Type": "application/json"}

# Delete old collection (best-effort; a failure here is reported, not fatal).
# NOTE(review): this drops the whole collection, not just this UUID's points;
# confirm that is intended if other files share it.
req = Request(collection_url, method="DELETE")
try:
    with urlopen(req):
        pass
    time.sleep(0.5)
except Exception as e:
    # `except Exception` rather than a bare `except`, so Ctrl-C and
    # SystemExit still stop the script.
    print(f"  Could not delete old collection (continuing): {e}")

# Recreate with 768-dimensional cosine vectors; a failure here aborts the run.
req = Request(
    collection_url,
    data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
    headers=json_headers,
    method="PUT",
)
with urlopen(req):
    pass
time.sleep(0.5)

# Upload
batch_size = 100
points = [
    {
        # Point ids are the sentence index shifted by one.
        "id": r["index"] + 1,
        "vector": r["embedding"],
        "payload": {
            "chunk_type": "sentence",
            "uuid": UUID,
            "chunk_id": r["chunk_id"],
            "speaker_name": r["speaker_name"],
            "utterance": r["utterance"],
            "summary": r["summary"],
        },
    }
    for r in all_results
]

uploaded = 0
failed_batches = 0
for start in range(0, len(points), batch_size):
    batch = points[start:start + batch_size]
    req = Request(
        f"{collection_url}/points?wait=true",
        data=json.dumps({"points": batch}).encode(),
        headers=json_headers,
        method="PUT",
    )
    try:
        with urlopen(req):
            pass
        uploaded += len(batch)
    except Exception as e:
        failed_batches += 1
        print(f"  Batch {start}: {e}")
    if (start // batch_size) % 5 == 0:
        print(f"  Uploaded {start + len(batch)}/{len(points)}")

# Report what was actually accepted, not what was attempted.
print(f"Done: {uploaded} points in sentence_summary")
if failed_batches:
    print(f"WARNING: {failed_batches} batch(es) failed; {len(points) - uploaded} points were not uploaded")

# Verify
with urlopen(collection_url) as resp:
    info = json.loads(resp.read())["result"]
print(f"Verified: points={info['points_count']}, dim={info['config']['params']['vectors'].get('size','?')}")