# momentry_core/scripts/vectorize_4188.py

#!/opt/homebrew/bin/python3.11
"""
Vectorize 4188 sentence chunks via EmbeddingGemma (768D) + rebuild Qdrant collections.
"""
import json, sys, time
from urllib.request import Request, urlopen
import psycopg2
import urllib.request

# File UUID whose sentence chunks are selected from dev.chunks; it is also
# written into every Qdrant point payload under the "uuid" key.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# Connection string handed to psycopg2.connect().
# NOTE(review): host=/tmp presumably selects a Unix-socket directory — confirm.
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
# Base URL of the Qdrant REST API; collections are addressed as
# <QDRANT_URL>/collections/<name>.
QDRANT_URL = "http://localhost:6333"
# Embedding endpoint; requests are JSON POSTs of the form {"input": ...}.
EMBED_URL = "http://localhost:11436/v1/embeddings"
# Collections that this script deletes, recreates (size 768, Cosine distance)
# and repopulates with the same vectors.
COLLECTIONS = ["momentry_dev_v1", "sentence_story", "sentence_summary"]

def call_embed(text):
    """Request an embedding for a single input from the embedding endpoint.

    POSTs ``{"input": text}`` as JSON to ``EMBED_URL`` and returns the
    ``embedding`` field of the first entry in the response's ``data`` list.

    Args:
        text: Value sent as the ``input`` field of the request body.

    Returns:
        The ``data[0]["embedding"]`` value decoded from the JSON response.

    Raises:
        OSError: The request failed or exceeded the 30 second timeout
            (``urllib.error.URLError`` is an ``OSError`` subclass).
        ValueError: The response body is not valid JSON.
        KeyError, IndexError: The response lacks ``data[0]["embedding"]``.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Use the response as a context manager so the socket is released even if
    # reading or decoding the body fails.
    with urlopen(req, timeout=30) as resp:
        decoded = json.loads(resp.read())
    return decoded["data"][0]["embedding"]

print("=== Step 1: Load chunks ===")
# Load every sentence chunk of the target file, ordered by chunk_index.
# Each row is a tuple:
#   (chunk_index, chunk_id, text_content, speaker_name,
#    start_time, end_time, speaker_id)
conn = psycopg2.connect(DB_URL)
try:
    # The cursor context manager closes the cursor when the block exits.
    with conn.cursor() as cur:
        cur.execute("""
    SELECT chunk_index, chunk_id, text_content, metadata->>'speaker_name',
           start_time, end_time, metadata->>'speaker_id'
    FROM dev.chunks
    WHERE file_uuid=%s AND chunk_type='sentence'
    ORDER BY chunk_index
""", (UUID,))
        chunks = cur.fetchall()
finally:
    # Always release the connection, including when the query raises.
    conn.close()
print(f"Loaded {len(chunks)} chunks")

print("\n=== Step 2: Vectorize (EmbeddingGemma 768D) ===")
# Build the text that is actually embedded, in the form: Speaker: "text"
texts_for_embed = []
for r in chunks:
    spk = r[3] or "Unknown"
    txt = r[2] or ""
    # Drop a leading "[...]" tag: keep only what follows the first "]".
    # If there is no "]", split() returns the text unchanged.
    if txt.startswith("["):
        txt = txt.split("]", 1)[-1].strip()
    texts_for_embed.append(f"{spk}: \"{txt}\"")

t0 = time.time()
embeddings = []
batch_size = 50
for start in range(0, len(texts_for_embed), batch_size):
    batch = texts_for_embed[start:start+batch_size]
    # First try to embed the whole batch in one request.
    body = json.dumps({"input": batch}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        with urlopen(req, timeout=60) as http_resp:
            resp = json.loads(http_resp.read())
        batch_embs = [d["embedding"] for d in resp["data"]]
        # A short or long reply would silently shift every later vector onto
        # the wrong chunk, so treat it as a failed batch.
        if len(batch_embs) != len(batch):
            raise ValueError(
                f"expected {len(batch)} embeddings, got {len(batch_embs)}"
            )
    except Exception as exc:
        # Best-effort fallback: embed the items one at a time. Catching
        # Exception (not a bare except) lets Ctrl-C still abort the run.
        print(f"  Batch at {start} failed ({exc}); falling back to single requests")
        batch_embs = [call_embed(t) for t in batch]
    embeddings.extend(batch_embs)

    # Progress line every 10th batch.
    if (start // batch_size) % 10 == 0:
        pct = (start + len(batch)) * 100 // len(texts_for_embed)
        print(f"  {start+len(batch)}/{len(texts_for_embed)} ({pct}%) [{time.time()-t0:.0f}s]")

elapsed = time.time() - t0
# Guard the average against an empty result set (no chunks loaded).
per_item = elapsed / len(embeddings) if embeddings else 0.0
print(f"  Done: {len(embeddings)} embeddings in {elapsed:.1f}s ({per_item:.2f}s each)")

print("\n=== Step 3: Rebuild Qdrant collections ===")

for col in COLLECTIONS:
    # Delete any existing collection. This stays best-effort: a failure is
    # reported and the script moves on to the create request below.
    req = Request(f"{QDRANT_URL}/collections/{col}", method="DELETE")
    try:
        urlopen(req).close()
        time.sleep(0.3)
    except Exception as exc:
        print(f"  Delete {col} skipped: {exc}")

    # Create the collection with 768-dimensional vectors and Cosine distance.
    # Errors here are not caught, so a failed create aborts the script.
    req = Request(f"{QDRANT_URL}/collections/{col}",
        data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
        headers={"Content-Type": "application/json"}, method="PUT")
    urlopen(req).close()
    time.sleep(0.3)
    print(f"  Created {col}")

# Upload
print("\n=== Step 4: Upload points ===")
batch_size = 100

# Every chunk needs exactly one vector; fail early rather than part-way
# through building the points.
if len(embeddings) != len(chunks):
    raise RuntimeError(
        f"embedding count {len(embeddings)} does not match chunk count {len(chunks)}"
    )

# Payload field that carries the chunk text in each collection. A collection
# missing from this table gets no text field.
text_field_by_collection = {
    "momentry_dev_v1": "text",
    "sentence_story": "text",
    "sentence_summary": "summary",
}

for col in COLLECTIONS:
    text_field = text_field_by_collection.get(col)

    points = []
    for r, vector in zip(chunks, embeddings):
        # Row layout: (chunk_index, chunk_id, text_content, speaker_name,
        #              start_time, end_time, speaker_id)
        payload = {
            "chunk_type": "sentence", "uuid": UUID,
            "chunk_id": r[1], "start_time": r[4], "end_time": r[5],
            "speaker_name": r[3] or "Unknown", "speaker_id": r[6] or "Unknown",
        }
        if text_field is not None:
            payload[text_field] = r[2] or ""

        points.append({
            "id": r[0] + 1,  # point id is chunk_index + 1
            "vector": vector,
            "payload": payload,
        })

    failed_batches = 0
    for start in range(0, len(points), batch_size):
        batch = points[start:start+batch_size]
        req = Request(f"{QDRANT_URL}/collections/{col}/points?wait=true",
            data=json.dumps({"points": batch}).encode(),
            headers={"Content-Type": "application/json"}, method="PUT")
        try:
            urlopen(req).close()
        except Exception as e:
            # Keep uploading the remaining batches, but remember the failure
            # so the summary line does not claim a clean run.
            failed_batches += 1
            print(f"  {col} batch {start}: {e}")
        # Progress line every 5th batch.
        if (start // batch_size) % 5 == 0:
            print(f"  {col}: {start+len(batch)}/{len(points)}")

    if failed_batches:
        print(f"  {col}: done with {failed_batches} failed batch(es)")
    else:
        print(f"  {col}: done")

# Verify
print("\n=== Verify ===")
# Print the point count and configured vector size that Qdrant reports for
# each collection.
for col in COLLECTIONS:
    # Close the HTTP response as soon as the body has been read.
    with urlopen(f"{QDRANT_URL}/collections/{col}") as http_resp:
        resp = json.loads(http_resp.read())
    info = resp["result"]
    vector_size = info["config"]["params"]["vectors"].get("size", "?")
    print(f"  {col}: {info['points_count']} pts, {vector_size}D")

print("\n=== Done ===")