#!/opt/homebrew/bin/python3.11
"""Vectorize sentence chunks via EmbeddingGemma (768D) + rebuild Qdrant collections.

Pipeline (originally written for a file with 4188 sentence chunks):

1. Load the ``sentence`` chunks of one file (``UUID``) from ``dev.chunks``.
2. Embed each chunk as ``Speaker: "text"`` through the embedding endpoint.
3. Delete and recreate every collection in ``COLLECTIONS``.
4. Upload one point per chunk to every collection, then print a summary.

Step 3 is destructive, so all embeddings are produced and validated first;
the script aborts before touching Qdrant when there is nothing valid to
upload.
"""
import contextlib
import http.client
import json
import sys
import time
import urllib.request
from urllib.request import Request, urlopen

import psycopg2

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
EMBED_URL = "http://localhost:11436/v1/embeddings"
COLLECTIONS = ["momentry_dev_v1", "sentence_story", "sentence_summary"]

# Vector size the collections are created with; embeddings must match it.
VECTOR_SIZE = 768
EMBED_BATCH_SIZE = 50
UPLOAD_BATCH_SIZE = 100
# Seconds to wait for a Qdrant response before giving up instead of hanging.
QDRANT_TIMEOUT = 60

JSON_HEADERS = {"Content-Type": "application/json"}

# Payload key that carries the chunk text in each collection.
TEXT_FIELD_BY_COLLECTION = {
    "momentry_dev_v1": "text",
    "sentence_story": "text",
    "sentence_summary": "summary",
}

# Failures of the HTTP exchange itself (connection, timeout, HTTP status).
_TRANSPORT_ERRORS = (OSError, http.client.HTTPException)
# Transport failures plus a malformed / unexpected JSON response body.
_EMBED_ERRORS = _TRANSPORT_ERRORS + (ValueError, KeyError, TypeError, IndexError)

CHUNK_QUERY = """
    SELECT chunk_index, chunk_id, text_content, metadata->>'speaker_name',
           start_time, end_time, metadata->>'speaker_id'
    FROM dev.chunks
    WHERE file_uuid=%s AND chunk_type='sentence'
    ORDER BY chunk_index
"""


def _request(url, payload=None, *, method=None, timeout=QDRANT_TIMEOUT):
    """Send one HTTP request and return the raw response body.

    ``payload`` (if given) is sent as a JSON body. The response is always
    closed, even when reading it fails.
    """
    data = None if payload is None else json.dumps(payload).encode()
    headers = JSON_HEADERS if data is not None else {}
    req = Request(url, data=data, headers=headers, method=method)
    with urlopen(req, timeout=timeout) as resp:
        return resp.read()


def call_embed(text):
    """Return the embedding vector for a single text."""
    raw = _request(EMBED_URL, {"input": text}, timeout=30)
    return json.loads(raw)["data"][0]["embedding"]


def _embed_batch(batch):
    """Embed a list of texts in one request, falling back to one-by-one.

    The fallback is also taken when the server answers with a different
    number of vectors than texts, because a short answer would silently
    shift every following vector onto the wrong chunk.
    """
    try:
        items = json.loads(_request(EMBED_URL, {"input": batch}, timeout=60))["data"]
        vectors = [item["embedding"] for item in items]
        if len(vectors) != len(batch):
            raise ValueError(f"expected {len(batch)} embeddings, got {len(vectors)}")
        return vectors
    except _EMBED_ERRORS as exc:
        print(f" batch embed failed ({exc!r}); falling back to single requests")
        return [call_embed(text) for text in batch]


def _embedding_text(speaker_name, text_content):
    """Build the ``Speaker: "text"`` string that gets embedded."""
    speaker = speaker_name or "Unknown"
    text = text_content or ""
    # Remove a leading "[Speaker]" prefix if present.
    if text.startswith("["):
        text = text.split("]", 1)[-1].strip()
    return f"{speaker}: \"{text}\""


def _load_chunks():
    """Fetch the sentence chunk rows for ``UUID``, ordered by chunk index."""
    # closing() guarantees the connection is released even if the query fails.
    with contextlib.closing(psycopg2.connect(DB_URL)) as conn:
        with conn.cursor() as cur:
            cur.execute(CHUNK_QUERY, (UUID,))
            return cur.fetchall()


def _vectorize(texts):
    """Embed all texts in batches, printing progress every tenth batch."""
    started = time.time()
    total = len(texts)
    embeddings = []
    for start in range(0, total, EMBED_BATCH_SIZE):
        batch = texts[start:start + EMBED_BATCH_SIZE]
        embeddings.extend(_embed_batch(batch))
        if (start // EMBED_BATCH_SIZE) % 10 == 0:
            done = start + len(batch)
            pct = done * 100 // total
            print(f" {done}/{total} ({pct}%) [{time.time()-started:.0f}s]")
    elapsed = time.time() - started
    # Guard the average so an empty input cannot raise ZeroDivisionError.
    per_item = elapsed / len(embeddings) if embeddings else 0.0
    print(f" Done: {len(embeddings)} embeddings in {elapsed:.1f}s ({per_item:.2f}s each)")
    return embeddings


def _check_embeddings(chunks, embeddings):
    """Abort before the destructive rebuild if the vectors are unusable."""
    if len(embeddings) != len(chunks):
        sys.exit(f"Got {len(embeddings)} embeddings for {len(chunks)} chunks; aborting")
    wrong = [i for i, vec in enumerate(embeddings) if len(vec) != VECTOR_SIZE]
    if wrong:
        sys.exit(
            f"{len(wrong)} embeddings are not {VECTOR_SIZE}D "
            f"(first at position {wrong[0]}); aborting"
        )


def _rebuild_collections():
    """Drop and recreate every collection with ``VECTOR_SIZE`` cosine vectors."""
    for col in COLLECTIONS:
        url = f"{QDRANT_URL}/collections/{col}"
        # Deleting is best-effort: the collection may simply not exist yet.
        try:
            _request(url, method="DELETE")
            time.sleep(0.3)
        except _TRANSPORT_ERRORS as exc:
            print(f" Delete {col} skipped: {exc}")
        _request(
            url,
            {"vectors": {"size": VECTOR_SIZE, "distance": "Cosine"}},
            method="PUT",
        )
        time.sleep(0.3)
        print(f" Created {col}")


def _build_points(col, chunks, embeddings):
    """Build the Qdrant points for one collection, one per chunk row."""
    text_field = TEXT_FIELD_BY_COLLECTION.get(col)
    points = []
    for row, vector in zip(chunks, embeddings):
        idx, cid, txt, spk_name, st, et, spk_id = row
        payload = {
            "chunk_type": "sentence",
            "uuid": UUID,
            "chunk_id": cid,
            "start_time": st,
            "end_time": et,
            "speaker_name": spk_name or "Unknown",
            "speaker_id": spk_id or "Unknown",
        }
        if text_field:
            payload[text_field] = txt or ""
        # Point ids are the chunk index shifted by one.
        points.append({"id": idx + 1, "vector": vector, "payload": payload})
    return points


def _upload_points(col, points):
    """Upload points in batches; return the number of batches that failed.

    A failed batch is reported and skipped so the remaining batches are
    still attempted.
    """
    failed = 0
    url = f"{QDRANT_URL}/collections/{col}/points?wait=true"
    for start in range(0, len(points), UPLOAD_BATCH_SIZE):
        batch = points[start:start + UPLOAD_BATCH_SIZE]
        try:
            _request(url, {"points": batch}, method="PUT")
        except _TRANSPORT_ERRORS as exc:
            failed += 1
            print(f" {col} batch {start}: {exc}")
        if (start // UPLOAD_BATCH_SIZE) % 5 == 0:
            print(f" {col}: {start+len(batch)}/{len(points)}")
    print(f" {col}: done")
    return failed


def _verify_collections():
    """Print the point count and vector size Qdrant reports per collection."""
    for col in COLLECTIONS:
        info = json.loads(_request(f"{QDRANT_URL}/collections/{col}"))["result"]
        print(f" {col}: {info['points_count']} pts, {info['config']['params']['vectors'].get('size','?')}D")


def main():
    """Run the load -> embed -> rebuild -> upload -> verify pipeline."""
    print("=== Step 1: Load chunks ===")
    chunks = _load_chunks()
    print(f"Loaded {len(chunks)} chunks")
    if not chunks:
        # Nothing to upload: do not wipe the existing collections.
        sys.exit(f"No sentence chunks found for {UUID}; Qdrant left untouched")

    print("\n=== Step 2: Vectorize (EmbeddingGemma 768D) ===")
    texts_for_embed = [_embedding_text(row[3], row[2]) for row in chunks]
    embeddings = _vectorize(texts_for_embed)
    _check_embeddings(chunks, embeddings)

    print("\n=== Step 3: Rebuild Qdrant collections ===")
    _rebuild_collections()

    print("\n=== Step 4: Upload points ===")
    failed_batches = 0
    for col in COLLECTIONS:
        failed_batches += _upload_points(col, _build_points(col, chunks, embeddings))

    print("\n=== Verify ===")
    _verify_collections()

    if failed_batches:
        # Surface partial uploads to callers/automation via the exit status.
        sys.exit(f"{failed_batches} upload batch(es) failed; collections are incomplete")
    print("\n=== Done ===")


if __name__ == "__main__":
    main()