#!/opt/homebrew/bin/python3.11 """Vectorize sentence chunks via Ollama mxbai-embed-large and store in DB + Qdrant.""" import json, sys, time import psycopg2 from urllib.request import Request, urlopen DB = "dbname=momentry user=accusys" UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5" OLLAMA = "http://localhost:11434/api/embeddings" QDRANT = "http://localhost:6333" conn = psycopg2.connect(DB) cur = conn.cursor() cur.execute(""" SELECT chunk_id, text_content FROM dev.chunk WHERE file_uuid = %s AND chunk_type = 'sentence' AND (text_content IS NOT NULL AND text_content != '') ORDER BY id """, (UUID,)) rows = cur.fetchall() print(f"Vectorizing {len(rows)} chunks for {UUID}...") stored = 0 batch = [] for chunk_id, text in rows: req = Request(OLLAMA, data=json.dumps({ "model": "nomic-embed-text-v2-moe:latest", "prompt": text }).encode(), headers={"Content-Type": "application/json"}) resp = json.loads(urlopen(req).read()) embedding = resp["embedding"] # Store in PostgreSQL chunk_vectors cur.execute(""" INSERT INTO dev.chunk_vectors (chunk_id, uuid, chunk_type, embedding) VALUES (%s, %s, 'sentence', %s::jsonb) ON CONFLICT (chunk_id, uuid) DO UPDATE SET embedding = EXCLUDED.embedding """, (chunk_id, UUID, json.dumps(embedding))) # Batch for Qdrant batch.append({ "id": int(chunk_id) + 1 if chunk_id.isdigit() else len(batch) + 10000, "vector": embedding, "payload": {"chunk_id": chunk_id, "chunk_type": "sentence"} }) if len(batch) >= 100: req = Request(f"{QDRANT}/collections/momentry_dev_rule1_v2/points?wait=true", data=json.dumps({"points": batch}).encode(), headers={"Content-Type": "application/json"}, method="PUT") urlopen(req) batch = [] stored += 1 if stored % 50 == 0: print(f" {stored}/{len(rows)}") conn.commit() if batch: req = Request(f"{QDRANT}/collections/momentry_dev_rule1_v2/points?wait=true", data=json.dumps({"points": batch}).encode(), headers={"Content-Type": "application/json"}, method="PUT") urlopen(req) conn.commit() cur.close() conn.close() print(f"Done: {stored} vectors stored")