momentry_core/scripts/vectorize_chunks.py

#!/opt/homebrew/bin/python3.11
"""Vectorize sentence chunks via Ollama (nomic-embed-text-v2-moe) and store in DB + Qdrant."""
import json
import sys
import time
import uuid
from urllib.request import Request, urlopen

import psycopg2

DB = "dbname=momentry user=accusys"
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
OLLAMA = "http://localhost:11434/api/embeddings"
QDRANT = "http://localhost:6333"

# Embedding model requested from Ollama and the Qdrant collection written to.
EMBED_MODEL = "nomic-embed-text-v2-moe:latest"
COLLECTION = "momentry_dev_rule1_v2"

QDRANT_BATCH_SIZE = 100  # points sent per Qdrant upsert request
COMMIT_EVERY = 50        # PostgreSQL commit / progress-report interval (rows)
HTTP_TIMEOUT = 300       # seconds; generous so a slow response is not cut off


def post_json(url, payload, method="POST"):
    """Send *payload* as a JSON body to *url* and return the raw response bytes.

    Raises whatever ``urlopen`` raises (``HTTPError`` for 4xx/5xx answers,
    ``URLError`` / ``TimeoutError`` on connection problems).
    """
    req = Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method=method,
    )
    # The context manager closes the HTTP response; the timeout keeps the
    # script from hanging forever on an unresponsive service.
    with urlopen(req, timeout=HTTP_TIMEOUT) as resp:
        return resp.read()


def qdrant_point_id(chunk_id, file_uuid):
    """Return a stable Qdrant point ID for *chunk_id* of file *file_uuid*.

    Purely numeric chunk ids keep the historical mapping ``int(chunk_id) + 1``.
    Any other chunk id is mapped to a deterministic UUIDv5 string built from
    the file UUID and the chunk id.

    The previous fallback (``len(batch) + 10000``) restarted with every batch,
    so points from different batches received the same IDs and overwrote each
    other in Qdrant.
    """
    key = str(chunk_id)
    # isdecimal() matches exactly the digit characters int() can parse;
    # isdigit() also accepts characters such as superscripts that int() rejects.
    if key.isdecimal():
        return int(key) + 1
    return str(uuid.uuid5(uuid.NAMESPACE_URL, f"{file_uuid}:{key}"))


def embed_text(text):
    """Return the embedding vector Ollama produces for *text*.

    Raises ``RuntimeError`` if the response carries no ``"embedding"`` field.
    """
    resp = json.loads(post_json(OLLAMA, {"model": EMBED_MODEL, "prompt": text}))
    try:
        return resp["embedding"]
    except KeyError:
        raise RuntimeError(f"Ollama returned no embedding: {resp!r}") from None


def upsert_points(points):
    """Upsert *points* into the Qdrant collection, waiting for completion."""
    post_json(
        f"{QDRANT}/collections/{COLLECTION}/points?wait=true",
        {"points": points},
        method="PUT",
    )


def main():
    """Embed every non-empty sentence chunk of file ``UUID`` and store it.

    Each embedding is upserted into ``dev.chunk_vectors`` (PostgreSQL) and
    queued for Qdrant. Qdrant is flushed every ``QDRANT_BATCH_SIZE`` points,
    PostgreSQL is committed every ``COMMIT_EVERY`` rows and once at the end.
    """
    conn = psycopg2.connect(DB)
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT chunk_id, text_content FROM dev.chunk
                WHERE file_uuid = %s AND chunk_type = 'sentence'
                AND (text_content IS NOT NULL AND text_content != '')
                ORDER BY id
            """, (UUID,))
            rows = cur.fetchall()
            print(f"Vectorizing {len(rows)} chunks for {UUID}...")

            stored = 0
            batch = []
            for chunk_id, text in rows:
                embedding = embed_text(text)

                # Store in PostgreSQL chunk_vectors
                cur.execute("""
                    INSERT INTO dev.chunk_vectors (chunk_id, uuid, chunk_type, embedding)
                    VALUES (%s, %s, 'sentence', %s::jsonb)
                    ON CONFLICT (chunk_id, uuid) DO UPDATE SET embedding = EXCLUDED.embedding
                """, (chunk_id, UUID, json.dumps(embedding)))

                # Batch for Qdrant
                batch.append({
                    "id": qdrant_point_id(chunk_id, UUID),
                    "vector": embedding,
                    "payload": {"chunk_id": chunk_id, "chunk_type": "sentence"},
                })
                if len(batch) >= QDRANT_BATCH_SIZE:
                    upsert_points(batch)
                    batch = []

                stored += 1
                if stored % COMMIT_EVERY == 0:
                    print(f"  {stored}/{len(rows)}")
                    conn.commit()

            # Flush the final, partially filled Qdrant batch.
            if batch:
                upsert_points(batch)

            conn.commit()
    finally:
        # Always release the connection; rows not yet committed are discarded
        # when the script aborts with an exception.
        conn.close()
    print(f"Done: {stored} vectors stored")


if __name__ == "__main__":
    main()