- Add SERVICE_INVENTORY_V1.0.0.md (25 source-verified tools, 3.7GB)
- Add ERP_SELECTION_REPORT.md (Odoo CE vs ERPNext comparison)
- Add SFTPGO_ODOO_REPLACEMENT.md (SFTPGo migration plan)
- Add SERVICE_GO_GITEA_BUILD.md (Go compiler + Gitea build report)
- Add release visualize command (face trace heatmap + identity filter)
- Add sqlite-vec integration (160MB SQLite with vec0 vector tables)
- Add export_identities.py, export_sqlite.py, render_face_heatmap.py
- Add Go, Gitea, Rust/Cargo, Swift, yt-dlp, SQLite, sqlite-vec to service CLI
- Fix package to include identities and identity_bindings in data.sql
- Update release list to show all deployed video stats
- Add V1.0.0 YAML frontmatter to all docs (DOCS_STANDARD compliant)
70 lines
2.2 KiB
Python
70 lines
2.2 KiB
Python
#!/opt/homebrew/bin/python3.11
"""Vectorize sentence chunks via Ollama and store them in PostgreSQL + Qdrant.

Usage: python3 <this script> [file_uuid]

For every non-empty 'sentence' chunk of the given file this script:
  1. requests an embedding from the Ollama embeddings endpoint (model MODEL),
  2. upserts it into dev.chunk_vectors as JSONB, and
  3. upserts the same vector into the Qdrant collection COLLECTION.

When no file_uuid is passed on the command line a built-in default is used.
"""

import json
import sys
import time
import uuid
from urllib.request import Request, urlopen

import psycopg2

DB = "dbname=momentry user=accusys"
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
OLLAMA = "http://localhost:11434/api/embeddings"
QDRANT = "http://localhost:6333"

MODEL = "nomic-embed-text-v2-moe:latest"
COLLECTION = "momentry_dev_rule1_v2"
QDRANT_BATCH = 100   # points sent to Qdrant per upsert request
COMMIT_EVERY = 50    # rows between progress reports / PostgreSQL commits
HTTP_TIMEOUT = 300   # seconds; deliberately generous, tune as needed

_JSON_HEADERS = {"Content-Type": "application/json"}


def _embed(text):
    """Return the embedding vector Ollama produces for *text*.

    Raises urllib.error.URLError / HTTPError on transport or HTTP failure and
    KeyError if the response carries no "embedding" field.
    """
    req = Request(
        OLLAMA,
        data=json.dumps({"model": MODEL, "prompt": text}).encode(),
        headers=_JSON_HEADERS,
    )
    # Close the response explicitly instead of leaving the socket to the GC.
    with urlopen(req, timeout=HTTP_TIMEOUT) as resp:
        return json.loads(resp.read())["embedding"]


def _qdrant_point_id(file_uuid, chunk_id):
    """Return a stable Qdrant point id for one chunk of one file.

    Purely numeric chunk ids keep the historical ``int(chunk_id) + 1`` mapping
    so points written by earlier runs are overwritten in place.  Any other
    chunk id gets a deterministic UUID derived from (file_uuid, chunk_id).

    NOTE: the previous fallback, ``len(batch) + 10000``, restarted at 10000
    for every batch and every file, so points overwrote one another.  Points
    stored under those ids by older runs are not touched by this script and
    may need to be cleaned up separately.
    """
    key = str(chunk_id)  # tolerate integer ids coming back from the driver
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
    if key.isdecimal():
        return int(key) + 1
    # Qdrant point ids may be unsigned integers or UUID strings.
    return str(uuid.uuid5(uuid.NAMESPACE_URL, f"{file_uuid}/{key}"))


def _upsert_points(points):
    """Upsert *points* into the Qdrant collection, waiting for completion."""
    req = Request(
        f"{QDRANT}/collections/{COLLECTION}/points?wait=true",
        data=json.dumps({"points": points}).encode(),
        headers=_JSON_HEADERS,
        method="PUT",
    )
    with urlopen(req, timeout=HTTP_TIMEOUT) as resp:
        resp.read()  # drain the body; a non-2xx status already raised


conn = psycopg2.connect(DB)
try:
    with conn.cursor() as cur:
        cur.execute("""
            SELECT chunk_id, text_content FROM dev.chunk
            WHERE file_uuid = %s AND chunk_type = 'sentence'
            AND (text_content IS NOT NULL AND text_content != '')
            ORDER BY id
        """, (UUID,))
        rows = cur.fetchall()
        print(f"Vectorizing {len(rows)} chunks for {UUID}...")

        stored = 0
        batch = []
        for chunk_id, text in rows:
            embedding = _embed(text)

            # Store in PostgreSQL chunk_vectors (idempotent upsert).
            cur.execute("""
                INSERT INTO dev.chunk_vectors (chunk_id, uuid, chunk_type, embedding)
                VALUES (%s, %s, 'sentence', %s::jsonb)
                ON CONFLICT (chunk_id, uuid) DO UPDATE SET embedding = EXCLUDED.embedding
            """, (chunk_id, UUID, json.dumps(embedding)))

            # Batch for Qdrant.
            batch.append({
                "id": _qdrant_point_id(UUID, chunk_id),
                "vector": embedding,
                "payload": {"chunk_id": chunk_id, "chunk_type": "sentence"},
            })
            if len(batch) >= QDRANT_BATCH:
                _upsert_points(batch)
                batch = []

            stored += 1
            if stored % COMMIT_EVERY == 0:
                print(f" {stored}/{len(rows)}")
                conn.commit()

        # Flush whatever is left over from the last partial batch.
        if batch:
            _upsert_points(batch)

        conn.commit()
finally:
    # Always release the connection; closing discards any uncommitted work.
    conn.close()

print(f"Done: {stored} vectors stored")