Files
momentry_core/scripts/vectorize_chunks.py
Accusys 2992a0e650 feat: service inventory, ERP reports, sqlite-vec integration, visualize tool
- Add SERVICE_INVENTORY_V1.0.0.md (25 source-verified tools, 3.7GB)
- Add ERP_SELECTION_REPORT.md (Odoo CE vs ERPNext comparison)
- Add SFTPGO_ODOO_REPLACEMENT.md (SFTPGo migration plan)
- Add SERVICE_GO_GITEA_BUILD.md (Go compiler + Gitea build report)
- Add release visualize command (face trace heatmap + identity filter)
- Add sqlite-vec integration (160MB SQLite with vec0 vector tables)
- Add export_identities.py, export_sqlite.py, render_face_heatmap.py
- Add Go, Gitea, Rust/Cargo, Swift, yt-dlp, SQLite, sqlite-vec to service CLI
- Fix package to include identities and identity_bindings in data.sql
- Update release list to show all deployed video stats
- Add V1.0.0 YAML frontmatter to all docs (DOCS_STANDARD compliant)
2026-05-13 02:37:45 +08:00

70 lines
2.2 KiB
Python

#!/opt/homebrew/bin/python3.11
"""Vectorize sentence chunks via Ollama mxbai-embed-large and store in DB + Qdrant."""
import json
import sys
import time
import uuid
from urllib.request import Request, urlopen

import psycopg2
DB = "dbname=momentry user=accusys"
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
OLLAMA = "http://localhost:11434/api/embeddings"
QDRANT = "http://localhost:6333"
conn = psycopg2.connect(DB)
cur = conn.cursor()
cur.execute("""
SELECT chunk_id, text_content FROM dev.chunk
WHERE file_uuid = %s AND chunk_type = 'sentence'
AND (text_content IS NOT NULL AND text_content != '')
ORDER BY id
""", (UUID,))
rows = cur.fetchall()
print(f"Vectorizing {len(rows)} chunks for {UUID}...")
stored = 0
batch = []
for chunk_id, text in rows:
req = Request(OLLAMA, data=json.dumps({
"model": "nomic-embed-text-v2-moe:latest",
"prompt": text
}).encode(), headers={"Content-Type": "application/json"})
resp = json.loads(urlopen(req).read())
embedding = resp["embedding"]
# Store in PostgreSQL chunk_vectors
cur.execute("""
INSERT INTO dev.chunk_vectors (chunk_id, uuid, chunk_type, embedding)
VALUES (%s, %s, 'sentence', %s::jsonb)
ON CONFLICT (chunk_id, uuid) DO UPDATE SET embedding = EXCLUDED.embedding
""", (chunk_id, UUID, json.dumps(embedding)))
# Batch for Qdrant
batch.append({
"id": int(chunk_id) + 1 if chunk_id.isdigit() else len(batch) + 10000,
"vector": embedding,
"payload": {"chunk_id": chunk_id, "chunk_type": "sentence"}
})
if len(batch) >= 100:
req = Request(f"{QDRANT}/collections/momentry_dev_rule1_v2/points?wait=true",
data=json.dumps({"points": batch}).encode(),
headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req)
batch = []
stored += 1
if stored % 50 == 0:
print(f" {stored}/{len(rows)}")
conn.commit()
if batch:
req = Request(f"{QDRANT}/collections/momentry_dev_rule1_v2/points?wait=true",
data=json.dumps({"points": batch}).encode(),
headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req)
conn.commit()
cur.close()
conn.close()
print(f"Done: {stored} vectors stored")