feat: service inventory, ERP reports, sqlite-vec integration, visualize tool
- Add SERVICE_INVENTORY_V1.0.0.md (25 source-verified tools, 3.7GB) - Add ERP_SELECTION_REPORT.md (Odoo CE vs ERPNext comparison) - Add SFTPGO_ODOO_REPLACEMENT.md (SFTPGo migration plan) - Add SERVICE_GO_GITEA_BUILD.md (Go compiler + Gitea build report) - Add release visualize command (face trace heatmap + identity filter) - Add sqlite-vec integration (160MB SQLite with vec0 vector tables) - Add export_identities.py, export_sqlite.py, render_face_heatmap.py - Add Go, Gitea, Rust/Cargo, Swift, yt-dlp, SQLite, sqlite-vec to service CLI - Fix package to include identities and identity_bindings in data.sql - Update release list to show all deployed video stats - Add V1.0.0 YAML frontmatter to all docs (DOCS_STANDARD compliant)
This commit is contained in:
69
scripts/vectorize_chunks.py
Normal file
69
scripts/vectorize_chunks.py
Normal file
@@ -0,0 +1,69 @@
|
||||
#!/opt/homebrew/bin/python3.11
"""Vectorize sentence chunks via Ollama and store them in PostgreSQL + Qdrant.

Usage: vectorize_chunks.py [FILE_UUID]

For every non-empty ``sentence`` chunk of the given file, an embedding is
requested from the local Ollama server (model ``MODEL``), upserted into
``dev.chunk_vectors`` and upserted as a point into the Qdrant collection
``COLLECTION``.  Both writes are upserts keyed on stable identifiers, so the
script can safely be re-run for the same file.
"""
import json
import sys
import time
import uuid
from urllib.request import Request, urlopen

DB = "dbname=momentry user=accusys"
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
OLLAMA = "http://localhost:11434/api/embeddings"
QDRANT = "http://localhost:6333"

MODEL = "nomic-embed-text-v2-moe:latest"
COLLECTION = "momentry_dev_rule1_v2"
QDRANT_BATCH = 100    # points sent to Qdrant per upsert request
PROGRESS_EVERY = 50   # report progress / commit to PostgreSQL every N chunks
HTTP_TIMEOUT = 300    # seconds; generous because a cold model load is slow


def qdrant_point_id(chunk_id, file_uuid):
    """Return a stable Qdrant point id for a chunk.

    Purely numeric chunk ids map to ``int(chunk_id) + 1``.  Any other chunk id
    maps to a deterministic UUIDv5 string derived from ``file_uuid`` and
    ``chunk_id``.  A position-in-batch id must not be used here: the batch is
    emptied after every flush, so such ids repeat and later points silently
    overwrite earlier ones.
    """
    key = str(chunk_id)  # tolerate drivers that return integer chunk ids
    # isascii() guards against digits like "²" that isdigit() accepts but
    # int() rejects.
    if key.isascii() and key.isdigit():
        return int(key) + 1
    return str(uuid.uuid5(uuid.NAMESPACE_URL, f"{file_uuid}:{key}"))


def send_json(url, payload, method="POST"):
    """Send ``payload`` as JSON to ``url`` and return the raw response body.

    Raises ``urllib.error.HTTPError`` / ``URLError`` on failure; the response
    is always closed.
    """
    req = Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method=method,
    )
    with urlopen(req, timeout=HTTP_TIMEOUT) as resp:
        return resp.read()


def embed(text):
    """Return the embedding vector Ollama produces for ``text``."""
    body = send_json(OLLAMA, {"model": MODEL, "prompt": text})
    return json.loads(body)["embedding"]


def upsert_points(points):
    """Upsert ``points`` into the Qdrant collection and wait for completion."""
    send_json(
        f"{QDRANT}/collections/{COLLECTION}/points?wait=true",
        {"points": points},
        method="PUT",
    )


def main():
    """Embed every sentence chunk of ``UUID`` and store the vectors."""
    # Imported here so the pure helpers above stay importable on machines
    # without the PostgreSQL driver installed.
    import psycopg2

    conn = psycopg2.connect(DB)
    try:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT chunk_id, text_content FROM dev.chunk
                WHERE file_uuid = %s AND chunk_type = 'sentence'
                  AND (text_content IS NOT NULL AND text_content != '')
                ORDER BY id
            """, (UUID,))
            rows = cur.fetchall()
            print(f"Vectorizing {len(rows)} chunks for {UUID}...")

            stored = 0
            batch = []
            for chunk_id, text in rows:
                embedding = embed(text)

                # Store in PostgreSQL chunk_vectors
                cur.execute("""
                    INSERT INTO dev.chunk_vectors (chunk_id, uuid, chunk_type, embedding)
                    VALUES (%s, %s, 'sentence', %s::jsonb)
                    ON CONFLICT (chunk_id, uuid) DO UPDATE SET embedding = EXCLUDED.embedding
                """, (chunk_id, UUID, json.dumps(embedding)))

                # Batch for Qdrant
                batch.append({
                    "id": qdrant_point_id(chunk_id, UUID),
                    "vector": embedding,
                    "payload": {"chunk_id": chunk_id, "chunk_type": "sentence"},
                })
                if len(batch) >= QDRANT_BATCH:
                    upsert_points(batch)
                    batch = []

                stored += 1
                if stored % PROGRESS_EVERY == 0:
                    print(f" {stored}/{len(rows)}")
                    # Periodic commit so a late failure does not discard
                    # everything embedded so far.
                    conn.commit()

            # Flush the final, partially filled Qdrant batch.
            if batch:
                upsert_points(batch)

            conn.commit()
    finally:
        # Closing without a commit rolls back any uncommitted rows.
        conn.close()

    print(f"Done: {stored} vectors stored")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user