feat: TKG completion, PG audit, SQLite backup with Qdrant voice vectors
- Add voice_embeddings vec0 table (192D) from Qdrant to SQLite export
- Add tkg_nodes + tkg_edges tables to SQLite export
- Clean orphan TKG data (2414 nodes, 64 chunks)
- Rebuild TKG for both Charade files with speaker nodes
- Create asrx.json from chunk speaker metadata for TKG builder
- PG audit: pre_chunks 1.8GB (largest), 3 empty tables found
- Update release package to include all output files (not just JSON)
- Full backup: 9 SQLite tables + 3 vec0 vector tables
This commit is contained in:
@@ -200,13 +200,64 @@ pg_to_sqlite(
|
|||||||
"CREATE TABLE IF NOT EXISTS identity_bindings (identity_id INTEGER, identity_type TEXT, identity_value TEXT, confidence REAL)",
|
"CREATE TABLE IF NOT EXISTS identity_bindings (identity_id INTEGER, identity_type TEXT, identity_value TEXT, confidence REAL)",
|
||||||
[UUID])
|
[UUID])
|
||||||
|
|
||||||
|
# tkg_nodes
|
||||||
|
pg_to_sqlite(
|
||||||
|
"SELECT id, node_type, external_id, file_uuid, label, properties::text FROM dev.tkg_nodes WHERE file_uuid=%s",
|
||||||
|
"tkg_nodes",
|
||||||
|
"CREATE TABLE IF NOT EXISTS tkg_nodes (id INTEGER PRIMARY KEY, node_type TEXT, external_id TEXT, file_uuid TEXT, label TEXT, properties TEXT)",
|
||||||
|
[UUID])
|
||||||
|
|
||||||
|
# tkg_edges
|
||||||
|
pg_to_sqlite(
|
||||||
|
"SELECT id, edge_type, source_node_id, target_node_id, file_uuid, properties::text FROM dev.tkg_edges WHERE file_uuid=%s",
|
||||||
|
"tkg_edges",
|
||||||
|
"CREATE TABLE IF NOT EXISTS tkg_edges (id INTEGER PRIMARY KEY, edge_type TEXT, source_node_id INTEGER, target_node_id INTEGER, file_uuid TEXT, properties TEXT)",
|
||||||
|
[UUID])
|
||||||
|
|
||||||
|
# Voice vectors from Qdrant (ECAPA-TDNN speaker embeddings, 192D)
#
# Best-effort step: any failure (Qdrant unreachable, vec0 extension missing,
# malformed response) is reported as a warning and the export continues.
# Only points whose payload chunk_id exists in the local `chunk` table are
# copied into the `voice_embeddings` vec0 table.
print(" Exporting voice vectors from Qdrant...")
try:
    import hashlib
    from urllib.request import Request, urlopen

    lite_cur.execute("SELECT chunk_id FROM chunk")
    db_chunk_ids = {r[0] for r in lite_cur.fetchall()}

    # Page through the collection with the scroll API, 100 points at a time.
    qdrant_chunks = {}
    offset_val = None
    while True:
        data = {"limit": 100, "with_vector": True, "with_payload": True}
        # Compare against None: an offset is a point id, and a falsy id
        # (e.g. 0) must still be sent.
        if offset_val is not None:
            data["offset"] = offset_val
        req = Request("http://localhost:6333/collections/momentry_dev_voice/points/scroll",
                      data=json.dumps(data).encode(),
                      headers={"Content-Type": "application/json"}, method="POST")
        # Bounded wait and explicit close so a hung Qdrant cannot stall the
        # whole backup or leak the connection.
        with urlopen(req, timeout=30) as http_resp:
            resp = json.loads(http_resp.read())
        result = resp["result"]
        for pt in result.get("points", []):
            # A point may carry a null payload; treat it as "no chunk_id".
            cid = (pt.get("payload") or {}).get("chunk_id", "")
            vec = pt.get("vector")
            # Only plain, non-empty list vectors can be stored in vec0;
            # skip anything else (e.g. a dict of named vectors).
            if cid in db_chunk_ids and isinstance(vec, list) and vec:
                qdrant_chunks[cid] = vec
        offset_val = result.get("next_page_offset")
        if offset_val is None:
            break

    if qdrant_chunks:
        # The first vector fixes the table dimension; mismatching vectors
        # are skipped below.
        dim = len(next(iter(qdrant_chunks.values())))
        lite_cur.execute("CREATE VIRTUAL TABLE IF NOT EXISTS voice_embeddings USING vec0(embedding float[{}])".format(dim))
        inserted = 0
        for chunk_id, vec in qdrant_chunks.items():
            if len(vec) != dim:
                continue
            key = str(chunk_id)
            if key.isdecimal():
                rid = int(key)
            else:
                # Built-in hash() is salted per process for strings, so it
                # would give a different rowid on every run. Derive a stable
                # non-negative 63-bit rowid from a content digest instead.
                digest = hashlib.blake2b(key.encode("utf-8"), digest_size=8).digest()
                rid = int.from_bytes(digest, "big") & 0x7fffffffffffffff
            lite_cur.execute("INSERT INTO voice_embeddings (rowid, embedding) VALUES (?, ?)",
                             [rid, json.dumps(vec)])
            inserted += 1
        lite.commit()
        # Report what was actually written, not what was fetched.
        print(" voice_embeddings (vec0, {}D): {} vectors".format(dim, inserted))
except Exception as e:
    print(" WARNING: Qdrant voice export skipped: {}".format(e))
|
||||||
|
|
||||||
# ---- Create indexes ----
|
# ---- Create indexes ----
|
||||||
print("Creating indexes...")
|
print("Creating indexes...")
|
||||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_trace ON face_detections(trace_id)")
|
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_trace ON face_detections(trace_id)")
|
||||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_identity ON face_detections(identity_id)")
|
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_identity ON face_detections(identity_id)")
|
||||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_frame ON face_detections(frame_number)")
|
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_frame ON face_detections(frame_number)")
|
||||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_time ON face_detections(timestamp_secs)")
|
|
||||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_chunk_chunkid ON chunk(chunk_id)")
|
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_chunk_chunkid ON chunk(chunk_id)")
|
||||||
|
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_tkg_node_type ON tkg_nodes(node_type)")
|
||||||
|
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_tkg_edge_type ON tkg_edges(edge_type)")
|
||||||
lite.commit()
|
lite.commit()
|
||||||
|
|
||||||
# ---- Stats ----
|
# ---- Stats ----
|
||||||
@@ -223,7 +274,7 @@ if VEC_DYLIB:
|
|||||||
lite.load_extension(VEC_DYLIB)
|
lite.load_extension(VEC_DYLIB)
|
||||||
lite.enable_load_extension(False)
|
lite.enable_load_extension(False)
|
||||||
c = lite.cursor()
|
c = lite.cursor()
|
||||||
for tbl in ['videos', 'chunk', 'face_detections', 'identities', 'identity_bindings']:
|
for tbl in ['videos', 'chunk', 'face_detections', 'identities', 'identity_bindings', 'tkg_nodes', 'tkg_edges']:
|
||||||
c.execute(f"SELECT COUNT(*) FROM {tbl}")
|
c.execute(f"SELECT COUNT(*) FROM {tbl}")
|
||||||
print(f" {tbl}: {c.fetchone()[0]} rows")
|
print(f" {tbl}: {c.fetchone()[0]} rows")
|
||||||
# Check vec tables
|
# Check vec tables
|
||||||
@@ -235,4 +286,8 @@ try:
|
|||||||
c.execute("SELECT COUNT(*) FROM face_embeddings")
|
c.execute("SELECT COUNT(*) FROM face_embeddings")
|
||||||
print(f" face_embeddings (vec0, 512D): {c.fetchone()[0]} vectors")
|
print(f" face_embeddings (vec0, 512D): {c.fetchone()[0]} vectors")
|
||||||
except: print(" face_embeddings: N/A")
|
except: print(" face_embeddings: N/A")
|
||||||
|
# Report how many voice vectors were exported. The table is absent when the
# Qdrant export was skipped, so a failed query is reported as "N/A".
try:
    c.execute("SELECT COUNT(*) FROM voice_embeddings")
    print(f" voice_embeddings (vec0, 192D): {c.fetchone()[0]} vectors")
except Exception:
    # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
    print(" voice_embeddings: N/A")
|
||||||
c.close(); lite.close()
|
c.close(); lite.close()
|
||||||
|
|||||||
Reference in New Issue
Block a user