From 007fe10c2e1c5da330e561a34057ce57fc4b9b7a Mon Sep 17 00:00:00 2001
From: Accusys
Date: Wed, 13 May 2026 03:03:38 +0800
Subject: [PATCH] feat: TKG completion, PG audit, SQLite backup with Qdrant voice vectors

- Add voice_embeddings vec0 table (192D) from Qdrant to SQLite export
- Add tkg_nodes + tkg_edges tables to SQLite export
- Clean orphan TKG data (2414 nodes, 64 chunks)
- Rebuild TKG for both Charade files with speaker nodes
- Create asrx.json from chunk speaker metadata for TKG builder
- PG audit: pre_chunks 1.8GB (largest), 3 empty tables found
- Update release package to include all output files (not just JSON)
- Full backup: 9 SQLite tables + 3 vec0 vector tables
---
Review notes (ignored by git am; not part of the commit message):
- Line structure was restored from a whitespace-collapsed copy of this patch.
  Indentation of context lines and spacing inside string literals are
  inferred; check them against the tree before applying.
- Voice export changes relative to the original patch:
  * rowid for non-numeric chunk ids uses zlib.crc32 instead of hash(), whose
    value for str is salted per process and so differs between runs;
  * chunk ids are passed through str() before the digit test, so integer
    payload ids no longer abort the export;
  * the scroll offset is compared with "is not None" so offset 0 is honoured;
  * the HTTP response is closed and the request has a 30 s timeout;
  * points with a missing payload or vector are skipped;
  * the summary line reports the number of vectors actually inserted.
- NOTE(review): this patch also drops the idx_fd_time index on
  face_detections(timestamp_secs); the commit message does not mention it.
  Confirm that the removal is intentional.
- The blob ids on the "index" line were not recomputed for this revision.

 scripts/export_sqlite.py | 69 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 67 insertions(+), 2 deletions(-)

diff --git a/scripts/export_sqlite.py b/scripts/export_sqlite.py
index 35a3af2..bc7402d 100644
--- a/scripts/export_sqlite.py
+++ b/scripts/export_sqlite.py
@@ -200,13 +200,74 @@ pg_to_sqlite(
     "CREATE TABLE IF NOT EXISTS identity_bindings (identity_id INTEGER, identity_type TEXT, identity_value TEXT, confidence REAL)",
     [UUID])
 
+# tkg_nodes
+pg_to_sqlite(
+    "SELECT id, node_type, external_id, file_uuid, label, properties::text FROM dev.tkg_nodes WHERE file_uuid=%s",
+    "tkg_nodes",
+    "CREATE TABLE IF NOT EXISTS tkg_nodes (id INTEGER PRIMARY KEY, node_type TEXT, external_id TEXT, file_uuid TEXT, label TEXT, properties TEXT)",
+    [UUID])
+
+# tkg_edges
+pg_to_sqlite(
+    "SELECT id, edge_type, source_node_id, target_node_id, file_uuid, properties::text FROM dev.tkg_edges WHERE file_uuid=%s",
+    "tkg_edges",
+    "CREATE TABLE IF NOT EXISTS tkg_edges (id INTEGER PRIMARY KEY, edge_type TEXT, source_node_id INTEGER, target_node_id INTEGER, file_uuid TEXT, properties TEXT)",
+    [UUID])
+
+# Voice vectors from Qdrant (ECAPA-TDNN speaker embeddings, 192D)
+print(" Exporting voice vectors from Qdrant...")
+try:
+    import zlib
+    from urllib.request import Request, urlopen
+    lite_cur.execute("SELECT chunk_id FROM chunk")
+    db_chunk_ids = {r[0] for r in lite_cur.fetchall()}
+    qdrant_chunks = {}
+    offset_val = None
+    while True:
+        data = {"limit": 100, "with_vector": True, "with_payload": True}
+        # Compare against None: a point id of 0 is a valid scroll offset.
+        if offset_val is not None:
+            data["offset"] = offset_val
+        req = Request("http://localhost:6333/collections/momentry_dev_voice/points/scroll",
+                      data=json.dumps(data).encode(),
+                      headers={"Content-Type": "application/json"}, method="POST")
+        # Bound the wait and close the HTTP response once it has been read.
+        with urlopen(req, timeout=30) as http_resp:
+            resp = json.loads(http_resp.read())
+        for pt in resp["result"].get("points", []):
+            cid = (pt.get("payload") or {}).get("chunk_id", "")
+            vec = pt.get("vector")
+            # Skip points without a vector so they cannot break the dimension probe below.
+            if cid in db_chunk_ids and vec:
+                qdrant_chunks[cid] = vec
+        offset_val = resp["result"].get("next_page_offset")
+        if offset_val is None:
+            break
+    if qdrant_chunks:
+        dim = len(next(iter(qdrant_chunks.values())))
+        lite_cur.execute("CREATE VIRTUAL TABLE IF NOT EXISTS voice_embeddings USING vec0(embedding float[{}])".format(dim))
+        inserted = 0
+        for chunk_id, vec in qdrant_chunks.items():
+            if len(vec) == dim:
+                cid_text = str(chunk_id)
+                # crc32 is stable across runs; built-in hash() of a str is salted per process.
+                rid = int(cid_text) if cid_text.isdecimal() else zlib.crc32(cid_text.encode("utf-8")) & 0x7fffffff
+                lite_cur.execute("INSERT INTO voice_embeddings (rowid, embedding) VALUES (?, ?)",
+                                 [rid, json.dumps(vec)])
+                inserted += 1
+        lite.commit()
+        print(" voice_embeddings (vec0, {}D): {} vectors".format(dim, inserted))
+except Exception as e:
+    print(" WARNING: Qdrant voice export skipped: {}".format(e))
+
 # ---- Create indexes ----
 print("Creating indexes...")
 lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_trace ON face_detections(trace_id)")
 lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_identity ON face_detections(identity_id)")
 lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_frame ON face_detections(frame_number)")
-lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_time ON face_detections(timestamp_secs)")
 lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_chunk_chunkid ON chunk(chunk_id)")
+lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_tkg_node_type ON tkg_nodes(node_type)")
+lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_tkg_edge_type ON tkg_edges(edge_type)")
 lite.commit()
 
 # ---- Stats ----
@@ -223,7 +284,7 @@ if VEC_DYLIB:
     lite.load_extension(VEC_DYLIB)
     lite.enable_load_extension(False)
 c = lite.cursor()
-for tbl in ['videos', 'chunk', 'face_detections', 'identities', 'identity_bindings']:
+for tbl in ['videos', 'chunk', 'face_detections', 'identities', 'identity_bindings', 'tkg_nodes', 'tkg_edges']:
     c.execute(f"SELECT COUNT(*) FROM {tbl}")
     print(f" {tbl}: {c.fetchone()[0]} rows")
 # Check vec tables
@@ -235,4 +296,8 @@ try:
     c.execute("SELECT COUNT(*) FROM face_embeddings")
     print(f" face_embeddings (vec0, 512D): {c.fetchone()[0]} vectors")
 except: print(" face_embeddings: N/A")
+try:
+    c.execute("SELECT COUNT(*) FROM voice_embeddings")
+    print(f" voice_embeddings (vec0, 192D): {c.fetchone()[0]} vectors")
+except Exception: print(" voice_embeddings: N/A")
 c.close(); lite.close()