feat: TKG completion, PG audit, SQLite backup with Qdrant voice vectors

- Add voice_embeddings vec0 table (192D) from Qdrant to SQLite export
- Add tkg_nodes + tkg_edges tables to SQLite export
- Clean orphan TKG data (2414 nodes, 64 chunks)
- Rebuild TKG for both Charade files with speaker nodes
- Create asrx.json from chunk speaker metadata for TKG builder
- PG audit: pre_chunks 1.8GB (largest), 3 empty tables found
- Update release package to include all output files (not just JSON)
- Full backup: 9 SQLite tables + 3 vec0 vector tables
This commit is contained in:
Accusys
2026-05-13 03:03:38 +08:00
parent 2992a0e650
commit 007fe10c2e

View File

@@ -200,13 +200,64 @@ pg_to_sqlite(
"CREATE TABLE IF NOT EXISTS identity_bindings (identity_id INTEGER, identity_type TEXT, identity_value TEXT, confidence REAL)",
[UUID])
# TKG graph tables (tkg_nodes, tkg_edges).
# Each entry is (PostgreSQL SELECT, SQLite table name, SQLite CREATE TABLE DDL).
# Both SELECTs are restricted to one file_uuid and cast `properties` to text,
# matching the TEXT `properties` column declared in the SQLite DDL.
tkg_exports = (
    (
        "SELECT id, node_type, external_id, file_uuid, label, properties::text FROM dev.tkg_nodes WHERE file_uuid=%s",
        "tkg_nodes",
        "CREATE TABLE IF NOT EXISTS tkg_nodes (id INTEGER PRIMARY KEY, node_type TEXT, external_id TEXT, file_uuid TEXT, label TEXT, properties TEXT)",
    ),
    (
        "SELECT id, edge_type, source_node_id, target_node_id, file_uuid, properties::text FROM dev.tkg_edges WHERE file_uuid=%s",
        "tkg_edges",
        "CREATE TABLE IF NOT EXISTS tkg_edges (id INTEGER PRIMARY KEY, edge_type TEXT, source_node_id INTEGER, target_node_id INTEGER, file_uuid TEXT, properties TEXT)",
    ),
)
# Nodes are exported before edges, in the same order as the tuple above.
for pg_select, sqlite_table, sqlite_ddl in tkg_exports:
    pg_to_sqlite(pg_select, sqlite_table, sqlite_ddl, [UUID])
# Voice vectors from Qdrant (ECAPA-TDNN speaker embeddings, 192D)
# Best-effort step: any failure is reported as a warning and the rest of the
# export continues. Pages through the Qdrant scroll endpoint, keeps the vectors
# whose payload chunk_id exists in the SQLite `chunk` table, and stores them in
# a vec0 virtual table keyed by an integer rowid derived from the chunk_id.
print(" Exporting voice vectors from Qdrant...")
try:
    import zlib
    from urllib.request import Request, urlopen

    lite_cur.execute("SELECT chunk_id FROM chunk")
    db_chunk_ids = {r[0] for r in lite_cur.fetchall()}
    qdrant_chunks = {}
    offset_val = None
    while True:
        data = {"limit": 100, "with_vector": True, "with_payload": True}
        # Compare against None, not truthiness: an offset of 0 is falsy and
        # would otherwise restart the scroll from the first page forever.
        if offset_val is not None:
            data["offset"] = offset_val
        req = Request("http://localhost:6333/collections/momentry_dev_voice/points/scroll",
                      data=json.dumps(data).encode(),
                      headers={"Content-Type": "application/json"}, method="POST")
        # The timeout keeps a hung Qdrant from blocking the export; the
        # context manager closes the HTTP response after each page.
        with urlopen(req, timeout=30) as http_resp:
            resp = json.loads(http_resp.read())
        result = resp["result"]
        for pt in result.get("points", []):
            # A point may carry a null payload; treat it as having no chunk_id.
            cid = (pt.get("payload") or {}).get("chunk_id", "")
            vec = pt.get("vector")
            # Only non-empty plain-list vectors are usable: an empty or
            # dict-shaped value would yield a bogus dimension below.
            if cid in db_chunk_ids and isinstance(vec, list) and vec:
                qdrant_chunks[cid] = vec
        offset_val = result.get("next_page_offset")
        if offset_val is None:
            break
    if qdrant_chunks:
        # The first collected vector defines the table's dimension.
        dim = len(next(iter(qdrant_chunks.values())))
        lite_cur.execute("CREATE VIRTUAL TABLE IF NOT EXISTS voice_embeddings USING vec0(embedding float[{}])".format(dim))
        used_rowids = set()
        inserted = 0
        for chunk_id, vec in qdrant_chunks.items():
            if len(vec) != dim:
                continue
            key = str(chunk_id)
            if key.isdecimal():
                # isdecimal() only accepts characters that int() can parse.
                rid = int(key)
            else:
                # Built-in hash() of a str is salted per process, so it gives
                # different rowids on every run. CRC32 is stable across runs
                # and keeps the same 31-bit range as before.
                rid = zlib.crc32(key.encode("utf-8")) & 0x7fffffff
            if rid in used_rowids:
                # A duplicate rowid would fail the INSERT and abort the whole
                # step; skip the colliding vector and keep going.
                print(" WARNING: voice_embeddings rowid {} collision, skipping chunk {}".format(rid, chunk_id))
                continue
            used_rowids.add(rid)
            lite_cur.execute("INSERT INTO voice_embeddings (rowid, embedding) VALUES (?, ?)",
                             [rid, json.dumps(vec)])
            inserted += 1
        lite.commit()
        # Report the rows actually written, not the number of candidates.
        print(" voice_embeddings (vec0, {}D): {} vectors".format(dim, inserted))
except Exception as e:
    print(" WARNING: Qdrant voice export skipped: {}".format(e))
# ---- Create indexes ----
print("Creating indexes...")
# (index name, "table(column)") pairs, created in this order. IF NOT EXISTS
# makes each statement a no-op when the index is already present.
index_specs = (
    ("idx_fd_trace", "face_detections(trace_id)"),
    ("idx_fd_identity", "face_detections(identity_id)"),
    ("idx_fd_frame", "face_detections(frame_number)"),
    ("idx_fd_time", "face_detections(timestamp_secs)"),
    ("idx_chunk_chunkid", "chunk(chunk_id)"),
    ("idx_tkg_node_type", "tkg_nodes(node_type)"),
    ("idx_tkg_edge_type", "tkg_edges(edge_type)"),
)
for index_name, index_target in index_specs:
    lite_cur.execute("CREATE INDEX IF NOT EXISTS {} ON {}".format(index_name, index_target))
lite.commit()
# ---- Stats ----
@@ -223,7 +274,7 @@ if VEC_DYLIB:
lite.load_extension(VEC_DYLIB)
lite.enable_load_extension(False)
c = lite.cursor()
for tbl in ['videos', 'chunk', 'face_detections', 'identities', 'identity_bindings']:
for tbl in ['videos', 'chunk', 'face_detections', 'identities', 'identity_bindings', 'tkg_nodes', 'tkg_edges']:
c.execute(f"SELECT COUNT(*) FROM {tbl}")
print(f" {tbl}: {c.fetchone()[0]} rows")
# Check vec tables
@@ -235,4 +286,8 @@ try:
c.execute("SELECT COUNT(*) FROM face_embeddings")
print(f" face_embeddings (vec0, 512D): {c.fetchone()[0]} vectors")
except: print(" face_embeddings: N/A")
# Report how many vectors the voice_embeddings table holds; print N/A when the
# query fails (for example, when the table was never created).
try:
    c.execute("SELECT COUNT(*) FROM voice_embeddings")
    print(f" voice_embeddings (vec0, 192D): {c.fetchone()[0]} vectors")
except Exception:
    # Not a bare `except:` so Ctrl-C and SystemExit still propagate.
    print(" voice_embeddings: N/A")
c.close(); lite.close()