feat: TKG completion, PG audit, SQLite backup with Qdrant voice vectors
- Add voice_embeddings vec0 table (192D) from Qdrant to SQLite export
- Add tkg_nodes + tkg_edges tables to SQLite export
- Clean orphan TKG data (2414 nodes, 64 chunks)
- Rebuild TKG for both Charade files with speaker nodes
- Create asrx.json from chunk speaker metadata for TKG builder
- PG audit: pre_chunks 1.8GB (largest), 3 empty tables found
- Update release package to include all output files (not just JSON)
- Full backup: 9 SQLite tables + 3 vec0 vector tables
This commit is contained in:
@@ -200,13 +200,64 @@ pg_to_sqlite(
|
||||
"CREATE TABLE IF NOT EXISTS identity_bindings (identity_id INTEGER, identity_type TEXT, identity_value TEXT, confidence REAL)",
|
||||
[UUID])
|
||||
|
||||
# tkg_nodes: copy the TKG node rows for this file_uuid out of dev.tkg_nodes.
# `properties` is cast to text in the SELECT and stored in a TEXT column.
# NOTE(review): pg_to_sqlite is defined outside this view; the argument order
# (select SQL, table name, CREATE TABLE DDL, query params) mirrors the call above.
pg_to_sqlite(
    "SELECT id, node_type, external_id, file_uuid, label, properties::text "
    "FROM dev.tkg_nodes WHERE file_uuid=%s",
    "tkg_nodes",
    "CREATE TABLE IF NOT EXISTS tkg_nodes ("
    "id INTEGER PRIMARY KEY, node_type TEXT, external_id TEXT, "
    "file_uuid TEXT, label TEXT, properties TEXT)",
    [UUID],
)
|
||||
|
||||
# tkg_edges: copy the TKG edge rows for this file_uuid out of dev.tkg_edges.
# source_node_id / target_node_id are stored as plain INTEGER columns (no
# foreign-key constraint is declared); `properties` is cast to text.
pg_to_sqlite(
    "SELECT id, edge_type, source_node_id, target_node_id, file_uuid, properties::text "
    "FROM dev.tkg_edges WHERE file_uuid=%s",
    "tkg_edges",
    "CREATE TABLE IF NOT EXISTS tkg_edges ("
    "id INTEGER PRIMARY KEY, edge_type TEXT, source_node_id INTEGER, "
    "target_node_id INTEGER, file_uuid TEXT, properties TEXT)",
    [UUID],
)
|
||||
|
||||
# Voice vectors from Qdrant (ECAPA-TDNN speaker embeddings, 192D)
#
# Best-effort step: any failure (Qdrant unreachable, vec0 extension missing,
# malformed response) is reported as a warning and the export continues.
# Pages through the `momentry_dev_voice` collection with the scroll API, keeps
# only points whose payload chunk_id exists in the SQLite `chunk` table, and
# writes them to a vec0 virtual table keyed by a rowid derived from chunk_id.
print(" Exporting voice vectors from Qdrant...")
try:
    import zlib
    from urllib.request import Request, urlopen

    lite_cur.execute("SELECT chunk_id FROM chunk")
    db_chunk_ids = {row[0] for row in lite_cur.fetchall()}

    qdrant_chunks = {}
    offset_val = None
    while True:
        data = {"limit": 100, "with_vector": True, "with_payload": True}
        # Test against None, not truthiness: a falsy offset such as 0 would
        # otherwise be dropped and the scroll would restart from the first
        # page forever.
        if offset_val is not None:
            data["offset"] = offset_val
        req = Request("http://localhost:6333/collections/momentry_dev_voice/points/scroll",
                      data=json.dumps(data).encode(),
                      headers={"Content-Type": "application/json"}, method="POST")
        # Bounded wait so a hung Qdrant cannot stall the backup; the context
        # manager closes the HTTP response after each page.
        with urlopen(req, timeout=30) as http_resp:
            resp = json.loads(http_resp.read())
        for pt in resp["result"].get("points", []):
            cid = (pt.get("payload") or {}).get("chunk_id", "")
            vec = pt.get("vector")
            # Skip points without a plain, non-empty list vector (null vectors
            # or named-vector dicts cannot be stored in a single vec0 column).
            if cid in db_chunk_ids and isinstance(vec, list) and vec:
                qdrant_chunks[cid] = vec
        offset_val = resp["result"].get("next_page_offset")
        if offset_val is None:
            break

    if qdrant_chunks:
        # The first vector defines the table dimension; others must match.
        dim = len(next(iter(qdrant_chunks.values())))
        lite_cur.execute("CREATE VIRTUAL TABLE IF NOT EXISTS voice_embeddings USING vec0(embedding float[{}])".format(dim))
        used_rowids = set()
        inserted = 0
        for chunk_id, vec in qdrant_chunks.items():
            if len(vec) != dim:
                continue
            text_id = str(chunk_id)
            if text_id.isdecimal():
                rid = int(text_id)
            else:
                # Built-in hash() of str is salted per process, so it would
                # yield different rowids on every run. CRC32 is stable, so a
                # consumer can recompute the rowid from the chunk_id.
                rid = zlib.crc32(text_id.encode("utf-8")) & 0x7fffffff
            if rid in used_rowids:
                # A 31-bit id can collide; skip rather than abort the export.
                print(" WARNING: voice_embeddings rowid collision, skipped chunk {}".format(chunk_id))
                continue
            used_rowids.add(rid)
            lite_cur.execute("INSERT INTO voice_embeddings (rowid, embedding) VALUES (?, ?)",
                             [rid, json.dumps(vec)])
            inserted += 1
        lite.commit()
        # Report rows actually written, not candidates collected.
        print(" voice_embeddings (vec0, {}D): {} vectors".format(dim, inserted))
except Exception as e:
    print(" WARNING: Qdrant voice export skipped: {}".format(e))
|
||||
|
||||
# ---- Create indexes ----
print("Creating indexes...")
# (index name, table, indexed column), created in this order.
# IF NOT EXISTS keeps the step safe to re-run against an existing database.
index_specs = (
    ("idx_fd_trace", "face_detections", "trace_id"),
    ("idx_fd_identity", "face_detections", "identity_id"),
    ("idx_fd_frame", "face_detections", "frame_number"),
    ("idx_fd_time", "face_detections", "timestamp_secs"),
    ("idx_chunk_chunkid", "chunk", "chunk_id"),
    ("idx_tkg_node_type", "tkg_nodes", "node_type"),
    ("idx_tkg_edge_type", "tkg_edges", "edge_type"),
)
for index_name, table_name, column_name in index_specs:
    lite_cur.execute(
        "CREATE INDEX IF NOT EXISTS {} ON {}({})".format(index_name, table_name, column_name)
    )
lite.commit()
|
||||
|
||||
# ---- Stats ----
|
||||
@@ -223,7 +274,7 @@ if VEC_DYLIB:
|
||||
lite.load_extension(VEC_DYLIB)
|
||||
lite.enable_load_extension(False)
|
||||
c = lite.cursor()
|
||||
# Print the row count of every exported relational table, TKG tables included.
# Only the current table list is kept; the older five-table loop header that
# preceded it was superseded by this one.
# Table names come from this fixed literal list, so interpolating them into
# the SQL text is safe (identifiers cannot be bound as parameters).
for tbl in ['videos', 'chunk', 'face_detections', 'identities', 'identity_bindings', 'tkg_nodes', 'tkg_edges']:
    c.execute(f"SELECT COUNT(*) FROM {tbl}")
    print(f" {tbl}: {c.fetchone()[0]} rows")
|
||||
# Check vec tables
|
||||
@@ -235,4 +286,8 @@ try:
|
||||
c.execute("SELECT COUNT(*) FROM face_embeddings")
|
||||
print(f" face_embeddings (vec0, 512D): {c.fetchone()[0]} vectors")
|
||||
except: print(" face_embeddings: N/A")
|
||||
# Report the voice vector count. The table is absent when the Qdrant export
# was skipped, so a failed query is reported as N/A rather than raised.
# NOTE(review): the "192D" label is hard-coded here, while the export derives
# the dimension from the data; confirm they always agree.
try:
    c.execute("SELECT COUNT(*) FROM voice_embeddings")
    print(f" voice_embeddings (vec0, 192D): {c.fetchone()[0]} vectors")
except Exception:
    # Not a bare `except:`, so Ctrl-C and SystemExit still propagate.
    print(" voice_embeddings: N/A")

# Release the cursor first, then the connection.
c.close()
lite.close()
|
||||
|
||||
Reference in New Issue
Block a user