feat: service inventory, ERP reports, sqlite-vec integration, visualize tool
- Add SERVICE_INVENTORY_V1.0.0.md (25 source-verified tools, 3.7GB) - Add ERP_SELECTION_REPORT.md (Odoo CE vs ERPNext comparison) - Add SFTPGO_ODOO_REPLACEMENT.md (SFTPGo migration plan) - Add SERVICE_GO_GITEA_BUILD.md (Go compiler + Gitea build report) - Add release visualize command (face trace heatmap + identity filter) - Add sqlite-vec integration (160MB SQLite with vec0 vector tables) - Add export_identities.py, export_sqlite.py, render_face_heatmap.py - Add Go, Gitea, Rust/Cargo, Swift, yt-dlp, SQLite, sqlite-vec to service CLI - Fix package to include identities and identity_bindings in data.sql - Update release list to show all deployed video stats - Add V1.0.0 YAML frontmatter to all docs (DOCS_STANDARD compliant)
This commit is contained in:
161
scripts/embed_faces.py
Normal file
161
scripts/embed_faces.py
Normal file
@@ -0,0 +1,161 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Process Swift face detection output + add CoreML FaceNet embeddings.
|
||||
Replaces face_processor.py Step 2 when Swift already ran.
|
||||
"""
|
||||
import sys, os, json, argparse, time
|
||||
import cv2
|
||||
import numpy as np
|
||||
import coremltools as ct
|
||||
from pathlib import Path
|
||||
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
FACENET_PATH = os.path.join(SCRIPT_DIR, "..", "models", "facenet512.mlpackage")
|
||||
|
||||
def classify_pose(roll, yaw):
|
||||
abs_yaw = abs(yaw)
|
||||
abs_roll = abs(roll)
|
||||
if abs_yaw < 15 and abs_roll < 15:
|
||||
return "frontal"
|
||||
elif abs_yaw > 30:
|
||||
return "profile_right" if yaw > 0 else "profile_left"
|
||||
else:
|
||||
return "three_quarter"
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--swift-json", required=True, help="Swift detection output")
|
||||
parser.add_argument("--video", required=True, help="Video file path")
|
||||
parser.add_argument("--output", required=True, help="Output face.json path")
|
||||
parser.add_argument("--fps", type=float, default=24.0)
|
||||
args = parser.parse_args()
|
||||
|
||||
print(f"[EMBED] Loading Swift output: {args.swift_json}")
|
||||
with open(args.swift_json) as f:
|
||||
swift = json.load(f)
|
||||
|
||||
swift_frames = swift.get("frames", [])
|
||||
print(f"[EMBED] Swift frames: {len(swift_frames)}")
|
||||
|
||||
# Load CoreML FaceNet
|
||||
facenet = os.path.normpath(FACENET_PATH)
|
||||
coreml_model = None
|
||||
if os.path.exists(facenet):
|
||||
coreml_model = ct.models.MLModel(facenet)
|
||||
print(f"[EMBED] FaceNet loaded")
|
||||
else:
|
||||
print(f"[EMBED] WARNING: FaceNet not found at {facenet}")
|
||||
|
||||
# Open video
|
||||
video = cv2.VideoCapture(args.video)
|
||||
if not video.isOpened():
|
||||
raise RuntimeError(f"Cannot open {args.video}")
|
||||
v_fps = video.get(cv2.CAP_PROP_FPS)
|
||||
v_total = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
v_width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
v_height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
print(f"[EMBED] Video: {v_width}x{v_height}, {v_fps:.1f}fps")
|
||||
|
||||
# Sequential read optimization: build lookup set
|
||||
needed_frames = set()
|
||||
frame_data_map = {}
|
||||
for sf in swift_frames:
|
||||
fn = int(sf.get("frame", sf.get("frame_number", 0)))
|
||||
needed_frames.add(fn)
|
||||
frame_data_map[fn] = sf
|
||||
|
||||
output_frames = []
|
||||
embed_count = 0
|
||||
t0 = time.time()
|
||||
current_frame = 0
|
||||
|
||||
while True:
|
||||
ret, frame = video.read()
|
||||
if not ret:
|
||||
break
|
||||
|
||||
if current_frame not in needed_frames:
|
||||
current_frame += 1
|
||||
continue
|
||||
|
||||
sf = frame_data_map[current_frame]
|
||||
timestamp = sf.get("timestamp", current_frame / v_fps)
|
||||
faces_in = sf.get("faces", [])
|
||||
|
||||
processed_faces = []
|
||||
for face in faces_in:
|
||||
bb = face.get("bbox", {})
|
||||
x, y, w, h = bb.get("x", 0), bb.get("y", 0), bb.get("width", 0), bb.get("height", 0)
|
||||
|
||||
if w <= 10 or h <= 10:
|
||||
continue
|
||||
|
||||
x1, y1 = max(0, x), max(0, y)
|
||||
x2, y2 = min(v_width, x + w), min(v_height, y + h)
|
||||
if x2 <= x1 or y2 <= y1:
|
||||
continue
|
||||
face_img = frame[y1:y2, x1:x2]
|
||||
if face_img.size == 0:
|
||||
continue
|
||||
|
||||
emb = None
|
||||
if coreml_model is not None and face_img.shape[0] > 0 and face_img.shape[1] > 0:
|
||||
try:
|
||||
resized = cv2.resize(face_img, (160, 160))
|
||||
rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB).astype(np.float32)
|
||||
normalized = rgb / 127.5 - 1.0
|
||||
input_data = np.expand_dims(np.transpose(normalized, (2, 0, 1)), axis=0)
|
||||
result = coreml_model.predict({"input": input_data})
|
||||
emb = list(result.values())[0].flatten().tolist()
|
||||
embed_count += 1
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
# Pose
|
||||
pose_info = face.get("pose", {})
|
||||
pose_angle = classify_pose(pose_info.get("roll", 0), pose_info.get("yaw", 0))
|
||||
|
||||
processed_faces.append({
|
||||
"x": x, "y": y, "width": w, "height": h,
|
||||
"confidence": face.get("confidence", 0.5),
|
||||
"embedding": emb,
|
||||
"pose_angle": {
|
||||
"angle": pose_angle,
|
||||
"roll": pose_info.get("roll", 0),
|
||||
"yaw": pose_info.get("yaw", 0),
|
||||
"pitch": pose_info.get("pitch", 0),
|
||||
},
|
||||
"lips": face.get("lips"),
|
||||
"landmarks": face.get("landmarks"),
|
||||
"attributes": None,
|
||||
})
|
||||
|
||||
if processed_faces:
|
||||
output_frames.append({
|
||||
"frame": current_frame,
|
||||
"timestamp": timestamp,
|
||||
"faces": processed_faces,
|
||||
})
|
||||
|
||||
current_frame += 1
|
||||
|
||||
if len(output_frames) % 500 == 0:
|
||||
print(f"[EMBED] {len(output_frames)}/{len(needed_frames)} frames, {embed_count} embeddings, {time.time()-t0:.0f}s")
|
||||
|
||||
video.release()
|
||||
|
||||
output = {
|
||||
"frame_count": len(output_frames),
|
||||
"fps": v_fps,
|
||||
"frames": output_frames,
|
||||
}
|
||||
|
||||
os.makedirs(os.path.dirname(args.output), exist_ok=True)
|
||||
with open(args.output, "w") as f:
|
||||
json.dump(output, f, indent=2, ensure_ascii=False)
|
||||
|
||||
elapsed = time.time() - t0
|
||||
print(f"[EMBED] Done: {len(output_frames)} frames, {embed_count} embeddings, {elapsed:.0f}s → {args.output}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
131
scripts/export_file_package.py
Normal file
131
scripts/export_file_package.py
Normal file
@@ -0,0 +1,131 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Export a single file's data to SQL file (COPY format).
|
||||
Usage: python3 export_file_package.py <file_uuid> <output_dir>
|
||||
"""
|
||||
import json, os, sys, subprocess
|
||||
|
||||
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry"
|
||||
|
||||
TABLES = [
|
||||
("dev.videos", "file_uuid"),
|
||||
("dev.chunk", "file_uuid"),
|
||||
("dev.chunk_vectors", "uuid"),
|
||||
("dev.face_detections", "file_uuid"),
|
||||
]
|
||||
|
||||
def main():
|
||||
uuid = sys.argv[1] if len(sys.argv) > 1 else "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
outdir = sys.argv[2] if len(sys.argv) > 2 else "/tmp/file_pkg"
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
sql_path = os.path.join(outdir, "data.sql")
|
||||
|
||||
print(f"Exporting {uuid} → {sql_path}")
|
||||
with open(sql_path, "w") as f:
|
||||
f.write(f"-- File package: {uuid}\nBEGIN;\n\n")
|
||||
|
||||
for tbl, col in TABLES:
|
||||
f.write(f"-- {tbl} WHERE {col} = '{uuid}'\n")
|
||||
|
||||
# Get column list
|
||||
schema, table = tbl.split(".")
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A",
|
||||
"-c", f"SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='{schema}' AND table_name='{table}' AND is_updatable='YES'"],
|
||||
capture_output=True, text=True, timeout=15)
|
||||
cols = r.stdout.strip()
|
||||
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-c",
|
||||
f"COPY (SELECT * FROM {tbl} WHERE {col} = '{uuid}') TO STDOUT WITH CSV HEADER"],
|
||||
capture_output=True, text=True, timeout=60)
|
||||
if r.stdout.strip():
|
||||
f.write(f"COPY {tbl} ({cols}) FROM STDIN WITH CSV HEADER;\n")
|
||||
f.write(r.stdout)
|
||||
if not r.stdout.endswith("\n"):
|
||||
f.write("\n")
|
||||
f.write("\\.\n\n")
|
||||
|
||||
# Export identities referenced by this file's face_detections
|
||||
f.write(f"-- dev.identities (referenced by face_detections WHERE file_uuid='{uuid}')\n")
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A",
|
||||
"-c", "SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='dev' AND table_name='identities' AND is_updatable='YES'"],
|
||||
capture_output=True, text=True, timeout=15)
|
||||
cols = r.stdout.strip()
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-c",
|
||||
f"COPY (SELECT DISTINCT i.* FROM dev.identities i INNER JOIN dev.face_detections fd ON fd.identity_id = i.id WHERE fd.file_uuid = '{uuid}') TO STDOUT WITH CSV HEADER"],
|
||||
capture_output=True, text=True, timeout=60)
|
||||
if r.stdout.strip():
|
||||
f.write(f"COPY dev.identities ({cols}) FROM STDIN WITH CSV HEADER;\n")
|
||||
f.write(r.stdout)
|
||||
if not r.stdout.endswith("\n"):
|
||||
f.write("\n")
|
||||
f.write("\\.\n\n")
|
||||
|
||||
# Export identity_bindings for identities referenced by this file
|
||||
f.write(f"-- dev.identity_bindings (for identities in face_detections WHERE file_uuid='{uuid}')\n")
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A",
|
||||
"-c", "SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='dev' AND table_name='identity_bindings' AND is_updatable='YES'"],
|
||||
capture_output=True, text=True, timeout=15)
|
||||
cols = r.stdout.strip()
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-c",
|
||||
f"COPY (SELECT DISTINCT ib.* FROM dev.identity_bindings ib INNER JOIN dev.face_detections fd ON fd.identity_id = ib.identity_id WHERE fd.file_uuid = '{uuid}') TO STDOUT WITH CSV HEADER"],
|
||||
capture_output=True, text=True, timeout=60)
|
||||
if r.stdout.strip():
|
||||
f.write(f"COPY dev.identity_bindings ({cols}) FROM STDIN WITH CSV HEADER;\n")
|
||||
f.write(r.stdout)
|
||||
if not r.stdout.endswith("\n"):
|
||||
f.write("\n")
|
||||
f.write("\\.\n\n")
|
||||
|
||||
f.write("COMMIT;\n")
|
||||
|
||||
size = os.path.getsize(sql_path)
|
||||
print(f" {sql_path} ({size/1024/1024:.1f} MB)")
|
||||
|
||||
# Copy video file to package
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A",
|
||||
"-c", f"SELECT file_path FROM dev.videos WHERE file_uuid='{uuid}'"],
|
||||
capture_output=True, text=True, timeout=15)
|
||||
video_path = r.stdout.strip()
|
||||
if video_path and os.path.exists(video_path):
|
||||
video_name = os.path.basename(video_path)
|
||||
dest = os.path.join(outdir, video_name)
|
||||
import shutil
|
||||
shutil.copy2(video_path, dest)
|
||||
vsize = os.path.getsize(dest)
|
||||
print(f" {video_name} ({vsize/1024/1024:.0f} MB)")
|
||||
else:
|
||||
print(f" WARNING: video file not found at {video_path}")
|
||||
|
||||
# file_info.json
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A",
|
||||
"-c", f"SELECT json_build_object('file_uuid', file_uuid, 'file_name', file_name, 'duration', duration, 'fps', fps, 'width', width, 'height', height, 'total_frames', total_frames, 'status', status) FROM dev.videos WHERE file_uuid='{uuid}'"],
|
||||
capture_output=True, text=True, timeout=15)
|
||||
if r.stdout.strip():
|
||||
info = json.loads(r.stdout.strip())
|
||||
with open(os.path.join(outdir, "file_info.json"), "w") as f:
|
||||
json.dump(info, f, indent=2)
|
||||
print(f" file_info.json")
|
||||
|
||||
# Export identities.json (for offline analysis)
|
||||
id_path = os.path.join(outdir, f"{uuid}.identities.json")
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A",
|
||||
"-c", f"SELECT json_build_object('file_uuid', file_uuid) FROM dev.videos WHERE file_uuid='{uuid}'"],
|
||||
capture_output=True, text=True, timeout=15)
|
||||
subprocess.run(
|
||||
["/opt/homebrew/bin/python3.11", os.path.join(os.path.dirname(os.path.abspath(__file__)), "export_identities.py"), uuid, id_path],
|
||||
check=False, timeout=60)
|
||||
if os.path.exists(id_path):
|
||||
print(f" {uuid}.identities.json ({os.path.getsize(id_path)/1024:.0f}KB)")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
74
scripts/export_identities.py
Normal file
74
scripts/export_identities.py
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Export identity data for a video UUID as JSON (for offline analysis).
|
||||
Usage: python3 export_identities.py <file_uuid> [output.json]
|
||||
"""
|
||||
import sys, json, psycopg2
|
||||
|
||||
UUID = sys.argv[1] if len(sys.argv) > 1 else "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
OUT = sys.argv[2] if len(sys.argv) > 2 else f"/Users/accusys/momentry/output_dev/{UUID}.identities.json"
|
||||
|
||||
conn = psycopg2.connect("dbname=momentry user=accusys")
|
||||
cur = conn.cursor()
|
||||
|
||||
# Get identities referenced by this file's face_detections
|
||||
cur.execute("""
|
||||
SELECT DISTINCT i.id, i.name, i.uuid, i.identity_type, i.source, i.status,
|
||||
i.face_embedding, i.voice_embedding, i.reference_data, i.tmdb_id, i.tmdb_profile
|
||||
FROM dev.identities i
|
||||
INNER JOIN dev.face_detections fd ON fd.identity_id = i.id
|
||||
WHERE fd.file_uuid = %s
|
||||
ORDER BY i.id
|
||||
""", (UUID,))
|
||||
rows = cur.fetchall()
|
||||
|
||||
identities = []
|
||||
for r in rows:
|
||||
identities.append({
|
||||
"id": r[0],
|
||||
"name": r[1],
|
||||
"uuid": str(r[2]) if r[2] else None,
|
||||
"identity_type": r[3],
|
||||
"source": r[4],
|
||||
"status": r[5],
|
||||
"tmdb_id": r[9],
|
||||
"tmdb_profile": r[10],
|
||||
})
|
||||
|
||||
# Get identity_bindings for these identities' traces
|
||||
cur.execute("""
|
||||
SELECT DISTINCT ib.identity_id, ib.identity_type, ib.identity_value, ib.confidence
|
||||
FROM dev.identity_bindings ib
|
||||
WHERE ib.identity_id IN (
|
||||
SELECT DISTINCT fd.identity_id FROM dev.face_detections fd WHERE fd.file_uuid = %s
|
||||
)
|
||||
""", (UUID,))
|
||||
bindings = [{"identity_id": r[0], "identity_type": r[1], "identity_value": r[2], "confidence": float(r[3])} for r in cur.fetchall()]
|
||||
|
||||
# Get trace-to-identity mapping from face_detections
|
||||
cur.execute("""
|
||||
SELECT DISTINCT trace_id, identity_id, COUNT(*) as face_count
|
||||
FROM dev.face_detections
|
||||
WHERE file_uuid = %s AND identity_id IS NOT NULL AND trace_id IS NOT NULL
|
||||
GROUP BY trace_id, identity_id ORDER BY trace_id
|
||||
""", (UUID,))
|
||||
trace_map = [{"trace_id": r[0], "identity_id": r[1], "face_count": r[2]} for r in cur.fetchall()]
|
||||
|
||||
cur.close(); conn.close()
|
||||
|
||||
output = {
|
||||
"file_uuid": UUID,
|
||||
"identity_count": len(identities),
|
||||
"binding_count": len(bindings),
|
||||
"trace_mapping_count": len(trace_map),
|
||||
"identities": identities,
|
||||
"bindings": bindings,
|
||||
"trace_to_identity": trace_map,
|
||||
}
|
||||
|
||||
with open(OUT, 'w') as f:
|
||||
json.dump(output, f, indent=2, ensure_ascii=False)
|
||||
|
||||
size_kb = len(json.dumps(output)) / 1024
|
||||
print(f"Exported {len(identities)} identities, {len(bindings)} bindings, {len(trace_map)} trace mappings")
|
||||
print(f" → {OUT} ({size_kb:.0f}KB)")
|
||||
238
scripts/export_sqlite.py
Normal file
238
scripts/export_sqlite.py
Normal file
@@ -0,0 +1,238 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Export a video's data to a self-contained SQLite database for offline app use.
|
||||
Uses sqlite-vec extension for native vector storage.
|
||||
The vec0.dylib must be in the script directory or /tmp/.
|
||||
Usage: python3 export_sqlite.py <file_uuid> [output.sqlite]
|
||||
"""
|
||||
import sys, json, sqlite3, psycopg2, os
|
||||
|
||||
UUID = sys.argv[1] if len(sys.argv) > 1 else "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
OUT = sys.argv[2] if len(sys.argv) > 2 else f"/Users/accusys/momentry/output_dev/{UUID}.sqlite"
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
# Find vec0.dylib
|
||||
VEC_DYLIB = None
|
||||
for path in [
|
||||
os.path.join(SCRIPT_DIR, "vec0.dylib"),
|
||||
"/tmp/vec0.dylib",
|
||||
os.path.join(SCRIPT_DIR, "sqlite-vec", "vec0.dylib"),
|
||||
]:
|
||||
if os.path.exists(path):
|
||||
VEC_DYLIB = path
|
||||
break
|
||||
|
||||
print(f"Exporting {UUID} → {OUT}")
|
||||
if VEC_DYLIB:
|
||||
print(f" sqlite-vec: {VEC_DYLIB}")
|
||||
|
||||
# Connect to PostgreSQL
|
||||
pg = psycopg2.connect("dbname=momentry user=accusys")
|
||||
pg_cur = pg.cursor()
|
||||
|
||||
# Connect to SQLite
|
||||
if os.path.exists(OUT):
|
||||
os.remove(OUT)
|
||||
lite = sqlite3.connect(OUT)
|
||||
|
||||
# Load sqlite-vec extension if available
|
||||
if VEC_DYLIB:
|
||||
lite.enable_load_extension(True)
|
||||
try:
|
||||
lite.load_extension(VEC_DYLIB)
|
||||
print(" sqlite-vec extension loaded")
|
||||
except Exception as e:
|
||||
print(f" WARNING: Could not load sqlite-vec: {e}")
|
||||
lite.enable_load_extension(False)
|
||||
|
||||
lite_cur = lite.cursor()
|
||||
|
||||
# ---- Helper ----
|
||||
def pg_to_sqlite(pg_query, lite_table, lite_schema, params=None, transform=None):
|
||||
"""Copy PostgreSQL query result to SQLite table."""
|
||||
lite_cur.execute(lite_schema)
|
||||
pg_cur.execute(pg_query, params or [])
|
||||
rows = pg_cur.fetchall()
|
||||
if not rows:
|
||||
return 0
|
||||
cols = [d[0] for d in pg_cur.description]
|
||||
placeholders = ",".join(["?" for _ in cols])
|
||||
|
||||
count = 0
|
||||
for row in rows:
|
||||
d = dict(zip(cols, row))
|
||||
if transform:
|
||||
d = transform(d)
|
||||
vals = []
|
||||
for c in cols:
|
||||
v = d.get(c)
|
||||
vals.append(None if v is None else v)
|
||||
try:
|
||||
lite_cur.execute(f"INSERT INTO {lite_table} VALUES ({placeholders})", vals)
|
||||
count += 1
|
||||
except Exception:
|
||||
pass
|
||||
lite.commit()
|
||||
return count
|
||||
|
||||
# Create tables (skip WAL — Python sqlite3 may not support PRAGMA with extensions loaded)
|
||||
print("Creating tables...")
|
||||
|
||||
# videos
|
||||
pg_to_sqlite(
|
||||
"SELECT file_uuid, file_name, file_path, duration, fps, width, height, probe_json::text, status FROM dev.videos WHERE file_uuid=%s",
|
||||
"videos",
|
||||
"CREATE TABLE IF NOT EXISTS videos (file_uuid TEXT PRIMARY KEY, file_name TEXT, file_path TEXT, duration REAL, fps REAL, width INTEGER, height INTEGER, probe_json TEXT, status TEXT)",
|
||||
[UUID])
|
||||
|
||||
# chunk
|
||||
pg_to_sqlite(
|
||||
"SELECT file_uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, metadata->>'speaker_id' as speaker_id FROM dev.chunk WHERE file_uuid=%s AND chunk_type='sentence' ORDER BY chunk_id",
|
||||
"chunk",
|
||||
"""CREATE TABLE IF NOT EXISTS chunk (
|
||||
file_uuid TEXT, chunk_id TEXT, chunk_type TEXT,
|
||||
start_time REAL, end_time REAL, fps REAL,
|
||||
start_frame INTEGER, end_frame INTEGER, text_content TEXT, speaker_id TEXT,
|
||||
PRIMARY KEY(file_uuid, chunk_id))""",
|
||||
[UUID])
|
||||
|
||||
def parse_pg_array(text):
|
||||
"""Parse PostgreSQL array format {0.1,0.2,...} to Python list."""
|
||||
if not text or text == 'null':
|
||||
return None
|
||||
try:
|
||||
text = text.strip('{}')
|
||||
return [float(x) for x in text.split(',') if x.strip()]
|
||||
except:
|
||||
return None
|
||||
|
||||
# chunk vectors → vec0 virtual table
|
||||
print(" Creating vec0 table: chunk_embeddings (768D)...")
|
||||
lite_cur.execute("""
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS chunk_embeddings USING vec0(
|
||||
embedding float[768]
|
||||
)
|
||||
""")
|
||||
pg_cur.execute("SELECT chunk_id, COALESCE(embedding::text, 'null'), uuid FROM dev.chunk_vectors WHERE uuid=%s", [UUID])
|
||||
chunk_vecs = pg_cur.fetchall()
|
||||
if chunk_vecs:
|
||||
for chunk_id, emb_text, _ in chunk_vecs:
|
||||
# chunk_vectors uses JSONB format, not PG array format
|
||||
emb = None
|
||||
try:
|
||||
emb = json.loads(emb_text) if emb_text else None
|
||||
except:
|
||||
pass
|
||||
if not emb:
|
||||
emb = parse_pg_array(emb_text) # fallback
|
||||
if emb and len(emb) == 768:
|
||||
lite_cur.execute(
|
||||
"INSERT INTO chunk_embeddings (rowid, embedding) VALUES (?, ?)",
|
||||
[int(chunk_id) if chunk_id.isdigit() else hash(chunk_id) & 0x7fffffff,
|
||||
json.dumps(emb)])
|
||||
lite.commit()
|
||||
print(f" chunk_embeddings: {len(chunk_vecs)} vectors")
|
||||
|
||||
# face detections
|
||||
def transform_face(row):
|
||||
return row # embedding moved to vec0 table
|
||||
|
||||
pg_to_sqlite(
|
||||
"""SELECT file_uuid, face_id, frame_number, x, y, width, height, confidence,
|
||||
identity_id, trace_id,
|
||||
COALESCE(timestamp_secs, frame_number / 25.0) as timestamp_secs
|
||||
FROM dev.face_detections WHERE file_uuid=%s ORDER BY frame_number""",
|
||||
"face_detections",
|
||||
"""CREATE TABLE IF NOT EXISTS face_detections (
|
||||
file_uuid TEXT, face_id TEXT, frame_number INTEGER,
|
||||
x INTEGER, y INTEGER, width INTEGER, height INTEGER,
|
||||
confidence REAL, identity_id INTEGER, trace_id INTEGER,
|
||||
timestamp_secs REAL)""",
|
||||
[UUID], transform_face)
|
||||
|
||||
# face embeddings → vec0 virtual table (512D)
|
||||
print(" Creating vec0 table: face_embeddings (512D)...")
|
||||
lite_cur.execute("""
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS face_embeddings USING vec0(
|
||||
embedding float[512]
|
||||
)
|
||||
""")
|
||||
pg_cur.execute("SELECT id, COALESCE(embedding::text, 'null') FROM dev.face_detections WHERE file_uuid=%s", [UUID])
|
||||
face_vecs = pg_cur.fetchall()
|
||||
if face_vecs:
|
||||
batch = []
|
||||
for db_id, emb_text in face_vecs:
|
||||
emb = parse_pg_array(emb_text)
|
||||
if emb and len(emb) == 512:
|
||||
batch.append((db_id, json.dumps(emb)))
|
||||
if len(batch) >= 500:
|
||||
lite_cur.executemany("INSERT INTO face_embeddings VALUES (?, ?)", batch)
|
||||
batch = []
|
||||
if batch:
|
||||
lite_cur.executemany("INSERT INTO face_embeddings VALUES (?, ?)", batch)
|
||||
lite.commit()
|
||||
print(f" face_embeddings: {len(face_vecs)} vectors")
|
||||
|
||||
# identities
|
||||
def transform_identity(row):
|
||||
return row
|
||||
|
||||
pg_to_sqlite(
|
||||
"""SELECT DISTINCT i.id, i.name, i.uuid, i.identity_type, i.source, i.status,
|
||||
i.tmdb_id, i.tmdb_profile, i.tmdb_poster
|
||||
FROM dev.identities i
|
||||
INNER JOIN dev.face_detections fd ON fd.identity_id = i.id
|
||||
WHERE fd.file_uuid=%s""",
|
||||
"identities",
|
||||
"""CREATE TABLE IF NOT EXISTS identities (
|
||||
id INTEGER PRIMARY KEY, name TEXT, uuid TEXT, identity_type TEXT,
|
||||
source TEXT, status TEXT, tmdb_id INTEGER,
|
||||
tmdb_profile TEXT, tmdb_poster TEXT)""",
|
||||
[UUID], transform_identity)
|
||||
|
||||
# identity_bindings
|
||||
pg_to_sqlite(
|
||||
"""SELECT DISTINCT ib.identity_id, ib.identity_type, ib.identity_value, ib.confidence
|
||||
FROM dev.identity_bindings ib
|
||||
INNER JOIN dev.face_detections fd ON fd.identity_id = ib.identity_id
|
||||
WHERE fd.file_uuid=%s""",
|
||||
"identity_bindings",
|
||||
"CREATE TABLE IF NOT EXISTS identity_bindings (identity_id INTEGER, identity_type TEXT, identity_value TEXT, confidence REAL)",
|
||||
[UUID])
|
||||
|
||||
# ---- Create indexes ----
|
||||
print("Creating indexes...")
|
||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_trace ON face_detections(trace_id)")
|
||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_identity ON face_detections(identity_id)")
|
||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_frame ON face_detections(frame_number)")
|
||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_fd_time ON face_detections(timestamp_secs)")
|
||||
lite_cur.execute("CREATE INDEX IF NOT EXISTS idx_chunk_chunkid ON chunk(chunk_id)")
|
||||
lite.commit()
|
||||
|
||||
# ---- Stats ----
|
||||
pg_cur.close(); pg.close()
|
||||
lite_cur.close(); lite.close()
|
||||
|
||||
size_mb = os.path.getsize(OUT) / 1024 / 1024
|
||||
print(f"\n {OUT} ({size_mb:.0f}MB)")
|
||||
|
||||
# Verify
|
||||
lite = sqlite3.connect(OUT)
|
||||
if VEC_DYLIB:
|
||||
lite.enable_load_extension(True)
|
||||
lite.load_extension(VEC_DYLIB)
|
||||
lite.enable_load_extension(False)
|
||||
c = lite.cursor()
|
||||
for tbl in ['videos', 'chunk', 'face_detections', 'identities', 'identity_bindings']:
|
||||
c.execute(f"SELECT COUNT(*) FROM {tbl}")
|
||||
print(f" {tbl}: {c.fetchone()[0]} rows")
|
||||
# Check vec tables
|
||||
try:
|
||||
c.execute("SELECT COUNT(*) FROM chunk_embeddings")
|
||||
print(f" chunk_embeddings (vec0, 768D): {c.fetchone()[0]} vectors")
|
||||
except: print(" chunk_embeddings: N/A")
|
||||
try:
|
||||
c.execute("SELECT COUNT(*) FROM face_embeddings")
|
||||
print(f" face_embeddings (vec0, 512D): {c.fetchone()[0]} vectors")
|
||||
except: print(" face_embeddings: N/A")
|
||||
c.close(); lite.close()
|
||||
@@ -49,7 +49,7 @@ def classify_pose(roll: float, yaw: float) -> str:
|
||||
|
||||
class FaceProcessorVision:
|
||||
def __init__(self, video_path: str, output_path: str, uuid: str = "",
|
||||
sample_interval: int = 30):
|
||||
sample_interval: int = 3):
|
||||
self.video_path = video_path
|
||||
self.output_path = output_path
|
||||
self.uuid = uuid
|
||||
@@ -205,7 +205,7 @@ class FaceProcessorVision:
|
||||
"pitch": pose_info.get("pitch", 0),
|
||||
},
|
||||
"lips": face.get("lips"),
|
||||
"landmarks": None,
|
||||
"landmarks": face.get("landmarks"),
|
||||
"attributes": None,
|
||||
})
|
||||
|
||||
@@ -255,7 +255,7 @@ def main():
|
||||
parser.add_argument("video_path", help="Video file path")
|
||||
parser.add_argument("output_path", help="Output JSON path")
|
||||
parser.add_argument("--uuid", "-u", default="")
|
||||
parser.add_argument("--sample-interval", type=int, default=30)
|
||||
parser.add_argument("--sample-interval", type=int, default=3)
|
||||
parser.add_argument("--force", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
129
scripts/identity_bind.py
Normal file
129
scripts/identity_bind.py
Normal file
@@ -0,0 +1,129 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Identity Binding: cluster face traces → identity bindings.
|
||||
Uses face embeddings from face_detections, clusters per trace, creates identities.
|
||||
"""
|
||||
import json, sys, time
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
|
||||
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
|
||||
DB = "dbname=momentry user=accusys"
|
||||
DISTANCE_THRESHOLD = 0.55 # Cosine distance threshold for clustering
|
||||
|
||||
print(f"=== Identity Binding for {UUID} ===")
|
||||
|
||||
conn = psycopg2.connect(DB)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Step 1: Get trace embeddings from face_detections
|
||||
print("Loading face trace data...")
|
||||
cur.execute("""
|
||||
SELECT trace_id, embedding
|
||||
FROM dev.face_detections
|
||||
WHERE file_uuid = %s AND trace_id IS NOT NULL AND embedding IS NOT NULL
|
||||
ORDER BY trace_id, id
|
||||
""", (UUID,))
|
||||
rows = cur.fetchall()
|
||||
print(f"Face detections with embeddings: {len(rows)}")
|
||||
|
||||
# Group by trace_id and compute average embedding
|
||||
trace_embs = {}
|
||||
for trace_id, emb in rows:
|
||||
if trace_id not in trace_embs:
|
||||
trace_embs[trace_id] = []
|
||||
trace_embs[trace_id].append(emb)
|
||||
|
||||
print(f"Unique traces: {len(trace_embs)}")
|
||||
|
||||
# Compute mean embeddings per trace
|
||||
trace_ids = []
|
||||
trace_vectors = []
|
||||
for tid, embs in sorted(trace_embs.items()):
|
||||
mean_emb = np.mean(embs, axis=0)
|
||||
mean_emb = mean_emb / (np.linalg.norm(mean_emb) + 1e-10)
|
||||
trace_ids.append(tid)
|
||||
trace_vectors.append(mean_emb)
|
||||
|
||||
X = np.array(trace_vectors)
|
||||
print(f"Trace vectors shape: {X.shape}")
|
||||
|
||||
# Step 2: Cluster traces
|
||||
print("Clustering traces...")
|
||||
if len(X) > 1:
|
||||
clustering = AgglomerativeClustering(
|
||||
n_clusters=None,
|
||||
distance_threshold=DISTANCE_THRESHOLD,
|
||||
metric='cosine',
|
||||
linkage='average'
|
||||
)
|
||||
labels = clustering.fit_predict(X)
|
||||
else:
|
||||
labels = [0]
|
||||
|
||||
n_clusters = len(set(labels))
|
||||
print(f"Clusters/identities: {n_clusters}")
|
||||
|
||||
# Step 3: Get or create identity records
|
||||
print("Creating identity records...")
|
||||
# Get existing identities
|
||||
cur.execute("SELECT id, uuid FROM dev.identities")
|
||||
existing = {row[0]: row[1] for row in cur.fetchall()}
|
||||
|
||||
# Map cluster -> identity_id
|
||||
cluster_to_identity = {}
|
||||
for cluster_id in sorted(set(labels)):
|
||||
# Create new identity
|
||||
identity_uuid = None
|
||||
cur.execute("""
|
||||
INSERT INTO dev.identities (name, identity_type, source, status, created_at)
|
||||
VALUES (%s, 'face', 'auto', 'active', NOW())
|
||||
ON CONFLICT (name) DO UPDATE SET status = 'active'
|
||||
RETURNING id
|
||||
""", (f"PERSON_{UUID[:8]}_{cluster_id}",))
|
||||
identity_id = cur.fetchone()[0]
|
||||
cluster_to_identity[cluster_id] = identity_id
|
||||
print(f" Cluster {cluster_id}: new identity {identity_id} (PERSON_{cluster_id})")
|
||||
|
||||
# Step 4: Create identity bindings
|
||||
print("Creating identity bindings...")
|
||||
bindings = 0
|
||||
for tid, label in zip(trace_ids, labels):
|
||||
identity_id = cluster_to_identity[label]
|
||||
# Get a representative face_id for this trace
|
||||
cur.execute("""
|
||||
SELECT face_id FROM dev.face_detections
|
||||
WHERE file_uuid = %s AND trace_id = %s
|
||||
LIMIT 1
|
||||
""", (UUID, tid))
|
||||
row = cur.fetchone()
|
||||
if row:
|
||||
face_id = row[0]
|
||||
# Create binding
|
||||
cur.execute("""
|
||||
INSERT INTO dev.identity_bindings (identity_id, identity_type, identity_value, confidence, created_at)
|
||||
VALUES (%s, 'trace', %s, 0.8, NOW())
|
||||
ON CONFLICT DO NOTHING
|
||||
""", (identity_id, str(tid)))
|
||||
bindings += 1
|
||||
|
||||
# Also update face_detection with identity_id
|
||||
cur.execute("""
|
||||
UPDATE dev.face_detections SET identity_id = %s
|
||||
WHERE file_uuid = %s AND trace_id = %s
|
||||
""", (identity_id, UUID, tid))
|
||||
|
||||
conn.commit()
|
||||
print(f"Created {bindings} identity bindings for {n_clusters} identities")
|
||||
|
||||
# Summary
|
||||
print(f"\n=== Summary ===")
|
||||
cur.execute("SELECT COUNT(*) FROM dev.identities WHERE source = 'auto'")
|
||||
print(f"Total auto-generated identities: {cur.fetchone()[0]}")
|
||||
cur.execute("SELECT COUNT(*) FROM dev.identity_bindings")
|
||||
print(f"Total identity bindings: {cur.fetchone()[0]}")
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
print("=== Done ===")
|
||||
48
scripts/insert_chunks.py
Normal file
48
scripts/insert_chunks.py
Normal file
@@ -0,0 +1,48 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""Insert sentence chunks from transcribe.py output into dev.chunk table."""
|
||||
import json, sys
|
||||
import psycopg2
|
||||
|
||||
DB = "dbname=momentry user=accusys"
|
||||
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
|
||||
ASR_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.asr.json"
|
||||
FPS = 23.976023976023978
|
||||
|
||||
with open(ASR_PATH) as f:
|
||||
asr = json.load(f)
|
||||
|
||||
segments = asr.get("segments", [])
|
||||
print(f"Inserting {len(segments)} sentence chunks for {UUID}...")
|
||||
|
||||
conn = psycopg2.connect(DB)
|
||||
cur = conn.cursor()
|
||||
|
||||
inserted = 0
|
||||
for seg in segments:
|
||||
chunk_id = seg["chunk_id"]
|
||||
start_time = seg["start_time"]
|
||||
end_time = seg["end_time"]
|
||||
start_frame = int(start_time * FPS)
|
||||
end_frame = int(end_time * FPS)
|
||||
text = seg.get("text", "")
|
||||
speaker_change = seg.get("speaker_change", False)
|
||||
|
||||
content = json.dumps({
|
||||
"source": "transcribe",
|
||||
"speaker_change": speaker_change,
|
||||
"pass1_index": seg.get("pass1_index", 0),
|
||||
})
|
||||
|
||||
cur.execute("""
|
||||
INSERT INTO dev.chunk (file_uuid, chunk_id, chunk_type, start_time, end_time,
|
||||
start_frame, end_frame, fps, text_content, content, created_at)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s::jsonb, NOW())
|
||||
ON CONFLICT (file_uuid, chunk_id) DO NOTHING
|
||||
""", (UUID, chunk_id, "sentence", start_time, end_time,
|
||||
start_frame, end_frame, FPS, text, content))
|
||||
inserted += 1
|
||||
|
||||
conn.commit()
|
||||
cur.close()
|
||||
conn.close()
|
||||
print(f"Done: {inserted} chunks inserted")
|
||||
344
scripts/release_manager.py
Normal file
344
scripts/release_manager.py
Normal file
@@ -0,0 +1,344 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Release Manager - Deploy / Undeploy video release packages.
|
||||
|
||||
Usage:
|
||||
python3 release_manager.py deploy <package.tar.gz>
|
||||
python3 release_manager.py undeploy <file_uuid>
|
||||
python3 release_manager.py list
|
||||
python3 release_manager.py package <file_uuid> # Create new release package
|
||||
"""
|
||||
|
||||
import json, os, sys, shutil, subprocess, tarfile, tempfile, argparse, time
|
||||
import psycopg2
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
|
||||
DB = "dbname=momentry user=accusys"
|
||||
QDRANT = "http://localhost:6333"
|
||||
DEMO_DIR = "/Users/accusys/momentry/var/sftpgo/data/demo"
|
||||
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
|
||||
RELEASE_DIR = "/Users/accusys/momentry_core_0.1/release/files"
|
||||
|
||||
# ---- Helpers ----
|
||||
|
||||
def psql_cmd(sql, db=DB):
|
||||
"""Run a SQL command via psql."""
|
||||
r = subprocess.run(
|
||||
[f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A", "-c", sql],
|
||||
capture_output=True, text=True, timeout=30)
|
||||
return r.stdout.strip()
|
||||
|
||||
def pg_execute(sql, params=None):
|
||||
"""Execute SQL via psycopg2."""
|
||||
conn = psycopg2.connect(DB)
|
||||
cur = conn.cursor()
|
||||
if params:
|
||||
cur.execute(sql, params)
|
||||
else:
|
||||
cur.execute(sql)
|
||||
conn.commit()
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
def pg_query(sql, params=None):
|
||||
"""Query via psycopg2."""
|
||||
conn = psycopg2.connect(DB)
|
||||
cur = conn.cursor()
|
||||
if params:
|
||||
cur.execute(sql, params)
|
||||
else:
|
||||
cur.execute(sql)
|
||||
rows = cur.fetchall()
|
||||
cur.close()
|
||||
conn.close()
|
||||
return rows
|
||||
|
||||
def qdrant_delete_points(uuid, collection):
|
||||
"""Delete points from Qdrant collection by payload filter."""
|
||||
try:
|
||||
req = Request(f"{QDRANT}/collections/{collection}/points/delete",
|
||||
data=json.dumps({
|
||||
"filter": {"must": [{"key": "file_uuid", "match": {"value": uuid}}]}
|
||||
}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="POST")
|
||||
urlopen(req)
|
||||
return True
|
||||
except:
|
||||
return False
|
||||
|
||||
# ---- Deploy ----
|
||||
|
||||
def cmd_deploy(tarball_path):
|
||||
"""Deploy a release package."""
|
||||
if not os.path.exists(tarball_path):
|
||||
print(f"ERROR: {tarball_path} not found")
|
||||
return 1
|
||||
|
||||
t0 = time.time()
|
||||
print(f"=== Deploy: {os.path.basename(tarball_path)} ===")
|
||||
|
||||
# 1. Extract
|
||||
tmpdir = tempfile.mkdtemp(prefix="release_deploy_")
|
||||
print(f"Extracting to {tmpdir}...")
|
||||
with tarfile.open(tarball_path) as tar:
|
||||
tar.extractall(tmpdir)
|
||||
|
||||
# Find UUID from directory name or file_info.json
|
||||
uuid = None
|
||||
for item in os.listdir(tmpdir):
|
||||
info_path = os.path.join(tmpdir, item, "file_info.json")
|
||||
if os.path.exists(info_path):
|
||||
with open(info_path) as f:
|
||||
info = json.load(f)
|
||||
uuid = info.get("file_uuid", "")
|
||||
break
|
||||
|
||||
if not uuid:
|
||||
print("ERROR: Could not find file_info.json with UUID")
|
||||
return 1
|
||||
|
||||
pkg_dir = os.path.join(tmpdir, uuid)
|
||||
print(f"UUID: {uuid}")
|
||||
|
||||
# 2. Import data.sql
|
||||
sql_path = os.path.join(pkg_dir, "data.sql")
|
||||
if os.path.exists(sql_path):
|
||||
print(f"Importing data.sql ({os.path.getsize(sql_path)/1024/1024:.0f} MB)...")
|
||||
r = subprocess.run([f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-f", sql_path],
|
||||
capture_output=True, text=True, timeout=300)
|
||||
if r.returncode != 0:
|
||||
print(f"WARNING: SQL import may have issues")
|
||||
print(r.stderr[-500:] if r.stderr else "")
|
||||
else:
|
||||
print("WARNING: data.sql not found in package")
|
||||
|
||||
# 3. Copy video to demo dir
|
||||
for fname in os.listdir(pkg_dir):
|
||||
fpath = os.path.join(pkg_dir, fname)
|
||||
if fname.endswith(('.mp4', '.mov', '.avi', '.mkv')):
|
||||
dest = os.path.join(DEMO_DIR, fname)
|
||||
if not os.path.exists(dest):
|
||||
shutil.copy2(fpath, dest)
|
||||
print(f"Video: {fname} → {DEMO_DIR}/")
|
||||
else:
|
||||
print(f"Video: {fname} already exists in demo dir")
|
||||
|
||||
# 4. Copy JSON outputs
|
||||
for fname in os.listdir(pkg_dir):
|
||||
if fname.endswith('.json'):
|
||||
src = os.path.join(pkg_dir, fname)
|
||||
dest = os.path.join(OUTPUT_DIR, fname)
|
||||
shutil.copy2(src, dest)
|
||||
|
||||
print(f"Output files copied to {OUTPUT_DIR}/")
|
||||
|
||||
# 5. Verify deployment
|
||||
rows = pg_query("SELECT COUNT(*) FROM dev.chunk WHERE file_uuid = %s", (uuid,))
|
||||
chunks = rows[0][0] if rows else 0
|
||||
rows = pg_query("SELECT COUNT(*) FROM dev.face_detections WHERE file_uuid = %s", (uuid,))
|
||||
faces = rows[0][0] if rows else 0
|
||||
rows = pg_query("SELECT file_name, duration FROM dev.videos WHERE file_uuid = %s", (uuid,))
|
||||
video_info = rows[0] if rows else ("?", "?")
|
||||
|
||||
elapsed = time.time() - t0
|
||||
print(f"\n=== Deploy Complete ({elapsed:.0f}s) ===")
|
||||
print(f" Video: {video_info[0]} ({float(video_info[1]):.0f}s)")
|
||||
print(f" Chunks: {chunks}")
|
||||
print(f" Face detections: {faces}")
|
||||
|
||||
shutil.rmtree(tmpdir, ignore_errors=True)
|
||||
return 0
|
||||
|
||||
# ---- Undeploy ----
|
||||
|
||||
def cmd_undeploy(uuid):
|
||||
"""Undeploy: remove all trace of a UUID from the system."""
|
||||
print(f"=== Undeploy: {uuid} ===")
|
||||
|
||||
# Confirm
|
||||
rows = pg_query("SELECT file_name FROM dev.videos WHERE file_uuid = %s", (uuid,))
|
||||
if not rows:
|
||||
print(f"ERROR: UUID {uuid} not found in DB")
|
||||
return 1
|
||||
filename = rows[0][0]
|
||||
print(f"Video: {filename}")
|
||||
print("This will DELETE all data for this video. Are you sure? (y/N): ", end="")
|
||||
confirm = sys.stdin.readline().strip().lower()
|
||||
if confirm != 'y':
|
||||
print("Cancelled")
|
||||
return 0
|
||||
|
||||
t0 = time.time()
|
||||
|
||||
# Get video path before deleting
|
||||
rows = pg_query("SELECT file_path FROM dev.videos WHERE file_uuid = %s", (uuid,))
|
||||
video_path = rows[0][0] if rows else ""
|
||||
|
||||
# 1. Delete DB data
|
||||
tables = [
|
||||
("dev.chunk", "file_uuid"),
|
||||
("dev.chunk_vectors", "uuid"),
|
||||
("dev.face_detections", "file_uuid"),
|
||||
("dev.processor_results", "file_uuid"),
|
||||
("dev.monitor_jobs", "uuid"),
|
||||
("dev.pre_chunks", "file_uuid"),
|
||||
]
|
||||
for tbl, col in tables:
|
||||
pg_execute(f"DELETE FROM {tbl} WHERE {col} = %s", (uuid,))
|
||||
print(f" {tbl}: cleared")
|
||||
pg_execute("DELETE FROM dev.videos WHERE file_uuid = %s", (uuid,))
|
||||
print(f" dev.videos: removed")
|
||||
|
||||
# Clean orphaned identity bindings
|
||||
pg_execute("DELETE FROM dev.identity_bindings WHERE identity_value NOT IN (SELECT face_id FROM dev.face_detections)")
|
||||
|
||||
# 2. Delete output files
|
||||
for f in os.listdir(OUTPUT_DIR):
|
||||
if f.startswith(uuid):
|
||||
os.remove(os.path.join(OUTPUT_DIR, f))
|
||||
print(f" Output files: removed")
|
||||
|
||||
# 3. Delete video from demo dir
|
||||
if video_path and os.path.exists(video_path):
|
||||
os.remove(video_path)
|
||||
print(f" Video file: removed ({os.path.basename(video_path)})")
|
||||
|
||||
# 4. Clean Qdrant (skip - Qdrant points don't have easy UUID filter)
|
||||
# Instead rely on upsert behavior
|
||||
|
||||
# 5. Delete release package
|
||||
pkg_path = os.path.join(RELEASE_DIR, uuid)
|
||||
if os.path.exists(pkg_path):
|
||||
shutil.rmtree(pkg_path)
|
||||
print(f" Release dir: removed")
|
||||
for f in os.listdir(RELEASE_DIR):
|
||||
if f.startswith(uuid):
|
||||
os.remove(os.path.join(RELEASE_DIR, f))
|
||||
print(f" Release file: {f} removed")
|
||||
|
||||
elapsed = time.time() - t0
|
||||
print(f"\n=== Undeploy Complete ({elapsed:.0f}s) ===")
|
||||
return 0
|
||||
|
||||
# ---- List ----
|
||||
|
||||
def cmd_list():
|
||||
"""List deployed videos."""
|
||||
rows = pg_query("""
|
||||
SELECT file_uuid, file_name,
|
||||
TO_CHAR((duration/60)::int, 'FM999"min"') as dur,
|
||||
status,
|
||||
(SELECT COUNT(*) FROM dev.chunk WHERE file_uuid = v.file_uuid) as chunks,
|
||||
(SELECT COUNT(*) FROM dev.face_detections WHERE file_uuid = v.file_uuid) as faces
|
||||
FROM dev.videos v ORDER BY id DESC
|
||||
""")
|
||||
print(f"{'UUID':36s} {'Name':40s} {'Duration':8s} {'Status':10s} {'Chunks':>6s} {'Faces':>6s}")
|
||||
print("-" * 120)
|
||||
for r in rows:
|
||||
uuid, name, dur, status, chunks, faces = r
|
||||
short_name = (name or "")[:38] + ".." if len(name or "") > 40 else (name or "")
|
||||
print(f"{uuid:36s} {short_name:40s} {dur or '?':8s} {status or '?':10s} {chunks or 0:>6d} {faces or 0:>6d}")
|
||||
|
||||
# ---- Package ----
|
||||
|
||||
def cmd_package(uuid):
|
||||
"""Create a release package for a deployed video."""
|
||||
print(f"=== Package: {uuid} ===")
|
||||
|
||||
# Check video exists
|
||||
rows = pg_query("SELECT file_uuid, file_name, file_path FROM dev.videos WHERE file_uuid = %s", (uuid,))
|
||||
if not rows:
|
||||
print(f"ERROR: UUID {uuid} not found")
|
||||
return 1
|
||||
|
||||
outdir = os.path.join(RELEASE_DIR, uuid)
|
||||
shutil.rmtree(outdir, ignore_errors=True)
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
|
||||
# Export data.sql
|
||||
r = subprocess.run([f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A",
|
||||
"-c", f"SELECT json_build_object('file_uuid', file_uuid, 'file_name', file_name, 'duration', duration, 'fps', fps, 'width', width, 'height', height, 'total_frames', total_frames, 'status', status) FROM dev.videos WHERE file_uuid='{uuid}'"],
|
||||
capture_output=True, text=True, timeout=15)
|
||||
if r.stdout.strip():
|
||||
info = json.loads(r.stdout.strip())
|
||||
with open(os.path.join(outdir, "file_info.json"), "w") as f:
|
||||
json.dump(info, f, indent=2)
|
||||
|
||||
# Export SQL
|
||||
sql_path = os.path.join(outdir, "data.sql")
|
||||
with open(sql_path, "w") as f:
|
||||
f.write(f"-- Release package: {uuid}\nBEGIN;\n\n")
|
||||
for tbl, col in [("dev.videos", "file_uuid"), ("dev.chunk", "file_uuid"),
|
||||
("dev.chunk_vectors", "uuid"), ("dev.face_detections", "file_uuid")]:
|
||||
r = subprocess.run([f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-c",
|
||||
f"COPY (SELECT * FROM {tbl} WHERE {col} = '{uuid}') TO STDOUT WITH CSV HEADER"],
|
||||
capture_output=True, text=True, timeout=60)
|
||||
if r.stdout.strip():
|
||||
# Get column names
|
||||
schema, table = tbl.split(".")
|
||||
r2 = subprocess.run([f"{PG_BIN}/psql", "-U", "accusys", "-d", "momentry", "-t", "-A",
|
||||
"-c", f"SELECT string_agg(column_name, ', ' ORDER BY ordinal_position) FROM information_schema.columns WHERE table_schema='{schema}' AND table_name='{table}' AND is_updatable='YES'"],
|
||||
capture_output=True, text=True, timeout=15)
|
||||
cols = r2.stdout.strip()
|
||||
f.write(f"COPY {tbl} ({cols}) FROM STDIN WITH CSV HEADER;\n")
|
||||
f.write(r.stdout)
|
||||
if not r.stdout.endswith("\n"):
|
||||
f.write("\n")
|
||||
f.write("\\.\n\n")
|
||||
f.write("COMMIT;\n")
|
||||
|
||||
size = os.path.getsize(sql_path)
|
||||
print(f" data.sql ({size/1024/1024:.0f} MB)")
|
||||
|
||||
# Copy video
|
||||
video_path = rows[0][2]
|
||||
if video_path and os.path.exists(video_path):
|
||||
dest = os.path.join(outdir, os.path.basename(video_path))
|
||||
shutil.copy2(video_path, dest)
|
||||
print(f" {os.path.basename(video_path)} ({os.path.getsize(dest)/1024/1024:.0f} MB)")
|
||||
|
||||
# Copy output JSONs
|
||||
for fname in os.listdir(OUTPUT_DIR):
|
||||
if fname.startswith(uuid) and fname.endswith('.json'):
|
||||
shutil.copy2(os.path.join(OUTPUT_DIR, fname), os.path.join(outdir, fname))
|
||||
|
||||
# tar.gz
|
||||
tarball = os.path.join(RELEASE_DIR, f"{uuid}_v{int(time.time())}.tar.gz")
|
||||
subprocess.run(["tar", "-czf", tarball, "-C", RELEASE_DIR, uuid], check=True, timeout=300)
|
||||
tsize = os.path.getsize(tarball)
|
||||
print(f" Package: {tarball} ({tsize/1024/1024:.0f} MB)")
|
||||
return 0
|
||||
|
||||
# ---- Main ----
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Release Manager — deploy/undeploy/list video packages")
|
||||
sub = parser.add_subparsers(dest="cmd")
|
||||
|
||||
p_deploy = sub.add_parser("deploy", help="Deploy a release package")
|
||||
p_deploy.add_argument("tarball", help="Path to .tar.gz package")
|
||||
|
||||
p_undeploy = sub.add_parser("undeploy", help="Undeploy (remove all data for a UUID)")
|
||||
p_undeploy.add_argument("uuid", help="File UUID")
|
||||
|
||||
p_list = sub.add_parser("list", help="List deployed videos")
|
||||
|
||||
p_package = sub.add_parser("package", help="Create release package for deployed video")
|
||||
p_package.add_argument("uuid", help="File UUID")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.cmd == "deploy":
|
||||
sys.exit(cmd_deploy(args.tarball))
|
||||
elif args.cmd == "undeploy":
|
||||
sys.exit(cmd_undeploy(args.uuid))
|
||||
elif args.cmd == "list":
|
||||
cmd_list()
|
||||
elif args.cmd == "package":
|
||||
sys.exit(cmd_package(args.uuid))
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
222
scripts/render_face_heatmap.py
Normal file
222
scripts/render_face_heatmap.py
Normal file
@@ -0,0 +1,222 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""Face Trace Heatmap + Timeline Visualization for Momentry.
|
||||
Usage:
|
||||
python3 render_face_heatmap.py <uuid> [output.html] [--identity ID]
|
||||
"""
|
||||
import sys, psycopg2, argparse
|
||||
from collections import defaultdict
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("uuid")
|
||||
parser.add_argument("output", nargs="?", default=None)
|
||||
parser.add_argument("--identity", "-i", type=int, default=None, help="Filter by identity_id")
|
||||
args = parser.parse_args()
|
||||
|
||||
UUID = args.uuid
|
||||
OUT = args.output or f"/tmp/face_report_{UUID[:8]}.html"
|
||||
IDENTITY = args.identity
|
||||
|
||||
conn = psycopg2.connect("dbname=momentry user=accusys")
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute("SELECT duration, file_name, COALESCE(fps, 25.0) FROM dev.videos WHERE file_uuid=%s", (UUID,))
|
||||
row = cur.fetchone()
|
||||
if not row:
|
||||
print("UUID not found")
|
||||
sys.exit(1)
|
||||
duration, video_name, fps = float(row[0] or 6785), row[1] or UUID, float(row[2] or 25.0)
|
||||
|
||||
# Get sample interval from face.json metadata (or default 3 = 8Hz)
|
||||
sample_interval = 3
|
||||
hz = fps / sample_interval
|
||||
|
||||
# Build identity filter
|
||||
identity_filter = ""
|
||||
identity_params = [UUID]
|
||||
identity_label = ""
|
||||
identity_info = None # full identity record when filtered
|
||||
top_identities = [] # top identities summary (all view)
|
||||
|
||||
if IDENTITY is not None:
|
||||
identity_filter = " AND identity_id = %s"
|
||||
identity_params.append(IDENTITY)
|
||||
cur.execute("SELECT id, name, identity_type, source, status FROM dev.identities WHERE id=%s", (IDENTITY,))
|
||||
id_row = cur.fetchone()
|
||||
if id_row:
|
||||
identity_info = {"id": id_row[0], "name": id_row[1], "type": id_row[2], "source": id_row[3], "status": id_row[4]}
|
||||
identity_label = f" (identity: {id_row[1]})"
|
||||
else:
|
||||
identity_label = f" (identity #{IDENTITY})"
|
||||
identity_params = [UUID, IDENTITY]
|
||||
|
||||
# Query trace spans
|
||||
cur.execute(f"""
|
||||
SELECT trace_id, MIN(frame_number), MAX(frame_number),
|
||||
COALESCE(MIN(timestamp_secs), MIN(frame_number) / {fps}) as first_t,
|
||||
COALESCE(MAX(timestamp_secs), MAX(frame_number) / {fps}) as last_t,
|
||||
COUNT(*)
|
||||
FROM dev.face_detections
|
||||
WHERE file_uuid=%s AND trace_id IS NOT NULL{identity_filter}
|
||||
GROUP BY trace_id ORDER BY first_t
|
||||
""", identity_params)
|
||||
trace_spans = cur.fetchall()
|
||||
|
||||
# Query density per time bucket (5s)
|
||||
cur.execute(f"""
|
||||
SELECT FLOOR(COALESCE(timestamp_secs, frame_number / {fps}) / 5)::int as bkt, COUNT(*) as cnt
|
||||
FROM dev.face_detections
|
||||
WHERE file_uuid=%s AND trace_id IS NOT NULL{identity_filter}
|
||||
GROUP BY bkt ORDER BY bkt
|
||||
""", identity_params)
|
||||
density = {b: c for b, c in cur.fetchall()}
|
||||
|
||||
# Count total detections
|
||||
cur.execute(f"SELECT COUNT(*) FROM dev.face_detections WHERE file_uuid=%s{identity_filter}", identity_params)
|
||||
total_detections = cur.fetchone()[0]
|
||||
|
||||
# Get top identities (for all view) and trace↔identity mapping
|
||||
if IDENTITY is None:
|
||||
cur.execute("""
|
||||
SELECT fd.identity_id, i.name, COUNT(*) as faces, COUNT(DISTINCT fd.trace_id) as traces
|
||||
FROM dev.face_detections fd
|
||||
LEFT JOIN dev.identities i ON i.id = fd.identity_id
|
||||
WHERE fd.file_uuid=%s AND fd.identity_id IS NOT NULL
|
||||
GROUP BY fd.identity_id, i.name ORDER BY faces DESC LIMIT 10
|
||||
""", (UUID,))
|
||||
top_identities = cur.fetchall()
|
||||
else:
|
||||
# Get trace→identity mapping for tooltip enrichment
|
||||
cur.execute("""
|
||||
SELECT DISTINCT fd.trace_id, i.name
|
||||
FROM dev.face_detections fd
|
||||
LEFT JOIN dev.identities i ON i.id = fd.identity_id
|
||||
WHERE fd.file_uuid=%s AND fd.identity_id IS NOT NULL
|
||||
""", (UUID,))
|
||||
trace_to_identity = {r[0]: r[1] for r in cur.fetchall()}
|
||||
|
||||
cur.close(); conn.close()
|
||||
|
||||
BUCKET = 5
|
||||
num_buckets = int(duration / BUCKET) + 1
|
||||
max_density = max(density.values()) if density else 1
|
||||
|
||||
def build_html():
|
||||
h = []
|
||||
h.append('<!DOCTYPE html><html><head><meta charset="utf-8"><title>Face Trace Report</title>')
|
||||
h.append('<style>')
|
||||
h.append('body{font-family:-apple-system,BlinkMacSystemFont,sans-serif;margin:20px;background:#0d1117;color:#c9d1d9}')
|
||||
h.append('h1,h2{color:#e94560}')
|
||||
h.append('.stats{display:flex;gap:12px;margin:8px 0;flex-wrap:wrap}')
|
||||
h.append('.stat{background:#161b22;padding:6px 14px;border-radius:6px}')
|
||||
h.append('.stat .num{font-size:20px;font-weight:bold;color:#e94560}')
|
||||
h.append('.stat .label{font-size:10px;color:#8b949e}')
|
||||
h.append('.viz{position:relative;background:#0d1117;border:1px solid #30363d;margin:8px 0;overflow:hidden}')
|
||||
h.append('.bar{display:block;position:absolute;height:3px;background:#e94560;opacity:0.7;border-radius:1px}')
|
||||
h.append('.bar:hover{height:8px;opacity:1}')
|
||||
h.append('</style></head><body>')
|
||||
sub = identity_label if identity_label else ""
|
||||
h.append('<h1>Face Trace Report — ' + video_name[:60] + sub + '</h1>')
|
||||
|
||||
# Identity card (when filtering by identity)
|
||||
if identity_info:
|
||||
h.append('<div style="background:#161b22;border:1px solid #30363d;border-radius:8px;padding:16px;margin:12px 0;">')
|
||||
h.append('<h3 style="margin:0;color:#e94560">Identity Details</h3>')
|
||||
h.append(f'<table style="width:100%;margin-top:8px;color:#c9d1d9;border-collapse:collapse">')
|
||||
h.append(f'<tr><td style="padding:4px 12px 4px 0;color:#8b949e;width:80px">ID</td><td>{identity_info["id"]}</td></tr>')
|
||||
h.append(f'<tr><td style="padding:4px 12px 4px 0;color:#8b949e">Name</td><td style="font-weight:bold">{identity_info["name"]}</td></tr>')
|
||||
h.append(f'<tr><td style="padding:4px 12px 4px 0;color:#8b949e">Type</td><td>{identity_info["type"]}</td></tr>')
|
||||
h.append(f'<tr><td style="padding:4px 12px 4px 0;color:#8b949e">Source</td><td>{identity_info["source"]}</td></tr>')
|
||||
h.append(f'<tr><td style="padding:4px 12px 4px 0;color:#8b949e">Status</td><td>{identity_info["status"]}</td></tr>')
|
||||
h.append('</table></div>')
|
||||
|
||||
# Top identities table (all view)
|
||||
if top_identities:
|
||||
h.append('<h2>Top Identities</h2>')
|
||||
h.append('<div style="overflow-x:auto">')
|
||||
h.append('<table style="width:100%;color:#c9d1d9;border-collapse:collapse;font-size:13px">')
|
||||
h.append('<tr style="background:#161b22;text-align:left"><th style="padding:6px 10px">Identity</th><th style="padding:6px 10px">Name</th><th style="padding:6px 10px;text-align:right">Faces</th><th style="padding:6px 10px;text-align:right">Traces</th></tr>')
|
||||
for iid, name, fc, tc in top_identities:
|
||||
short_name = (name or f"#{iid}")[:60]
|
||||
h.append(f'<tr style="border-bottom:1px solid #21262d"><td style="padding:4px 10px;color:#8b949e">{iid}</td><td style="padding:4px 10px">{short_name}</td><td style="padding:4px 10px;text-align:right">{fc:,}</td><td style="padding:4px 10px;text-align:right">{tc}</td></tr>')
|
||||
h.append('</table></div>')
|
||||
|
||||
# Stats row
|
||||
h.append('<div class="stats">')
|
||||
h.append(f'<div class="stat"><div class="num">{len(trace_spans):,}</div><div class="label">traces</div></div>')
|
||||
h.append(f'<div class="stat"><div class="num">{total_detections:,}</div><div class="label">detections</div></div>')
|
||||
h.append(f'<div class="stat"><div class="num">{duration:.0f}s</div><div class="label">duration</div></div>')
|
||||
h.append(f'<div class="stat"><div class="num">{max_density}</div><div class="label">max per {BUCKET}s</div></div>')
|
||||
h.append(f'<div class="stat"><div class="num">{fps:.0f}fps</div><div class="label">video fps</div></div>')
|
||||
h.append(f'<div class="stat"><div class="num">{hz:.0f}Hz</div><div class="label">sample rate (every {sample_interval}frames)</div></div>')
|
||||
h.append(f'<div class="stat"><div class="num">{num_buckets}</div><div class="label">{BUCKET}s buckets</div></div>')
|
||||
h.append('</div>')
|
||||
|
||||
# 1. Density histogram
|
||||
h.append('<h2>Face Density Over Time</h2>')
|
||||
h.append('<div style="color:#666;font-size:12px;margin-bottom:4px">Number of face detections per 5-second interval</div>')
|
||||
w_px = num_buckets * 2 + 20
|
||||
h.append(f'<div class="viz" style="width:{w_px}px;height:80px">')
|
||||
for b in range(num_buckets):
|
||||
v = density.get(b, 0)
|
||||
h_px = max(2, int(60 * v / max(1, max_density * 0.6))) if v > 0 else 0
|
||||
if v == 0:
|
||||
color = "#0d1117"
|
||||
else:
|
||||
i = min(v / (max(1, max_density * 0.5)), 1.0)
|
||||
r = int(233 * i + 13 * (1 - i))
|
||||
g = int(69 * i + 13 * (1 - i))
|
||||
bv = int(96 * i + 23 * (1 - i))
|
||||
color = f"rgb({r},{g},{bv})"
|
||||
title = f"{b * BUCKET:.0f}s: {v} faces"
|
||||
h.append(f'<span style="position:absolute;left:{b*2+10}px;bottom:0;width:2px;height:{h_px}px;background:{color}" title="{title}"></span>')
|
||||
h.append('</div>')
|
||||
h.append(f'<div style="font-size:10px;color:#444;width:{w_px}px;display:flex;justify-content:space-between"><span>0s</span><span>{duration:.0f}s</span></div>')
|
||||
|
||||
# 2. Trace timeline (Gantt)
|
||||
h.append('<h2>Trace Timeline</h2>')
|
||||
h.append('<div style="color:#666;font-size:12px;margin-bottom:4px">First → last appearance for each trace. Hover for details.</div>')
|
||||
show_traces = min(len(trace_spans), 2000)
|
||||
bar_h = 2
|
||||
chart_height = show_traces * (bar_h + 1) + 10
|
||||
h.append(f'<div class="viz" style="width:{w_px}px;height:{chart_height}px">')
|
||||
for i, (tid, fn0, fn1, t0, t1, cnt) in enumerate(trace_spans[:show_traces]):
|
||||
left = int(t0 / duration * (w_px - 20)) + 10
|
||||
width = max(3, int((t1 - t0) / duration * (w_px - 20)))
|
||||
top = i * (bar_h + 1) + 5
|
||||
opacity = 1.0 if cnt > 5 else 0.3
|
||||
identity_note = ""
|
||||
if IDENTITY is not None and tid in trace_to_identity:
|
||||
identity_note = f", identity: {trace_to_identity[tid]}"
|
||||
title = f"T{tid}: {t0:.0f}s–{t1:.0f}s, {cnt} faces, f{fn0}–f{fn1}{identity_note}"
|
||||
h.append(f'<span class="bar" style="left:{left}px;top:{top}px;width:{width}px;height:{bar_h}px;opacity:{opacity}" title="{title}"></span>')
|
||||
h.append('</div>')
|
||||
h.append(f'<div style="font-size:10px;color:#444;width:{w_px}px;display:flex;justify-content:space-between"><span>0s (showing {show_traces}/{len(trace_spans)} traces)</span><span>{duration:.0f}s</span></div>')
|
||||
|
||||
# 3. Per-trace heatmap
|
||||
h.append('<h2>Per-Trace Heatmap (top 500, every 10th trace)</h2>')
|
||||
h.append(f'<div style="overflow-x:scroll;max-width:100%">')
|
||||
step = max(1, num_buckets // 120)
|
||||
for i, (tid, fn0, fn1, t0, t1, cnt) in enumerate(trace_spans[:500]):
|
||||
if i % 10 != 0:
|
||||
continue
|
||||
start_bkt = int(t0 / BUCKET)
|
||||
end_bkt = int(t1 / BUCKET) + 1
|
||||
row = f'<div style="white-space:nowrap;line-height:6px"><span style="display:inline-block;width:45px;font-size:6px;color:#555">T{tid}</span>'
|
||||
for b in range(0, num_buckets, step):
|
||||
active = start_bkt <= b <= end_bkt
|
||||
color = "#e94560" if active else "#161b22"
|
||||
row += f'<span style="display:inline-block;width:2px;height:4px;background:{color};margin:0"></span>'
|
||||
row += '</div>'
|
||||
h.append(row)
|
||||
h.append('</div>')
|
||||
|
||||
h.append('</body></html>')
|
||||
return '\n'.join(h)
|
||||
|
||||
html = build_html()
|
||||
with open(OUT, 'w') as f:
|
||||
f.write(html)
|
||||
|
||||
print(f"Saved: {OUT}")
|
||||
print(f"Traces: {len(trace_spans)}, Detections: {total_detections}, Density max: {max_density}, Duration: {duration:.0f}s, Sample: {hz:.0f}Hz")
|
||||
print(f"Size: {len(html) / 1024:.0f}KB")
|
||||
164
scripts/speaker_assign.py
Normal file
164
scripts/speaker_assign.py
Normal file
@@ -0,0 +1,164 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Assignment: cluster voice vectors from Qdrant, assign speaker IDs to DB chunks.
|
||||
"""
|
||||
import json, sys, time
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
from urllib.request import Request, urlopen
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
|
||||
QDRANT = "http://localhost:6333"
|
||||
DB = "dbname=momentry user=accusys"
|
||||
COLLECTION = "momentry_dev_voice"
|
||||
|
||||
print(f"=== Speaker Assignment for {UUID} ===")
|
||||
|
||||
# Step 1: Read voice vectors from Qdrant
|
||||
print("Reading voice vectors from Qdrant...")
|
||||
vectors = []
|
||||
chunk_ids = []
|
||||
# We need to scroll through all points
|
||||
offset = None
|
||||
while True:
|
||||
data = {"limit": 100, "with_payload": True, "with_vector": True}
|
||||
if offset is not None:
|
||||
data["offset"] = offset
|
||||
req = Request(f"{QDRANT}/collections/{COLLECTION}/points/scroll",
|
||||
data=json.dumps(data).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="POST")
|
||||
resp = json.loads(urlopen(req).read())
|
||||
result = resp["result"]
|
||||
points = result.get("points", [])
|
||||
if not points:
|
||||
break
|
||||
for pt in points:
|
||||
payload = pt.get("payload", {})
|
||||
cid = payload.get("chunk_id", "")
|
||||
# Only get vectors for THIS UUID's chunks
|
||||
# Filter by checking DB later, or rely on Qdrant payload
|
||||
vectors.append(pt["vector"])
|
||||
chunk_ids.append(cid)
|
||||
offset = result.get("next_page_offset")
|
||||
if offset is None:
|
||||
break
|
||||
print(f" Read {len(vectors)} vectors...")
|
||||
|
||||
print(f"Total vectors: {len(vectors)}")
|
||||
|
||||
# Step 2: Filter to only our UUID's chunks (from DB)
|
||||
conn = psycopg2.connect(DB)
|
||||
cur = conn.cursor()
|
||||
cur.execute("SELECT chunk_id FROM dev.chunk WHERE file_uuid = %s AND chunk_type = 'sentence' ORDER BY id", (UUID,))
|
||||
db_chunk_ids = set(row[0] for row in cur.fetchall())
|
||||
print(f"DB chunk_ids: {len(db_chunk_ids)}")
|
||||
|
||||
# Filter vectors to match DB chunks
|
||||
filtered_vectors = []
|
||||
filtered_chunk_ids = []
|
||||
for v, cid in zip(vectors, chunk_ids):
|
||||
if cid in db_chunk_ids:
|
||||
filtered_vectors.append(v)
|
||||
filtered_chunk_ids.append(cid)
|
||||
|
||||
vectors = filtered_vectors
|
||||
chunk_ids = filtered_chunk_ids
|
||||
print(f"Matched vectors: {len(vectors)}")
|
||||
|
||||
# Sort by chunk_id (which is numeric string)
|
||||
indices = sorted(range(len(chunk_ids)), key=lambda i: int(chunk_ids[i]) if chunk_ids[i].isdigit() else 0)
|
||||
vectors = [vectors[i] for i in indices]
|
||||
chunk_ids = [chunk_ids[i] for i in indices]
|
||||
|
||||
# Step 3: Read speaker_change from asr.json
|
||||
asr_path = f"/Users/accusys/momentry/output_dev/{UUID}.asr.json"
|
||||
with open(asr_path) as f:
|
||||
asr_data = json.load(f)
|
||||
segments = asr_data.get("segments", [])
|
||||
speaker_changes = {}
|
||||
for seg in segments:
|
||||
speaker_changes[seg["chunk_id"]] = seg.get("speaker_change", False)
|
||||
|
||||
# Step 4: Cluster embeddings
|
||||
print("Clustering...")
|
||||
X = np.array(vectors)
|
||||
|
||||
# Compute cosine distance matrix
|
||||
# Cosine distance = 1 - cosine_similarity
|
||||
cos_sim = cosine_similarity(X)
|
||||
cos_dist = 1 - cos_sim
|
||||
|
||||
# Use AgglomerativeClustering with cosine distance
|
||||
# Determine optimal n_clusters by looking at speaker_change boundaries
|
||||
# First pass: use speaker_change as hard boundaries to get initial clusters
|
||||
# Then refine
|
||||
|
||||
# Simpler: use a distance threshold
|
||||
n = len(vectors)
|
||||
labels = np.full(n, -1, dtype=int)
|
||||
current_speaker = 0
|
||||
|
||||
# Start with first chunk as speaker 0
|
||||
labels[0] = current_speaker
|
||||
centroids = [np.array(vectors[0])] # per-cluster centroid
|
||||
|
||||
for i in range(1, n):
|
||||
has_change = speaker_changes.get(chunk_ids[i], False)
|
||||
vec = np.array(vectors[i])
|
||||
|
||||
if has_change:
|
||||
# Speaker change: check if this is a NEW speaker or returning to a previous one
|
||||
# Compare with centroid of current speaker vs others
|
||||
similarities = [float(np.dot(vec, c) / (np.linalg.norm(vec) * np.linalg.norm(c) + 1e-10)) for c in centroids]
|
||||
best_sim = max(similarities) if similarities else 0
|
||||
best_cluster = similarities.index(best_sim) if similarities else 0
|
||||
|
||||
if best_sim > 0.65 and best_cluster != current_speaker:
|
||||
# Returning to a previous speaker
|
||||
labels[i] = best_cluster
|
||||
elif best_sim < 0.55:
|
||||
# New speaker
|
||||
current_speaker = len(centroids)
|
||||
labels[i] = current_speaker
|
||||
centroids.append(vec)
|
||||
else:
|
||||
# Stay with current speaker (false change detection)
|
||||
labels[i] = current_speaker
|
||||
centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
|
||||
else:
|
||||
# No speaker change: same speaker as previous
|
||||
labels[i] = current_speaker
|
||||
centroids[current_speaker] = (centroids[current_speaker] + vec) / 2
|
||||
|
||||
n_speakers = len(set(labels))
|
||||
print(f"Identified {n_speakers} unique speakers")
|
||||
|
||||
# Step 5: Update DB chunks with speaker assignment
|
||||
print("Updating DB chunks...")
|
||||
# Map: chunk_id -> speaker_id
|
||||
speaker_map = {}
|
||||
for cid, label in zip(chunk_ids, labels):
|
||||
speaker_map[cid] = f"SPEAKER_{label}"
|
||||
|
||||
updated = 0
|
||||
for cid, spk_id in speaker_map.items():
|
||||
cur.execute("""
|
||||
UPDATE dev.chunk SET metadata = COALESCE(metadata, '{}'::jsonb) || %s::jsonb
|
||||
WHERE file_uuid = %s AND chunk_id = %s AND chunk_type = 'sentence'
|
||||
""", (json.dumps({"speaker_id": spk_id}), UUID, cid))
|
||||
updated += 1
|
||||
|
||||
conn.commit()
|
||||
print(f"Updated {updated} chunks with speaker IDs")
|
||||
|
||||
# Step 6: Save speaker map
|
||||
speaker_map_path = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map.json"
|
||||
with open(speaker_map_path, "w") as f:
|
||||
json.dump({"speakers": n_speakers, "assignments": speaker_map}, f, indent=2)
|
||||
print(f"Speaker map saved: {speaker_map_path}")
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
print("=== Done ===")
|
||||
284
scripts/transcribe.py
Normal file
284
scripts/transcribe.py
Normal file
@@ -0,0 +1,284 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
One-pass ASR + Speaker Change Detection + Split → asr.json
|
||||
"""
|
||||
import json, os, sys, time, argparse, subprocess, tempfile, shutil
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "asrx_self"))
|
||||
from speaker_encoder import load_speaker_encoder, extract_speaker_embedding, normalize_embeddings
|
||||
import torchaudio
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
SUB_WIN = 0.5
|
||||
SUB_STRIDE = 0.25
|
||||
MIN_DUR = 0.3
|
||||
SIM_THRESHOLD = 0.45
|
||||
CHANGE_CONFIRM = 2
|
||||
|
||||
def extract_audio(video_path, tmp_dir, sr=16000):
|
||||
wav_path = os.path.join(tmp_dir, "audio.wav")
|
||||
subprocess.run(["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
|
||||
"-ar", str(sr), "-ac", "1", "-sample_fmt", "s16", wav_path],
|
||||
check=True, capture_output=True, timeout=300)
|
||||
wav_data, sr_actual = torchaudio.load(wav_path)
|
||||
if wav_data.shape[0] > 1:
|
||||
wav_data = wav_data.mean(dim=0, keepdim=True)
|
||||
return wav_data, sr_actual
|
||||
|
||||
def transcribe_pass1(model, wav_path, vad_params=None):
|
||||
print(" [faster-whisper] Transcribing...")
|
||||
if vad_params is None:
|
||||
vad_params = {"min_silence_duration_ms": 500, "speech_pad_ms": 200}
|
||||
segments, info = model.transcribe(wav_path, beam_size=5,
|
||||
vad_filter=True, word_timestamps=True, vad_parameters=vad_params)
|
||||
pass1 = []
|
||||
for i, seg in enumerate(segments):
|
||||
words = []
|
||||
if seg.words:
|
||||
for w in seg.words:
|
||||
words.append({"word": w.word.strip(), "start": round(w.start,3), "end": round(w.end,3)})
|
||||
pass1.append({
|
||||
"index": i,
|
||||
"start": round(seg.start, 3),
|
||||
"end": round(seg.end, 3),
|
||||
"text": seg.text.strip(),
|
||||
"words": words,
|
||||
})
|
||||
print(f" Pass1 segments: {len(pass1)}")
|
||||
return pass1
|
||||
|
||||
def detect_speaker_changes(wav_data, sr, pass1_segs, encoder, progress_step=100):
|
||||
print(" [Speaker Detection] Scanning...")
|
||||
ws = int(SUB_WIN * sr)
|
||||
sw = int(SUB_STRIDE * sr)
|
||||
change_points = [] # List[List[float]] → change times per pass1 segment
|
||||
t0 = time.time()
|
||||
|
||||
for si, seg in enumerate(pass1_segs):
|
||||
st = int(seg["start"] * sr)
|
||||
et = int(seg["end"] * sr)
|
||||
dur = seg["end"] - seg["start"]
|
||||
|
||||
if dur < 1.0:
|
||||
change_points.append([])
|
||||
continue
|
||||
|
||||
sub_embs = []
|
||||
sub_times = []
|
||||
for wpos in range(st, et - ws + 1, sw):
|
||||
chunk = wav_data[:, wpos:wpos+ws]
|
||||
emb = extract_speaker_embedding(encoder, chunk.numpy(), sr)
|
||||
emb = emb / (np.linalg.norm(emb) + 1e-10)
|
||||
sub_embs.append(emb)
|
||||
sub_times.append(wpos / sr)
|
||||
|
||||
if len(sub_embs) < 3:
|
||||
change_points.append([])
|
||||
continue
|
||||
|
||||
sub_embs = normalize_embeddings(np.array(sub_embs))
|
||||
cps = []
|
||||
# Require CHANGE_CONFIRM consecutive low-similarity windows before registering a change
|
||||
low_run = 0
|
||||
for i in range(1, len(sub_embs)):
|
||||
sim = float(np.dot(sub_embs[i-1], sub_embs[i]))
|
||||
if sim < SIM_THRESHOLD:
|
||||
low_run += 1
|
||||
if low_run >= CHANGE_CONFIRM:
|
||||
# Change point at the START of the low-sim run
|
||||
cps.append(round(sub_times[i - low_run + 1], 2))
|
||||
low_run = 0
|
||||
else:
|
||||
low_run = 0
|
||||
change_points.append(cps)
|
||||
|
||||
if (si + 1) % progress_step == 0:
|
||||
pct = (si + 1) * 100 // len(pass1_segs)
|
||||
print(f" {si+1}/{len(pass1_segs)} ({pct}%) [{time.time()-t0:.0f}s]")
|
||||
|
||||
total_changes = sum(len(cps) for cps in change_points)
|
||||
print(f" Speaker changes detected: {total_changes} in {len(pass1_segs)} segments ({time.time()-t0:.0f}s)")
|
||||
return change_points
|
||||
|
||||
def build_segments(pass1_segs, change_points, wav_data, sr, asr_model, tmp_dir, fps=24.0):
|
||||
print(" [Split] Building final segments...")
|
||||
final = []
|
||||
chunk_idx = 0
|
||||
|
||||
for si, seg in enumerate(pass1_segs):
|
||||
cps = change_points[si]
|
||||
if not cps:
|
||||
final.append({
|
||||
"chunk_id": str(chunk_idx),
|
||||
"pass1_index": si,
|
||||
"start_time": seg["start"],
|
||||
"end_time": seg["end"],
|
||||
"start_frame": int(seg["start"] * fps),
|
||||
"end_frame": int(seg["end"] * fps),
|
||||
"text": seg["text"],
|
||||
})
|
||||
chunk_idx += 1
|
||||
continue
|
||||
|
||||
seg["split"] = True
|
||||
boundaries = [seg["start"]] + cps + [seg["end"]]
|
||||
for pi in range(len(boundaries) - 1):
|
||||
ps, pe = boundaries[pi], boundaries[pi+1]
|
||||
if pe - ps < MIN_DUR:
|
||||
continue
|
||||
|
||||
# Try word_timestamp mapping first (wider tolerance)
|
||||
sub_words = [w["word"] for w in seg["words"] if w["start"] >= ps - 0.3 and w["end"] <= pe + 0.3]
|
||||
text = " ".join(sub_words).strip() if sub_words else ""
|
||||
|
||||
# Fallback: call faster-whisper on the sub-audio chunk
|
||||
if not text:
|
||||
import soundfile as sf
|
||||
chunk_path = os.path.join(tmp_dir, f"sub_{chunk_idx}.wav")
|
||||
a_chunk = wav_data[:, int(ps*sr):int(pe*sr)].numpy()[0]
|
||||
if len(a_chunk) > sr * 0.3: # skip if < 0.3s
|
||||
sf.write(chunk_path, a_chunk, sr)
|
||||
try:
|
||||
sub_segs, _ = asr_model.transcribe(chunk_path, beam_size=5,
|
||||
vad_filter=True, vad_parameters={"min_silence_duration_ms": 100})
|
||||
text = " ".join(s.text.strip() for s in sub_segs)
|
||||
except:
|
||||
pass
|
||||
os.remove(chunk_path)
|
||||
if not text:
|
||||
text = " ".join([w["word"] for w in seg["words"]
|
||||
if w["start"] >= ps - 0.5 and w["end"] <= pe + 0.5]).strip()
|
||||
if not text:
|
||||
text = seg["text"][:60]
|
||||
|
||||
final.append({
|
||||
"chunk_id": str(chunk_idx),
|
||||
"pass1_index": si,
|
||||
"start_time": round(ps, 3),
|
||||
"end_time": round(pe, 3),
|
||||
"start_frame": int(ps * fps),
|
||||
"end_frame": int(pe * fps),
|
||||
"text": text,
|
||||
"speaker_change": True,
|
||||
})
|
||||
chunk_idx += 1
|
||||
|
||||
print(f" Final segments: {len(final)}")
|
||||
return final
|
||||
|
||||
def voice_vectors_to_qdrant(wav_data, sr, final_segs, encoder, qdrant_url="http://localhost:6333"):
|
||||
print(" [Voice Vectors] Extracting 192D embeddings...")
|
||||
embeddings = []
|
||||
t0 = time.time()
|
||||
for si, seg in enumerate(final_segs):
|
||||
st = int(seg["start_time"] * sr)
|
||||
et = int(seg["end_time"] * sr)
|
||||
a_chunk = wav_data[:, st:et]
|
||||
emb = extract_speaker_embedding(encoder, a_chunk.numpy(), sr)
|
||||
emb = emb / (np.linalg.norm(emb) + 1e-10)
|
||||
embeddings.append({"chunk_id": seg["chunk_id"], "embedding": emb.tolist()})
|
||||
if (si + 1) % 500 == 0:
|
||||
print(f" {si+1}/{len(final_segs)} [{time.time()-t0:.0f}s]")
|
||||
|
||||
print(f" Writing to Qdrant...")
|
||||
from urllib.request import Request, urlopen
|
||||
batch = []
|
||||
for i, e in enumerate(embeddings):
|
||||
batch.append({"id": i + 1, "vector": e["embedding"],
|
||||
"payload": {"chunk_id": e["chunk_id"], "chunk_type": "sentence"}})
|
||||
if len(batch) >= 100:
|
||||
req = Request(f"{qdrant_url}/collections/momentry_dev_voice/points?wait=true",
|
||||
data=json.dumps({"points": batch}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
try: urlopen(req)
|
||||
except: pass
|
||||
batch = []
|
||||
# Flush remaining
|
||||
if batch:
|
||||
req = Request(f"{qdrant_url}/collections/momentry_dev_voice/points?wait=true",
|
||||
data=json.dumps({"points": batch}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
try: urlopen(req)
|
||||
except: pass
|
||||
|
||||
print(f" Voice vectors: {len(embeddings)} pts → Qdrant [{time.time()-t0:.0f}s]")
|
||||
return embeddings
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--video", default="/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn | Comedy Mystery Romance Thriller | Full Movie.mp4")
|
||||
parser.add_argument("--output", help="Output path for asr.json", default="/Users/accusys/momentry/output_dev/aeed71342a899fe4b4c57b7d41bcb692.asr.json")
|
||||
parser.add_argument("--sample", type=int, help="Process only first N pass1 segments (for testing)")
|
||||
parser.add_argument("--no-qdrant", action="store_true", help="Skip Qdrant upload")
|
||||
args = parser.parse_args()
|
||||
|
||||
t0 = time.time()
|
||||
|
||||
# Load models
|
||||
print("=== Loading Models ===")
|
||||
asr_model = WhisperModel("small", device="cpu", compute_type="int8")
|
||||
print(" faster-whisper small loaded")
|
||||
encoder = load_speaker_encoder()
|
||||
print(" ECAPA-TDNN loaded")
|
||||
print()
|
||||
|
||||
# Extract audio
|
||||
print("=== Audio Extraction ===")
|
||||
tmp_dir = tempfile.mkdtemp(prefix="transcribe_")
|
||||
wav_data, sr = extract_audio(args.video, tmp_dir)
|
||||
print(f" Audio: {wav_data.shape[1]/sr:.0f}s, {sr}Hz")
|
||||
wav_path = os.path.join(tmp_dir, "audio.wav")
|
||||
print()
|
||||
|
||||
# Step 1: faster-whisper pass1
|
||||
print("=== Step 1: Pass1 Transcription ===")
|
||||
pass1_segs = transcribe_pass1(asr_model, wav_path)
|
||||
if args.sample:
|
||||
pass1_segs = pass1_segs[:args.sample]
|
||||
print(f" SAMPLE MODE: limiting to {args.sample} segments")
|
||||
print()
|
||||
|
||||
# Step 2: Speaker change detection
|
||||
print("=== Step 2: Speaker Change Detection ===")
|
||||
change_points = detect_speaker_changes(wav_data, sr, pass1_segs, encoder)
|
||||
print()
|
||||
|
||||
# Step 3: Build final segments
|
||||
print("=== Step 3: Build Final Segments ===")
|
||||
final_segs = build_segments(pass1_segs, change_points, wav_data, sr, asr_model, tmp_dir)
|
||||
print()
|
||||
|
||||
# Step 4: Voice vectors → Qdrant
|
||||
if not args.no_qdrant:
|
||||
print("=== Step 4: Voice Vectors → Qdrant ===")
|
||||
voice_vectors_to_qdrant(wav_data, sr, final_segs, encoder)
|
||||
print()
|
||||
|
||||
# Step 5: Write asr.json
|
||||
print("=== Step 5: Write asr.json ===")
|
||||
uuid = os.path.basename(args.output).replace(".asr.json", "")
|
||||
output = {
|
||||
"file_uuid": uuid,
|
||||
"pass1": pass1_segs,
|
||||
"segments": final_segs,
|
||||
}
|
||||
with open(args.output, "w") as f:
|
||||
json.dump(output, f, indent=2, ensure_ascii=False)
|
||||
sz = os.path.getsize(args.output)
|
||||
print(f" {args.output} ({sz/1024:.0f} KB)")
|
||||
|
||||
# Cleanup
|
||||
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
|
||||
elapsed = time.time() - t0
|
||||
print(f"\n=== Done ({elapsed:.0f}s) ===")
|
||||
print(f" Pass1 segments: {len(pass1_segs)}")
|
||||
print(f" Final segments: {len(final_segs)}")
|
||||
fp = args.output
|
||||
print(f" Output: {fp}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
BIN
scripts/vec0.dylib
Normal file
BIN
scripts/vec0.dylib
Normal file
Binary file not shown.
69
scripts/vectorize_chunks.py
Normal file
69
scripts/vectorize_chunks.py
Normal file
@@ -0,0 +1,69 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""Vectorize sentence chunks via Ollama mxbai-embed-large and store in DB + Qdrant."""
|
||||
import json, sys, time
|
||||
import psycopg2
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
DB = "dbname=momentry user=accusys"
|
||||
UUID = sys.argv[1] if len(sys.argv) > 1 else "23b1c872379d4ec06479e5ed39eef4c5"
|
||||
OLLAMA = "http://localhost:11434/api/embeddings"
|
||||
QDRANT = "http://localhost:6333"
|
||||
|
||||
conn = psycopg2.connect(DB)
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute("""
|
||||
SELECT chunk_id, text_content FROM dev.chunk
|
||||
WHERE file_uuid = %s AND chunk_type = 'sentence'
|
||||
AND (text_content IS NOT NULL AND text_content != '')
|
||||
ORDER BY id
|
||||
""", (UUID,))
|
||||
rows = cur.fetchall()
|
||||
print(f"Vectorizing {len(rows)} chunks for {UUID}...")
|
||||
|
||||
stored = 0
|
||||
batch = []
|
||||
for chunk_id, text in rows:
|
||||
req = Request(OLLAMA, data=json.dumps({
|
||||
"model": "nomic-embed-text-v2-moe:latest",
|
||||
"prompt": text
|
||||
}).encode(), headers={"Content-Type": "application/json"})
|
||||
resp = json.loads(urlopen(req).read())
|
||||
embedding = resp["embedding"]
|
||||
|
||||
# Store in PostgreSQL chunk_vectors
|
||||
cur.execute("""
|
||||
INSERT INTO dev.chunk_vectors (chunk_id, uuid, chunk_type, embedding)
|
||||
VALUES (%s, %s, 'sentence', %s::jsonb)
|
||||
ON CONFLICT (chunk_id, uuid) DO UPDATE SET embedding = EXCLUDED.embedding
|
||||
""", (chunk_id, UUID, json.dumps(embedding)))
|
||||
|
||||
# Batch for Qdrant
|
||||
batch.append({
|
||||
"id": int(chunk_id) + 1 if chunk_id.isdigit() else len(batch) + 10000,
|
||||
"vector": embedding,
|
||||
"payload": {"chunk_id": chunk_id, "chunk_type": "sentence"}
|
||||
})
|
||||
|
||||
if len(batch) >= 100:
|
||||
req = Request(f"{QDRANT}/collections/momentry_dev_rule1_v2/points?wait=true",
|
||||
data=json.dumps({"points": batch}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
urlopen(req)
|
||||
batch = []
|
||||
|
||||
stored += 1
|
||||
if stored % 50 == 0:
|
||||
print(f" {stored}/{len(rows)}")
|
||||
conn.commit()
|
||||
|
||||
if batch:
|
||||
req = Request(f"{QDRANT}/collections/momentry_dev_rule1_v2/points?wait=true",
|
||||
data=json.dumps({"points": batch}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
urlopen(req)
|
||||
|
||||
conn.commit()
|
||||
cur.close()
|
||||
conn.close()
|
||||
print(f"Done: {stored} vectors stored")
|
||||
Reference in New Issue
Block a user