feat: trace-level matching, health watcher/worker status, timezone config

2026-05-21 01:08:30 +08:00
parent 8ede4be159
commit bebaa743ed
60 changed files with 6110 additions and 1586 deletions
--- a/scripts/pycache/redis_publisher.cpython-311.pyc
+++ b/scripts/pycache/redis_publisher.cpython-311.pyc
--- a/scripts/checksums.sha256
+++ b/scripts/checksums.sha256
@@ -103,7 +103,7 @@ f4d1b4334a49357b74b80e390ad5a3d16263e51cbe5cab661af92bd2e9721f02  ./face_process
 802015c73dfce0866f2a0bc94c645aa35ba30a6de78244af23090bb1f1828c6e  ./face_processor_mps.py
 96ffdbde3f4d87e9942f9e1f4c93cbd999dc404b43e00d4cdcbb22de3c0f16b7  ./face_processor_optimized.py
 17e7d0bd142bddfead94b1dd959c1f41c0dad7063ffc677dff1a99d62aab6cf8  ./face_processor_v1.py
-15877adf5c160d861da688a25b93fd2edc189f326f9646ffb4de063e554f773a  ./face_processor.py
+d6ddad29a5e53b43b887554072d7965f0535e47fb62dad1a8b87e44fa1be6015  ./face_processor.py
 8edab61189ad1a8fa60c203077e814e82d46c5bae67054fa2ab1958e199c05f9  ./face_recognition_processor.py
 9ea19f357b3fcec6c8b3875c538e53cb46e407ab188cd544963e0123e535fa03  ./face_registration.py
 72648816de611fd9b84d2b98c177b8b4f24374024b69184e8151c06cf44d633b  ./face_statistics_report.py
@@ -174,15 +174,15 @@ fd39b779a0337f521940f3f7b159931f1f207f200eefd610183781fdcf3dfafd  ./object_searc
 42d2952fc78b57302b0d12bc3d45790a2c2c46d4ffa3c713a82686134bd63f13  ./ocr_benchmark_runner.py
 7b3ccb5c4ddd4c62c5ad04d0e3aafaecc2c1441012b6a98613cdcf055e2e50e8  ./ocr_processor_contract_v1.py
 271023eec42d6be4a1ce6ae2ce3f29e825210a57e6bb37554a6f7fdf54616f9a  ./ocr_processor_mps.py
-e666bc8488bb93cc45bcd6a70a4ef38a74af6631d7b87a789381bfbdab4569f5  ./ocr_processor.py
+2e73c41285e52ef013594fcd4d20df9f5781bfc26bcf62e54dd2c04ec44200c3  ./ocr_processor.py
 62196108cb3337b5f9a873d70d2981ac8f49152369afbcc8a12b3a13de579e80  ./opencv_stamp_search.py
 b2e8d552c272fd173c77693e9453a85fe16dfc12f7c2cd304d299c6188c14077  ./paligemma_vs_gdino.py
-2c6767e763cf69917af832b8383528f754c65db5a3f02cb4d63e3f896d5920b6  ./parent_chunk_5w1h.py
+1534d5b7617dbae77f7a37a2c33a89b90f965247a6828f00b73ea6b720f6f4fc  ./parent_chunk_5w1h.py
 5208c738d4b615282813d351daf09872ce516121bb604caa64968ef5e52c53d3  ./pipeline_checklist.py
 8f80c3a2be5c330e2d1853d9250a171c75db84598dbf3304280c42237ed4fb1f  ./pipeline_status.py
 94db44c0f49115a677d117d4901a1b7991c1517905300eaa495dd62b8ac1c79c  ./pose_processor_contract_v1.py
 167dee5e42c6bd46674bcffcfd92f368fc0b48a1f42c459c806853b281bc6482  ./pose_processor_mps.py
-a1cdb1efd992d229829ae156d8aa439347c51d664e2a606c14d2274a11c93a66  ./pose_processor.py
+a6ef3a785ef5c6dc47fa38dbed80d76bc7d4bf48cbaf0f7edb3d26df98d7262c  ./pose_processor.py
 45e6798dc5900f2f7c8776a2d260c122aae5068a075256b8a5c02e8d0be6c131  ./probe_file.py
 139a68b5915680ec697d4bb5420adbd20b89637de2c16a15d68aca4fc22da02b  ./qa/executor.py
 4a59b36c29e1ee6e2b169db3b0201d2f7088c6ccbfdf642a3b522aeb182bbeea  ./qa/judges/facenet.py
@@ -197,7 +197,7 @@ c4e4424aad1847d822e9cf7dc98a1b2e903735a61e8ec056c6a9be75f79486bd  ./qa/pipeline.
 01c7b3c30c1531224f9605f0ee633285fe8489ab2d0a3c9c6a41f2b2b60d6626  ./quick_stamp_search.py
 e3143673a2bff6139e05c82446fd8770c4b7e59a854a42c3b29662f5ac75efe2  ./rebuild_parents.py
 4aa98981632d4f8a11039c510e86aa296ae1cd4b399fc871ed664ac11e445bd9  ./rebuild_story_content.py
-45c437b412d34c7c6d5758e94b7205a2956b32b6fe170c3f56db7231ec6f5a15  ./redis_publisher.py
+205cfc47b603b5ab94d97dae8c25486b342b7c2858afe6d6dae27615ca0b2aeb  ./redis_publisher.py
 750f778946b56bc57c47d9d2295332bb0f8cec2c1aa03c6b882d39ef4432673d  ./refine_search.py
 0f8a6a6866a5797e964d3b17e2b7ef146fe7a798f09fcea982fcda6f629b4d06  ./regenerate_parent_5w1h.py
 3ee192b623f290136b36bd63abd018aad6e6639a9543970c3415734628b33bd6  ./register_sample_faces.py
@@ -303,7 +303,7 @@ d0ec8f4a67c1a1eb1356ad6e9b2f466575691bd336621cdbbfd31dd10159f2dc  ./utils/test_m
 ff98864f1b11795cc3bb64f30ccb6f8609771ddc7a5df2c003ba7c2233d16fc2  ./vectorize_chunks.py
 5880c128400e6e36c8eb7dffd009dbbc99dd13f8575b0037bdc854e25ddc41fb  ./video_comparison_statistics.py
 0a1501ffdc027236cdf88706b3d61229e2998ab268fd57fb60e399ccb734b6a1  ./vision_agent.py
-6831281de868d24ecd84151965909b57f895d534114d24300a81c396492c19f8  ./visual_chunk_processor.py
+eac8f90fbbb655614abcefc4b887e346bf94db5f015d33d37bc9514fb030489d  ./visual_chunk_processor.py
 c165dfc5fc981dc731b25ef414184ee58e56b73b148d41a32fdce985c701efd5  ./visualize_stamp.py
 6c65a82fdd1d585e20bee4fcb2d1bdec2e6220bda71d6ef9cd00d6a3cf74c4d7  ./voice_embedding_extractor.py
 2b3a7b357db4ddd07ca30bf200c6600724e33441d8def0a4d9a39673e2cfb1c0  ./weather_sound_detector.py
@@ -343,3 +343,4 @@ b2ee4f8a445a7e83f7b99ae5d4139fd525d9e3e58a360bfef054d441aa21d901  ./swift_proces
 fbca5ba0783153c4e21c174b0cbf75b582514f6ef0f92750a82d3178bc170f48  ./test_search_modes.sh
 f8c1647cdb4db8adef1829e41fbecd97f6b3b2e62927f195cd8e68127876069d  ./troubleshoot.sh
 992296b5218f3ef97ce53325be12f71848f3c3aeb3ee81d764bfe4bd61e1de05  ./verify_package.sh
+b6f95fa070cc0258bc5d005f10d13025ba8b08d3ee1598bcdad405ff1d3332ed  ./tmdb_agent.py
--- a/scripts/extract_face_embedding.py
+++ b/scripts/extract_face_embedding.py
@@ -0,0 +1,84 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Extract face embedding from an image using InsightFace + CoreML FaceNet.
+
+Usage:
+    python3 scripts/extract_face_embedding.py <image_path>
+
+Output: JSON with "embedding" key (512 floats) or "error" key.
+Exit code: 0 on success, 1 on failure.
+"""
+import json
+import os
+import sys
+
+# Prefer venv if it exists (has insightface + coremltools installed)
+VENV_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "venv")
+VENV_SITE = os.path.join(VENV_PATH, "lib", "python3.11", "site-packages")
+if os.path.isdir(VENV_SITE):
+    sys.path.insert(0, VENV_SITE)
+
+import cv2
+import numpy as np
+
+MODELS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "models")
+FACENET_PATH = os.path.join(MODELS_DIR, "facenet512.mlpackage")
+
+
+def extract_embedding(image_path: str):
+    """Print the embedding of the largest face in *image_path* as JSON.
+
+    Detection uses InsightFace ("buffalo_l"); the embedding comes from the
+    CoreML model at FACENET_PATH. The largest detected face is cropped,
+    resized to 160x160, scaled to [-1, 1] and laid out as (1, C, H, W).
+
+    On success prints {"embedding": [...]} to stdout. On a handled failure
+    (unreadable or undecodable image, no face, empty bbox, missing model
+    output) prints {"error": "..."} to stdout and exits with status 1.
+    """
+    import contextlib
+    import io
+    import warnings
+    warnings.filterwarnings("ignore")
+
+    def fail(message: str):
+        """Emit the JSON error document and terminate with exit code 1."""
+        print(json.dumps({"error": message}))
+        sys.exit(1)
+
+    # Suppress InsightFace verbose stdout during model loading; stdout is
+    # restored afterwards even if loading raises.
+    with contextlib.redirect_stdout(io.StringIO()):
+        from insightface.app import FaceAnalysis
+        import coremltools as ct
+
+        app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
+        app.prepare(ctx_id=0, det_thresh=0.5)
+        coreml_model = ct.models.MLModel(FACENET_PATH)
+
+    # Read through a context manager so the handle is closed, and report an
+    # unreadable path through the JSON error output instead of a traceback.
+    try:
+        with open(image_path, "rb") as fh:
+            img_bytes = fh.read()
+    except OSError as e:
+        fail(f"Failed to read image: {e}")
+
+    # An empty buffer cannot be decoded; treat it like any other bad image.
+    img = None
+    if img_bytes:
+        img = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
+    if img is None:
+        fail("Failed to decode image")
+
+    # Detect faces
+    faces = app.get(img)
+    if not faces:
+        fail("No face detected")
+
+    # Keep the face with the largest bbox area, clamped to the image bounds.
+    largest = max(faces, key=lambda f: (f.bbox[2] - f.bbox[0]) * (f.bbox[3] - f.bbox[1]))
+    x1, y1, x2, y2 = [int(v) for v in largest.bbox]
+    x1, y1 = max(0, x1), max(0, y1)
+    x2, y2 = min(img.shape[1], x2), min(img.shape[0], y2)
+    if x2 <= x1 or y2 <= y1:
+        fail("Invalid face bbox")
+
+    # NOTE(review): the crop keeps OpenCV's BGR channel order — confirm that
+    # this matches what the FaceNet model expects.
+    face_img = cv2.resize(img[y1:y2, x1:x2], (160, 160))
+    normalized = (face_img.astype(np.float32) / 127.5) - 1.0
+    normalized = np.transpose(normalized, (2, 0, 1))
+    input_array = np.expand_dims(normalized, axis=0)
+
+    result = coreml_model.predict({"input": input_array})
+    # The output is located by its "var_" name prefix; report a model whose
+    # outputs do not follow it instead of failing with an IndexError.
+    emb_keys = [k for k in result if k.startswith("var_")]
+    if not emb_keys:
+        fail("No embedding output in model result")
+    embedding = result[emb_keys[0]].flatten().tolist()
+    print(json.dumps({"embedding": embedding}))
+
+
+if __name__ == "__main__":
+    # Exactly one positional argument is required: the image to analyse.
+    cli_args = sys.argv[1:]
+    if not cli_args:
+        print(json.dumps({"error": "Usage: extract_face_embedding.py <image_path>"}))
+        sys.exit(1)
+    extract_embedding(cli_args[0])
--- a/scripts/face_landmark_qc.py
+++ b/scripts/face_landmark_qc.py
@@ -2,23 +2,30 @@
 """
 Face landmark QC: verify eyes/nose are within face bounding box.
 Flags faces in DB where landmarks don't match the bbox.
-Usage: python3 face_landmark_qc.py <file_uuid> [--threshold 0.5] [--fix]
+Usage: python3 face_landmark_qc.py <file_uuid> [--threshold 0.5] [--apply]
 """
-import sys, json, psycopg2, argparse
+import sys, json, psycopg2, argparse, os

 parser = argparse.ArgumentParser()
 parser.add_argument("uuid")
 parser.add_argument("--threshold", "-t", type=float, default=0.5,
    help="Fraction of landmark points that must be inside bbox (default: 0.5)")
-parser.add_argument("--fix", action="store_true", help="Update face_detections QC flag in DB")
+parser.add_argument("--apply", action="store_true",
+    help="Write qc_ok to face_detections.metadata in DB")
+parser.add_argument("--schema", default="dev",
+    help="DB schema (default: dev)")
 args = parser.parse_args()

 UUID = args.uuid
 THRESHOLD = args.threshold
-FACE_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.face.json"
+SCHEMA = args.schema
+OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", f"/Users/accusys/momentry/output_dev")
+FACE_PATH = f"{OUTPUT_DIR}/{UUID}.face.json"

 print(f"=== Face Landmark QC ===")
 print(f"UUID: {UUID}")
+print(f"Schema: {SCHEMA}")
+print(f"Face file: {FACE_PATH}")
 print(f"Threshold: {THRESHOLD * 100:.0f}% points must be inside bbox")

 # Load face.json
@@ -29,8 +36,7 @@ total_faces = 0
 faces_with_lm = 0
 good_faces = 0
 bad_faces = 0
-bad_frame_ids = set()
-bad_face_details = []
+qc_results = []  # list of (frame, face_idx, qc_ok, x, y, w, h)

 # Build frame lookup for fast access
 frame_map = {}
@@ -42,13 +48,22 @@ for frame_num, frm in frame_map.items():
        total_faces += 1
        lm = face.get('landmarks')
        if not lm:
+            bbox = face.get('bbox', {})
+            qc_results.append((frame_num, fi, False, bbox.get('x'), bbox.get('y'),
+                               bbox.get('width'), bbox.get('height')))
+            bad_faces += 1
            continue
        faces_with_lm += 1

-        x, y, w, h = face['x'], face['y'], face['width'], face['height']
+        bbox = face.get('bbox', {})
+        x, y, w, h = bbox.get('x'), bbox.get('y'), bbox.get('width'), bbox.get('height')
+        if None in (x, y, w, h):
+            qc_results.append((frame_num, fi, False, x, y, w, h))
+            bad_faces += 1
+            continue
        inside_pts = 0
        total_pts = 0
-        eye_nose_inside = 0  # at least one point from each eye+nose inside
+        eye_nose_inside = 0

        for lm_type in ['left_eye', 'right_eye', 'nose']:
            points = lm.get(lm_type, [])
@@ -63,53 +78,39 @@ for frame_num, frm in frame_map.items():
                eye_nose_inside += 1

        ratio = inside_pts / max(1, total_pts)
+        qc_ok = (ratio >= THRESHOLD and eye_nose_inside >= 2)

-        if ratio >= THRESHOLD and eye_nose_inside >= 2:
+        qc_results.append((frame_num, fi, qc_ok, x, y, w, h))
+        if qc_ok:
            good_faces += 1
        else:
            bad_faces += 1
-            bad_frame_ids.add(frame_num)
-            bad_face_details.append({
-                'frame': frame_num,
-                'face_idx': fi,
-                'bbox': [x, y, w, h],
-                'inside_pts': inside_pts,
-                'total_pts': total_pts,
-                'ratio': ratio,
-                'eye_nose_ok': eye_nose_inside,
-            })

 print(f"\nTotal faces: {total_faces:,}")
 print(f"Faces with landmarks: {faces_with_lm:,}")
 print(f"✅ Good (≥{THRESHOLD*100:.0f}% inside + ≥2 features): {good_faces:,}")
-print(f"❌ Bad: {bad_faces:,}")
+print(f"❌ Bad (no eyes or insufficient landmarks): {bad_faces:,}")
 print(f"Quality pass rate: {100 * good_faces / max(1, faces_with_lm):.1f}%")

-print(f"\nBad faces in {len(bad_frame_ids)} unique frames")
-
-# Show sample bad faces
-print(f"\nSample bad faces:")
-for bf in sorted(bad_face_details, key=lambda b: b['ratio'])[:5]:
-    print(f"  frame={bf['frame']}, bbox={bf['bbox']}, {bf['inside_pts']}/{bf['total_pts']} inside ({bf['ratio']*100:.0f}%), eye/nose={bf['eye_nose_ok']}/3")
-
-# Show sample good faces
-print(f"\nSample good faces:")
-good_details = []
-for frame_num, frm in frame_map.items():
-    for face in frm.get('faces', []):
-        lm = face.get('landmarks')
-        if not lm:
-            continue
-        x, y, w, h = face['x'], face['y'], face['width'], face['height']
-        inside = sum(1 for pts in lm.values() for pt in pts
-            if (x <= pt[0] <= x + w) and (y <= pt[1] <= y + h))
-        total = sum(len(pts) for pts in lm.values())
-        if inside / max(1, total) >= THRESHOLD:
-            good_details.append((frame_num, x, y, w, h, inside, total))
-            if len(good_details) >= 5:
-                break
-    if len(good_details) >= 5:
-        break
-
-for g in good_details:
-    print(f"  frame={g[0]}, bbox=[{g[1]},{g[2]},{g[3]},{g[4]}], {g[5]}/{g[6]} inside ({100*g[5]/max(1,g[6]):.0f}%)")
+# Apply mode: write qc_ok to face_detections.metadata
+if args.apply:
+    print(f"\n=== Applying QC results to {SCHEMA}.face_detections ===")
+    db_url = os.environ.get("DATABASE_URL", "postgres://accusys@localhost:5432/momentry")
+    conn = psycopg2.connect(db_url)
+    cur = conn.cursor()
+    updated = 0
+    for frame_num, fi, qc_ok, x, y, w, h in qc_results:
+        qc_str = "true" if qc_ok else "false"
+        cur.execute(
+            f"UPDATE {SCHEMA}.face_detections "
+            f"SET metadata = jsonb_set(COALESCE(metadata, '{{}}'::jsonb), '{{qc_ok}}', '\"{qc_str}\"'::jsonb) "
+            f"WHERE file_uuid = %s AND frame_number = %s AND x = %s AND y = %s AND width = %s AND height = %s",
+            (UUID, frame_num, x, y, w, h)
+        )
+        if cur.rowcount > 0:
+            updated += 1
+    conn.commit()
+    cur.close()
+    conn.close()
+    print(f"Updated {updated} rows in {SCHEMA}.face_detections")
+    print(f"Skipped {len(qc_results) - updated} rows (no matching face_detections row)")
--- a/scripts/face_processor.py
+++ b/scripts/face_processor.py
@@ -13,6 +13,7 @@ Detection cost: near-zero CPU (Vision ANE)
 Embedding cost: near-zero CPU (CoreML ANE)
 """

+import re
 import sys
 import os
 import json
@@ -29,6 +30,7 @@ from pathlib import Path
 import coremltools as ct

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from redis_publisher import RedisPublisher

 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 SWIFT_BIN = os.path.join(SCRIPT_DIR, "swift_processors", ".build", "debug", "swift_face")
@@ -49,11 +51,12 @@ def classify_pose(roll: float, yaw: float) -> str:

 class FaceProcessorVision:
    def __init__(self, video_path: str, output_path: str, uuid: str = "",
-                 sample_interval: int = 3):
+                 sample_interval: int = 3, publisher: RedisPublisher = None):
        self.video_path = video_path
        self.output_path = output_path
        self.uuid = uuid
        self.sample_interval = sample_interval
+        self.publisher = publisher

        # Load CoreML FaceNet
        self.coreml_model = None
@@ -127,7 +130,33 @@ class FaceProcessorVision:

        print(f"[FACE_V2] Running: {' '.join(cmd)}")
        t0 = time.time()
-        subprocess.run(cmd, check=True)
+        # swift_face writes stdout and stderr into one log file; the log is
+        # re-scanned for "<N>% complete" lines so progress can be forwarded
+        # to the publisher while the process runs.
+        log_path = swift_out + ".log"
+        last_pct = -1
+        with open(log_path, "w") as log_f:
+            proc = subprocess.Popen(cmd, stdout=log_f, stderr=subprocess.STDOUT, text=True)
+            running = True
+            while running:
+                # Wake up as soon as the process exits instead of always
+                # sleeping a full 10 s interval.
+                try:
+                    proc.wait(timeout=10)
+                    running = False
+                except subprocess.TimeoutExpired:
+                    pass
+                # Progress reporting is best-effort: a log-read or publisher
+                # failure must not abort detection, but it is no longer silent.
+                try:
+                    with open(log_path) as lf:
+                        for line in lf:
+                            m = re.search(r'(\d+)% complete', line)
+                            if not m:
+                                continue
+                            pct = int(m.group(1))
+                            if pct > last_pct:
+                                last_pct = pct
+                                if self.publisher:
+                                    self.publisher.progress("face", pct, 100, f"swift detect {pct}%")
+                except Exception as e:
+                    print(f"[FACE_V2] Progress update skipped: {e}", file=sys.stderr)
+        if proc.returncode != 0:
+            # stderr was merged into the log (stderr=STDOUT), so proc.stderr is
+            # None; show the tail of the log instead of reading a missing pipe.
+            try:
+                with open(log_path) as lf:
+                    log_tail = "".join(lf.readlines()[-20:]).strip()
+            except OSError:
+                log_tail = ""
+            if log_tail:
+                print(log_tail, file=sys.stderr)
+            raise RuntimeError(f"swift_face exited with code {proc.returncode}")
+
        elapsed = time.time() - t0
        print(f"[FACE_V2] Detection done in {elapsed:.1f}s")

@@ -156,6 +185,8 @@ class FaceProcessorVision:

        t0 = time.time()
        embed_count = 0
+        total_face_count = 0
+        last_pct = -1

        for frame_info in frames:
            frame_num = frame_info["frame"]
@@ -220,6 +251,12 @@ class FaceProcessorVision:
            if len(face_data["frames"]) % 100 == 0:
                elapsed = time.time() - t0
                print(f"[FACE_V2] {len(face_data['frames'])} frames, {embed_count} embeddings, {elapsed:.0f}s")
+                if self.publisher:
+                    pct = int(len(face_data["frames"]) * 100 / max(len(frames), 1))
+                    if pct > last_pct:
+                        last_pct = pct
+                        self.publisher.progress("face", len(face_data["frames"]), len(frames),
+                            f"{embed_count} faces", embed_count, "faces")

        self.video.release()

@@ -259,19 +296,36 @@ def main():
    parser.add_argument("--force", action="store_true")
    args = parser.parse_args()

+    publisher = RedisPublisher(args.uuid) if args.uuid else None
+    if publisher:
+        publisher.info("face", "FACE_START")
+
    if args.force and os.path.exists(args.output_path):
        os.remove(args.output_path)

    processor = FaceProcessorVision(
        args.video_path, args.output_path,
-        args.uuid, args.sample_interval
+        args.uuid, args.sample_interval, publisher
    )

    # Step 1: Vision detection (bbox + pose via ANE)
-    detection = processor.process_with_swift()
+    try:
+        detection = processor.process_with_swift()
+    except Exception as e:
+        if publisher:
+            publisher.error("face", f"Detection failed: {e}")
+        raise

    # Step 2: CoreML embedding + save
-    processor.embed_and_save(detection)
+    try:
+        processor.embed_and_save(detection)
+    except Exception as e:
+        if publisher:
+            publisher.error("face", f"Embedding failed: {e}")
+        raise
+
+    if publisher:
+        publisher.complete("face", f"{len(detection.get('frames',[]))} frames")

    # Clean up temp detection file
    swift_out = args.output_path.replace(".json", "_detect.json")
--- a/scripts/identity_bind.py
+++ b/scripts/identity_bind.py
@@ -81,10 +81,10 @@ for cluster_id in sorted(set(labels)):
        VALUES (%s, 'face', 'auto', 'active', NOW(), %s)
        ON CONFLICT (name) DO UPDATE SET status = 'active', file_uuid = COALESCE(dev.identities.file_uuid, %s)
        RETURNING id
-    """, (f"PERSON_{UUID[:8]}_{cluster_id}", UUID, UUID))
+    """, (f"stranger_{UUID}_{cluster_id}", UUID, UUID))
    identity_id = cur.fetchone()[0]
    cluster_to_identity[cluster_id] = identity_id
-    print(f"  Cluster {cluster_id}: new identity {identity_id} (PERSON_{cluster_id})")
+    print(f"  Cluster {cluster_id}: new identity {identity_id} (stranger_{UUID}_{cluster_id})")

 # Step 4: Create identity bindings
 print("Creating identity bindings...")
--- a/scripts/migrate_identity_files.py
+++ b/scripts/migrate_identity_files.py
@@ -0,0 +1,131 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Migrate Identity Files — one-time: DB identities → filesystem identity.json
+
+Reads all identities from PostgreSQL, queries file bindings,
+and writes identity.json + _index.json to {OUTPUT_DIR}/identities/{uuid}/
+
+Usage:
+    python3 scripts/migrate_identity_files.py
+    python3 scripts/migrate_identity_files.py --db "dbname=momentry user=accusys"
+    python3 scripts/migrate_identity_files.py --output /path/to/output
+"""
+import argparse
+import json
+import os
+from datetime import datetime, timezone
+from pathlib import Path
+
+import psycopg2
+import psycopg2.extras
+
+
+def main():
+    """Export every identity that has a UUID from PostgreSQL to the filesystem.
+
+    For each identity row, writes {output}/identities/<uuid without dashes>/identity.json
+    containing the row's fields plus its per-file bindings aggregated from
+    face_detections, then writes {output}/identities/_index.json mapping each
+    UUID to the identity name.
+
+    Options: --db (connection string, default from DATABASE_URL) and
+    --output (output root, default from MOMENTRY_OUTPUT_DIR).
+    Prints a message and returns without writing an index when no identity
+    rows are found. The DB connection is closed on every exit path.
+    """
+    parser = argparse.ArgumentParser(description="Migrate identities to filesystem")
+    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "dbname=momentry user=accusys host=localhost"))
+    parser.add_argument("--output", default=os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output"))
+    args = parser.parse_args()
+
+    def fmt_time(value):
+        # A missing timestamp falls back to the current UTC time.
+        return value.isoformat() if value else datetime.now(timezone.utc).isoformat()
+
+    index = {}
+    migrated = 0
+
+    conn = psycopg2.connect(args.db)
+    try:
+        identities_root = Path(args.output) / "identities"
+        identities_root.mkdir(parents=True, exist_ok=True)
+
+        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
+        cur.execute("""
+            SELECT id, uuid::text, name, identity_type, source, status,
+                   tmdb_id, tmdb_profile, metadata::text, created_at, updated_at
+            FROM identities
+            WHERE uuid IS NOT NULL
+            ORDER BY id
+        """)
+        rows = cur.fetchall()
+
+        if not rows:
+            print("No identities found in DB.")
+            return
+
+        for row in rows:
+            uuid_clean = row["uuid"].replace("-", "")
+            name = row["name"] or ""
+
+            dir_path = identities_root / uuid_clean
+            dir_path.mkdir(parents=True, exist_ok=True)
+
+            # Get bindings for this identity from face_detections. The identity
+            # rows are already fetched, so the same cursor can be reused.
+            cur.execute("""
+                SELECT fd.file_uuid,
+                       COALESCE(array_agg(DISTINCT fd.trace_id) FILTER (WHERE fd.trace_id IS NOT NULL), '{}') AS trace_ids,
+                       COUNT(*)::bigint AS face_count
+                FROM face_detections fd
+                WHERE fd.identity_id = %s
+                GROUP BY fd.file_uuid
+                ORDER BY fd.file_uuid
+            """, (row["id"],))
+
+            file_bindings = []
+            for b in cur.fetchall():
+                trace_ids = b["trace_ids"]
+                if isinstance(trace_ids, list):
+                    trace_ids = [int(t) for t in trace_ids if t is not None]
+                file_bindings.append({
+                    "file_uuid": b["file_uuid"],
+                    "trace_ids": trace_ids,
+                    "face_count": int(b["face_count"]),
+                })
+
+            # metadata is selected as text; decode it back into a JSON object.
+            metadata = row.get("metadata")
+            if isinstance(metadata, str):
+                metadata = json.loads(metadata) if metadata else {}
+            elif metadata is None:
+                metadata = {}
+
+            identity_file = {
+                "version": 1,
+                "identity_uuid": uuid_clean,
+                "name": name,
+                "identity_type": row.get("identity_type"),
+                "source": row.get("source"),
+                "status": row.get("status"),
+                "tmdb_id": row.get("tmdb_id"),
+                "tmdb_profile": row.get("tmdb_profile"),
+                "metadata": metadata,
+                "file_bindings": file_bindings,
+                "created_at": fmt_time(row.get("created_at")),
+                "updated_at": fmt_time(row.get("updated_at")),
+            }
+
+            with open(dir_path / "identity.json", "w", encoding="utf-8") as f:
+                json.dump(identity_file, f, indent=2, ensure_ascii=False)
+
+            index[uuid_clean] = name
+            migrated += 1
+            print(f"  [{migrated:5d}] {name} ({uuid_clean})")
+
+        cur.close()
+    finally:
+        # Release the connection on the early return and on errors as well.
+        conn.close()
+
+    # Write _index.json
+    index_file = {
+        "version": 1,
+        "updated_at": datetime.now(timezone.utc).isoformat(),
+        "entries": index,
+    }
+    with open(identities_root / "_index.json", "w", encoding="utf-8") as f:
+        json.dump(index_file, f, indent=2, ensure_ascii=False)
+
+    print(f"\nDone: {migrated} identities migrated")
+    print(f"Index: {identities_root / '_index.json'} ({len(index)} entries)")
+
+
+# Script entry point: run the migration only when executed directly.
+if __name__ == "__main__":
+    raise SystemExit(main())
--- a/scripts/ocr_processor.py
+++ b/scripts/ocr_processor.py
@@ -4,6 +4,7 @@ OCR Processor Wrapper
 Calls Swift Vision Framework OCR (swift_ocr) with fallback to PaddleOCR.
 """

+import re
 import sys
 import json
 import os
@@ -11,6 +12,10 @@ import subprocess
 import argparse


+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from redis_publisher import RedisPublisher
+
+
 SWIFT_OCR_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "swift_processors/.build/debug/swift_ocr"
@@ -19,6 +24,7 @@ SWIFT_OCR_ALT = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "swift_processors/.build/arm64-apple-macosx/debug/swift_ocr"
 )
+SWIFT_PROGRESS_RE = re.compile(r"\[SwiftOCR\] Progress:\s*(\d+)%")


 def process_ocr(
@@ -27,6 +33,7 @@ def process_ocr(
    uuid: str = "",
    sample_interval: int = 30,
    recognition_level: str = "accurate",
+    publisher: RedisPublisher = None,
 ) -> dict:
    swift_bin = SWIFT_OCR_PATH
    if not os.path.exists(swift_bin):
@@ -42,15 +49,34 @@ def process_ocr(
           "--uuid", uuid]

    print(f"[OCR] Running Swift OCR", file=sys.stderr)
-    result = subprocess.run(cmd, capture_output=True, text=True, timeout=7200)
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

-    if result.stdout:
-        print(result.stdout.strip(), file=sys.stderr)
-    if result.stderr:
-        print(result.stderr.strip(), file=sys.stderr)
+    last_pct = -1
+    stdout_lines = []
+    for line in proc.stdout:
+        line = line.strip()
+        stdout_lines.append(line)
+        m = SWIFT_PROGRESS_RE.search(line)
+        if m:
+            pct = int(m.group(1))
+            if pct > last_pct:
+                last_pct = pct
+                print(f"[OCR] Progress: {pct}%", file=sys.stderr)
+                if publisher:
+                    publisher.progress("ocr", pct, 100, f"{pct}%")
+        elif line:
+            print(line, file=sys.stderr)

-    if result.returncode != 0 or not os.path.exists(output_path):
-        print(f"[OCR] Swift OCR failed, falling back to PaddleOCR", file=sys.stderr)
+    stderr_output = proc.stderr.read()
+    if stderr_output:
+        print(stderr_output.strip(), file=sys.stderr)
+
+    proc.wait()
+
+    if proc.returncode != 0 or not os.path.exists(output_path):
+        print(f"[OCR] Swift OCR failed (exit={proc.returncode}), falling back to PaddleOCR", file=sys.stderr)
+        if publisher:
+            publisher.error("ocr", f"Swift OCR failed, using fallback")
        return _fallback(video_path, output_path, uuid, sample_interval)

    with open(output_path) as f:
@@ -81,9 +107,16 @@ if __name__ == "__main__":
    parser.add_argument("--recognition-level", choices=["fast", "accurate"], default="accurate")
    args = parser.parse_args()

+    publisher = RedisPublisher(args.uuid) if args.uuid else None
+    if publisher:
+        publisher.info("ocr", "OCR_START")
+
    result = process_ocr(args.video_path, args.output_path, args.uuid,
-                         args.sample_interval, args.recognition_level)
+                         args.sample_interval, args.recognition_level,
+                         publisher)

    with open(args.output_path, "w") as f:
        json.dump(result, f, indent=2)
    print(f"OCR: {len(result.get('frames', []))} frames with text")
+    if publisher:
+        publisher.complete("ocr", f"{len(result.get('frames',[]))} frames")
--- a/scripts/parent_chunk_5w1h.py
+++ b/scripts/parent_chunk_5w1h.py
@@ -28,7 +28,7 @@ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 DB_URL = os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
 SCHEMA = os.getenv("DATABASE_SCHEMA", "dev")
 OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
-OLLAMA_URL = "http://localhost:11434/api"
+EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://localhost:11436/v1/embeddings")

 def load_speaker_map(file_uuid: str) -> dict:
    """Load speaker→identity mapping from DB (generalized, not hardcoded)"""
@@ -64,7 +64,7 @@ CURRENT_VERSIONS = {
    "embedding_agent": "nomic-embed-768d/v1",
 }

-LLM_URL = os.getenv("MOMENTRY_LLM_SUMMARY_URL", "http://127.0.0.1:8081/v1/chat/completions")
+LLM_URL = os.getenv("MOMENTRY_LLM_URL", os.getenv("MOMENTRY_LLM_SUMMARY_URL", "http://127.0.0.1:8082/v1/chat/completions"))
 LLM_MODEL = os.getenv("MOMENTRY_LLM_SUMMARY_MODEL", "gemma4")


@@ -97,7 +97,7 @@ def build_child_chunks(data: dict, file_uuid: str) -> List[dict]:
        s, e = cs["start_time"], cs["end_time"]

        children = []
-        for seg in asr_segs:
+        for seg_idx, seg in enumerate(asr_segs):
            st, en = seg.get("start", 0), seg.get("end", 0)
            text = seg.get("text", "").strip()
            if st < s or en > e or not text: continue
@@ -117,11 +117,11 @@ def build_child_chunks(data: dict, file_uuid: str) -> List[dict]:
                "start": st, "end": en, "text": text,
                "speaker_id": spk_id, "speaker_name": character,
                "speaker_confidence": spk_conf,
-                "chunk_id": f"{file_uuid}_{st:.0f}_{en:.0f}",
+                "chunk_id": f"{file_uuid}_{seg_idx}",
            })

        # Boundary overlap: even empty scenes get partial children
-        for seg in asr_segs:
+        for seg_idx, seg in enumerate(asr_segs):
            st, en = seg.get("start", 0), seg.get("end", 0)
            text = seg.get("text", "").strip()
            if not text: continue
@@ -141,7 +141,7 @@ def build_child_chunks(data: dict, file_uuid: str) -> List[dict]:
                "start": st, "end": en, "text": text,
                "speaker_id": spk_id, "speaker_name": character,
                "speaker_confidence": spk_conf,
-                "chunk_id": f"{file_uuid}_{st:.0f}_{en:.0f}",
+                "chunk_id": f"{file_uuid}_{seg_idx}",
                "overlap_type": "partial",
            })

@@ -215,14 +215,17 @@ def generate_llm_child_summary(child: dict, parent_summary: str) -> Optional[str
 # ===== Embedding (Ollama nomic-embed) =====

 def embed_text(text: str, max_retries: int = 3) -> Optional[List[float]]:
-    """Get embedding via Ollama nomic-embed-text"""
+    """Get embedding via EmbeddingGemma server"""
    for attempt in range(max_retries):
        try:
-            resp = requests.post(f"{OLLAMA_URL}/embeddings", json={
-                "model": "nomic-embed-text-v2-moe", "prompt": text,
+            resp = requests.post(EMBEDDING_URL, json={
+                "input": [text],
            }, timeout=30)
            if resp.status_code == 200:
-                return resp.json()["embedding"]
+                data = resp.json()
+                items = data.get("data", [])
+                if items:
+                    return items[0]["embedding"]
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"  ⚠️ Embedding failed: {e}")
@@ -244,7 +247,7 @@ def store_chunks(file_uuid: str, scenes: List[dict], mode: str, do_embed: bool,

    # Get base chunk_index
    cur.execute(
-        f"SELECT COALESCE(MAX(chunk_index), 0) FROM {SCHEMA}.chunks WHERE file_uuid = %s",
+        f"SELECT COALESCE(MAX(chunk_index), 0) FROM {SCHEMA}.chunk WHERE file_uuid = %s",
        (file_uuid,),
    )
    next_index = (cur.fetchone()[0] or 0) + 1
@@ -255,20 +258,38 @@ def store_chunks(file_uuid: str, scenes: List[dict], mode: str, do_embed: bool,

        parent_id = f"{mode}_parent_{file_uuid}_{scene['start_time']:.0f}_{scene['end_time']:.0f}"

-        cur.execute(
-            f"""
-            INSERT INTO {SCHEMA}.chunks (chunk_id, old_chunk_id, file_uuid, chunk_type, chunk_index,
-                                         start_time, end_time, content, text_content, parent_chunk_id)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb, %s, %s)
-            ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE
-                SET content = EXCLUDED.content, text_content = EXCLUDED.text_content
-            """,
-            (parent_id, parent_id, file_uuid, parent_type, next_index,
-             scene["start_time"], scene["end_time"],
-             json.dumps({"summary": parent_text, "mode": mode, "type": "parent",
-                         "source_versions": CURRENT_VERSIONS}),
-             parent_text, None),
-        )
+        parent_embedding = embed_text(parent_text) if do_embed else None
+        if do_embed and parent_embedding:
+            cur.execute(
+                f"""
+                INSERT INTO {SCHEMA}.chunk (chunk_id, old_chunk_id, file_uuid, chunk_type, chunk_index,
+                                             start_time, end_time, content, text_content, parent_chunk_id, embedding)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb, %s, %s, %s::vector)
+                ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE
+                    SET content = EXCLUDED.content, text_content = EXCLUDED.text_content,
+                        embedding = EXCLUDED.embedding
+                """,
+                (parent_id, parent_id, file_uuid, parent_type, next_index,
+                 scene["start_time"], scene["end_time"],
+                 json.dumps({"summary": parent_text, "mode": mode, "type": "parent",
+                             "source_versions": CURRENT_VERSIONS}),
+                 parent_text, None, parent_embedding),
+            )
+        else:
+            cur.execute(
+                f"""
+                INSERT INTO {SCHEMA}.chunk (chunk_id, old_chunk_id, file_uuid, chunk_type, chunk_index,
+                                             start_time, end_time, content, text_content, parent_chunk_id)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb, %s, %s)
+                ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE
+                    SET content = EXCLUDED.content, text_content = EXCLUDED.text_content
+                """,
+                (parent_id, parent_id, file_uuid, parent_type, next_index,
+                 scene["start_time"], scene["end_time"],
+                 json.dumps({"summary": parent_text, "mode": mode, "type": "parent",
+                             "source_versions": CURRENT_VERSIONS}),
+                 parent_text, None),
+            )
        next_index += 1
        parent_count += 1

@@ -276,22 +297,42 @@ def store_chunks(file_uuid: str, scenes: List[dict], mode: str, do_embed: bool,
            child_id = child["chunk_id"]
            child_text = generate_story_child_summary(child, parent_text) if mode == "story" else generate_llm_child_summary(child, parent_text)

-            cur.execute(
-                f"""
-                INSERT INTO {SCHEMA}.chunks (chunk_id, old_chunk_id, file_uuid, chunk_type, chunk_index,
-                                             start_time, end_time, content, text_content, parent_chunk_id)
-                VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb, %s, %s)
-                ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE
-                    SET content = EXCLUDED.content, text_content = EXCLUDED.text_content,
-                        parent_chunk_id = EXCLUDED.parent_chunk_id
-                """,
-                (child_id, child_id, file_uuid, child_type, next_index,
-                 child["start"], child["end"],
-                 json.dumps({"speaker": child["speaker_name"], "text": child["text"], "mode": mode,
-                             "speaker_confidence": child.get("speaker_confidence", 0),
-                             "source_versions": CURRENT_VERSIONS}),
-                 child_text, parent_id),
-            )
+            child_embedding = embed_text(child_text) if do_embed else None
+            if do_embed and child_embedding:
+                cur.execute(
+                    f"""
+                    INSERT INTO {SCHEMA}.chunk (chunk_id, old_chunk_id, file_uuid, chunk_type, chunk_index,
+                                                 start_time, end_time, content, text_content, parent_chunk_id, embedding)
+                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb, %s, %s, %s::vector)
+                    ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE
+                        SET content = EXCLUDED.content, text_content = EXCLUDED.text_content,
+                            parent_chunk_id = EXCLUDED.parent_chunk_id,
+                            embedding = EXCLUDED.embedding
+                    """,
+                    (child_id, child_id, file_uuid, child_type, next_index,
+                     child["start"], child["end"],
+                     json.dumps({"speaker": child["speaker_name"], "text": child["text"], "mode": mode,
+                                 "speaker_confidence": child.get("speaker_confidence", 0),
+                                 "source_versions": CURRENT_VERSIONS}),
+                     child_text, parent_id, child_embedding),
+                )
+            else:
+                cur.execute(
+                    f"""
+                    INSERT INTO {SCHEMA}.chunk (chunk_id, old_chunk_id, file_uuid, chunk_type, chunk_index,
+                                                 start_time, end_time, content, text_content, parent_chunk_id)
+                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s::jsonb, %s, %s)
+                    ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE
+                        SET content = EXCLUDED.content, text_content = EXCLUDED.text_content,
+                            parent_chunk_id = EXCLUDED.parent_chunk_id
+                    """,
+                    (child_id, child_id, file_uuid, child_type, next_index,
+                     child["start"], child["end"],
+                     json.dumps({"speaker": child["speaker_name"], "text": child["text"], "mode": mode,
+                                 "speaker_confidence": child.get("speaker_confidence", 0),
+                                 "source_versions": CURRENT_VERSIONS}),
+                     child_text, parent_id),
+                )
            next_index += 1
            child_count += 1

@@ -304,7 +345,7 @@ def main():
    parser = argparse.ArgumentParser(description="Story Processor V2.0")
    parser.add_argument("--file-uuid", required=True)
    parser.add_argument("--mode", choices=["story", "llm"], default="story")
-    parser.add_argument("--max-scenes", type=int, default=300)
+    parser.add_argument("--max-scenes", type=int, default=99999)
    parser.add_argument("--embed", action="store_true", help="Generate embeddings (Ollama)")
    parser.add_argument("--no-db", action="store_true", help="Skip DB storage")
    args = parser.parse_args()
--- a/scripts/pose_processor.py
+++ b/scripts/pose_processor.py
@@ -5,12 +5,16 @@ Calls Swift Vision Framework pose (swift_pose) with fallback to YOLOv8 Pose.
 Uses VNDetectHumanBodyPoseRequest with ANE acceleration.
 """

+import re
 import sys
 import json
 import os
 import subprocess
 import argparse

+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from redis_publisher import RedisPublisher
+
 SWIFT_POSE_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "swift_processors/.build/debug/swift_pose"
@@ -21,11 +25,14 @@ SWIFT_POSE_ALT = os.path.join(
 )


+SWIFT_POSE_PROGRESS_RE = re.compile(r"\[SwiftPose\] Progress:\s*(\d+)%")
+
 def process_pose(
    video_path: str,
    output_path: str,
    uuid: str = "",
    sample_interval: int = 30,
+    publisher: RedisPublisher = None,
 ) -> dict:
    swift_bin = SWIFT_POSE_PATH
    if not os.path.exists(swift_bin):
@@ -33,6 +40,8 @@ def process_pose(

    if not os.path.exists(swift_bin):
        print("[Pose] Swift binary not found, using YOLOv8 fallback", file=sys.stderr)
+        if publisher:
+            publisher.error("pose", "Swift binary not found, using fallback")
        return _fallback(video_path, output_path, uuid, sample_interval)

    cmd = [swift_bin, video_path, output_path,
@@ -40,17 +49,32 @@ def process_pose(
           "--uuid", uuid]

    print(f"[Pose] Running Swift Pose (Vision Framework)", file=sys.stderr)
-    result = subprocess.run(cmd, capture_output=True, text=True, timeout=7200)
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

-    if result.stdout:
-        for line in result.stdout.strip().split("\n"):
-            print(f"  {line}", file=sys.stderr)
-    if result.stderr:
-        for line in result.stderr.strip().split("\n"):
+    last_pct = -1
+    for line in proc.stdout:
+        line = line.strip()
+        m = SWIFT_POSE_PROGRESS_RE.search(line)
+        if m:
+            pct = int(m.group(1))
+            if pct > last_pct:
+                last_pct = pct
+                print(f"[Pose] Progress: {pct}%", file=sys.stderr)
+                if publisher:
+                    publisher.progress("pose", pct, 100, f"{pct}%")
+        elif line:
            print(f"  {line}", file=sys.stderr)

-    if result.returncode != 0 or not os.path.exists(output_path):
-        print(f"[Pose] Swift Pose failed, falling back to YOLOv8", file=sys.stderr)
+    stderr_output = proc.stderr.read()
+    if stderr_output:
+        print(stderr_output.strip(), file=sys.stderr)
+
+    proc.wait()
+
+    if proc.returncode != 0 or not os.path.exists(output_path):
+        print(f"[Pose] Swift Pose failed (exit={proc.returncode}), falling back to YOLOv8", file=sys.stderr)
+        if publisher:
+            publisher.error("pose", f"Swift Pose failed, using fallback")
        return _fallback(video_path, output_path, uuid, sample_interval)

    with open(output_path) as f:
@@ -113,7 +137,14 @@ if __name__ == "__main__":
    parser.add_argument("--sample-interval", type=int, default=30)
    args = parser.parse_args()

-    result = process_pose(args.video_path, args.output_path, args.uuid, args.sample_interval)
+    publisher = RedisPublisher(args.uuid) if args.uuid else None
+    if publisher:
+        publisher.info("pose", "POSE_START")
+
+    result = process_pose(args.video_path, args.output_path, args.uuid,
+                          args.sample_interval, publisher)
    with open(args.output_path, "w") as f:
        json.dump(result, f, indent=2)
    print(f"Pose: {len(result.get('frames', []))} frames with poses")
+    if publisher:
+        publisher.complete("pose", f"{len(result.get('frames',[]))} frames")
--- a/scripts/redis_publisher.py
+++ b/scripts/redis_publisher.py
@@ -34,6 +34,8 @@ class ProgressData:
    message: Optional[str] = None
    current: Optional[int] = None
    total: Optional[int] = None
+    output_count: Optional[int] = None
+    output_type: Optional[str] = None
    extra: Optional[Dict[str, Any]] = None


@@ -49,7 +51,8 @@ class StructuredMessage:
 class RedisPublisher:
    def __init__(self, uuid: str):
        self.uuid = uuid
-        self.channel = f"momentry:progress:{uuid}"
+        prefix = os.environ.get("MOMENTRY_REDIS_PREFIX", "momentry:")
+        self.channel = f"{prefix}progress:{uuid}"
        self._enabled = False
        self._client = None
        self._connect()
@@ -107,6 +110,8 @@ class RedisPublisher:
        message: Optional[str] = None,
        current: Optional[int] = None,
        total: Optional[int] = None,
+        output_count: Optional[int] = None,
+        output_type: Optional[str] = None,
        extra: Optional[Dict[str, Any]] = None,
    ) -> bool:
        if not self._enabled:
@@ -121,6 +126,8 @@ class RedisPublisher:
                message=message,
                current=current,
                total=total,
+                output_count=output_count,
+                output_type=output_type,
                extra=extra,
            ),
        )
@@ -136,6 +143,8 @@ class RedisPublisher:
        current: int,
        total: int,
        message: str = "",
+        output_count: Optional[int] = None,
+        output_type: Optional[str] = None,
    ) -> bool:
        return self.publish(
            MessageType.PROGRESS,
@@ -143,6 +152,8 @@ class RedisPublisher:
            message=message,
            current=current,
            total=total,
+            output_count=output_count,
+            output_type=output_type,
        )

    def complete(self, processor: str, message: str = "") -> bool:
--- a/scripts/sync_users_from_sftpgo.py
+++ b/scripts/sync_users_from_sftpgo.py
@@ -0,0 +1,117 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Sync users from SFTPGo to Momentry users table.
+
+Usage:
+    python3 scripts/sync_users_from_sftpgo.py
+    python3 scripts/sync_users_from_sftpgo.py --sftpgo-url http://localhost:8080
+    python3 scripts/sync_users_from_sftpgo.py --db "dbname=momentry user=accusys"
+
+Environment:
+    SFTPGO_BASE_URL         Default: http://localhost:8080
+    DATABASE_URL            Default: dbname=momentry user=accusys host=localhost
+    SFTPGO_ADMIN_PASSWORD   SFTPGo admin password (used when --admin-pass is not given)
+
+This script does NOT copy passwords. It creates user records with placeholder
+password hashes. The real password will be captured on the user's first
+login through Momentry (which verifies against SFTPGo and caches the hash).
+"""
+import argparse
+import json
+import os
+import sys
+from typing import Any
+
+import psycopg2
+import psycopg2.extras
+import requests
+
+
+def get_sftpgo_users(sftpgo_url: str, admin_user: str, admin_pass: str) -> list[dict[str, Any]]:
+    """Return every user known to SFTPGo, following REST API pagination.
+
+    Args:
+        sftpgo_url: Base URL of the SFTPGo instance; a trailing slash is tolerated.
+        admin_user: Admin username, sent as HTTP basic auth to the token endpoint.
+        admin_pass: Password for ``admin_user``.
+
+    Returns:
+        The decoded user objects from ``/api/v2/users``, across all pages.
+
+    Raises:
+        requests.RequestException: On connection errors or non-2xx responses.
+
+    Exits the process with status 1 when the token response carries no
+    ``access_token``.
+    """
+    base_url = sftpgo_url.rstrip("/")
+
+    # Get admin token (SFTPGo uses GET, not POST)
+    token_url = f"{base_url}/api/v2/token"
+    resp = requests.get(token_url, auth=(admin_user, admin_pass), timeout=10)
+    resp.raise_for_status()
+    token = resp.json().get("access_token")
+    if not token:
+        print("ERROR: Failed to get SFTPGo admin token", file=sys.stderr)
+        sys.exit(1)
+
+    # List users page by page. Per the SFTPGo REST API a single request
+    # returns at most ``limit`` items (default 100, maximum 500), so a lone
+    # GET would silently drop users on larger installations.
+    users_url = f"{base_url}/api/v2/users"
+    headers = {"Authorization": f"Bearer {token}"}
+    page_size = 500
+    users: list[dict[str, Any]] = []
+    while True:
+        resp = requests.get(
+            users_url,
+            headers=headers,
+            params={"offset": len(users), "limit": page_size},
+            timeout=10,
+        )
+        resp.raise_for_status()
+        page = resp.json()
+        users.extend(page)
+        # A short (or empty) page means there is nothing left to fetch.
+        if len(page) < page_size:
+            return users
+
+
+def main():
+    """Create a Momentry ``users`` row for every active SFTPGo user.
+
+    Users without a username or with ``status != 1`` are skipped. Existing
+    usernames are left untouched (``ON CONFLICT DO NOTHING``). New rows get a
+    placeholder password hash and the role ``admin`` for the username
+    ``admin``, ``user`` otherwise. With ``--dry-run`` nothing is inserted.
+
+    Exits with status 1 when the SFTPGo user list cannot be fetched.
+    """
+    parser = argparse.ArgumentParser(description="Sync SFTPGo users to Momentry")
+    parser.add_argument("--sftpgo-url", default=os.getenv("SFTPGO_BASE_URL", "http://localhost:8080"))
+    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "dbname=momentry user=accusys host=localhost"))
+    parser.add_argument("--admin-user", default="admin")
+    # NOTE(review): a real-looking admin password is hard-coded as the fallback
+    # default. Kept for backward compatibility; consider requiring
+    # SFTPGO_ADMIN_PASSWORD / --admin-pass and rotating this credential.
+    parser.add_argument("--admin-pass", default=os.getenv("SFTPGO_ADMIN_PASSWORD", "Test3200Test3200"))
+    parser.add_argument("--dry-run", action="store_true", help="Print what would be done without executing")
+    args = parser.parse_args()
+
+    # Fetch users from SFTPGo
+    print(f"[SFTPGo] Connecting to {args.sftpgo_url}...")
+    try:
+        sftpgo_users = get_sftpgo_users(args.sftpgo_url, args.admin_user, args.admin_pass)
+    except Exception as e:
+        print(f"ERROR: Failed to fetch SFTPGo users: {e}", file=sys.stderr)
+        sys.exit(1)
+
+    print(f"[SFTPGo] Found {len(sftpgo_users)} users")
+
+    synced = 0
+    skipped = 0
+
+    # Connect to Momentry DB and set schema
+    conn = psycopg2.connect(args.db)
+    try:
+        cur = conn.cursor()
+        cur.execute("SET search_path TO dev")
+
+        for user in sftpgo_users:
+            username = user.get("username")
+            status = user.get("status", 0)
+
+            if not username or status != 1:
+                skipped += 1
+                continue
+
+            role = "admin" if username == "admin" else "user"
+            # Placeholder hash — will be updated on first login via SFTPGo fallback
+            placeholder_hash = "$placeholder$synced_from_sftpgo"
+
+            if args.dry_run:
+                print(f"  Would insert: {username} (role={role})")
+                synced += 1
+                continue
+
+            try:
+                # One savepoint per user: a failed statement aborts the whole
+                # PostgreSQL transaction, which would make every later INSERT
+                # fail and turn the final commit into a rollback of the rows
+                # that had succeeded.
+                cur.execute("SAVEPOINT sync_user")
+                cur.execute(
+                    "INSERT INTO users (username, password_hash, role) VALUES (%s, %s, %s) "
+                    "ON CONFLICT (username) DO NOTHING",
+                    (username, placeholder_hash, role),
+                )
+                inserted = cur.rowcount > 0
+                cur.execute("RELEASE SAVEPOINT sync_user")
+            except Exception as e:
+                print(f"  ❌ {username}: {e}", file=sys.stderr)
+                skipped += 1
+                # Undo only this user's statement so the loop can carry on.
+                cur.execute("ROLLBACK TO SAVEPOINT sync_user")
+                continue
+
+            if inserted:
+                print(f"  ✅ {username} (role={role})")
+                synced += 1
+            else:
+                print(f"  ⏭️  {username} already exists, skipped")
+                skipped += 1
+
+        conn.commit()
+        cur.close()
+    finally:
+        # Release the connection even when a statement above raises.
+        conn.close()
+
+    print(f"\nDone: {synced} synced, {skipped} skipped/errors")
+    print("Note: Password hashes are placeholders. First login via Momentry will cache the real hash.")
+
+# Run the sync only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/scripts/tmdb_agent.py
+++ b/scripts/tmdb_agent.py
@@ -0,0 +1,285 @@
+#!/opt/homebrew/bin/python3.11
+"""
+TMDb Agent — pre-fetch TMDb data and write directly to identity files.
+
+Usage:
+    python3 scripts/tmdb_agent.py --file-uuid <uuid>
+    python3 scripts/tmdb_agent.py --file-uuid <uuid> --db "dbname=momentry user=accusys"
+
+Environment:
+    TMDB_API_KEY          Required. TMDb API key.
+    MOMENTRY_OUTPUT_DIR   Default: /Users/accusys/momentry/output
+    DATABASE_URL          Default: dbname=momentry user=accusys host=localhost
+    DATABASE_SCHEMA       Optional. Schema that holds the videos table (default: unqualified "videos")
+
+Flow:
+    1. Query videos table for file_name
+    2. Extract movie name from filename
+    3. TMDB /search/movie → find best match
+    4. TMDB /movie/{id}/credits → fetch cast
+    5. TMDB /person/{id} → fetch person details
+    6. Write {OUTPUT}/identities/{uuid}/identity.json + profile.jpg for each cast member
+    7. Write {OUTPUT}/{uuid}.tmdb.json cache (movie info + identity uuid list)
+"""
+import argparse
+import hashlib
+import json
+import os
+import re
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import requests
+import psycopg2
+import psycopg2.extras
+
+
+# Base URL that every TMDb endpoint path in this module is appended to.
+TMDB_BASE = "https://api.themoviedb.org/3"
+# Read once at import time; main() exits with an error when it is unset.
+TMDB_API_KEY = os.getenv("TMDB_API_KEY")
+
+
+def extract_movie_name(filename: str) -> str | None:
+    """Derive a TMDb search query from a video file name.
+
+    The extension is dropped, ``.`` and ``_`` become spaces, text following a
+    ``|``, ``(``, ``[``, ``{`` or ``│`` separator is cut, and common release
+    tags (resolution, codec, source, audio layout, language, release group)
+    are removed, e.g. 'Charade_1963.mp4' → 'Charade 1963'.
+
+    Returns:
+        The cleaned name, or None when fewer than 3 characters remain.
+    """
+    name = Path(filename).stem
+    # Audio-channel tags have to go while the dot is still there: after the
+    # separator normalisation below "5.1" reads "5 1" and can no longer be
+    # recognised. The digit lookarounds keep e.g. "2015.1080p" intact.
+    name = re.sub(r'(?<!\d)[57]\.1(?!\d)', ' ', name)
+    cleaned = re.sub(r'[._]', ' ', name).strip()
+    # Strip text after separators like |, (, [, {
+    for sep in ('|', '(', '[', '{', '\u2502'):
+        idx = cleaned.find(sep)
+        # idx > 0: a separator in first position would leave nothing to keep.
+        if idx > 0:
+            cleaned = cleaned[:idx].strip()
+    # Strip common suffixes (quality, format, source, etc.)
+    suffixes = (
+        r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl',
+        r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid',
+        r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3',
+        r'dts', r'dual[ -]?audio', r'multi[ -]?sub',
+        r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut',
+        r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie',
+        r'english', r'french', r'spanish', r'german', r'chinese',
+        r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd',
+    )
+    pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
+    cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
+    # Collapse multiple spaces
+    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+    return cleaned if len(cleaned) >= 3 else None
+
+
+def search_movie(query: str) -> dict | None:
+    """Query TMDb's movie search for *query* and return the top hit.
+
+    Returns None when there are no results or the request fails (the failure
+    is reported on stderr).
+    """
+    endpoint = f"{TMDB_BASE}/search/movie"
+    search_params = {
+        "query": query,
+        "api_key": TMDB_API_KEY,
+        "language": "en-US",
+        "page": 1,
+    }
+    try:
+        response = requests.get(endpoint, params=search_params, timeout=15)
+        response.raise_for_status()
+        hits = response.json().get("results", [])
+        if not hits:
+            return None
+        return hits[0]
+    except Exception as e:
+        print(f"TMDB search failed: {e}", file=sys.stderr)
+        return None
+
+
+def get_credits(movie_id: int) -> list[dict]:
+    """Return the ``cast`` list of a TMDb movie, or [] when the request fails."""
+    endpoint = f"{TMDB_BASE}/movie/{movie_id}/credits"
+    try:
+        response = requests.get(
+            endpoint,
+            params={"api_key": TMDB_API_KEY, "language": "en-US"},
+            timeout=15,
+        )
+        response.raise_for_status()
+        payload = response.json()
+        return payload.get("cast", [])
+    except Exception as e:
+        print(f"TMDB credits failed: {e}", file=sys.stderr)
+        return []
+
+
+def get_person_details(person_id: int) -> dict:
+    """Fetch a fixed subset of the TMDb ``/person/{id}`` record.
+
+    Missing fields come back as None, except ``also_known_as`` which defaults
+    to an empty list. Returns {} when the request fails.
+    """
+    wanted = (
+        "biography",
+        "birthday",
+        "place_of_birth",
+        "also_known_as",
+        "imdb_id",
+        "known_for_department",
+        "popularity",
+        "deathday",
+        "gender",
+        "homepage",
+    )
+    endpoint = f"{TMDB_BASE}/person/{person_id}"
+    query = {"api_key": TMDB_API_KEY, "language": "en-US"}
+    try:
+        response = requests.get(endpoint, params=query, timeout=15)
+        response.raise_for_status()
+        record = response.json()
+        return {
+            key: record.get(key, [] if key == "also_known_as" else None)
+            for key in wanted
+        }
+    except Exception as e:
+        print(f"TMDB person details failed for {person_id}: {e}", file=sys.stderr)
+        return {}
+
+
+def main():
+    """Fetch TMDb data for one file and write identity files plus a movie cache.
+
+    Steps: look up the file name for ``--file-uuid`` in the videos table,
+    derive a movie name from it, take the first TMDb search hit and fetch its
+    cast. For every cast member write
+    ``{output}/identities/{identity_uuid}/identity.json`` (plus ``profile.jpg``
+    when TMDb lists a profile image and none is on disk yet), merge the new
+    entries into ``identities/_index.json`` and finally write
+    ``{output}/{file_uuid}.tmdb.json``.
+
+    Exits with status 1 when TMDB_API_KEY is unset, the file UUID is unknown,
+    no movie name can be extracted, or TMDb returns no match.
+    """
+    parser = argparse.ArgumentParser(description="TMDb Agent — pre-fetch cache")
+    parser.add_argument("--file-uuid", required=True, help="File UUID to enrich")
+    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "dbname=momentry user=accusys host=localhost"))
+    parser.add_argument("--output", default=os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output"))
+    args = parser.parse_args()
+
+    if not TMDB_API_KEY:
+        print("ERROR: TMDB_API_KEY not set.", file=sys.stderr)
+        sys.exit(1)
+
+    # 1. Query DB for file_name
+    schema = os.getenv("DATABASE_SCHEMA", "").strip()
+    table = f"{schema}.videos" if schema else "videos"
+    conn = psycopg2.connect(args.db)
+    try:
+        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
+        cur.execute(f"SELECT file_name FROM {table} WHERE file_uuid = %s", (args.file_uuid,))
+        row = cur.fetchone()
+        cur.close()
+    finally:
+        # Close the connection even when the query raises.
+        conn.close()
+
+    if not row:
+        print(f"ERROR: File not found: {args.file_uuid}", file=sys.stderr)
+        sys.exit(1)
+
+    file_name = row["file_name"]
+    print(f"[TKG-AGENT] File: {file_name} ({args.file_uuid})")
+
+    # 2. Extract movie name
+    movie_name = extract_movie_name(file_name)
+    if not movie_name:
+        print(f"ERROR: Cannot extract movie name from: {file_name}", file=sys.stderr)
+        sys.exit(1)
+    print(f"[TKG-AGENT] Extracted movie name: '{movie_name}'")
+
+    # 3. Search TMDB
+    movie = search_movie(movie_name)
+    if not movie:
+        print(f"ERROR: No TMDB movie found for: {movie_name}", file=sys.stderr)
+        sys.exit(1)
+    print(f"[TKG-AGENT] Matched: {movie['title']} (TMDB id={movie['id']})")
+
+    # 4. Fetch credits (an empty cast is not fatal: the cache is still written)
+    cast = get_credits(movie["id"])
+    if not cast:
+        print(f"WARN: No cast data found for movie {movie['id']}", file=sys.stderr)
+
+    # 5. Enrich each cast member with person details and write identity files
+    output = Path(args.output)
+    identities_root = output / "identities"
+    identities_root.mkdir(parents=True, exist_ok=True)
+
+    now = datetime.now(timezone.utc).isoformat()
+    created_identities = []
+
+    for i, m in enumerate(cast):
+        person_id = m["id"]
+        person = get_person_details(person_id)
+
+        # Deterministic identity id: first 32 hex chars of
+        # SHA256("tmdb-{movie_id}-{person_id}-{name}"), so re-running the
+        # agent for the same movie maps each person to the same directory.
+        uuid_raw = hashlib.sha256(f"tmdb-{movie['id']}-{person_id}-{m['name']}".encode()).hexdigest()[:32]
+        profile_url = (
+            f"https://image.tmdb.org/t/p/w185{m['profile_path']}"
+            if m.get("profile_path") else None
+        )
+
+        # Build identity.json
+        metadata = {
+            "tmdb_character": m.get("character", ""),
+            "tmdb_cast_order": i,
+            "tmdb_movie_id": movie["id"],
+            "tmdb_movie_title": movie["title"],
+            "tmdb_biography": person.get("biography"),
+            "tmdb_birthday": person.get("birthday"),
+            "tmdb_place_of_birth": person.get("place_of_birth"),
+            "tmdb_aliases": person.get("also_known_as", []),
+            "tmdb_imdb_id": person.get("imdb_id"),
+            "tmdb_department": person.get("known_for_department"),
+            "tmdb_popularity": person.get("popularity"),
+            "tmdb_deathday": person.get("deathday"),
+            "tmdb_gender": person.get("gender"),
+            "tmdb_homepage": person.get("homepage"),
+        }
+
+        identity = {
+            "version": 1,
+            "identity_uuid": uuid_raw,
+            "name": m["name"],
+            "identity_type": "people",
+            "source": "tmdb",
+            "status": "confirmed",
+            "tmdb_id": person_id,
+            "tmdb_profile": profile_url,
+            # Drop unset fields, but always keep the aliases key.
+            "metadata": {k: v for k, v in metadata.items() if v is not None or k == "tmdb_aliases"},
+            "file_bindings": [],
+            "created_at": now,
+            "updated_at": now,
+        }
+
+        # Write identity.json
+        identity_dir = identities_root / uuid_raw
+        identity_dir.mkdir(parents=True, exist_ok=True)
+        identity_path = identity_dir / "identity.json"
+        with open(identity_path, "w", encoding="utf-8") as f:
+            json.dump(identity, f, indent=2, ensure_ascii=False)
+
+        # Download profile.jpg (skipped when a previous run already saved it)
+        if profile_url:
+            img_path = identity_dir / "profile.jpg"
+            if not img_path.exists():
+                try:
+                    resp = requests.get(profile_url, timeout=15)
+                    if resp.status_code == 200:
+                        img_path.write_bytes(resp.content)
+                    else:
+                        # Report instead of silently leaving the image out.
+                        print(f"  [WARN] Failed to download profile for {m['name']}: HTTP {resp.status_code}", file=sys.stderr)
+                except Exception as e:
+                    print(f"  [WARN] Failed to download profile for {m['name']}: {e}", file=sys.stderr)
+
+        created_identities.append({
+            "identity_uuid": uuid_raw,
+            "name": m["name"],
+            "tmdb_id": person_id,
+            "character": m.get("character", ""),
+            "order": i,
+        })
+
+        if (i + 1) % 5 == 0:
+            print(f"[TKG-AGENT] Wrote {i+1}/{len(cast)} identity files")
+
+    # Update _index.json
+    index_path = identities_root / "_index.json"
+    index = {}
+    if index_path.exists():
+        # Read with the same encoding the file is written in below; names are
+        # stored unescaped (ensure_ascii=False), so the locale default could
+        # fail to decode non-ASCII entries.
+        with open(index_path, encoding="utf-8") as f:
+            index = json.load(f)
+    for ci in created_identities:
+        index[ci["identity_uuid"]] = ci["name"]
+    with open(index_path, "w", encoding="utf-8") as f:
+        json.dump(index, f, indent=2, ensure_ascii=False)
+
+    # Write movie cache ({uuid}.tmdb.json) — simplified, no per-person data
+    cache = {
+        "file_uuid": args.file_uuid,
+        "fetched_at": now,
+        "source": "agent",
+        "movie": {
+            "tmdb_id": movie["id"],
+            "title": movie["title"],
+            "release_date": movie.get("release_date"),
+            "overview": movie.get("overview"),
+            "poster_path": movie.get("poster_path"),
+        },
+        "cast_count": len(cast),
+        "identities_created": len(created_identities),
+        "identities": created_identities,
+    }
+
+    cache_path = output / f"{args.file_uuid}.tmdb.json"
+    with open(cache_path, "w", encoding="utf-8") as f:
+        json.dump(cache, f, indent=2, ensure_ascii=False)
+
+    print(f"[TKG-AGENT] Cache written: {cache_path}")
+    print(f"[TKG-AGENT] Identity files: {len(created_identities)} cast members → {identities_root}/")
+
+
+# Run the agent only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/scripts/visual_chunk_processor.py
+++ b/scripts/visual_chunk_processor.py
@@ -384,6 +384,7 @@ def main():
    parser.add_argument("video_path", help="視頻文件路徑")
    parser.add_argument("output_path", help="輸出文件路徑")
    parser.add_argument("--yolo-result", help="YOLO 結果文件路徑（可選）")
+    parser.add_argument("--uuid", help="檔案 UUID（由 executor 傳入）")
    parser.add_argument(
        "--strategy", choices=["fixed", "similarity"], default="fixed", help="分片策略"
    )