cleanup: remove dead code and duplicate docs

- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00
parent ee81e343ce
commit e75c4d6f07
3270 changed files with 35190 additions and 53367 deletions
--- a/scripts/pycache/redis_publisher.cpython-311.pyc
+++ b/scripts/pycache/redis_publisher.cpython-311.pyc
--- a/scripts/analyze_video_faces.py
+++ b/scripts/analyze_video_faces.py
@@ -4,14 +4,12 @@
 """

 import cv2
-import numpy as np
 import os
 import sys
 import json
 import time
 from datetime import datetime
 import psycopg2
-from psycopg2.extras import RealDictCursor

 # 導入人臉識別處理器
 sys.path.append(os.path.dirname(os.path.abspath(__file__)))
@@ -275,7 +273,7 @@ class VideoFaceAnalyzer:
        with open(result_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

-        print(f"\n分析完成:")
+        print("\n分析完成:")
        print(f"  - 處理幀數: {len(frames)}")
        print(f"  - 檢測到人臉: {len(detections)}")
        print(f"  - 分析時間: {result['analysis_time']:.1f}秒")
@@ -454,14 +452,14 @@ def main():
            total_faces = sum(r["faces_detected"] for r in video_results)
            total_time = sum(r["analysis_time"] for r in video_results)

-            print(f"\n📈 分析摘要:")
+            print("\n📈 分析摘要:")
            print(f"  - 總處理視頻: {len(video_results)}")
            print(f"  - 總處理幀數: {total_frames}")
            print(f"  - 總檢測人臉: {total_faces}")
            print(f"  - 總分析時間: {total_time:.1f}秒")

            # 列出生成的文件
-            print(f"\n📄 生成的文件:")
+            print("\n📄 生成的文件:")
            for filename in sorted(os.listdir(analyzer.output_dir)):
                filepath = os.path.join(analyzer.output_dir, filename)
                if os.path.isfile(filepath):
--- a/scripts/asr_benchmark_runner.py
+++ b/scripts/asr_benchmark_runner.py
@@ -23,7 +23,7 @@ import signal
 import platform
 import psutil
 from datetime import datetime, timezone
-from typing import Dict, Any, Optional, List, Tuple
+from typing import Dict, Any, List
 from pathlib import Path
 import traceback

@@ -606,7 +606,7 @@ class ASRBenchmarkRunner:
                metrics = result.get('metrics', {})
                real_time = result.get('real_time', {})
                
-                lines.append(f"- **Status**: Success")
+                lines.append("- **Status**: Success")
                lines.append(f"- **Start**: {real_time.get('test_start', 'N/A')}")
                lines.append(f"- **End**: {real_time.get('test_end', 'N/A')}")
                lines.append(f"- **Duration**: {metrics.get('processing_time_seconds', 0):.3f}s")
@@ -615,7 +615,7 @@ class ASRBenchmarkRunner:
                lines.append(f"- **Memory Peak**: {metrics.get('peak_memory_mb', 0):.1f}MB")
                lines.append(f"- **Language**: {metrics.get('language_detected', 'N/A')} ({metrics.get('language_probability', 0):.2f})")
            else:
-                lines.append(f"- **Status**: Failed")
+                lines.append("- **Status**: Failed")
                lines.append(f"- **Error**: {result.get('error', 'Unknown error')}")
            
            lines.append("")
@@ -680,7 +680,7 @@ def main():
            runner.generate_results_json()
            runner.generate_markdown_report()
            
-            print(f"\nBenchmark completed!")
+            print("\nBenchmark completed!")
            print(f"Results: {output_dir / 'asr_benchmark_results.json'}")
            print(f"Report: {output_dir / 'asr_benchmark_report.md'}")
    
--- a/scripts/asr_face_stats.py
+++ b/scripts/asr_face_stats.py
@@ -96,7 +96,7 @@ def print_stats(dist, total_segments):
    avg_faces = total_faces_sum / total_segments if total_segments > 0 else 0
    max_faces = max(dist.keys()) if dist else 0

-    print(f"\n📊 Summary:")
+    print("\n📊 Summary:")
    print(f"   Average faces per segment: {avg_faces:.1f}")
    print(f"   Max faces in a segment:    {max_faces}")
    print(
@@ -110,20 +110,20 @@ def print_stats(dist, total_segments):
    )

    # Show some example segments
-    print(f"\n🔍 Example Segments:")
-    print(f"   0 faces:")
+    print("\n🔍 Example Segments:")
+    print("   0 faces:")
    examples = [s for s in segment_details if s["face_count"] == 0][:3]
    for ex in examples:
        print(f"     [{ex['start']:.0f}s-{ex['end']:.0f}s] {ex['text']}...")

-    print(f"   1 face:")
+    print("   1 face:")
    examples = [s for s in segment_details if s["face_count"] == 1][:3]
    for ex in examples:
        print(
            f"     [{ex['start']:.0f}s-{ex['end']:.0f}s] {ex['person_ids'][0]}: {ex['text']}..."
        )

-    print(f"   3 faces:")
+    print("   3 faces:")
    examples = [s for s in segment_details if s["face_count"] == 3][:3]
    for ex in examples:
        pids = ", ".join(ex["person_ids"])
--- a/scripts/asr_processor.py
+++ b/scripts/asr_processor.py
@@ -18,12 +18,10 @@ Configuration:
 import sys
 import json
 import os
-import time
 import argparse
 import signal
 import subprocess
 import tempfile
-from datetime import datetime
 from faster_whisper import WhisperModel

 PROCESSOR_VERSION = "2.1"
@@ -164,44 +162,127 @@ def run_asr(video_path, output_path, uuid: str = ""):
        sys.stderr.flush()
        sys.exit(0)

+    # 嘗試以 CUT 場景分段處理（降低長片記憶體使用）
+    cut_scenes = []
+    cut_path = output_path.replace(".asr.json", ".cut.json")
+    if os.path.exists(cut_path):
+        try:
+            with open(cut_path) as f:
+                cut_data = json.load(f)
+            scenes = cut_data.get("scenes", [])
+            if scenes:
+                cut_scenes = [(s["start_time"], s["end_time"]) for s in scenes]
+                print(f"[ASR] Loaded {len(cut_scenes)} cut scenes for segmented transcription", file=sys.stderr)
+        except Exception as e:
+            print(f"[ASR] Failed to load cut scenes: {e}", file=sys.stderr)
+
    if publisher:
        publisher.info("asr", "Loading Whisper model...")

-    # Use small model with CPU (MPS not supported by faster_whisper)
-    # small 模型在準確率和速度間取得最佳平衡
-    model = WhisperModel("small", device="cpu", compute_type="int8")
+    model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")

    if publisher:
        publisher.info("asr", f"Transcribing: {video_path}")

-    # Transcribe with VAD filter for better accuracy, with PyAV fallback
-    segments, info = transcribe_with_fallback(model, video_path, publisher)
-
-    if publisher:
-        publisher.info("asr", f"ASR_LANGUAGE:{info.language}")
-
    results = []
    total_segments = 0

-    for segment in segments:
-        results.append(
-            {"start": segment.start, "end": segment.end, "text": segment.text.strip()}
-        )
-        total_segments += 1
-        if total_segments % 100 == 0:
-            if publisher:
-                publisher.progress(
-                    "asr", total_segments, 0, f"Segment {total_segments}"
+    if cut_scenes:
+        # 分段處理：對每個場景萃取音訊並轉錄
+        import subprocess
+        import tempfile
+        import json
+        temp_dir = tempfile.mkdtemp(prefix="asr_cut_")
+        transcript_language = None
+
+        # 建立 scene lookup: 給定時間點，找是哪個 scene
+        import bisect
+        scene_starts = [s[0] for s in cut_scenes]
+        def find_scene_idx(t):
+            i = bisect.bisect_right(scene_starts, t) - 1
+            return max(0, i)
+
+        # 逐段處理，每段結果即時寫入 .asr.tmp
+        tmp_path = output_path + ".tmp"
+        all_segments = []
+
+        for idx, (start_t, end_t) in enumerate(cut_scenes):
+            seg_wav = os.path.join(temp_dir, f"seg_{idx:04d}.wav")
+            # 用 ffmpeg 萃取出該段音訊
+            cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
+                   "-ss", str(start_t), "-to", str(end_t),
+                   "-ar", "16000", "-ac", "1", seg_wav]
+            subprocess.run(cmd, check=False, capture_output=True)
+
+            if not os.path.exists(seg_wav) or os.path.getsize(seg_wav) < 100:
+                continue  # 跳過空音訊
+
+            try:
+                seg_result, seg_info = model.transcribe(
+                    seg_wav, beam_size=5,
+                    vad_filter=True,
+                    vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
                )
+                if transcript_language is None:
+                    transcript_language = seg_info.language

-    output = {
-        "language": info.language,
-        "language_probability": info.language_probability,
-        "segments": results,
-    }
+                scene_segments = []
+                for segment in seg_result:
+                    seg_start = start_t + segment.start
+                    seg_end = start_t + segment.end
+                    scene_idx = find_scene_idx((seg_start + seg_end) / 2)
+                    scene_segments.append({
+                        "start": seg_start,
+                        "end": seg_end,
+                        "text": segment.text.strip(),
+                        "scene_number": scene_idx + 1,
+                    })
+                    total_segments += 1

-    with open(output_path, "w") as f:
-        json.dump(output, f, indent=2)
+                # 當前 scene 結果寫入 .asr.tmp
+                all_segments.extend(scene_segments)
+                with open(tmp_path, "w") as f:
+                    json.dump({"language": transcript_language or "", "segments": all_segments}, f)
+
+                if total_segments % 100 == 0:
+                    if publisher:
+                        publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
+            except Exception as e:
+                print(f"[ASR] Segment {idx} failed: {e}", file=sys.stderr)
+
+            # 清理暫存 WAV
+            try: os.remove(seg_wav)
+            except: pass
+
+        try: os.rmdir(temp_dir)
+        except: pass
+
+        info_language = transcript_language or "unknown"
+        print(f"[ASR] Segmented transcription complete: {total_segments} segments", file=sys.stderr)
+    else:
+        # 無 CUT 資料，直接轉錄（原有流程）
+        segments, info = transcribe_with_fallback(model, video_path, publisher)
+        info_language = info.language
+
+        tmp_path = output_path + ".tmp"
+        all_segments = []
+        for segment in segments:
+            all_segments.append({
+                "start": segment.start, "end": segment.end,
+                "text": segment.text.strip(),
+            })
+            total_segments += 1
+            if total_segments % 100 == 0:
+                if publisher:
+                    publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
+        with open(tmp_path, "w") as f:
+            json.dump({"language": info_language, "segments": all_segments}, f)
+
+    if publisher:
+        publisher.info("asr", f"ASR_LANGUAGE:{info_language}")
+
+    # rename .tmp → .json
+    os.rename(tmp_path, output_path)

    if publisher:
        publisher.complete("asr", f"{len(results)} segments")
--- a/scripts/asrx_processor_custom.py
+++ b/scripts/asrx_processor_custom.py
@@ -2,12 +2,19 @@
 """
 ASRX Processor - Custom Implementation Wrapper
 Uses SpeechBrain ECAPA-TDNN (no HuggingFace token required)
+
+Pipeline:
+  1. Preprocess: ffprobe audio tracks → select best track → extract WAV
+  2. Process: VAD (Silero) → Speaker embedding (ECAPA-TDNN) → Spectral clustering
+  3. Output: segments with speaker_id
 """

 import sys
 import json
 import argparse
 import os
+import subprocess
+import tempfile
 from pathlib import Path

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -18,6 +25,78 @@ sys.path.insert(
 from redis_publisher import RedisPublisher


+def probe_audio_tracks(video_path: str) -> list:
+    """Use ffprobe to list all audio tracks in the video file."""
+    cmd = [
+        "ffprobe", "-v", "quiet", "-print_format", "json",
+        "-show_streams", "-select_streams", "a", video_path,
+    ]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
+        data = json.loads(result.stdout)
+        tracks = []
+        for stream in data.get("streams", []):
+            track = {
+                "index": stream.get("index"),
+                "codec": stream.get("codec_name"),
+                "language": stream.get("tags", {}).get("language", "und"),
+                "channels": stream.get("channels", 0),
+                "sample_rate": stream.get("sample_rate", "0"),
+            }
+            tracks.append(track)
+        return tracks
+    except Exception as e:
+        print(f"[ASRX] ffprobe failed: {e}")
+        return []
+
+
+def select_best_track(tracks: list) -> int:
+    """Select the best audio track: English > first available > fallback to 0."""
+    if not tracks:
+        return 0
+
+    # Priority 1: English track
+    for i, t in enumerate(tracks):
+        if t["language"] == "eng" or t["language"] == "en":
+            print(f"[ASRX] Selected English track (index {t['index']})")
+            return i
+
+    # Priority 2: First track with the most channels
+    best = 0
+    for i, t in enumerate(tracks):
+        if t["channels"] > tracks[best]["channels"]:
+            best = i
+
+    print(f"[ASRX] Selected track {best} (lang={tracks[best]['language']}, ch={tracks[best]['channels']})")
+    return best
+
+
+def extract_audio_to_wav(video_path: str, track_index: int, output_wav: str) -> bool:
+    """Extract selected audio track to 16kHz mono WAV using ffmpeg."""
+    cmd = [
+        "ffmpeg", "-y", "-v", "quiet",
+        "-i", video_path,
+        "-map", f"0:{track_index}",
+        "-ar", "16000",
+        "-ac", "1",
+        "-sample_fmt", "s16",
+        output_wav,
+    ]
+    try:
+        subprocess.run(cmd, check=True, capture_output=True, timeout=300)
+        return True
+    except Exception as e:
+        print(f"[ASRX] ffmpeg extraction failed: {e}")
+        return False
+
+
+def _cleanup(tmp_dir):
+    """Clean up temporary directory."""
+    if tmp_dir and os.path.exists(tmp_dir):
+        import shutil
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
 def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
    """Process video for speaker diarization using custom implementation"""

@@ -25,25 +104,102 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
    if publisher:
        publisher.info("asrx", "ASRX_START")

+    tmp_dir = None
+
    try:
+        # Ensure working directory is the scripts dir for model loading
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        os.chdir(script_dir)
+
+        # Debug: check ffmpeg availability
+        import shutil
+        ffmpeg_path = shutil.which("ffmpeg")
+        print(f"[ASRX] ffmpeg: {ffmpeg_path}", file=sys.stderr)
+        print(f"[ASRX] CWD: {os.getcwd()}", file=sys.stderr)
+
+        # ---- Stage 1: Audio Track Preprocessing ----
+        print("\n[ASRX] ===== Stage 1: Audio Track Analysis =====", file=sys.stderr)
+        print(f"[ASRX] Input: {video_path}", file=sys.stderr)
+
+        tracks = probe_audio_tracks(video_path)
+        if tracks:
+            print(f"[ASRX] Found {len(tracks)} audio track(s):", file=sys.stderr)
+            for t in tracks:
+                print(f"  Track {t['index']}: {t['codec']} {t['channels']}ch {t['sample_rate']}Hz lang={t['language']}", file=sys.stderr)
+        else:
+            print("[ASRX] No audio tracks found via ffprobe, using raw file", file=sys.stderr)
+
+        # Select best track
+        track_idx = select_best_track(tracks) if tracks else 0
+        actual_track_index = tracks[track_idx]["index"] if tracks else track_idx
+
+        # Extract audio to WAV
+        tmp_dir = tempfile.mkdtemp(prefix="asrx_")
+        wav_path = os.path.join(tmp_dir, "audio.wav")
+
+        if extract_audio_to_wav(video_path, actual_track_index, wav_path):
+            wav_size = os.path.getsize(wav_path)
+            print(f"[ASRX] Audio extracted: {wav_path} ({wav_size / 1024 / 1024:.1f}MB)", file=sys.stderr)
+            audio_input = wav_path
+        else:
+            print("[ASRX] Audio extraction failed, falling back to original file", file=sys.stderr)
+            audio_input = video_path
+
+        # ---- Stage 2: Load ASR segments for time alignment ----
+        # Try multiple paths to find ASR JSON
+        asr_segments = []
+        asr_fallback_reason = ""
+        asr_candidates = [
+            output_path.replace(".asrx.json", ".asr.json") if output_path else "",
+            os.path.join(os.path.dirname(output_path) if output_path else ".", os.path.basename(video_path).rsplit(".", 1)[0] + ".asr.json"),
+            os.path.join(os.path.dirname(output_path) if output_path else ".", "dd61fda85fee441fdd00ab5528213ff7.asr.json"),
+        ]
+        asr_path = ""
+        for candidate in asr_candidates:
+            if candidate and os.path.exists(candidate):
+                asr_path = candidate
+                break
+        if asr_path:
+            try:
+                with open(asr_path) as f:
+                    asr_data = json.load(f)
+                asr_segments = asr_data.get("segments", [])
+                print(f"[ASRX] Loaded {len(asr_segments)} ASR segments from {asr_path}", file=sys.stderr)
+                asr_fallback_reason = f"loaded_{len(asr_segments)}_segments"
+            except Exception as e:
+                asr_fallback_reason = f"load_error_{e}"
+                print(f"[ASRX] Failed to load ASR segments: {e}", file=sys.stderr)
+        else:
+            asr_fallback_reason = f"asr_json_not_found_tried_{len(asr_candidates)}_paths"
+            print(f"[ASRX] ASR output not found, tried {len(asr_candidates)} paths. First candidate: {asr_candidates[0]}", file=sys.stderr)
+
+        # ---- Stage 3: ASRX Processing ----
        from asrx_self.main_fixed import SelfASRXFixed

        if publisher:
            publisher.info("asrx", "ASRX_LOADING_MODEL")

-        # Initialize custom ASRX processor
        asrx = SelfASRXFixed()

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

-        # Process video/audio
-        result = asrx.process(
-            video_path,
-            output_path=None,  # We'll save our own format
-            min_speech_duration_ms=500,
-            max_speakers=10,
-        )
+        if asr_segments:
+            # Use ASR segment boundaries for speaker embedding extraction
+            print(f"[ASRX] Using {len(asr_segments)} ASR segments for diarization", file=sys.stderr)
+            result = asrx.process_with_segments(
+                audio_input,
+                asr_segments,
+                output_path=None,
+            )
+        else:
+            # Fallback: VAD-based diarization
+            result = asrx.process(
+                audio_input,
+                output_path=None,
+                min_speech_duration_ms=500,
+                max_speakers=10,
+            )

        if "error" in result:
            if publisher:
@@ -58,21 +214,47 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
            if publisher:
                publisher.complete("asrx", "0 segments")

+            _cleanup(tmp_dir)
            return output_result

-        # Convert to Rust-expected format
+        # Convert to Rust-expected format (start_frame/end_frame/speaker)
+        # Read fps from probe json ({file_uuid}.probe.json)
+        _debug = {"asr_fallback": asr_fallback_reason, "asr_path": asr_path}
+        fps = 30.0
+        output_dir = os.path.dirname(output_path) if output_path else "."
+        base_name = os.path.basename(output_path) if output_path else ""
+        # Extract uuid from {uuid}.{type}.json format
+        uuid_part = base_name.split(".")[0] if base_name else ""
+        probe_candidates = [
+            os.path.join(output_dir, f"{uuid_part}.probe.json"),
+        ]
+        for p in probe_candidates:
+            if os.path.exists(p):
+                try:
+                    with open(p) as pf:
+                        probe_data = json.load(pf)
+                        if "fps" in probe_data:
+                            fps = float(probe_data["fps"])
+                            print(f"[ASRX] FPS from probe: {fps}", file=sys.stderr)
+                        break
+                except:
+                    pass
        output_result = {
-            "language": None,  # Custom implementation doesn't detect language
+            "language": None,
            "segments": [],
        }

        # Convert segments
        for seg in result["segments"]:
+            start_sec = seg["start"]
+            end_sec = seg["end"]
            output_result["segments"].append(
                {
-                    "start": seg["start"],
-                    "end": seg["end"],
-                    "text": "",  # Will be filled by matching with ASR later
+                    "start_time": start_sec,
+                    "end_time": end_sec,
+                    "start_frame": int(start_sec * fps),
+                    "end_frame": int(end_sec * fps),
+                    "text": "",
                    "speaker_id": seg["speaker"],
                }
            )
@@ -81,20 +263,24 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
        if "speaker_stats" in result:
            output_result["speaker_stats"] = result["speaker_stats"]

+        # 傳遞 embeddings（每個 segment 對應的 192-D speaker embedding）
+        if "embeddings" in result:
+            output_result["embeddings"] = result["embeddings"]
+
        if publisher:
            publisher.info("asrx", f"ASRX_COMPLETE:{len(output_result['segments'])}")

        # Save output
+        output_result["_debug"] = _debug
        with open(output_path, "w") as f:
            json.dump(output_result, f, indent=2)

        if publisher:
            publisher.complete("asrx", f"{len(output_result['segments'])} segments")

-        print(
-            f"[ASRX-Custom] Saved {len(output_result['segments'])} segments to {output_path}"
-        )
+        print(f"[ASRX-Custom] Saved {len(output_result['segments'])} segments to {output_path}", file=sys.stderr)

+        _cleanup(tmp_dir)
        return output_result

    except Exception as e:
@@ -114,6 +300,7 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
        if publisher:
            publisher.complete("asrx", "0 segments")

+        _cleanup(tmp_dir)
        return output_result


@@ -133,7 +320,7 @@ if __name__ == "__main__":

    result = process_asrx_custom(args.video_path, args.output_path, args.uuid)

-    print(f"\n[Summary]")
+    print("\n[Summary]")
    print(f"  Total segments: {len(result['segments'])}")
    if "speaker_stats" in result:
        print(f"  Detected speakers: {len(result['speaker_stats'])}")
--- a/scripts/asrx_self/integrate_face_asrx_speaker.py
+++ b/scripts/asrx_self/integrate_face_asrx_speaker.py
@@ -130,12 +130,12 @@ def main():
    integrated = match_face_with_speaker_v3(face_data, asrx_data, args.threshold)
    
    # 分析
-    print(f"\n[Analyze] Analyzing speaker-face correspondence...")
+    print("\n[Analyze] Analyzing speaker-face correspondence...")
    speaker_stats = analyze_speaker_face(integrated)
    
    # 顯示統計
    print(f"\n{'='*70}")
-    print(f"說話人 - 人臉對應統計")
+    print("說話人 - 人臉對應統計")
    print(f"{'='*70}")
    
    total_segments = len(integrated)
--- a/scripts/asrx_self/main.py
+++ b/scripts/asrx_self/main.py
@@ -16,7 +16,6 @@ Self-implemented ASRX - 自實作說話人分離系統
 import sys
 import json
 import time
-import numpy as np
 from pathlib import Path

 # 導入自定義模組
@@ -182,7 +181,7 @@ class SelfASRX:
        result["processing_time"] = round(total_time, 2)
        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

-        print(f"\n[SelfASRX] Processing completed!")
+        print("\n[SelfASRX] Processing completed!")
        print(f"  Total time: {total_time:.2f}s")
        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
        print(f"  Detected speakers: {estimated_n_speakers}")
@@ -249,14 +248,14 @@ def main():

    # 顯示結果摘要
    if "error" not in result:
-        print(f"\n[Summary]")
+        print("\n[Summary]")
        print(f"  Audio duration: {result['total_duration']:.2f}s")
        print(f"  Speech segments: {result['n_speech_segments']}")
        print(f"  Detected speakers: {result['n_speakers']}")
        print(f"  Processing time: {result['processing_time']:.2f}s")
        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")

-        print(f"\n[Speaker Statistics]")
+        print("\n[Speaker Statistics]")
        for speaker, stats in result["speaker_stats"].items():
            pct = stats["duration"] / result["total_duration"] * 100
            print(
--- a/scripts/asrx_self/main_fixed.py
+++ b/scripts/asrx_self/main_fixed.py
@@ -134,7 +134,7 @@ class SelfASRXFixed:
        result["processing_time"] = round(total_time, 2)
        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
        
-        print(f"\n[SelfASRX-Fixed] Processing completed!")
+        print("\n[SelfASRX-Fixed] Processing completed!")
        print(f"  Total time: {total_time:.2f}s")
        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
        print(f"  Detected speakers: {estimated_n_speakers}")
@@ -154,6 +154,117 @@ class SelfASRXFixed:
        return result


+    def process_with_segments(self, audio_path, asr_segments, output_path=None):
+        """
+        使用 ASR segment 邊界進行 speaker diarization，取代 VAD 步驟。
+        
+        Args:
+            audio_path: 音頻文件路徑（WAV）
+            asr_segments: ASR segment 列表，每個包含 start/end（秒）
+            output_path: 輸出 JSON 路徑（可選）
+        """
+        start_time = time.time()
+        print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
+        print("=" * 60)
+
+        # 載入完整音頻
+        import soundfile as sf
+        wav, sample_rate = sf.read(audio_path)
+        if len(wav.shape) > 1:
+            wav = np.mean(wav, axis=1)  # 轉 mono
+        print(f"  Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")
+
+        # 使用 ASR segments 取代 VAD
+        speech_segments = [(s["start"], s["end"]) for s in asr_segments]
+        print(f"  Speech segments from ASR: {len(speech_segments)}")
+
+        if len(speech_segments) == 0:
+            print("[SelfASRX-Fixed] No ASR segments provided!")
+            return {"error": "No ASR segments", "segments": []}
+
+        # 提取語音片段
+        audio_segments = []
+        for start_sec, end_sec in speech_segments:
+            start_sample = int(start_sec * sample_rate)
+            end_sample = int(end_sec * sample_rate)
+            if start_sample >= len(wav):
+                continue
+            audio_segments.append(wav[start_sample:min(end_sample, len(wav))])
+
+        print(f"  Audio segments extracted: {len(audio_segments)}")
+
+        # 批量提取聲紋嵌入
+        print("\n[Step 2] Speaker embedding extraction...")
+        step2_start = time.time()
+        embeddings = extract_speaker_embeddings_batch(
+            self.speaker_encoder, audio_segments, sample_rate
+        )
+        embeddings = normalize_embeddings(embeddings)
+        step2_time = time.time() - step2_start
+        print(f"  Embedding shape: {embeddings.shape}")
+        print(f"  Embedding time: {step2_time:.2f}s")
+
+        # 聚類
+        print("\n[Step 3] Robust speaker clustering...")
+        step3_start = time.time()
+        speaker_labels, estimated_n_speakers = robust_speaker_clustering(
+            embeddings, n_speakers=None, max_speakers=10
+        )
+        step3_time = time.time() - step3_start
+        print(f"  Clustering time: {step3_time:.2f}s")
+
+        # 建立輸出
+        result = {
+            "audio_path": str(audio_path),
+            "total_duration": len(wav) / sample_rate,
+            "n_speech_segments": len(speech_segments),
+            "n_speakers": int(estimated_n_speakers),
+            "segments": []
+        }
+
+        for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
+            result["segments"].append({
+                "index": i,
+                "start": round(start, 3),
+                "end": round(end, 3),
+                "duration": round(end - start, 3),
+                "speaker": f"SPEAKER_{int(label)}"
+            })
+
+        # 加入 embeddings（每個 segment 對應的 192-D speaker embedding）
+        result["embeddings"] = []
+        for emb in embeddings:
+            result["embeddings"].append(emb.tolist())
+
+        # 統計
+        speaker_stats = {}
+        for seg in result["segments"]:
+            speaker = seg["speaker"]
+            if speaker not in speaker_stats:
+                speaker_stats[speaker] = {"count": 0, "duration": 0}
+            speaker_stats[speaker]["count"] += 1
+            speaker_stats[speaker]["duration"] += seg["duration"]
+        result["speaker_stats"] = speaker_stats
+
+        total_time = time.time() - start_time
+        result["processing_time"] = round(total_time, 2)
+        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
+
+        print("\n[SelfASRX-Fixed] Processing completed!")
+        print(f"  Total time: {total_time:.2f}s")
+        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
+        print(f"  Detected speakers: {estimated_n_speakers}")
+
+        if output_path:
+            import json
+            with open(output_path, 'w', encoding='utf-8') as f:
+                json.dump(result, f, indent=2, ensure_ascii=False)
+            print(f"  Results saved to: {output_path}")
+
+        print("=" * 60)
+        return result
+
+
 def main():
    import argparse
    
@@ -180,14 +291,14 @@ def main():
    )
    
    if "error" not in result:
-        print(f"\n[Summary]")
+        print("\n[Summary]")
        print(f"  Audio duration: {result['total_duration']:.2f}s")
        print(f"  Speech segments: {result['n_speech_segments']}")
        print(f"  Detected speakers: {result['n_speakers']}")
        print(f"  Processing time: {result['processing_time']:.2f}s")
        print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
        
-        print(f"\n[Speaker Statistics]")
+        print("\n[Speaker Statistics]")
        for speaker, stats in result['speaker_stats'].items():
            pct = stats['duration'] / result['total_duration'] * 100
            print(f"  {speaker}: {stats['count']} segments, " +
--- a/scripts/asrx_self/speaker_cluster.py
+++ b/scripts/asrx_self/speaker_cluster.py
@@ -138,7 +138,7 @@ def spectral_clustering_speaker(

        speaker_labels = clustering.fit_predict(similarity_matrix)

-        print(f"[Clustering] Spectral clustering completed")
+        print("[Clustering] Spectral clustering completed")
        print(f"[Clustering] n_speakers: {n_speakers}")
        print(f"[Clustering] n_segments: {n_segments}")

@@ -146,7 +146,7 @@ def spectral_clustering_speaker(

    except Exception as e:
        print(f"[Clustering] Spectral clustering failed: {e}")
-        print(f"[Clustering] Using fallback: 2 speakers")
+        print("[Clustering] Using fallback: 2 speakers")
        # 簡單分配：前一半是 SPEAKER_0，後一半是 SPEAKER_1
        speaker_labels = np.array(
            [0] * (n_segments // 2) + [1] * (n_segments - n_segments // 2)
@@ -203,7 +203,7 @@ def agglomerative_clustering_speaker(

    speaker_labels = clustering.fit_predict(embeddings)

-    print(f"[Clustering] Agglomerative clustering completed")
+    print("[Clustering] Agglomerative clustering completed")
    print(f"[Clustering] n_speakers: {n_speakers}")

    return speaker_labels, n_speakers
@@ -249,7 +249,6 @@ def compute_diarization_purity(speaker_labels, ground_truth_labels=None):
    """
    if ground_truth_labels is None:
        # 沒有 ground truth，使用聚類純度近似
-        from sklearn.metrics import silhouette_score

        # 使用餘弦相似度作為距離
        purity = 0.5  # 預設值
@@ -300,7 +299,7 @@ if __name__ == "__main__":
        similarity, n_speakers=None, auto_estimate=True
    )

-    print(f"\n[Test] Clustering results:")
+    print("\n[Test] Clustering results:")
    print(f"  True n_speakers: {n_speakers}")
    print(f"  Estimated n_speakers: {n_clusters}")
    print(f"  Unique labels: {np.unique(labels)}")
--- a/scripts/asrx_self/speaker_cluster_fixed.py
+++ b/scripts/asrx_self/speaker_cluster_fixed.py
@@ -6,7 +6,6 @@ Speaker Clustering - Fixed Version

 import numpy as np
 from sklearn.cluster import AgglomerativeClustering
-from sklearn.metrics.pairwise import cosine_similarity


 def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
@@ -57,7 +56,7 @@ def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
    
    # 統計每個聚類的大小
    unique, counts = np.unique(speaker_labels, return_counts=True)
-    print(f"[Clustering] Cluster sizes:")
+    print("[Clustering] Cluster sizes:")
    for label, count in zip(unique, counts):
        print(f"  SPEAKER_{label}: {count} segments ({count/n_segments*100:.1f}%)")
    
@@ -148,6 +147,6 @@ if __name__ == "__main__":
    # 測試聚類
    labels, n_clusters = robust_speaker_clustering(embeddings)
    
-    print(f"\nResult:")
+    print("\nResult:")
    print(f"  True n_speakers: {n_speakers}")
    print(f"  Estimated n_speakers: {n_clusters}")
--- a/scripts/asrx_self/speaker_encoder.py
+++ b/scripts/asrx_self/speaker_encoder.py
@@ -33,8 +33,8 @@ def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb"):
    )

    # 獲取模型資訊
-    print(f"[SpeakerEncoder] Model loaded successfully")
-    print(f"[SpeakerEncoder] Embedding dimension: 192")
+    print("[SpeakerEncoder] Model loaded successfully")
+    print("[SpeakerEncoder] Embedding dimension: 192")

    return classifier

@@ -187,5 +187,5 @@ if __name__ == "__main__":
    print(f"[Test] Embedding std: {embedding.std():.4f}")

    # 顯示部分嵌入值
-    print(f"\n[Test] First 10 embedding values:")
+    print("\n[Test] First 10 embedding values:")
    print(f"  {embedding[:10]}")
--- a/scripts/asrx_self/speaker_player_gui.py
+++ b/scripts/asrx_self/speaker_player_gui.py
@@ -11,7 +11,6 @@ import os
 import threading
 import time
 from pathlib import Path
-from typing import List, Dict

 try:
    import tkinter as tk
--- a/scripts/asrx_self/speaker_player_gui_face.py
+++ b/scripts/asrx_self/speaker_player_gui_face.py
@@ -11,7 +11,6 @@ import os
 import threading
 import time
 from pathlib import Path
-from typing import List, Dict

 try:
    import tkinter as tk
@@ -203,7 +202,7 @@ class SpeakerPlayerGUI:
            self.face_path = filename
            self.face_label.config(text=Path(filename).name)
            self.integrate_button.config(state=tk.NORMAL)
-            self.status_label.config(text=f"✅ Face 已選擇 - 請點擊整合")
+            self.status_label.config(text="✅ Face 已選擇 - 請點擊整合")

    def integrate_face(self):
        """整合 Face 與 ASRX"""
--- a/scripts/asrx_self/speaker_player_interactive.py
+++ b/scripts/asrx_self/speaker_player_interactive.py
@@ -93,14 +93,14 @@ def show_menu(speaker_segments: Dict[str, List[Dict]], speaker_id: str):
        print(f"  ... and {len(segs) - 20} more segments")

    print(f"\n{'=' * 70}")
-    print(f"Commands:")
+    print("Commands:")
    print(f"  [1-{min(20, len(segs))}]  Play specific segment")
-    print(f"  all      Play all segments (may take a while)")
-    print(f"  first N  Play first N segments")
-    print(f"  next     Next speaker")
-    print(f"  prev     Previous speaker")
-    print(f"  list     List all speakers")
-    print(f"  quit     Exit")
+    print("  all      Play all segments (may take a while)")
+    print("  first N  Play first N segments")
+    print("  next     Next speaker")
+    print("  prev     Previous speaker")
+    print("  list     List all speakers")
+    print("  quit     Exit")
    print(f"{'=' * 70}")


@@ -132,7 +132,7 @@ def interactive_player(audio_path: str, result_path: str):

    current_speaker_idx = 0

-    print(f"\n🎬 Speaker Audio Player")
+    print("\n🎬 Speaker Audio Player")
    print(f"📁 Audio: {audio_path}")
    print(f"📊 Speakers: {len(speakers)}")
    print(f"{'=' * 70}")
@@ -159,7 +159,7 @@ def interactive_player(audio_path: str, result_path: str):
                print(
                    f"  ⏱️  {seg['start']:.2f}s - {seg['end']:.2f}s ({seg['duration']:.2f}s)"
                )
-                print(f"  ▶️  Playing...", end="", flush=True)
+                print("  ▶️  Playing...", end="", flush=True)
                if extract_and_play(audio_path, seg["start"], seg["end"]):
                    print(" ✅ Done")
                else:
@@ -220,7 +220,7 @@ def interactive_player(audio_path: str, result_path: str):
        # 列出所有說話人
        elif cmd == "list":
            print(f"\n{'=' * 70}")
-            print(f"📢 All speakers:")
+            print("📢 All speakers:")
            print(f"{'=' * 70}")
            for i, speaker in enumerate(speakers, 1):
                segs = speaker_segments[speaker]
--- a/scripts/asrx_self/test_gui_face_player.py
+++ b/scripts/asrx_self/test_gui_face_player.py
@@ -6,8 +6,6 @@ GUI Face Player 自動化測試腳本

 import json
 import subprocess
-import time
-import os
 from pathlib import Path


--- a/scripts/asrx_self/test_long_movie.py
+++ b/scripts/asrx_self/test_long_movie.py
@@ -5,7 +5,6 @@

 import json
 import subprocess
-import time
 from pathlib import Path
 from datetime import datetime

@@ -55,7 +54,7 @@ def test_asrx_results():
    print(f"📊 語音片段：{n_segments}")
    
    # 說話人統計
-    print(f"\n📢 說話人分佈:")
+    print("\n📢 說話人分佈:")
    speaker_stats = data.get('speaker_stats', {})
    for speaker, stats in sorted(speaker_stats.items(), key=lambda x: x[1]['duration'], reverse=True):
        duration = stats.get('duration', 0)
@@ -102,7 +101,7 @@ def test_integration():
    print(f"📊 匹配率：{match_rate:.2f}%")
    
    # 說話人匹配統計
-    print(f"\n📢 說話人匹配詳情:")
+    print("\n📢 說話人匹配詳情:")
    speaker_stats = data.get('speaker_stats', {})
    for speaker, stats in sorted(speaker_stats.items()):
        total_seg = stats.get('total_segments', 0)
@@ -164,7 +163,7 @@ def test_playback():
    end = first_seg['end']
    duration = end - start
    
-    print(f"\n🎵 測試提取第一個片段:")
+    print("\n🎵 測試提取第一個片段:")
    print(f"   時間：{start:.2f}s - {end:.2f}s ({duration:.2f}s)")
    
    # 實際提取測試
@@ -222,10 +221,10 @@ def generate_report():
    # 保存報告
    report_path = '/tmp/long_movie_test_report.md'
    with open(report_path, 'w', encoding='utf-8') as f:
-        f.write(f"# 長影片測試報告\n\n")
+        f.write("# 長影片測試報告\n\n")
        f.write(f"**測試時間**: {datetime.now().isoformat()}\n")
-        f.write(f"**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
-        f.write(f"## 結果\n\n")
+        f.write("**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
+        f.write("## 結果\n\n")
        f.write(f"**通過**: {passed}/{total}\n\n")
        for name, result in tests:
            status = "✅" if result else "❌"
--- a/scripts/asrx_self/vad.py
+++ b/scripts/asrx_self/vad.py
@@ -9,7 +9,6 @@ VAD (Voice Activity Detection) - 語音活動檢測
 """

 import torch
-import numpy as np


 def load_vad_model():
@@ -143,7 +142,7 @@ if __name__ == "__main__":
    print(f"[VAD] Processing: {audio_path}")
    segments, wav, sr = extract_speech_segments(audio_path, model, utils)

-    print(f"\n[VAD] Results:")
+    print("\n[VAD] Results:")
    print(f"  Sample rate: {sr} Hz")
    print(f"  Speech segments: {len(segments)}")
    print(f"  Total duration: {len(wav) / sr:.2f}s")
@@ -153,7 +152,7 @@ if __name__ == "__main__":
        f"  Total speech: {total_speech:.2f}s ({total_speech / (len(wav) / sr) * 100:.1f}%)"
    )

-    print(f"\n[VAD] Segments:")
+    print("\n[VAD] Segments:")
    for i, (start, end) in enumerate(segments[:10]):
        print(f"  {i + 1:3d}. {start:6.2f}s - {end:6.2f}s ({end - start:5.2f}s)")

--- a/scripts/audio_taxonomy_processor.py
+++ b/scripts/audio_taxonomy_processor.py
@@ -4,7 +4,6 @@ Audio Taxonomy Processor (Hugging Face Transformers)
 職責：使用 AST 模型進行高精度音頻分類，並映射到業務分類。
 """

-import numpy as np
 import json
 import os
 import sys
@@ -75,7 +74,7 @@ def map_to_taxonomy(predictions):

 def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
    """執行分類"""
-    print(f"🔍 Loading AST model (MIT) from Hugging Face...")
+    print("🔍 Loading AST model (MIT) from Hugging Face...")
    # 使用 Audio Spectrogram Transformer，準確率高且支援 MPS/CPU
    classifier = pipeline(
        "audio-classification",
@@ -103,7 +102,7 @@ def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):

            if taxonomy:
                results.append({"timestamp": round(current, 1), "categories": taxonomy})
-        except Exception as e:
+        except Exception:
            pass  # 跳過錯誤片段

        current += hop_sec
@@ -132,6 +131,6 @@ if __name__ == "__main__":
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)

-    print(f"\n🎉 Classification Complete!")
+    print("\n🎉 Classification Complete!")
    print(f"✅ Found {len(events)} tagged audio segments.")
    print(f"💾 Saved to {OUTPUT_JSON}")
--- a/scripts/audio_taxonomy_processor_v2.py
+++ b/scripts/audio_taxonomy_processor_v2.py
@@ -99,7 +99,7 @@ def map_to_taxonomy(logits, model):

 def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
    """執行分類"""
-    print(f"🔍 Loading AST model (MIT)...")
+    print("🔍 Loading AST model (MIT)...")
    model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"

    feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
@@ -167,6 +167,6 @@ if __name__ == "__main__":
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)

-    print(f"\n🎉 Classification Complete!")
+    print("\n🎉 Classification Complete!")
    print(f"✅ Found {len(events)} tagged audio segments.")
    print(f"💾 Saved to {OUTPUT_JSON}")
--- a/scripts/auto_identify_persons.py
+++ b/scripts/auto_identify_persons.py
@@ -105,7 +105,7 @@ def main():

    # 6. Generate report
    print(f"\n{'=' * 60}")
-    print(f"📊 Person Identification Results")
+    print("📊 Person Identification Results")
    print(f"{'=' * 60}")

    # Sort by frame count
@@ -177,7 +177,7 @@ def main():
    print(f"✅ Executed {executed} SQL statements")

    # 9. Generate SQL INSERT statements for person_identities
-    print(f"\n--- SQL INSERT statements for person_identities ---")
+    print("\n--- SQL INSERT statements for person_identities ---")
    for p in output["persons"][:10]:
        speaker_val = f"'{p['speaker_id']}'" if p["speaker_id"] else "NULL"
        print(
--- a/scripts/backfill_demographics.py
+++ b/scripts/backfill_demographics.py
@@ -4,11 +4,9 @@ Backfill missing Age & Gender for persons.
 """

 import os
-import sys
 import cv2
 import psycopg2
 import insightface
-import numpy as np

 DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
 BASE_VIDEO_DIR = "output"
@@ -94,7 +92,7 @@ def main():
            else:
                print(f"  -> Detection incomplete (Age:{age}, Gender:{gender})")
        else:
-            print(f"  -> No face found in frame.")
+            print("  -> No face found in frame.")

    print("=== Done ===")
    conn.close()
--- a/scripts/check_all_stamps.py
+++ b/scripts/check_all_stamps.py
@@ -10,8 +10,8 @@ from transformers import AutoProcessor, AutoModelForCausalLM

 UUID = "384b0ff44aaaa1f1"
 OUTPUT_DIR = f"output/{UUID}/florence2_results"
-INPUT_IMG = os.path.join(OUTPUT_DIR, f"raw_6846.jpg")
-OUTPUT_IMG = os.path.join(OUTPUT_DIR, f"all_stamps_detected.jpg")
+INPUT_IMG = os.path.join(OUTPUT_DIR, "raw_6846.jpg")
+OUTPUT_IMG = os.path.join(OUTPUT_DIR, "all_stamps_detected.jpg")

 # Patch for compatibility (Same as before)
 import types
--- a/scripts/check_architecture_all.py
+++ b/scripts/check_architecture_all.py
@@ -67,10 +67,10 @@ def main():

    all_passed = doc_check_success and code_doc_check_success
    if all_passed:
-        print(f"\n🎉 所有檢查通過！")
+        print("\n🎉 所有檢查通過！")
        print("架構文檔符合 Phase 1 標準化要求。")
    else:
-        print(f"\n⚠️  發現問題，請參考檢查結果進行修復。")
+        print("\n⚠️  發現問題，請參考檢查結果進行修復。")
        print("提示：")
        print("  1. 使用 TERMINOLOGY_MAPPING.md 作為術語標準參考")
        print("  2. 確保設計與實現差異在 DESIGN_IMPLEMENTATION_GAP.md 中記錄")
--- a/scripts/check_architecture_docs.py
+++ b/scripts/check_architecture_docs.py
@@ -12,14 +12,13 @@
 python3 scripts/check_architecture_docs.py [--report] [--verbose]
 """

-import os
 import re
 import sys
 import glob
 import json
 import argparse
 from pathlib import Path
-from typing import Dict, List, Set, Tuple, Optional
+from typing import Dict, List, Set, Optional
 from collections import defaultdict

 # 配置
@@ -410,15 +409,15 @@ class ArchitectureDocChecker:
            print(f"{'=' * 60}")
            print(f"📁 檢查文件數: {total_files}")
            print(f"⚠️  發現問題數: {total_issues}")
-            print(f"\n問題分類:")
+            print("\n問題分類:")
            for issue_type, count in report["summary"]["issues_by_type"].items():
                print(f"  - {issue_type}: {count}")
-            print(f"\n嚴重程度:")
+            print("\n嚴重程度:")
            for severity, count in report["summary"]["issues_by_severity"].items():
                print(f"  - {severity}: {count}")

            if total_issues > 0:
-                print(f"\n🔍 詳細問題:")
+                print("\n🔍 詳細問題:")
                for file_report in report["files"]:
                    if file_report["issues"]:
                        print(f"\n文件: {file_report['file']}")
@@ -474,7 +473,7 @@ def main():
            print(f"\n❌ 發現 {report['summary']['total_issues']} 個問題，請修復")
            sys.exit(1)
        else:
-            print(f"\n✅ 所有檢查通過！")
+            print("\n✅ 所有檢查通過！")
            sys.exit(0)


--- a/scripts/check_code_document_consistency.py
+++ b/scripts/check_code_document_consistency.py
@@ -6,9 +6,7 @@
 核心原則：當設計與實現出現矛盾時，以實際的 Rust 代碼實現為最高權威
 """

-import os
 import re
-import sys
 from pathlib import Path


@@ -177,7 +175,7 @@ def main():
    issues = check_terminology_consistency(implemented_variants)

    # 3. 顯示結果
-    print(f"\n📊 檢查完成:")
+    print("\n📊 檢查完成:")
    print(f"  發現問題數: {len(issues)}")

    if issues:
--- a/scripts/check_frame_112_36.py
+++ b/scripts/check_frame_112_36.py
@@ -5,7 +5,6 @@ Analyze Frame at 112:36 (6756s) for Stamps

 import os
 import cv2
-import torch
 import types
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
--- a/scripts/check_frame_91_59.py
+++ b/scripts/check_frame_91_59.py
@@ -5,7 +5,6 @@ Analyze Frame at 91:59 (5519s) for Stamps

 import os
 import cv2
-import torch
 import types
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
--- a/scripts/chunk_statistics.py
+++ b/scripts/chunk_statistics.py
@@ -6,7 +6,6 @@ Generates a comprehensive report of each chunk's content.

 import json
 import os
-import sys

 UUID = "384b0ff44aaaa1f1"
 BASE_DIR = f"output/{UUID}"
@@ -107,7 +106,7 @@ def print_summary(chunks):
        1 for c in chunks if not c["has_speech"] and not c["has_faces"]
    )

-    print(f"\n📊 Overview:")
+    print("\n📊 Overview:")
    print(f"   Total chunks:        {len(chunks)}")
    print(
        f"   Chunks with speech:  {total_speech_chunks} ({total_speech_chunks / len(chunks) * 100:.0f}%)"
@@ -125,7 +124,7 @@ def print_summary(chunks):
    print(f"   Total face frames:   {total_faces}")

    # Combination breakdown
-    print(f"\n🎯 ASR/Face Combination Breakdown:")
+    print("\n🎯 ASR/Face Combination Breakdown:")

    combos = {}
    for c in chunks:
@@ -148,7 +147,7 @@ def print_summary(chunks):
        )

    # Top chunks by activity
-    print(f"\n🔥 Top 10 Most Active Chunks (by ASR+Faces):")
+    print("\n🔥 Top 10 Most Active Chunks (by ASR+Faces):")
    scored_chunks = []
    for c in chunks:
        score = c["asr_count"] + c["face_count"]
@@ -164,7 +163,7 @@ def print_summary(chunks):
        )

    # Stamp scene chunk
-    print(f"\n🔍 Special Interest Chunks:")
+    print("\n🔍 Special Interest Chunks:")
    for c in chunks:
        # Stamp scene around 5730s
        if c["start"] <= 5730 <= c["end"]:
--- a/scripts/clip_logo_integration.py
+++ b/scripts/clip_logo_integration.py
@@ -256,7 +256,7 @@ def test_similarity_search(
        result = cur.fetchone()
        
        if not result or not result[0]:
-            print(f"⚠️ Identity embedding not found")
+            print("⚠️ Identity embedding not found")
            return []
        
        stored_embedding_raw = result[0]
@@ -323,7 +323,7 @@ def main():
    logo_path = TEMP_DIR / f"{name.replace(' ', '_')}.png"
    
    if not logo_path.exists():
-        print(f"\n🔧 Downloading logo...")
+        print("\n🔧 Downloading logo...")
        if not download_image(logo_url, logo_path):
            sys.exit(1)
    
@@ -334,18 +334,18 @@ def main():
    if args.performance:
        perf_result = test_mps_performance(model, processor, device, logo_path, iterations=10)
        if perf_result:
-            print(f"\n📊 Performance Summary:")
+            print("\n📊 Performance Summary:")
            print(f"  MPS: {perf_result['mps_time']:.4f}s/img")
            print(f"  CPU: {perf_result['cpu_time']:.4f}s/img")
            print(f"  Speedup: {perf_result['speedup']:.2f}x")
    
-    print(f"\n🔧 Extracting CLIP embedding...")
+    print("\n🔧 Extracting CLIP embedding...")
    embedding = extract_clip_embedding(model, processor, device, logo_path)
    
    if not embedding:
        sys.exit(1)
    
-    print(f"\n🔧 Registering to database...")
+    print("\n🔧 Registering to database...")
    uuid = register_logo_identity_to_db(
        name=name,
        logo_url=logo_url,
@@ -354,13 +354,13 @@ def main():
    )
    
    if uuid:
-        print(f"\n🎉 Integration completed!")
+        print("\n🎉 Integration completed!")
        print(f"   Identity: {name}")
        print(f"   UUID: {uuid}")
        print(f"   Embedding: {len(embedding)}-dim")
        print(f"   URL: {logo_url}")
        
-        print(f"\n🔧 Testing similarity search...")
+        print("\n🔧 Testing similarity search...")
        test_embeddings = [
            embedding,
            [0.1] * 768,
@@ -369,9 +369,9 @@ def main():
        matches = test_similarity_search(uuid, test_embeddings, threshold=0.85, schema=args.schema)
        
        if matches:
-            print(f"\n✅ Similarity search test passed")
+            print("\n✅ Similarity search test passed")
    else:
-        print(f"\n❌ Integration failed")
+        print("\n❌ Integration failed")
        sys.exit(1)


--- a/scripts/compare_asr_content.py
+++ b/scripts/compare_asr_content.py
@@ -10,7 +10,7 @@ ASR方案内容对比分析

 import json
 from pathlib import Path
-from difflib import unified_diff, SequenceMatcher
+from difflib import SequenceMatcher

 def load_segments(json_path):
    """加载JSON文件中的segments"""
@@ -25,7 +25,7 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
    print(f"{'='*60}")
    
    # 统计
-    print(f"\n【数量对比】")
+    print("\n【数量对比】")
    print(f"  {name_a}: {len(seg_a)} segments")
    print(f"  {name_b}: {len(seg_b)} segments")
    print(f"  差异: {len(seg_a) - len(seg_b)} segments")
@@ -34,7 +34,7 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
    total_time_a = sum(s['end'] - s['start'] for s in seg_a)
    total_time_b = sum(s['end'] - s['start'] for s in seg_b)
    
-    print(f"\n【时间覆盖】")
+    print("\n【时间覆盖】")
    print(f"  {name_a}: {total_time_a:.2f}秒")
    print(f"  {name_b}: {total_time_b:.2f}秒")
    print(f"  差异: {total_time_a - total_time_b:.2f}秒")
@@ -48,11 +48,11 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
    text_b_full = ' '.join(texts_b)
    similarity = SequenceMatcher(None, text_a_full, text_b_full).ratio()
    
-    print(f"\n【文本相似度】")
+    print("\n【文本相似度】")
    print(f"  相似度: {similarity*100:.1f}%")
    
    # 差异分析
-    print(f"\n【详细差异】")
+    print("\n【详细差异】")
    
    # 按时间对齐对比
    matched_diffs = []
@@ -98,7 +98,7 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
        if len(matched_diffs) > 10:
            print(f"\n  ... 还有 {len(matched_diffs) - 10} 处差异")
    else:
-        print(f"  ✓ 无显著文本差异")
+        print("  ✓ 无显著文本差异")
    
    return {
        'segments_diff': len(seg_a) - len(seg_b),
@@ -122,10 +122,10 @@ def main():
    
    # 方案基本信息
    print("【测试方案】")
-    print(f"  方案A: faster-whisper small CPU")
-    print(f"  方案B: OpenAI whisper small CPU")
-    print(f"  方案D: OpenAI whisper medium CPU")
-    print(f"  方案C/E: MPS失败（不支持）")
+    print("  方案A: faster-whisper small CPU")
+    print("  方案B: OpenAI whisper small CPU")
+    print("  方案D: OpenAI whisper medium CPU")
+    print("  方案C/E: MPS失败（不支持）")
    print()
    
    # 三组对比
@@ -142,16 +142,16 @@ def main():
    print("="*60)
    
    print("\n【Segments数量】")
-    print(f"  方案A: 77 segments (最多)")
-    print(f"  方案B: 74 segments")
-    print(f"  方案D: 74 segments")
-    print(f"  结论: faster-whisper分割更细（+3 segments）")
+    print("  方案A: 77 segments (最多)")
+    print("  方案B: 74 segments")
+    print("  方案D: 74 segments")
+    print("  结论: faster-whisper分割更细（+3 segments）")
    
    print("\n【文本相似度】")
    print(f"  A vs B: {results['A_vs_B']['similarity']*100:.1f}%")
    print(f"  A vs D: {results['A_vs_D']['similarity']*100:.1f}%")
    print(f"  B vs D: {results['B_vs_D']['similarity']*100:.1f}%")
-    print(f"  结论: 三个方案文本高度相似")
+    print("  结论: 三个方案文本高度相似")
    
    print("\n【文本差异统计】")
    print(f"  A vs B: {results['A_vs_B']['text_diffs']}处差异")
@@ -159,9 +159,9 @@ def main():
    print(f"  B vs D: {results['B_vs_D']['text_diffs']}处差异")
    
    print("\n【方案D（medium）vs 方案B（small）】")
-    print(f"  Segments数量相同: 74条")
+    print("  Segments数量相同: 74条")
    print(f"  文本相似度: {results['B_vs_D']['similarity']*100:.1f}%")
-    print(f"  结论: medium模型无明显提升")
+    print("  结论: medium模型无明显提升")
    
    print()
    print("="*60)
--- a/scripts/compare_segmentation.py
+++ b/scripts/compare_segmentation.py
@@ -0,0 +1,131 @@
+#!/opt/homebrew/bin/python3.11
+"""
+POC: Compare silence-based segmentation vs CUT-based segmentation for ASR.
+
+Tests a short video segment and reports:
+1. Number of segments from each method
+2. Segment boundaries
+3. ASR quality comparison (WER estimate)
+"""
+import json
+import os
+import sys
+import subprocess
+import tempfile
+import time
+from faster_whisper import WhisperModel
+
+# Input video: first command-line argument, otherwise a hard-coded local test file.
+VIDEO_PATH = sys.argv[1] if len(sys.argv) > 1 else "/Users/accusys/test_video/Old_Time_Movie_Show_-_Charade_1963.HD.mov"
+DURATION = 300  # Test first 5 minutes only
+
+# Whisper model instance created once at module load; transcribe() calls it.
+model = WhisperModel("small", device="cpu", compute_type="int8")
+
+def extract_audio_segment(start, end, out_wav):
+    """Extract part of VIDEO_PATH's audio into a 16 kHz mono file via ffmpeg.
+
+    Args:
+        start: Value passed to ffmpeg's -ss option (segment start).
+        end: Value passed to ffmpeg's -to option (segment end).
+        out_wav: Destination path; an existing file is overwritten (-y).
+
+    Returns:
+        True when a file larger than 100 bytes exists at out_wav afterwards,
+        False otherwise (including when ffmpeg produced no file at all).
+    """
+    cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", VIDEO_PATH,
+           "-ss", str(start), "-to", str(end),
+           "-ar", "16000", "-ac", "1", out_wav]
+    # ffmpeg errors are not raised (check=False); success is judged from the
+    # output file instead.
+    subprocess.run(cmd, check=False, capture_output=True)
+    # os.path.getsize() raises when ffmpeg never created the file, so check
+    # for existence first and report that case as an ordinary failure.
+    return os.path.isfile(out_wav) and os.path.getsize(out_wav) > 100
+
+def transcribe(wav_path):
+    """Run the module-level Whisper model on one audio file.
+
+    Returns a (segments, info) tuple; segments is the model's segment output
+    materialised into a list.
+    """
+    vad_options = {"min_silence_duration_ms": 500, "speech_pad_ms": 200}
+    segment_iter, info = model.transcribe(
+        wav_path,
+        beam_size=5,
+        vad_filter=True,
+        vad_parameters=vad_options,
+    )
+    return list(segment_iter), info
+
+# === Method 1: CUT-based segmentation ===
+print("=" * 60)
+print("METHOD 1: CUT-based segmentation")
+print("=" * 60)
+cut_path = "/Users/accusys/momentry/output_dev/417a7e93860d70c87aee6c4c1b715d70.cut.json"
+# (start_time, end_time) pairs for scenes that start inside the test window.
+cut_scenes = []
+if os.path.exists(cut_path):
+    with open(cut_path, encoding="utf-8") as f:
+        data = json.load(f)
+    cut_scenes = [(s["start_time"], s["end_time"]) for s in data.get("scenes", []) if s["start_time"] < DURATION]
+    print(f"  Scenes in first {DURATION}s: {len(cut_scenes)}")
+else:
+    # Without the CUT file this method has nothing to process; say so rather
+    # than silently reporting zeros in the comparison.
+    print(f"  CUT file not found: {cut_path}")
+
+# Scratch directory shared by both methods; removed at the end of the script.
+tmpdir = tempfile.mkdtemp(prefix="seg_compare_")
+t1 = time.time()
+cut_segments = []
+total_chars = 0
+for idx, (st, et) in enumerate(cut_scenes):
+    wav = os.path.join(tmpdir, f"cut_{idx:04d}.wav")
+    if not extract_audio_segment(st, et, wav):
+        continue
+    segs, info = transcribe(wav)
+    for s in segs:
+        # Segment times are offset by the scene start to place them on the
+        # timeline of the whole video.
+        cut_segments.append({"start": st + s.start, "end": st + s.end, "text": s.text})
+        total_chars += len(s.text)
+cut_time = time.time() - t1
+print(f"  Segments: {len(cut_segments)}, Total chars: {total_chars}, Time: {cut_time:.1f}s")
+if cut_segments:
+    # Mean length of the recognised segments themselves (not the window
+    # length divided by the segment count).
+    avg_duration = sum(seg["end"] - seg["start"] for seg in cut_segments) / len(cut_segments)
+    print(f"  Avg segment duration: {avg_duration:.1f}s")
+
+# === Method 2: Silence-based segmentation (ffmpeg silencedetect) ===
+banner = "=" * 60
+print()
+print(banner)
+print("METHOD 2: Silence-based segmentation (ffmpeg silencedetect)")
+print(banner)
+
+# Pull the whole test window out of the video once; silencedetect runs on it.
+full_wav = os.path.join(tmpdir, "full_audio.wav")
+extract_audio_segment(0, DURATION, full_wav)
+
+# Timing starts here: it covers detection plus per-segment extraction and ASR.
+t2 = time.time()
+stderr = subprocess.run(
+    ["ffmpeg", "-i", full_wav, "-af", "silencedetect=noise=-30dB:d=0.5", "-f", "null", "-"],
+    capture_output=True,
+    text=True,
+).stderr
+
+# Collect the silence start/end timestamps that ffmpeg logged on stderr.
+silence_starts = []
+silence_ends = []
+for log_line in stderr.split("\n"):
+    if "silence_start:" in log_line:
+        start_text = log_line.split("silence_start:")[1]
+        silence_starts.append(float(start_text.strip()))
+        continue
+    if "silence_end:" in log_line:
+        # The end line carries extra fields after a "|"; keep the first part.
+        end_text = log_line.split("silence_end:")[1].split("|")[0]
+        silence_ends.append(float(end_text.strip()))
+
+# Speech is whatever lies between consecutive silences; gaps of 0.5s or less
+# before a silence are dropped.
+speech_segments = []
+last_end = 0.0
+for silence_from, silence_to in zip(silence_starts, silence_ends):
+    if last_end + 0.5 < silence_from:
+        speech_segments.append((last_end, silence_from))
+    last_end = silence_to
+if last_end < DURATION:
+    # Whatever follows the last silence runs to the end of the test window.
+    speech_segments.append((last_end, DURATION))
+
+print(f"  Silence periods detected: {len(silence_starts)}")
+print(f"  Speech segments: {len(speech_segments)}")
+
+# Transcribe every speech span, shifting times back onto the full timeline.
+silence_segments = []
+total_chars2 = 0
+for seg_no, (seg_start, seg_end) in enumerate(speech_segments):
+    clip_path = os.path.join(tmpdir, f"sil_{seg_no:04d}.wav")
+    if extract_audio_segment(seg_start, seg_end, clip_path):
+        asr_segments, _ = transcribe(clip_path)
+        for piece in asr_segments:
+            silence_segments.append(
+                {"start": seg_start + piece.start, "end": seg_start + piece.end, "text": piece.text}
+            )
+            total_chars2 += len(piece.text)
+silence_time = time.time() - t2
+print(f"  Segments: {len(silence_segments)}, Total chars: {total_chars2}, Time: {silence_time:.1f}s")
+
+# === Comparison ===
+rule = "=" * 60
+print()
+print(rule)
+print("COMPARISON")
+print(rule)
+print(f"{'Metric':<30} {'CUT-based':<15} {'Silence-based':<15}")
+print("-" * 60)
+# Integer-valued rows share one layout: label, CUT-based value, silence-based value.
+count_rows = [
+    ("Number of audio segments", len(cut_scenes), len(speech_segments)),
+    ("Number of ASR segments", len(cut_segments), len(silence_segments)),
+    ("Total chars recognized", total_chars, total_chars2),
+]
+for label, cut_value, silence_value in count_rows:
+    print(f"{label:<30} {cut_value:<15} {silence_value:<15}")
+print(f"{'Processing time (s)':<30} {cut_time:<15.1f} {silence_time:<15.1f}")
+
+# Cleanup: remove the scratch directory holding the extracted audio clips.
+import shutil
+shutil.rmtree(tmpdir, ignore_errors=True)
+print()
+print("Done.")
--- a/scripts/crop_real_stamps.py
+++ b/scripts/crop_real_stamps.py
@@ -13,7 +13,6 @@ OUTPUT_DIR = f"output/{UUID}/florence2_results"
 # These are placeholders - I need to re-run to get the exact boxes if they weren't printed.
 # Since I saw the logs, I know it found them.
 # But I need the exact coordinates. Let's run a detection script that crops them immediately.
-import torch
 import types
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
--- a/scripts/crop_stamp_112_36.py
+++ b/scripts/crop_stamp_112_36.py
@@ -6,7 +6,6 @@ Crop the detected stamp from the 112:36 frame (with Patch).
 from PIL import Image
 import os
 import cv2
-import torch
 import types
 from transformers import AutoProcessor, AutoModelForCausalLM

--- a/scripts/cut_benchmark_runner.py
+++ b/scripts/cut_benchmark_runner.py
@@ -140,7 +140,7 @@ def main():
        
        video_stream = next((s for s in video_info["streams"] if s["codec_type"] == "video"), None)
        
-        print(f"\n测试视频:")
+        print("\n测试视频:")
        print(f"  文件: {int(video_info['format'].get('size', 0)) / 1024 / 1024:.1f} MB")
        print(f"  时长: {float(video_info['format'].get('duration', 0)):.1f} 秒")
        print(f"  分辨率: {video_stream.get('width', 0)}x{video_stream.get('height', 0)}")
@@ -188,7 +188,7 @@ def main():
                "file_size_kb": result["file_size_kb"],
            })
            
-            print(f"\n✅ 处理完成:")
+            print("\n✅ 处理完成:")
            print(f"  时间: {result['elapsed_time']:.2f}秒")
            print(f"  内存峰值: {result['peak_memory_mb']:.1f} MB")
            print(f"  检测场景数: {result['total_scenes']}")
@@ -223,7 +223,7 @@ def main():
    print(f"{'=' * 80}")
    
    print("\n【对比总结】")
-    print(f"\n| 方案 | 脚本 | 时间(秒) | 内存(MB) | 场景数 | 平均时长(秒) |")
+    print("\n| 方案 | 脚本 | 时间(秒) | 内存(MB) | 场景数 | 平均时长(秒) |")
    print("|------|------|---------|---------|--------|-------------|")
    
    for r in results:
--- a/scripts/debug_face_registration.py
+++ b/scripts/debug_face_registration.py
@@ -4,7 +4,6 @@ Debug script to test face registration with same arguments Rust uses
 """

 import subprocess
-import sys
 import os

 # Simulate what Rust would call
--- a/scripts/deep_analysis_112_36.py
+++ b/scripts/deep_analysis_112_36.py
@@ -7,7 +7,6 @@ Deep Analysis of 112:36 Frame

 import os
 import cv2
-import torch
 import types
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
@@ -149,7 +148,7 @@ try:
                            2,
                        )
            else:
-                print(f"   ❌ Not found.")
+                print("   ❌ Not found.")
        except Exception as e:
            print(f"   ⚠️ Error: {e}")

--- a/scripts/demo_dashboard.py
+++ b/scripts/demo_dashboard.py
@@ -4,7 +4,6 @@ Momentry Core Visual Demo Dashboard
 職責：提供處理器模組的視覺化預覽，支持時間軸檢查與多模組疊加顯示。
 """

-import sys
 import os
 import json
 import cv2
--- a/scripts/demo_face_learning.py
+++ b/scripts/demo_face_learning.py
@@ -6,7 +6,6 @@ Demonstrate face learning capability
 import json
 import os
 import sys
-import numpy as np
 from pathlib import Path

 # Add script directory to path
--- a/scripts/detect_language.py
+++ b/scripts/detect_language.py
@@ -8,7 +8,7 @@
 import sys
 import json
 import argparse
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, Tuple
 import re

 # 簡單的語言檢測規則（可擴展）
--- a/scripts/detect_objects_keyframes.py
+++ b/scripts/detect_objects_keyframes.py
@@ -5,7 +5,6 @@ Detect and Crop Envelopes/Objects in Keyframes

 import os
 import cv2
-import torch
 import types
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
--- a/scripts/export_person_thumbnails.py
+++ b/scripts/export_person_thumbnails.py
@@ -7,7 +7,6 @@ Export Person Thumbnails
 import cv2
 import json
 import os
-import sys

 # 設定
 OUTPUT_DIR = "output/quick_preview"
--- a/scripts/extract_female_faces.py
+++ b/scripts/extract_female_faces.py
@@ -4,8 +4,6 @@
 """

 import cv2
-import numpy as np
-import json
 import os
 from datetime import datetime

@@ -247,7 +245,7 @@ def create_female_faces_report(female_frames_info, output_dir="/tmp/female_faces
                    f"- `{os.path.basename(info['thumbnail'])}` - 縮略圖（800px寬）\n"
                )

-        f.write(f"- `female_faces_report.md` - 本報告文件\n\n")
+        f.write("- `female_faces_report.md` - 本報告文件\n\n")

        f.write("## 🔍 分析說明\n\n")
        f.write("1. **邊界框顏色**: 粉色 (RGB: 255,105,180) 表示女性人臉\n")
@@ -332,20 +330,20 @@ def main():
            info for info in female_frames_info if info["female_count"] == max_females
        ][0]

-        print(f"📊 統計摘要:")
+        print("📊 統計摘要:")
        print(f"  - 總分析畫面: {len(female_frames_info)}")
        print(f"  - 女性最多畫面: 幀 {max_frame_info['frame_number']}")
        print(f"  - 女性數量: {max_females} 人")
        print(f"  - 時間位置: {max_frame_info['timestamp_formatted']}")
        print()

-        print(f"📁 生成文件:")
+        print("📁 生成文件:")
        print(f"  - 標記圖像: {output_dir}/female_faces_frame_*.jpg")
        print(f"  - 縮略圖: {output_dir}/female_faces_frame_*_thumbnail.jpg")
        print(f"  - 分析報告: {report_path}")
        print()

-        print(f"🔍 查看結果:")
+        print("🔍 查看結果:")
        print(f"  ls -la {output_dir}/")
        print(f"  open {output_dir}/female_faces_report.md")

--- a/scripts/face_benchmark_runner.py
+++ b/scripts/face_benchmark_runner.py
@@ -23,7 +23,6 @@ import sys
 import json
 import time
 import subprocess
-import shutil
 from pathlib import Path
 from datetime import datetime

@@ -230,7 +229,7 @@ def main():
        sys.exit(1)
    
    video_info = get_video_info(video_path)
-    print(f"\n测试视频:")
+    print("\n测试视频:")
    print(f"  UUID: {video_uuid}")
    print(f"  文件: {video_info.get('size_mb', 0):.1f} MB")
    print(f"  时长: {video_info.get('duration', 0):.1f} 秒")
@@ -286,7 +285,7 @@ def main():
                "has_landmarks": result["has_landmarks"]
            })
            
-            print(f"\n✅ 处理完成:")
+            print("\n✅ 处理完成:")
            print(f"  时间: {result['elapsed_time']:.2f}秒")
            print(f"  速度: {speed:.2f}x 实时倍速")
            print(f"  内存峰值: {result['peak_memory_mb']:.1f} MB")
@@ -324,7 +323,7 @@ def main():
    print(f"{'=' * 80}")
    
    print("\n【对比总结】")
-    print(f"\n| 方案 | 脚本 | 时间(秒) | 速度 | 内存(MB) | 人脸数 | Embedding |")
+    print("\n| 方案 | 脚本 | 时间(秒) | 速度 | 内存(MB) | 人脸数 | Embedding |")
    print("|------|------|---------|------|---------|--------|-----------|")
    
    for r in results:
--- a/scripts/face_count_comparison.py
+++ b/scripts/face_count_comparison.py
@@ -5,9 +5,7 @@ Face Detection Count Comparison
 """

 import json
-import sys
 from pathlib import Path
-from collections import defaultdict

 def load_results(filepath):
    """加载检测结果"""
@@ -172,7 +170,7 @@ def main():
    
    stats = analyze_detection_distribution(results_a, results_b, results_c)
    
-    print(f"| 版本 | 总人脸数 | 检测帧数 | 有人脸帧 | 无人脸帧 | 平均每帧 | 最多人脸 |")
+    print("| 版本 | 总人脸数 | 检测帧数 | 有人脸帧 | 无人脸帧 | 平均每帧 | 最多人脸 |")
    print("|------|---------|---------|---------|---------|---------|---------|")
    
    for name, s in stats.items():
@@ -187,14 +185,14 @@ def main():
    print(f"共有 {len(comparison)} 帧检测数量不同")
    print()
    
-    print(f"| 帧号 | 时间(秒) | InsightFace | MediaPipe | OpenCV | 最大差异 |")
+    print("| 帧号 | 时间(秒) | InsightFace | MediaPipe | OpenCV | 最大差异 |")
    print("|------|---------|------------|----------|--------|---------|")
    
    for item in comparison[:30]:  # 只显示前30帧
        print(f"| {item['frame']} | {item['timestamp']:.2f} | {item['insightface']} | {item['mediapipe']} | {item['opencv']} | {item['diff']} |")
    
    if len(comparison) > 30:
-        print(f"| ... | ... | ... | ... | ... | ... |")
+        print("| ... | ... | ... | ... | ... | ... |")
        print(f"| 共 {len(comparison)} 帧有差异 |")
    
    print()
@@ -212,7 +210,7 @@ def main():
    
    if mediapipe_missed:
        print("MediaPipe漏检详情（前10帧）:")
-        print(f"| 帧号 | InsightFace检测 | OpenCV检测 |")
+        print("| 帧号 | InsightFace检测 | OpenCV检测 |")
        print("|------|----------------|-----------|")
        for m in mediapipe_missed[:10]:
            print(f"| {m['frame']} | {m.get('insightface_count', m.get('others_count', '?'))} | {m.get('opencv_count', '?')} |")
@@ -225,7 +223,7 @@ def main():
    
    print(f"以InsightFace为基准（{baseline}张人脸）:")
    print()
-    print(f"| 版本 | 检测数 | 检测率 | 漏检数 |")
+    print("| 版本 | 检测数 | 检测率 | 漏检数 |")
    print("|------|--------|--------|--------|")
    
    for name, s in stats.items():
--- a/scripts/face_embedding_extractor.py
+++ b/scripts/face_embedding_extractor.py
@@ -38,7 +38,7 @@ def extract_face_embeddings(uuid: str, video_path: str):
        return {}

    # 1. 加載 Face JSON 數據
-    face_path = os.path.join(OUTPUT_DIR, "quick_preview", f"preview.face.json")
+    face_path = os.path.join(OUTPUT_DIR, "quick_preview", "preview.face.json")
    if not os.path.exists(face_path):
        print(f"  [Skip] No Face data for {uuid}")
        return {}
@@ -119,7 +119,7 @@ def extract_face_embeddings(uuid: str, video_path: str):
                )
                if result:
                    embeddings.append(np.array(result[0]["embedding"]))
-            except Exception as e:
+            except Exception:
                # 忽略無法識別的臉部
                pass

--- a/scripts/face_processor.py
+++ b/scripts/face_processor.py
@@ -21,7 +21,6 @@ import os
 import time

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from redis_publisher import RedisPublisher
 from resume_framework import ResumeFramework, format_time, print_progress
 from utils.pose_analyzer import calculate_pose_angle_v2

@@ -141,7 +140,7 @@ def process_face(
    print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
    print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
    print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
-    print(f"Detection method: InsightFace (REQUIRED)")
+    print("Detection method: InsightFace (REQUIRED)")
    print()

    while True:
@@ -199,7 +198,7 @@ def process_face(
                            "pitch": pose_result.get("pitch", "neutral"),
                            "features": pose_result.get("features", {}),
                        }
-                    except Exception as e:
+                    except Exception:
                        pass

                face_list.append(
@@ -255,6 +254,45 @@ def process_face(
    return face_data


+def _convert_to_face_result(face_data: dict) -> dict:
+    """Convert ResumeFramework output to FaceResult format expected by Rust."""
+    metadata = face_data.get("metadata", {})
+    raw_frames = face_data.get("frames", {})
+    fps = metadata.get("fps", 30.0)
+    frames = []
+    for frame_key in sorted(raw_frames.keys(), key=lambda k: int(k)):
+        f = raw_frames[frame_key]
+        faces = []
+        for raw_face in f.get("faces", []):
+            pose = raw_face.get("pose_angle")
+            attributes = raw_face.get("attributes", {})
+            face = {
+                "face_id": None,
+                "x": raw_face["x"],
+                "y": raw_face["y"],
+                "width": raw_face["width"],
+                "height": raw_face["height"],
+                "confidence": raw_face.get("confidence", 0.0),
+                "embedding": raw_face.get("embedding"),
+                "landmarks": raw_face.get("landmarks"),
+                "attributes": {
+                    "age": attributes.get("age") if attributes else None,
+                    "gender": attributes.get("gender") if attributes else None,
+                },
+            }
+            faces.append(face)
+        frames.append({
+            "frame": f["frame_number"],
+            "timestamp": f["time_seconds"],
+            "faces": faces,
+        })
+    return {
+        "frame_count": len(frames),
+        "fps": fps,
+        "frames": frames,
+    }
+
+
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Face Detection & Demographics with Resume Support")
    parser.add_argument("video_path", help="Path to video file")
@@ -285,11 +323,11 @@ if __name__ == "__main__":
        "-s",
        help="Frame sample interval",
        type=int,
-        default=30,
+        default=5,
    )
    args = parser.parse_args()

-    process_face(
+    result = process_face(
        args.video_path,
        args.output_path,
        args.uuid,
@@ -297,4 +335,7 @@ if __name__ == "__main__":
        args.auto_save_frames,
        args.force_restart,
        args.sample_interval,
-    )
+    )
+    face_result = _convert_to_face_result(result)
+    with open(args.output_path, "w") as f:
+        json.dump(face_result, f, indent=2)
--- a/scripts/face_processor_mps.py
+++ b/scripts/face_processor_mps.py
@@ -18,7 +18,7 @@ import os
 import signal
 import time
 from datetime import datetime
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List

 import cv2
 import numpy as np
@@ -108,7 +108,7 @@ class MediaPipeFaceDetector:
                print(f"[Face] Using fallback model: {alt_path}")
                return alt_path
            
-            raise RuntimeError(f"Could not download MediaPipe model from any source")
+            raise RuntimeError("Could not download MediaPipe model from any source")
        
        return model_path

--- a/scripts/face_recognition_processor.py
+++ b/scripts/face_recognition_processor.py
@@ -9,10 +9,8 @@ import sys
 import json
 import argparse
 import os
-import time
 import numpy as np
 from typing import List, Dict, Any, Optional, Tuple
-import uuid

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from redis_publisher import RedisPublisher
--- a/scripts/face_registration.py
+++ b/scripts/face_registration.py
@@ -8,7 +8,6 @@ import sys
 import json
 import argparse
 import os
-import numpy as np
 import time
 from typing import Dict, Any, Optional

@@ -176,7 +175,7 @@ class FaceRegistration:
                }

            if len(faces) > 1:
-                print(f"[WARNING] Multiple faces detected, using the first one")
+                print("[WARNING] Multiple faces detected, using the first one")

            # Use the first face
            face = faces[0]
--- a/scripts/face_statistics_report.py
+++ b/scripts/face_statistics_report.py
@@ -4,7 +4,6 @@
 """

 import psycopg2
-import json
 from datetime import datetime
 import sys

@@ -235,7 +234,7 @@ def main():
        with open("/tmp/face_statistics_report.txt", "w") as f:
            f.write(report)

-        print(f"\n報告已保存到: /tmp/face_statistics_report.txt")
+        print("\n報告已保存到: /tmp/face_statistics_report.txt")

    except Exception as e:
        print(f"❌ 生成報告時出錯: {e}")
--- a/scripts/fast_face_clustering_processor.py
+++ b/scripts/fast_face_clustering_processor.py
@@ -74,7 +74,7 @@ def main():

    total_faces = sum(len(faces) for faces in faces_map.values())
    print(f"✅ Indexed {len(faces_map)} frames, containing {total_faces} faces.")
-    print(f"🚀 Starting Linear Video Scan...")
+    print("🚀 Starting Linear Video Scan...")

    # 2. 線性掃描
    video_path = VIDEO_PATH  # 使用區域變數避免 global 問題
@@ -138,7 +138,7 @@ def main():
                            face_refs.append(
                                {"frame_idx": current_frame, "face_idx": face_idx}
                            )
-                except Exception as e:
+                except Exception:
                    pass

            processed_frames += 1
--- a/scripts/fast_stamp_search.py
+++ b/scripts/fast_stamp_search.py
@@ -220,7 +220,7 @@ for sec in range(0, total_sec, FRAME_INTERVAL):
                        print(
                            f"    🎯 {sec}s | {term} | {s:.2f} | {int(orig_w)}x{int(orig_h)}px"
                        )
-            except Exception as e:
+            except Exception:
                pass

    # Save annotated frame if stamps found
--- a/scripts/final_face_validation.py
+++ b/scripts/final_face_validation.py
@@ -7,7 +7,6 @@
 import sys
 import os
 import subprocess
-import time


 def run_test(script_name, description):
@@ -50,7 +49,7 @@ def check_server_status():

        response = requests.get("http://localhost:3002/health", timeout=5)
        if response.status_code == 200:
-            print(f"✅ 生產服務器運行正常 (端口 3002)")
+            print("✅ 生產服務器運行正常 (端口 3002)")
            return True
        else:
            print(f"❌ 生產服務器異常: {response.status_code}")
@@ -63,7 +62,7 @@ def check_server_status():

        response = requests.get("http://localhost:3003/health", timeout=5)
        if response.status_code == 200:
-            print(f"✅ 開發服務器運行正常 (端口 3003)")
+            print("✅ 開發服務器運行正常 (端口 3003)")
            return True
        else:
            print(f"❌ 開發服務器異常: {response.status_code}")
@@ -100,7 +99,7 @@ def check_database():
        """)

        tables = cursor.fetchall()
-        print(f"✅ 數據庫連接正常")
+        print("✅ 數據庫連接正常")
        print(f"✅ 找到 {len(tables)} 個人臉相關表:")
        for table in tables:
            print(f"  - {table[0]}")
--- a/scripts/find_kids_pose.py
+++ b/scripts/find_kids_pose.py
@@ -6,7 +6,6 @@ Heuristic: Kids have a larger head relative to their body height (approx 1:5 or

 import json
 import math
-import sys

 # Configuration
 POSE_JSON_PATH = "output/384b0ff44aaaa1f1/384b0ff44aaaa1f1.pose.json"
@@ -161,7 +160,7 @@ def find_kids():
    # Sort by timestamp
    sorted_kids = sorted(unique_kids.values(), key=lambda x: x['timestamp'])
    
-    print(f"\nUnique potential kid detections (timestamps):")
+    print("\nUnique potential kid detections (timestamps):")
    for k in sorted_kids:
        print(f"  -> Timestamp: {k['timestamp']:.2f}s | Ratio: {k['ratio']}")

--- a/scripts/find_kids_refined.py
+++ b/scripts/find_kids_refined.py
@@ -8,7 +8,6 @@ Filters:

 import json
 import math
-import sys
 import os

 POSE_JSON_PATH = "output/384b0ff44aaaa1f1/384b0ff44aaaa1f1.pose.json"
@@ -133,7 +132,7 @@ def find_kids():

    sorted_kids = sorted(unique_kids.values(), key=lambda x: x["timestamp"])

-    print(f"\nRefined Timestamps:")
+    print("\nRefined Timestamps:")
    for k in sorted_kids:
        print(
            f"  ⏱️ {k['timestamp']:.2f}s | Ratio: {k['ratio']} | Width: {k['shoulder_width']}px | Conf: {k['confidence']}"
--- a/scripts/find_magnifying_glass.py
+++ b/scripts/find_magnifying_glass.py
@@ -5,7 +5,6 @@ Search for magnifying glass in key stamp scenes using OWL-ViT

 import os
 import cv2
-import json
 from PIL import Image
 import torch
 from transformers import OwlViTProcessor, OwlViTForObjectDetection
--- a/scripts/florence2_scan_stamps.py
+++ b/scripts/florence2_scan_stamps.py
@@ -17,7 +17,7 @@ os.makedirs(OUTPUT_DIR, exist_ok=True)
 # Scan frames at 5-minute intervals throughout the 2-hour video
 TIMESTAMPS = list(range(0, 6879, 300))  # Every 5 minutes

-print(f"📽️ Loading Florence-2 model...")
+print("📽️ Loading Florence-2 model...")
 processor = AutoProcessor.from_pretrained(
    "microsoft/Florence-2-base", trust_remote_code=True
 )
--- a/scripts/generate_benchmark_summary.py
+++ b/scripts/generate_benchmark_summary.py
@@ -148,7 +148,7 @@ def generate_summary_report():
        fastest_scheme = fastest.get('file_info', {}).get('scheme_id', 'unknown')
        fastest_time = fastest.get('metrics', {}).get('processing_time_seconds', 0)
        
-        lines.append(f"### Performance Comparison")
+        lines.append("### Performance Comparison")
        lines.append("")
        lines.append(f"- **Fastest Scheme**: {fastest_scheme} ({fastest_time:.1f}s)")
        
@@ -169,7 +169,7 @@ def generate_summary_report():
        lines.append("")
    
    if failed_tests:
-        lines.append(f"### Failed Tests")
+        lines.append("### Failed Tests")
        lines.append("")
        for result in failed_tests:
            scheme_id = result.get('file_info', {}).get('scheme_id', 'unknown')
@@ -178,8 +178,8 @@ def generate_summary_report():
            
            if 'MPS' in error_msg:
                lines.append(f"- **{scheme_id} ({scheme_name})**: MPS backend compatibility issue")
-                lines.append(f"  - PyTorch SparseMPS backend does not support `_sparse_coo_tensor_with_dims_and_tensors`")
-                lines.append(f"  - OpenAI whisper requires this operation for MPS device")
+                lines.append("  - PyTorch SparseMPS backend does not support `_sparse_coo_tensor_with_dims_and_tensors`")
+                lines.append("  - OpenAI whisper requires this operation for MPS device")
        
        lines.append("")
    
--- a/scripts/generate_chunk_summaries.py
+++ b/scripts/generate_chunk_summaries.py
@@ -252,7 +252,6 @@ Summary: [2-3 sentence detailed summary connecting to scene]"""

 def parse_5w1h_summary(result_text):
    """Parse 5W1H and summary from LLM response"""
-    import re

    data = {
        "who": "",
@@ -314,7 +313,6 @@ def update_chunk_summary(
    uuid=None,
 ):
    """Update chunk summary, 5W1H, identity, and visual in database"""
-    import json

    conn = psycopg2.connect(**DB_CONFIG)
    cur = conn.cursor()
--- a/scripts/generate_parent_chunks_gemma4.py
+++ b/scripts/generate_parent_chunks_gemma4.py
@@ -203,7 +203,7 @@ def main():
        )

    # Step 3: Generate summaries and insert
-    print(f"\n🤖 Generating summaries with gemma4...")
+    print("\n🤖 Generating summaries with gemma4...")
    inserted = insert_parent_chunks(scenes)

    print(f"\n{'=' * 70}")
--- a/scripts/generate_synonyms_llamacpp.py
+++ b/scripts/generate_synonyms_llamacpp.py
@@ -100,7 +100,7 @@ def check_server_health(api_url: str) -> bool:
    except requests.exceptions.ConnectionError:
        print(f"❌ Cannot connect to llama.cpp server at {api_url}")
    except requests.exceptions.Timeout:
-        print(f"❌ Connection to llama.cpp server timed out")
+        print("❌ Connection to llama.cpp server timed out")
    return False


@@ -282,7 +282,7 @@ def main():
    # Check server health
    if not check_server_health(args.url):
        print("\n💡 Start llama.cpp server with:")
-        print(f"  llama-server --model <gemma4.gguf> --port 8081")
+        print("  llama-server --model <gemma4.gguf> --port 8081")
        sys.exit(1)

    # Prepare seed words
--- a/scripts/hybrid_stamp_search.py
+++ b/scripts/hybrid_stamp_search.py
@@ -172,7 +172,7 @@ for idx, (sec, frame) in enumerate(candidate_frames):
                    )

                    print(f"  🎯 {sec}s | {term} | {s:.2f} | {bw}x{bh}px")
-        except Exception as e:
+        except Exception:
            pass

    if found:
--- a/scripts/identity_agent.py
+++ b/scripts/identity_agent.py
@@ -20,7 +20,6 @@ import argparse
 import os
 import numpy as np
 from typing import Dict, List, Optional, Tuple
-from datetime import datetime
 from sklearn.metrics.pairwise import cosine_similarity

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
--- a/scripts/integrate_face_asrx.py
+++ b/scripts/integrate_face_asrx.py
@@ -7,7 +7,6 @@ Face + ASRX 整合處理器
 import sys
 import json
 import argparse
-import os
 from pathlib import Path
 from datetime import datetime

@@ -194,7 +193,7 @@ def integrate_face_asrx(face_path, asrx_path, output_path, time_threshold=1.0):
            f"    With face: {speaker['with_face']} ({speaker['with_face'] / speaker['segment_count'] * 100:.0f}%)"
        )

-    print(f"\n[Face-ASRX] Integration complete!")
+    print("\n[Face-ASRX] Integration complete!")


 def main():
--- a/scripts/integrated_body_action_decoder.py
+++ b/scripts/integrated_body_action_decoder.py
@@ -15,13 +15,10 @@ Output:
 - Integrated action data with all body parts
 """

-import sys
 import json
 import argparse
-import numpy as np
 from typing import Dict, List
 from collections import defaultdict
-from pathlib import Path


 class IntegratedBodyActionDecoder:
--- a/scripts/language_router.py
+++ b/scripts/language_router.py
@@ -297,10 +297,10 @@ def main():
            print(f"  檔案路徑: {result['file_path']}")
            print(f"  檔案存在: {result['file_exists']}")
            if result.get("fallback_used"):
-                print(f"  使用了回退: 是")
+                print("  使用了回退: 是")
                print(f"  回退原因: {result.get('fallback_reason', '未知')}")
            else:
-                print(f"  使用了回退: 否")
+                print("  使用了回退: 否")
            print(f"  可用語言: {', '.join(result['available_languages'])}")
        else:
            if result["file_exists"]:
--- a/scripts/lip_processor.py
+++ b/scripts/lip_processor.py
@@ -235,7 +235,7 @@ def process_lip(
                    )
                else:
                    landmarks = None
-        except Exception as e:
+        except Exception:
            landmarks = None

        if landmarks is not None and len(landmarks) >= 468:
--- a/scripts/lip_processor_cv.py
+++ b/scripts/lip_processor_cv.py
@@ -10,7 +10,6 @@ import argparse
 import os
 import signal
 import cv2
-import numpy as np

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from redis_publisher import RedisPublisher
--- a/scripts/lip_processor_simple.py
+++ b/scripts/lip_processor_simple.py
@@ -10,7 +10,6 @@ import argparse
 import os
 import signal
 import cv2
-import numpy as np

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from redis_publisher import RedisPublisher
--- a/scripts/magnifying_glass_extract.py
+++ b/scripts/magnifying_glass_extract.py
@@ -6,7 +6,6 @@ Extracts frames at 1fps around key dialogue moments for thorough analysis.

 import cv2
 import os
-import subprocess

 UUID = "384b0ff44aaaa1f1"
 VIDEO_PATH = f"output/{UUID}/{UUID}.mp4"
--- a/scripts/match_face_identity.py
+++ b/scripts/match_face_identity.py
@@ -14,7 +14,6 @@ Usage:
 import json
 import argparse
 import numpy as np
-from datetime import datetime
 import psycopg2
 import os

@@ -313,7 +312,7 @@ def analyze_match_results(results):
            print(f"  Is Match: {r['is_match']}")
            
            if r['strategy'] == 'combined':
-                print(f"  Details:")
+                print("  Details:")
                print(f"    Best Match: {r['best_match']:.4f}")
                print(f"    Vote Ratio: {r['vote_ratio']:.2%}")
                print(f"    Weighted Sim: {r['weighted_sim']:.4f}")
@@ -408,7 +407,7 @@ def main():
            print("❌ No embedding in first face")
            return
        
-        print(f"\n🔧 Matching first face...")
+        print("\n🔧 Matching first face...")
        match_result = match_face_to_identity(
            detected_embedding=embedding,
            identity_uuid=identity_uuid,
@@ -419,7 +418,7 @@ def main():
        )
        
        if match_result:
-            print(f"\n✅ Match Result:")
+            print("\n✅ Match Result:")
            print(f"  Identity: {match_result['identity_name']}")
            print(f"  Strategy: {match_result['strategy']}")
            print(f"  Is Match: {match_result['is_match']}")
--- a/scripts/match_face_with_pose_filtering.py
+++ b/scripts/match_face_with_pose_filtering.py
@@ -19,7 +19,6 @@ Usage:
 import json
 import argparse
 import numpy as np
-from datetime import datetime
 import psycopg2
 import os
 import sys
@@ -424,7 +423,7 @@ def analyze_pose_match_results(results):
        for angle, threshold in adaptive_thresholds_used.items():
            print(f"{angle}: {threshold:.2f}")
        
-        print(f"\n=== Angle Match Types ===")
+        print("\n=== Angle Match Types ===")
        print(f"{angle_match_types}")
    
    # Top 5 details
@@ -528,7 +527,7 @@ def main():
            pose_features = match_result.get("pose_features", {})
            ratio_str = f"{pose_ratio:.3f}" if pose_ratio else f"{pose_features.get('nose_to_eye_ratio', 'N/A')}"
            
-            print(f"\n✅ Result:")
+            print("\n✅ Result:")
            print(f"  Pose: {match_result['pose_angle']} (ratio: {ratio_str})")
            print(f"  Similarity: {match_result['best_similarity']:.4f}")
            print(f"  Match: {match_result['is_match']}")
--- a/scripts/mediapipe_holistic_processor.py
+++ b/scripts/mediapipe_holistic_processor.py
@@ -43,15 +43,12 @@ Output structure:
 }
 """

-import sys
 import json
 import argparse
 import cv2
 import numpy as np
 import mediapipe as mp
-from pathlib import Path
-from typing import Dict, List, Optional
-from collections import defaultdict
+from typing import Dict


 class MediaPipeHolisticProcessor:
--- a/scripts/migrate_face_results.py
+++ b/scripts/migrate_face_results.py
@@ -150,7 +150,7 @@ def migrate_results():
                migrated_count += 1
                print(f"  ✅ Migrated {total_faces} faces")
            else:
-                print(f"  ⚠️ Already exists, skipping")
+                print("  ⚠️ Already exists, skipping")

        # Commit changes
        conn.commit()
@@ -193,7 +193,7 @@ def test_api_after_migration():

        if response.status_code == 200:
            data = response.json()
-            print(f"✅ Success!")
+            print("✅ Success!")
            print(f"Video UUID: {data.get('video_uuid')}")
            print(f"Total faces: {data.get('total_faces')}")
            print(f"Processing time: {data.get('processing_time_secs')}s")
@@ -203,7 +203,7 @@ def test_api_after_migration():
            if isinstance(result_data, str):
                result_data = json.loads(result_data)

-            print(f"\n📊 Detailed results:")
+            print("\n📊 Detailed results:")
            print(f"  Frames with faces: {result_data.get('frames_with_faces')}")

            gender_dist = result_data.get("gender_distribution", {})
--- a/scripts/multi_stage_stamp_search.py
+++ b/scripts/multi_stage_stamp_search.py
@@ -9,9 +9,7 @@ Stage 3: Filter and rank results
 import os
 import cv2
 import json
-import glob
 import time
-import numpy as np
 from PIL import Image
 import torch
 from transformers import OwlViTProcessor, OwlViTForObjectDetection
@@ -123,7 +121,7 @@ for idx, (sec, frame_path) in enumerate(frames_to_process):
                            ],
                        }
                    )
-        except Exception as e:
+        except Exception:
            pass

    if not containers:
@@ -226,7 +224,7 @@ for idx, (sec, frame_path) in enumerate(frames_to_process):
                        print(
                            f"      🎯 {sec}s | {stamp_term} | {s:.2f} | {int(orig_w)}x{int(orig_h)}px"
                        )
-            except Exception as e:
+            except Exception:
                pass

 # ─── Stage 3: Filter and rank ───
--- a/scripts/music_segmentation_processor.py
+++ b/scripts/music_segmentation_processor.py
@@ -8,7 +8,6 @@ import librosa
 import numpy as np
 import os
 import json
-import matplotlib.pyplot as plt  # Only for debug if needed, but we stick to console for now

 # 設定
 UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
@@ -29,7 +28,7 @@ def analyze_music_segmentation(audio_path):
    hop_length = int(1.0 * sr)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)

-    print(f"📊 Analyzing transitions...")
+    print("📊 Analyzing transitions...")

    # 2. 計算自我相似度矩陣 (Self-Similarity Matrix) - 優化版
    # 這裡我們簡化為計算相鄰片段的餘弦距離 (Cosine Distance)
@@ -45,7 +44,6 @@ def analyze_music_segmentation(audio_path):

    # 使用 librosa 的 onset_strength 的變體，但針對 Chroma
    # 這裡手動計算 Cosine Distance 以確保準確度
-    from sklearn.metrics.pairwise import cosine_similarity

    # 為了效能，我們不逐一計算，而是使用向量化的方法
    # 計算 frame[t] 和 frame[t+lag] 的差異
@@ -127,12 +125,12 @@ if __name__ == "__main__":
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"music_segments": segments}, f, indent=2, ensure_ascii=False)

-    print(f"\n🎉 Analysis Complete!")
+    print("\n🎉 Analysis Complete!")
    print(f"✅ Identified {len(segments)} music-based scenes.")
    print(f"💾 Saved to {OUTPUT_JSON}")

    # 顯示結果
-    print(f"\n🎶 Top Music Segments:")
+    print("\n🎶 Top Music Segments:")
    for i, seg in enumerate(segments[:20]):
        m_s, s_s = divmod(seg["start_time"], 60)
        print(f"  {i + 1:02d}. [{int(m_s):02d}:{s_s:05.2f}] - {seg['duration']}s")
--- a/scripts/ocr_benchmark_runner.py
+++ b/scripts/ocr_benchmark_runner.py
@@ -173,7 +173,7 @@ def main():
        
        video_stream = next((s for s in video_info["streams"] if s["codec_type"] == "video"), None)
        
-        print(f"\n测试视频:")
+        print("\n测试视频:")
        print(f"  文件: {float(video_info['format'].get('size', 0)) / 1024 / 1024:.1f} MB")
        print(f"  时长: {float(video_info['format'].get('duration', 0)):.1f} 秒")
        print(f"  分辨率: {video_stream.get('width', 0)}x{video_stream.get('height', 0)}")
@@ -229,7 +229,7 @@ def main():
                "file_size_kb": result["file_size_kb"],
            })
            
-            print(f"\n✅ 处理完成:")
+            print("\n✅ 处理完成:")
            print(f"  时间: {result['elapsed_time']:.2f}秒")
            print(f"  内存峰值: {result['peak_memory_mb']:.1f} MB")
            print(f"  检测帧数: {result['total_frames']}")
@@ -266,7 +266,7 @@ def main():
    print(f"{'=' * 80}")
    
    print("\n【对比总结】")
-    print(f"\n| 方案 | 脚本 | 语言 | 时间(秒) | 内存(MB) | 帧数 | 文字数 | 置信度 | 空帧率 |")
+    print("\n| 方案 | 脚本 | 语言 | 时间(秒) | 内存(MB) | 帧数 | 文字数 | 置信度 | 空帧率 |")
    print("|------|------|------|---------|---------|------|--------|--------|--------|")
    
    for r in results:
--- a/scripts/ocr_processor.py
+++ b/scripts/ocr_processor.py
@@ -13,12 +13,9 @@ import sys
 import json
 import argparse
 import os
-import signal
 import time
-from datetime import datetime

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from redis_publisher import RedisPublisher
 from resume_framework import ResumeFramework, format_time, print_progress


--- a/scripts/ocr_processor_mps.py
+++ b/scripts/ocr_processor_mps.py
@@ -18,7 +18,7 @@ import os
 import signal
 import time
 from datetime import datetime
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List

 import cv2
 import numpy as np
--- a/scripts/pose_processor.py
+++ b/scripts/pose_processor.py
@@ -17,10 +17,8 @@ import json
 import argparse
 import os
 import time
-from datetime import datetime

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from redis_publisher import RedisPublisher
 from resume_framework import ResumeFramework, format_time, print_progress


--- a/scripts/pose_processor_mps.py
+++ b/scripts/pose_processor_mps.py
@@ -17,10 +17,9 @@ import os
 import signal
 import time
 from datetime import datetime
-from typing import Dict, List, Optional, Tuple
+from typing import Dict

 import cv2
-import numpy as np
 import torch
 from ultralytics import YOLO

--- a/scripts/refine_search.py
+++ b/scripts/refine_search.py
@@ -5,14 +5,13 @@ Refined Search for "Postage Stamp" in the Image

 import os
 import cv2
-import torch
 import types
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM

 UUID = "384b0ff44aaaa1f1"
 OUTPUT_DIR = f"output/{UUID}/florence2_results"
-INPUT_IMG = os.path.join(OUTPUT_DIR, f"raw_6846.jpg")
+INPUT_IMG = os.path.join(OUTPUT_DIR, "raw_6846.jpg")


 # Patch for compatibility (Required for this environment)
--- a/scripts/regenerate_parent_5w1h.py
+++ b/scripts/regenerate_parent_5w1h.py
@@ -185,7 +185,7 @@ def main():
            if update_parent_chunk(parent, analysis):
                success_count += 1
        else:
-            print(f"   ❌ Failed to generate analysis")
+            print("   ❌ Failed to generate analysis")

    print(f"\n{'=' * 70}")
    print(
--- a/scripts/register_sample_faces.py
+++ b/scripts/register_sample_faces.py
@@ -4,8 +4,6 @@ Register sample faces to test the face recognition system
 """

 import requests
-import json
-import base64
 import os

 # API configuration
--- a/scripts/resume_framework.py
+++ b/scripts/resume_framework.py
@@ -41,7 +41,7 @@ import json
 import signal
 import time
 from datetime import datetime
-from typing import Dict, Optional, Tuple, Any, Callable
+from typing import Dict, Optional, Tuple, Callable


 class ResumeFramework:
--- a/scripts/scan_keyframes.py
+++ b/scripts/scan_keyframes.py
@@ -5,7 +5,6 @@ Scan Multiple Frames for Stamps

 import os
 import cv2
-import torch
 import types
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
--- a/scripts/scan_keyframes_opencv.py
+++ b/scripts/scan_keyframes_opencv.py
@@ -6,7 +6,6 @@ Batch Scan Keyframes for SMALL red stamps
 import cv2
 import numpy as np
 import os
-import json

 UUID = "384b0ff44aaaa1f1"
 BASE_DIR = f"output/{UUID}/florence2_results"
@@ -93,4 +92,4 @@ for frame_name in FRAMES:
        res_name = f"result_opencv_{frame_name}"
        cv2.imwrite(os.path.join(BASE_DIR, res_name), img)
    else:
-        print(f"   ❌ No small stamps found.")
+        print("   ❌ No small stamps found.")
--- a/scripts/scene_classifier.py
+++ b/scripts/scene_classifier.py
@@ -230,7 +230,7 @@ class SceneClassifier:
                    print("[SCENE] Places365 model loaded successfully (365 classes)")
                else:
                    print(
-                        f"[SCENE] Places365 model not found, using ImageNet pretrained"
+                        "[SCENE] Places365 model not found, using ImageNet pretrained"
                    )
                    self.model = models.resnet18(pretrained=True)
                    self.model_type = "imagenet"
--- a/scripts/search_blue_stamp.py
+++ b/scripts/search_blue_stamp.py
@@ -85,7 +85,7 @@ for frame_name in FRAMES:
                cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 3)
                cv2.putText(
                    img,
-                    f"BLUE STAMP?",
+                    "BLUE STAMP?",
                    (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6,
--- a/scripts/search_envelope.py
+++ b/scripts/search_envelope.py
@@ -5,7 +5,6 @@ Search for Envelope/Stamp in Keyframes

 import os
 import cv2
-import torch
 import types
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM
--- a/scripts/search_vase.py
+++ b/scripts/search_vase.py
@@ -5,7 +5,6 @@ Search for "vase" in the video using OWL-ViT on a subset of frames.

 import os
 import cv2
-import json
 import glob
 from PIL import Image
 import torch
--- a/scripts/select_face_reference_vectors.py
+++ b/scripts/select_face_reference_vectors.py
@@ -15,7 +15,6 @@ Usage:
 import json
 import argparse
 import numpy as np
-from pathlib import Path
 from datetime import datetime
 import psycopg2
 import os
@@ -313,10 +312,10 @@ def main():
        )
        
        if uuid:
-            print(f"\n🎉 Registration completed!")
+            print("\n🎉 Registration completed!")
    else:
-        print(f"\n📊 Analysis only (no registration)")
-        print(f"   To register, run with --register flag")
+        print("\n📊 Analysis only (no registration)")
+        print("   To register, run with --register flag")


 if __name__ == "__main__":
--- a/scripts/select_face_reference_vectors_v2.py
+++ b/scripts/select_face_reference_vectors_v2.py
@@ -443,7 +443,7 @@ def main():
    print_selection_report(angle_groups, selected, coverage_report)
    
    if not args.report_only and args.register and args.identity_name:
-        print(f"\n🔧 Step 5: Registering Identity...")
+        print("\n🔧 Step 5: Registering Identity...")
        
        reference_data = build_reference_data_structure(selected, args.video_uuid)
        
@@ -454,14 +454,14 @@ def main():
        )
        
        if uuid:
-            print(f"\n✅ Registration completed!")
+            print("\n✅ Registration completed!")
            print(f"   UUID: {uuid}")
            print(f"   Name: {args.identity_name}")
            print(f"   Angles: {coverage_report['angles_covered']}")
            print(f"   Total vectors: {coverage_report['total_references']}")
            print(f"   Quality avg: {coverage_report['quality_avg']:.2f}")
    elif args.report_only:
-        print(f"\n📊 Report only (no registration)")
+        print("\n📊 Report only (no registration)")


 if __name__ == "__main__":
--- a/scripts/select_face_reference_vectors_v3.py
+++ b/scripts/select_face_reference_vectors_v3.py
@@ -329,7 +329,7 @@ def main():
        print("Please run face_tracker.py first")
        return
    
-    print(f"\n=== Available Traces ===")
+    print("\n=== Available Traces ===")
    for trace_id_str, trace in sorted(traces.items(), key=lambda x: int(x[0])):
        print(f"Trace {trace_id_str}:")
        print(f"  Frames: {trace['start_frame']}-{trace['end_frame']} ({trace['duration_frames']} frames)")
@@ -364,7 +364,7 @@ def main():
    # Filter faces by trace
    filtered_face_data = filter_faces_by_trace(face_data, trace_id_filter)
    
-    print(f"\n=== Filtering Faces ===")
+    print("\n=== Filtering Faces ===")
    print(f"Original frames: {len(face_data.get('frames', {}))}")
    print(f"Filtered frames: {len(filtered_face_data.get('frames', {}))}")
    
@@ -379,7 +379,7 @@ def main():
        print("❌ No reference vectors selected")
        return
    
-    print(f"\n=== Selected Reference Vectors ===")
+    print("\n=== Selected Reference Vectors ===")
    print(f"Total: {len(selected_vectors)}")
    
    angle_distribution = defaultdict(int)
@@ -390,7 +390,7 @@ def main():
    print(f"Distribution: {dict(angle_distribution)}")
    print(f"Quality avg: {np.mean([v['quality_score'] for v in selected_vectors]):.3f}")
    
-    print(f"\n=== Vector Details ===")
+    print("\n=== Vector Details ===")
    for i, v in enumerate(selected_vectors[:10]):
        print(f"Vector {i+1}:")
        print(f"  Angle: {v['pose_angle']} (confidence: {v['pose_confidence']:.2f})")
@@ -404,7 +404,7 @@ def main():
        return
    
    if args.register and args.identity_name:
-        print(f"\n=== Registering Identity ===")
+        print("\n=== Registering Identity ===")
        
        identity_uuid = register_identity_with_trace(
            identity_name=args.identity_name,
@@ -416,7 +416,7 @@ def main():
        )
        
        if identity_uuid:
-            print(f"\n✅ Registration completed!")
+            print("\n✅ Registration completed!")
            print(f"   UUID: {identity_uuid}")
            print(f"   Name: {args.identity_name}")
            print(f"   Trace ID: {trace_id_filter}")
--- a/scripts/simple_api_test.py
+++ b/scripts/simple_api_test.py
@@ -34,7 +34,7 @@ def test_endpoint(endpoint, method="GET", data=None):
        print(f"Headers: {dict(response.headers)}")

        if response.status_code == 200:
-            print(f"✅ Success!")
+            print("✅ Success!")
            if response.text:
                print(f"Response (first 500 chars): {response.text[:500]}")
            return True
--- a/scripts/simple_face_stats.py
+++ b/scripts/simple_face_stats.py
@@ -93,14 +93,14 @@ def main():
        # 直接回答問題
        print("📝 問題回答:")
        print("-" * 40)
-        print(f"Q: 這兩個影片內有幾個人?")
+        print("Q: 這兩個影片內有幾個人?")
        print(f"A: 總共檢測到 {total_faces} 個人臉")
        print()
-        print(f"Q: 幾男幾女?")
+        print("Q: 幾男幾女?")
        print(f"A: 男性 {male_count} 人 ({male_count / total_faces * 100:.1f}%)")
        print(f"   女性 {female_count} 人 ({female_count / total_faces * 100:.1f}%)")
        print()
-        print(f"Q: 平均年齡?")
+        print("Q: 平均年齡?")
        print(f"A: 平均 {avg_age} 歲 (範圍: {min_age}-{max_age}歲)")
        print()
        print("=" * 60)
--- a/scripts/sound_event_detector.py
+++ b/scripts/sound_event_detector.py
@@ -26,7 +26,7 @@ def detect_impulse_sounds(audio_path, threshold_multiplier=1.5):
    # 載入音頻 (Mono, 22050Hz)
    y, sr = librosa.load(audio_path, sr=22050)

-    print(f"📊 Analyzing energy envelope...")
+    print("📊 Analyzing energy envelope...")
    # 1. 計算 RMS 能量 (以 0.05秒 為一幀)
    frame_length = int(0.05 * sr)
    hop_length = int(0.02 * sr)
--- a/scripts/specific_stamp_search.py
+++ b/scripts/specific_stamp_search.py
@@ -5,14 +5,13 @@ Search for Specific Stamps in the Image (Avoiding Watermark)

 import os
 import cv2
-import torch
 import types
 from PIL import Image
 from transformers import AutoProcessor, AutoModelForCausalLM

 UUID = "384b0ff44aaaa1f1"
 OUTPUT_DIR = f"output/{UUID}/florence2_results"
-INPUT_IMG = os.path.join(OUTPUT_DIR, f"raw_6846.jpg")
+INPUT_IMG = os.path.join(OUTPUT_DIR, "raw_6846.jpg")


 # Patch for compatibility
--- a/scripts/swift_processors/.build/.lock
+++ b/scripts/swift_processors/.build/.lock
@@ -0,0 +1 @@
+7861
--- a/Show More
+++ b/Show More