fix: restore identity_id after face_dedup, rebuild package v20260512

- Re-ran identity_bind.py to restore identity_id on face_detections
- Dedup cleanup had removed rows with identity_id, kept NULL rows
- 70691 face_detections now have identity_id, 428 identities
- Full package rebuild: 169MB sqlite, 1358MB tar.gz
- identities.json: 428 identities + 5483 bindings + 5483 trace maps
- TMDB matching complete: Audrey Hepburn 843 traces, Cary Grant 482
2026-05-13 04:30:18 +08:00
parent fff2af8ad1
commit 48c3b13c37
837 changed files with 33273 additions and 5473 deletions
--- a/scripts/test_segment_count.py
+++ b/scripts/test_segment_count.py
@@ -0,0 +1,71 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Minimal test: run faster-whisper on full video, output each segment's text.
+No VAD tuning, no speaker detection, no splitting. Just raw ASR output.
+"""
+import json, os, sys, time, subprocess, tempfile, shutil
+import torchaudio
+from faster_whisper import WhisperModel
+
+def extract_audio(video_path, tmp_dir, sr=16000):
+    """Decode a video's audio track to a mono 16-bit WAV and load it.
+
+    ffmpeg writes ``audio.wav`` into ``tmp_dir``, resampled to ``sr`` Hz with
+    a single channel; the file is then read back with torchaudio.
+
+    Returns a ``(wav_path, waveform, sample_rate)`` tuple, where
+    ``sample_rate`` is the rate torchaudio reports for the written file.
+
+    Raises ``subprocess.CalledProcessError`` if ffmpeg exits non-zero and
+    ``subprocess.TimeoutExpired`` if it runs longer than 300 seconds.
+    """
+    target = os.path.join(tmp_dir, "audio.wav")
+    ffmpeg_cmd = [
+        "ffmpeg", "-y", "-v", "quiet",
+        "-i", video_path,
+        "-ar", str(sr),
+        "-ac", "1",
+        "-sample_fmt", "s16",
+        target,
+    ]
+    subprocess.run(ffmpeg_cmd, check=True, capture_output=True, timeout=300)
+
+    waveform, loaded_sr = torchaudio.load(target)
+    # Defensive downmix: fold a multi-channel result into one channel.
+    channels = waveform.shape[0]
+    if channels > 1:
+        waveform = waveform.mean(dim=0, keepdim=True)
+    return target, waveform, loaded_sr
+
+def main():
+    """Transcribe the demo video with faster-whisper and dump every segment.
+
+    Writes one tab-separated line per segment (start, end, duration, word
+    count, text) to the hard-coded output path, then prints summary
+    statistics. The temporary directory holding the extracted WAV is removed
+    even when extraction, transcription or writing fails.
+    """
+    video = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn ｜ Comedy Mystery Romance Thriller ｜ Full Movie.mp4"
+    output = "/Users/accusys/momentry/output_dev/segment_texts.txt"
+
+    t0 = time.time()
+
+    # Load model
+    print("Loading faster-whisper small int8 CPU...")
+    model = WhisperModel("small", device="cpu", compute_type="int8")
+    print(f"Model loaded ({time.time()-t0:.0f}s)")
+
+    # Extract audio
+    print("Extracting audio...")
+    tmp_dir = tempfile.mkdtemp(prefix="asr_test_")
+    count = 0
+    total_words = 0
+    total_dur = 0
+    try:
+        wav_path, wav_data, sr = extract_audio(video, tmp_dir)
+        total_audio_s = wav_data.shape[1] / sr
+        print(f"Audio: {total_audio_s:.0f}s, {sr}Hz ({time.time()-t0:.0f}s)")
+        # The waveform was only needed to compute the duration; release it
+        # before the long-running transcription starts.
+        del wav_data
+
+        # Transcribe - NO VAD filter, let the model segment naturally
+        print("Transcribing (vad_filter=False)...")
+        segments, info = model.transcribe(wav_path, beam_size=5,
+            vad_filter=False, word_timestamps=True)
+        print(f"  Detected language: {info.language} (prob: {info.language_probability:.2f})")
+        print(f"  Duration after VAD: {info.duration_after_vad:.1f}s")
+
+        # Write each segment to file. `segments` is consumed lazily, so the
+        # WAV file must still exist while this loop runs; cleanup therefore
+        # happens only in the `finally` below.
+        with open(output, "w", encoding="utf-8") as f:
+            for seg in segments:
+                text = seg.text.strip()
+                dur = seg.end - seg.start
+                words = len(seg.words) if seg.words else 0
+                f.write(f"{seg.start:.2f}\t{seg.end:.2f}\t{dur:.2f}\t{words}\t{text}\n")
+                count += 1
+                total_words += words
+                total_dur += dur
+    finally:
+        # Always remove the temporary WAV directory, including on failure.
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+
+    elapsed = time.time() - t0
+    print("\n=== Results ===")
+    print(f"Segments: {count}")
+    print(f"Words: {total_words}")
+    print(f"Speech duration: {total_dur:.0f}s")
+    if count:
+        print(f"Avg segment: {total_dur/count:.1f}s, {total_words/count:.1f} words")
+    else:
+        # Avoid ZeroDivisionError when the model produced no segments.
+        print("Avg segment: n/a (no segments)")
+    print(f"Elapsed: {elapsed:.0f}s ({elapsed/60:.1f}min)")
+    print(f"Output: {output}")
+
+# Script entry point: run the ASR test only when executed directly, not on import.
+if __name__ == "__main__":
+    main()