Files
momentry_core/scripts/test_segment_count.py
Accusys 48c3b13c37 fix: restore identity_id after face_dedup, rebuild package v20260512
- Re-ran identity_bind.py to restore identity_id on face_detections
- Dedup cleanup had removed rows with identity_id, kept NULL rows
- 70691 face_detections now have identity_id, 428 identities
- Full package rebuild: 169MB sqlite, 1358MB tar.gz
- identities.json: 428 identities + 5483 bindings + 5483 trace maps
- TMDB matching complete: Audrey Hepburn 843 traces, Cary Grant 482
2026-05-13 04:30:18 +08:00

72 lines
2.7 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/opt/homebrew/bin/python3.11
"""
Minimal test: run faster-whisper on full video, output each segment's text.
No VAD tuning, no speaker detection, no splitting. Just raw ASR output.
"""
import json, os, sys, time, subprocess, tempfile, shutil
import torchaudio
from faster_whisper import WhisperModel
def extract_audio(video_path, tmp_dir, sr=16000):
    """Decode the audio of *video_path* to a WAV file and load it as a tensor.

    ffmpeg is invoked to write ``audio.wav`` inside ``tmp_dir``, resampled to
    ``sr`` Hz, one channel, signed 16-bit samples. The file is then read back
    with ``torchaudio.load``.

    Args:
        video_path: Path of the media file handed to ffmpeg via ``-i``.
        tmp_dir: Existing directory that receives ``audio.wav``.
        sr: Sample rate requested from ffmpeg (``-ar``).

    Returns:
        A ``(wav_path, wav_data, sr_actual)`` tuple: the path of the written
        WAV file, the tensor returned by ``torchaudio.load`` and the sample
        rate reported by ``torchaudio.load``.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg ran longer than 300 seconds.
    """
    target_wav = os.path.join(tmp_dir, "audio.wav")
    ffmpeg_cmd = [
        "ffmpeg", "-y", "-v", "quiet",
        "-i", video_path,
        "-ar", str(sr),
        "-ac", "1",
        "-sample_fmt", "s16",
        target_wav,
    ]
    subprocess.run(ffmpeg_cmd, check=True, capture_output=True, timeout=300)

    waveform, loaded_sr = torchaudio.load(target_wav)
    channel_count = waveform.shape[0]
    if channel_count > 1:
        # Safety net: collapse to a single channel by averaging, in case the
        # loaded tensor still has more than one channel despite "-ac 1".
        waveform = waveform.mean(dim=0, keepdim=True)
    return target_wav, waveform, loaded_sr
def main(video=None, output=None):
    """Transcribe one video with faster-whisper and dump every segment to a file.

    The audio track is extracted to a temporary WAV file, transcribed with the
    "small" model (int8, CPU, beam size 5, VAD filter off, word timestamps on)
    and each segment is written as one tab-separated line:
    ``start  end  duration  word_count  text``. A short summary is printed at
    the end.

    Args:
        video: Path of the video to transcribe. Defaults to the Charade demo
            file used by the original script.
        output: Path of the text file to write. Defaults to
            ``output_dev/segment_texts.txt`` under the momentry checkout.

    Raises:
        subprocess.CalledProcessError, subprocess.TimeoutExpired: propagated
            from ``extract_audio`` when ffmpeg fails or times out.
        OSError: the output file cannot be opened for writing.
    """
    if video is None:
        video = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn Comedy Mystery Romance Thriller Full Movie.mp4"
    if output is None:
        output = "/Users/accusys/momentry/output_dev/segment_texts.txt"

    t0 = time.time()

    # Load model
    print("Loading faster-whisper small int8 CPU...")
    model = WhisperModel("small", device="cpu", compute_type="int8")
    print(f"Model loaded ({time.time()-t0:.0f}s)")

    count = 0
    total_words = 0
    total_dur = 0

    tmp_dir = tempfile.mkdtemp(prefix="asr_test_")
    try:
        # Extract audio
        print("Extracting audio...")
        wav_path, wav_data, sr = extract_audio(video, tmp_dir)
        total_audio_s = wav_data.shape[1] / sr
        print(f"Audio: {total_audio_s:.0f}s, {sr}Hz ({time.time()-t0:.0f}s)")

        # Transcribe - NO VAD filter, let the model segment naturally
        print("Transcribing (vad_filter=False)...")
        segments, info = model.transcribe(wav_path, beam_size=5,
                                          vad_filter=False, word_timestamps=True)
        print(f" Detected language: {info.language} (prob: {info.language_probability:.2f})")
        print(f" Duration after VAD: {info.duration_after_vad:.1f}s")

        # Write each segment to file. Explicit UTF-8 so non-ASCII transcript
        # text never depends on the locale's default encoding.
        # NOTE(review): `segments` is iterated inside the try block so the
        # temporary WAV still exists while transcription is consumed.
        with open(output, "w", encoding="utf-8") as f:
            for seg in segments:
                text = seg.text.strip()
                dur = seg.end - seg.start
                words = len(seg.words) if seg.words else 0
                f.write(f"{seg.start:.2f}\t{seg.end:.2f}\t{dur:.2f}\t{words}\t{text}\n")
                count += 1
                total_words += words
                total_dur += dur
        elapsed = time.time() - t0
    finally:
        # Always remove the scratch directory, even when extraction or
        # transcription raises; mkdtemp never cleans up after itself.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    print("\n=== Results ===")
    print(f"Segments: {count}")
    print(f"Words: {total_words}")
    print(f"Speech duration: {total_dur:.0f}s")
    if count:
        print(f"Avg segment: {total_dur/count:.1f}s, {total_words/count:.1f} words")
    else:
        # No segments (e.g. silent input): avoid dividing by zero.
        print("Avg segment: n/a (no segments)")
    print(f"Elapsed: {elapsed:.0f}s ({elapsed/60:.1f}min)")
    print(f"Output: {output}")
# Script entry point: run the transcription only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()