- Re-ran identity_bind.py to restore identity_id on face_detections
- Dedup cleanup had removed rows with identity_id, kept NULL rows
- 70691 face_detections now have identity_id, 428 identities
- Full package rebuild: 169MB sqlite, 1358MB tar.gz
- identities.json: 428 identities + 5483 bindings + 5483 trace maps
- TMDB matching complete: Audrey Hepburn 843 traces, Cary Grant 482
72 lines
2.7 KiB
Python
72 lines
2.7 KiB
Python
#!/opt/homebrew/bin/python3.11
|
||
"""
|
||
Minimal test: run faster-whisper on full video, output each segment's text.
|
||
No VAD tuning, no speaker detection, no splitting. Just raw ASR output.
|
||
"""
|
||
import json, os, sys, time, subprocess, tempfile, shutil
|
||
import torchaudio
|
||
from faster_whisper import WhisperModel
|
||
|
||
def extract_audio(video_path, tmp_dir, sr=16000):
    """Decode the audio track of *video_path* into a mono 16-bit WAV file.

    ffmpeg writes ``audio.wav`` inside *tmp_dir*, resampled to *sr* Hz and
    down-mixed to one channel; the result is then loaded with
    ``torchaudio.load``.

    Args:
        video_path: Path of the input media file handed to ffmpeg.
        tmp_dir: Existing directory in which ``audio.wav`` is created.
        sr: Target sample rate in Hz passed to ffmpeg's ``-ar`` option.

    Returns:
        A ``(wav_path, wav_data, sr_actual)`` tuple: the path of the WAV
        file, the loaded waveform (averaged over its first dimension when
        that dimension is larger than 1) and the sample rate reported by
        ``torchaudio.load``.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg did not finish within 300 seconds.
    """
    wav_path = os.path.join(tmp_dir, "audio.wav")

    # "-y" overwrites an existing file; "-v quiet" silences ffmpeg's logging.
    ffmpeg_cmd = [
        "ffmpeg", "-y", "-v", "quiet",
        "-i", video_path,
        "-ar", str(sr),
        "-ac", "1",
        "-sample_fmt", "s16",
        wav_path,
    ]
    subprocess.run(ffmpeg_cmd, check=True, capture_output=True, timeout=300)

    waveform, sample_rate = torchaudio.load(wav_path)
    # Safety net: collapse to a single channel if the file still has several.
    num_channels = waveform.shape[0]
    if num_channels > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    return wav_path, waveform, sample_rate
|
||
|
||
def main():
    """Transcribe the hard-coded demo video and dump one line per segment.

    Steps: load the faster-whisper ``small`` model (int8, CPU), extract the
    audio track with ``extract_audio`` into a temporary directory, transcribe
    it with ``vad_filter=False`` and write a tab-separated text file with the
    columns ``start, end, duration, word_count, text``. Progress and summary
    statistics are printed to stdout.

    The temporary directory is removed even when extraction, transcription
    or writing the output fails.
    """
    video = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn | Comedy Mystery Romance Thriller | Full Movie.mp4"
    output = "/Users/accusys/momentry/output_dev/segment_texts.txt"

    t0 = time.time()

    # Load model
    print("Loading faster-whisper small int8 CPU...")
    model = WhisperModel("small", device="cpu", compute_type="int8")
    print(f"Model loaded ({time.time()-t0:.0f}s)")

    # Extract audio
    print("Extracting audio...")
    tmp_dir = tempfile.mkdtemp(prefix="asr_test_")
    try:
        wav_path, wav_data, sr = extract_audio(video, tmp_dir)
        total_audio_s = wav_data.shape[1] / sr
        print(f"Audio: {total_audio_s:.0f}s, {sr}Hz ({time.time()-t0:.0f}s)")

        # Transcribe - NO VAD filter, let the model segment naturally
        print("Transcribing (vad_filter=False)...")
        segments, info = model.transcribe(wav_path, beam_size=5,
                                          vad_filter=False, word_timestamps=True)
        print(f"  Detected language: {info.language} (prob: {info.language_probability:.2f})")
        # NOTE(review): with vad_filter=False this presumably equals the full
        # audio duration -- confirm against the faster-whisper version in use.
        print(f"  Duration after VAD: {info.duration_after_vad:.1f}s")

        # Write each segment to file.  faster-whisper returns the segments as
        # a lazy generator, so the decoding work happens inside this loop and
        # the WAV file must still exist while it runs.
        count = 0
        total_words = 0
        total_dur = 0
        # Explicit UTF-8: transcripts may contain non-ASCII characters and the
        # platform default encoding is not guaranteed to handle them.
        with open(output, "w", encoding="utf-8") as f:
            for seg in segments:
                text = seg.text.strip()
                dur = seg.end - seg.start
                words = len(seg.words) if seg.words else 0
                f.write(f"{seg.start:.2f}\t{seg.end:.2f}\t{dur:.2f}\t{words}\t{text}\n")
                count += 1
                total_words += words
                total_dur += dur

        elapsed = time.time() - t0
        print("\n=== Results ===")
        print(f"Segments: {count}")
        print(f"Words: {total_words}")
        print(f"Speech duration: {total_dur:.0f}s")
        if count:
            print(f"Avg segment: {total_dur/count:.1f}s, {total_words/count:.1f} words")
        else:
            # No segments (e.g. silent audio): avoid dividing by zero.
            print("Avg segment: n/a (no segments)")
        print(f"Elapsed: {elapsed:.0f}s ({elapsed/60:.1f}min)")
        print(f"Output: {output}")
    finally:
        # Always drop the extracted WAV, including on failure.
        shutil.rmtree(tmp_dir, ignore_errors=True)
|
||
|
||
# Run the transcription only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|