# momentry_core/scripts/compare_segmentation.py

#!/opt/homebrew/bin/python3.11
"""
POC: Compare silence-based segmentation vs CUT-based segmentation for ASR.

Tests a short video segment and reports:
1. Number of segments from each method
2. Segment boundaries
3. ASR output comparison (ASR segment count, total characters, processing time)
"""
import json
import os
import sys
import subprocess
import tempfile
import time
from faster_whisper import WhisperModel

# Video under test: the first command-line argument when one is given,
# otherwise a developer-local sample file.
if len(sys.argv) > 1:
    VIDEO_PATH = sys.argv[1]
else:
    VIDEO_PATH = "/Users/accusys/test_video/Old_Time_Movie_Show_-_Charade_1963.HD.mov"

# Length of the test window in seconds (first 5 minutes only).
DURATION = 300

# Whisper model shared by every transcribe() call; created once at start-up.
model = WhisperModel("small", device="cpu", compute_type="int8")

def extract_audio_segment(start, end, out_wav):
    """Cut one time range of VIDEO_PATH's audio into a 16 kHz mono WAV file.

    Args:
        start: Range start, in seconds from the beginning of the video.
        end: Range end, in seconds from the beginning of the video.
        out_wav: Destination path; ffmpeg is told to overwrite it (``-y``).

    Returns:
        True when a file larger than 100 bytes exists at ``out_wav`` after
        ffmpeg has run, False when the file is missing or that small.
    """
    cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", VIDEO_PATH,
           "-ss", str(start), "-to", str(end),
           "-ar", "16000", "-ac", "1", out_wav]
    # Best effort: ffmpeg's exit status is ignored and success is judged
    # from the output file alone.
    subprocess.run(cmd, check=False, capture_output=True)
    try:
        return os.path.getsize(out_wav) > 100
    except OSError:
        # ffmpeg wrote nothing at all (e.g. unreadable input or an empty
        # range); report failure so callers can skip this segment instead
        # of crashing on the missing file.
        return False

def transcribe(wav_path):
    """Transcribe one WAV file with the module-level Whisper model.

    Runs with beam size 5 and the model's VAD filter enabled
    (min_silence_duration_ms=500, speech_pad_ms=200).

    Returns:
        A ``(segments, info)`` tuple: ``segments`` is the first value returned
        by ``model.transcribe`` materialised into a list, ``info`` is the
        second value passed through unchanged.
    """
    vad_options = {"min_silence_duration_ms": 500, "speech_pad_ms": 200}
    segment_iter, info = model.transcribe(
        wav_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters=vad_options,
    )
    return list(segment_iter), info

# === Method 1: CUT-based segmentation ===
# Reads scene boundaries from a cut-detection JSON file, extracts each scene's
# audio and transcribes it. Results are left in cut_scenes, cut_segments,
# total_chars and cut_time for the comparison table printed later.
print("=" * 60)
print("METHOD 1: CUT-based segmentation")
print("=" * 60)
# Scene-cut JSON: an optional second command-line argument overrides the
# developer-local default, mirroring how VIDEO_PATH is taken from the first.
cut_path = (
    sys.argv[2]
    if len(sys.argv) > 2
    else "/Users/accusys/momentry/output_dev/417a7e93860d70c87aee6c4c1b715d70.cut.json"
)
cut_scenes = []
if os.path.exists(cut_path):
    with open(cut_path, encoding="utf-8") as f:
        data = json.load(f)
    for scene in data.get("scenes", []):
        scene_start = scene["start_time"]
        if scene_start >= DURATION:
            continue
        # Clamp a scene that straddles the end of the test window so both
        # methods transcribe exactly the same span of audio.
        scene_end = min(scene["end_time"], DURATION)
        if scene_end > scene_start:
            cut_scenes.append((scene_start, scene_end))
    print(f"  Scenes in first {DURATION}s: {len(cut_scenes)}")
else:
    # Without this message a missing file is indistinguishable from a video
    # that has no scenes in the window.
    print(f"  WARNING: cut file not found: {cut_path}")

# Scratch directory for every WAV written by this script; it is removed in the
# cleanup step at the end of the file.
tmpdir = tempfile.mkdtemp(prefix="seg_compare_")
t1 = time.time()
cut_segments = []
total_chars = 0
for idx, (st, et) in enumerate(cut_scenes):
    wav = os.path.join(tmpdir, f"cut_{idx:04d}.wav")
    if not extract_audio_segment(st, et, wav):
        continue
    segs, info = transcribe(wav)
    for s in segs:
        # ASR timestamps are relative to the extracted clip; shift them back
        # onto the video timeline.
        cut_segments.append({"start": st + s.start, "end": st + s.end, "text": s.text})
        total_chars += len(s.text)
cut_time = time.time() - t1
print(f"  Segments: {len(cut_segments)}, Total chars: {total_chars}, Time: {cut_time:.1f}s")
if cut_segments:
    # Mean length of the ASR segments themselves (not window / count).
    avg_duration = sum(seg["end"] - seg["start"] for seg in cut_segments) / len(cut_segments)
    print(f"  Avg segment duration: {avg_duration:.1f}s")

# === Method 2: Silence-based segmentation (ffmpeg silencedetect) ===
# Extracts the whole test window once, asks ffmpeg where the silences are,
# treats the gaps between silences as speech and transcribes each gap.
# Results are left in speech_segments, silence_segments, total_chars2 and
# silence_time for the comparison table printed later.
print()
print("=" * 60)
print("METHOD 2: Silence-based segmentation (ffmpeg silencedetect)")
print("=" * 60)

# Extract full 5min audio
full_wav = os.path.join(tmpdir, "full_audio.wav")
full_audio_ok = extract_audio_segment(0, DURATION, full_wav)
if not full_audio_ok:
    # Without audio there is nothing to detect; continuing would turn the
    # whole window into one bogus "speech" segment.
    print(f"  WARNING: could not extract audio from {VIDEO_PATH}; skipping silence detection")

# Use ffmpeg silencedetect to find silence periods (reported on stderr)
t2 = time.time()
stderr = ""
if full_audio_ok:
    # -hide_banner/-nostats keep banner and progress output out of the text
    # parsed below; errors="replace" stops undecodable bytes from aborting.
    detect_cmd = ["ffmpeg", "-hide_banner", "-nostats", "-i", full_wav,
                  "-af", "silencedetect=noise=-30dB:d=0.5", "-f", "null", "-"]
    result = subprocess.run(detect_cmd, capture_output=True, text=True, errors="replace")
    stderr = result.stderr

# Parse silencedetect output
silence_starts = []
silence_ends = []
for line in stderr.split("\n"):
    if "silence_start:" in line:
        silence_starts.append(float(line.split("silence_start:")[1].strip()))
    elif "silence_end:" in line:
        # The line continues with "| silence_duration: ..."; keep only the end time.
        silence_ends.append(float(line.split("silence_end:")[1].split("|")[0].strip()))

# A silence that is still open when the input ends can be reported without a
# matching silence_end (NOTE(review): seen with some ffmpeg builds - confirm).
# Close it at the end of the window so it is not mistaken for speech.
if len(silence_starts) > len(silence_ends):
    silence_ends.append(float(DURATION))

# Build speech segments: gaps between silence periods
speech_segments = []
last_end = 0.0
for ss, se in zip(silence_starts, silence_ends):
    # Gaps of 0.5s or less between two silences are dropped.
    if ss > last_end + 0.5:
        speech_segments.append((last_end, ss))
    last_end = se
if full_audio_ok and last_end < DURATION:
    # Whatever follows the last silence, up to the end of the window.
    speech_segments.append((last_end, DURATION))

print(f"  Silence periods detected: {len(silence_starts)}")
print(f"  Speech segments: {len(speech_segments)}")

# Transcribe each speech segment
silence_segments = []
total_chars2 = 0
for idx, (st, et) in enumerate(speech_segments):
    wav = os.path.join(tmpdir, f"sil_{idx:04d}.wav")
    if not extract_audio_segment(st, et, wav):
        continue
    segs, info = transcribe(wav)
    for s in segs:
        # ASR timestamps are relative to the extracted clip; shift them back
        # onto the video timeline.
        silence_segments.append({"start": st + s.start, "end": st + s.end, "text": s.text})
        total_chars2 += len(s.text)
silence_time = time.time() - t2
print(f"  Segments: {len(silence_segments)}, Total chars: {total_chars2}, Time: {silence_time:.1f}s")

# === Comparison ===
# Side-by-side table of the two methods, built from the counters the two
# method sections left behind.
banner = "=" * 60
print()
print(banner)
print("COMPARISON")
print(banner)
print(f"{'Metric':<30} {'CUT-based':<15} {'Silence-based':<15}")
print("-" * 60)

# (label, CUT-based value, silence-based value, extra format spec)
table_rows = [
    ("Number of audio segments", len(cut_scenes), len(speech_segments), ""),
    ("Number of ASR segments", len(cut_segments), len(silence_segments), ""),
    ("Total chars recognized", total_chars, total_chars2, ""),
    ("Processing time (s)", cut_time, silence_time, ".1f"),
]
for label, cut_value, silence_value, spec in table_rows:
    print(f"{label:<30} {cut_value:<15{spec}} {silence_value:<15{spec}}")

# Cleanup: remove the scratch directory holding every extracted WAV.
import shutil
shutil.rmtree(tmpdir, ignore_errors=True)
print()
print("Done.")