- Remove session-ses_2f27.md (161KB raw session log) - Remove 49 ROOT_* duplicate files across REFERENCE/ - Remove 14 duplicate files between REFERENCE/ root and history/ - Remove asr_legacy.rs (dead code, replaced by asr.rs) - Remove src/core/worker/ (duplicate JobWorker) - Remove src/core/layers/ (empty directory) - Remove 4 .bak files in src/ - Remove 7 dead private methods in worker/processor.rs - Remove backup directory from git tracking
132 lines
4.6 KiB
Python
132 lines
4.6 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
POC: Compare silence-based segmentation vs CUT-based segmentation for ASR.
|
|
|
|
Tests a short video segment and reports:
|
|
1. Number of segments from each method
|
|
2. Segment boundaries
|
|
3. ASR output comparison (segment counts, recognized characters, processing time; no WER is computed)
|
|
"""
|
|
import json
|
|
import os
|
|
import sys
|
|
import subprocess
|
|
import tempfile
|
|
import time
|
|
from faster_whisper import WhisperModel
|
|
|
|
# Input video: the first command-line argument when one is given, otherwise
# the local sample file.
if len(sys.argv) > 1:
    VIDEO_PATH = sys.argv[1]
else:
    VIDEO_PATH = "/Users/accusys/test_video/Old_Time_Movie_Show_-_Charade_1963.HD.mov"

# Length, in seconds, of the leading part of the video that is examined.
DURATION = 300  # Test first 5 minutes only

# Whisper model used by transcribe(); loaded once when the script starts.
model = WhisperModel("small", compute_type="int8", device="cpu")
|
|
|
|
def extract_audio_segment(start, end, out_wav):
    """Cut one time range of VIDEO_PATH's audio into a 16 kHz mono WAV file.

    Args:
        start: Range start, in seconds from the beginning of the video.
        end: Range end, in seconds from the beginning of the video.
        out_wav: Destination path; an existing file is overwritten (``-y``).

    Returns:
        True when the output file exists and is larger than 100 bytes,
        False otherwise (including when ffmpeg produced no file at all).
    """
    cmd = [
        "ffmpeg", "-y", "-v", "quiet", "-i", VIDEO_PATH,
        "-ss", str(start), "-to", str(end),
        "-ar", "16000", "-ac", "1", out_wav,
    ]
    # ffmpeg's exit status is deliberately not enforced; the size of the
    # output file is the success criterion.
    subprocess.run(cmd, check=False, capture_output=True)
    try:
        # The 100-byte floor presumably rejects header-only files.
        return os.path.getsize(out_wav) > 100
    except OSError:
        # ffmpeg failed before creating the file: report failure instead of
        # crashing so callers can skip this range.
        return False
|
|
|
|
def transcribe(wav_path):
    """Transcribe one WAV file with the module-level Whisper model.

    Args:
        wav_path: Path of the audio file to transcribe.

    Returns:
        A ``(segments, info)`` tuple: the recognized segments collected into
        a list, and the info object returned by ``model.transcribe``.
    """
    vad_options = {"min_silence_duration_ms": 500, "speech_pad_ms": 200}
    segment_iter, info = model.transcribe(
        wav_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters=vad_options,
    )
    # Collect the segments so callers receive a plain list.
    return list(segment_iter), info
|
|
|
|
# === Method 1: CUT-based segmentation ===
print("=" * 60)
print("METHOD 1: CUT-based segmentation")
print("=" * 60)

# Pre-computed scene list: JSON with a "scenes" array whose entries carry
# start_time / end_time (compared against DURATION, i.e. seconds).
cut_path = "/Users/accusys/momentry/output_dev/417a7e93860d70c87aee6c4c1b715d70.cut.json"
cut_scenes = []
if os.path.exists(cut_path):
    with open(cut_path, encoding="utf-8") as f:
        data = json.load(f)
    # Keep scenes that begin inside the test window and clip the ones that
    # run past it, so both methods transcribe the same DURATION seconds.
    cut_scenes = [
        (s["start_time"], min(s["end_time"], DURATION))
        for s in data.get("scenes", [])
        if s["start_time"] < DURATION
    ]
else:
    # Make the "0 scenes" result below explainable instead of silent.
    print(f"  WARNING: cut file not found: {cut_path}")
print(f"  Scenes in first {DURATION}s: {len(cut_scenes)}")

# Scratch directory for the WAV clips written by this script.
tmpdir = tempfile.mkdtemp(prefix="seg_compare_")
t1 = time.time()
cut_segments = []
total_chars = 0
for idx, (st, et) in enumerate(cut_scenes):
    wav = os.path.join(tmpdir, f"cut_{idx:04d}.wav")
    if not extract_audio_segment(st, et, wav):
        continue  # no usable audio for this scene
    segs, info = transcribe(wav)
    for s in segs:
        # ASR timestamps are relative to the clip; add the scene start to
        # express them on the video's timeline.
        cut_segments.append({"start": st + s.start, "end": st + s.end, "text": s.text})
        total_chars += len(s.text)
cut_time = time.time() - t1
print(f"  Segments: {len(cut_segments)}, Total chars: {total_chars}, Time: {cut_time:.1f}s")
if cut_segments:
    # Mean length of the recognized segments themselves (not the window
    # length divided by the segment count).
    avg_duration = sum(seg["end"] - seg["start"] for seg in cut_segments) / len(cut_segments)
    print(f"  Avg segment duration: {avg_duration:.1f}s")
|
|
|
|
# === Method 2: Silence-based segmentation (ffmpeg silencedetect) ===
print()
print("=" * 60)
print("METHOD 2: Silence-based segmentation (ffmpeg silencedetect)")
print("=" * 60)

# Extract the audio of the whole test window once; silence detection runs on
# this file.
full_wav = os.path.join(tmpdir, "full_audio.wav")
if not extract_audio_segment(0, DURATION, full_wav):
    # Without this file silencedetect reports nothing, and the whole window
    # would silently be treated as one speech segment.
    sys.exit(f"ERROR: could not extract audio from {VIDEO_PATH}")

# Use ffmpeg silencedetect to find silent stretches (below -30 dB for at
# least 0.5 s). The filter reports them in ffmpeg's log, read from stderr.
t2 = time.time()
detect_cmd = ["ffmpeg", "-i", full_wav, "-af", "silencedetect=noise=-30dB:d=0.5", "-f", "null", "-"]
# errors="replace": the log may contain bytes that are not valid text in the
# locale encoding; they must not abort the run.
result = subprocess.run(detect_cmd, capture_output=True, text=True, errors="replace")
if result.returncode != 0:
    sys.exit(f"ERROR: ffmpeg silencedetect failed (exit code {result.returncode})")
stderr = result.stderr
|
|
|
|
def _parse_silence_times(log_text):
    """Return ``(starts, ends)``: silence boundaries in seconds, in log order."""
    starts = []
    ends = []
    for line in log_text.splitlines():
        if "silence_start:" in line:
            starts.append(float(line.split("silence_start:")[1].strip()))
        elif "silence_end:" in line:
            # The line continues with "| silence_duration: ..."; keep only
            # the timestamp before the bar.
            ends.append(float(line.split("silence_end:")[1].split("|")[0].strip()))
    return starts, ends


def _speech_spans(starts, ends, total, min_len=0.5):
    """Return ``(start, end)`` spans of non-silence within ``[0, total]``.

    Spans not longer than ``min_len`` seconds are dropped, including the
    final one.
    """
    spans = []
    cursor = 0.0  # end of the most recent silence
    for sil_start, sil_end in zip(starts, ends):
        if sil_start > cursor + min_len:
            spans.append((cursor, sil_start))
        cursor = sil_end
    if len(starts) > len(ends):
        # A silence that began but was never reported as ended runs to the
        # end of the audio: speech stops where that silence starts.
        tail_end = min(starts[len(ends)], total)
    else:
        tail_end = total
    if tail_end > cursor + min_len:
        spans.append((cursor, tail_end))
    return spans


# Parse silencedetect output
silence_starts, silence_ends = _parse_silence_times(stderr)

# Build speech segments: gaps between silence periods
speech_segments = _speech_spans(silence_starts, silence_ends, DURATION)

print(f"  Silence periods detected: {len(silence_starts)}")
print(f"  Speech segments: {len(speech_segments)}")
|
|
|
|
# Run ASR on every speech span and collect the results on the video timeline.
silence_segments = []
total_chars2 = 0
for clip_no, (clip_start, clip_end) in enumerate(speech_segments):
    clip_wav = os.path.join(tmpdir, f"sil_{clip_no:04d}.wav")
    extracted = extract_audio_segment(clip_start, clip_end, clip_wav)
    if extracted:
        pieces, _info = transcribe(clip_wav)
        # ASR timestamps are relative to the clip; offset them by the span
        # start.
        silence_segments.extend(
            {"start": clip_start + piece.start, "end": clip_start + piece.end, "text": piece.text}
            for piece in pieces
        )
        total_chars2 += sum(len(piece.text) for piece in pieces)
# t2 was taken before silence detection, so detection time is included.
silence_time = time.time() - t2
print(f"  Segments: {len(silence_segments)}, Total chars: {total_chars2}, Time: {silence_time:.1f}s")
|
|
|
|
# === Comparison ===
banner = "=" * 60
print()
print(banner)
print("COMPARISON")
print(banner)
print(f"{'Metric':<30} {'CUT-based':<15} {'Silence-based':<15}")
print("-" * 60)

# One row per metric: label, CUT-based value, silence-based value, and the
# format spec applied to both value columns.
report_rows = [
    ("Number of audio segments", len(cut_scenes), len(speech_segments), "<15"),
    ("Number of ASR segments", len(cut_segments), len(silence_segments), "<15"),
    ("Total chars recognized", total_chars, total_chars2, "<15"),
    ("Processing time (s)", cut_time, silence_time, "<15.1f"),
]
for label, cut_value, silence_value, spec in report_rows:
    print(f"{label:<30} {cut_value:{spec}} {silence_value:{spec}}")

# Cleanup: remove the scratch directory and everything written into it.
import shutil

shutil.rmtree(tmpdir, ignore_errors=True)
print()
print("Done.")
|