momentry_core/scripts/sound_event_detector.py

#!/opt/homebrew/bin/python3.11
"""
Sound Event Detector (Impulse/Gunshot)
職責：使用聲學特徵檢測高能量脈衝聲音 (如槍聲、爆炸)。
"""

import librosa
import numpy as np
import json
import os
import sys

# Configuration: every path is derived from two environment variables, each
# with a fallback value, and lives under <OUTPUT_DIR>/<UUID>/.
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
# Input WAV and output JSON share the same directory and the UUID as file stem.
AUDIO_PATH, OUTPUT_JSON = (
    os.path.join(OUTPUT_DIR, UUID, UUID + suffix)
    for suffix in (".wav", ".sound_events.json")
)


def detect_impulse_sounds(audio_path, threshold_multiplier=1.5):
    """Detect loud, impulse-like sounds (e.g. gunshots, explosions) in audio.

    The file is loaded at 22050 Hz, its RMS energy envelope is computed, and
    local peaks of that envelope that rise above a threshold are reported.
    The threshold is ``median(rms) * threshold_multiplier + 0.05``; the
    constant term is an absolute floor so that very quiet recordings do not
    produce detections from low-level noise.

    Args:
        audio_path: Path of the audio file passed to ``librosa.load``.
        threshold_multiplier: Factor applied to the median RMS level when
            building the detection threshold.

    Returns:
        A list of dicts, one per detected peak in order of occurrence, with
        the keys ``"timestamp"`` (seconds, rounded to 2 decimals), ``"type"``
        (``"Loud Noise"``, ``"Loud Impact"`` or ``"Explosion/Gunshot"``),
        ``"energy"`` (peak RMS, rounded to 4 decimals) and ``"centroid"``
        (mean spectral centroid around the peak, rounded to 2 decimals).
    """
    # scipy is only needed by this function, so it is imported locally.
    from scipy.signal import find_peaks

    frame_seconds = 0.05     # RMS analysis window
    hop_seconds = 0.02       # step between consecutive RMS frames
    min_gap_seconds = 0.2    # minimum spacing between two reported peaks
    absolute_floor = 0.05    # added to the threshold regardless of background
    half_window = 1000       # samples taken on each side of a peak

    print(f"🔊 Loading audio: {audio_path}")
    y, sr = librosa.load(audio_path, sr=22050)

    print("📊 Analyzing energy envelope...")
    # 1. RMS energy envelope.
    frame_length = int(frame_seconds * sr)
    hop_length = int(hop_seconds * sr)
    rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]

    # 2. Threshold: the median RMS of the whole file stands in for the
    #    background level (a single global value, not a moving average).
    background = np.median(rms)
    threshold = background * threshold_multiplier + absolute_floor

    print(f"   Background Level: {background:.4f}")
    print(f"   Detection Threshold: {threshold:.4f}")

    # 3. Peaks above the threshold. The spacing is derived from the actual hop
    #    size so it stays correct if the hop or sample rate is ever changed.
    min_peak_gap = max(1, int(round(min_gap_seconds * sr / hop_length)))
    peaks, _ = find_peaks(rms, height=threshold, distance=min_peak_gap)

    # 4. Describe and classify each peak.
    events = []
    for peak_idx in peaks:
        # Exact integer sample position of the frame; going through a float
        # timestamp and back can truncate to the previous sample.
        center_sample = int(peak_idx) * hop_length
        time_sec = center_sample / sr

        # Short window around the peak for the spectral "brightness" measure.
        segment = y[max(0, center_sample - half_window) : center_sample + half_window]
        if len(segment) == 0:
            continue

        # NOTE(review): this window is at most 2000 samples, which appears to
        # be shorter than librosa's default FFT size — confirm whether an
        # explicit n_fft should be passed here.
        centroid = librosa.feature.spectral_centroid(y=segment, sr=sr)[0]
        avg_centroid = np.mean(centroid)

        # Classification is based purely on how far the peak exceeds the
        # threshold; the centroid is reported but not used for the decision.
        rms_val = rms[peak_idx]
        if rms_val > threshold * 2.0:
            event_type = "Explosion/Gunshot"
        elif rms_val > threshold * 1.2:
            event_type = "Loud Impact"
        else:
            event_type = "Loud Noise"

        events.append(
            {
                "timestamp": round(time_sec, 2),
                "type": event_type,
                "energy": round(float(rms_val), 4),
                "centroid": round(float(avg_centroid), 2),
            }
        )

    return events


if __name__ == "__main__":
    # Script entry point: make sure a WAV exists (extracting it from a video
    # if necessary), run the detector, save the results as JSON and print a
    # short summary of the loudest events.

    # Imported here because only the script entry point launches ffmpeg.
    import subprocess

    if not os.path.exists(AUDIO_PATH):
        # No WAV yet: look for a video with the same stem (.mp4 first, then
        # .mov) and extract its audio track.
        job_dir = os.path.join(OUTPUT_DIR, UUID)
        candidates = [os.path.join(job_dir, f"{UUID}{ext}") for ext in (".mp4", ".mov")]
        video_path = next((p for p in candidates if os.path.exists(p)), None)

        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)

        print("🎥 Extracting audio from video...")
        # Argument list instead of a shell string: paths come from environment
        # variables and may contain spaces or shell metacharacters.
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-i", video_path,
            "-vn",            # drop the video stream
            "-ar", "16000",   # resample to 16 kHz
            "-ac", "1",       # downmix to mono
            AUDIO_PATH,
        ]
        try:
            subprocess.run(ffmpeg_cmd, check=True)
        except (OSError, subprocess.CalledProcessError) as exc:
            # ffmpeg missing or exited non-zero: stop here rather than
            # failing later while trying to load a WAV that was never written.
            print(f"❌ Audio extraction failed: {exc}")
            sys.exit(1)

    print(f"🕵️‍♂️ Starting Sound Event Detection for {UUID}...")

    # Run the detection.
    events = detect_impulse_sounds(AUDIO_PATH)

    # Persist the results.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"sound_events": events}, f, indent=2)

    print(f"\n🎉 Found {len(events)} potential sound events.")
    print(f"💾 Results saved to {OUTPUT_JSON}")

    # Show the ten highest-energy events, loudest first.
    print("\n🔥 Top 10 Loudest Events (Potential Gunshots):")
    loudest = sorted(events, key=lambda x: x["energy"], reverse=True)[:10]
    for rank, ev in enumerate(loudest, start=1):
        m, s = divmod(ev["timestamp"], 60)
        print(
            f"  {rank}. [{int(m):02d}:{s:05.2f}] {ev['type']} (Energy: {ev['energy']:.4f})"
        )