Files
momentry_core/scripts/sound_event_detector.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

126 lines
4.3 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Sound Event Detector (Impulse/Gunshot)
職責:使用聲學特徵檢測高能量脈衝聲音 (如槍聲、爆炸)。
"""
import librosa
import numpy as np
import json
import os
import sys
# Settings: both values can be overridden through environment variables.
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
# The input audio and the JSON report share the per-UUID directory.
_JOB_DIR = os.path.join(OUTPUT_DIR, UUID)
AUDIO_PATH = os.path.join(_JOB_DIR, UUID + ".wav")
OUTPUT_JSON = os.path.join(_JOB_DIR, UUID + ".sound_events.json")
def detect_impulse_sounds(audio_path, threshold_multiplier=1.5):
    """
    Detect impulse sounds (e.g. gunshots, explosions) in an audio file.

    Principle: look for local peaks of the RMS energy envelope that rise
    clearly above the background level.

    Args:
        audio_path: Path of the audio file; it is loaded with
            ``librosa.load`` and resampled to 22050 Hz.
        threshold_multiplier: Factor applied to the median RMS (used as
            the background level) before an absolute floor of 0.05 is
            added to form the detection threshold.

    Returns:
        A list with one dict per detected peak, in chronological order.
        Each dict has the keys ``timestamp`` (seconds, 2 decimals),
        ``type`` ("Loud Noise", "Loud Impact" or "Explosion/Gunshot"),
        ``energy`` (RMS at the peak, 4 decimals) and ``centroid`` (mean
        spectral centroid around the peak, 2 decimals). The list is empty
        when the audio has no samples or no peak exceeds the threshold.
    """
    # Local import, as in the original: scipy is only needed by this function.
    from scipy.signal import find_peaks

    print(f"🔊 Loading audio: {audio_path}")
    # Load audio (mono, 22050 Hz)
    y, sr = librosa.load(audio_path, sr=22050)
    if len(y) == 0:
        # An empty signal has no envelope; bail out instead of feeding
        # empty arrays into the feature extraction below.
        print("⚠️ Audio contains no samples; nothing to analyze.")
        return []

    print("📊 Analyzing energy envelope...")
    # 1. RMS energy envelope: 0.05 s frames advanced in 0.02 s hops.
    frame_length = int(0.05 * sr)
    hop_length = int(0.02 * sr)
    rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]

    # 2. Detection threshold: the median RMS stands in for the background
    #    level; it is scaled and then raised by an absolute floor so that
    #    near-silent recordings do not trigger on tiny fluctuations.
    background = np.median(rms)
    threshold = background * threshold_multiplier + 0.05
    print(f" Background Level: {background:.4f}")
    print(f" Detection Threshold: {threshold:.4f}")

    # 3. Peaks above the threshold, at least 0.2 s apart. The spacing is
    #    derived from the real hop size rather than a hard-coded 0.02 s.
    min_gap_frames = max(1, int(round(0.2 * sr / hop_length)))
    peaks, _ = find_peaks(rms, height=threshold, distance=min_gap_frames)

    # 4. Describe and classify every peak.
    half_window = 1000  # samples taken on each side of the peak
    events = []
    for peak_idx in peaks:
        # Integer sample position of the peak; computing it directly avoids
        # the float round-trip (frame -> seconds -> sample) that could
        # truncate to the previous sample.
        center_sample = int(peak_idx) * hop_length
        time_sec = center_sample / sr

        segment = y[max(0, center_sample - half_window) : center_sample + half_window]
        if len(segment) == 0:
            # No samples around the peak: skip rather than report an
            # undefined centroid.
            continue

        # Spectral centroid ("brightness") of the short segment around the
        # peak; gunshots are expected to have a high centroid.
        # NOTE(review): the 2000-sample segment is shorter than librosa's
        # default n_fft of 2048 — confirm whether this emits a warning.
        centroid = librosa.feature.spectral_centroid(y=segment, sr=sr)[0]
        avg_centroid = np.mean(centroid)

        # Simplified classification based purely on how far the peak energy
        # exceeds the detection threshold.
        rms_val = rms[peak_idx]
        if rms_val > threshold * 2.0:
            event_type = "Explosion/Gunshot"  # extremely high energy
        elif rms_val > threshold * 1.2:
            event_type = "Loud Impact"
        else:
            event_type = "Loud Noise"

        events.append(
            {
                "timestamp": round(time_sec, 2),
                "type": event_type,
                "energy": round(float(rms_val), 4),
                "centroid": round(float(avg_centroid), 2),
            }
        )
    return events
def _extract_audio(video_path, audio_path):
    """Extract a mono 16 kHz audio track from video_path into audio_path.

    ffmpeg is invoked with an argument list (no shell), so paths that
    contain spaces or shell metacharacters are passed through verbatim.

    Returns:
        True when ffmpeg exits with status 0 and audio_path exists
        afterwards, False otherwise (including when ffmpeg is not found).
    """
    import subprocess

    command = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vn", "-ar", "16000", "-ac", "1",
        audio_path,
    ]
    try:
        completed = subprocess.run(command, check=False)
    except FileNotFoundError:
        # The ffmpeg executable is not available on PATH.
        return False
    return completed.returncode == 0 and os.path.exists(audio_path)


def main():
    """Run sound event detection for the configured UUID and save a JSON report.

    If the WAV file is missing, the audio is first extracted from the
    matching .mp4 (preferred) or .mov file. The process exits with status 1
    when no input exists or the extraction fails.
    """
    if not os.path.exists(AUDIO_PATH):
        # Try to extract the audio from a video file: .mp4 first, then .mov.
        candidates = [
            os.path.join(OUTPUT_DIR, UUID, f"{UUID}{ext}") for ext in (".mp4", ".mov")
        ]
        video_path = next((p for p in candidates if os.path.exists(p)), None)
        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)
        print("🎥 Extracting audio from video...")
        if not _extract_audio(video_path, AUDIO_PATH):
            # Stop here instead of failing later with an unrelated load error.
            print("❌ Audio extraction failed (is ffmpeg installed?).")
            sys.exit(1)

    print(f"🕵️‍♂️ Starting Sound Event Detection for {UUID}...")
    # Run the detection
    events = detect_impulse_sounds(AUDIO_PATH)

    # Save the results
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"sound_events": events}, f, indent=2)
    print(f"\n🎉 Found {len(events)} potential sound events.")
    print(f"💾 Results saved to {OUTPUT_JSON}")

    # Show the 10 highest-energy events
    print("\n🔥 Top 10 Loudest Events (Potential Gunshots):")
    loudest = sorted(events, key=lambda ev: ev["energy"], reverse=True)[:10]
    for rank, ev in enumerate(loudest, start=1):
        minutes, seconds = divmod(ev["timestamp"], 60)
        print(
            f" {rank}. [{int(minutes):02d}:{seconds:05.2f}] {ev['type']} (Energy: {ev['energy']:.4f})"
        )


if __name__ == "__main__":
    main()