#!/opt/homebrew/bin/python3.11
"""
Music Segmentation Processor

Responsibility: use chroma features to analyse changes in the soundtrack and
identify scene transition points.
"""

import json
import os
import sys

import numpy as np
from scipy.signal import find_peaks

# Configuration (each value can be overridden through the environment).
UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
AUDIO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.wav")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.music_segments.json")


def find_transition_frames(
    chroma, smooth_frames=5, threshold_sigma=1.5, min_peak_gap=30
):
    """Return the frame indices at which the chroma content changes sharply.

    The novelty curve is the L2 norm of the first difference between
    consecutive chroma frames, smoothed with a moving average. A frame is
    reported when the smoothed curve has a local maximum that is at least
    ``mean + threshold_sigma * std`` high.

    Args:
        chroma: 2-D array-like of shape (n_features, n_frames).
        smooth_frames: Length of the moving-average window, in frames.
        threshold_sigma: Number of standard deviations above the mean a peak
            must reach to count as a transition.
        min_peak_gap: Minimum distance between reported peaks, in frames.

    Returns:
        1-D integer ``numpy.ndarray`` of peak indices (possibly empty).
    """
    chroma = np.asarray(chroma)
    # With fewer than two frames there is nothing to difference, and
    # np.convolve would raise on the resulting empty array.
    if chroma.shape[1] < 2:
        return np.array([], dtype=int)

    # distances[i] compares frame i with frame i + 1.
    distances = np.linalg.norm(np.diff(chroma, axis=1), axis=0)

    # mode="same" yields max(len(signal), len(kernel)) samples, so the window
    # is clamped to the signal length to keep the output aligned with
    # `distances` on very short inputs.
    window = max(1, min(int(smooth_frames), distances.size))
    kernel = np.ones(window) / window
    smooth_distances = np.convolve(distances, kernel, mode="same")

    # Only changes that stand out clearly from the average count as cuts.
    threshold = np.mean(smooth_distances) + threshold_sigma * np.std(
        smooth_distances
    )
    peaks, _ = find_peaks(
        smooth_distances, height=threshold, distance=min_peak_gap
    )
    return peaks


def build_segments(peak_times, total_dur, min_duration=10.0):
    """Turn transition times into a list of contiguous segment records.

    Args:
        peak_times: Iterable of transition times in seconds, ascending.
        total_dur: Total duration of the audio in seconds.
        min_duration: A transition closer than this to the start of the
            current segment is ignored and treated as a fluctuation within
            the same piece of music.

    Returns:
        List of dicts with ``start_time``, ``end_time``, ``duration`` (all
        rounded to 0.1 s) and ``type`` keys. A trailing segment up to
        ``total_dur`` is added when the last cut lies before the end.
    """
    segments = []
    start_time = 0.0
    for raw_time in peak_times:
        # Plain floats keep the records free of numpy scalar types.
        p_time = float(raw_time)
        if p_time - start_time < min_duration:
            continue
        segments.append(
            {
                "start_time": round(start_time, 1),
                "end_time": round(p_time, 1),
                "duration": round(p_time - start_time, 1),
                "type": "Music Segment",
            }
        )
        start_time = p_time

    # Final segment: whatever remains after the last accepted cut.
    total_dur = float(total_dur)
    if start_time < total_dur:
        segments.append(
            {
                "start_time": round(start_time, 1),
                "end_time": round(total_dur, 1),
                "duration": round(total_dur - start_time, 1),
                "type": "Music Segment",
            }
        )
    return segments


def analyze_music_segmentation(audio_path):
    """Split an audio file into segments at detected music transitions.

    The audio is loaded as 8 kHz mono, chroma features are computed with a
    hop of one second, and transitions are located with
    ``find_transition_frames`` before being converted to segment records by
    ``build_segments``.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        List of segment dicts (see ``build_segments``); empty when the audio
        contains no samples.
    """
    # Imported here so the module and its pure helpers can be imported
    # without librosa being installed.
    import librosa

    print(f"🎵 Loading audio for analysis: {audio_path}")
    # A reduced sample rate speeds up processing; the original author
    # considered 8 kHz sufficient for pitch analysis.
    y, sr = librosa.load(audio_path, sr=8000, mono=True)
    total_dur = len(y) / sr
    print(f"✅ Audio Loaded ({total_dur:.1f}s). Computing Chroma Features...")
    if total_dur <= 0:
        # Nothing to segment.
        return []

    # 1. Chroma features, one frame per second so long files stay cheap.
    hop_length = int(1.0 * sr)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)

    # 2-3. Novelty curve and peak picking.
    print("📊 Analyzing transitions...")
    print(f"🔍 Scanning {chroma.shape[1]} frames...")
    peaks = find_transition_frames(chroma)
    print(f"🎯 Found {len(peaks)} Music Transition Points.")

    # 4. Convert frame indices to seconds and build the segments.
    peak_times = peaks * (hop_length / sr)
    return build_segments(peak_times, total_dur)


def main():
    """Analyse ``AUDIO_PATH``, write ``OUTPUT_JSON`` and print a summary."""
    if not os.path.exists(AUDIO_PATH):
        print(f"❌ Audio not found at {AUDIO_PATH}")
        # Non-zero status so callers can tell the run failed.
        sys.exit(1)

    print(f"🎼 Starting Music Segmentation Analysis for {UUID}...")
    segments = analyze_music_segmentation(AUDIO_PATH)

    # Save the result.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"music_segments": segments}, f, indent=2, ensure_ascii=False)

    print("\n🎉 Analysis Complete!")
    print(f"✅ Identified {len(segments)} music-based scenes.")
    print(f"💾 Saved to {OUTPUT_JSON}")

    # Show the first results.
    print("\n🎶 Top Music Segments:")
    for i, seg in enumerate(segments[:20]):
        m_s, s_s = divmod(seg["start_time"], 60)
        print(f"  {i + 1:02d}. [{int(m_s):02d}:{s_s:05.2f}] - {seg['duration']}s")


if __name__ == "__main__":
    main()