Files
momentry_core/scripts/music_segmentation_processor.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

137 lines
4.5 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Music Segmentation Processor
職責:利用色度特徵 (Chroma Features) 分析配樂變化,識別場景轉換點。
"""
import librosa
import numpy as np
import os
import json
# Settings: both values can be overridden through environment variables.
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
# Every artefact of one job lives in <OUTPUT_DIR>/<UUID>/.
_JOB_DIR = os.path.join(OUTPUT_DIR, UUID)
AUDIO_PATH = os.path.join(_JOB_DIR, UUID + ".wav")
OUTPUT_JSON = os.path.join(_JOB_DIR, UUID + ".music_segments.json")
def _find_transition_frames(chroma):
    """Return the frame indices at which the chroma content changes sharply.

    The novelty curve is the Euclidean distance between consecutive chroma
    vectors, smoothed with a 5-frame moving average. A frame is reported when
    it is a local maximum above ``mean + 1.5 * std`` of that curve and at
    least 30 frames away from any other reported frame.
    """
    from scipy.signal import find_peaks

    # First-order difference along the time axis, then its L2 norm per frame.
    distances = np.linalg.norm(np.diff(chroma, axis=1), axis=0)
    if distances.size == 0:
        # Fewer than two frames: there is no transition to find, and
        # np.convolve raises ValueError when given an empty array.
        return np.array([], dtype=int)

    # Moving average to suppress frame-to-frame noise.
    kernel_size = 5
    kernel = np.ones(kernel_size) / kernel_size
    smooth_distances = np.convolve(distances, kernel, mode="same")

    # Only changes clearly above the track-wide average count as cuts.
    threshold = np.mean(smooth_distances) + 1.5 * np.std(smooth_distances)
    peaks, _ = find_peaks(smooth_distances, height=threshold, distance=30)
    return peaks


def _build_music_segments(peak_times, total_dur):
    """Turn boundary times (seconds) into a list of segment dicts."""

    def make_segment(begin, end):
        return {
            "start_time": round(begin, 1),
            "end_time": round(end, 1),
            "duration": round(end - begin, 1),
            "type": "Music Segment",
        }

    segments = []
    start_time = 0.0
    for p_time in peak_times:
        # A boundary less than 10 s after the previous one is treated as a
        # fluctuation inside the same passage and ignored.
        if p_time - start_time < 10.0:
            continue
        segments.append(make_segment(start_time, p_time))
        start_time = p_time

    # Whatever remains after the last accepted boundary is the final segment.
    if start_time < total_dur:
        segments.append(make_segment(start_time, total_dur))
    return segments


def analyze_music_segmentation(audio_path):
    """Split an audio track into music segments at chroma change points.

    The audio is loaded as 8 kHz mono and one chroma vector is computed per
    1-second hop. Sharp changes between consecutive chroma vectors become
    segment boundaries (see ``_find_transition_frames``); boundaries closer
    than 10 seconds to the previous one are dropped.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.

    Returns:
        A list of dicts with the keys ``start_time``, ``end_time`` and
        ``duration`` (seconds, rounded to one decimal) and ``type`` (always
        ``"Music Segment"``). Segments start at 0.0, follow one another
        without gaps and end at the track duration. A track too short to
        contain a transition yields a single segment; an empty signal yields
        an empty list.
    """
    print(f"🎵 Loading audio for analysis: {audio_path}")
    # A low sample rate is used to speed up processing.
    y, sr = librosa.load(audio_path, sr=8000, mono=True)
    total_dur = len(y) / sr
    print(f"✅ Audio Loaded ({total_dur:.1f}s). Computing Chroma Features...")

    # One chroma frame per second keeps long recordings cheap to analyse.
    hop_length = int(1.0 * sr)

    if len(y) > 0:
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
        print("📊 Analyzing transitions...")
        print(f"🔍 Scanning {chroma.shape[1]} frames...")
        peaks = _find_transition_frames(chroma)
    else:
        # An empty signal has nothing to analyse; skip feature extraction.
        peaks = np.array([], dtype=int)

    print(f"🎯 Found {len(peaks)} Music Transition Points.")

    # Convert frame indices to seconds as plain Python floats.
    peak_times = (peaks * (hop_length / sr)).tolist()
    return _build_music_segments(peak_times, total_dur)
if __name__ == "__main__":
    # Script entry point: analyse the configured audio file and write the
    # resulting segments next to it as JSON.
    if not os.path.exists(AUDIO_PATH):
        print(f"❌ Audio not found at {AUDIO_PATH}")
        # Exit with a non-zero status so a calling process can detect that no
        # output was produced. The site-provided exit() helper is intended
        # for interactive sessions and, called without an argument, reported
        # success.
        raise SystemExit(1)

    print(f"🎼 Starting Music Segmentation Analysis for {UUID}...")
    segments = analyze_music_segmentation(AUDIO_PATH)

    # Save the result.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"music_segments": segments}, f, indent=2, ensure_ascii=False)

    print("\n🎉 Analysis Complete!")
    print(f"✅ Identified {len(segments)} music-based scenes.")
    print(f"💾 Saved to {OUTPUT_JSON}")

    # Show the first 20 segments as "NN. [MM:SS.ss] - <duration>s".
    print("\n🎶 Top Music Segments:")
    for rank, seg in enumerate(segments[:20], start=1):
        m_s, s_s = divmod(seg["start_time"], 60)
        print(f" {rank:02d}. [{int(m_s):02d}:{s_s:05.2f}] - {seg['duration']}s")