# momentry_core/scripts/music_segmentation_processor.py

#!/opt/homebrew/bin/python3.11
"""
Music Segmentation Processor
職責：利用色度特徵 (Chroma Features) 分析配樂變化，識別場景轉換點。
"""

import librosa
import numpy as np
import os
import json

# Configuration: both values may be overridden through environment variables.
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")

# The input audio and the resulting JSON share the per-UUID folder
# located under OUTPUT_DIR.
_JOB_DIR = os.path.join(OUTPUT_DIR, UUID)
AUDIO_PATH = os.path.join(_JOB_DIR, f"{UUID}.wav")
OUTPUT_JSON = os.path.join(_JOB_DIR, f"{UUID}.music_segments.json")


def analyze_music_segmentation(
    audio_path: str,
    *,
    smooth_window: int = 5,
    threshold_sigma: float = 1.5,
    min_peak_gap: int = 30,
    min_segment_dur: float = 10.0,
) -> list[dict]:
    """Split an audio file into music segments at chroma change points.

    The audio is loaded as 8 kHz mono, one chroma vector is computed per
    second, and the Euclidean distance between consecutive chroma vectors
    is used as a novelty curve.  Peaks of the smoothed curve that exceed
    ``mean + threshold_sigma * std`` become segment boundaries.

    Args:
        audio_path: Path of the audio file handed to ``librosa.load``.
        smooth_window: Width, in frames, of the moving average applied to
            the novelty curve.  Clamped to the number of available frames.
        threshold_sigma: Number of standard deviations above the mean that
            a peak must reach to count as a transition.
        min_peak_gap: Minimum spacing between detected peaks, in frames.
            The hop is one second, so this is also a number of seconds.
        min_segment_dur: Boundaries closer than this many seconds to the
            previous boundary are ignored.

    Returns:
        A list of dicts with the keys ``start_time``, ``end_time``,
        ``duration`` (plain floats rounded to 0.1 s) and ``type``.  The list
        is empty when the file contains no samples.
    """
    print(f"🎵 Loading audio for analysis: {audio_path}")
    # Down-sample to 8 kHz to speed things up; the original author considers
    # this sufficient for pitch-class analysis.
    y, sr = librosa.load(audio_path, sr=8000, mono=True)
    total_dur = len(y) / sr
    print(f"✅ Audio Loaded ({total_dur:.1f}s). Computing Chroma Features...")

    if len(y) == 0:
        # Nothing to analyse; an empty signal has no segments.
        return []

    # 1. Chroma STFT with a one-second hop so long recordings stay cheap.
    hop_length = int(1.0 * sr)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)

    print("📊 Analyzing transitions...")
    print(f"🔍 Scanning {chroma.shape[1]} frames...")

    # 2. + 3. Novelty curve and peak picking.
    peak_frames = _find_transition_frames(
        chroma, smooth_window, threshold_sigma, min_peak_gap
    )
    print(f"🎯 Found {len(peak_frames)} Music Transition Points.")

    # 4. Convert frame indices to seconds and assemble the segments.
    # NOTE(review): distance i compares frames i and i+1, so the change lies
    # between those two frames while the reported time uses index i; confirm
    # whether a half-hop offset is wanted.
    peak_times = peak_frames * (hop_length / sr)
    return _build_segments(peak_times, total_dur, min_segment_dur)


def _find_transition_frames(chroma, smooth_window, threshold_sigma, min_peak_gap):
    """Return frame indices where the smoothed chroma distance peaks."""
    from scipy.signal import find_peaks

    # Euclidean distance between each pair of consecutive chroma vectors.
    distances = np.linalg.norm(np.diff(chroma, axis=1), axis=0)
    if distances.size == 0:
        # Fewer than two frames: np.convolve would raise on an empty input.
        return np.array([], dtype=int)

    # Clamp the kernel so that mode="same" keeps one value per distance;
    # a kernel longer than the signal would make the output longer than it.
    width = max(1, min(int(smooth_window), distances.size))
    kernel = np.ones(width) / width
    smoothed = np.convolve(distances, kernel, mode="same")

    # Only changes clearly above the average level count as transitions.
    threshold = np.mean(smoothed) + threshold_sigma * np.std(smoothed)
    peaks, _ = find_peaks(
        smoothed, height=threshold, distance=max(1, int(min_peak_gap))
    )
    return peaks


def _build_segments(peak_times, total_dur, min_segment_dur):
    """Turn boundary times (seconds) into a list of segment dicts."""

    def make_segment(begin, end):
        return {
            "start_time": round(begin, 1),
            "end_time": round(end, 1),
            "duration": round(end - begin, 1),
            "type": "Music Segment",
        }

    segments = []
    start_time = 0.0
    # Convert numpy scalars to plain floats so the result holds only
    # built-in types.
    for boundary in (float(t) for t in peak_times):
        # A boundary too close to the previous one is treated as a
        # fluctuation inside the same piece of music.
        if boundary - start_time < min_segment_dur:
            continue
        segments.append(make_segment(start_time, boundary))
        start_time = boundary

    # Trailing segment up to the end of the recording.
    if start_time < total_dur:
        segments.append(make_segment(start_time, total_dur))

    return segments


def main() -> int:
    """Run the segmentation for the configured UUID and write the JSON file.

    Returns:
        The process exit status: 0 on success, 1 when the audio file
        referenced by ``AUDIO_PATH`` does not exist.
    """
    if not os.path.exists(AUDIO_PATH):
        print(f"❌ Audio not found at {AUDIO_PATH}")
        # Report the failure through the exit status so that calling
        # scripts can detect it.
        return 1

    print(f"🎼 Starting Music Segmentation Analysis for {UUID}...")
    segments = analyze_music_segmentation(AUDIO_PATH)

    # Persist the result.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"music_segments": segments}, f, indent=2, ensure_ascii=False)

    print("\n🎉 Analysis Complete!")
    print(f"✅ Identified {len(segments)} music-based scenes.")
    print(f"💾 Saved to {OUTPUT_JSON}")

    # Show the first 20 segments as mm:ss start time plus duration.
    print("\n🎶 Top Music Segments:")
    for i, seg in enumerate(segments[:20], start=1):
        m_s, s_s = divmod(seg["start_time"], 60)
        print(f"  {i:02d}. [{int(m_s):02d}:{s_s:05.2f}] - {seg['duration']}s")

    return 0


if __name__ == "__main__":
    import sys

    # sys.exit is used instead of the interactive-only exit() helper.
    sys.exit(main())