- Remove session-ses_2f27.md (161KB raw session log) - Remove 49 ROOT_* duplicate files across REFERENCE/ - Remove 14 duplicate files between REFERENCE/ root and history/ - Remove asr_legacy.rs (dead code, replaced by asr.rs) - Remove src/core/worker/ (duplicate JobWorker) - Remove src/core/layers/ (empty directory) - Remove 4 .bak files in src/ - Remove 7 dead private methods in worker/processor.rs - Remove backup directory from git tracking
137 lines
4.5 KiB
Python
137 lines
4.5 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Music Segmentation Processor
|
|
職責:利用色度特徵 (Chroma Features) 分析配樂變化,識別場景轉換點。
|
|
"""
|
|
|
|
import librosa
|
|
import numpy as np
|
|
import os
|
|
import json
|
|
|
|
# Configuration: the job id and output root come from the environment, and
# both the input audio and the result JSON live in <OUTPUT_DIR>/<UUID>/.
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
AUDIO_PATH = os.path.join(OUTPUT_DIR, UUID, UUID + ".wav")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, UUID, UUID + ".music_segments.json")
|
|
|
|
|
|
def analyze_music_segmentation(audio_path):
    """Split an audio file into music segments at strong chroma changes.

    The audio is loaded as 8 kHz mono and one chroma vector is computed per
    second. The Euclidean distance between consecutive chroma vectors is
    smoothed with a moving average of up to 5 frames. Peaks of that curve
    that reach ``mean + 1.5 * std`` and are at least 30 frames apart become
    cut candidates; a candidate closer than 10 s to the previous cut is
    dropped.

    Args:
        audio_path: Path of the audio file passed to ``librosa.load``.

    Returns:
        A list of dicts with the keys ``start_time``, ``end_time`` and
        ``duration`` (seconds, rounded to one decimal, plain Python floats)
        and ``type`` (always ``"Music Segment"``). Segments are contiguous,
        start at 0.0 and end at the audio duration. The list is empty when
        the audio has zero length.
    """
    from scipy.signal import find_peaks

    print(f"🎵 Loading audio for analysis: {audio_path}")
    # A low sample rate keeps long files fast; 8 kHz is enough for pitch classes.
    y, sr = librosa.load(audio_path, sr=8000, mono=True)
    total_dur = len(y) / sr
    print(f"✅ Audio Loaded ({total_dur:.1f}s). Computing Chroma Features...")

    # 1. Chroma features, one frame per second of audio.
    hop_length = int(1.0 * sr)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)

    print("📊 Analyzing transitions...")
    num_frames = chroma.shape[1]
    print(f"🔍 Scanning {num_frames} frames...")

    # 2. Novelty curve: L2 norm of the first-order difference between
    #    neighbouring chroma frames. A large value means the music changed.
    distances = np.linalg.norm(np.diff(chroma, axis=1), axis=0)

    if distances.size == 0:
        # Fewer than two frames: there is nothing to compare, and
        # np.convolve would raise on an empty input.
        peaks = np.array([], dtype=int)
    else:
        # Moving average to suppress noise. The kernel is capped at the
        # curve length because mode="same" returns max(len(a), len(v))
        # samples, which would otherwise outgrow the frame axis.
        kernel_size = min(5, distances.size)
        kernel = np.ones(kernel_size) / kernel_size
        smooth_distances = np.convolve(distances, kernel, mode="same")

        # 3. Change points: local maxima clearly above the average level.
        threshold = np.mean(smooth_distances) + 1.5 * np.std(smooth_distances)
        # distance=30 frames (30 s at this hop) avoids too many cuts.
        peaks, _ = find_peaks(smooth_distances, height=threshold, distance=30)

    print(f"🎯 Found {len(peaks)} Music Transition Points.")

    # 4. Convert frame indices to seconds and build the segments.
    peak_times = (peaks * (hop_length / sr)).tolist()
    return _build_music_segments(peak_times, total_dur)


def _build_music_segments(cut_times, total_dur, min_gap=10.0):
    """Turn ascending cut times into contiguous segments covering the audio."""
    segments = []
    start_time = 0.0

    def _append(begin, end):
        segments.append(
            {
                "start_time": round(begin, 1),
                "end_time": round(end, 1),
                "duration": round(end - begin, 1),
                "type": "Music Segment",
            }
        )

    for cut in cut_times:
        # A cut too close to the previous one is treated as a fluctuation
        # inside the same piece of music and ignored.
        if cut - start_time < min_gap:
            continue
        _append(start_time, cut)
        start_time = cut

    # Final segment, from the last accepted cut to the end of the audio.
    if start_time < total_dur:
        _append(start_time, total_dur)

    return segments
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: analyse AUDIO_PATH, write the segments to
    # OUTPUT_JSON and print a short summary.
    if not os.path.exists(AUDIO_PATH):
        print(f"❌ Audio not found at {AUDIO_PATH}")
        # Report the failure with a non-zero status. The bare exit() helper
        # is injected by the site module (it may be absent) and exits with 0.
        raise SystemExit(1)

    print(f"🎼 Starting Music Segmentation Analysis for {UUID}...")
    segments = analyze_music_segmentation(AUDIO_PATH)

    # Persist the result next to the audio file.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"music_segments": segments}, f, indent=2, ensure_ascii=False)

    print("\n🎉 Analysis Complete!")
    print(f"✅ Identified {len(segments)} music-based scenes.")
    print(f"💾 Saved to {OUTPUT_JSON}")

    # Show the first 20 segments as "NN. [MM:SS.ss] - <duration>s".
    print("\n🎶 Top Music Segments:")
    for i, seg in enumerate(segments[:20], start=1):
        m_s, s_s = divmod(seg["start_time"], 60)
        print(f"  {i:02d}. [{int(m_s):02d}:{s_s:05.2f}] - {seg['duration']}s")
|