Files
momentry_core/scripts/audio_taxonomy_processor.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

137 lines
4.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/opt/homebrew/bin/python3.11
"""
Audio Taxonomy Processor (Hugging Face Transformers)
Responsibility: run high-accuracy audio classification with the AST model and map the results to business categories.
"""
import json
import os
import subprocess
import sys

import librosa
# Dependency check: transformers is required; exit with install instructions if it is missing.
try:
from transformers import pipeline
HAS_HF = True
except ImportError:
print("❌ transformers not found. Run: pip install transformers")
sys.exit(1)
# Settings: the recording id and the output root are read from the environment,
# and both the input WAV and the result JSON live under <OUTPUT_DIR>/<UUID>/.
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
AUDIO_PATH = os.path.join(OUTPUT_DIR, UUID, UUID + ".wav")
OUTPUT_JSON = os.path.join(OUTPUT_DIR, UUID, UUID + ".audio_taxonomy.json")
# 1. Label mapping table (AudioSet label -> business category).
# Written as (category, labels) groups and flattened into a flat
# label -> category dict; group order fixes the dict's insertion order.
# NOTE(review): lookups are exact string matches on the classifier's label
# text, so every key must equal a label the model really emits. Confirm that
# "Glass shatter" and "Keyboard" exist verbatim in the model's label set.
TAXONOMY_MAP = {
    label: category
    for category, labels in (
        (
            "Human/Speech",
            (
                "Speech",
                "Male speech, man speaking",
                "Female speech, woman speaking",
                "Conversation",
            ),
        ),
        ("Human/Vocals", ("Laughter", "Singing", "Choir", "Cough", "Applause")),
        ("Nature/Weather", ("Rain", "Raindrop", "Thunder", "Wind")),
        ("Nature/Water", ("Ocean", "Stream")),
        ("Nature/Flora_Fauna", ("Bird", "Dog", "Cat")),
        (
            "Artificial/Impact_Weapon",
            ("Gunshot, gunfire", "Explosion", "Glass shatter"),
        ),
        ("Artificial/Transport", ("Car", "Engine", "Siren")),
        ("Artificial/Music", ("Piano", "Guitar", "Drum", "Music")),
        ("Artificial/Household", ("Keyboard", "Telephone", "Door")),
    )
    for label in labels
}
def map_to_taxonomy(predictions, *, taxonomy_map=None, min_score=0.3):
    """Map classifier predictions onto business categories.

    Args:
        predictions: Iterable of dicts, each with a ``"label"`` string and a
            numeric ``"score"``.
        taxonomy_map: Optional label -> category mapping. Defaults to the
            module-level ``TAXONOMY_MAP``.
        min_score: Confidence threshold; only scores strictly greater than
            this value are kept.

    Returns:
        Dict of category -> score rounded to 4 decimals. When several labels
        map to the same category, the highest score wins. Labels without a
        mapping and low-confidence predictions are dropped.
    """
    mapping = TAXONOMY_MAP if taxonomy_map is None else taxonomy_map

    best = {}
    for pred in predictions:
        category = mapping.get(pred["label"])
        score = float(pred["score"])
        # Filter unmapped labels and low-confidence hits.
        if not category or not score > min_score:
            continue
        # Several labels share a category (e.g. the speech variants); keep the
        # strongest evidence instead of letting a later, weaker hit overwrite it.
        if score > best.get(category, float("-inf")):
            best[category] = score

    return {category: round(score, 4) for category, score in best.items()}
def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
    """Classify an audio file in sliding windows and map hits to categories.

    Args:
        audio_path: Path of the audio file; it is loaded as 16 kHz mono.
        chunk_sec: Window length in seconds. Must be positive.
        hop_sec: Distance between consecutive window starts in seconds.
            Must be positive.

    Returns:
        A list of ``{"timestamp": <window start, seconds>, "categories": {...}}``
        dicts, one for each window where ``map_to_taxonomy`` returned a
        non-empty result. Only windows that fit entirely inside the audio are
        analysed, so a trailing partial window is skipped.

    Raises:
        ValueError: If ``chunk_sec`` or ``hop_sec`` is not positive (a
            non-positive hop would otherwise never advance).
    """
    if chunk_sec <= 0 or hop_sec <= 0:
        raise ValueError(
            f"chunk_sec and hop_sec must be positive, got "
            f"chunk_sec={chunk_sec!r}, hop_sec={hop_sec!r}"
        )

    print("🔍 Loading AST model (MIT) from Hugging Face...")
    # Audio Spectrogram Transformer: high accuracy, supports MPS/CPU.
    # device=-1 is the pipeline's CPU setting.
    classifier = pipeline(
        "audio-classification",
        model="MIT/ast-finetuned-audioset-10-10-0.4593",
        device=-1,
    )
    print(f"📊 Analyzing audio in {chunk_sec}s chunks (hop: {hop_sec}s)...")
    y, sr = librosa.load(audio_path, sr=16000, mono=True)
    total_dur = len(y) / sr
    print(f"⏱️ Total duration: {total_dur:.1f}s")

    # Step in whole samples so window positions never drift the way a
    # repeatedly incremented float would.
    chunk_len = max(1, int(chunk_sec * sr))
    hop_len = max(1, int(hop_sec * sr))

    results = []
    failed_chunks = 0
    last_progress_bucket = 0
    for start_sample in range(0, len(y) - chunk_len + 1, hop_len):
        current = start_sample / sr
        clip = y[start_sample:start_sample + chunk_len]
        try:
            # Infer the top 5 labels for this window.
            # NOTE(review): confirm the pipeline honours the `sampling_rate`
            # keyword for raw arrays; the documented alternative is a dict
            # input carrying the samples and their rate.
            preds = classifier(clip, sampling_rate=sr, top_k=5)
            taxonomy = map_to_taxonomy(preds)
        except Exception as exc:
            # Best effort: skip the broken window, but do not hide the failure.
            failed_chunks += 1
            if failed_chunks == 1:
                print(
                    f"⚠️ Skipping chunk at {current:.1f}s: {exc!r}",
                    file=sys.stderr,
                )
        else:
            if taxonomy:
                # 3 decimals keep timestamps exact for sub-0.1s hops while
                # producing the same values as before for the default hop.
                results.append(
                    {"timestamp": round(current, 3), "categories": taxonomy}
                )

        # Report once per 30 seconds of audio covered.
        covered = (start_sample + hop_len) / sr
        bucket = int(covered // 30)
        if bucket > last_progress_bucket:
            last_progress_bucket = bucket
            print(f" 🕒 Processed: {int(covered)}s / {int(total_dur)}s")

    if failed_chunks:
        print(
            f"⚠️ {failed_chunks} chunk(s) failed and were skipped.",
            file=sys.stderr,
        )
    return results
if __name__ == "__main__":
    # Script entry point: make sure a WAV exists for UUID (extracting it from a
    # sibling .mp4 or .mov with ffmpeg when it does not), run the classifier,
    # and write the tagged segments to OUTPUT_JSON.
    if not os.path.exists(AUDIO_PATH):
        candidates = (
            os.path.join(OUTPUT_DIR, UUID, f"{UUID}{ext}") for ext in (".mp4", ".mov")
        )
        video_path = next((path for path in candidates if os.path.exists(path)), None)
        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)

        print("🎥 Extracting audio from video...")
        # Pass an argument list rather than a shell string: UUID and OUTPUT_DIR
        # come from the environment, so spaces or shell metacharacters in them
        # must not be interpreted by a shell.
        ffmpeg_cmd = [
            "ffmpeg", "-y", "-i", video_path,
            "-vn", "-ar", "16000", "-ac", "1", AUDIO_PATH,
        ]
        try:
            exit_code = subprocess.run(ffmpeg_cmd, check=False).returncode
        except OSError as exc:
            print(f"❌ Could not run ffmpeg: {exc}")
            sys.exit(1)
        # Stop here on failure instead of letting the loader fail later on a
        # missing or truncated WAV.
        if exit_code != 0 or not os.path.exists(AUDIO_PATH):
            print(f"❌ ffmpeg failed (exit code {exit_code}).")
            sys.exit(1)

    print(f"🕵️‍♂️ Starting Audio Taxonomy Classification for {UUID}...")
    events = run_audio_taxonomy(AUDIO_PATH)
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)
    print("\n🎉 Classification Complete!")
    print(f"✅ Found {len(events)} tagged audio segments.")
    print(f"💾 Saved to {OUTPUT_JSON}")