- Remove session-ses_2f27.md (161KB raw session log) - Remove 49 ROOT_* duplicate files across REFERENCE/ - Remove 14 duplicate files between REFERENCE/ root and history/ - Remove asr_legacy.rs (dead code, replaced by asr.rs) - Remove src/core/worker/ (duplicate JobWorker) - Remove src/core/layers/ (empty directory) - Remove 4 .bak files in src/ - Remove 7 dead private methods in worker/processor.rs - Remove backup directory from git tracking
137 lines
4.4 KiB
Python
137 lines
4.4 KiB
Python
#!/opt/homebrew/bin/python3.11
|
||
"""
|
||
Audio Taxonomy Processor (Hugging Face Transformers)
|
||
職責:使用 AST 模型進行高精度音頻分類,並映射到業務分類。
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import sys
|
||
import librosa
|
||
|
||
# Dependency check: the Hugging Face pipeline factory is mandatory, so the
# script stops right away with an install hint when it cannot be imported.
try:
    from transformers import pipeline
except ImportError:
    print("❌ transformers not found. Run: pip install transformers")
    sys.exit(1)
else:
    HAS_HF = True
|
||
|
||
# Settings: the asset id and the output root can both be overridden through
# environment variables; the input WAV and the result JSON live side by side
# in the per-asset directory.
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")

_ASSET_DIR = os.path.join(OUTPUT_DIR, UUID)
AUDIO_PATH = os.path.join(_ASSET_DIR, UUID + ".wav")
OUTPUT_JSON = os.path.join(_ASSET_DIR, UUID + ".audio_taxonomy.json")
|
||
|
||
# 1. Label mapping (AudioSet label -> business category).
# The labels are grouped per business category and flattened into the lookup
# dict below; group order and label order fix the dict's insertion order.
# NOTE(review): "Glass shatter" and "Keyboard" may not be exact AudioSet label
# strings (the ontology appears to use "Glass" / "Shatter" and
# "Computer keyboard" / "Keyboard (musical)") - confirm against the model's
# id2label table.
_LABELS_BY_CATEGORY = (
    (
        "Human/Speech",
        (
            "Speech",
            "Male speech, man speaking",
            "Female speech, woman speaking",
            "Conversation",
        ),
    ),
    ("Human/Vocals", ("Laughter", "Singing", "Choir", "Cough", "Applause")),
    ("Nature/Weather", ("Rain", "Raindrop", "Thunder", "Wind")),
    ("Nature/Water", ("Ocean", "Stream")),
    ("Nature/Flora_Fauna", ("Bird", "Dog", "Cat")),
    (
        "Artificial/Impact_Weapon",
        ("Gunshot, gunfire", "Explosion", "Glass shatter"),
    ),
    ("Artificial/Transport", ("Car", "Engine", "Siren")),
    ("Artificial/Music", ("Piano", "Guitar", "Drum", "Music")),
    ("Artificial/Household", ("Keyboard", "Telephone", "Door")),
)

TAXONOMY_MAP = {
    label: category
    for category, labels in _LABELS_BY_CATEGORY
    for label in labels
}
|
||
|
||
|
||
def map_to_taxonomy(predictions, taxonomy_map=None, min_score=0.3):
    """Map classifier predictions onto business categories.

    Several source labels can share one business category. The strongest
    qualifying score is kept for each category, so a weaker sibling label
    later in the list can no longer overwrite a stronger one.

    Args:
        predictions: Iterable of dicts, each carrying a "label" string and a
            numeric "score". ``None`` is treated as an empty list.
        taxonomy_map: Optional label -> category mapping. Defaults to the
            module-level ``TAXONOMY_MAP``.
        min_score: Exclusive confidence threshold; predictions scoring at or
            below it are dropped.

    Returns:
        Dict of category -> best score, rounded to 4 decimal places. Labels
        missing from the mapping are ignored.
    """
    if taxonomy_map is None:
        taxonomy_map = TAXONOMY_MAP

    events = {}
    for pred in predictions or ():
        category = taxonomy_map.get(pred["label"])
        score = pred["score"]
        # Filter out unmapped labels and low-confidence predictions.
        if category and score > min_score:
            rounded = round(float(score), 4)
            if category not in events or rounded > events[category]:
                events[category] = rounded
    return events
|
||
|
||
|
||
def _window_starts(total_dur, chunk_sec, hop_sec):
    """Yield the start offset, in seconds, of every full analysis window."""
    index = 0
    while True:
        # Multiply instead of accumulating so float error does not build up.
        start = index * hop_sec
        if start + chunk_sec > total_dur:
            return
        yield start
        index += 1


def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
    """Classify an audio file window by window and tag each window.

    The file is loaded as 16 kHz mono and cut into windows of ``chunk_sec``
    seconds, advancing by ``hop_sec`` seconds. Each window goes through the
    AST audio-classification pipeline (top 5 labels), and the labels are
    mapped to business categories with ``map_to_taxonomy``. Trailing audio
    shorter than one full window is not analysed.

    Args:
        audio_path: Path of the audio file to analyse.
        chunk_sec: Window length in seconds; must be positive.
        hop_sec: Step between window starts in seconds; must be positive.

    Returns:
        List of ``{"timestamp": <window start in seconds>, "categories":
        {category: score}}`` dicts, one per window with at least one mapped
        category, in chronological order.

    Raises:
        ValueError: If ``chunk_sec`` or ``hop_sec`` is not positive (a
            non-positive hop would otherwise never terminate).
    """
    if chunk_sec <= 0 or hop_sec <= 0:
        raise ValueError("chunk_sec and hop_sec must be positive")

    print("🔍 Loading AST model (MIT) from Hugging Face...")
    # Audio Spectrogram Transformer; device=-1 keeps inference on the CPU.
    classifier = pipeline(
        "audio-classification",
        model="MIT/ast-finetuned-audioset-10-10-0.4593",
        device=-1,
    )

    print(f"📊 Analyzing audio in {chunk_sec}s chunks (hop: {hop_sec}s)...")
    y, sr = librosa.load(audio_path, sr=16000, mono=True)
    total_dur = len(y) / sr

    results = []
    failed = 0
    next_report = 30  # next progress milestone, in seconds of audio

    print(f"⏱️ Total duration: {total_dur:.1f}s")
    for start in _window_starts(total_dur, chunk_sec, hop_sec):
        clip = y[int(start * sr):int((start + chunk_sec) * sr)]

        try:
            # Infer the top 5 labels for this window.
            preds = classifier(clip, sampling_rate=sr, top_k=5)
            taxonomy = map_to_taxonomy(preds)
        except Exception as exc:
            # Best effort: a bad window is skipped, but no longer silently.
            failed += 1
            if failed == 1:
                print(
                    f"⚠️ Skipping chunk at {start:.1f}s: {exc!r}",
                    file=sys.stderr,
                )
        else:
            if taxonomy:
                # 3 decimals keeps sub-0.1s hops distinct; the default
                # 0.5s hop yields the same values as before.
                results.append(
                    {"timestamp": round(start, 3), "categories": taxonomy}
                )

        processed = start + hop_sec
        if processed >= next_report:
            # Report once per 30s milestone instead of once per hop.
            print(f" 🕒 Processed: {int(processed)}s / {int(total_dur)}s")
            next_report = (int(processed) // 30 + 1) * 30

    if failed:
        print(f"⚠️ {failed} chunk(s) failed and were skipped.", file=sys.stderr)

    return results
|
||
|
||
|
||
if __name__ == "__main__":
    # Script entry point: make sure a WAV exists (extracting it from a
    # sibling video when needed), classify it, and write the JSON result.
    import subprocess

    if not os.path.exists(AUDIO_PATH):
        # No pre-extracted WAV: fall back to a source video, .mp4 first.
        video_path = None
        for ext in (".mp4", ".mov"):
            candidate = os.path.join(OUTPUT_DIR, UUID, f"{UUID}{ext}")
            if os.path.exists(candidate):
                video_path = candidate
                break

        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)

        print("🎥 Extracting audio from video...")
        # Argument list (no shell): paths with spaces or shell
        # metacharacters coming from the environment are passed verbatim.
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-i", video_path,
            "-vn",
            "-ar", "16000",
            "-ac", "1",
            AUDIO_PATH,
        ]
        try:
            subprocess.run(ffmpeg_cmd, check=True)
        except (OSError, subprocess.CalledProcessError) as exc:
            # ffmpeg missing or exited non-zero: stop before loading a
            # WAV that was never (or only partially) written.
            print(f"❌ ffmpeg failed to extract audio: {exc}", file=sys.stderr)
            sys.exit(1)

    print(f"🕵️♂️ Starting Audio Taxonomy Classification for {UUID}...")
    events = run_audio_taxonomy(AUDIO_PATH)

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)

    print("\n🎉 Classification Complete!")
    print(f"✅ Found {len(events)} tagged audio segments.")
    print(f"💾 Saved to {OUTPUT_JSON}")
|