feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
This commit is contained in:
137
scripts/audio_taxonomy_processor.py
Normal file
137
scripts/audio_taxonomy_processor.py
Normal file
@@ -0,0 +1,137 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Audio Taxonomy Processor (Hugging Face Transformers)
|
||||
職責:使用 AST 模型進行高精度音頻分類,並映射到業務分類。
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import librosa
|
||||
|
||||
# Dependency check: abort early with an install hint when the Hugging Face
# `transformers` package cannot be imported.
try:
    from transformers import pipeline
except ImportError:
    print("❌ transformers not found. Run: pip install transformers")
    sys.exit(1)
else:
    # Only reached when the import above succeeded.
    HAS_HF = True
|
||||
|
||||
# Settings. UUID and MOMENTRY_OUTPUT_DIR may be overridden through the
# environment; both file paths live under <OUTPUT_DIR>/<UUID>/.
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
AUDIO_PATH, OUTPUT_JSON = (
    os.path.join(OUTPUT_DIR, UUID, UUID + suffix)
    for suffix in (".wav", ".audio_taxonomy.json")
)
|
||||
|
||||
# 1. Label mapping table (AudioSet label -> business category).
# Labels are listed per category and then inverted into the flat
# label -> category lookup used by the rest of the script.
_LABELS_BY_CATEGORY = {
    "Human/Speech": (
        "Speech",
        "Male speech, man speaking",
        "Female speech, woman speaking",
        "Conversation",
    ),
    "Human/Vocals": (
        "Laughter",
        "Singing",
        "Choir",
        "Cough",
        "Applause",
    ),
    "Nature/Weather": (
        "Rain",
        "Raindrop",
        "Thunder",
        "Wind",
    ),
    "Nature/Water": (
        "Ocean",
        "Stream",
    ),
    "Nature/Flora_Fauna": (
        "Bird",
        "Dog",
        "Cat",
    ),
    "Artificial/Impact_Weapon": (
        "Gunshot, gunfire",
        "Explosion",
        "Glass shatter",
    ),
    "Artificial/Transport": (
        "Car",
        "Engine",
        "Siren",
    ),
    "Artificial/Music": (
        "Piano",
        "Guitar",
        "Drum",
        "Music",
    ),
    "Artificial/Household": (
        "Keyboard",
        "Telephone",
        "Door",
    ),
}

TAXONOMY_MAP = {
    label: category
    for category, labels in _LABELS_BY_CATEGORY.items()
    for label in labels
}
|
||||
|
||||
|
||||
def map_to_taxonomy(predictions, min_score=0.3):
    """Map classifier predictions onto business categories.

    Args:
        predictions: Iterable of dicts, each with a "label" key and a
            "score" key, as passed in by the caller.
        min_score: Predictions whose score is not strictly greater than
            this value are discarded as low-confidence.

    Returns:
        Dict mapping each matched business category to the highest score
        (rounded to 4 decimals) among the predictions that map to it.
        Labels missing from TAXONOMY_MAP are ignored.
    """
    events = {}
    for pred in predictions:
        category = TAXONOMY_MAP.get(pred["label"])
        if not category:
            continue

        score = float(pred["score"])
        if score <= min_score:
            continue

        # Several labels share one category (e.g. "Speech" and
        # "Male speech, man speaking"); keep the strongest score instead of
        # letting a later, weaker prediction overwrite it.
        rounded = round(score, 4)
        if category not in events or rounded > events[category]:
            events[category] = rounded
    return events
|
||||
|
||||
|
||||
def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
    """Classify an audio file window by window and map results to categories.

    The audio is loaded as 16 kHz mono, cut into windows of ``chunk_sec``
    seconds that start every ``hop_sec`` seconds, and each window is run
    through the AST audio-classification pipeline (top 5 labels). Only
    full-length windows are analysed, so a trailing remainder shorter than
    ``chunk_sec`` is not classified.

    Args:
        audio_path: Path of the audio file to load with librosa.
        chunk_sec: Window length in seconds; must be positive.
        hop_sec: Step between window starts in seconds; must be positive.

    Returns:
        List of ``{"timestamp": <window start, 1 decimal>, "categories":
        {...}}`` dicts, one per window that produced at least one mapped
        category.

    Raises:
        ValueError: If ``chunk_sec`` or ``hop_sec`` is not positive (a
            non-positive hop would otherwise never terminate).
    """
    if chunk_sec <= 0 or hop_sec <= 0:
        raise ValueError("chunk_sec and hop_sec must be positive")

    print("🔍 Loading AST model (MIT) from Hugging Face...")
    # Audio Spectrogram Transformer checkpoint fine-tuned on AudioSet.
    # device=-1 is the transformers pipeline convention for running on CPU.
    classifier = pipeline(
        "audio-classification",
        model="MIT/ast-finetuned-audioset-10-10-0.4593",
        device=-1,
    )

    print(f"📊 Analyzing audio in {chunk_sec}s chunks (hop: {hop_sec}s)...")
    y, sr = librosa.load(audio_path, sr=16000, mono=True)
    total_dur = len(y) / sr

    results = []
    failed_chunks = 0
    last_progress_mark = 0
    window_index = 0

    print(f"⏱️ Total duration: {total_dur:.1f}s")
    while True:
        # Derive the start time from the window index instead of repeatedly
        # adding hop_sec, so float rounding error cannot accumulate.
        current = window_index * hop_sec
        if current + chunk_sec > total_dur:
            break

        start_sample = int(current * sr)
        end_sample = int((current + chunk_sec) * sr)
        clip = y[start_sample:end_sample]

        try:
            # Top-5 inference for this window.
            # NOTE(review): confirm the pipeline honours `sampling_rate` for
            # raw arrays; the audio is already loaded at the rate passed here.
            preds = classifier(clip, sampling_rate=sr, top_k=5)
            taxonomy = map_to_taxonomy(preds)
        except Exception as exc:
            # Best effort: one bad window must not abort the whole run, but
            # the failure is reported instead of being silently dropped.
            failed_chunks += 1
            print(f"⚠️ Skipping chunk at {current:.1f}s: {exc}", file=sys.stderr)
        else:
            if taxonomy:
                results.append({"timestamp": round(current, 1), "categories": taxonomy})

        window_index += 1

        # Report progress once each time another 30 s boundary is crossed.
        processed = window_index * hop_sec
        progress_mark = int(processed) // 30
        if progress_mark > last_progress_mark:
            last_progress_mark = progress_mark
            print(f" 🕒 Processed: {int(processed)}s / {int(total_dur)}s")

    if failed_chunks:
        print(f"⚠️ {failed_chunks} chunk(s) failed and were skipped.", file=sys.stderr)

    return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Local import: only the script entry point needs to spawn ffmpeg.
    import subprocess

    if not os.path.exists(AUDIO_PATH):
        # No extracted WAV yet: look for a source video next to it,
        # preferring .mp4 over .mov.
        media_dir = os.path.join(OUTPUT_DIR, UUID)
        candidates = [
            os.path.join(media_dir, f"{UUID}{ext}") for ext in (".mp4", ".mov")
        ]
        video_path = next((p for p in candidates if os.path.exists(p)), None)

        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)

        print("🎥 Extracting audio from video...")
        # Pass an argument list (no shell) so paths containing spaces or
        # shell metacharacters are handled literally, and fail fast if
        # ffmpeg is missing or exits with an error.
        ffmpeg_cmd = [
            "ffmpeg", "-y",
            "-i", video_path,
            "-vn",
            "-ar", "16000",
            "-ac", "1",
            AUDIO_PATH,
        ]
        try:
            subprocess.run(ffmpeg_cmd, check=True)
        except (OSError, subprocess.CalledProcessError) as exc:
            print(f"❌ ffmpeg failed to extract audio: {exc}")
            sys.exit(1)

    print(f"🕵️♂️ Starting Audio Taxonomy Classification for {UUID}...")
    events = run_audio_taxonomy(AUDIO_PATH)

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)

    print("\n🎉 Classification Complete!")
    print(f"✅ Found {len(events)} tagged audio segments.")
    print(f"💾 Saved to {OUTPUT_JSON}")
|
||||
Reference in New Issue
Block a user