feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
This commit is contained in:
Warren
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions

View File

@@ -0,0 +1,137 @@
#!/opt/homebrew/bin/python3.11
"""
Audio Taxonomy Processor (Hugging Face Transformers)
Responsibility: classify audio with high accuracy using the AST model and map the results to business categories.
"""
import numpy as np
import json
import os
import sys
import librosa
# Dependency check: the classifier below cannot run without Hugging Face
# transformers, so abort early with an install hint when it is missing.
try:
    from transformers import pipeline
except ImportError:
    print("❌ transformers not found. Run: pip install transformers")
    sys.exit(1)
else:
    # Only reached when the import above succeeded.
    HAS_HF = True
# Configuration; both values can be overridden through environment variables.
UUID = os.environ.get("UUID", "384b0ff44aaaa1f1")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
# Input audio and output JSON both live in <OUTPUT_DIR>/<UUID>/ and are named
# after the UUID; only the suffix differs.
AUDIO_PATH, OUTPUT_JSON = (
    os.path.join(OUTPUT_DIR, UUID, UUID + suffix)
    for suffix in (".wav", ".audio_taxonomy.json")
)
# 1. Label mapping (AudioSet label -> business category).
# Written grouped by category for readability, then flattened into the
# label-keyed lookup that the rest of the script uses. Entry order is the
# same as a flat literal listing the labels top to bottom.
# NOTE(review): "Glass shatter" and "Keyboard" may not match any AudioSet
# display name exactly (the ontology lists e.g. "Shatter" and
# "Computer keyboard") - verify against the model's id2label table.
TAXONOMY_MAP = {
    label: category
    for category, labels in (
        (
            "Human/Speech",
            (
                "Speech",
                "Male speech, man speaking",
                "Female speech, woman speaking",
                "Conversation",
            ),
        ),
        ("Human/Vocals", ("Laughter", "Singing", "Choir", "Cough", "Applause")),
        ("Nature/Weather", ("Rain", "Raindrop", "Thunder", "Wind")),
        ("Nature/Water", ("Ocean", "Stream")),
        ("Nature/Flora_Fauna", ("Bird", "Dog", "Cat")),
        (
            "Artificial/Impact_Weapon",
            ("Gunshot, gunfire", "Explosion", "Glass shatter"),
        ),
        ("Artificial/Transport", ("Car", "Engine", "Siren")),
        ("Artificial/Music", ("Piano", "Guitar", "Drum", "Music")),
        ("Artificial/Household", ("Keyboard", "Telephone", "Door")),
    )
    for label in labels
}
def map_to_taxonomy(predictions, taxonomy_map=None, min_score=0.3):
    """Map Hugging Face classifier predictions onto business categories.

    Args:
        predictions: Iterable of dicts, each with a "label" and a "score" key.
        taxonomy_map: Mapping of model label -> business category. Defaults
            to the module-level TAXONOMY_MAP.
        min_score: Confidence threshold; only predictions scoring strictly
            above it are kept.

    Returns:
        Dict of category -> best score for that category, rounded to four
        decimals. Categories appear in the order they were first matched.
    """
    if taxonomy_map is None:
        taxonomy_map = TAXONOMY_MAP

    best = {}
    for pred in predictions:
        category = taxonomy_map.get(pred["label"])
        if not category:
            continue  # label has no business category
        score = float(pred["score"])
        if score <= min_score:
            continue  # filter out low-confidence predictions
        # Several labels share one category (e.g. "Speech" and
        # "Male speech, man speaking"). Keep the strongest evidence instead
        # of letting a later, weaker prediction overwrite it.
        if category not in best or score > best[category]:
            best[category] = score
    return {category: round(score, 4) for category, score in best.items()}
def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
    """Classify an audio file in sliding windows and return tagged segments.

    The audio is loaded as 16 kHz mono and cut into windows of ``chunk_sec``
    seconds, advancing ``hop_sec`` seconds each step. Only full-length
    windows are classified, so a trailing partial window (or a file shorter
    than one window) produces no entry.

    Args:
        audio_path: Path of the audio file to analyse.
        chunk_sec: Window length in seconds.
        hop_sec: Step between consecutive window starts in seconds.

    Returns:
        List of ``{"timestamp": <window start in s>, "categories": {...}}``
        dicts, one per window in which at least one business category was
        detected.

    Raises:
        ValueError: If ``chunk_sec`` or ``hop_sec`` is too small to span at
            least one audio sample (this includes zero and negative values,
            which would otherwise never advance the window).
    """
    sample_rate = 16000  # rate the audio is resampled to on load

    # Work in whole samples: integer offsets cannot accumulate floating-point
    # drift the way repeatedly adding hop_sec to a float does.
    chunk_len = int(round(chunk_sec * sample_rate))
    hop_len = int(round(hop_sec * sample_rate))
    if chunk_len < 1 or hop_len < 1:
        raise ValueError(
            f"chunk_sec and hop_sec must be positive (got {chunk_sec!r}, {hop_sec!r})"
        )

    print("🔍 Loading AST model (MIT) from Hugging Face...")
    # Audio Spectrogram Transformer; device=-1 pins inference to the CPU.
    classifier = pipeline(
        "audio-classification",
        model="MIT/ast-finetuned-audioset-10-10-0.4593",
        device=-1,
    )
    print(f"📊 Analyzing audio in {chunk_sec}s chunks (hop: {hop_sec}s)...")
    y, _ = librosa.load(audio_path, sr=sample_rate, mono=True)
    total_dur = len(y) / sample_rate
    print(f"⏱️ Total duration: {total_dur:.1f}s")

    results = []
    failed = 0
    first_error = None
    next_report = 30  # seconds of audio after which the next progress line is due

    for start in range(0, len(y) - chunk_len + 1, hop_len):
        clip = y[start:start + chunk_len]
        try:
            # Top-5 predictions for this window.
            # NOTE(review): confirm the installed pipeline version honours
            # the sampling_rate keyword for raw arrays.
            preds = classifier(clip, sampling_rate=sample_rate, top_k=5)
            taxonomy = map_to_taxonomy(preds)
            if taxonomy:
                results.append(
                    {"timestamp": round(start / sample_rate, 1), "categories": taxonomy}
                )
        except Exception as exc:
            # Best effort: one bad window must not abort the whole file, but
            # the failure is recorded so it can be reported below.
            failed += 1
            if first_error is None:
                first_error = exc

        processed = (start + hop_len) / sample_rate
        if processed >= next_report:
            # Print once per 30 s of audio, not once per hop.
            print(f" 🕒 Processed: {int(processed)}s / {int(total_dur)}s")
            next_report = (int(processed) // 30 + 1) * 30

    if failed:
        print(
            f"⚠️ Skipped {failed} chunk(s) due to errors; first error: {first_error!r}",
            file=sys.stderr,
        )
    return results
def main():
    """Run audio taxonomy classification for the configured UUID.

    Uses the WAV at AUDIO_PATH. When it is missing, the audio track is
    extracted with ffmpeg from a sibling ``.mp4`` or ``.mov`` file first.
    The tagged segments are written to OUTPUT_JSON. Exits with status 1 when
    no input media exists or the extraction fails.
    """
    # Local import: only this entry point launches an external process.
    import subprocess

    if not os.path.exists(AUDIO_PATH):
        job_dir = os.path.join(OUTPUT_DIR, UUID)
        candidates = [os.path.join(job_dir, f"{UUID}{ext}") for ext in (".mp4", ".mov")]
        video_path = next((p for p in candidates if os.path.exists(p)), None)
        if video_path is None:
            print("❌ No audio/video found.")
            sys.exit(1)

        print("🎥 Extracting audio from video...")
        # Argument list, no shell: paths containing spaces or shell
        # metacharacters are passed to ffmpeg verbatim.
        cmd = [
            "ffmpeg", "-y",
            "-i", video_path,
            "-vn", "-ar", "16000", "-ac", "1",
            AUDIO_PATH,
        ]
        try:
            subprocess.run(cmd, check=True)
        except (OSError, subprocess.CalledProcessError) as exc:
            print(f"❌ ffmpeg audio extraction failed: {exc}", file=sys.stderr)
            # Remove a partial WAV so a later run does not mistake it for
            # valid input.
            try:
                os.remove(AUDIO_PATH)
            except OSError:
                pass
            sys.exit(1)

    print(f"🕵️‍♂️ Starting Audio Taxonomy Classification for {UUID}...")
    events = run_audio_taxonomy(AUDIO_PATH)
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)
    print("\n🎉 Classification Complete!")
    print(f"✅ Found {len(events)} tagged audio segments.")
    print(f"💾 Saved to {OUTPUT_JSON}")


if __name__ == "__main__":
    main()