feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/audio_taxonomy_processor.py
+++ b/scripts/audio_taxonomy_processor.py
@@ -0,0 +1,137 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Audio Taxonomy Processor (Hugging Face Transformers)
+
+Responsibility: run high-accuracy audio classification with an AST (Audio
+Spectrogram Transformer) model and map the predicted labels onto the business
+taxonomy.
+
+When run as a script it reads <MOMENTRY_OUTPUT_DIR>/<UUID>/<UUID>.wav
+(extracting it with ffmpeg from a sibling .mp4/.mov when the .wav is missing)
+and writes <MOMENTRY_OUTPUT_DIR>/<UUID>/<UUID>.audio_taxonomy.json.
+"""
+
+import numpy as np
+import json
+import os
+import sys
+import librosa
+
+# Dependency check: transformers is required; exit with an install hint when
+# it is missing.
+try:
+    from transformers import pipeline
+
+    # Only ever True: the ImportError branch below exits the process.
+    HAS_HF = True
+except ImportError:
+    print("❌ transformers not found. Run: pip install transformers")
+    sys.exit(1)
+
+# Settings, overridable through environment variables.
+# UUID names both the per-item sub-directory and the files inside it.
+UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
+OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
+# Input audio: <OUTPUT_DIR>/<UUID>/<UUID>.wav
+AUDIO_PATH = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.wav")
+# Result file: <OUTPUT_DIR>/<UUID>/<UUID>.audio_taxonomy.json
+OUTPUT_JSON = os.path.join(OUTPUT_DIR, UUID, f"{UUID}.audio_taxonomy.json")
+
+# 1. Label mapping table (AudioSet label -> business category).
+# Keys are looked up exactly against the classifier's predicted label, and
+# several labels may share one category. Labels that are not listed here are
+# ignored by map_to_taxonomy().
+TAXONOMY_MAP = {
+    "Speech": "Human/Speech",
+    "Male speech, man speaking": "Human/Speech",
+    "Female speech, woman speaking": "Human/Speech",
+    "Conversation": "Human/Speech",
+    "Laughter": "Human/Vocals",
+    "Singing": "Human/Vocals",
+    "Choir": "Human/Vocals",
+    "Cough": "Human/Vocals",
+    "Applause": "Human/Vocals",
+    "Rain": "Nature/Weather",
+    "Raindrop": "Nature/Weather",
+    "Thunder": "Nature/Weather",
+    "Wind": "Nature/Weather",
+    "Ocean": "Nature/Water",
+    "Stream": "Nature/Water",
+    "Bird": "Nature/Flora_Fauna",
+    "Dog": "Nature/Flora_Fauna",
+    "Cat": "Nature/Flora_Fauna",
+    "Gunshot, gunfire": "Artificial/Impact_Weapon",
+    "Explosion": "Artificial/Impact_Weapon",
+    "Glass shatter": "Artificial/Impact_Weapon",
+    "Car": "Artificial/Transport",
+    "Engine": "Artificial/Transport",
+    "Siren": "Artificial/Transport",
+    "Piano": "Artificial/Music",
+    "Guitar": "Artificial/Music",
+    "Drum": "Artificial/Music",
+    "Music": "Artificial/Music",
+    "Keyboard": "Artificial/Household",
+    "Telephone": "Artificial/Household",
+    "Door": "Artificial/Household",
+}
+
+
+def map_to_taxonomy(predictions, min_score=0.3, taxonomy_map=None):
+    """Map classifier predictions onto business taxonomy categories.
+
+    Several labels can map to the same category (for example "Speech" and
+    "Conversation" both map to "Human/Speech"). When that happens the highest
+    score is kept, so the result does not depend on the order of
+    ``predictions``.
+
+    Args:
+        predictions: Iterable of dicts with a ``"label"`` string and a numeric
+            ``"score"``, as returned by the audio-classification pipeline.
+        min_score: Confidence threshold; only scores strictly greater than
+            this value are kept.
+        taxonomy_map: Optional ``{label: category}`` mapping. Defaults to the
+            module-level ``TAXONOMY_MAP``.
+
+    Returns:
+        Dict of ``{category: score}`` with scores rounded to 4 decimals.
+        Empty when nothing passes the filter.
+    """
+    if taxonomy_map is None:
+        taxonomy_map = TAXONOMY_MAP
+
+    events = {}
+    for pred in predictions:
+        category = taxonomy_map.get(pred["label"])
+        score = float(pred["score"])
+        # Skip unmapped labels and low-confidence hits.
+        if not (category and score > min_score):
+            continue
+        rounded = round(score, 4)
+        # Keep the best score per category instead of letting a later,
+        # weaker label overwrite a stronger one.
+        if category not in events or rounded > events[category]:
+            events[category] = rounded
+    return events
+
+
+def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
+    """Classify a sliding window over an audio file and map hits to the taxonomy.
+
+    The file is loaded as 16 kHz mono, cut into ``chunk_sec``-long windows that
+    advance by ``hop_sec``, and each window is classified (top 5 labels) and
+    passed through ``map_to_taxonomy``. Trailing audio shorter than
+    ``chunk_sec`` is not classified.
+
+    Args:
+        audio_path: Path of the audio file to analyse.
+        chunk_sec: Window length in seconds; must be positive.
+        hop_sec: Step between window starts in seconds; must be positive.
+
+    Returns:
+        List of ``{"timestamp": <window start, seconds>, "categories": {...}}``
+        dicts, one per window that produced at least one mapped category.
+
+    Raises:
+        ValueError: If ``chunk_sec`` or ``hop_sec`` is not positive.
+    """
+    if chunk_sec <= 0 or hop_sec <= 0:
+        # A non-positive hop would never advance the window (infinite loop).
+        raise ValueError("chunk_sec and hop_sec must be positive")
+
+    sample_rate = 16000
+    progress_every_sec = 30
+
+    print("🔍 Loading AST model (MIT) from Hugging Face...")
+    # Audio Spectrogram Transformer; device=-1 runs the pipeline on CPU.
+    classifier = pipeline(
+        "audio-classification",
+        model="MIT/ast-finetuned-audioset-10-10-0.4593",
+        device=-1,
+    )
+
+    print(f"📊 Analyzing audio in {chunk_sec}s chunks (hop: {hop_sec}s)...")
+    y, sr = librosa.load(audio_path, sr=sample_rate, mono=True)
+    total_dur = len(y) / sr
+    print(f"⏱️  Total duration: {total_dur:.1f}s")
+
+    results = []
+    failed = 0
+    next_report = progress_every_sec
+    index = 0
+    while True:
+        # Derive the position from the window index so float error does not
+        # accumulate across many hops.
+        current = index * hop_sec
+        if current + chunk_sec > total_dur:
+            break
+        clip = y[int(current * sr):int((current + chunk_sec) * sr)]
+
+        try:
+            # Top-5 inference for this window.
+            preds = classifier(clip, sampling_rate=sample_rate, top_k=5)
+            taxonomy = map_to_taxonomy(preds)
+        except Exception as e:
+            # Best effort: a bad window must not abort the whole run, but it
+            # must not vanish silently either. Only the first failure is
+            # printed in detail to avoid flooding the log.
+            failed += 1
+            if failed == 1:
+                print(
+                    f"⚠️  Skipping chunk at {current:.1f}s: {e}",
+                    file=sys.stderr,
+                )
+        else:
+            if taxonomy:
+                results.append({"timestamp": round(current, 1), "categories": taxonomy})
+
+        index += 1
+        processed = index * hop_sec
+        # Report once per 30 s boundary crossed.
+        if processed >= next_report:
+            print(f"   🕒 Processed: {int(processed)}s / {int(total_dur)}s")
+            next_report = (int(processed) // progress_every_sec + 1) * progress_every_sec
+
+    if failed:
+        print(
+            f"⚠️  {failed} chunk(s) could not be classified and were skipped.",
+            file=sys.stderr,
+        )
+
+    return results
+
+
+if __name__ == "__main__":
+    # Local import: only the script entry point runs an external program.
+    import subprocess
+
+    if not os.path.exists(AUDIO_PATH):
+        # No pre-extracted WAV: fall back to a sibling .mp4, then .mov.
+        video_candidates = [
+            os.path.join(OUTPUT_DIR, UUID, f"{UUID}{ext}") for ext in (".mp4", ".mov")
+        ]
+        video_path = next((p for p in video_candidates if os.path.exists(p)), None)
+        if video_path is None:
+            print("❌ No audio/video found.")
+            sys.exit(1)
+
+        print("🎥 Extracting audio from video...")
+        # Argument list instead of a shell string: paths containing spaces or
+        # shell metacharacters (UUID and OUTPUT_DIR come from the environment)
+        # are passed to ffmpeg verbatim and never interpreted by a shell.
+        ffmpeg_cmd = [
+            "ffmpeg", "-y", "-i", video_path,
+            "-vn", "-ar", "16000", "-ac", "1", AUDIO_PATH,
+        ]
+        try:
+            completed = subprocess.run(ffmpeg_cmd, check=False)
+        except FileNotFoundError:
+            print("❌ ffmpeg not found on PATH.")
+            sys.exit(1)
+        # Stop here on failure rather than crashing later while loading a
+        # missing or truncated WAV.
+        if completed.returncode != 0 or not os.path.exists(AUDIO_PATH):
+            print(f"❌ ffmpeg failed (exit code {completed.returncode}).")
+            sys.exit(1)
+
+    print(f"🕵️‍♂️ Starting Audio Taxonomy Classification for {UUID}...")
+    events = run_audio_taxonomy(AUDIO_PATH)
+
+    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
+        json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)
+
+    print("\n🎉 Classification Complete!")
+    print(f"✅ Found {len(events)} tagged audio segments.")
+    print(f"💾 Saved to {OUTPUT_JSON}")