feat: update Python processors and add utility scripts

- Update ASR, face, OCR, and pose processors
- Add release pre-flight check script
- Add synonym generation and chunk processing scripts
- Add face recognition and stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/text_semantic_analysis.py
+++ b/scripts/text_semantic_analysis.py
@@ -0,0 +1,138 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Text Semantic Analysis (PoC)
+職責：分析 ASR 數據的語義分佈，生成統計報告並演示搜尋效果。
+"""
+
+import sys
+import json
+import os
+import argparse
+import numpy as np
+
+# Put this script's own directory at the front of the module search path.
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+# Both ML packages are required: if either import fails, print an install
+# hint and terminate the process with exit status 1.
+try:
+    from sentence_transformers import SentenceTransformer
+    from sklearn.cluster import KMeans
+
+    HAS_DEPS = True
+except ImportError:
+    # HAS_DEPS is assigned here, but the process exits a few lines below.
+    HAS_DEPS = False
+    print(
+        "❌ Missing dependencies. Run: pip install sentence-transformers scikit-learn"
+    )
+    sys.exit(1)
+
+# Base directory for "<uuid>.asr.json" files; taken from the
+# MOMENTRY_OUTPUT_DIR environment variable, falling back to "./output".
+OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "./output")
+
+
+def load_asr_data(uuid):
+    path = os.path.join(OUTPUT_DIR, f"{uuid}.asr.json")
+    if not os.path.exists(path):
+        print(f"❌ ASR file not found: {path}")
+        return None
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+def run_analysis(uuid, num_topics=5):
+    """
+    運行語義分析
+    """
+    print(f"🚀 Starting Semantic Analysis for {uuid}...")
+
+    # 1. 加載數據
+    data = load_asr_data(uuid)
+    if not data:
+        return
+
+    segments = data.get("segments", [])
+    texts = [
+        seg["text"] for seg in segments if len(seg["text"].strip()) > 5
+    ]  # 過濾太短的
+    times = [seg["start"] for seg in segments if len(seg["text"].strip()) > 5]
+
+    if not texts:
+        print("❌ No valid text found.")
+        return
+
+    print(f"✅ Loaded {len(texts)} valid text segments.")
+
+    # 2. 向量化 (使用輕量級模型 all-MiniLM-L6-v2)
+    print("🧠 Generating embeddings (this may take a moment)...")
+    model = SentenceTransformer("all-MiniLM-L6-v2")
+    embeddings = model.encode(texts, show_progress_bar=True)
+
+    # 3. 統計分析：主題聚類 (K-Means)
+    print(f"🔍 Identifying ~{num_topics} main topics...")
+    kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
+    labels = kmeans.fit_predict(embeddings)
+
+    # 計算每個 Topic 的中心句 (離中心點最近的句子)
+    topic_centers = []
+    for i in range(num_topics):
+        cluster_indices = np.where(labels == i)[0]
+        if len(cluster_indices) == 0:
+            continue
+
+        cluster_embeddings = embeddings[cluster_indices]
+        cluster_texts = [texts[idx] for idx in cluster_indices]
+        cluster_times = [times[idx] for idx in cluster_indices]
+
+        # 計算 Cluster Center
+        center = np.mean(cluster_embeddings, axis=0)
+
+        # 找最接近中心的文本
+        sims = np.dot(cluster_embeddings, center) / (
+            np.linalg.norm(cluster_embeddings, axis=1) * np.linalg.norm(center)
+        )
+        best_idx_in_cluster = np.argmax(sims)
+
+        topic_centers.append(
+            {
+                "topic_id": i,
+                "representative_text": cluster_texts[best_idx_in_cluster],
+                "representative_time": cluster_times[best_idx_in_cluster],
+                "count": len(cluster_texts),
+            }
+        )
+
+    # 4. 輸出報告
+    print("\n" + "=" * 60)
+    print(f"📊 ANALYSIS REPORT FOR {uuid}")
+    print("=" * 60)
+    for topic in sorted(topic_centers, key=lambda x: x["count"], reverse=True):
+        print(f"🔹 Topic {topic['topic_id']} ({topic['count']} segments):")
+        print(f"   💬 '{topic['representative_text']}'")
+        print(f"   ⏰ Time: {topic['representative_time']:.2f}s")
+        print("-" * 40)
+
+    # 5. 演示搜尋 (Search Demo)
+    print("\n🔎 SEARCH DEMO")
+    print("-" * 60)
+    query = input(
+        "Enter a search query (e.g., 'money', 'fight', 'love', or press Enter to skip): "
+    )
+    if query:
+        query_vec = model.encode([query])[0]
+        sims = np.dot(embeddings, query_vec)
+
+        # 取 Top 3
+        top_indices = np.argsort(sims)[-3:][::-1]
+
+        for idx in top_indices:
+            print(
+                f"✅ Match ({sims[idx] * 100:.1f}%): [{times[idx]:.1f}s] {texts[idx]}"
+            )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Semantic Analysis PoC")
+    parser.add_argument("--uuid", default="384b0ff44aaaa1f1", help="Video UUID")
+    parser.add_argument(
+        "--topics", type=int, default=5, help="Number of topics to find"
+    )
+    args = parser.parse_args()
+
+    run_analysis(args.uuid, args.topics)