feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
This commit is contained in:
138
scripts/text_semantic_analysis.py
Normal file
138
scripts/text_semantic_analysis.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Text Semantic Analysis (PoC)
|
||||
職責:分析 ASR 數據的語義分佈,生成統計報告並演示搜尋效果。
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
# Optional heavy dependencies: the script cannot do anything useful without
# them, so report the install command and exit instead of failing later.
try:
    from sentence_transformers import SentenceTransformer
    from sklearn.cluster import KMeans
except ImportError:
    HAS_DEPS = False
    print(
        "❌ Missing dependencies. Run: pip install sentence-transformers scikit-learn"
    )
    sys.exit(1)
else:
    HAS_DEPS = True
|
||||
|
||||
# Base directory in which "<uuid>.asr.json" files are looked up; can be
# overridden through the MOMENTRY_OUTPUT_DIR environment variable.
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "./output")
|
||||
|
||||
|
||||
def load_asr_data(uuid):
    """Load the ASR result JSON for ``uuid`` from ``OUTPUT_DIR``.

    Args:
        uuid: Identifier of the video; the file read is
            ``<OUTPUT_DIR>/<uuid>.asr.json``.

    Returns:
        The parsed JSON content, or ``None`` (after printing an error
        message) when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    path = os.path.join(OUTPUT_DIR, f"{uuid}.asr.json")
    try:
        # Decode explicitly as UTF-8: relying on the locale default can
        # corrupt or reject non-ASCII transcript text on some platforms.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        print(f"❌ ASR file not found: {path}")
        return None
|
||||
|
||||
|
||||
def _cosine_similarities(matrix, vector):
    """Return the cosine similarity of every row of ``matrix`` with ``vector``.

    Pairs where either side has zero norm yield 0.0 instead of NaN/inf.
    """
    matrix = np.asarray(matrix, dtype=float)
    vector = np.asarray(vector, dtype=float)
    dots = matrix @ vector
    denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(vector)
    nonzero = denom > 0
    # Substitute 1.0 for zero denominators so the division itself is safe;
    # those positions are then overwritten with 0.0.
    return np.where(nonzero, dots / np.where(nonzero, denom, 1.0), 0.0)


def run_analysis(uuid, num_topics=5):
    """Run the semantic-analysis PoC on one ASR transcript.

    Loads the ASR data for ``uuid``, embeds every segment whose stripped
    text is longer than 5 characters, clusters the embeddings with
    K-Means, prints one representative sentence per topic, and finally
    offers an interactive search prompt over the same embeddings.

    Args:
        uuid: Identifier passed to ``load_asr_data`` to locate the ASR file.
        num_topics: Requested number of K-Means clusters. It is clamped to
            the range ``[1, number of usable segments]`` because K-Means
            cannot produce more clusters than there are samples.

    Returns:
        None. All results are printed to stdout.
    """
    print(f"🚀 Starting Semantic Analysis for {uuid}...")

    # 1. Load data
    data = load_asr_data(uuid)
    if not data:
        return

    # Filter once so texts and timestamps can never get out of step.
    # Segments lacking text or a start time are skipped rather than
    # raising KeyError; very short texts are dropped as noise.
    usable = [
        (seg["text"], seg["start"])
        for seg in data.get("segments", [])
        if len((seg.get("text") or "").strip()) > 5
        and seg.get("start") is not None
    ]

    if not usable:
        print("❌ No valid text found.")
        return

    texts = [text for text, _ in usable]
    times = [start for _, start in usable]

    print(f"✅ Loaded {len(texts)} valid text segments.")

    # 2. Vectorise (lightweight model all-MiniLM-L6-v2)
    print("🧠 Generating embeddings (this may take a moment)...")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = np.asarray(model.encode(texts, show_progress_bar=True))

    # 3. Topic clustering (K-Means)
    # Asking for more clusters than samples makes K-Means raise, so clamp.
    n_clusters = max(1, min(num_topics, len(texts)))
    print(f"🔍 Identifying ~{n_clusters} main topics...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(embeddings)

    # For each topic pick the sentence closest to the cluster centre.
    topic_centers = []
    for topic_id in range(n_clusters):
        cluster_indices = np.where(labels == topic_id)[0]
        if len(cluster_indices) == 0:
            continue

        cluster_embeddings = embeddings[cluster_indices]

        # Cluster centre = mean of the member embeddings.
        center = np.mean(cluster_embeddings, axis=0)

        # Member with the highest cosine similarity to the centre.
        sims = _cosine_similarities(cluster_embeddings, center)
        best = int(cluster_indices[int(np.argmax(sims))])

        topic_centers.append(
            {
                "topic_id": topic_id,
                "representative_text": texts[best],
                "representative_time": times[best],
                "count": len(cluster_indices),
            }
        )

    # 4. Print the report, largest topics first
    print("\n" + "=" * 60)
    print(f"📊 ANALYSIS REPORT FOR {uuid}")
    print("=" * 60)
    for topic in sorted(topic_centers, key=lambda x: x["count"], reverse=True):
        print(f"🔹 Topic {topic['topic_id']} ({topic['count']} segments):")
        print(f" 💬 '{topic['representative_text']}'")
        print(f" ⏰ Time: {topic['representative_time']:.2f}s")
        print("-" * 40)

    # 5. Search demo
    print("\n🔎 SEARCH DEMO")
    print("-" * 60)
    try:
        query = input(
            "Enter a search query (e.g., 'money', 'fight', 'love', or press Enter to skip): "
        )
    except EOFError:
        # stdin is closed (non-interactive run): skip the demo quietly.
        query = ""

    query = query.strip()
    if query:
        query_vec = model.encode([query])[0]
        # Use cosine similarity, as for topic selection: a raw dot product
        # is only a valid "match percentage" for unit-length embeddings,
        # which is not guaranteed here.
        sims = _cosine_similarities(embeddings, query_vec)

        # Top 3 matches, best first (fewer if there are fewer segments).
        top_indices = np.argsort(sims)[-3:][::-1]

        for idx in top_indices:
            print(
                f"✅ Match ({sims[idx] * 100:.1f}%): [{times[idx]:.1f}s] {texts[idx]}"
            )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: read --uuid / --topics and run the analysis.
    cli = argparse.ArgumentParser(description="Semantic Analysis PoC")
    cli.add_argument("--uuid", default="384b0ff44aaaa1f1", help="Video UUID")
    cli.add_argument(
        "--topics", type=int, default=5, help="Number of topics to find"
    )
    options = cli.parse_args()

    run_analysis(uuid=options.uuid, num_topics=options.topics)
|
||||
Reference in New Issue
Block a user