feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/generate_synonyms_ollama.py
+++ b/scripts/generate_synonyms_ollama.py
@@ -0,0 +1,262 @@
+#!/usr/bin/env python3
+"""
+LLM-Based Chinese-English Synonym Generator for Momentry
+
+Generates a synonym database by querying an LLM through a local Ollama server.
+Output format: JSON with word -> [synonyms] mapping
+
+Usage:
+  python scripts/generate_synonyms_ollama.py           # Using Ollama (default: llama3)
+  python scripts/generate_synonyms_ollama.py --model gemma:2b  # Specify model
+  python scripts/generate_synonyms_ollama.py --help    # Show help
+
+Requires:
+  - Ollama running (http://localhost:11434)
+  - pip install ollama
+"""
+
+import json
+import os
+import sys
+import time
+import argparse
+from typing import Dict, List, Optional
+
+try:
+    import ollama
+except ImportError:
+    print("Error: ollama package required. Install with: pip install ollama")
+    sys.exit(1)
+
+# ======================== Seed Words for Video Search Context ========================
+# These represent common concepts in video content that benefit from synonym expansion
+#
+# Maps a category name to the list of words queried for that category. The
+# category names are what the --category CLI option selects; only the words
+# inside the lists are sent to the LLM, never the category names themselves.
+# NOTE: some words appear in more than one list (e.g. "price" under both
+# "money" and "transaction"); the batch generator skips a word that already
+# has an entry in its result dict.
+
+SEED_WORDS: Dict[str, List[str]] = {
+    # Action & Movement
+    "action": ["run", "walk", "move", "chase", "escape", "fight", "attack"],
+    "emotion": ["happy", "sad", "angry", "afraid", "surprised", "calm"],
+    "speech": ["talk", "say", "tell", "ask", "answer", "shout", "whisper"],
+    "scene": ["scene", "moment", "part", "clip", "sequence", "segment"],
+    # People & Relationships
+    "person": ["man", "woman", "boy", "girl", "child", "person"],
+    "relationship": ["friend", "enemy", "lover", "partner", "colleague"],
+    "authority": ["police", "detective", "officer", "guard", "agent"],
+    # Objects & Settings
+    "vehicle": ["car", "truck", "bus", "van", "vehicle", "automobile"],
+    "location": ["house", "office", "street", "city", "country", "place"],
+    "food": ["eat", "dinner", "lunch", "breakfast", "meal", "snack"],
+    "weapon": ["gun", "knife", "sword", "bomb", "weapon"],
+    # Events & Activities
+    "event": ["party", "meeting", "gathering", "celebration", "festival"],
+    "crime": ["theft", "murder", "robbery", "assault", "kidnapping"],
+    "travel": ["travel", "trip", "journey", "flight", "drive", "ride"],
+    # Time & Duration
+    "time": ["morning", "noon", "evening", "night", "afternoon"],
+    "duration": ["second", "minute", "hour", "day", "week", "month", "year"],
+    # Emotions & States
+    "positive": ["love", "joy", "peace", "hope", "trust", "success"],
+    "negative": ["fear", "anger", "pain", "death", "loss", "failure"],
+    "mental": ["think", "know", "believe", "understand", "remember", "forget"],
+    # Sensory
+    "sight": ["see", "look", "watch", "observe", "notice", "find"],
+    "sound": ["hear", "listen", "noise", "music", "voice", "speak"],
+    # Money & Value
+    "money": ["cash", "dollar", "coin", "payment", "price", "wealth"],
+    "transaction": ["buy", "sell", "pay", "spend", "cost", "price"],
+    # Chinese specific concepts (entries are written in Traditional Chinese)
+    "chinese_emotion": ["愛", "恨", "喜", "怒", "哀", "樂", "愁", "驚"],
+    "chinese_action": ["走", "跑", "說", "看", "聽", "想", "做", "吃"],
+    "chinese_object": ["房子", "車子", "書", "電話", "電腦", "手機"],
+    "chinese_person": ["男人", "女人", "小孩", "老人", "朋友", "敵人"],
+}
+
+# ======================== LLM Query Functions ========================
+
+# System message sent with every request made by query_llm(). It asks for a
+# bare JSON array because query_llm() parses the reply with json.loads().
+SYSTEM_PROMPT = """You are a synonym generation assistant. For each given word, provide 8-15 synonyms in the same language.
+Rules:
+1. Return ONLY a JSON array of strings, nothing else
+2. Synonyms should be contextually relevant for video content search
+3. Include common words, informal terms, and related concepts
+4. Do NOT include the input word in the output
+5. All synonyms must be in the SAME language as the input word
+6. No explanations, no markdown, just the JSON array
+
+Example input: "money"
+Example output: ["cash", "dollar", "currency", "funds", "bucks", "greenbacks", "coins", "wealth", "payment"]
+
+Example input: "快樂"
+Example output: ["開心", "高興", "愉快", "歡喜", "歡樂", "喜悅", "愉悅", "幸福"]"""
+
+
+def query_llm(
+    word: str, model: str = "llama3", retries: int = 3
+) -> Optional[List[str]]:
+    """Query LLM for synonyms of a word"""
+    for attempt in range(retries):
+        try:
+            response = ollama.chat(
+                model=model,
+                messages=[
+                    {"role": "system", "content": SYSTEM_PROMPT},
+                    {"role": "user", "content": f'Give synonyms for: "{word}"'},
+                ],
+                options={"temperature": 0.3, "num_predict": 150},
+            )
+
+            content = response["message"]["content"].strip()
+
+            # Parse JSON from response
+            if content.startswith("```"):
+                content = content.split("```")[1]
+                if content.startswith("json"):
+                    content = content[4:]
+                content = content.strip()
+
+            synonyms = json.loads(content)
+
+            if isinstance(synonyms, list) and len(synonyms) > 0:
+                # Filter: remove empty strings, normalize
+                synonyms = [s.strip().lower() for s in synonyms if s.strip()]
+                return synonyms
+
+            print(f"  ⚠ Invalid format for '{word}'")
+            return None
+
+        except json.JSONDecodeError:
+            print(f"  ⚠ JSON parse error for '{word}' (attempt {attempt + 1})")
+        except Exception as e:
+            print(f"  ⚠ LLM error for '{word}': {e} (attempt {attempt + 1})")
+            if attempt < retries - 1:
+                time.sleep(2)
+
+    return None
+
+
+# ======================== Batch Generation ========================
+
+
+def generate_synonyms_batch(
+    seed_words: Dict[str, List[str]],
+    model: str = "llama3",
+    output_file: str = "data/llm_synonyms.json",
+    rate_limit: float = 1.0,
+) -> Dict[str, List[str]]:
+    """Generate synonyms for all seed words"""
+
+    synonym_db: Dict[str, List[str]] = {}
+    total_words = sum(len(words) for words in seed_words.values())
+    processed = 0
+
+    print(f"\n📝 Generating synonyms for {total_words} words using {model}...")
+    print("=" * 60)
+
+    for category, words in seed_words.items():
+        print(f"\n📂 Category: {category}")
+        for word in words:
+            print(f"  🔍 {word}...", end=" ")
+
+            # Check cache first
+            if word in synonym_db:
+                print("⏭ cached")
+                continue
+
+            synonyms = query_llm(word, model=model)
+
+            if synonyms:
+                synonym_db[word] = synonyms
+                print(f"✅ {len(synonyms)} synonyms")
+            else:
+                print("❌ failed")
+
+            processed += 1
+            time.sleep(rate_limit)  # Rate limit
+
+        # Save progress after each category
+        with open(output_file, "w", encoding="utf-8") as f:
+            json.dump(synonym_db, f, ensure_ascii=False, indent=2)
+
+    print("\n" + "=" * 60)
+    print(f"✅ Done! Saved {len(synonym_db)} entries to {output_file}")
+    print(f"   Total words processed: {processed}/{total_words}")
+
+    return synonym_db
+
+
+def load_existing_db(filepath: str) -> Dict[str, List[str]]:
+    """Load existing synonym database"""
+    if os.path.exists(filepath):
+        with open(filepath, "r", encoding="utf-8") as f:
+            return json.load(f)
+    return {}
+
+
+# ======================== Main ========================
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="LLM-Based Chinese-English Synonym Generator for Momentry"
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="llama3",
+        help="Ollama model name (default: llama3)",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="data/llm_synonyms.json",
+        help="Output file path (default: data/llm_synonyms.json)",
+    )
+    parser.add_argument(
+        "--rate-limit",
+        type=float,
+        default=1.0,
+        help="Rate limit in seconds between requests (default: 1.0)",
+    )
+    parser.add_argument(
+        "--category",
+        type=str,
+        default=None,
+        help="Process only this category (e.g., 'action', 'emotion')",
+    )
+    parser.add_argument(
+        "--resume", action="store_true", help="Resume from existing output file"
+    )
+    parser.add_argument(
+        "--test", action="store_true", help="Test with a few words only"
+    )
+
+    args = parser.parse_args()
+
+    # Prepare seed words
+    seeds = SEED_WORDS.copy()
+    if args.category:
+        if args.category in seeds:
+            seeds = {args.category: seeds[args.category]}
+        else:
+            print(f"Error: category '{args.category}' not found")
+            sys.exit(1)
+
+    if args.test:
+        seeds = {"test": ["happy", "money", "警察"]}
+
+    # Load existing data if resuming
+    if args.resume:
+        existing = load_existing_db(args.output)
+        print(f"📥 Loaded {len(existing)} existing entries")
+    else:
+        existing = {}
+
+    # Generate synonyms
+    generate_synonyms_batch(
+        seed_words=seeds,
+        model=args.model,
+        output_file=args.output,
+        rate_limit=args.rate_limit,
+    )
+
+
+# Run the CLI only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()