feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
This commit is contained in:
262
scripts/generate_synonyms_ollama.py
Normal file
262
scripts/generate_synonyms_ollama.py
Normal file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
LLM-Based Chinese-English Synonym Generator for Momentry
|
||||
|
||||
Generates a synonym database by querying a local LLM through the Ollama Python client.
|
||||
Output format: JSON with word -> [synonyms] mapping
|
||||
|
||||
Usage:
|
||||
python scripts/generate_synonyms_ollama.py # Using Ollama (default: llama3)
|
||||
python scripts/generate_synonyms_ollama.py --model gemma:2b # Specify model
|
||||
python scripts/generate_synonyms_ollama.py --help # Show help
|
||||
|
||||
Requires:
|
||||
- Ollama running (http://localhost:11434)
|
||||
- pip install ollama
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
try:
|
||||
import ollama
|
||||
except ImportError:
|
||||
print("Error: ollama package required. Install with: pip install ollama")
|
||||
sys.exit(1)
|
||||
|
||||
# ======================== Seed Words for Video Search Context ========================
|
||||
# These represent common concepts in video content that benefit from synonym expansion
|
||||
|
||||
# Maps a category label to the seed words queried for that category.
# The category key is only used for grouping/progress output and for the
# --category filter; the words themselves become the keys of the synonym DB.
SEED_WORDS: Dict[str, List[str]] = {
    # Actions, emotions, speech & scene structure
    "action": ["run", "walk", "move", "chase", "escape", "fight", "attack"],
    "emotion": ["happy", "sad", "angry", "afraid", "surprised", "calm"],
    "speech": ["talk", "say", "tell", "ask", "answer", "shout", "whisper"],
    "scene": ["scene", "moment", "part", "clip", "sequence", "segment"],
    # People & Relationships
    "person": ["man", "woman", "boy", "girl", "child", "person"],
    "relationship": ["friend", "enemy", "lover", "partner", "colleague"],
    "authority": ["police", "detective", "officer", "guard", "agent"],
    # Objects & Settings
    "vehicle": ["car", "truck", "bus", "van", "vehicle", "automobile"],
    "location": ["house", "office", "street", "city", "country", "place"],
    "food": ["eat", "dinner", "lunch", "breakfast", "meal", "snack"],
    "weapon": ["gun", "knife", "sword", "bomb", "weapon"],
    # Events & Activities
    "event": ["party", "meeting", "gathering", "celebration", "festival"],
    "crime": ["theft", "murder", "robbery", "assault", "kidnapping"],
    "travel": ["travel", "trip", "journey", "flight", "drive", "ride"],
    # Time & Duration
    "time": ["morning", "noon", "evening", "night", "afternoon"],
    "duration": ["second", "minute", "hour", "day", "week", "month", "year"],
    # Emotions & States
    "positive": ["love", "joy", "peace", "hope", "trust", "success"],
    "negative": ["fear", "anger", "pain", "death", "loss", "failure"],
    "mental": ["think", "know", "believe", "understand", "remember", "forget"],
    # Sensory
    "sight": ["see", "look", "watch", "observe", "notice", "find"],
    "sound": ["hear", "listen", "noise", "music", "voice", "speak"],
    # Money & Value
    # NOTE: "price" appears in both lists below; the batch generator skips
    # words it has already resolved, so it is only queried once.
    "money": ["cash", "dollar", "coin", "payment", "price", "wealth"],
    "transaction": ["buy", "sell", "pay", "spend", "cost", "price"],
    # Chinese specific concepts
    "chinese_emotion": ["愛", "恨", "喜", "怒", "哀", "樂", "愁", "驚"],
    "chinese_action": ["走", "跑", "說", "看", "聽", "想", "做", "吃"],
    "chinese_object": ["房子", "車子", "書", "電話", "電腦", "手機"],
    "chinese_person": ["男人", "女人", "小孩", "老人", "朋友", "敵人"],
}
|
||||
|
||||
# ======================== LLM Query Functions ========================
|
||||
|
||||
SYSTEM_PROMPT = """You are a synonym generation assistant. For each given word, provide 8-15 synonyms in the same language.
|
||||
Rules:
|
||||
1. Return ONLY a JSON array of strings, nothing else
|
||||
2. Synonyms should be contextually relevant for video content search
|
||||
3. Include common words, informal terms, and related concepts
|
||||
4. Do NOT include the input word in the output
|
||||
5. All synonyms must be in the SAME language as the input word
|
||||
6. No explanations, no markdown, just the JSON array
|
||||
|
||||
Example input: "money"
|
||||
Example output: ["cash", "dollar", "currency", "funds", "bucks", "greenbacks", "coins", "wealth", "payment"]
|
||||
|
||||
Example input: "快樂"
|
||||
Example output: ["開心", "高興", "愉快", "歡喜", "歡樂", "喜悅", "愉悅", "幸福"]"""
|
||||
|
||||
|
||||
def _parse_synonym_response(content: str, word: str) -> Optional[List[str]]:
    """Turn raw LLM output into a cleaned list of synonyms.

    Args:
        content: Raw text of the model's reply.
        word: The word the synonyms were requested for; it is removed from
            the result if the model echoes it back.

    Returns:
        Lower-cased, stripped, de-duplicated synonyms in the order the model
        gave them, or None when the reply is not a JSON array or nothing
        usable remains after cleaning.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be found in the reply.
    """
    text = content.strip()

    # The prompt asks for a bare JSON array, but models frequently wrap it in
    # a ``` fence or add a sentence around it. Slicing from the first "[" to
    # the last "]" handles all of those shapes without special-casing fences.
    start = text.find("[")
    end = text.rfind("]")
    if start != -1 and end > start:
        text = text[start : end + 1]

    parsed = json.loads(text)
    if not isinstance(parsed, list):
        return None

    target = word.strip().lower()
    seen = set()
    cleaned: List[str] = []
    for item in parsed:
        # Skip numbers/null/nested values instead of crashing on .strip().
        if not isinstance(item, str):
            continue
        candidate = item.strip().lower()
        if not candidate or candidate == target or candidate in seen:
            continue
        seen.add(candidate)
        cleaned.append(candidate)

    return cleaned or None


def query_llm(
    word: str, model: str = "llama3", retries: int = 3
) -> Optional[List[str]]:
    """Query the LLM for synonyms of a word.

    Args:
        word: The word to find synonyms for.
        model: Name of the Ollama model to chat with.
        retries: Maximum number of attempts. JSON parse failures and client
            errors are retried with a 2 second pause between attempts.

    Returns:
        A list of cleaned synonyms, or None if the model returned a
        well-formed but unusable reply, or if every attempt failed.
    """
    for attempt in range(retries):
        try:
            response = ollama.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": f'Give synonyms for: "{word}"'},
                ],
                # Low temperature keeps the output format stable; num_predict
                # caps the reply length.
                options={"temperature": 0.3, "num_predict": 150},
            )
            content = response["message"]["content"]

            synonyms = _parse_synonym_response(content, word)
            if synonyms:
                return synonyms

            # Valid JSON but not a usable list: give up on this word rather
            # than retrying.
            print(f" ⚠ Invalid format for '{word}'")
            return None

        except json.JSONDecodeError:
            print(f" ⚠ JSON parse error for '{word}' (attempt {attempt + 1})")
        except Exception as e:
            # Boundary with the Ollama client: connection and model errors
            # are reported and retried rather than aborting the whole batch.
            print(f" ⚠ LLM error for '{word}': {e} (attempt {attempt + 1})")

        # Pause before the next attempt, but not after the final one.
        if attempt < retries - 1:
            time.sleep(2)

    return None
|
||||
|
||||
|
||||
# ======================== Batch Generation ========================
|
||||
|
||||
|
||||
def generate_synonyms_batch(
    seed_words: Dict[str, List[str]],
    model: str = "llama3",
    output_file: str = "data/llm_synonyms.json",
    rate_limit: float = 1.0,
    existing: Optional[Dict[str, List[str]]] = None,
) -> Dict[str, List[str]]:
    """Generate synonyms for all seed words and persist them as JSON.

    Args:
        seed_words: Mapping of category name to the words to look up.
        model: Ollama model name passed through to query_llm.
        output_file: Path of the JSON file to write. Its parent directory is
            created if it does not exist.
        rate_limit: Seconds to sleep after each LLM request.
        existing: Previously generated entries to start from. Words already
            present are not queried again and are kept in the output. The
            mapping passed in is not modified.

    Returns:
        The word -> synonyms mapping, including any entries from `existing`.
    """
    # Start from a copy so resumed entries survive the rewrite of output_file.
    synonym_db: Dict[str, List[str]] = dict(existing) if existing else {}
    total_words = sum(len(words) for words in seed_words.values())
    processed = 0

    print(f"\n📝 Generating synonyms for {total_words} words using {model}...")
    print("=" * 60)

    for category, words in seed_words.items():
        print(f"\n📂 Category: {category}")
        for word in words:
            # Flush so the word is visible while the (slow) LLM call runs.
            print(f" 🔍 {word}...", end=" ", flush=True)

            # Check cache first: covers resumed entries and words that occur
            # in more than one category.
            if word in synonym_db:
                print("⏭ cached")
                continue

            synonyms = query_llm(word, model=model)

            if synonyms:
                synonym_db[word] = synonyms
                print(f"✅ {len(synonyms)} synonyms")
            else:
                print("❌ failed")

            processed += 1
            time.sleep(rate_limit)  # Rate limit

        # Save progress after each category so an interrupted run can be
        # picked up again with --resume.
        out_dir = os.path.dirname(output_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(synonym_db, f, ensure_ascii=False, indent=2)

    print("\n" + "=" * 60)
    print(f"✅ Done! Saved {len(synonym_db)} entries to {output_file}")
    print(f" Total words processed: {processed}/{total_words}")

    return synonym_db


def load_existing_db(filepath: str) -> Dict[str, List[str]]:
    """Load an existing synonym database.

    Args:
        filepath: Path of a JSON file previously written by
            generate_synonyms_batch.

    Returns:
        The parsed JSON content, or an empty dict if the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}


# ======================== Main ========================


def main():
    """Parse command-line options and run the synonym generation batch."""
    parser = argparse.ArgumentParser(
        description="LLM-Based Chinese-English Synonym Generator for Momentry"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="llama3",
        help="Ollama model name (default: llama3)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="data/llm_synonyms.json",
        help="Output file path (default: data/llm_synonyms.json)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=1.0,
        help="Rate limit in seconds between requests (default: 1.0)",
    )
    parser.add_argument(
        "--category",
        type=str,
        default=None,
        help="Process only this category (e.g., 'action', 'emotion')",
    )
    parser.add_argument(
        "--resume", action="store_true", help="Resume from existing output file"
    )
    parser.add_argument(
        "--test", action="store_true", help="Test with a few words only"
    )

    args = parser.parse_args()

    # Prepare seed words
    seeds = SEED_WORDS.copy()
    if args.category:
        if args.category in seeds:
            seeds = {args.category: seeds[args.category]}
        else:
            print(f"Error: category '{args.category}' not found")
            sys.exit(1)

    # --test replaces whatever was selected above with a tiny fixed sample.
    if args.test:
        seeds = {"test": ["happy", "money", "警察"]}

    # Load existing data if resuming
    if args.resume:
        existing = load_existing_db(args.output)
        print(f"📥 Loaded {len(existing)} existing entries")
    else:
        existing = {}

    # Generate synonyms; `existing` seeds the batch so resumed entries are
    # neither re-queried nor lost when the output file is rewritten.
    generate_synonyms_batch(
        seed_words=seeds,
        model=args.model,
        output_file=args.output,
        rate_limit=args.rate_limit,
        existing=existing,
    )


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user