- Update ASR, face, OCR, pose processors - Add release pre-flight check script - Add synonym generation, chunk processing scripts - Add face recognition, stamp search utilities
263 lines
8.9 KiB
Python
263 lines
8.9 KiB
Python
#!/usr/bin/env python3
"""
LLM-Based Chinese-English Synonym Generator for Momentry

Generates a synonym database by querying an LLM served by Ollama.
Output format: a JSON object mapping each word to a list of synonyms.

Usage:
    python scripts/generate_synonyms_ollama.py                    # Use Ollama (default model: llama3)
    python scripts/generate_synonyms_ollama.py --model gemma:2b   # Specify a model
    python scripts/generate_synonyms_ollama.py --help             # Show help

Requires:
    - Ollama running (http://localhost:11434)
    - pip install ollama
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import time
|
|
import argparse
|
|
from typing import Dict, List, Optional
|
|
|
|
try:
|
|
import ollama
|
|
except ImportError:
|
|
print("Error: ollama package required. Install with: pip install ollama")
|
|
sys.exit(1)
|
|
|
|
# ======================== Seed Words for Video Search Context ========================
|
|
# These represent common concepts in video content that benefit from synonym expansion
|
|
|
|
# Category name -> seed words. Every seed word is sent to the LLM on its own;
# the category name is only used for grouping, progress output and the
# --category filter.
SEED_WORDS: Dict[str, List[str]] = {
    # --- Actions, feelings, speech and scene structure ---
    "action": ["run", "walk", "move", "chase", "escape", "fight", "attack"],
    "emotion": ["happy", "sad", "angry", "afraid", "surprised", "calm"],
    "speech": ["talk", "say", "tell", "ask", "answer", "shout", "whisper"],
    "scene": ["scene", "moment", "part", "clip", "sequence", "segment"],
    # --- People and relationships ---
    "person": ["man", "woman", "boy", "girl", "child", "person"],
    "relationship": ["friend", "enemy", "lover", "partner", "colleague"],
    "authority": ["police", "detective", "officer", "guard", "agent"],
    # --- Objects and settings ---
    "vehicle": ["car", "truck", "bus", "van", "vehicle", "automobile"],
    "location": ["house", "office", "street", "city", "country", "place"],
    "food": ["eat", "dinner", "lunch", "breakfast", "meal", "snack"],
    "weapon": ["gun", "knife", "sword", "bomb", "weapon"],
    # --- Events and activities ---
    "event": ["party", "meeting", "gathering", "celebration", "festival"],
    "crime": ["theft", "murder", "robbery", "assault", "kidnapping"],
    "travel": ["travel", "trip", "journey", "flight", "drive", "ride"],
    # --- Time and duration ---
    "time": ["morning", "noon", "evening", "night", "afternoon"],
    "duration": ["second", "minute", "hour", "day", "week", "month", "year"],
    # --- Emotional and mental states ---
    "positive": ["love", "joy", "peace", "hope", "trust", "success"],
    "negative": ["fear", "anger", "pain", "death", "loss", "failure"],
    "mental": ["think", "know", "believe", "understand", "remember", "forget"],
    # --- Senses ---
    "sight": ["see", "look", "watch", "observe", "notice", "find"],
    "sound": ["hear", "listen", "noise", "music", "voice", "speak"],
    # --- Money and value ---
    "money": ["cash", "dollar", "coin", "payment", "price", "wealth"],
    "transaction": ["buy", "sell", "pay", "spend", "cost", "price"],
    # --- Chinese-language seeds ---
    "chinese_emotion": ["愛", "恨", "喜", "怒", "哀", "樂", "愁", "驚"],
    "chinese_action": ["走", "跑", "說", "看", "聽", "想", "做", "吃"],
    "chinese_object": ["房子", "車子", "書", "電話", "電腦", "手機"],
    "chinese_person": ["男人", "女人", "小孩", "老人", "朋友", "敵人"],
}
|
|
|
|
# ======================== LLM Query Functions ========================
|
|
|
|
# System message sent with every synonym request. It is assembled from one
# entry per prompt line; empty entries are the blank lines that separate the
# rules from the two worked examples (one English, one Chinese).
SYSTEM_PROMPT = "\n".join(
    (
        "You are a synonym generation assistant. For each given word, provide 8-15 synonyms in the same language.",
        "Rules:",
        "1. Return ONLY a JSON array of strings, nothing else",
        "2. Synonyms should be contextually relevant for video content search",
        "3. Include common words, informal terms, and related concepts",
        "4. Do NOT include the input word in the output",
        "5. All synonyms must be in the SAME language as the input word",
        "6. No explanations, no markdown, just the JSON array",
        "",
        'Example input: "money"',
        'Example output: ["cash", "dollar", "currency", "funds", "bucks", "greenbacks", "coins", "wealth", "payment"]',
        "",
        'Example input: "快樂"',
        'Example output: ["開心", "高興", "愉快", "歡喜", "歡樂", "喜悅", "愉悅", "幸福"]',
    )
)
|
|
|
|
|
|
def query_llm(
    word: str, model: str = "llama3", retries: int = 3
) -> Optional[List[str]]:
    """Ask the LLM for synonyms of a single word.

    Sends ``SYSTEM_PROMPT`` plus a one-line user request through
    ``ollama.chat`` and parses the reply into a list of synonyms.

    Args:
        word: The word to find synonyms for.
        model: Ollama model name passed to ``ollama.chat``.
        retries: Maximum number of attempts. A value of 0 or less makes no
            request at all.

    Returns:
        A non-empty list of lower-cased, de-duplicated synonyms that does not
        contain ``word`` itself, or ``None`` when every attempt failed
        (request error, unparseable reply, or no usable synonym in the reply).
    """
    for attempt in range(retries):
        try:
            response = ollama.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": f'Give synonyms for: "{word}"'},
                ],
                options={"temperature": 0.3, "num_predict": 150},
            )
            synonyms = _parse_synonyms(response["message"]["content"], word)
            if synonyms:
                return synonyms
            # Valid JSON but nothing usable: sampling is not deterministic
            # (temperature 0.3), so fall through and retry like the other
            # failure modes instead of giving up immediately.
            print(f" ⚠ Invalid format for '{word}'")
        except json.JSONDecodeError:
            print(f" ⚠ JSON parse error for '{word}' (attempt {attempt + 1})")
        except Exception as e:
            # Retry boundary: any client/transport failure is reported and
            # retried rather than aborting the whole batch.
            print(f" ⚠ LLM error for '{word}': {e} (attempt {attempt + 1})")

        # Short pause between attempts; skipped after the last one.
        if attempt < retries - 1:
            time.sleep(2)

    return None


def _parse_synonyms(content: str, word: str) -> List[str]:
    """Turn a raw LLM reply into a clean synonym list for ``word``.

    Raises ``json.JSONDecodeError`` when no JSON can be parsed from the reply.
    Returns an empty list when the parsed value is not a list or holds no
    usable entries.
    """
    text = content.strip()

    # Models frequently wrap the payload in a Markdown code fence even though
    # the prompt forbids it; keep only what is inside the first fence.
    if text.startswith("```"):
        text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]
        text = text.strip()

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # The reply may carry prose around the array; retry on the outermost
        # bracketed span before reporting a parse failure.
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])

    if not isinstance(parsed, list):
        return []

    target = word.strip().lower()
    seen = set()
    cleaned: List[str] = []
    for item in parsed:
        # Skip numbers, nulls, nested arrays, etc. instead of crashing on
        # ``.strip()``.
        if not isinstance(item, str):
            continue
        candidate = item.strip().lower()
        # Drop blanks, the input word itself (prompt rule 4) and repeats.
        if not candidate or candidate == target or candidate in seen:
            continue
        seen.add(candidate)
        cleaned.append(candidate)
    return cleaned
|
|
|
|
|
|
# ======================== Batch Generation ========================
|
|
|
|
|
|
def generate_synonyms_batch(
    seed_words: Dict[str, List[str]],
    model: str = "llama3",
    output_file: str = "data/llm_synonyms.json",
    rate_limit: float = 1.0,
    existing: Optional[Dict[str, List[str]]] = None,
) -> Dict[str, List[str]]:
    """Generate synonyms for every seed word and persist them as JSON.

    Args:
        seed_words: Mapping of category name to the words in that category.
        model: Ollama model name forwarded to ``query_llm``.
        output_file: Path of the JSON file the database is written to. Its
            parent directory is created if missing.
        rate_limit: Seconds to sleep after each LLM request; values of 0 or
            less disable the pause.
        existing: Previously generated entries to start from. Words already
            present are reported as cached and not queried again, and all
            existing entries are kept in the written file. The passed dict is
            not modified.

    Returns:
        The complete word -> synonyms mapping (existing plus newly generated).
    """
    # Work on a copy so the caller's dict is never mutated.
    synonym_db: Dict[str, List[str]] = dict(existing) if existing else {}
    total_words = sum(len(words) for words in seed_words.values())
    processed = 0

    # Create the output directory before any LLM call, so a missing directory
    # cannot throw away a whole category's worth of results at save time.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"\n📝 Generating synonyms for {total_words} words using {model}...")
    print("=" * 60)

    for category, words in seed_words.items():
        print(f"\n📂 Category: {category}")
        for word in words:
            # Flush so the word is visible while the (slow) request runs.
            print(f" 🔍 {word}...", end=" ", flush=True)

            # Already known: loaded via `existing`, or listed in more than one
            # category during this run.
            if word in synonym_db:
                print("⏭ cached")
                continue

            synonyms = query_llm(word, model=model)
            if synonyms:
                synonym_db[word] = synonyms
                print(f"✅ {len(synonyms)} synonyms")
            else:
                print("❌ failed")

            processed += 1
            if rate_limit > 0:
                time.sleep(rate_limit)  # Rate limit

        # Save progress after each category.
        _save_db(synonym_db, output_file)

    print("\n" + "=" * 60)
    print(f"✅ Done! Saved {len(synonym_db)} entries to {output_file}")
    print(f" Total words processed: {processed}/{total_words}")

    return synonym_db


def _save_db(synonym_db: Dict[str, List[str]], output_file: str) -> None:
    """Write the database to ``output_file`` via a temp file and rename."""
    tmp_path = f"{output_file}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(synonym_db, f, ensure_ascii=False, indent=2)
    # Rename only after a complete write, so an interrupted run cannot leave
    # a truncated JSON file in place of earlier progress.
    os.replace(tmp_path, output_file)


def load_existing_db(filepath: str) -> Dict[str, List[str]]:
    """Load a previously saved synonym database.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The parsed mapping, or an empty dict when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
        ValueError: If the file's top-level JSON value is not an object.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    if not isinstance(data, dict):
        raise ValueError(
            f"{filepath} does not contain a JSON object (got {type(data).__name__})"
        )
    return data


# ======================== Main ========================


def main() -> None:
    """Parse command-line options and run the synonym generation."""
    parser = argparse.ArgumentParser(
        description="LLM-Based Chinese-English Synonym Generator for Momentry"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="llama3",
        help="Ollama model name (default: llama3)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="data/llm_synonyms.json",
        help="Output file path (default: data/llm_synonyms.json)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=1.0,
        help="Rate limit in seconds between requests (default: 1.0)",
    )
    parser.add_argument(
        "--category",
        type=str,
        default=None,
        help="Process only this category (e.g., 'action', 'emotion')",
    )
    parser.add_argument(
        "--resume", action="store_true", help="Resume from existing output file"
    )
    parser.add_argument(
        "--test", action="store_true", help="Test with a few words only"
    )

    args = parser.parse_args()

    # Prepare seed words
    seeds = SEED_WORDS.copy()
    if args.category:
        if args.category in seeds:
            seeds = {args.category: seeds[args.category]}
        else:
            print(f"Error: category '{args.category}' not found")
            sys.exit(1)

    # --test replaces whatever was selected above with a tiny fixed set.
    if args.test:
        seeds = {"test": ["happy", "money", "警察"]}

    # Load existing data if resuming
    existing: Dict[str, List[str]] = {}
    if args.resume:
        existing = load_existing_db(args.output)
        print(f"📥 Loaded {len(existing)} existing entries")

    # Generate synonyms; `existing` lets already-generated words be skipped
    # and keeps them in the output file.
    generate_synonyms_batch(
        seed_words=seeds,
        model=args.model,
        output_file=args.output,
        rate_limit=args.rate_limit,
        existing=existing,
    )


if __name__ == "__main__":
    main()
|