# momentry_core/scripts/generate_synonyms_ollama.py

#!/usr/bin/env python3
"""
LLM-Based Chinese-English Synonym Generator for Momentry

Generates a synonym database by querying an LLM through the Ollama Python client.
Output format: JSON with word -> [synonyms] mapping

Usage:
  python scripts/generate_synonyms_ollama.py           # Using Ollama (default: llama3)
  python scripts/generate_synonyms_ollama.py --model gemma:2b  # Specify model
  python scripts/generate_synonyms_ollama.py --help    # Show help

Requires:
  - Ollama running (http://localhost:11434)
  - pip install ollama
"""

import json
import os
import sys
import time
import argparse
from typing import Dict, List, Optional

try:
    import ollama
except ImportError:
    print("Error: ollama package required. Install with: pip install ollama")
    sys.exit(1)

# ======================== Seed Words for Video Search Context ========================
# These represent common concepts in video content that benefit from synonym expansion

# Maps a category label to the list of seed words queried for that category.
# The category label only groups words (and can be selected with --category);
# the generated database is keyed by the individual words, not by category.
# A word that appears in more than one list (e.g. "price") is only queried
# once, because the batch generator skips words it already has an entry for.
SEED_WORDS: Dict[str, List[str]] = {
    # Actions, emotions, speech & scene terms
    "action": ["run", "walk", "move", "chase", "escape", "fight", "attack"],
    "emotion": ["happy", "sad", "angry", "afraid", "surprised", "calm"],
    "speech": ["talk", "say", "tell", "ask", "answer", "shout", "whisper"],
    "scene": ["scene", "moment", "part", "clip", "sequence", "segment"],
    # People & relationships
    "person": ["man", "woman", "boy", "girl", "child", "person"],
    "relationship": ["friend", "enemy", "lover", "partner", "colleague"],
    "authority": ["police", "detective", "officer", "guard", "agent"],
    # Objects & settings
    "vehicle": ["car", "truck", "bus", "van", "vehicle", "automobile"],
    "location": ["house", "office", "street", "city", "country", "place"],
    "food": ["eat", "dinner", "lunch", "breakfast", "meal", "snack"],
    "weapon": ["gun", "knife", "sword", "bomb", "weapon"],
    # Events & activities
    "event": ["party", "meeting", "gathering", "celebration", "festival"],
    "crime": ["theft", "murder", "robbery", "assault", "kidnapping"],
    "travel": ["travel", "trip", "journey", "flight", "drive", "ride"],
    # Time & duration
    "time": ["morning", "noon", "evening", "night", "afternoon"],
    "duration": ["second", "minute", "hour", "day", "week", "month", "year"],
    # Abstract states: positive, negative and mental
    "positive": ["love", "joy", "peace", "hope", "trust", "success"],
    "negative": ["fear", "anger", "pain", "death", "loss", "failure"],
    "mental": ["think", "know", "believe", "understand", "remember", "forget"],
    # Sensory (sight and sound)
    "sight": ["see", "look", "watch", "observe", "notice", "find"],
    "sound": ["hear", "listen", "noise", "music", "voice", "speak"],
    # Money & value
    "money": ["cash", "dollar", "coin", "payment", "price", "wealth"],
    "transaction": ["buy", "sell", "pay", "spend", "cost", "price"],
    # Chinese-specific concepts (written in Traditional Chinese characters)
    "chinese_emotion": ["愛", "恨", "喜", "怒", "哀", "樂", "愁", "驚"],
    "chinese_action": ["走", "跑", "說", "看", "聽", "想", "做", "吃"],
    "chinese_object": ["房子", "車子", "書", "電話", "電腦", "手機"],
    "chinese_person": ["男人", "女人", "小孩", "老人", "朋友", "敵人"],
}

# ======================== LLM Query Functions ========================

# System message sent with every synonym request. It instructs the model to
# answer with a bare JSON array of same-language strings; query_llm feeds the
# reply to json.loads (after removing a leading ``` fence if present), so any
# wording change here must keep the "JSON array only" contract intact.
SYSTEM_PROMPT = """You are a synonym generation assistant. For each given word, provide 8-15 synonyms in the same language.
Rules:
1. Return ONLY a JSON array of strings, nothing else
2. Synonyms should be contextually relevant for video content search
3. Include common words, informal terms, and related concepts
4. Do NOT include the input word in the output
5. All synonyms must be in the SAME language as the input word
6. No explanations, no markdown, just the JSON array

Example input: "money"
Example output: ["cash", "dollar", "currency", "funds", "bucks", "greenbacks", "coins", "wealth", "payment"]

Example input: "快樂"
Example output: ["開心", "高興", "愉快", "歡喜", "歡樂", "喜悅", "愉悅", "幸福"]"""


def query_llm(
    word: str, model: str = "llama3", retries: int = 3
) -> Optional[List[str]]:
    """Ask the Ollama model for synonyms of ``word``.

    Args:
        word: The word to expand (any language the model handles).
        model: Ollama model name passed to ``ollama.chat``.
        retries: Maximum number of attempts. Transport errors and
            unparseable replies are retried; ``0`` or less means no
            request is made at all.

    Returns:
        A de-duplicated list of lower-cased synonyms that never contains
        ``word`` itself, or ``None`` when no usable list could be obtained.
    """
    for attempt in range(retries):
        try:
            response = ollama.chat(
                model=model,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": f'Give synonyms for: "{word}"'},
                ],
                options={"temperature": 0.3, "num_predict": 150},
            )
            synonyms = _parse_synonym_reply(response["message"]["content"], word)
        except json.JSONDecodeError:
            print(f"  ⚠ JSON parse error for '{word}' (attempt {attempt + 1})")
        except Exception as e:
            # Boundary around the external client: connection failures, a
            # missing model, or an unexpected response shape all land here.
            print(f"  ⚠ LLM error for '{word}': {e} (attempt {attempt + 1})")
        else:
            if synonyms:
                return synonyms
            # Valid JSON but not a usable list of strings: give up on this word.
            print(f"  ⚠ Invalid format for '{word}'")
            return None

        # Pause before the next attempt for every failure kind, so a parse
        # error does not immediately re-issue the same request.
        if attempt < retries - 1:
            time.sleep(2)

    return None


def _parse_synonym_reply(content: str, word: str) -> Optional[List[str]]:
    """Turn a raw model reply into a cleaned synonym list.

    Raises ``json.JSONDecodeError`` when no JSON can be parsed from the
    reply. Returns ``None`` when the JSON is not a list or holds no usable
    strings.
    """
    text = content.strip()

    # Unwrap a Markdown code fence (``` or ```json) around the array.
    if text.startswith("```"):
        text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]
        text = text.strip()

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Models often wrap the array in a sentence; retry on the outermost
        # [...] span before declaring the reply unparseable.
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start : end + 1])

    if not isinstance(parsed, list):
        return None

    # Seeding `seen` with the input word enforces the prompt's rule that the
    # word itself must not be listed, and the same set removes duplicates.
    seen = {word.strip().lower()}
    cleaned: List[str] = []
    for item in parsed:
        if not isinstance(item, str):
            # Skip numbers/objects/null instead of failing the whole reply.
            continue
        candidate = item.strip().lower()
        if candidate and candidate not in seen:
            seen.add(candidate)
            cleaned.append(candidate)

    return cleaned or None


# ======================== Batch Generation ========================


def generate_synonyms_batch(
    seed_words: Dict[str, List[str]],
    model: str = "llama3",
    output_file: str = "data/llm_synonyms.json",
    rate_limit: float = 1.0,
    existing: Optional[Dict[str, List[str]]] = None,
) -> Dict[str, List[str]]:
    """Generate synonyms for every seed word and persist them as JSON.

    Args:
        seed_words: Mapping of category label -> words to query.
        model: Ollama model name forwarded to ``query_llm``.
        output_file: Path of the JSON file that receives the database.
        rate_limit: Seconds to sleep after each LLM request.
        existing: Previously generated entries to start from (used for
            resuming). Words already present are reported as cached and
            are not queried again. The mapping itself is not mutated.

    Returns:
        The full word -> synonyms mapping (existing entries plus new ones).
    """
    # Start from a copy of the prior results so that resuming keeps them in
    # the output file instead of overwriting it with only the new entries.
    synonym_db: Dict[str, List[str]] = dict(existing) if existing else {}
    total_words = sum(len(words) for words in seed_words.values())
    processed = 0

    # Create the target directory before any LLM call; otherwise the first
    # save fails only after a whole category has been queried.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"\n📝 Generating synonyms for {total_words} words using {model}...")
    print("=" * 60)

    for category, words in seed_words.items():
        print(f"\n📂 Category: {category}")
        for word in words:
            print(f"  🔍 {word}...", end=" ")

            # Skip words loaded from a previous run or repeated across categories
            if word in synonym_db:
                print("⏭ cached")
                continue

            synonyms = query_llm(word, model=model)

            if synonyms:
                synonym_db[word] = synonyms
                print(f"✅ {len(synonyms)} synonyms")
            else:
                print("❌ failed")

            processed += 1
            time.sleep(rate_limit)  # Rate limit

        # Save progress after each category
        _save_synonym_db(synonym_db, output_file)

    print("\n" + "=" * 60)
    print(f"✅ Done! Saved {len(synonym_db)} entries to {output_file}")
    print(f"   Total words processed: {processed}/{total_words}")

    return synonym_db


def _save_synonym_db(synonym_db: Dict[str, List[str]], output_file: str) -> None:
    """Write the database to ``output_file`` without leaving a partial file."""
    # Write to a sibling temp file and swap it in, so an interruption during
    # the dump cannot corrupt the file a later --resume would read.
    tmp_path = f"{output_file}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(synonym_db, f, ensure_ascii=False, indent=2)
    os.replace(tmp_path, output_file)


def load_existing_db(filepath: str) -> Dict[str, List[str]]:
    """Load a previously saved synonym database.

    Returns an empty dict when ``filepath`` does not exist.

    Raises:
        ValueError: if the file is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass) or its top level is not an object.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}

    if not isinstance(data, dict):
        raise ValueError(
            f"expected a JSON object of word -> synonyms, got {type(data).__name__}"
        )
    return data


# ======================== Main ========================


def main() -> None:
    """Parse command-line options and run the synonym generation."""
    parser = argparse.ArgumentParser(
        description="LLM-Based Chinese-English Synonym Generator for Momentry"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="llama3",
        help="Ollama model name (default: llama3)",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="data/llm_synonyms.json",
        help="Output file path (default: data/llm_synonyms.json)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=1.0,
        help="Rate limit in seconds between requests (default: 1.0)",
    )
    parser.add_argument(
        "--category",
        type=str,
        default=None,
        help="Process only this category (e.g., 'action', 'emotion')",
    )
    parser.add_argument(
        "--resume", action="store_true", help="Resume from existing output file"
    )
    parser.add_argument(
        "--test", action="store_true", help="Test with a few words only"
    )

    args = parser.parse_args()

    # Prepare seed words
    seeds = SEED_WORDS.copy()
    if args.category:
        if args.category in seeds:
            seeds = {args.category: seeds[args.category]}
        else:
            print(f"Error: category '{args.category}' not found")
            sys.exit(1)

    # --test replaces whatever seed selection was made above
    if args.test:
        seeds = {"test": ["happy", "money", "警察"]}

    # Load existing data if resuming
    existing: Dict[str, List[str]] = {}
    if args.resume:
        try:
            existing = load_existing_db(args.output)
        except ValueError as exc:
            # Refuse to continue: generating now would overwrite a file whose
            # contents could not be read back.
            print(f"Error: cannot resume from '{args.output}': {exc}")
            sys.exit(1)
        print(f"📥 Loaded {len(existing)} existing entries")

    # Generate synonyms, carrying the loaded entries forward so they are
    # skipped during querying and preserved in the output file
    generate_synonyms_batch(
        seed_words=seeds,
        model=args.model,
        output_file=args.output,
        rate_limit=args.rate_limit,
        existing=existing,
    )


if __name__ == "__main__":
    main()