# momentry_core/scripts/generate_synonyms_llamacpp.py

#!/usr/bin/env python3
"""
LLM-Based Chinese-English Synonym Generator for Momentry

Generates a synonym database by querying Gemma4 via llama.cpp server.
Output format: JSON with word -> [synonyms] mapping

Usage:
  python scripts/generate_synonyms_llamacpp.py           # Using default llama.cpp server
  python scripts/generate_synonyms_llamacpp.py --url http://127.0.0.1:8081
  python scripts/generate_synonyms_llamacpp.py --test    # Quick test
  python scripts/generate_synonyms_llamacpp.py --help    # Show help

Requires:
  - llama.cpp server running (default: http://127.0.0.1:8081)
  - pip install requests
"""

import json
import os
import sys
import time
import argparse
from typing import Dict, List, Optional
import requests

# ======================== Configuration ========================

# llama.cpp server default endpoint (base URL; request paths such as
# "/health" and "/v1/chat/completions" are appended to it)
DEFAULT_API_URL = "http://127.0.0.1:8081"
# Value sent as the "model" field of each chat-completion request
DEFAULT_MODEL = "gemma4"
# Default per-request timeout (seconds) passed to requests.post in query_llm
DEFAULT_TIMEOUT = 60

# ======================== Seed Words for Video Search Context ========================

# Maps a category name to the list of seed words in that category.
# Each seed word is sent to the LLM on its own; the category name is only used
# for progress output and for the --category command-line filter.
# A word may appear in more than one category (e.g. "price"); the batch
# generator skips any word that already has an entry in the synonym database.
SEED_WORDS: Dict[str, List[str]] = {
    # Action & Movement
    "action": ["run", "walk", "move", "chase", "escape", "fight", "attack"],
    "emotion": ["happy", "sad", "angry", "afraid", "surprised", "calm"],
    "speech": ["talk", "say", "tell", "ask", "answer", "shout", "whisper"],
    "scene": ["scene", "moment", "part", "clip", "sequence", "segment"],
    # People & Relationships
    "person": ["man", "woman", "boy", "girl", "child", "person"],
    "relationship": ["friend", "enemy", "lover", "partner", "colleague"],
    "authority": ["police", "detective", "officer", "guard", "agent"],
    # Objects & Settings
    "vehicle": ["car", "truck", "bus", "van", "vehicle", "automobile"],
    "location": ["house", "office", "street", "city", "country", "place"],
    "food": ["eat", "dinner", "lunch", "breakfast", "meal", "snack"],
    "weapon": ["gun", "knife", "sword", "bomb", "weapon"],
    # Events & Activities
    "event": ["party", "meeting", "gathering", "celebration", "festival"],
    "crime": ["theft", "murder", "robbery", "assault", "kidnapping"],
    "travel": ["travel", "trip", "journey", "flight", "drive", "ride"],
    # Time & Duration
    "time": ["morning", "noon", "evening", "night", "afternoon"],
    "duration": ["second", "minute", "hour", "day", "week", "month", "year"],
    # Emotions & States
    "positive": ["love", "joy", "peace", "hope", "trust", "success"],
    "negative": ["fear", "anger", "pain", "death", "loss", "failure"],
    "mental": ["think", "know", "believe", "understand", "remember", "forget"],
    # Sensory
    "sight": ["see", "look", "watch", "observe", "notice", "find"],
    "sound": ["hear", "listen", "noise", "music", "voice", "speak"],
    # Money & Value
    "money": ["cash", "dollar", "coin", "payment", "price", "wealth"],
    "transaction": ["buy", "sell", "pay", "spend", "cost", "price"],
    # Chinese specific concepts
    "chinese_emotion": ["愛", "恨", "喜", "怒", "哀", "樂", "愁", "驚"],
    "chinese_action": ["走", "跑", "說", "看", "聽", "想", "做", "吃"],
    "chinese_object": ["房子", "車子", "書", "電話", "電腦", "手機"],
    "chinese_person": ["男人", "女人", "小孩", "老人", "朋友", "敵人"],
}

# ======================== LLM Query Functions ========================

# System message sent with every chat-completion request made by query_llm.
# It instructs the model to answer with a bare JSON array of same-language
# synonyms. query_llm feeds the reply to json.loads (after stripping markdown
# code fences), so any wording change that alters the reply format also affects
# that parsing step.
SYSTEM_PROMPT = """You are a synonym generation assistant. For each given word, provide 8-15 synonyms in the same language.
Rules:
1. Return ONLY a JSON array of strings, nothing else
2. Synonyms should be contextually relevant for video content search
3. Include common words, informal terms, and related concepts
4. Do NOT include the input word in the output
5. All synonyms must be in the SAME language as the input word
6. No explanations, no markdown, just the JSON array

Example input: "money"
Example output: ["cash", "dollar", "currency", "funds", "bucks", "greenbacks", "coins", "wealth", "payment"]

Example input: "快樂"
Example output: ["開心", "高興", "愉快", "歡喜", "歡樂", "喜悅", "愉悅", "幸福"]"""


def check_server_health(api_url: str) -> bool:
    """Report whether the llama.cpp server at *api_url* passes its health check.

    Sends ``GET {api_url}/health`` with a 5-second timeout and prints a
    one-line status message describing the outcome.

    Args:
        api_url: Base URL of the server, e.g. ``http://127.0.0.1:8081``.

    Returns:
        True only when the endpoint answers with HTTP 200; False for any
        other status code or for any request failure.
    """
    try:
        resp = requests.get(f"{api_url}/health", timeout=5)
    except requests.exceptions.ConnectionError:
        print(f"❌ Cannot connect to llama.cpp server at {api_url}")
        return False
    except requests.exceptions.Timeout:
        print("❌ Connection to llama.cpp server timed out")
        return False
    except requests.exceptions.RequestException as exc:
        # Covers the remaining requests failures (e.g. a malformed URL with a
        # missing scheme) so a bad --url yields a message, not a traceback.
        print(f"❌ Health check request to {api_url} failed: {exc}")
        return False

    if resp.status_code == 200:
        print(f"✅ llama.cpp server is running at {api_url}")
        return True

    # The server answered but is not healthy; say so instead of failing silently.
    print(f"❌ llama.cpp server at {api_url} returned HTTP {resp.status_code} for /health")
    return False


def _extract_json_array(content: str) -> str:
    """Return the part of *content* most likely to be the JSON array reply."""
    text = content.strip()

    # Prefer an array found inside a markdown code fence (```json ... ```).
    if "```" in text:
        for part in text.split("```"):
            part = part.strip()
            if part.startswith("json"):
                part = part[4:].strip()
            if part.startswith("[") and part.endswith("]"):
                return part

    # Otherwise salvage an array embedded in surrounding prose by slicing from
    # the first "[" to the last "]". A bare array is returned unchanged.
    start = text.find("[")
    end = text.rfind("]")
    if start != -1 and end > start:
        return text[start:end + 1]
    return text


def _normalize_synonyms(raw: list, word: str) -> List[str]:
    """Strip, lower-case and de-duplicate *raw*, dropping non-strings and *word*."""
    # Seeding "seen" with the input word removes it from the output, matching
    # the instruction given to the model in the system prompt.
    seen = {word.strip().lower()}
    cleaned: List[str] = []
    for item in raw:
        if not isinstance(item, str):
            continue  # the model occasionally emits numbers/null/nested values
        candidate = item.strip().lower()
        if candidate and candidate not in seen:
            seen.add(candidate)
            cleaned.append(candidate)
    return cleaned


def query_llm(
    word: str,
    api_url: str = DEFAULT_API_URL,
    model: str = DEFAULT_MODEL,
    timeout: int = DEFAULT_TIMEOUT,
    retries: int = 3,
) -> Optional[List[str]]:
    """Ask the model for synonyms of *word* via the OpenAI-compatible endpoint.

    Posts a chat-completion request to ``{api_url}/v1/chat/completions`` and
    parses the reply as a JSON array of strings.

    Args:
        word: The word to find synonyms for.
        api_url: Base URL of the llama.cpp server.
        model: Value sent as the request's ``model`` field.
        timeout: Per-request timeout in seconds.
        retries: Maximum number of attempts. HTTP errors, timeouts, JSON parse
            errors and other exceptions consume an attempt and are retried.

    Returns:
        A non-empty list of stripped, lower-cased, de-duplicated synonyms
        (excluding *word* itself), or None when every attempt failed or the
        reply did not contain a usable list.
    """
    # The request body does not change between attempts, so build it once.
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f'Give synonyms for: "{word}"'},
        ],
        "temperature": 0.3,
        "stream": False,
        "max_tokens": 256,
    }

    for attempt in range(retries):
        # Pausing is only useful when another attempt will follow.
        has_more_attempts = attempt < retries - 1
        try:
            response = requests.post(
                f"{api_url}/v1/chat/completions",
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=timeout,
            )

            if response.status_code != 200:
                print(f"  ⚠ HTTP {response.status_code} for '{word}'")
                print(f"    Response: {response.text[:200]}")
                if has_more_attempts:
                    time.sleep(2)
                continue

            data = response.json()
            content = data["choices"][0]["message"]["content"]
            parsed = json.loads(_extract_json_array(content))

            if isinstance(parsed, list):
                synonyms = _normalize_synonyms(parsed, word)
                if synonyms:
                    return synonyms

            # Valid JSON but not a usable list of strings: give up on this word.
            print(f"  ⚠ Invalid format for '{word}'")
            return None

        except json.JSONDecodeError:
            print(f"  ⚠ JSON parse error for '{word}' (attempt {attempt + 1})")
        except requests.exceptions.Timeout:
            print(f"  ⚠ Timeout for '{word}' (attempt {attempt + 1})")
            if has_more_attempts:
                time.sleep(2)
        except Exception as e:
            # Best-effort boundary: unexpected response shapes (missing keys,
            # null content) and other request errors consume one attempt.
            print(f"  ⚠ Error for '{word}': {e} (attempt {attempt + 1})")
            if has_more_attempts:
                time.sleep(2)

    return None


# ======================== Batch Generation ========================


def _load_synonym_db(output_file: str) -> Dict[str, List[str]]:
    """Load a previously saved synonym database, or return an empty one."""
    if not os.path.exists(output_file):
        return {}

    try:
        with open(output_file, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers both malformed JSON and undecodable bytes.
        print(f"⚠ Could not resume from {output_file}: {exc} (starting fresh)")
        return {}

    if not isinstance(loaded, dict):
        # Anything but a word -> synonyms mapping cannot be extended below.
        print(f"⚠ {output_file} does not contain a JSON object (starting fresh)")
        return {}

    print(f"📥 Resumed from {output_file} ({len(loaded)} entries)")
    return loaded


def _save_synonym_db(synonym_db: Dict[str, List[str]], output_file: str) -> None:
    """Write *synonym_db* to *output_file* as JSON via a temp file and rename."""
    # Writing to a sibling temp file first means an interrupted save cannot
    # leave a truncated output file behind for the next resume.
    tmp_file = f"{output_file}.tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(synonym_db, f, ensure_ascii=False, indent=2)
    os.replace(tmp_file, output_file)


def generate_synonyms_batch(
    seed_words: Dict[str, List[str]],
    api_url: str = DEFAULT_API_URL,
    model: str = DEFAULT_MODEL,
    output_file: str = "data/llm_synonyms.json",
    rate_limit: float = 1.0,
) -> Dict[str, List[str]]:
    """Generate synonyms for every seed word and persist them as JSON.

    Existing entries in *output_file* are loaded first and words already
    present there are skipped, so an interrupted run can be resumed. Progress
    is saved after each category.

    Args:
        seed_words: Mapping of category name to the words in that category.
        api_url: Base URL of the llama.cpp server.
        model: Model name forwarded to ``query_llm``.
        output_file: Path of the JSON file to resume from and write to.
        rate_limit: Seconds to sleep after each word that was queried.

    Returns:
        The word -> synonyms mapping, including entries loaded from disk.
    """
    # Create the output directory up front so a missing folder is reported
    # before any LLM queries are spent rather than at the first save.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Load existing data if output file exists (auto-resume)
    synonym_db = _load_synonym_db(output_file)

    total_words = sum(len(words) for words in seed_words.values())
    processed = 0  # counts words actually sent to the LLM (cached ones excluded)

    print(f"\n📝 Generating synonyms for {total_words} words using {model}...")
    print(f"🔗 Server: {api_url}")
    print("=" * 60)

    for category, words in seed_words.items():
        print(f"\n📂 Category: {category}")
        for word in words:
            print(f"  🔍 {word}...", end=" ")

            # Skip if already in DB
            if word in synonym_db:
                print(f"⏭ cached ({len(synonym_db[word])} synonyms)")
                continue

            synonyms = query_llm(word, api_url=api_url, model=model)

            if synonyms:
                synonym_db[word] = synonyms
                print(f"✅ {len(synonyms)} synonyms")
            else:
                print("❌ failed")

            processed += 1
            time.sleep(rate_limit)

        # Save progress after each category
        _save_synonym_db(synonym_db, output_file)

    print("\n" + "=" * 60)
    print(f"✅ Done! Saved {len(synonym_db)} entries to {output_file}")
    print(f"   Total words processed: {processed}/{total_words}")

    return synonym_db


# ======================== Main ========================


def main():
    """Parse command-line options and run the synonym generation.

    Exits with status 1 when the server health check fails or when
    ``--category`` names an unknown category. ``--test`` replaces the seed
    set with three sample words and takes precedence over ``--category``.
    """
    parser = argparse.ArgumentParser(
        description="LLM-Based Chinese-English Synonym Generator (llama.cpp / Gemma4)"
    )
    parser.add_argument(
        "--url",
        type=str,
        default=DEFAULT_API_URL,
        help=f"llama.cpp server URL (default: {DEFAULT_API_URL})",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Model name (default: {DEFAULT_MODEL})",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="data/llm_synonyms.json",
        help="Output file path (default: data/llm_synonyms.json)",
    )
    parser.add_argument(
        "--rate-limit",
        type=float,
        default=0.5,
        help="Rate limit in seconds between requests (default: 0.5)",
    )
    parser.add_argument(
        "--category",
        type=str,
        default=None,
        help="Process only this category (e.g., 'action', 'emotion')",
    )
    parser.add_argument(
        "--test", action="store_true", help="Test with a few words only"
    )

    args = parser.parse_args()

    # time.sleep() rejects negative values, so refuse them here instead of
    # failing after the first word has been queried.
    if args.rate_limit < 0:
        parser.error("--rate-limit must be non-negative")

    # Request paths are appended as "/health" and "/v1/...", so a trailing
    # slash on the base URL would otherwise produce "//" in every request.
    api_url = args.url.rstrip("/")

    # Check server health
    if not check_server_health(api_url):
        print("\n💡 Start llama.cpp server with:")
        print("  llama-server --model <gemma4.gguf> --port 8081")
        sys.exit(1)

    # Prepare seed words
    seeds = SEED_WORDS.copy()
    if args.category:
        if args.category in seeds:
            seeds = {args.category: seeds[args.category]}
        else:
            print(f"Error: category '{args.category}' not found")
            sys.exit(1)

    if args.test:
        seeds = {"test": ["happy", "money", "愛"]}

    # Generate synonyms
    generate_synonyms_batch(
        seed_words=seeds,
        api_url=api_url,
        model=args.model,
        output_file=args.output,
        rate_limit=args.rate_limit,
    )


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()