Files
momentry_core/scripts/llm_metadata_enhancer.py
M5Max128 701e71463d feat: identity PATCH update, alias system, name UNIQUE removal
- Add PATCH /api/v1/identity/:identity_uuid endpoint
- Migration 030: remove name UNIQUE, add tmdb_id index
- TMDb upsert: ON CONFLICT (name) -> ON CONFLICT (tmdb_id)
- get_or_create_identity: pre-check by name
- upload_identity: ON CONFLICT (name) -> ON CONFLICT (uuid)
- Search: include aliases in identity text search
- Add scripts/llm_metadata_enhancer.py
- Add DESIGN/IdentityUpdateAndAliasSystem.md
2026-05-22 08:35:32 +08:00

105 lines
3.3 KiB
Python

#!/usr/bin/env python3
"""
LLM Metadata Enhancer for Momentry Identity
Reads identity name + biography as JSON from stdin (or from a file via --input), calls llama.cpp (Gemma4) to
produce structured metadata: summary, nationality, profession, birth_date, aliases (BCP 47 locale-tagged).
Output: JSON to stdout (no extra text).
Usage:
echo '{"name": "John Smith", "biography": "..."}' | python3 llm_metadata_enhancer.py
python3 llm_metadata_enhancer.py --url http://127.0.0.1:8081 < input.json
Requires:
- llama.cpp server running (default: http://127.0.0.1:8081)
- pip install requests
"""
import json
import sys
import argparse
import requests
# Base URL of the llama.cpp server; used as the default value of the --url option.
DEFAULT_API_URL = "http://127.0.0.1:8081"
# System message sent with every chat-completion request. It lists the JSON
# fields the model is asked to return (summary, nationality, profession,
# birth_date, aliases) and instructs it to reply with JSON only.
SYSTEM_PROMPT = """You are a metadata structuring assistant for a media asset management system.
Given an identity name and biography text, produce a structured JSON object with these fields:
- summary: 2-3 sentence summary in the same language as the biography
- nationality: inferred nationality or null
- profession: array of inferred professions/titles
- birth_date: YYYY-MM-DD format if available, else null
- aliases: array of {locale: "BCP47_tag", name: "translated_name"} objects
- Use common well-known translations only (do not fabricate)
- Locale tags follow BCP 47 (e.g., en, zh-TW, zh-CN, ja, ko, fr, es, yue, th, ar, ru)
- Include at least one alias with locale "en" using the original name
- Only include locales where the name has a widely recognized translation
Output ONLY valid JSON. No explanation, no markdown, no extra text."""
def _extract_json_object(content: str) -> dict:
    """Parse the JSON object embedded in an LLM reply.

    Models frequently wrap their answer in a Markdown code fence (with or
    without a ``json`` language tag, on one line or several) or add a short
    preamble/epilogue despite being told not to.  Decoding therefore starts at
    the first ``{`` and stops at the end of that object; anything before or
    after it is ignored.

    Only the first ``{`` is tried on purpose: if the reply was truncated, the
    outer object fails to decode and an error is raised instead of silently
    returning some nested object.

    Args:
        content: Raw text of the assistant message.

    Returns:
        The decoded JSON object.

    Raises:
        json.JSONDecodeError: If the text contains no ``{`` or the object
            starting there is not valid JSON.
    """
    text = content.strip()
    start = text.find("{")
    if start == -1:
        raise json.JSONDecodeError("No JSON object found in LLM response", text, 0)
    # raw_decode tolerates trailing data (closing fence, stray commentary).
    parsed, _end = json.JSONDecoder().raw_decode(text, start)
    return parsed


def call_llm(api_url: str, name: str, biography: str, timeout: float = 60) -> dict:
    """Ask the chat-completions endpoint for structured identity metadata.

    Sends SYSTEM_PROMPT plus a user message built from *name* and *biography*
    to ``{api_url}/v1/chat/completions`` and parses the assistant reply as a
    JSON object.

    Args:
        api_url: Base URL of the server; a trailing slash is tolerated.
        name: Identity name inserted into the user prompt.
        biography: Biography text inserted into the user prompt.
        timeout: Timeout in seconds passed to ``requests.post``.

    Returns:
        The JSON object produced by the model.

    Raises:
        RuntimeError: If the server answers with a non-200 status, or the
            response body lacks ``choices[0].message.content`` text.
        json.JSONDecodeError: If the reply does not contain a valid JSON object.
    """
    user_prompt = f"Identity name: {name}\n\nBiography:\n{biography}"
    payload = {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.1,
        "max_tokens": 1024,
        "stop": [],
    }
    # Strip a trailing slash so "http://host:8081/" does not become "//v1/...".
    endpoint = f"{api_url.rstrip('/')}/v1/chat/completions"
    resp = requests.post(endpoint, json=payload, timeout=timeout)
    if resp.status_code != 200:
        raise RuntimeError(f"LLM API error {resp.status_code}: {resp.text}")
    try:
        content = resp.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        # Report the offending body instead of a bare "'choices'" KeyError.
        raise RuntimeError(
            f"Unexpected LLM API response: {resp.text[:500]}"
        ) from exc
    if not isinstance(content, str):
        raise RuntimeError("LLM API response contains no text content")
    return _extract_json_object(content)
def _read_identity(path):
    """Load the input JSON object from *path*, or from stdin when *path* is falsy.

    The data is read as bytes so that ``json`` decodes it as UTF-8/16/32
    itself instead of relying on the platform's locale encoding.

    Raises:
        OSError: If the input file cannot be opened or read.
        ValueError: If the input is not valid JSON or is not a JSON object.
    """
    if path:
        with open(path, "rb") as f:
            data = json.load(f)
    else:
        # Replacement stdin objects (e.g. io.StringIO) have no .buffer.
        data = json.load(getattr(sys.stdin, "buffer", sys.stdin))
    if not isinstance(data, dict):
        raise ValueError("input must be a JSON object")
    return data


def main(argv=None):
    """Command-line entry point: read an identity record, print enhanced metadata.

    The input is a JSON object with ``name`` and ``biography`` keys, read from
    the ``--input`` file or, by default, from stdin.  One JSON document is
    written to stdout: the result of ``call_llm`` on success, or
    ``{"error": "..."}`` when the input is unreadable, a required field is
    missing, or the LLM call fails.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            use ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Enhance identity metadata via LLM")
    parser.add_argument("--url", default=DEFAULT_API_URL, help="llama.cpp server URL")
    parser.add_argument("--input", help="Input JSON file (default: stdin)")
    args = parser.parse_args(argv)

    try:
        data = _read_identity(args.input)
    except (OSError, ValueError) as exc:
        # Keep the "JSON on stdout" contract even for unreadable input.
        result = {"error": f"invalid input: {exc}"}
    else:
        name = data.get("name", "")
        biography = data.get("biography", "")
        if not name:
            result = {"error": "name is required"}
        elif not biography:
            result = {"error": "biography is required"}
        else:
            try:
                result = call_llm(args.url, name, biography)
            except Exception as e:  # top-level boundary: report, don't crash
                result = {"error": str(e)}

    # ensure_ascii=False writes non-ASCII text (e.g. zh/ja/ko aliases) raw, so
    # make sure stdout can encode it regardless of the locale encoding.
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        reconfigure(encoding="utf-8")
    json.dump(result, sys.stdout, ensure_ascii=False, indent=2)
    sys.stdout.write("\n")


if __name__ == "__main__":
    main()