- Add PATCH /api/v1/identity/:identity_uuid endpoint - Migration 030: remove name UNIQUE, add tmdb_id index - TMDb upsert: ON CONFLICT (name) -> ON CONFLICT (tmdb_id) - get_or_create_identity: pre-check by name - upload_identity: ON CONFLICT (name) -> ON CONFLICT (uuid) - Search: include aliases in identity text search - Add scripts/llm_metadata_enhancer.py - Add DESIGN/IdentityUpdateAndAliasSystem.md
105 lines
3.3 KiB
Python
105 lines
3.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
LLM Metadata Enhancer for Momentry Identity

Reads a JSON object with an identity "name" and "biography" from stdin (or
from a file given with --input), then calls llama.cpp (Gemma4) to produce
structured metadata: summary, nationality, profession, birth_date, and
aliases (BCP 47 locale-tagged).

Output: JSON to stdout (no extra text). Failures are reported as
{"error": "<message>"}.

Usage:
    echo '{"name": "John Smith", "biography": "..."}' | python3 llm_metadata_enhancer.py
    python3 llm_metadata_enhancer.py --url http://127.0.0.1:8081 < input.json
    python3 llm_metadata_enhancer.py --input input.json

Requires:
    - llama.cpp server running (default: http://127.0.0.1:8081)
    - pip install requests
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
import argparse
|
|
import requests
|
|
|
|
# Base URL of the llama.cpp server; used as the default for the --url option.
DEFAULT_API_URL = "http://127.0.0.1:8081"
|
|
|
|
SYSTEM_PROMPT = """You are a metadata structuring assistant for a media asset management system.
|
|
|
|
Given an identity name and biography text, produce a structured JSON object with these fields:
|
|
|
|
- summary: 2-3 sentence summary in the same language as the biography
|
|
- nationality: inferred nationality or null
|
|
- profession: array of inferred professions/titles
|
|
- birth_date: YYYY-MM-DD format if available, else null
|
|
- aliases: array of {locale: "BCP47_tag", name: "translated_name"} objects
|
|
- Use common well-known translations only (do not fabricate)
|
|
- Locale tags follow BCP 47 (e.g., en, zh-TW, zh-CN, ja, ko, fr, es, yue, th, ar, ru)
|
|
- Include at least one alias with locale "en" using the original name
|
|
- Only include locales where the name has a widely recognized translation
|
|
|
|
Output ONLY valid JSON. No explanation, no markdown, no extra text."""
|
|
|
|
|
|
def _parse_json_reply(text: str):
    """Decode the model's reply, tolerating Markdown fences and stray prose.

    The fence-stripped text is tried first. Only if that fails is the
    outermost ``{...}`` span of the raw reply tried. When nothing parses, the
    error from the first attempt is raised.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        cleaned = cleaned.split("\n", 1)[-1]
        cleaned = cleaned.rsplit("```", 1)[0]
    if cleaned.startswith("json"):
        cleaned = cleaned[4:]
    candidates = [cleaned.strip()]

    # Fallback for replies such as 'Here is the JSON: {...}' or a fence
    # written on one line, which the fence stripping above cannot handle.
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        candidates.append(text[start:end + 1])

    first_error = None
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError as exc:
            if first_error is None:
                first_error = exc
    raise first_error


def call_llm(api_url: str, name: str, biography: str) -> dict:
    """Request structured identity metadata from the chat-completions endpoint.

    Sends ``SYSTEM_PROMPT`` plus a user message built from ``name`` and
    ``biography`` to ``{api_url}/v1/chat/completions`` and decodes the
    assistant's reply as JSON.

    Args:
        api_url: Base URL of the server; a trailing slash is tolerated.
        name: Identity name inserted into the user prompt.
        biography: Biography text inserted into the user prompt.

    Returns:
        The JSON object decoded from the model's reply.

    Raises:
        RuntimeError: On a non-200 HTTP status, or when the response body
            does not carry a non-empty ``choices[0].message.content`` string.
        ValueError: When the reply holds no parseable JSON
            (``json.JSONDecodeError``) or parses to something other than an
            object.
        requests.RequestException: On connection failure or timeout.
    """
    user_prompt = f"Identity name: {name}\n\nBiography:\n{biography}"

    payload = {
        "messages": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.1,
        "max_tokens": 1024,
        "stop": [],
    }

    # Avoid "//v1/..." when the caller passes a base URL ending in "/".
    endpoint = f"{api_url.rstrip('/')}/v1/chat/completions"
    resp = requests.post(endpoint, json=payload, timeout=60)

    if resp.status_code != 200:
        raise RuntimeError(f"LLM API error {resp.status_code}: {resp.text}")

    try:
        content = resp.json()["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError, ValueError) as exc:
        # Report the body instead of a bare KeyError('choices') and the like.
        raise RuntimeError(f"Unexpected LLM response: {resp.text}") from exc

    if not isinstance(content, str) or not content.strip():
        raise RuntimeError("LLM returned an empty response")

    result = _parse_json_reply(content)
    if not isinstance(result, dict):
        raise ValueError(
            f"LLM reply is not a JSON object (got {type(result).__name__})"
        )
    return result
|
|
|
|
|
|
def _enhance_from_args(args) -> dict:
    """Load the input named by ``args`` and return the JSON-ready result dict."""
    try:
        if args.input:
            # Explicit UTF-8: the input is JSON and routinely holds non-ASCII text.
            with open(args.input, encoding="utf-8") as f:
                data = json.load(f)
        else:
            data = json.load(sys.stdin)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return {"error": f"invalid input: {e}"}

    if not isinstance(data, dict):
        return {"error": "input must be a JSON object"}

    name = data.get("name", "")
    biography = data.get("biography", "")

    if not name:
        return {"error": "name is required"}
    if not biography:
        return {"error": "biography is required"}

    try:
        return call_llm(args.url, name, biography)
    except Exception as e:
        # Top-level boundary: any failure becomes an error object on stdout.
        return {"error": str(e)}


def main():
    """Command-line entry point: read an identity, enhance it, print JSON.

    Reads a JSON object with ``name`` and ``biography`` from ``--input`` or
    stdin, calls ``call_llm`` against ``--url``, and writes exactly one JSON
    document followed by a newline to stdout. Unreadable or invalid input,
    missing fields and LLM failures are all reported as
    ``{"error": "<message>"}`` rather than as a traceback.
    """
    parser = argparse.ArgumentParser(description="Enhance identity metadata via LLM")
    parser.add_argument("--url", default=DEFAULT_API_URL, help="llama.cpp server URL")
    parser.add_argument("--input", help="Input JSON file (default: stdin)")
    args = parser.parse_args()

    # The output is written with ensure_ascii=False and aliases are often
    # non-Latin, so a locale-encoded pipe could raise UnicodeEncodeError.
    # Replaced streams (e.g. under test capture) may lack reconfigure().
    for stream in (sys.stdin, sys.stdout):
        reconfigure = getattr(stream, "reconfigure", None)
        if reconfigure is not None:
            reconfigure(encoding="utf-8")

    result = _enhance_from_args(args)

    json.dump(result, sys.stdout, ensure_ascii=False, indent=2)
    sys.stdout.write("\n")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|