momentry_core/scripts/tmdb_agent.py

#!/opt/homebrew/bin/python3.11
"""
TMDb Agent — pre-fetch TMDb data and write directly to identity files.

Usage:
    python3 scripts/tmdb_agent.py --file-uuid <uuid>
    python3 scripts/tmdb_agent.py --file-uuid <uuid> --db "dbname=momentry user=accusys"

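Environment:
    TMDB_API_KEY          Required. TMDb API key.
    MOMENTRY_OUTPUT_DIR   Default: /Users/accusys/momentry/output (overridden by --output)
    DATABASE_URL          Default: dbname=momentry user=accusys host=localhost (overridden by --db)
    DATABASE_SCHEMA       Optional. Schema prefix for the videos table; unset means no prefix.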

Flow:
    1. Query videos table for file_name
    2. Extract movie name from filename
    3. TMDB /search/movie → find best match
    4. TMDB /movie/{id}/credits → fetch cast
    5. TMDB /person/{id} → fetch person details
    6. Write {OUTPUT}/identities/{identity_uuid}/identity.json + profile.jpg for each cast member
    7. Update {OUTPUT}/identities/_index.json (identity uuid → name)
    8. Write {OUTPUT}/{file_uuid}.tmdb.json cache (movie info + identity uuid list)
"""
import argparse
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

import requests
import psycopg2
import psycopg2.extras


# Base URL that every TMDb endpoint path in this file is appended to.
TMDB_BASE = "https://api.themoviedb.org/3"
# API key, read once at import time; main() exits with an error when it is unset.
TMDB_API_KEY = os.getenv("TMDB_API_KEY")


def extract_movie_name(filename: str) -> str | None:
    """Extract movie name from filename (e.g. 'Charade_1963.mp4' → 'Charade 1963')"""
    name = Path(filename).stem
    cleaned = re.sub(r'[._]', ' ', name).strip()
    # Strip text after separators like |, (, [, {
    for sep in ('|', '(', '[', '{', '\u2502'):
        idx = cleaned.find(sep)
        if idx > 0:
            cleaned = cleaned[:idx].strip()
    # Strip common suffixes (quality, format, source, etc.)
    suffixes = (
        r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl',
        r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid',
        r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3',
        r'dts', r'5\.1', r'7\.1', r'dual[ -]?audio', r'multi[ -]?sub',
        r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut',
        r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie',
        r'english', r'french', r'spanish', r'german', r'chinese',
        r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd',
    )
    pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
    cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
    # Collapse multiple spaces
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned if len(cleaned) >= 3 else None


def search_movie(query: str) -> dict | None:
    """Search TMDb for a movie by name and return the first result.

    Args:
        query: Free-text movie title sent to ``/search/movie``.

    Returns:
        The first entry of the response's ``results`` list, or ``None`` when
        there are no results or the request fails. Failures are reported on
        stderr instead of being raised.
    """
    url = f"{TMDB_BASE}/search/movie"
    params = {"query": query, "api_key": TMDB_API_KEY, "language": "en-US", "page": 1}
    try:
        resp = requests.get(url, params=params, timeout=15)
        resp.raise_for_status()
        results = resp.json().get("results", [])
        return results[0] if results else None
    except Exception as e:
        # requests includes the request URL (query string and all) in its
        # error text, so mask the API key before it reaches stderr/logs.
        detail = str(e)
        if TMDB_API_KEY:
            detail = detail.replace(TMDB_API_KEY, "***")
        print(f"TMDB search failed: {detail}", file=sys.stderr)
        return None


def get_credits(movie_id: int) -> list[dict]:
    """Fetch the cast list of a movie from TMDb ``/movie/{id}/credits``.

    Args:
        movie_id: TMDb movie id.

    Returns:
        The ``cast`` entries of the response. An empty list is returned when
        the field is missing or null, or when the request fails (the failure
        is reported on stderr instead of being raised).
    """
    url = f"{TMDB_BASE}/movie/{movie_id}/credits"
    params = {"api_key": TMDB_API_KEY, "language": "en-US"}
    try:
        resp = requests.get(url, params=params, timeout=15)
        resp.raise_for_status()
        # `or []` also covers an explicit `"cast": null`, so callers can
        # always iterate the result.
        return resp.json().get("cast") or []
    except Exception as e:
        # requests includes the request URL (query string and all) in its
        # error text, so mask the API key before it reaches stderr/logs.
        detail = str(e)
        if TMDB_API_KEY:
            detail = detail.replace(TMDB_API_KEY, "***")
        print(f"TMDB credits failed: {detail}", file=sys.stderr)
        return []


def get_person_details(person_id: int) -> dict:
    """Fetch selected person fields from TMDb ``/person/{id}``.

    Args:
        person_id: TMDb person id.

    Returns:
        A dict with the keys ``biography``, ``birthday``, ``place_of_birth``,
        ``also_known_as``, ``imdb_id``, ``known_for_department``,
        ``popularity``, ``deathday``, ``gender`` and ``homepage``. Fields
        absent from the response are ``None`` (``also_known_as`` defaults to
        an empty list). An empty dict is returned when the request fails; the
        failure is reported on stderr instead of being raised.
    """
    url = f"{TMDB_BASE}/person/{person_id}"
    params = {"api_key": TMDB_API_KEY, "language": "en-US"}
    try:
        resp = requests.get(url, params=params, timeout=15)
        resp.raise_for_status()
        data = resp.json()
        return {
            "biography": data.get("biography"),
            "birthday": data.get("birthday"),
            "place_of_birth": data.get("place_of_birth"),
            "also_known_as": data.get("also_known_as", []),
            "imdb_id": data.get("imdb_id"),
            "known_for_department": data.get("known_for_department"),
            "popularity": data.get("popularity"),
            "deathday": data.get("deathday"),
            "gender": data.get("gender"),
            "homepage": data.get("homepage"),
        }
    except Exception as e:
        # requests includes the request URL (query string and all) in its
        # error text, so mask the API key before it reaches stderr/logs.
        detail = str(e)
        if TMDB_API_KEY:
            detail = detail.replace(TMDB_API_KEY, "***")
        print(f"TMDB person details failed for {person_id}: {detail}", file=sys.stderr)
        return {}


def _fetch_video_row(dsn: str, file_uuid: str):
    """Return the ``videos`` row (``file_name`` column) for *file_uuid*, or ``None``.

    The cursor and connection are released even when the query raises.
    """
    schema = os.getenv("DATABASE_SCHEMA", "").strip()
    table = f"{schema}.videos" if schema else "videos"
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # Only the table name is interpolated (from DATABASE_SCHEMA);
            # the UUID itself is passed as a bound parameter.
            cur.execute(f"SELECT file_name FROM {table} WHERE file_uuid = %s", (file_uuid,))
            return cur.fetchone()
    finally:
        conn.close()


def _build_identity(movie: dict, member: dict, order: int, person: dict, now: str) -> dict:
    """Assemble the identity.json payload for one cast member."""
    person_id = member["id"]
    # Deterministic id: first 32 hex chars of SHA256("tmdb-{movie_id}-{person_id}-{name}"),
    # so re-running for the same movie targets the same identity directory.
    identity_uuid = hashlib.sha256(
        f"tmdb-{movie['id']}-{person_id}-{member['name']}".encode()
    ).hexdigest()[:32]
    profile_url = (
        f"https://image.tmdb.org/t/p/w185{member['profile_path']}"
        if member.get("profile_path") else None
    )
    metadata = {
        "tmdb_character": member.get("character", ""),
        "tmdb_cast_order": order,
        "tmdb_movie_id": movie["id"],
        "tmdb_movie_title": movie["title"],
        "tmdb_biography": person.get("biography"),
        "tmdb_birthday": person.get("birthday"),
        "tmdb_place_of_birth": person.get("place_of_birth"),
        "tmdb_aliases": person.get("also_known_as", []),
        "tmdb_imdb_id": person.get("imdb_id"),
        "tmdb_department": person.get("known_for_department"),
        "tmdb_popularity": person.get("popularity"),
        "tmdb_deathday": person.get("deathday"),
        "tmdb_gender": person.get("gender"),
        "tmdb_homepage": person.get("homepage"),
    }
    return {
        "version": 1,
        "identity_uuid": identity_uuid,
        "name": member["name"],
        "identity_type": "people",
        "source": "tmdb",
        "status": "confirmed",
        "tmdb_id": person_id,
        "tmdb_profile": profile_url,
        # Drop unknown (None) fields, but always keep the aliases key.
        "metadata": {k: v for k, v in metadata.items() if v is not None or k == "tmdb_aliases"},
        "file_bindings": [],
        "created_at": now,
        "updated_at": now,
    }


def _download_profile(profile_url: str, img_path: Path, name: str) -> None:
    """Download a profile image to *img_path* unless it already exists (best effort)."""
    if img_path.exists():
        return
    try:
        resp = requests.get(profile_url, timeout=15)
        if resp.status_code == 200:
            img_path.write_bytes(resp.content)
        else:
            print(f"  [WARN] Failed to download profile for {name}: HTTP {resp.status_code}", file=sys.stderr)
    except Exception as e:
        # A missing picture must not abort the whole run.
        print(f"  [WARN] Failed to download profile for {name}: {e}", file=sys.stderr)


def _update_index(identities_root: Path, created: list) -> None:
    """Merge the created identities into ``_index.json`` (identity uuid → name)."""
    index_path = identities_root / "_index.json"
    index = {}
    if index_path.exists():
        # Read with the same encoding the file is written with; it may contain
        # non-ASCII names because it is dumped with ensure_ascii=False.
        with open(index_path, encoding="utf-8") as f:
            index = json.load(f)
    index.update({ci["identity_uuid"]: ci["name"] for ci in created})
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2, ensure_ascii=False)


def main() -> None:
    """Command-line entry point: enrich one file UUID with TMDb cast data.

    Looks up the file name in the database, derives a movie title from it,
    resolves the movie and its cast on TMDb, writes one identity directory per
    cast member (identity.json plus profile.jpg when available), updates
    ``identities/_index.json`` and finally writes ``{file_uuid}.tmdb.json``.

    Exits with status 1 when the API key is missing, the file UUID is unknown,
    no title can be extracted, or TMDb returns no match.
    """
    parser = argparse.ArgumentParser(description="TMDb Agent — pre-fetch cache")
    parser.add_argument("--file-uuid", required=True, help="File UUID to enrich")
    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "dbname=momentry user=accusys host=localhost"))
    parser.add_argument("--output", default=os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output"))
    args = parser.parse_args()

    if not TMDB_API_KEY:
        print("ERROR: TMDB_API_KEY not set.", file=sys.stderr)
        sys.exit(1)

    # 1. Query DB for file_name
    row = _fetch_video_row(args.db, args.file_uuid)
    if not row:
        print(f"ERROR: File not found: {args.file_uuid}", file=sys.stderr)
        sys.exit(1)

    file_name = row["file_name"]
    print(f"[TKG-AGENT] File: {file_name} ({args.file_uuid})")

    # 2. Extract movie name
    movie_name = extract_movie_name(file_name)
    if not movie_name:
        print(f"ERROR: Cannot extract movie name from: {file_name}", file=sys.stderr)
        sys.exit(1)
    print(f"[TKG-AGENT] Extracted movie name: '{movie_name}'")

    # 3. Search TMDB
    movie = search_movie(movie_name)
    if not movie:
        print(f"ERROR: No TMDB movie found for: {movie_name}", file=sys.stderr)
        sys.exit(1)
    print(f"[TKG-AGENT] Matched: {movie['title']} (TMDB id={movie['id']})")

    # 4. Fetch credits (an empty cast is not fatal; the cache is still written)
    cast = get_credits(movie["id"])
    if not cast:
        print(f"WARN: No cast data found for movie {movie['id']}", file=sys.stderr)

    # 5. Enrich each cast member with person details and write identity files
    output = Path(args.output)
    identities_root = output / "identities"
    identities_root.mkdir(parents=True, exist_ok=True)

    now = datetime.now(timezone.utc).isoformat()
    created_identities = []

    for order, member in enumerate(cast):
        person = get_person_details(member["id"])
        identity = _build_identity(movie, member, order, person, now)

        identity_dir = identities_root / identity["identity_uuid"]
        identity_dir.mkdir(parents=True, exist_ok=True)
        with open(identity_dir / "identity.json", "w", encoding="utf-8") as f:
            json.dump(identity, f, indent=2, ensure_ascii=False)

        if identity["tmdb_profile"]:
            _download_profile(identity["tmdb_profile"], identity_dir / "profile.jpg", member["name"])

        created_identities.append({
            "identity_uuid": identity["identity_uuid"],
            "name": member["name"],
            "tmdb_id": member["id"],
            "character": member.get("character", ""),
            "order": order,
        })

        if (order + 1) % 5 == 0:
            print(f"[TKG-AGENT] Wrote {order+1}/{len(cast)} identity files")

    # 6. Update _index.json
    _update_index(identities_root, created_identities)

    # 7. Write movie cache ({uuid}.tmdb.json) — simplified, no per-person data
    cache = {
        "file_uuid": args.file_uuid,
        "fetched_at": now,
        "source": "agent",
        "movie": {
            "tmdb_id": movie["id"],
            "title": movie["title"],
            "release_date": movie.get("release_date"),
            "overview": movie.get("overview"),
            "poster_path": movie.get("poster_path"),
        },
        "cast_count": len(cast),
        "identities_created": len(created_identities),
        "identities": created_identities,
    }

    cache_path = output / f"{args.file_uuid}.tmdb.json"
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2, ensure_ascii=False)

    print(f"[TKG-AGENT] Cache written: {cache_path}")
    print(f"[TKG-AGENT] Identity files: {len(created_identities)} cast members → {identities_root}/")


# Run the agent only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()