#!/opt/homebrew/bin/python3.11
"""
TMDb Agent — pre-fetch TMDb data and write directly to identity files.

Usage:
    python3 scripts/tmdb_agent.py --file-uuid FILE_UUID
    python3 scripts/tmdb_agent.py --file-uuid FILE_UUID --db "dbname=momentry user=accusys"

Environment:
    TMDB_API_KEY          Required. TMDb API key.
    MOMENTRY_OUTPUT_DIR   Default: /Users/accusys/momentry/output
    DATABASE_URL          Default: dbname=momentry user=accusys host=localhost
    DATABASE_SCHEMA       Optional. Schema that contains the videos table.

Flow:
    1. Query videos table for file_name
    2. Extract movie name from filename
    3. TMDB /search/movie → find best match
    4. TMDB /movie/{id}/credits → fetch cast
    5. TMDB /person/{id} → fetch person details
    6. Write {OUTPUT}/identities/{uuid}/identity.json + profile.jpg for each cast member
    7. Merge the new identities into {OUTPUT}/identities/_index.json
    8. Write {OUTPUT}/{uuid}.tmdb.json cache (movie info + identity uuid list)
"""

import argparse
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path

import requests
import psycopg2
import psycopg2.extras

TMDB_BASE = "https://api.themoviedb.org/3"
TMDB_API_KEY = os.getenv("TMDB_API_KEY")

# Seconds to wait on any single HTTP request.
_HTTP_TIMEOUT = 15

# Errors treated as "TMDB call failed": transport/HTTP errors plus JSON decoding
# errors (ValueError covers json.JSONDecodeError on every requests version).
_TMDB_ERRORS = (requests.RequestException, ValueError)

# Everything from the first of these characters onward is dropped from a title.
_TITLE_SEPARATORS = ('|', '(', '[', '{', '\u2502')

# Release tags (quality, format, source, ...) removed from filenames.
# NOTE: '.' and '_' are already spaces when these are applied, so tags that
# contain a dot in the wild (5.1, 7.1) must also accept a space.
_RELEASE_TAGS = (
    r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps',
    r'bluray', r'web[ -]?dl', r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv',
    r'xvid', r'x264', r'h264', r'x265', r'h265', r'hevc',
    r'aac', r'mp3', r'ac3', r'dts', r'5[. ]1', r'7[. ]1',
    r'dual[ -]?audio', r'multi[ -]?sub',
    r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut', r'theatrical',
    r'internal', r'limited', r'complete', r'full[ -]?movie',
    r'english', r'french', r'spanish', r'german', r'chinese',
    r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd',
)
_RELEASE_TAG_RE = re.compile(r'\b(?:' + '|'.join(_RELEASE_TAGS) + r')\b', re.IGNORECASE)

# A schema name is accepted only as a plain or double-quoted SQL identifier,
# because it is interpolated into the query text.
_SCHEMA_NAME_RE = re.compile(r'[A-Za-z_][A-Za-z0-9_$]*|"(?:[^"]|"")+"')


def extract_movie_name(filename: str) -> str | None:
    """Extract movie name from filename (e.g. 'Charade_1963.mp4' → 'Charade 1963').

    Returns None when the filename is empty/None or fewer than 3 characters
    remain after cleaning.
    """
    if not filename:
        return None

    cleaned = re.sub(r'[._]', ' ', Path(filename).stem).strip()

    # Strip text after separators like |, (, [, {.  A separator at index 0 is
    # kept so a name that starts with a bracket is not reduced to nothing.
    for sep in _TITLE_SEPARATORS:
        idx = cleaned.find(sep)
        if idx > 0:
            cleaned = cleaned[:idx].strip()

    cleaned = _RELEASE_TAG_RE.sub('', cleaned)
    # Collapse the gaps left behind by removed tags.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned if len(cleaned) >= 3 else None


def _tmdb_get(path: str, **params) -> dict:
    """GET a TMDB v3 endpoint and return the decoded JSON body.

    Raises requests.RequestException on transport/HTTP errors and ValueError
    when the body is not valid JSON.
    """
    query = {"api_key": TMDB_API_KEY, "language": "en-US", **params}
    resp = requests.get(f"{TMDB_BASE}{path}", params=query, timeout=_HTTP_TIMEOUT)
    resp.raise_for_status()
    return resp.json()


def search_movie(query: str) -> dict | None:
    """Search TMDB for a movie by name. Returns first result, or None on no match/failure."""
    try:
        results = _tmdb_get("/search/movie", query=query, page=1).get("results", [])
    except _TMDB_ERRORS as e:
        print(f"TMDB search failed: {e}", file=sys.stderr)
        return None
    return results[0] if results else None


def get_credits(movie_id: int) -> list[dict]:
    """Get cast credits for a movie from TMDB. Returns [] on failure."""
    try:
        # `or []` guards against an explicit null "cast" value.
        return _tmdb_get(f"/movie/{movie_id}/credits").get("cast") or []
    except _TMDB_ERRORS as e:
        print(f"TMDB credits failed: {e}", file=sys.stderr)
        return []


def get_person_details(person_id: int) -> dict:
    """Fetch person details from TMDB /person/{id}. Returns {} on failure."""
    try:
        data = _tmdb_get(f"/person/{person_id}")
    except _TMDB_ERRORS as e:
        print(f"TMDB person details failed for {person_id}: {e}", file=sys.stderr)
        return {}
    return {
        "biography": data.get("biography"),
        "birthday": data.get("birthday"),
        "place_of_birth": data.get("place_of_birth"),
        "also_known_as": data.get("also_known_as", []),
        "imdb_id": data.get("imdb_id"),
        "known_for_department": data.get("known_for_department"),
        "popularity": data.get("popularity"),
        "deathday": data.get("deathday"),
        "gender": data.get("gender"),
        "homepage": data.get("homepage"),
    }


def _fetch_video_row(db: str, table: str, file_uuid: str) -> dict | None:
    """Return the videos row (file_name only) for file_uuid, or None if absent.

    `table` must already be validated; it is interpolated into the SQL text.
    """
    conn = psycopg2.connect(db)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(f"SELECT file_name FROM {table} WHERE file_uuid = %s", (file_uuid,))
            return cur.fetchone()
    finally:
        # `with conn` would only end the transaction, so close explicitly.
        conn.close()


def _build_identity(member: dict, order: int, movie: dict, person: dict,
                    identity_uuid: str, profile_url: str | None, now: str) -> dict:
    """Assemble the identity.json payload for one cast member."""
    metadata = {
        "tmdb_character": member.get("character", ""),
        "tmdb_cast_order": order,
        "tmdb_movie_id": movie["id"],
        "tmdb_movie_title": movie["title"],
        "tmdb_biography": person.get("biography"),
        "tmdb_birthday": person.get("birthday"),
        "tmdb_place_of_birth": person.get("place_of_birth"),
        "tmdb_aliases": person.get("also_known_as", []),
        "tmdb_imdb_id": person.get("imdb_id"),
        "tmdb_department": person.get("known_for_department"),
        "tmdb_popularity": person.get("popularity"),
        "tmdb_deathday": person.get("deathday"),
        "tmdb_gender": person.get("gender"),
        "tmdb_homepage": person.get("homepage"),
    }
    return {
        "version": 1,
        "identity_uuid": identity_uuid,
        "name": member["name"],
        "identity_type": "people",
        "source": "tmdb",
        "status": "confirmed",
        "tmdb_id": member["id"],
        "tmdb_profile": profile_url,
        # Drop unknown fields, but always keep the aliases key.
        "metadata": {k: v for k, v in metadata.items() if v is not None or k == "tmdb_aliases"},
        "file_bindings": [],
        "created_at": now,
        "updated_at": now,
    }


def _write_json(path: Path, payload) -> None:
    """Write payload to path as indented UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)


def _download_profile(url: str, dest: Path, name: str) -> None:
    """Best-effort download of a profile image; an existing file is kept as-is."""
    if dest.exists():
        return
    try:
        resp = requests.get(url, timeout=_HTTP_TIMEOUT)
        if resp.status_code == 200:
            dest.write_bytes(resp.content)
        else:
            print(f" [WARN] Failed to download profile for {name}: HTTP {resp.status_code}",
                  file=sys.stderr)
    except (requests.RequestException, OSError) as e:
        print(f" [WARN] Failed to download profile for {name}: {e}", file=sys.stderr)


def _update_index(index_path: Path, created_identities: list[dict]) -> None:
    """Merge newly written identities (uuid → name) into _index.json."""
    index = {}
    if index_path.exists():
        # Read with the same encoding the file is written with.
        with open(index_path, encoding="utf-8") as f:
            index = json.load(f)
    for ci in created_identities:
        index[ci["identity_uuid"]] = ci["name"]
    _write_json(index_path, index)


def main():
    """CLI entry point: resolve the file, match it on TMDB, write identity files and cache."""
    parser = argparse.ArgumentParser(description="TMDb Agent — pre-fetch cache")
    parser.add_argument("--file-uuid", required=True, help="File UUID to enrich")
    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "dbname=momentry user=accusys host=localhost"))
    parser.add_argument("--output", default=os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output"))
    args = parser.parse_args()

    if not TMDB_API_KEY:
        print("ERROR: TMDB_API_KEY not set.", file=sys.stderr)
        sys.exit(1)

    # 1. Query DB for file_name
    schema = os.getenv("DATABASE_SCHEMA", "").strip()
    if schema and not _SCHEMA_NAME_RE.fullmatch(schema):
        print(f"ERROR: Invalid DATABASE_SCHEMA: {schema!r}", file=sys.stderr)
        sys.exit(1)
    table = f"{schema}.videos" if schema else "videos"

    row = _fetch_video_row(args.db, table, args.file_uuid)
    if not row:
        print(f"ERROR: File not found: {args.file_uuid}", file=sys.stderr)
        sys.exit(1)

    file_name = row["file_name"]
    print(f"[TKG-AGENT] File: {file_name} ({args.file_uuid})")

    # 2. Extract movie name
    movie_name = extract_movie_name(file_name)
    if not movie_name:
        print(f"ERROR: Cannot extract movie name from: {file_name}", file=sys.stderr)
        sys.exit(1)
    print(f"[TKG-AGENT] Extracted movie name: '{movie_name}'")

    # 3. Search TMDB
    movie = search_movie(movie_name)
    if not movie:
        print(f"ERROR: No TMDB movie found for: {movie_name}", file=sys.stderr)
        sys.exit(1)
    print(f"[TKG-AGENT] Matched: {movie['title']} (TMDB id={movie['id']})")

    # 4. Fetch credits (an empty cast is not fatal: the cache is still written)
    cast = get_credits(movie["id"])
    if not cast:
        print(f"WARN: No cast data found for movie {movie['id']}", file=sys.stderr)

    # 5. Enrich each cast member with person details and write identity files
    output = Path(args.output)
    identities_root = output / "identities"
    identities_root.mkdir(parents=True, exist_ok=True)

    now = datetime.now(timezone.utc).isoformat()
    created_identities = []

    for i, m in enumerate(cast):
        person_id = m["id"]
        person = get_person_details(person_id)

        # Generate deterministic UUID: SHA256("tmdb-{movie_id}-{person_id}-{name}")
        uuid_raw = hashlib.sha256(
            f"tmdb-{movie['id']}-{person_id}-{m['name']}".encode()
        ).hexdigest()[:32]

        profile_path = m.get("profile_path")
        profile_url = f"https://image.tmdb.org/t/p/w185{profile_path}" if profile_path else None

        # NOTE(review): identity.json is rewritten on every run, so created_at
        # and file_bindings of an existing identity are reset — confirm that
        # nothing else stores data in this file.
        identity_dir = identities_root / uuid_raw
        identity_dir.mkdir(parents=True, exist_ok=True)
        _write_json(
            identity_dir / "identity.json",
            _build_identity(m, i, movie, person, uuid_raw, profile_url, now),
        )

        if profile_url:
            _download_profile(profile_url, identity_dir / "profile.jpg", m["name"])

        created_identities.append({
            "identity_uuid": uuid_raw,
            "name": m["name"],
            "tmdb_id": person_id,
            "character": m.get("character", ""),
            "order": i,
        })

        if (i + 1) % 5 == 0:
            print(f"[TKG-AGENT] Wrote {i+1}/{len(cast)} identity files")

    # Update _index.json
    _update_index(identities_root / "_index.json", created_identities)

    # Write movie cache ({uuid}.tmdb.json) — simplified, no per-person data
    cache = {
        "file_uuid": args.file_uuid,
        "fetched_at": now,
        "source": "agent",
        "movie": {
            "tmdb_id": movie["id"],
            "title": movie["title"],
            "release_date": movie.get("release_date"),
            "overview": movie.get("overview"),
            "poster_path": movie.get("poster_path"),
        },
        "cast_count": len(cast),
        "identities_created": len(created_identities),
        "identities": created_identities,
    }

    cache_path = output / f"{args.file_uuid}.tmdb.json"
    _write_json(cache_path, cache)

    print(f"[TKG-AGENT] Cache written: {cache_path}")
    print(f"[TKG-AGENT] Identity files: {len(created_identities)} cast members → {identities_root}/")


if __name__ == "__main__":
    main()