Files
momentry_core/scripts/tmdb_agent.py

286 lines
11 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
TMDb Agent — pre-fetch TMDb data and write directly to identity files.
Usage:
python3 scripts/tmdb_agent.py --file-uuid <uuid>
python3 scripts/tmdb_agent.py --file-uuid <uuid> --db "dbname=momentry user=accusys"
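Environment:
    TMDB_API_KEY          Required. TMDb API key.
    MOMENTRY_OUTPUT_DIR   Output root. Default: /Users/accusys/momentry/output
    DATABASE_URL          Connection string. Default: dbname=momentry user=accusys host=localhost
    DATABASE_SCHEMA       Optional. When set, the videos table is queried as {schema}.videos.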
Flow:
1. Query videos table for file_name
2. Extract movie name from filename
3. TMDB /search/movie → find best match
4. TMDB /movie/{id}/credits → fetch cast
5. TMDB /person/{id} → fetch person details
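6. Write {OUTPUT}/identities/{identity_uuid}/identity.json + profile.jpg for each cast member
7. Merge the new identities into {OUTPUT}/identities/_index.json (identity uuid → name)
8. Write {OUTPUT}/{file_uuid}.tmdb.json cache (movie info + identity uuid list)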
"""
import argparse
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
import requests
import psycopg2
import psycopg2.extras
# Root URL of the TMDb v3 REST API; endpoint paths are appended to it.
TMDB_BASE = "https://api.themoviedb.org/3"
# API key, read once at import time; None when the variable is unset.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
def extract_movie_name(filename: str) -> str | None:
"""Extract movie name from filename (e.g. 'Charade_1963.mp4''Charade 1963')"""
name = Path(filename).stem
cleaned = re.sub(r'[._]', ' ', name).strip()
# Strip text after separators like |, (, [, {
for sep in ('|', '(', '[', '{', '\u2502'):
idx = cleaned.find(sep)
if idx > 0:
cleaned = cleaned[:idx].strip()
# Strip common suffixes (quality, format, source, etc.)
suffixes = (
r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl',
r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid',
r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3',
r'dts', r'5\.1', r'7\.1', r'dual[ -]?audio', r'multi[ -]?sub',
r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut',
r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie',
r'english', r'french', r'spanish', r'german', r'chinese',
r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd',
)
pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
# Collapse multiple spaces
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
return cleaned if len(cleaned) >= 3 else None
def search_movie(query: str) -> dict | None:
    """Query TMDB's /search/movie endpoint for *query*.

    Returns the first entry of the response's ``results`` list, or ``None``
    when that list is empty/absent or anything goes wrong with the request
    (the failure is reported on stderr).
    """
    endpoint = f"{TMDB_BASE}/search/movie"
    query_params = {
        "query": query,
        "api_key": TMDB_API_KEY,
        "language": "en-US",
        "page": 1,
    }
    try:
        response = requests.get(endpoint, params=query_params, timeout=15)
        response.raise_for_status()
        matches = response.json().get("results", [])
        if matches:
            return matches[0]
        return None
    except Exception as exc:
        print(f"TMDB search failed: {exc}", file=sys.stderr)
        return None
def get_credits(movie_id: int) -> list[dict]:
    """Fetch the cast list of a movie from TMDB's /movie/{id}/credits.

    Returns the ``cast`` array of the response, or an empty list when the
    key is absent or the request fails (the failure is reported on stderr).
    """
    endpoint = f"{TMDB_BASE}/movie/{movie_id}/credits"
    query_params = {"api_key": TMDB_API_KEY, "language": "en-US"}
    try:
        response = requests.get(endpoint, params=query_params, timeout=15)
        response.raise_for_status()
        payload = response.json()
        return payload.get("cast", [])
    except Exception as exc:
        print(f"TMDB credits failed: {exc}", file=sys.stderr)
        return []
def get_person_details(person_id: int) -> dict:
    """Fetch selected fields of a person record from TMDB's /person/{id}.

    Returns a dict with the keys biography, birthday, place_of_birth,
    also_known_as, imdb_id, known_for_department, popularity, deathday,
    gender and homepage. Missing fields are ``None``, except
    ``also_known_as`` which falls back to an empty list. On any failure the
    error is reported on stderr and an empty dict is returned.
    """
    endpoint = f"{TMDB_BASE}/person/{person_id}"
    query_params = {"api_key": TMDB_API_KEY, "language": "en-US"}
    # Order matters: it is the key order of the returned dict.
    wanted = (
        "biography",
        "birthday",
        "place_of_birth",
        "also_known_as",
        "imdb_id",
        "known_for_department",
        "popularity",
        "deathday",
        "gender",
        "homepage",
    )
    try:
        response = requests.get(endpoint, params=query_params, timeout=15)
        response.raise_for_status()
        payload = response.json()
        details = {field: payload.get(field) for field in wanted}
        # Only the alias list has a non-None fallback.
        details["also_known_as"] = payload.get("also_known_as", [])
        return details
    except Exception as exc:
        print(f"TMDB person details failed for {person_id}: {exc}", file=sys.stderr)
        return {}
def _fetch_video_row(dsn: str, file_uuid: str):
    """Return the videos-table row (``file_name`` only) for *file_uuid*, or None.

    The table is ``{DATABASE_SCHEMA}.videos`` when that environment variable
    is set, otherwise plain ``videos``. The connection is always closed,
    including when the query raises.
    """
    schema = os.getenv("DATABASE_SCHEMA", "").strip()
    # The schema name comes from the process environment; the UUID itself is
    # always passed as a bound parameter.
    table = f"{schema}.videos" if schema else "videos"
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            cur.execute(f"SELECT file_name FROM {table} WHERE file_uuid = %s", (file_uuid,))
            return cur.fetchone()
    finally:
        conn.close()


def _build_identity(movie: dict, member: dict, order: int, person: dict, now: str) -> dict:
    """Assemble the identity.json payload for one cast member.

    The identity UUID is deterministic: the first 32 hex characters of
    SHA-256("tmdb-{movie_id}-{person_id}-{name}").
    """
    person_id = member["id"]
    identity_uuid = hashlib.sha256(
        f"tmdb-{movie['id']}-{person_id}-{member['name']}".encode()
    ).hexdigest()[:32]
    profile_url = (
        f"https://image.tmdb.org/t/p/w185{member['profile_path']}"
        if member.get("profile_path") else None
    )
    metadata = {
        "tmdb_character": member.get("character", ""),
        "tmdb_cast_order": order,
        "tmdb_movie_id": movie["id"],
        "tmdb_movie_title": movie["title"],
        "tmdb_biography": person.get("biography"),
        "tmdb_birthday": person.get("birthday"),
        "tmdb_place_of_birth": person.get("place_of_birth"),
        "tmdb_aliases": person.get("also_known_as", []),
        "tmdb_imdb_id": person.get("imdb_id"),
        "tmdb_department": person.get("known_for_department"),
        "tmdb_popularity": person.get("popularity"),
        "tmdb_deathday": person.get("deathday"),
        "tmdb_gender": person.get("gender"),
        "tmdb_homepage": person.get("homepage"),
    }
    return {
        "version": 1,
        "identity_uuid": identity_uuid,
        "name": member["name"],
        "identity_type": "people",
        "source": "tmdb",
        "status": "confirmed",
        "tmdb_id": person_id,
        "tmdb_profile": profile_url,
        # Drop unknown (None) fields, but always keep the alias key.
        "metadata": {k: v for k, v in metadata.items() if v is not None or k == "tmdb_aliases"},
        "file_bindings": [],
        "created_at": now,
        "updated_at": now,
    }


def _download_profile(url: str, dest: Path, name: str) -> None:
    """Download a cast member's portrait to *dest* unless it already exists.

    Best-effort: any failure (including a non-200 response) is reported on
    stderr and never aborts the run.
    """
    if dest.exists():
        return
    try:
        resp = requests.get(url, timeout=15)
        if resp.status_code == 200:
            dest.write_bytes(resp.content)
        else:
            print(f" [WARN] Failed to download profile for {name}: HTTP {resp.status_code}", file=sys.stderr)
    except Exception as e:
        print(f" [WARN] Failed to download profile for {name}: {e}", file=sys.stderr)


def _update_identity_index(index_path: Path, created_identities: list[dict]) -> None:
    """Merge ``identity_uuid -> name`` entries into the _index.json file."""
    index = {}
    if index_path.exists():
        # Read with the same encoding the file is written with; it may hold
        # non-ASCII names because it is dumped with ensure_ascii=False.
        with open(index_path, encoding="utf-8") as f:
            index = json.load(f)
    for ci in created_identities:
        index[ci["identity_uuid"]] = ci["name"]
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2, ensure_ascii=False)


def main():
    """Command-line entry point: enrich one file UUID with TMDb cast data.

    Looks up the file name in the database, derives a movie name, finds the
    movie and its cast on TMDb, then writes one identity directory per cast
    member, updates identities/_index.json and writes {file_uuid}.tmdb.json.

    Exits with status 1 (message on stderr) when TMDB_API_KEY is unset, the
    file UUID is unknown, no movie name can be extracted, or TMDb returns no
    match. An empty cast only produces a warning.
    """
    parser = argparse.ArgumentParser(description="TMDb Agent — pre-fetch cache")
    parser.add_argument("--file-uuid", required=True, help="File UUID to enrich")
    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "dbname=momentry user=accusys host=localhost"))
    parser.add_argument("--output", default=os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output"))
    args = parser.parse_args()

    if not TMDB_API_KEY:
        print("ERROR: TMDB_API_KEY not set.", file=sys.stderr)
        sys.exit(1)

    # 1. Query DB for file_name
    row = _fetch_video_row(args.db, args.file_uuid)
    if not row:
        print(f"ERROR: File not found: {args.file_uuid}", file=sys.stderr)
        sys.exit(1)
    file_name = row["file_name"]
    print(f"[TKG-AGENT] File: {file_name} ({args.file_uuid})")

    # 2. Extract movie name (a NULL file_name is treated as "cannot extract")
    movie_name = extract_movie_name(file_name) if file_name else None
    if not movie_name:
        print(f"ERROR: Cannot extract movie name from: {file_name}", file=sys.stderr)
        sys.exit(1)
    print(f"[TKG-AGENT] Extracted movie name: '{movie_name}'")

    # 3. Search TMDB
    movie = search_movie(movie_name)
    if not movie:
        print(f"ERROR: No TMDB movie found for: {movie_name}", file=sys.stderr)
        sys.exit(1)
    print(f"[TKG-AGENT] Matched: {movie['title']} (TMDB id={movie['id']})")

    # 4. Fetch credits
    cast = get_credits(movie["id"])
    if not cast:
        print(f"WARN: No cast data found for movie {movie['id']}", file=sys.stderr)

    # 5. Enrich each cast member with person details and write identity files
    output = Path(args.output)
    identities_root = output / "identities"
    identities_root.mkdir(parents=True, exist_ok=True)
    now = datetime.now(timezone.utc).isoformat()
    created_identities = []
    for i, m in enumerate(cast):
        person = get_person_details(m["id"])
        identity = _build_identity(movie, m, i, person, now)
        uuid_raw = identity["identity_uuid"]

        # NOTE(review): the UUID is deterministic, so a re-run overwrites an
        # existing identity.json (resetting created_at and file_bindings) —
        # confirm that this is intended.
        identity_dir = identities_root / uuid_raw
        identity_dir.mkdir(parents=True, exist_ok=True)
        with open(identity_dir / "identity.json", "w", encoding="utf-8") as f:
            json.dump(identity, f, indent=2, ensure_ascii=False)

        if identity["tmdb_profile"]:
            _download_profile(identity["tmdb_profile"], identity_dir / "profile.jpg", m["name"])

        created_identities.append({
            "identity_uuid": uuid_raw,
            "name": m["name"],
            "tmdb_id": m["id"],
            "character": m.get("character", ""),
            "order": i,
        })
        if (i + 1) % 5 == 0:
            print(f"[TKG-AGENT] Wrote {i+1}/{len(cast)} identity files")

    # 6. Update _index.json
    _update_identity_index(identities_root / "_index.json", created_identities)

    # 7. Write movie cache ({uuid}.tmdb.json) — simplified, no per-person data
    cache = {
        "file_uuid": args.file_uuid,
        "fetched_at": now,
        "source": "agent",
        "movie": {
            "tmdb_id": movie["id"],
            "title": movie["title"],
            "release_date": movie.get("release_date"),
            "overview": movie.get("overview"),
            "poster_path": movie.get("poster_path"),
        },
        "cast_count": len(cast),
        "identities_created": len(created_identities),
        "identities": created_identities,
    }
    cache_path = output / f"{args.file_uuid}.tmdb.json"
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2, ensure_ascii=False)
    print(f"[TKG-AGENT] Cache written: {cache_path}")
    print(f"[TKG-AGENT] Identity files: {len(created_identities)} cast members → {identities_root}/")
# Run the agent only when executed as a script, not when imported.
if __name__ == "__main__":
    main()