286 lines
11 KiB
Python
286 lines
11 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
TMDb Agent — pre-fetch TMDb data and write directly to identity files.
|
|
|
|
Usage:
|
|
python3 scripts/tmdb_agent.py --file-uuid <uuid>
|
|
python3 scripts/tmdb_agent.py --file-uuid <uuid> --db "dbname=momentry user=accusys"
|
|
|
|
Environment:
|
|
TMDB_API_KEY Required. TMDb API key.
|
|
MOMENTRY_OUTPUT_DIR Default: /Users/accusys/momentry/output
|
|
DATABASE_URL Default: dbname=momentry user=accusys host=localhost
DATABASE_SCHEMA Optional. Schema that qualifies the videos table (default: unqualified)
|
|
|
|
Flow:
|
|
1. Query videos table for file_name
|
|
2. Extract movie name from filename
|
|
3. TMDB /search/movie → find best match
|
|
4. TMDB /movie/{id}/credits → fetch cast
|
|
5. TMDB /person/{id} → fetch person details
|
|
6. Write {OUTPUT}/identities/{uuid}/identity.json + profile.jpg for each cast member
|
|
7. Update {OUTPUT}/identities/_index.json (identity uuid → name)
8. Write {OUTPUT}/{uuid}.tmdb.json cache (movie info + identity uuid list)
|
|
"""
|
|
import argparse
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
import psycopg2
|
|
import psycopg2.extras
|
|
|
|
|
|
# Root URL of the TMDB v3 REST API; endpoint paths are appended to it.
TMDB_BASE = "https://api.themoviedb.org/3"

# API key taken from the environment; None when TMDB_API_KEY is unset
# (main() refuses to run in that case).
TMDB_API_KEY = os.environ.get("TMDB_API_KEY")
|
|
|
|
|
|
def extract_movie_name(filename: str) -> str | None:
|
|
"""Extract movie name from filename (e.g. 'Charade_1963.mp4' → 'Charade 1963')"""
|
|
name = Path(filename).stem
|
|
cleaned = re.sub(r'[._]', ' ', name).strip()
|
|
# Strip text after separators like |, (, [, {
|
|
for sep in ('|', '(', '[', '{', '\u2502'):
|
|
idx = cleaned.find(sep)
|
|
if idx > 0:
|
|
cleaned = cleaned[:idx].strip()
|
|
# Strip common suffixes (quality, format, source, etc.)
|
|
suffixes = (
|
|
r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl',
|
|
r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid',
|
|
r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3',
|
|
r'dts', r'5\.1', r'7\.1', r'dual[ -]?audio', r'multi[ -]?sub',
|
|
r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut',
|
|
r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie',
|
|
r'english', r'french', r'spanish', r'german', r'chinese',
|
|
r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd',
|
|
)
|
|
pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
|
|
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
|
|
# Collapse multiple spaces
|
|
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
|
|
return cleaned if len(cleaned) >= 3 else None
|
|
|
|
|
|
def search_movie(query: str) -> dict | None:
    """Look up a movie on TMDB by name and return the top search hit.

    Calls ``/search/movie`` (first page, en-US) and returns the first entry
    of the response's ``results`` list.  Returns ``None`` when the list is
    empty or when anything goes wrong; failures are reported on stderr.
    """
    endpoint = f"{TMDB_BASE}/search/movie"
    query_params = {
        "query": query,
        "api_key": TMDB_API_KEY,
        "language": "en-US",
        "page": 1,
    }
    try:
        response = requests.get(endpoint, params=query_params, timeout=15)
        response.raise_for_status()
        hits = response.json().get("results", [])
        if not hits:
            return None
        return hits[0]
    except Exception as e:
        print(f"TMDB search failed: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def get_credits(movie_id: int) -> list[dict]:
    """Return the cast list of a TMDB movie.

    Calls ``/movie/{movie_id}/credits`` and returns the ``cast`` entry of the
    response (an empty list when the key is absent).  Any failure is reported
    on stderr and also yields an empty list.
    """
    endpoint = f"{TMDB_BASE}/movie/{movie_id}/credits"
    query_params = {"api_key": TMDB_API_KEY, "language": "en-US"}
    try:
        response = requests.get(endpoint, params=query_params, timeout=15)
        response.raise_for_status()
        payload = response.json()
        return payload.get("cast", [])
    except Exception as e:
        print(f"TMDB credits failed: {e}", file=sys.stderr)
        return []
|
|
|
|
|
|
def get_person_details(person_id: int) -> dict:
    """Return selected fields of a TMDB person record.

    Calls ``/person/{person_id}`` and copies a fixed set of fields out of the
    response.  Fields missing from the response come back as ``None``, except
    ``also_known_as`` which falls back to an empty list.  On any failure the
    error is reported on stderr and an empty dict is returned.
    """
    # Fields copied from the response, in the order they appear in the result.
    wanted_fields = (
        "biography",
        "birthday",
        "place_of_birth",
        "also_known_as",
        "imdb_id",
        "known_for_department",
        "popularity",
        "deathday",
        "gender",
        "homepage",
    )
    endpoint = f"{TMDB_BASE}/person/{person_id}"
    query_params = {"api_key": TMDB_API_KEY, "language": "en-US"}
    try:
        response = requests.get(endpoint, params=query_params, timeout=15)
        response.raise_for_status()
        payload = response.json()
        details = {field: payload.get(field) for field in wanted_fields}
        # Only the alias list has a non-None default.
        details["also_known_as"] = payload.get("also_known_as", [])
        return details
    except Exception as e:
        print(f"TMDB person details failed for {person_id}: {e}", file=sys.stderr)
        return {}
|
|
|
|
|
|
def main():
    """Pre-fetch TMDB data for one video and write identity and cache files.

    Steps:
      1. Look up ``file_name`` for ``--file-uuid`` in the ``videos`` table
         (qualified with ``DATABASE_SCHEMA`` when that variable is set).
      2. Derive a movie name from the file name and search TMDB for it.
      3. Fetch the cast and, for each cast member, the person details.
      4. Write ``identities/<identity_uuid>/identity.json`` and, when a
         profile image exists and is not on disk yet, ``profile.jpg``.
      5. Merge the new identities into ``identities/_index.json``.
      6. Write ``<file_uuid>.tmdb.json`` with the movie info and identity list.

    Exits with status 1 when ``TMDB_API_KEY`` is unset, the file UUID is not
    in the database, no movie name can be extracted, or TMDB has no match.
    """
    parser = argparse.ArgumentParser(description="TMDb Agent — pre-fetch cache")
    parser.add_argument("--file-uuid", required=True, help="File UUID to enrich")
    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "dbname=momentry user=accusys host=localhost"))
    parser.add_argument("--output", default=os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output"))
    args = parser.parse_args()

    if not TMDB_API_KEY:
        print("ERROR: TMDB_API_KEY not set.", file=sys.stderr)
        sys.exit(1)

    # 1. Query DB for file_name
    schema = os.getenv("DATABASE_SCHEMA", "").strip()
    # The table name cannot be passed as a bound parameter, so it is
    # interpolated; the schema comes from the process environment only.
    table = f"{schema}.videos" if schema else "videos"
    conn = psycopg2.connect(args.db)
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
        try:
            cur.execute(f"SELECT file_name FROM {table} WHERE file_uuid = %s", (args.file_uuid,))
            row = cur.fetchone()
        finally:
            cur.close()
    finally:
        # Release the connection even when the query itself fails.
        conn.close()

    if not row:
        print(f"ERROR: File not found: {args.file_uuid}", file=sys.stderr)
        sys.exit(1)

    file_name = row["file_name"]
    print(f"[TKG-AGENT] File: {file_name} ({args.file_uuid})")

    # 2. Extract movie name
    movie_name = extract_movie_name(file_name)
    if not movie_name:
        print(f"ERROR: Cannot extract movie name from: {file_name}", file=sys.stderr)
        sys.exit(1)
    print(f"[TKG-AGENT] Extracted movie name: '{movie_name}'")

    # 3. Search TMDB
    movie = search_movie(movie_name)
    if not movie:
        print(f"ERROR: No TMDB movie found for: {movie_name}", file=sys.stderr)
        sys.exit(1)
    print(f"[TKG-AGENT] Matched: {movie['title']} (TMDB id={movie['id']})")

    # 4. Fetch credits
    cast = get_credits(movie["id"])
    if not cast:
        # Not fatal: the index and cache are still written with zero identities.
        print(f"WARN: No cast data found for movie {movie['id']}", file=sys.stderr)

    # 5. Enrich each cast member with person details and write identity files
    output = Path(args.output)
    identities_root = output / "identities"
    identities_root.mkdir(parents=True, exist_ok=True)

    now = datetime.now(timezone.utc).isoformat()
    created_identities = []

    for i, m in enumerate(cast):
        person_id = m["id"]
        person = get_person_details(person_id)

        # Generate deterministic UUID: SHA256("tmdb-{movie_id}-{person_id}-{name}"),
        # truncated to the first 32 hex characters, so re-runs for the same
        # movie and person land in the same identity directory.
        uuid_raw = hashlib.sha256(f"tmdb-{movie['id']}-{person_id}-{m['name']}".encode()).hexdigest()[:32]
        profile_url = (
            f"https://image.tmdb.org/t/p/w185{m['profile_path']}"
            if m.get("profile_path") else None
        )

        # Build identity.json
        metadata = {
            "tmdb_character": m.get("character", ""),
            "tmdb_cast_order": i,
            "tmdb_movie_id": movie["id"],
            "tmdb_movie_title": movie["title"],
            "tmdb_biography": person.get("biography"),
            "tmdb_birthday": person.get("birthday"),
            "tmdb_place_of_birth": person.get("place_of_birth"),
            "tmdb_aliases": person.get("also_known_as", []),
            "tmdb_imdb_id": person.get("imdb_id"),
            "tmdb_department": person.get("known_for_department"),
            "tmdb_popularity": person.get("popularity"),
            "tmdb_deathday": person.get("deathday"),
            "tmdb_gender": person.get("gender"),
            "tmdb_homepage": person.get("homepage"),
        }

        identity = {
            "version": 1,
            "identity_uuid": uuid_raw,
            "name": m["name"],
            "identity_type": "people",
            "source": "tmdb",
            "status": "confirmed",
            "tmdb_id": person_id,
            "tmdb_profile": profile_url,
            # Drop unset fields, but always keep the alias key.
            "metadata": {k: v for k, v in metadata.items() if v is not None or k == "tmdb_aliases"},
            "file_bindings": [],
            "created_at": now,
            "updated_at": now,
        }

        # Write identity.json
        identity_dir = identities_root / uuid_raw
        identity_dir.mkdir(parents=True, exist_ok=True)
        identity_path = identity_dir / "identity.json"
        with open(identity_path, "w", encoding="utf-8") as f:
            json.dump(identity, f, indent=2, ensure_ascii=False)

        # Download profile.jpg (best effort; an existing image is kept as is)
        if profile_url:
            img_path = identity_dir / "profile.jpg"
            if not img_path.exists():
                try:
                    resp = requests.get(profile_url, timeout=15)
                    if resp.status_code == 200:
                        img_path.write_bytes(resp.content)
                    else:
                        # Report instead of silently leaving the image out.
                        print(
                            f"  [WARN] Failed to download profile for {m['name']}: HTTP {resp.status_code}",
                            file=sys.stderr,
                        )
                except Exception as e:
                    print(f"  [WARN] Failed to download profile for {m['name']}: {e}", file=sys.stderr)

        created_identities.append({
            "identity_uuid": uuid_raw,
            "name": m["name"],
            "tmdb_id": person_id,
            "character": m.get("character", ""),
            "order": i,
        })

        if (i + 1) % 5 == 0:
            print(f"[TKG-AGENT] Wrote {i+1}/{len(cast)} identity files")

    # Update _index.json (identity uuid → name), keeping existing entries
    index_path = identities_root / "_index.json"
    index = {}
    if index_path.exists():
        # Read with the same encoding the file is written with below; the
        # locale default may not be UTF-8 and names are stored unescaped.
        with open(index_path, encoding="utf-8") as f:
            index = json.load(f)
    for ci in created_identities:
        index[ci["identity_uuid"]] = ci["name"]
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2, ensure_ascii=False)

    # Write movie cache ({uuid}.tmdb.json) — simplified, no per-person data
    cache = {
        "file_uuid": args.file_uuid,
        "fetched_at": now,
        "source": "agent",
        "movie": {
            "tmdb_id": movie["id"],
            "title": movie["title"],
            "release_date": movie.get("release_date"),
            "overview": movie.get("overview"),
            "poster_path": movie.get("poster_path"),
        },
        "cast_count": len(cast),
        "identities_created": len(created_identities),
        "identities": created_identities,
    }

    cache_path = output / f"{args.file_uuid}.tmdb.json"
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, indent=2, ensure_ascii=False)

    print(f"[TKG-AGENT] Cache written: {cache_path}")
    print(f"[TKG-AGENT] Identity files: {len(created_identities)} cast members → {identities_root}/")
|
|
|
|
|
|
# Run the agent only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|