From 67caf0973261fa069b821d79c7a52efd371a3b49 Mon Sep 17 00:00:00 2001 From: Accusys Date: Fri, 26 Jun 2026 13:39:08 +0800 Subject: [PATCH] feat: tmdb_agent now inserts identities and file_identities to DB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - tmdb_agent.py: INSERT identities with status='pending' - tmdb_agent.py: INSERT file_identities (file_uuid → identity_id) - identity.json: file_bindings includes file_uuid, movie_id, character - backfill_file_identities.py: migrate existing TMDb identities - Tested: 27 Charade cast identities linked to file --- scripts/backfill_file_identities.py | 147 ++++++++++++++++++++++++++++ scripts/tmdb_agent.py | 89 ++++++++++++++++- 2 files changed, 234 insertions(+), 2 deletions(-) create mode 100644 scripts/backfill_file_identities.py diff --git a/scripts/backfill_file_identities.py b/scripts/backfill_file_identities.py new file mode 100644 index 0000000..8d3f34e --- /dev/null +++ b/scripts/backfill_file_identities.py @@ -0,0 +1,147 @@ +#!/opt/homebrew/bin/python3.11 +""" +Backfill file_identities table for existing TMDb identities. + +For each TMDb identity with tmdb_movie_id in metadata: +1. Find matching file by movie name +2. INSERT into file_identities (file_uuid, identity_id) + +Usage: + python3 scripts/backfill_file_identities.py --schema public + python3 scripts/backfill_file_identities.py --schema dev +""" + +import argparse +import os +import psycopg2 +import psycopg2.extras +import re +from pathlib import Path + + +def extract_movie_name(filename: str) -> str | None: + """Extract movie name from filename""" + name = Path(filename).stem + cleaned = re.sub(r'[._]', ' ', name).strip() + for sep in ('|', '(', '[', '{', '\u2502'): + idx = cleaned.find(sep) + if idx > 0: + cleaned = cleaned[:idx].strip() + suffixes = ( + r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl', + r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid', + r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3', + r'dts', r'5\.1', r'7\.1', r'dual[ -]?audio', r'multi[ -]?sub', + r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut', + r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie', + r'english', r'french', r'spanish', r'german', r'chinese', + r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd', + ) + pattern = r'\b(?:' + '|'.join(suffixes) + r')\b' + cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip() + cleaned = re.sub(r'\s+', ' ', cleaned).strip() + return cleaned if len(cleaned) >= 3 else None + + +def main(): + parser = argparse.ArgumentParser(description="Backfill file_identities") + parser.add_argument("--schema", default="public", help="Database schema") + parser.add_argument("--db", default=os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")) + args = parser.parse_args() + + schema = args.schema + identities_table = f"{schema}.identities" if schema != "public" else "identities" + file_identities_table = f"{schema}.file_identities" if schema != "public" else "file_identities" + videos_table = f"{schema}.videos" if schema != "public" else "videos" + + conn = psycopg2.connect(args.db) + cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + + # Get TMDb identities with tmdb_movie_id + cur.execute(f""" + SELECT id, name, tmdb_id, metadata->>'tmdb_movie_id' as tmdb_movie_id, + metadata->>'tmdb_movie_title' as tmdb_movie_title, + metadata->>'tmdb_character' as tmdb_character, + metadata->>'tmdb_cast_order' as tmdb_cast_order + FROM {identities_table} + WHERE source = 'tmdb' AND tmdb_id IS NOT NULL + AND metadata->>'tmdb_movie_id' IS NOT NULL + """) + identities = cur.fetchall() + + print(f"[Backfill] Found {len(identities)} TMDb identities with movie_id") + + # Get all files + cur.execute(f"SELECT file_uuid, file_name FROM {videos_table}") + files = cur.fetchall() + print(f"[Backfill] Found {len(files)} files") + + # Build file lookup by movie name + file_by_movie = {} + for f in files: + movie_name = extract_movie_name(f["file_name"]) + if movie_name: + file_by_movie[movie_name.lower()] = f["file_uuid"] + + # Match identities to files + matched = 0 + inserted = 0 + + for identity in identities: + tmdb_movie_title = identity.get("tmdb_movie_title") + if not tmdb_movie_title: + continue + + # Try to find matching file + movie_key = tmdb_movie_title.lower().strip() + file_uuid = file_by_movie.get(movie_key) + + # Also try partial match + if not file_uuid: + for key, fid in file_by_movie.items(): + if movie_key in key or key in movie_key: + file_uuid = fid + break + + if file_uuid: + matched += 1 + try: + # Check if already exists + cur.execute(f""" + SELECT 1 FROM {file_identities_table} + WHERE file_uuid = %s AND identity_id = %s + """, (file_uuid, identity["id"])) + if cur.fetchone(): + continue + + # Insert + cur.execute(f""" + INSERT INTO {file_identities_table} ( + file_uuid, identity_id, confidence, metadata + ) VALUES (%s, %s, %s, %s) + """, ( + file_uuid, + identity["id"], + 1.0, + psycopg2.extras.Json({ + "source": "tmdb_backfill", + "tmdb_movie_id": identity.get("tmdb_movie_id"), + "tmdb_movie_title": tmdb_movie_title, + "character": identity.get("tmdb_character"), + "cast_order": int(identity.get("tmdb_cast_order", 0)) if identity.get("tmdb_cast_order") else None, + }), + )) + inserted += 1 + except Exception as e: + print(f" [WARN] Failed for {identity['name']}: {e}") + + conn.commit() + cur.close() + conn.close() + + print(f"[Backfill] Matched: {matched}/{len(identities)}") + print(f"[Backfill] Inserted: {inserted} new file_identities") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/tmdb_agent.py b/scripts/tmdb_agent.py index fc6f196..be564c1 100644 --- a/scripts/tmdb_agent.py +++ b/scripts/tmdb_agent.py @@ -207,11 +207,11 @@ def main(): "name": m["name"], "identity_type": "people", "source": "tmdb", - "status": "confirmed", + "status": "pending", "tmdb_id": person_id, "tmdb_profile": profile_url, "metadata": {k: v for k, v in metadata.items() if v is not None or k == "tmdb_aliases"}, - "file_bindings": [], + "file_bindings": [{"file_uuid": args.file_uuid, "movie_id": movie["id"], "character": m.get("character", ""), "cast_order": i}], "created_at": now, "updated_at": now, } @@ -240,6 +240,7 @@ def main(): "tmdb_id": person_id, "character": m.get("character", ""), "order": i, + "profile_path": m.get("profile_path"), }) if (i + 1) % 5 == 0: @@ -256,6 +257,90 @@ def main(): with open(index_path, "w", encoding="utf-8") as f: json.dump(index, f, indent=2, ensure_ascii=False) + # 6. Insert identities into database and create file_identities links + print(f"[TKG-AGENT] Syncing {len(created_identities)} identities to database...") + + identities_table = f"{schema}.identities" if schema else "identities" + file_identities_table = f"{schema}.file_identities" if schema else "file_identities" + + conn = psycopg2.connect(args.db) + cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) + + synced_count = 0 + for ci in created_identities: + try: + # Insert into identities table (ON CONFLICT DO UPDATE) + cur.execute(f""" + INSERT INTO {identities_table} ( + uuid, name, identity_type, source, status, + tmdb_id, tmdb_profile, metadata, created_at, updated_at + ) VALUES ( + %s, %s, %s, %s, %s, + %s, %s, %s, %s, %s + ) + ON CONFLICT (tmdb_id) WHERE tmdb_id IS NOT NULL DO UPDATE SET + tmdb_profile = EXCLUDED.tmdb_profile, + metadata = EXCLUDED.metadata, + updated_at = EXCLUDED.updated_at + RETURNING id + """, ( + ci["identity_uuid"], + ci["name"], + "people", + "tmdb", + "pending", + ci["tmdb_id"], + f"https://image.tmdb.org/t/p/w185{ci['profile_path']}" if ci.get("profile_path") else None, + json.dumps({ + "tmdb_character": ci.get("character", ""), + "tmdb_cast_order": ci.get("order", 0), + "tmdb_movie_id": movie["id"], + "tmdb_movie_title": movie["title"], + }), + now, + now, + )) + + identity_row = cur.fetchone() + if identity_row: + identity_id = identity_row["id"] + + # Insert into file_identities table (link file_uuid to identity_id) + cur.execute(f""" + INSERT INTO {file_identities_table} ( + file_uuid, identity_id, confidence, metadata, created_at + ) VALUES ( + %s, %s, %s, %s, %s + ) + ON CONFLICT (file_uuid, identity_id) DO UPDATE SET + confidence = EXCLUDED.confidence, + metadata = EXCLUDED.metadata, + created_at = EXCLUDED.created_at + """, ( + args.file_uuid, + identity_id, + 1.0, + json.dumps({ + "source": "tmdb_cast", + "tmdb_movie_id": movie["id"], + "tmdb_movie_title": movie["title"], + "character": ci.get("character", ""), + "cast_order": ci.get("order", 0), + }), + now, + )) + + synced_count += 1 + + except Exception as e: + print(f" [WARN] Failed to sync {ci['name']}: {e}", file=sys.stderr) + + conn.commit() + cur.close() + conn.close() + + print(f"[TKG-AGENT] Synced {synced_count}/{len(created_identities)} identities to database") + # Write movie cache ({uuid}.tmdb.json) — simplified, no per-person data cache = { "file_uuid": args.file_uuid,