#!/opt/homebrew/bin/python3.11 """ Backfill file_identities table for existing TMDb identities. For each TMDb identity with tmdb_movie_id in metadata: 1. Find matching file by movie name 2. INSERT into file_identities (file_uuid, identity_id) Usage: python3 scripts/backfill_file_identities.py --schema public python3 scripts/backfill_file_identities.py --schema dev """ import argparse import os import psycopg2 import psycopg2.extras import re from pathlib import Path def extract_movie_name(filename: str) -> str | None: """Extract movie name from filename""" name = Path(filename).stem cleaned = re.sub(r'[._]', ' ', name).strip() for sep in ('|', '(', '[', '{', '\u2502'): idx = cleaned.find(sep) if idx > 0: cleaned = cleaned[:idx].strip() suffixes = ( r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl', r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid', r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3', r'dts', r'5\.1', r'7\.1', r'dual[ -]?audio', r'multi[ -]?sub', r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut', r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie', r'english', r'french', r'spanish', r'german', r'chinese', r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd', ) pattern = r'\b(?:' + '|'.join(suffixes) + r')\b' cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip() cleaned = re.sub(r'\s+', ' ', cleaned).strip() return cleaned if len(cleaned) >= 3 else None def main(): parser = argparse.ArgumentParser(description="Backfill file_identities") parser.add_argument("--schema", default="public", help="Database schema") parser.add_argument("--db", default=os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")) args = parser.parse_args() schema = args.schema identities_table = f"{schema}.identities" if schema != "public" else "identities" file_identities_table = f"{schema}.file_identities" if schema != "public" else "file_identities" videos_table = f"{schema}.videos" if schema != "public" else "videos" conn = psycopg2.connect(args.db) cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) # Get TMDb identities with tmdb_movie_id cur.execute(f""" SELECT id, name, tmdb_id, metadata->>'tmdb_movie_id' as tmdb_movie_id, metadata->>'tmdb_movie_title' as tmdb_movie_title, metadata->>'tmdb_character' as tmdb_character, metadata->>'tmdb_cast_order' as tmdb_cast_order FROM {identities_table} WHERE source = 'tmdb' AND tmdb_id IS NOT NULL AND metadata->>'tmdb_movie_id' IS NOT NULL """) identities = cur.fetchall() print(f"[Backfill] Found {len(identities)} TMDb identities with movie_id") # Get all files cur.execute(f"SELECT file_uuid, file_name FROM {videos_table}") files = cur.fetchall() print(f"[Backfill] Found {len(files)} files") # Build file lookup by movie name file_by_movie = {} for f in files: movie_name = extract_movie_name(f["file_name"]) if movie_name: file_by_movie[movie_name.lower()] = f["file_uuid"] # Match identities to files matched = 0 inserted = 0 for identity in identities: tmdb_movie_title = identity.get("tmdb_movie_title") if not tmdb_movie_title: continue # Try to find matching file movie_key = tmdb_movie_title.lower().strip() file_uuid = file_by_movie.get(movie_key) # Also try partial match if not file_uuid: for key, fid in file_by_movie.items(): if movie_key in key or key in movie_key: file_uuid = fid break if file_uuid: matched += 1 try: # Check if already exists cur.execute(f""" SELECT 1 FROM {file_identities_table} WHERE file_uuid = %s AND identity_id = %s """, (file_uuid, identity["id"])) if cur.fetchone(): continue # Insert cur.execute(f""" INSERT INTO {file_identities_table} ( file_uuid, identity_id, confidence, metadata ) VALUES (%s, %s, %s, %s) """, ( file_uuid, identity["id"], 1.0, psycopg2.extras.Json({ "source": "tmdb_backfill", "tmdb_movie_id": identity.get("tmdb_movie_id"), "tmdb_movie_title": tmdb_movie_title, "character": identity.get("tmdb_character"), "cast_order": int(identity.get("tmdb_cast_order", 0)) if identity.get("tmdb_cast_order") else None, }), )) inserted += 1 except Exception as e: print(f" [WARN] Failed for {identity['name']}: {e}") conn.commit() cur.close() conn.close() print(f"[Backfill] Matched: {matched}/{len(identities)}") print(f"[Backfill] Inserted: {inserted} new file_identities") if __name__ == "__main__": main()