Files
momentry_core/scripts/backfill_file_identities.py
Accusys 67caf09732 feat: tmdb_agent now inserts identities and file_identities to DB
- tmdb_agent.py: INSERT identities with status='pending'
- tmdb_agent.py: INSERT file_identities (file_uuid → identity_id)
- identity.json: file_bindings includes file_uuid, movie_id, character
- backfill_file_identities.py: migrate existing TMDb identities
- Tested: 27 Charade cast identities linked to file
2026-06-26 13:39:08 +08:00

147 lines
5.3 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Backfill file_identities table for existing TMDb identities.
For each TMDb identity with tmdb_movie_id in metadata:
1. Find matching file by movie name
2. INSERT into file_identities (file_uuid, identity_id)
Usage:
python3 scripts/backfill_file_identities.py --schema public
python3 scripts/backfill_file_identities.py --schema dev
"""
import argparse
import os
import psycopg2
import psycopg2.extras
import re
from pathlib import Path
def extract_movie_name(filename: str) -> str | None:
"""Extract movie name from filename"""
name = Path(filename).stem
cleaned = re.sub(r'[._]', ' ', name).strip()
for sep in ('|', '(', '[', '{', '\u2502'):
idx = cleaned.find(sep)
if idx > 0:
cleaned = cleaned[:idx].strip()
suffixes = (
r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl',
r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid',
r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3',
r'dts', r'5\.1', r'7\.1', r'dual[ -]?audio', r'multi[ -]?sub',
r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut',
r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie',
r'english', r'french', r'spanish', r'german', r'chinese',
r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd',
)
pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
return cleaned if len(cleaned) >= 3 else None
# The schema name is spliced into SQL text (identifiers cannot be bound as
# query parameters), so only plain unquoted identifiers are accepted.
_SCHEMA_NAME_RE = re.compile(r'[A-Za-z_][A-Za-z0-9_$]*')


def _find_file_uuid(movie_key, file_by_movie):
    """Return the file UUID for *movie_key*: exact match first, then substring match."""
    exact = file_by_movie.get(movie_key)
    if exact:
        return exact
    # Fallback: file names often carry extra words (e.g. the release year),
    # so accept containment in either direction. The first hit wins.
    for key, fid in file_by_movie.items():
        if movie_key in key or key in movie_key:
            return fid
    return None


def main():
    """Link existing TMDb identities to video files via ``file_identities``.

    Reads the target schema (``--schema``) and connection string (``--db``,
    defaulting to ``$DATABASE_URL``) from the command line, matches every
    TMDb identity that carries a ``tmdb_movie_title`` against the movie names
    extracted from ``videos.file_name``, and inserts the missing
    ``(file_uuid, identity_id)`` rows. All inserts are committed together at
    the end; a failure on one identity is reported and skipped without
    discarding the others.
    """
    parser = argparse.ArgumentParser(description="Backfill file_identities")
    parser.add_argument("--schema", default="public", help="Database schema")
    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry"))
    args = parser.parse_args()

    schema = args.schema
    if not _SCHEMA_NAME_RE.fullmatch(schema):
        parser.error(f"invalid schema name: {schema!r}")

    # "public" tables are referenced unqualified, as before.
    prefix = "" if schema == "public" else f"{schema}."
    identities_table = f"{prefix}identities"
    file_identities_table = f"{prefix}file_identities"
    videos_table = f"{prefix}videos"

    matched = 0
    inserted = 0

    conn = psycopg2.connect(args.db)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # Get TMDb identities with tmdb_movie_id
            cur.execute(f"""
                SELECT id, name, tmdb_id, metadata->>'tmdb_movie_id' as tmdb_movie_id,
                       metadata->>'tmdb_movie_title' as tmdb_movie_title,
                       metadata->>'tmdb_character' as tmdb_character,
                       metadata->>'tmdb_cast_order' as tmdb_cast_order
                FROM {identities_table}
                WHERE source = 'tmdb' AND tmdb_id IS NOT NULL
                  AND metadata->>'tmdb_movie_id' IS NOT NULL
            """)
            identities = cur.fetchall()
            print(f"[Backfill] Found {len(identities)} TMDb identities with movie_id")

            # Get all files
            cur.execute(f"SELECT file_uuid, file_name FROM {videos_table}")
            files = cur.fetchall()
            print(f"[Backfill] Found {len(files)} files")

            # Build file lookup by movie name. Rows without a file name are
            # skipped; when two files clean to the same name the later one wins.
            file_by_movie = {}
            for f in files:
                file_name = f["file_name"]
                movie_name = extract_movie_name(file_name) if file_name else None
                if movie_name:
                    file_by_movie[movie_name.lower()] = f["file_uuid"]

            # Match identities to files
            for identity in identities:
                tmdb_movie_title = identity.get("tmdb_movie_title")
                if not tmdb_movie_title:
                    continue
                movie_key = tmdb_movie_title.lower().strip()
                # An empty key is a substring of every file name and would
                # bind the identity to an arbitrary file.
                if not movie_key:
                    continue

                file_uuid = _find_file_uuid(movie_key, file_by_movie)
                if not file_uuid:
                    continue
                matched += 1

                # A failed statement aborts the whole PostgreSQL transaction;
                # the savepoint confines the damage to this one identity.
                cur.execute("SAVEPOINT backfill_row")
                try:
                    raw_cast_order = identity.get("tmdb_cast_order")
                    cast_order = int(raw_cast_order) if raw_cast_order else None

                    # Check if already exists
                    cur.execute(f"""
                        SELECT 1 FROM {file_identities_table}
                        WHERE file_uuid = %s AND identity_id = %s
                    """, (file_uuid, identity["id"]))
                    if not cur.fetchone():
                        cur.execute(f"""
                            INSERT INTO {file_identities_table} (
                                file_uuid, identity_id, confidence, metadata
                            ) VALUES (%s, %s, %s, %s)
                        """, (
                            file_uuid,
                            identity["id"],
                            1.0,
                            psycopg2.extras.Json({
                                "source": "tmdb_backfill",
                                "tmdb_movie_id": identity.get("tmdb_movie_id"),
                                "tmdb_movie_title": tmdb_movie_title,
                                "character": identity.get("tmdb_character"),
                                "cast_order": cast_order,
                            }),
                        ))
                        inserted += 1
                    cur.execute("RELEASE SAVEPOINT backfill_row")
                except (psycopg2.Error, ValueError) as e:
                    cur.execute("ROLLBACK TO SAVEPOINT backfill_row")
                    print(f"  [WARN] Failed for {identity['name']}: {e}")

        conn.commit()
    finally:
        # Closing without a commit discards any uncommitted work.
        conn.close()

    print(f"[Backfill] Matched: {matched}/{len(identities)}")
    print(f"[Backfill] Inserted: {inserted} new file_identities")
# Run the backfill only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()