feat: tmdb_agent now inserts identities and file_identities to DB
- tmdb_agent.py: INSERT identities with status='pending'
- tmdb_agent.py: INSERT file_identities (file_uuid → identity_id)
- identity.json: file_bindings includes file_uuid, movie_id, character
- backfill_file_identities.py: migrate existing TMDb identities
- Tested: 27 Charade cast identities linked to file
This commit is contained in:
147
scripts/backfill_file_identities.py
Normal file
147
scripts/backfill_file_identities.py
Normal file
@@ -0,0 +1,147 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Backfill file_identities table for existing TMDb identities.
|
||||
|
||||
For each TMDb identity with tmdb_movie_id in metadata:
|
||||
1. Find matching file by movie name
|
||||
2. INSERT into file_identities (file_uuid, identity_id)
|
||||
|
||||
Usage:
|
||||
python3 scripts/backfill_file_identities.py --schema public
|
||||
python3 scripts/backfill_file_identities.py --schema dev
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def extract_movie_name(filename: str) -> str | None:
|
||||
"""Extract movie name from filename"""
|
||||
name = Path(filename).stem
|
||||
cleaned = re.sub(r'[._]', ' ', name).strip()
|
||||
for sep in ('|', '(', '[', '{', '\u2502'):
|
||||
idx = cleaned.find(sep)
|
||||
if idx > 0:
|
||||
cleaned = cleaned[:idx].strip()
|
||||
suffixes = (
|
||||
r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl',
|
||||
r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid',
|
||||
r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3',
|
||||
r'dts', r'5\.1', r'7\.1', r'dual[ -]?audio', r'multi[ -]?sub',
|
||||
r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut',
|
||||
r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie',
|
||||
r'english', r'french', r'spanish', r'german', r'chinese',
|
||||
r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd',
|
||||
)
|
||||
pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
|
||||
cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
|
||||
cleaned = re.sub(r'\s+', ' ', cleaned).strip()
|
||||
return cleaned if len(cleaned) >= 3 else None
|
||||
|
||||
|
||||
def main():
    """Backfill ``file_identities`` rows for existing TMDb identities.

    Command-line flags:
        --schema: Database schema holding the tables (default ``public``).
        --db: PostgreSQL connection string; defaults to ``DATABASE_URL`` or
            a local ``momentry`` database.

    Steps:
        1. Load identities with ``source = 'tmdb'``, a non-null ``tmdb_id``
           and a ``tmdb_movie_id`` in their metadata.
        2. Load every video row and index it by the lower-cased title that
           ``extract_movie_name`` derives from ``file_name``.
        3. For each identity, look up a file by exact title, then by
           substring containment in either direction, and insert the
           (file_uuid, identity_id) link unless it already exists.

    A failure on one identity is reported as a warning and does not affect
    the others; all successful inserts are committed together at the end.
    """
    parser = argparse.ArgumentParser(description="Backfill file_identities")
    parser.add_argument("--schema", default="public", help="Database schema")
    parser.add_argument(
        "--db",
        default=os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry"),
    )
    args = parser.parse_args()

    schema = args.schema
    # NOTE(review): the schema name is interpolated into the SQL text below,
    # so --schema must come from a trusted operator.
    identities_table = f"{schema}.identities" if schema != "public" else "identities"
    file_identities_table = f"{schema}.file_identities" if schema != "public" else "file_identities"
    videos_table = f"{schema}.videos" if schema != "public" else "videos"

    matched = 0
    inserted = 0

    conn = psycopg2.connect(args.db)
    try:
        with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
            # Get TMDb identities with tmdb_movie_id
            cur.execute(f"""
                SELECT id, name, tmdb_id, metadata->>'tmdb_movie_id' as tmdb_movie_id,
                       metadata->>'tmdb_movie_title' as tmdb_movie_title,
                       metadata->>'tmdb_character' as tmdb_character,
                       metadata->>'tmdb_cast_order' as tmdb_cast_order
                FROM {identities_table}
                WHERE source = 'tmdb' AND tmdb_id IS NOT NULL
                AND metadata->>'tmdb_movie_id' IS NOT NULL
            """)
            identities = cur.fetchall()

            print(f"[Backfill] Found {len(identities)} TMDb identities with movie_id")

            # Get all files
            cur.execute(f"SELECT file_uuid, file_name FROM {videos_table}")
            files = cur.fetchall()
            print(f"[Backfill] Found {len(files)} files")

            # Build file lookup by movie name (later files with the same
            # derived title overwrite earlier ones).
            file_by_movie = {}
            for f in files:
                movie_name = extract_movie_name(f["file_name"])
                if movie_name:
                    file_by_movie[movie_name.lower()] = f["file_uuid"]

            # Match identities to files
            for identity in identities:
                tmdb_movie_title = identity.get("tmdb_movie_title")
                if not tmdb_movie_title:
                    continue

                movie_key = tmdb_movie_title.lower().strip()
                # An empty key is a substring of every file title and would
                # bind this identity to an arbitrary file.
                if not movie_key:
                    continue

                # Exact title first, then substring containment either way.
                file_uuid = file_by_movie.get(movie_key)
                if not file_uuid:
                    for key, fid in file_by_movie.items():
                        if movie_key in key or key in movie_key:
                            file_uuid = fid
                            break

                if not file_uuid:
                    continue

                matched += 1
                # A failed statement aborts the whole PostgreSQL transaction;
                # the savepoint confines the damage to this one identity so
                # the remaining rows can still be processed and committed.
                cur.execute("SAVEPOINT backfill_row")
                try:
                    # Check if already exists
                    cur.execute(f"""
                        SELECT 1 FROM {file_identities_table}
                        WHERE file_uuid = %s AND identity_id = %s
                    """, (file_uuid, identity["id"]))
                    if cur.fetchone() is None:
                        # metadata->>'...' yields text, so convert here;
                        # a missing or empty value is stored as null.
                        raw_order = identity.get("tmdb_cast_order")
                        cast_order = int(raw_order) if raw_order else None

                        cur.execute(f"""
                            INSERT INTO {file_identities_table} (
                                file_uuid, identity_id, confidence, metadata
                            ) VALUES (%s, %s, %s, %s)
                        """, (
                            file_uuid,
                            identity["id"],
                            1.0,
                            psycopg2.extras.Json({
                                "source": "tmdb_backfill",
                                "tmdb_movie_id": identity.get("tmdb_movie_id"),
                                "tmdb_movie_title": tmdb_movie_title,
                                "character": identity.get("tmdb_character"),
                                "cast_order": cast_order,
                            }),
                        ))
                        inserted += 1
                    cur.execute("RELEASE SAVEPOINT backfill_row")
                except Exception as e:
                    cur.execute("ROLLBACK TO SAVEPOINT backfill_row")
                    print(f"  [WARN] Failed for {identity['name']}: {e}")

        conn.commit()
    finally:
        # Closing without a commit discards any uncommitted work, so an
        # unexpected error above leaves the database unchanged.
        conn.close()

    print(f"[Backfill] Matched: {matched}/{len(identities)}")
    print(f"[Backfill] Inserted: {inserted} new file_identities")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -207,11 +207,11 @@ def main():
|
||||
"name": m["name"],
|
||||
"identity_type": "people",
|
||||
"source": "tmdb",
|
||||
"status": "confirmed",
|
||||
"status": "pending",
|
||||
"tmdb_id": person_id,
|
||||
"tmdb_profile": profile_url,
|
||||
"metadata": {k: v for k, v in metadata.items() if v is not None or k == "tmdb_aliases"},
|
||||
"file_bindings": [],
|
||||
"file_bindings": [{"file_uuid": args.file_uuid, "movie_id": movie["id"], "character": m.get("character", ""), "cast_order": i}],
|
||||
"created_at": now,
|
||||
"updated_at": now,
|
||||
}
|
||||
@@ -240,6 +240,7 @@ def main():
|
||||
"tmdb_id": person_id,
|
||||
"character": m.get("character", ""),
|
||||
"order": i,
|
||||
"profile_path": m.get("profile_path"),
|
||||
})
|
||||
|
||||
if (i + 1) % 5 == 0:
|
||||
@@ -256,6 +257,90 @@ def main():
|
||||
with open(index_path, "w", encoding="utf-8") as f:
|
||||
json.dump(index, f, indent=2, ensure_ascii=False)
|
||||
|
||||
# 6. Insert identities into database and create file_identities links
|
||||
print(f"[TKG-AGENT] Syncing {len(created_identities)} identities to database...")
|
||||
|
||||
identities_table = f"{schema}.identities" if schema else "identities"
|
||||
file_identities_table = f"{schema}.file_identities" if schema else "file_identities"
|
||||
|
||||
conn = psycopg2.connect(args.db)
|
||||
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
|
||||
|
||||
synced_count = 0
|
||||
for ci in created_identities:
|
||||
try:
|
||||
# Insert into identities table (ON CONFLICT DO UPDATE)
|
||||
cur.execute(f"""
|
||||
INSERT INTO {identities_table} (
|
||||
uuid, name, identity_type, source, status,
|
||||
tmdb_id, tmdb_profile, metadata, created_at, updated_at
|
||||
) VALUES (
|
||||
%s, %s, %s, %s, %s,
|
||||
%s, %s, %s, %s, %s
|
||||
)
|
||||
ON CONFLICT (tmdb_id) WHERE tmdb_id IS NOT NULL DO UPDATE SET
|
||||
tmdb_profile = EXCLUDED.tmdb_profile,
|
||||
metadata = EXCLUDED.metadata,
|
||||
updated_at = EXCLUDED.updated_at
|
||||
RETURNING id
|
||||
""", (
|
||||
ci["identity_uuid"],
|
||||
ci["name"],
|
||||
"people",
|
||||
"tmdb",
|
||||
"pending",
|
||||
ci["tmdb_id"],
|
||||
f"https://image.tmdb.org/t/p/w185{ci['profile_path']}" if ci.get("profile_path") else None,
|
||||
json.dumps({
|
||||
"tmdb_character": ci.get("character", ""),
|
||||
"tmdb_cast_order": ci.get("order", 0),
|
||||
"tmdb_movie_id": movie["id"],
|
||||
"tmdb_movie_title": movie["title"],
|
||||
}),
|
||||
now,
|
||||
now,
|
||||
))
|
||||
|
||||
identity_row = cur.fetchone()
|
||||
if identity_row:
|
||||
identity_id = identity_row["id"]
|
||||
|
||||
# Insert into file_identities table (link file_uuid to identity_id)
|
||||
cur.execute(f"""
|
||||
INSERT INTO {file_identities_table} (
|
||||
file_uuid, identity_id, confidence, metadata, created_at
|
||||
) VALUES (
|
||||
%s, %s, %s, %s, %s
|
||||
)
|
||||
ON CONFLICT (file_uuid, identity_id) DO UPDATE SET
|
||||
confidence = EXCLUDED.confidence,
|
||||
metadata = EXCLUDED.metadata,
|
||||
created_at = EXCLUDED.created_at
|
||||
""", (
|
||||
args.file_uuid,
|
||||
identity_id,
|
||||
1.0,
|
||||
json.dumps({
|
||||
"source": "tmdb_cast",
|
||||
"tmdb_movie_id": movie["id"],
|
||||
"tmdb_movie_title": movie["title"],
|
||||
"character": ci.get("character", ""),
|
||||
"cast_order": ci.get("order", 0),
|
||||
}),
|
||||
now,
|
||||
))
|
||||
|
||||
synced_count += 1
|
||||
|
||||
except Exception as e:
|
||||
print(f" [WARN] Failed to sync {ci['name']}: {e}", file=sys.stderr)
|
||||
|
||||
conn.commit()
|
||||
cur.close()
|
||||
conn.close()
|
||||
|
||||
print(f"[TKG-AGENT] Synced {synced_count}/{len(created_identities)} identities to database")
|
||||
|
||||
# Write movie cache ({uuid}.tmdb.json) — simplified, no per-person data
|
||||
cache = {
|
||||
"file_uuid": args.file_uuid,
|
||||
|
||||
Reference in New Issue
Block a user