feat: tmdb_agent now inserts identities and file_identities to DB

- tmdb_agent.py: INSERT identities with status='pending'
- tmdb_agent.py: INSERT file_identities (file_uuid → identity_id)
- identity.json: file_bindings includes file_uuid, movie_id, character, cast_order
- backfill_file_identities.py: migrate existing TMDb identities
- Tested: 27 Charade cast identities linked to file
2026-06-26 13:39:08 +08:00
parent 6cbc11efda
commit 67caf09732
2 changed files with 234 additions and 2 deletions
--- a/scripts/backfill_file_identities.py
+++ b/scripts/backfill_file_identities.py
@@ -0,0 +1,147 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Backfill file_identities table for existing TMDb identities.
+
+For each TMDb identity with tmdb_movie_id in metadata:
+1. Find matching file by movie name
+2. INSERT into file_identities (file_uuid, identity_id)
+
+Usage:
+    python3 scripts/backfill_file_identities.py --schema public
+    python3 scripts/backfill_file_identities.py --schema dev
+"""
+
+import argparse
+import os
+import psycopg2
+import psycopg2.extras
+import re
+from pathlib import Path
+
+
+def extract_movie_name(filename: str) -> str | None:
+    """Extract movie name from filename"""
+    name = Path(filename).stem
+    cleaned = re.sub(r'[._]', ' ', name).strip()
+    for sep in ('|', '(', '[', '{', '\u2502'):
+        idx = cleaned.find(sep)
+        if idx > 0:
+            cleaned = cleaned[:idx].strip()
+    suffixes = (
+        r'\d{3,4}p', r'\d{3,4}x\d{3,4}', r'\d+fps', r'bluray', r'web[ -]?dl',
+        r'webrip', r'hdrip', r'dvdrip', r'dvd', r'brrip', r'hdtv', r'xvid',
+        r'x264', r'h264', r'x265', r'h265', r'hevc', r'aac', r'mp3', r'ac3',
+        r'dts', r'5\.1', r'7\.1', r'dual[ -]?audio', r'multi[ -]?sub',
+        r'proper', r'repack', r'extended', r'unrated', r'directors[ -]?cut',
+        r'theatrical', r'internal', r'limited', r'complete', r'full[ -]?movie',
+        r'english', r'french', r'spanish', r'german', r'chinese',
+        r'youtube', r'yify', r'ettv', r'rarbg', r'tgx', r'axxo', r'ctrlhd',
+    )
+    pattern = r'\b(?:' + '|'.join(suffixes) + r')\b'
+    cleaned = re.sub(pattern, '', cleaned, flags=re.IGNORECASE).strip()
+    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
+    return cleaned if len(cleaned) >= 3 else None
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Backfill file_identities")
+    parser.add_argument("--schema", default="public", help="Database schema")
+    parser.add_argument("--db", default=os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry"))
+    args = parser.parse_args()
+
+    schema = args.schema
+    identities_table = f"{schema}.identities" if schema != "public" else "identities"
+    file_identities_table = f"{schema}.file_identities" if schema != "public" else "file_identities"
+    videos_table = f"{schema}.videos" if schema != "public" else "videos"
+
+    conn = psycopg2.connect(args.db)
+    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
+
+    # Get TMDb identities with tmdb_movie_id
+    cur.execute(f"""
+        SELECT id, name, tmdb_id, metadata->>'tmdb_movie_id' as tmdb_movie_id,
+               metadata->>'tmdb_movie_title' as tmdb_movie_title,
+               metadata->>'tmdb_character' as tmdb_character,
+               metadata->>'tmdb_cast_order' as tmdb_cast_order
+        FROM {identities_table}
+        WHERE source = 'tmdb' AND tmdb_id IS NOT NULL
+          AND metadata->>'tmdb_movie_id' IS NOT NULL
+    """)
+    identities = cur.fetchall()
+
+    print(f"[Backfill] Found {len(identities)} TMDb identities with movie_id")
+
+    # Get all files
+    cur.execute(f"SELECT file_uuid, file_name FROM {videos_table}")
+    files = cur.fetchall()
+    print(f"[Backfill] Found {len(files)} files")
+
+    # Build file lookup by movie name
+    file_by_movie = {}
+    for f in files:
+        movie_name = extract_movie_name(f["file_name"])
+        if movie_name:
+            file_by_movie[movie_name.lower()] = f["file_uuid"]
+
+    # Match identities to files
+    matched = 0
+    inserted = 0
+
+    for identity in identities:
+        tmdb_movie_title = identity.get("tmdb_movie_title")
+        if not tmdb_movie_title:
+            continue
+
+        # Try to find matching file
+        movie_key = tmdb_movie_title.lower().strip()
+        file_uuid = file_by_movie.get(movie_key)
+
+        # Also try partial match
+        if not file_uuid:
+            for key, fid in file_by_movie.items():
+                if movie_key in key or key in movie_key:
+                    file_uuid = fid
+                    break
+
+        if file_uuid:
+            matched += 1
+            try:
+                # Check if already exists
+                cur.execute(f"""
+                    SELECT 1 FROM {file_identities_table}
+                    WHERE file_uuid = %s AND identity_id = %s
+                """, (file_uuid, identity["id"]))
+                if cur.fetchone():
+                    continue
+
+                # Insert
+                cur.execute(f"""
+                    INSERT INTO {file_identities_table} (
+                        file_uuid, identity_id, confidence, metadata
+                    ) VALUES (%s, %s, %s, %s)
+                """, (
+                    file_uuid,
+                    identity["id"],
+                    1.0,
+                    psycopg2.extras.Json({
+                        "source": "tmdb_backfill",
+                        "tmdb_movie_id": identity.get("tmdb_movie_id"),
+                        "tmdb_movie_title": tmdb_movie_title,
+                        "character": identity.get("tmdb_character"),
+                        "cast_order": int(identity.get("tmdb_cast_order", 0)) if identity.get("tmdb_cast_order") else None,
+                    }),
+                ))
+                inserted += 1
+            except Exception as e:
+                print(f"  [WARN] Failed for {identity['name']}: {e}")
+
+    conn.commit()
+    cur.close()
+    conn.close()
+
+    print(f"[Backfill] Matched: {matched}/{len(identities)}")
+    print(f"[Backfill] Inserted: {inserted} new file_identities")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/tmdb_agent.py
+++ b/scripts/tmdb_agent.py
@@ -207,11 +207,11 @@ def main():
            "name": m["name"],
            "identity_type": "people",
            "source": "tmdb",
-            "status": "confirmed",
+            "status": "pending",
            "tmdb_id": person_id,
            "tmdb_profile": profile_url,
            "metadata": {k: v for k, v in metadata.items() if v is not None or k == "tmdb_aliases"},
-            "file_bindings": [],
+            "file_bindings": [{"file_uuid": args.file_uuid, "movie_id": movie["id"], "character": m.get("character", ""), "cast_order": i}],
            "created_at": now,
            "updated_at": now,
        }
@@ -240,6 +240,7 @@ def main():
            "tmdb_id": person_id,
            "character": m.get("character", ""),
            "order": i,
+            "profile_path": m.get("profile_path"),
        })

        if (i + 1) % 5 == 0:
@@ -256,6 +257,90 @@ def main():
    with open(index_path, "w", encoding="utf-8") as f:
        json.dump(index, f, indent=2, ensure_ascii=False)

+    # 6. Insert identities into database and create file_identities links
+    print(f"[TKG-AGENT] Syncing {len(created_identities)} identities to database...")
+    
+    identities_table = f"{schema}.identities" if schema else "identities"
+    file_identities_table = f"{schema}.file_identities" if schema else "file_identities"
+    
+    conn = psycopg2.connect(args.db)
+    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
+    
+    synced_count = 0
+    for ci in created_identities:
+        try:
+            # Insert into identities table (ON CONFLICT DO UPDATE)
+            cur.execute(f"""
+                INSERT INTO {identities_table} (
+                    uuid, name, identity_type, source, status,
+                    tmdb_id, tmdb_profile, metadata, created_at, updated_at
+                ) VALUES (
+                    %s, %s, %s, %s, %s,
+                    %s, %s, %s, %s, %s
+                )
+                ON CONFLICT (tmdb_id) WHERE tmdb_id IS NOT NULL DO UPDATE SET
+                    tmdb_profile = EXCLUDED.tmdb_profile,
+                    metadata = EXCLUDED.metadata,
+                    updated_at = EXCLUDED.updated_at
+                RETURNING id
+            """, (
+                ci["identity_uuid"],
+                ci["name"],
+                "people",
+                "tmdb",
+                "pending",
+                ci["tmdb_id"],
+                f"https://image.tmdb.org/t/p/w185{ci['profile_path']}" if ci.get("profile_path") else None,
+                json.dumps({
+                    "tmdb_character": ci.get("character", ""),
+                    "tmdb_cast_order": ci.get("order", 0),
+                    "tmdb_movie_id": movie["id"],
+                    "tmdb_movie_title": movie["title"],
+                }),
+                now,
+                now,
+            ))
+            
+            identity_row = cur.fetchone()
+            if identity_row:
+                identity_id = identity_row["id"]
+                
+                # Insert into file_identities table (link file_uuid to identity_id)
+                cur.execute(f"""
+                    INSERT INTO {file_identities_table} (
+                        file_uuid, identity_id, confidence, metadata, created_at
+                    ) VALUES (
+                        %s, %s, %s, %s, %s
+                    )
+                    ON CONFLICT (file_uuid, identity_id) DO UPDATE SET
+                        confidence = EXCLUDED.confidence,
+                        metadata = EXCLUDED.metadata,
+                        created_at = EXCLUDED.created_at
+                """, (
+                    args.file_uuid,
+                    identity_id,
+                    1.0,
+                    json.dumps({
+                        "source": "tmdb_cast",
+                        "tmdb_movie_id": movie["id"],
+                        "tmdb_movie_title": movie["title"],
+                        "character": ci.get("character", ""),
+                        "cast_order": ci.get("order", 0),
+                    }),
+                    now,
+                ))
+                
+                synced_count += 1
+            
+        except Exception as e:
+            print(f"  [WARN] Failed to sync {ci['name']}: {e}", file=sys.stderr)
+    
+    conn.commit()
+    cur.close()
+    conn.close()
+    
+    print(f"[TKG-AGENT] Synced {synced_count}/{len(created_identities)} identities to database")
+
    # Write movie cache ({uuid}.tmdb.json) — simplified, no per-person data
    cache = {
        "file_uuid": args.file_uuid,