feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index
Correction: asr-1.json format, generate/apply scripts
API: 37/37 endpoints fixed and tested
Docs: HANDOVER_V2.0.md for M4
This commit is contained in:
Accusys
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions

327
scripts/export_file.py Executable file
View File

@@ -0,0 +1,327 @@
#!/opt/homebrew/bin/python3.11
"""
momentry-export — 打包檔案歷程
將單一 file_uuid 的所有產出打包成可攜帶的 tar.gz
Usage:
python3 scripts/export_file.py <uuid> [--output <path>] [--include-video]
Example:
python3 scripts/export_file.py fa182e9c26145b2c1a932f73d1d484e5 --output /tmp/test_export.tar.gz
"""
import sys, os, json, argparse, tarfile, io, time
from pathlib import Path
from datetime import datetime
import psycopg2
import psycopg2.extras
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
TABLES = [
"pre_chunks", "chunks", "face_detections",
"processor_results", "processor_versions",
"videos", "api_keys",
]
def get_conn():
return psycopg2.connect(DB_URL)
def fetch_table(conn, table: str, uuid: str) -> list[dict]:
"""Fetch rows from a table that reference this UUID"""
uuid_columns = {"file_uuid", "uuid"}
# Get columns
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
cur.execute(
"SELECT column_name, data_type FROM information_schema.columns "
"WHERE table_schema = %s AND table_name = %s",
(SCHEMA, table),
)
cols = cur.fetchall()
uuid_col = None
for c in cols:
if c["column_name"] in uuid_columns:
uuid_col = c["column_name"]
break
if not uuid_col:
cur.close()
return []
# Fetch rows
cur.execute(
f"SELECT * FROM {SCHEMA}.{table} WHERE {uuid_col} = %s",
(uuid,),
)
rows = [dict(r) for r in cur.fetchall()]
cur.close()
return rows
def fetch_video_row(conn, uuid: str) -> dict | None:
"""Get video metadata"""
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
cur.execute(f"SELECT * FROM {SCHEMA}.videos WHERE file_uuid = %s", (uuid,))
row = cur.fetchone()
cur.close()
return dict(row) if row else None
def serialize_value(v):
"""Convert DB types to JSON-serializable"""
if isinstance(v, (datetime,)):
return v.isoformat()
if isinstance(v, bytes):
return list(v) # convert bytea to list of ints
if isinstance(v, (list,)):
# Check if it's a pgvector (list of floats)
return v
return v
def export_file(uuid: str, output_path: str, include_video: bool = False):
"""Export all data for a UUID into a tar.gz"""
t0 = time.time()
print(f"[EXPORT] Exporting {uuid}...")
conn = get_conn()
buf = io.BytesIO()
# 先確認是否完成
cur = conn.cursor()
cur.execute(
f"SELECT status FROM {SCHEMA}.monitor_jobs WHERE uuid = %s ORDER BY id DESC LIMIT 1",
(uuid,),
)
row = cur.fetchone()
job_status = row[0] if row else "unknown"
cur.close()
if job_status == "completed":
print(f" [EXPORT] Job status: ✅ {job_status}")
elif job_status == "failed":
print(f" [EXPORT] ⚠️ Job status: ❌ {job_status} (仍可匯出部分資料)")
elif job_status == "running":
print(f" [EXPORT] ⚠️ Job status: ⏳ {job_status} (處理中,產出不完全)")
else:
print(f" [EXPORT] ⚠️ Job status: {job_status}")
video = fetch_video_row(conn, uuid)
if not video:
print(f"[EXPORT] UUID {uuid} not found in videos table")
conn.close()
return False
# 歷程完整性檢查
print(f"\n ── 歷程完整性檢查 ──")
# Job status
completeness = {"job": job_status == "completed"}
# Processors: 7 processors all completed
cur = conn.cursor()
cur.execute(
f"SELECT processor, status FROM {SCHEMA}.processor_results "
f"WHERE file_uuid = %s ORDER BY processor",
(uuid,),
)
procs = {r[0]: r[1] for r in cur.fetchall()}
cur.close()
expected = ["asr", "asrx", "cut", "face", "ocr", "pose", "yolo"]
for p in expected:
st = procs.get(p, "missing")
completeness[f"proc_{p}"] = st == "completed"
completeness["processors"] = f"{sum(1 for p in expected if procs.get(p)=='completed')}/{len(expected)}"
# Output JSON files
output_dir = Path(OUTPUT_DIR)
json_files = sorted(output_dir.glob(f"{uuid}.*.json"))
completeness["output_jsons"] = len(json_files)
# Face detections
cur = conn.cursor()
cur.execute(
f"SELECT count(*) FROM {SCHEMA}.face_detections WHERE file_uuid = %s",
(uuid,),
)
completeness["face_detections"] = cur.fetchone()[0]
cur.close()
# Chunks (Rule 1)
cur = conn.cursor()
cur.execute(
f"SELECT count(*) FROM {SCHEMA}.chunks WHERE file_uuid = %s",
(uuid,),
)
completeness["chunks"] = cur.fetchone()[0]
cur.close()
# Print completeness report
for k, v in completeness.items():
icon = "" if v is True else ("" if v is False else "")
print(f" {icon} {k}: {v}")
# Decide if export is viable
has_core_data = completeness["output_jsons"] > 0 or completeness["face_detections"] > 0 or completeness["chunks"] > 0
if not has_core_data and job_status != "completed":
print(f"\n ⛔ 歷程不完整,無核心產出,中止匯出")
conn.close()
return False
print(f" ─────────────────\n")
with tarfile.open(fileobj=buf, mode="w:gz") as tar:
manifest = {
"exported_at": datetime.now().isoformat(),
"version": "1.0",
"file_uuid": uuid,
"file_name": video.get("file_name"),
"duration": video.get("duration"),
"fps": float(video.get("fps") or 0),
"width": video.get("width"),
"height": video.get("height"),
"total_frames": video.get("total_frames"),
"include_video": include_video,
"completeness": {k: str(v) if not isinstance(v, (bool, int, str)) else v
for k, v in completeness.items()},
"merge_policy": {
"identities": "merge_by_name",
"description": "匯入時 identity 依名稱比對,已存在則合併(保留 target 的 identity_id不存在則新增",
},
}
_add_json(tar, "manifest.json", manifest)
# 2. Video metadata (videos table row)
_add_json(tar, "data/video.json", video)
# 3. DB tables
for table in TABLES:
rows = fetch_table(conn, table, uuid)
if rows:
_add_json(tar, f"data/{table}.json", rows)
print(f" [EXPORT] {table}: {len(rows)} rows")
else:
print(f" [EXPORT] {table}: (empty)")
# 4. Face detection embeddings (handle vector type)
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
cur.execute(
f"SELECT id, file_uuid, frame_number, trace_id, x, y, width, height, "
f"confidence, identity_id FROM {SCHEMA}.face_detections WHERE file_uuid = %s",
(uuid,),
)
fd_rows = [dict(r) for r in cur.fetchall()]
cur.close()
if fd_rows:
_add_json(tar, "data/face_detections_meta.json", fd_rows)
print(f" [EXPORT] face_detections (meta): {len(fd_rows)} rows")
else:
print(f" [EXPORT] face_detections: (empty)")
# 5. Identity 關聯資料
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
# 找出此 file_uuid 相關的所有 identity_id
cur.execute(
f"SELECT DISTINCT identity_id FROM {SCHEMA}.face_detections "
f"WHERE file_uuid = %s AND identity_id IS NOT NULL",
(uuid,),
)
identity_ids = [r["identity_id"] for r in cur.fetchall()]
if identity_ids:
# 查 identities 表
placeholders = ",".join(["%s"] * len(identity_ids))
cur.execute(
f"SELECT * FROM {SCHEMA}.identities WHERE id IN ({placeholders})",
identity_ids,
)
ident_rows = [dict(r) for r in cur.fetchall()]
_add_json(tar, "data/identities.json", ident_rows)
print(f" [EXPORT] identities: {len(ident_rows)} rows")
# 查 identity_bindings
cur.execute(
f"SELECT * FROM {SCHEMA}.identity_bindings "
f"WHERE identity_id IN ({placeholders})",
identity_ids,
)
bind_rows = [dict(r) for r in cur.fetchall()]
if bind_rows:
_add_json(tar, "data/identity_bindings.json", bind_rows)
print(f" [EXPORT] identity_bindings: {len(bind_rows)} rows")
# 查 file_identities若 table 存在)
try:
cur.execute(
f"SELECT * FROM {SCHEMA}.file_identities WHERE file_uuid = %s",
(uuid,),
)
fi_rows = [dict(r) for r in cur.fetchall()]
if fi_rows:
_add_json(tar, "data/file_identities.json", fi_rows)
print(f" [EXPORT] file_identities: {len(fi_rows)} rows")
except Exception:
pass # table 可能不存在
else:
print(f" [EXPORT] identities: (none bound to this file)")
cur.close()
# 6. Output JSON files
output_dir = Path(OUTPUT_DIR)
json_files = list(output_dir.glob(f"{uuid}.*.json"))
for jf in json_files:
arcname = f"output/{jf.name}"
tar.add(str(jf), arcname=arcname)
print(f" [EXPORT] output/{jf.name} ({jf.stat().st_size / 1024:.0f}KB)")
print(f" [EXPORT] output JSONs: {len(json_files)} files")
# 7. Original video file (optional)
if include_video and video.get("file_path"):
src = video["file_path"]
if os.path.exists(src):
tar.add(src, arcname="original/" + os.path.basename(src))
print(f" [EXPORT] original video: {src}")
else:
print(f" [WARN] Video file not found: {src}")
conn.close()
# Write to disk
with open(output_path, "wb") as f:
f.write(buf.getvalue())
size_mb = os.path.getsize(output_path) / 1e6
elapsed = time.time() - t0
print(f"\n[EXPORT] Done: {output_path} ({size_mb:.1f}MB, {elapsed:.1f}s)")
return True
def _add_json(tar: tarfile.TarFile, arcname: str, data):
"""Add a JSON file to the tar archive"""
raw = json.dumps(data, ensure_ascii=False, default=str, indent=2).encode()
info = tarfile.TarInfo(name=arcname)
info.size = len(raw)
info.mtime = int(time.time())
tar.addfile(info, io.BytesIO(raw))
def main():
parser = argparse.ArgumentParser(description="Export file processing history")
parser.add_argument("uuid", help="File UUID to export")
parser.add_argument("--output", "-o", default=None,
help="Output tar.gz path (default: {uuid}.tar.gz)")
parser.add_argument("--include-video", action="store_true",
help="Include original video file in export")
args = parser.parse_args()
output = args.output or f"{args.uuid}.tar.gz"
success = export_file(args.uuid, output, args.include_video)
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()