# momentry_core/scripts/pipeline_checklist.py

#!/opt/homebrew/bin/python3.11
"""
Pipeline Checklist — 獨立於 pipeline 之外的驗收檢查
每個 stage 完成後自動檢查產出是否到位，沒過的標記出來。
"""

import json, os, subprocess, sys
from datetime import datetime
from pathlib import Path

PROJECT = Path(__file__).resolve().parent.parent
OUTPUT_DIR = Path(os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev"))
DB_USER = os.environ.get("USER", "accusys")
DB_NAME = "momentry"
PSQL = "/Users/accusys/pgsql/18.3/bin/psql"


def run_sql(sql: str) -> str:
    """Run one SQL statement through psql and return its stripped stdout.

    psql is invoked with ``-t -A`` (tuples only, unaligned output), so a
    single-value query such as ``SELECT count(*)`` comes back as just the
    value.

    Args:
        sql: SQL text passed verbatim to ``psql -c``.

    Returns:
        psql's standard output with surrounding whitespace removed.

    Raises:
        subprocess.CalledProcessError: psql exited with a non-zero status
            (connection failure, SQL error, ...). The captured stderr is
            available on the exception.
        subprocess.TimeoutExpired: psql did not finish within 30 seconds.
    """
    # check=True: a failed query must not be mistaken for an empty result.
    r = subprocess.run(
        [PSQL, "-U", DB_USER, "-d", DB_NAME, "-t", "-A", "-c", sql],
        capture_output=True, text=True, timeout=30, check=True,
    )
    return r.stdout.strip()


def check_file(uuid: str, suffix: str) -> tuple[bool, str]:
    """Check that OUTPUT_DIR/<uuid>.<suffix> exists and is not empty.

    Args:
        uuid: File identifier used as the output file's stem.
        suffix: Everything after the first dot, e.g. ``"asr.json"``.

    Returns:
        ``(ok, message)``. ``ok`` is False when the path does not exist or
        has a size of zero bytes; the message names the file in those cases
        and reports the size in whole KB otherwise.
    """
    target = OUTPUT_DIR / f"{uuid}.{suffix}"
    if not target.exists():
        return False, f"missing: {target.name}"

    size = target.stat().st_size
    if size == 0:
        return False, f"empty: {target.name} (0 bytes)"
    return True, f"ok ({size // 1024}KB)"


stages = []  # (name, checks) — declared here; nothing in the visible script fills it
# File whose pipeline output is verified. Defaults to the original fixed
# sample; set MOMENTRY_FILE_UUID to check another file without editing the
# script (same convention as MOMENTRY_OUTPUT_DIR).
uuid = (os.environ.get("MOMENTRY_FILE_UUID") or "aeed71342a899fe4b4c57b7d41bcb692").strip()
# The value is interpolated into SQL text and file names further down, so
# accept nothing but hex digits and dashes.
if not uuid or not set(uuid) <= set("0123456789abcdefABCDEF-"):
    raise SystemExit(f"invalid MOMENTRY_FILE_UUID: {uuid!r}")
results = []  # one (stage name, passed, [(desc, ok, msg), ...]) entry per check() call

def check(name, checks):
    """Record the checks of one pipeline stage and report whether all passed.

    Args:
        name: Stage label; shown in the summary and used as the stage key in
            the JSON report.
        checks: Iterable of ``(description, ok, message)`` triples.

    Returns:
        True when every check's ``ok`` is truthy (also for an empty iterable).

    Side effects:
        Prints one line per check, including its message, so that a failed
        stage shows why it failed. Appends ``(name, passed, stage_results)``
        to the module-level ``results`` list.
    """
    stage_results = [(desc, ok, msg) for desc, ok, msg in checks]
    # The messages are not part of the summary or the JSON report; this is
    # the only place they are shown.
    for desc, ok, msg in stage_results:
        print(f"  {'✅' if ok else '❌'} {desc}: {msg}")
    passed = all(ok for _, ok, _ in stage_results)
    results.append((name, passed, stage_results))
    return passed


# Opening banner: blank line, 60-character rule, title, rule, blank line.
banner_rule = "=" * 60
print("", banner_rule, "Pipeline Checklist — " + uuid, banner_rule, "", sep="\n")

# ── Stages 1–2: ASR / ASRX output files ──
def _count_segments(suffix: str) -> int:
    """Return how many entries the stage's JSON output lists under "segments".

    Reads OUTPUT_DIR/<uuid>.<suffix>. Any failure to read or interpret the
    file yields 0, so the caller's "has segments" check fails instead of the
    script aborting.
    """
    try:
        # Explicit UTF-8 so decoding does not depend on the locale default.
        with open(OUTPUT_DIR / f"{uuid}.{suffix}", encoding="utf-8") as fh:
            payload = json.load(fh)
        return len(payload.get("segments", []))
    except (OSError, ValueError, AttributeError, TypeError):
        # ValueError covers invalid JSON and undecodable bytes;
        # AttributeError/TypeError cover a top-level value that is not a
        # dict or a "segments" value that has no length.
        return 0


# ── Stage 1: ASR ──
print("[1/8] ASR")
ok, msg = check_file(uuid, "asr.json")
# Only parse the file when it exists and is non-empty.
segments = _count_segments("asr.json") if ok else 0
check("ASR output", [
    ("asr.json exists", ok, msg),
    ("has segments", segments > 0, f"{segments} segments"),
])

# ── Stage 2: ASRX ──
print("[2/8] ASRX")
ok, msg = check_file(uuid, "asrx.json")
segments = _count_segments("asrx.json") if ok else 0
check("ASRX output", [
    ("asrx.json exists", ok, msg),
    ("has segments", segments > 0, f"{segments} segments"),
])

# ── Stages 3–7: database-backed checks ──
def _sql_count(sql: str) -> int | None:
    """Run a single-value COUNT query and return the value as an int.

    Returns None, after a note on stderr, when psql cannot be started, fails,
    times out, or prints something that is not an integer. The stage is then
    recorded as failed and the checklist still reaches its summary.
    """
    try:
        return int(run_sql(sql))
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # subprocess exceptions may carry the captured stderr; prefer it
        # over the generic exception text when present.
        detail = (getattr(exc, "stderr", None) or str(exc)).strip()
        print(f"  ! count query failed: {detail}", file=sys.stderr)
        return None


def _positive(count: int | None) -> bool:
    """Return True only for a count that was obtained and is above zero."""
    return count is not None and count > 0


def _describe(count: int | None, label: str) -> str:
    """Format a count for a check message, or say that the query failed."""
    if count is None:
        return f"{label}: query failed"
    return f"{count} {label}"


# ── Stage 3: Rule 1 + Sentence Chunks ──
print("[3/8] Rule 1 - Sentence Chunks")
chunk_count = _sql_count(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{uuid}' AND chunk_type='sentence'")
check("sentence chunks", [
    ("chunks exist", _positive(chunk_count), _describe(chunk_count, "sentence chunks")),
])

# ── Stage 4: Vectorization ──
print("[4/8] Vectorization")
vec_count = _sql_count(f"SELECT count(*) FROM dev.chunk_vectors WHERE uuid='{uuid}'")
col = os.environ.get('QDRANT_COLLECTION', 'momentry_dev_rule1_v2')
try:
    qdrant = subprocess.run(
        ["curl", "-s", "-X", "POST", f"http://localhost:6333/collections/{col}/points/count",
         "-H", "Content-Type: application/json", "-d", '{"exact": true}'],
        capture_output=True, text=True, timeout=10,
    )
    # Only checks that the response carries a "count" field; the number of
    # points itself is not inspected.
    qdrant_ok = '"count"' in qdrant.stdout
except (OSError, subprocess.SubprocessError):
    # curl missing or the request timed out: report it as "no response".
    qdrant_ok = False
check("vector embeddings", [
    ("PG vectors", _positive(vec_count), _describe(vec_count, "vectors")),
    ("Qdrant accessible", qdrant_ok, "ok" if qdrant_ok else "no response"),
])

# ── Stage 5: Face Trace ──
print("[5/8] Face Trace")
trace_count = _sql_count(f"SELECT count(DISTINCT trace_id) FROM dev.face_detections WHERE file_uuid='{uuid}' AND trace_id IS NOT NULL")
face_count = _sql_count(f"SELECT count(*) FROM dev.face_detections WHERE file_uuid='{uuid}' AND trace_id IS NOT NULL")
check("face trace", [
    ("traces exist", _positive(trace_count),
     f"{_describe(trace_count, 'traces')}, {_describe(face_count, 'detections')}"),
])

# ── Stage 6: TKG Graph ──
print("[6/8] TKG")
node_count = _sql_count(f"SELECT count(*) FROM dev.tkg_nodes WHERE file_uuid='{uuid}'")
edge_count = _sql_count(f"SELECT count(*) FROM dev.tkg_edges WHERE file_uuid='{uuid}'")
# NOTE(review): only the edge's source node is required to be a face_trace
# node here — confirm whether the target node should be constrained as well.
face_face = _sql_count(f"SELECT count(*) FROM dev.tkg_edges WHERE file_uuid='{uuid}' AND edge_type='CO_OCCURS_WITH' AND source_node_id IN (SELECT id FROM dev.tkg_nodes WHERE node_type='face_trace')")
check("TKG graph", [
    ("nodes", _positive(node_count), _describe(node_count, "nodes")),
    ("edges", _positive(edge_count), _describe(edge_count, "edges")),
    ("face-face edges", _positive(face_face), _describe(face_face, "face-face edges")),
])

# ── Stage 7: Trace Chunks ──
print("[7/8] Trace Chunks")
trace_chunks = _sql_count(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{uuid}' AND chunk_type='trace'")
check("trace chunks", [
    ("trace chunks exist", _positive(trace_chunks), _describe(trace_chunks, "trace chunks")),
])

# ── Stage 8: Phase 1 Release ──
print("[8/8] Phase 1 Release")
phase1_dir = PROJECT / "release" / "phase1" / "latest"
# The release counts as present only when both the directory and its
# RELEASE_INFO.txt file exist.
phase1_ok = phase1_dir.exists() and (phase1_dir / "RELEASE_INFO.txt").exists()
msg = "not found"
if phase1_ok:
    # Total size of every regular file below the release directory.
    total_size = 0
    for entry in phase1_dir.rglob("*"):
        if entry.is_file():
            total_size += entry.stat().st_size
    msg = f"ok ({total_size // 1024}KB)"
check("Phase 1 release", [("release dir exists", phase1_ok, msg)])


# ── Summary ──
summary_rule = "=" * 60
print("\n" + summary_rule)
print("SUMMARY")
print(summary_rule)
# One line per recorded stage, marked by its overall outcome.
for stage_name, stage_passed, _ in results:
    print(f"  {'✅' if stage_passed else '❌'} {stage_name}")
# The run passes only when every stage passed (vacuously true with no stages).
all_passed = all(stage_passed for _, stage_passed, _ in results)

print("\n" + ("PASS" if all_passed else "FAIL"))
print(summary_rule + "\n")

# Output as JSON for machine parsing: per stage, the overall flag plus a
# description -> ok mapping (check messages are not included).
stage_report = {}
for stage_name, stage_passed, stage_checks in results:
    stage_report[stage_name] = {
        "passed": stage_passed,
        "checks": {desc: ok_flag for desc, ok_flag, _ in stage_checks},
    }
report = {
    "uuid": uuid,
    "timestamp": datetime.utcnow().isoformat(),
    "passed": all_passed,
    "stages": stage_report,
}
print(json.dumps(report, indent=2))