diff --git a/.env.development b/.env.development index 3251d43..df75941 100644 --- a/.env.development +++ b/.env.development @@ -29,7 +29,7 @@ REDIS_PASSWORD=accusys # Qdrant Vector Database - Collection isolation QDRANT_URL=http://localhost:6333 QDRANT_API_KEY=Test3200Test3200Test3200 -QDRANT_COLLECTION=momentry_dev_rule1 +QDRANT_COLLECTION=momentry_dev_v1 # Paths MOMENTRY_OUTPUT_DIR=/Users/accusys/momentry/output_dev diff --git a/.env.example b/.env.example index 9ff9a71..fa70426 100644 --- a/.env.example +++ b/.env.example @@ -24,7 +24,7 @@ MONGODB_DATABASE=momentry # =========================================== QDRANT_URL=http://localhost:6333 QDRANT_API_KEY=your_qdrant_api_key -QDRANT_COLLECTION=momentry_rule1 +QDRANT_COLLECTION=momentry_v2_full # =========================================== # API Server Configuration diff --git a/docs/RELEASE_PHASES.md b/docs/RELEASE_PHASES.md index 6040df9..dc043a3 100644 --- a/docs/RELEASE_PHASES.md +++ b/docs/RELEASE_PHASES.md @@ -22,18 +22,19 @@ Search / Query / Identity APIs - **Model** = 每部影片的產出 package(output_json + chunks + vectors) - **Engine** = momentry core,吃 model 提供 API(search, trace, identity) -如同模型的 tiny / small / medium / large,每個影片可有多個 model 版本: +每個影片可有多個 model 版本,命名保留升級空間: -| Model 版本 | 內容 | 觸發時機 | -|-----------|------|---------| -| `{uuid}_v1_tiny` | sentence chunk embedding | ASR + ASRX + Rule 1 完成 | -| `{uuid}_v2_full` | 完整 pipeline + 5W1H | 全部完成 | +| Model 版本 | Qdrant Collection | 內容 | 觸發時機 | +|-----------|------------------|------|---------| +| `{uuid}_v1` | `momentry_dev_v1` | sentence chunk embedding(base) | ASR + ASRX + Rule 1 完成 | +| `{uuid}_v2` | `momentry_dev_v2` | 完整 pipeline + 5W1H | 全部完成 | +| `{uuid}_v3` | - | 預留後續升級 | - | 各版本共存不覆蓋。 ## 階段劃分 -### Phase 1:Sentence Chunk Embedding(tiny model) +### Phase 1:Sentence Chunk Embedding(base model) **觸發時機**: ASR + ASRX 完成 + Rule 1 Ingestion + vectorize 完成 @@ -45,7 +46,7 @@ Search / Query / Identity APIs **用途**: 終端使用者可進行語意搜尋 -### Phase 2:完整 Pipeline(full model) +### Phase 2:完整 Pipeline(v2 model) **觸發時機**: 全部 processor 完成 + Rule 3 Ingestion + 5W1H Agent @@ -69,7 +70,7 @@ Rule 1 Ingestion (sentence chunks) ↓ vectorize_chunks (sentence embedding) ↓ -📦 Phase 1 release ───→ release/phase1/latest/ (tiny model) +📦 Phase 1 release ───→ release/phase1/latest/ (base model) ↓ 其他 processors 繼續 (yolo, face, pose, ocr, ...) ↓ @@ -109,5 +110,5 @@ release/ | 類比 | 訓練好的 weights | inference engine | | 內容 | `.json` + chunks + vectors | Rust binary | | 生命週期 | 每部影片產出一個 | 一個 binary 服務所有影片 | -| 版本 | `{uuid}_v1_tiny` / `{uuid}_v2_full` | `momentry_playground` / `momentry` | +| 版本 | `{uuid}_v1`(base) / `{uuid}_v2` | `momentry_playground` / `momentry` | | 交付對象 | 終端使用者 | 部署工程師 | diff --git a/scripts/release_pack.py b/scripts/release_pack.py index 8847c73..a50c16a 100644 --- a/scripts/release_pack.py +++ b/scripts/release_pack.py @@ -24,6 +24,8 @@ VERSION = "v1.0.0" DB_USER = os.environ.get("USER", "accusys") DB_NAME = "momentry" +QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333") +QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "momentry_dev_rule1_v2") def ts(): @@ -74,18 +76,51 @@ def pack_phase(file_uuid: str, phase: int) -> Path: idents_csv = pkg_dir / "identities.csv" run_sql(f"\\COPY (SELECT * FROM dev.identities) TO '{idents_csv}' CSV HEADER") + # 匯出 Qdrant collection 快照 + import urllib.request + qdrant_path = pkg_dir / "qdrant_points.jsonl" + try: + offset = None + with open(qdrant_path, "w") as qf: + while True: + params = f"limit=1000&with_payload=true&with_vectors=true" + if offset is not None: + params += f"&offset={offset}" + url = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/scroll?{params}" + req = urllib.request.Request(url) + with urllib.request.urlopen(req, timeout=30) as resp: + data = json.loads(resp.read()) + pts = data.get("result", {}).get("points", []) + if not pts: + break + for p in pts: + qf.write(json.dumps(p, ensure_ascii=False) + "\n") + # 從回傳的 next_page_offset 取得下一頁偏移量 + offset = data.get("result", {}).get("next_page_offset") + if offset is None: + break + n_points = sum(1 for _ in open(qdrant_path) if _.strip()) + print(f"[RELEASE] Qdrant: {n_points} points exported from '{QDRANT_COLLECTION}'") + except Exception as e: + print(f"[RELEASE] Qdrant export skipped: {e}") + if qdrant_path.exists(): + qdrant_path.unlink() + # RELEASE_INFO git_commit = subprocess.run( ["git", "-C", str(PROJECT), "rev-parse", "HEAD"], capture_output=True, text=True, timeout=10, ).stdout.strip() + model_name = f"{file_uuid}_v1" if phase == 1 else f"{file_uuid}_v2" info = pkg_dir / "RELEASE_INFO.txt" with open(info, "w") as fh: + fh.write(f"Model: {model_name}\n") fh.write(f"Phase: {phase}\n") fh.write(f"Version: {VERSION}\n") fh.write(f"Timestamp: {stamp}\n") fh.write(f"File UUID: {file_uuid}\n") + fh.write(f"Qdrant Collection: {QDRANT_COLLECTION}\n") fh.write(f"Git Commit: {git_commit}\n") fh.write(f"Packaged at: {datetime.now(timezone.utc).isoformat()}\n")