feat: frame/time pipeline split + output validation
- Add PipelineType enum + pipeline() to ProcessorType - Split ProcessorPool into frame_slots (max 2) and time_slots (max 1) - Add can_start_for() for pipeline-aware scheduling - Add validate_output_file() — checks JSON validity before marking complete - Add 3 unit tests for validate_output_file() - Create DESIGN/FRAME_TIME_PIPELINE_V1.0.md (492 lines)
This commit is contained in:
492
docs_v1.0/DESIGN/FRAME_TIME_PIPELINE_V1.0.md
Normal file
492
docs_v1.0/DESIGN/FRAME_TIME_PIPELINE_V1.0.md
Normal file
@@ -0,0 +1,492 @@
|
||||
---
|
||||
title: "Frame / Time 雙產線分流協作設計 v1.0"
|
||||
version: "1.0"
|
||||
date: "2026-05-23"
|
||||
author: "M5"
|
||||
status: "draft"
|
||||
scope: "architecture, worker, storage"
|
||||
---
|
||||
|
||||
# Frame / Time 雙產線分流協作設計 v1.0
|
||||
|
||||
| Scope | Status | Applies to |
|
||||
|---------|--------|------------|
|
||||
| architecture / worker / storage | draft | momentry_core worker pipeline |
|
||||
|
||||
---
|
||||
|
||||
## 1. 緣起與問題
|
||||
|
||||
### 1.1 現狀問題
|
||||
|
||||
worker 將所有 processor 混在一起平行執行,導致:
|
||||
|
||||
| 問題 | 說明 |
|
||||
|------|------|
|
||||
| OOM | `max_concurrent=6` 時 6 個 Python 行程同時載入模型 → 記憶體不足被 kill |
|
||||
| 資源競爭 | 多個 processor 各自開 ffmpeg decode 同一部影片 → 6 倍 decode |
|
||||
| 重試粒度粗 | 一個 processor 失敗 → 整部片全部重來 |
|
||||
| 進度不精確 | 0% → 100%,中間無細粒度進度 |
|
||||
|
||||
### 1.2 手動 vs Worker 差異
|
||||
|
||||
| 面向 | 手動執行 | Worker 自動化 |
|
||||
|------|---------|--------------|
|
||||
| 執行方式 | 循序,一次一個 processor | 平行,最多 max_concurrent 個 |
|
||||
| 產出檢查 | 人工確認 JSON 內容正確 | `output_path.exists()` 僅檢查檔案存在 |
|
||||
| 資源 | 單一模型在記憶體 | 多個模型競爭記憶體 |
|
||||
|
||||
### 1.3 核心結論
|
||||
|
||||
兩個根本問題必須解決:
|
||||
|
||||
1. **產線分流** — Frame-base 與 Time-base processor 不應混合排程
|
||||
2. **Frame-level resource management** — 透過 MarkbaseFMS 統一 frame 存取
|
||||
|
||||
---
|
||||
|
||||
## 2. 雙產線架構
|
||||
|
||||
### 2.1 Pipeline Overview
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
Input[Input Video] --> Probe
|
||||
Probe -->|frame info| FMS[MarkbaseFMS]
|
||||
Probe -->|audio track| TimePipe[Time Pipeline]
|
||||
|
||||
FMS -->|frame batches| FramePipe[Frame Pipeline]
|
||||
FMS -->|cache / align / convert| FMS
|
||||
|
||||
subgraph FramePipe [Frame Pipeline]
|
||||
CUT[CUT - scene detection]
|
||||
YOLO[YOLO - object detection]
|
||||
Face[Face - face detection]
|
||||
OCR[OCR - text detection]
|
||||
Pose[Pose - pose estimation]
|
||||
end
|
||||
|
||||
subgraph TimePipe [Time Pipeline]
|
||||
ASR[ASR - speech recognition]
|
||||
ASRX[ASRX - speaker diarization]
|
||||
end
|
||||
|
||||
CUT --> YOLO
|
||||
CUT --> Face
|
||||
CUT --> OCR
|
||||
CUT --> Pose
|
||||
|
||||
ASR --> ASRX
|
||||
|
||||
YOLO & Face & OCR & Pose --> Merge[Merge Processor Results]
|
||||
ASR & ASRX --> Merge
|
||||
|
||||
Face --> Lip[Lip Sync]
|
||||
ASR --> Lip
|
||||
ASRX --> Lip
|
||||
|
||||
Merge -->|all essential done| PostProcess
|
||||
Lip --> PostProcess
|
||||
|
||||
subgraph PostProcess [Post-processing]
|
||||
R1[Rule 1 Chunking]
|
||||
R3[Rule 3 Chunking]
|
||||
TK[TKG Build]
|
||||
W1[5W1H Agent]
|
||||
ID[Identity Agent]
|
||||
end
|
||||
```
|
||||
|
||||
### 2.2 各產線定義
|
||||
|
||||
#### Frame Pipeline(frame-based)
|
||||
|
||||
| Processor | 輸入 | 輸出 | 瓶頸資源 | 產線 |
|
||||
|-----------|------|------|---------|------|
|
||||
| CUT | frame (降解析) | scene.json | CPU | Frame |
|
||||
| YOLO | frame batch | yolo.json | GPU | Frame |
|
||||
| Face | frame batch | face.json | ANE/GPU | Frame |
|
||||
| OCR | frame batch | ocr.json | CPU | Frame |
|
||||
| Pose | frame batch | pose.json | GPU | Frame |
|
||||
|
||||
#### Time Pipeline(time-based)
|
||||
|
||||
| Processor | 輸入 | 輸出 | 瓶頸資源 | 產線 |
|
||||
|-----------|------|------|---------|------|
|
||||
| ASR | audio stream | asr.json | GPU/CPU | Time |
|
||||
| ASRX | audio stream + ASR result | asrx.json | CPU | Time |
|
||||
|
||||
#### 合流點
|
||||
|
||||
| 項目 | 需要 | 產出 |
|
||||
|------|------|------|
|
||||
| Lip Sync | Face + ASR + ASRX | lip.json (who speaks when) |
|
||||
| Rule 1 Chunking | ASR + ASRX | sentence chunks |
|
||||
| Rule 3 Chunking | CUT + ASR | scene chunks |
|
||||
| TKG Build | 所有 processor | tkg_nodes / tkg_edges |
|
||||
| 5W1H Agent | CUT + ASR | story summary |
|
||||
| Identity Agent | Face + ASRX | identity bindings |
|
||||
|
||||
---
|
||||
|
||||
## 3. MarkbaseFMS 底層設計
|
||||
|
||||
### 3.1 三層架構
|
||||
|
||||
```
|
||||
Application
|
||||
YOLO / Face / OCR / Pose / CUT
|
||||
讀取 frame buffer → 直接做 inference
|
||||
│
|
||||
│ frame-aligned access
|
||||
▼
|
||||
┌───────────────────────────────────┐
|
||||
│ FMS Filesystem Layer │
|
||||
│ Layout: frame data 連續存放 │
|
||||
│ 不跨 frame split │
|
||||
│ 格式: opaque raw buffer │
|
||||
│ metadata: 獨立區域 │
|
||||
│ read-ahead: 預取下一個 Block │
|
||||
│ alignment: 每 frame page-aligned │
|
||||
└──────────────────┬────────────────┘
|
||||
│ page-aligned (4096)
|
||||
┌──────────────────▼────────────────┐
|
||||
│ FMS Cache Layer │
|
||||
│ unit: FrameBlock (64 frames) │
|
||||
│ alignment: page boundary/frame │
|
||||
│ eviction: LRU on Block │
|
||||
│ pin: 使用中的 frame 不 evict │
|
||||
│ prefetch: 預拉下一個 Block │
|
||||
│ Direct I/O bypass OS cache │
|
||||
└──────────────────┬────────────────┘
|
||||
│ block-aligned (4K / 64K)
|
||||
┌──────────────────▼────────────────┐
|
||||
│ Block Device / Storage │
|
||||
│ sector alignment 4K │
|
||||
│ FrameBlock = N 個連續 sectors │
|
||||
│ 無跨 sector split │
|
||||
│ atomic write per block │
|
||||
│ O_DIRECT 直接 IO │
|
||||
└───────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 3.2 對齊原則
|
||||
|
||||
#### 各層對齊要求
|
||||
|
||||
| 層級 | 對齊單位 | 原因 |
|
||||
|------|---------|------|
|
||||
| Block Device | 4K sector | 現代 SSD 原生 sector,避免 RMW |
|
||||
| Cache | 4096 page | `mmap` + `madvise` 大頁面,減少 TLB miss |
|
||||
| Filesystem | frame block (64 frames) | 連續 layout,預測性 read-ahead |
|
||||
| Frame Buffer | 16 bytes stride | NEON SIMD,MPS/ANE texture 要求 |
|
||||
| Block Index | power of 2 | index 用 bit shift + mask,無需除法 |
|
||||
|
||||
#### Frame Buffer Layout
|
||||
|
||||
```
|
||||
每個 frame 的 raw buffer:
|
||||
|
||||
┌──────────────┐
|
||||
│ Y Plane │ ← 16-byte aligned stride, page-aligned offset
|
||||
│ (width×h) │
|
||||
├──────────────┤
|
||||
│ UV Plane │ ← 16-byte aligned stride
|
||||
│ (w/2×h/2×2) │ (NV12 interleaved)
|
||||
└──────────────┘
|
||||
|
||||
frame buffer offset: page-aligned (4096)
|
||||
row stride: align(width * pixel_size, 16)
|
||||
frame size: align(total_bytes, page_size)
|
||||
```
|
||||
|
||||
### 3.3 Block 排列方式
|
||||
|
||||
```
|
||||
每個 Block 包含連續 64 frames(configurable):
|
||||
|
||||
Block index = frame_num >> 6 ← bit shift (frame_num / 64)
|
||||
Frame offset = base + (frame_num & 63) * frame_size ← bit mask
|
||||
|
||||
Block Dispatching Strategy:
|
||||
|
||||
Worker 要求 "batch 0, format=RGB, width=640"
|
||||
FMS:
|
||||
1. Check Block Cache (RAM):
|
||||
Block 0 frames 0-63 是否已 decode?
|
||||
✅ hit → 直接回傳
|
||||
❌ miss → decode 64 frames → 存入 cache → 回傳
|
||||
2. Format Conversion (on-the-fly):
|
||||
原始 NV12 → per request:
|
||||
YOLO → RGB (float32 normalized)
|
||||
Face → RGB (uint8)
|
||||
OCR → Gray (uint8)
|
||||
Pose → RGB (uint8)
|
||||
CUT → RGB (降解析, uint8)
|
||||
```
|
||||
|
||||
### 3.4 儲存 Layout
|
||||
|
||||
```
|
||||
Disk Layout (per file_uuid):
|
||||
|
||||
┌──────────────────────────────────────────────┐
|
||||
│ Metadata Region │
|
||||
│ - file_uuid (32 bytes) │
|
||||
│ - total_frames (u32) │
|
||||
│ - width, height (u32 × 2) │
|
||||
│ - fps (f64) │
|
||||
│ - pixel_format (u8 : 0=NV12) │
|
||||
│ - block_capacity (u32, default 64) │
|
||||
│ - block_count (u32) │
|
||||
│ - frame_size (u32, bytes per raw frame)│
|
||||
│ - block_offsets [u64 × block_count] │
|
||||
│ Padding to 4096 │
|
||||
├──────────────────────────────────────────────┤
|
||||
│ Data Region │
|
||||
│ Block 0: frames [0, 63] │
|
||||
│ frame_0: [frame_size bytes] ← page align│
|
||||
│ frame_1: [frame_size bytes] │
|
||||
│ ... │
|
||||
│ frame_63: [frame_size bytes] │
|
||||
│ Block 1: frames [64, 127] │
|
||||
│ ... │
|
||||
│ Block N: frames [N*64, ...] │
|
||||
└──────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### 3.5 記憶體管理
|
||||
|
||||
```
|
||||
┌────────────────────────────────────────────┐
|
||||
│ FMS Cache │
|
||||
├──────────┬──────────┬──────────┬───────────┤
|
||||
│ Block 0 │ Block 1 │ Block 2 │ ... │ ← mmap'd
|
||||
│ (64 fr) │ (64 fr) │ (64 fr) │ │ or anonymous
|
||||
├──────────┴──────────┴──────────┴───────────┤
|
||||
│ LRU eviction policy │
|
||||
│ max_memory = configurable │
|
||||
│ (default: 256 frames ~0.8GB @1080p NV12) │
|
||||
│ pin_count: 正在被 processor 存取的 frame │
|
||||
│ pinned frame 不參與 LRU eviction │
|
||||
└─────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Worker 調度器修改
|
||||
|
||||
### 4.1 Processor 產線標記
|
||||
|
||||
```rust
|
||||
enum PipelineType {
|
||||
Frame, // frame-based
|
||||
Time, // time-based
|
||||
Cross, // needs both frame + time
|
||||
}
|
||||
|
||||
impl ProcessorType {
|
||||
fn pipeline(&self) -> PipelineType {
|
||||
match self {
|
||||
Self::Cut
|
||||
| Self::Yolo
|
||||
| Self::Face
|
||||
| Self::Ocr
|
||||
| Self::Pose => PipelineType::Frame,
|
||||
|
||||
Self::Asr | Self::Asrx => PipelineType::Time,
|
||||
|
||||
Self::Story
|
||||
| Self::Tkg
|
||||
| Self::Identity
|
||||
| Self::FiveW1h
|
||||
| Self::Caption => PipelineType::Cross,
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4.2 資源配額
|
||||
|
||||
| 產線 | Max Concurrent | 策略 |
|
||||
|------|---------------|------|
|
||||
| Frame | 2 | 最多同時 2 個 frame processor |
|
||||
| Time | 1 | 一次只跑 1 個 audio processor |
|
||||
| Cross | 1 | Frame + Time 都完成後才允許 |
|
||||
|
||||
Frame pipeline 內部建議順序:
|
||||
|
||||
```
|
||||
CUT (先確定場景邊界)
|
||||
→ YOLO / Face 可同時 (GPU-bound)
|
||||
→ OCR / Pose 可同時 (CPU/GPU mixed)
|
||||
```
|
||||
|
||||
### 4.3 產出驗證加強
|
||||
|
||||
```rust
|
||||
// 目前 (job_worker.rs:346):
|
||||
if output_path.exists() {
|
||||
mark_completed();
|
||||
}
|
||||
|
||||
// 改為:
|
||||
if output_path.exists() {
|
||||
match validate_processor_output(&output_path, processor_type) {
|
||||
Ok(true) => mark_completed(),
|
||||
Ok(false) => retry_or_fail(),
|
||||
Err(e) => mark_failed(e),
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_processor_output(path: &Path, pt: ProcessorType) -> Result<bool> {
|
||||
let content = std::fs::read_to_string(path)?;
|
||||
let json: serde_json::Value = serde_json::from_str(&content)?;
|
||||
// 至少要有基本欄位
|
||||
match pt {
|
||||
ProcessorType::Asr => json.get("segments").is_some(),
|
||||
ProcessorType::Yolo => json.get("frames").is_some(),
|
||||
ProcessorType::Face => json.get("frames").is_some(),
|
||||
// ...
|
||||
};
|
||||
Ok(true) // or false
|
||||
}
|
||||
```
|
||||
|
||||
### 4.4 啟動順序
|
||||
|
||||
```
|
||||
1. Probe → 決定 frame 數、audio 格式
|
||||
2. CUT (Frame Pipeline 第一階段—決定場景邊界)
|
||||
3. Frame Pipeline 平行: YOLO / Face / OCR / Pose
|
||||
Time Pipeline 平行: ASR
|
||||
4. ASRX (依賴 ASR 結果)
|
||||
5. Lip Sync (等待 Face + ASR + ASRX)
|
||||
6. 所有 processor 完成 → 合流:
|
||||
- Rule 1 / Rule 3 Chunking
|
||||
- Face Trace / TKG
|
||||
- 5W1H / Identity Agent
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Processor 串接 FMS
|
||||
|
||||
### 5.1 FMS API
|
||||
|
||||
```rust
|
||||
// Frame access API (async)
|
||||
impl FmsClient {
|
||||
/// 取得單一 frame buffer
|
||||
async fn get_frame(
|
||||
&self,
|
||||
file_uuid: &str,
|
||||
frame_num: u32,
|
||||
format: PixelFormat,
|
||||
width: u32,
|
||||
height: u32,
|
||||
) -> Result<RawFrame>;
|
||||
|
||||
/// 取得一個 batch frames (block-aligned)
|
||||
async fn get_block(
|
||||
&self,
|
||||
file_uuid: &str,
|
||||
block_idx: u32,
|
||||
format: PixelFormat,
|
||||
width: u32,
|
||||
height: u32,
|
||||
) -> Result<FrameBlock>;
|
||||
|
||||
/// 串流 frames (lazy batch iteration)
|
||||
fn stream_frames(
|
||||
&self,
|
||||
file_uuid: &str,
|
||||
range: Range<u32>,
|
||||
format: PixelFormat,
|
||||
width: u32,
|
||||
height: u32,
|
||||
) -> FrameStream;
|
||||
}
|
||||
```
|
||||
|
||||
### 5.2 YOLO 為例:processor 修改
|
||||
|
||||
```
|
||||
目前:
|
||||
processor::process_yolo(video_path, output_path, uuid)
|
||||
→ 自己開 ffmpeg decode
|
||||
→ 逐 frame 處理
|
||||
→ 寫入 yolo.json
|
||||
|
||||
改為 FMS-based:
|
||||
processor::process_yolo_with_fms(fms, uuid, output_path)
|
||||
→ let results = Vec::new()
|
||||
→ for batch in fms.stream_frames(uuid, 0..total, RGB, 640, 640):
|
||||
→ let detections = yolo_model.infer(batch)
|
||||
→ results.push(detections)
|
||||
→ write partial → yolo.partial.{batch_idx}.json
|
||||
→ merge_partials() → yolo.json
|
||||
```
|
||||
|
||||
### 5.3 Partial Result Merge
|
||||
|
||||
```
|
||||
Frame Pipeline 產出多個 partial JSON:
|
||||
|
||||
yolo.0000.json (frames 0-63)
|
||||
yolo.0001.json (frames 64-127)
|
||||
...
|
||||
yolo.0063.json (frames 4032-4095)
|
||||
|
||||
```
|
||||
|
||||
Worker 合併為單一 yolo.json:
|
||||
|
||||
```rust
|
||||
async fn merge_partials(
|
||||
uuid: &str,
|
||||
processor: &str,
|
||||
partial_dir: &Path,
|
||||
output_path: &Path,
|
||||
) -> Result<()> {
|
||||
let partials = read_sorted_partials(uuid, processor, partial_dir);
|
||||
let merged = merge_detections(partials);
|
||||
write_json(output_path, merged)
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. 實作優先序
|
||||
|
||||
| 優先 | 項目 | 說明 |
|
||||
|------|------|------|
|
||||
| P0 | Worker 產線分流 | ProcessorType::pipeline() + frame_slots / time_slots 分開計算 |
|
||||
| P0 | 產出驗證加強 | `output_path.exists()` + JSON validity + schema check |
|
||||
| P1 | FMS FrameBlock 資料結構 | 含 16-byte stride / 4096 page alignment |
|
||||
| P1 | mmap-based frame cache | page-aligned frame buffers, LRU eviction |
|
||||
| P2 | FMS API 實作 | get_frame / get_block / stream_frames |
|
||||
| P2 | YOLO processor 串接 FMS | 改為 stream_frames 方式 |
|
||||
| P2 | 其他 processor 串接 FMS | Face / OCR / Pose / CUT |
|
||||
| P3 | FMS Direct I/O + sector alignment | O_DIRECT bypass OS page cache |
|
||||
| P3 | Prefetch / readahead | 預測下一個 block 並提前載入 |
|
||||
|
||||
---
|
||||
|
||||
## 7. 注意事項
|
||||
|
||||
| # | 項目 |
|
||||
|---|------|
|
||||
| 1 | raw buffer 格式依 processor 需求轉換,FMS 負責 NV12 → RGB / Gray |
|
||||
| 2 | Time pipeline 的 ASR/ASRX **不經過 FMS**,直接處理 audio stream |
|
||||
| 3 | macOS 的 `mmap` 支援 page-aligned,但 `O_DIRECT` 需確認 compat |
|
||||
| 4 | FrameBlock size (64) 可配置,但需維持 power-of-2 |
|
||||
| 5 | FMS 只管理 frame lifecycle,不處理 processor-specific 邏輯 |
|
||||
| 6 | 多 processor 共享 frame 時,FMS 保證只 decode 一次 |
|
||||
|
||||
---
|
||||
|
||||
## 版本歷史
|
||||
|
||||
| Version | Date | Author | Changes |
|
||||
|---------|------|--------|---------|
|
||||
| 1.0 | 2026-05-23 | M5 | 初版 |
|
||||
@@ -43,7 +43,7 @@ pub use mongodb_db::MongoDb;
|
||||
pub use postgres_db::{
|
||||
Bm25Result, CandidateRecord, CreateApiKeyConfig, FileIdentityRecord, FileRecord,
|
||||
HybridSearchResult, IdentityChunkRecord, IdentityDetailRecord, IdentityFaceRecord,
|
||||
IdentityFileRecord, MonitorJob, MonitorJobStats, MonitorJobStatus, PostgresDb,
|
||||
IdentityFileRecord, MonitorJob, MonitorJobStats, MonitorJobStatus, PipelineType, PostgresDb,
|
||||
ProcessorJobStatus, ProcessorResult, ProcessorType, ResourceRecord, VideoRecord, VideoStatus,
|
||||
};
|
||||
pub use qdrant_db::{QdrantDb, VectorPayload};
|
||||
|
||||
@@ -559,6 +559,31 @@ impl ProcessorType {
|
||||
ProcessorType::FiveW1H,
|
||||
]
|
||||
}
|
||||
|
||||
/// Pipeline type for scheduling: Frame-based, Time-based, or Cross (needs both).
|
||||
pub fn pipeline(&self) -> PipelineType {
|
||||
match self {
|
||||
Self::Cut
|
||||
| Self::Yolo
|
||||
| Self::Face
|
||||
| Self::Ocr
|
||||
| Self::Pose
|
||||
| Self::VisualChunk
|
||||
| Self::Scene => PipelineType::Frame,
|
||||
|
||||
Self::Asr | Self::Asrx => PipelineType::Time,
|
||||
|
||||
Self::Story | Self::FiveW1H => PipelineType::Cross,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Pipeline classification for worker scheduling.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum PipelineType {
|
||||
Frame,
|
||||
Time,
|
||||
Cross,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
|
||||
|
||||
@@ -12,8 +12,8 @@ use crate::core::chunk::{rule1_ingest, rule3_ingest};
|
||||
use crate::core::config::OUTPUT_DIR;
|
||||
use crate::core::db::qdrant_db::QdrantDb;
|
||||
use crate::core::db::{
|
||||
schema, MonitorJobStatus, PostgresDb, ProcessorJobStatus, RedisClient, VectorPayload,
|
||||
VideoStatus,
|
||||
schema, MonitorJobStatus, PipelineType, PostgresDb, ProcessorJobStatus, ProcessorType,
|
||||
RedisClient, VectorPayload, VideoStatus,
|
||||
};
|
||||
use crate::core::embedding::Embedder;
|
||||
use crate::core::processor::heuristic_scene::generate_scene_meta;
|
||||
@@ -338,62 +338,81 @@ impl JobWorker {
|
||||
.await?;
|
||||
|
||||
// Check if output file already exists on disk (source of truth)
|
||||
// and validate that it's a parseable JSON with expected structure.
|
||||
let output_path = PathBuf::from(OUTPUT_DIR.as_str()).join(format!(
|
||||
"{}.{}.json",
|
||||
job.uuid,
|
||||
processor_type.as_str()
|
||||
));
|
||||
if output_path.exists() {
|
||||
info!(
|
||||
"Processor {} output file exists, marking completed and skipping",
|
||||
processor_type.as_str()
|
||||
);
|
||||
self.db
|
||||
.update_processor_progress(
|
||||
&job.uuid,
|
||||
processor_type.as_str(),
|
||||
total_frames,
|
||||
total_frames,
|
||||
"completed",
|
||||
)
|
||||
.await?;
|
||||
let total = total_frames as i32;
|
||||
self.redis
|
||||
.update_worker_processor_status(
|
||||
&job.uuid,
|
||||
processor_type.as_str(),
|
||||
"completed",
|
||||
None,
|
||||
total,
|
||||
total,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
)
|
||||
.await?;
|
||||
started_count += 1;
|
||||
// 覆寫 result_map 讓相依性檢查能正確判斷
|
||||
result_map.insert(
|
||||
*processor_type,
|
||||
crate::core::db::ProcessorResult {
|
||||
id: 0,
|
||||
job_id: job.id,
|
||||
processor_type: *processor_type,
|
||||
status: ProcessorJobStatus::Completed,
|
||||
started_at: None,
|
||||
completed_at: None,
|
||||
duration_secs: None,
|
||||
chunks_produced: 0,
|
||||
frames_processed: total_frames as i32,
|
||||
output_size_bytes: 0,
|
||||
error_message: None,
|
||||
output_data: None,
|
||||
retry_count: 0,
|
||||
created_at: String::new(),
|
||||
updated_at: String::new(),
|
||||
},
|
||||
);
|
||||
continue;
|
||||
match validate_output_file(&output_path, *processor_type) {
|
||||
Ok(true) => {
|
||||
info!(
|
||||
"Processor {} output file exists and valid, marking completed",
|
||||
processor_type.as_str()
|
||||
);
|
||||
self.db
|
||||
.update_processor_progress(
|
||||
&job.uuid,
|
||||
processor_type.as_str(),
|
||||
total_frames,
|
||||
total_frames,
|
||||
"completed",
|
||||
)
|
||||
.await?;
|
||||
let total = total_frames as i32;
|
||||
self.redis
|
||||
.update_worker_processor_status(
|
||||
&job.uuid,
|
||||
processor_type.as_str(),
|
||||
"completed",
|
||||
None,
|
||||
total,
|
||||
total,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
)
|
||||
.await?;
|
||||
started_count += 1;
|
||||
result_map.insert(
|
||||
*processor_type,
|
||||
crate::core::db::ProcessorResult {
|
||||
id: 0,
|
||||
job_id: job.id,
|
||||
processor_type: *processor_type,
|
||||
status: ProcessorJobStatus::Completed,
|
||||
started_at: None,
|
||||
completed_at: None,
|
||||
duration_secs: None,
|
||||
chunks_produced: 0,
|
||||
frames_processed: total_frames as i32,
|
||||
output_size_bytes: 0,
|
||||
error_message: None,
|
||||
output_data: None,
|
||||
retry_count: 0,
|
||||
created_at: String::new(),
|
||||
updated_at: String::new(),
|
||||
},
|
||||
);
|
||||
continue;
|
||||
}
|
||||
Ok(false) => {
|
||||
warn!(
|
||||
"Processor {} output file exists but content invalid, will reprocess",
|
||||
processor_type.as_str()
|
||||
);
|
||||
// fall through → reprocess
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"Processor {} output validation error: {}, will reprocess",
|
||||
processor_type.as_str(),
|
||||
e
|
||||
);
|
||||
// fall through → reprocess
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check if processor already in terminal state
|
||||
@@ -484,10 +503,15 @@ impl JobWorker {
|
||||
}
|
||||
}
|
||||
|
||||
// Check capacity before starting processor
|
||||
if !self.processor_pool.can_start().await {
|
||||
// Check pipeline capacity before starting processor
|
||||
if !self.processor_pool.can_start_for(processor_type.pipeline()).await {
|
||||
info!(
|
||||
"Max concurrent processors reached, skipping remaining processors for job {}",
|
||||
"Max {} processors reached, skipping remaining processors for job {}",
|
||||
match processor_type.pipeline() {
|
||||
PipelineType::Frame => "frame",
|
||||
PipelineType::Time => "time",
|
||||
PipelineType::Cross => "cross",
|
||||
},
|
||||
job.uuid
|
||||
);
|
||||
// 為所有未啟動的 processors 創建 Skipped 記錄
|
||||
@@ -1180,6 +1204,35 @@ impl JobWorker {
|
||||
}
|
||||
}
|
||||
|
||||
/// 驗證 processor 輸出檔案的完整性。
|
||||
/// 回傳 Ok(true) 表示有效,Ok(false) 表示檔案存在但內容異常需重跑,Err 表示檢查失敗。
|
||||
fn validate_output_file(path: &std::path::Path, processor_type: ProcessorType) -> Result<bool> {
|
||||
let content = match std::fs::read_to_string(path) {
|
||||
Ok(c) => c,
|
||||
Err(_) => return Ok(false),
|
||||
};
|
||||
if content.trim().is_empty() {
|
||||
return Ok(false);
|
||||
}
|
||||
let json: serde_json::Value = match serde_json::from_str(&content) {
|
||||
Ok(v) => v,
|
||||
Err(_) => return Ok(false),
|
||||
};
|
||||
// 依 processor type 檢查必要欄位
|
||||
let valid = match processor_type {
|
||||
ProcessorType::Asr => json.get("segments").and_then(|s| s.as_array()).map_or(false, |a| !a.is_empty()),
|
||||
ProcessorType::Asrx => json.get("segments").and_then(|s| s.as_array()).map_or(false, |a| !a.is_empty()),
|
||||
ProcessorType::Yolo => json.get("frames").and_then(|f| f.as_object()).is_some(),
|
||||
ProcessorType::Face => json.get("frames").and_then(|f| f.as_object()).is_some(),
|
||||
ProcessorType::Ocr => json.get("frames").and_then(|f| f.as_object()).is_some(),
|
||||
ProcessorType::Pose => json.get("frames").and_then(|f| f.as_object()).is_some(),
|
||||
ProcessorType::Cut => json.get("segments").or_else(|| json.get("scenes")).and_then(|s| s.as_array()).map_or(false, |a| !a.is_empty()),
|
||||
// VisualChunk / Scene / Story / FiveW1H: 只檢查是 valid JSON 即可
|
||||
_ => true,
|
||||
};
|
||||
Ok(valid)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -1190,4 +1243,34 @@ mod tests {
|
||||
assert!(config.enabled);
|
||||
assert!(config.max_concurrent >= 1);
|
||||
}
|
||||
|
||||
fn test_validate_path(name: &str) -> std::path::PathBuf {
|
||||
let dir = std::env::temp_dir().join(format!("test_validate_{}", name));
|
||||
let _ = std::fs::create_dir_all(&dir);
|
||||
dir.join("output.json")
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_validate_output_empty() {
|
||||
let path = test_validate_path("empty");
|
||||
std::fs::write(&path, "").unwrap();
|
||||
assert!(!validate_output_file(&path, ProcessorType::Yolo).unwrap_or(false));
|
||||
let _ = std::fs::remove_file(&path);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_validate_output_invalid_json() {
|
||||
let path = test_validate_path("invalid");
|
||||
std::fs::write(&path, "not json").unwrap();
|
||||
assert!(!validate_output_file(&path, ProcessorType::Yolo).unwrap_or(false));
|
||||
let _ = std::fs::remove_file(&path);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_validate_output_yolo_ok() {
|
||||
let path = test_validate_path("yolo_ok");
|
||||
std::fs::write(&path, r#"{"frames":{"1":{"detections":[]}}}"#).unwrap();
|
||||
assert!(validate_output_file(&path, ProcessorType::Yolo).unwrap_or(false));
|
||||
let _ = std::fs::remove_file(&path);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,35 +7,56 @@ use std::sync::Arc;
|
||||
use tokio::sync::{mpsc, RwLock};
|
||||
use tracing::{error, info, warn};
|
||||
|
||||
|
||||
|
||||
/// Guard that ensures processor pool cleanup runs even if the task panics.
|
||||
struct ProcessorCleanupGuard {
|
||||
job_id: i32,
|
||||
running: Arc<RwLock<HashMap<i32, ProcessorHandle>>>,
|
||||
running_count: Arc<RwLock<usize>>,
|
||||
frame_count: Arc<RwLock<usize>>,
|
||||
time_count: Arc<RwLock<usize>>,
|
||||
pipeline: PipelineType,
|
||||
}
|
||||
|
||||
impl Drop for ProcessorCleanupGuard {
|
||||
fn drop(&mut self) {
|
||||
use tokio::sync::TryLockError;
|
||||
// 嘗試同步清理;若 lock 被佔用則跳過(避免 deadlock)
|
||||
if let Ok(mut guard) = self.running.try_write() {
|
||||
guard.remove(&self.job_id);
|
||||
} else {
|
||||
warn!("[ProcessorCleanupGuard] running lock contended, skipping cleanup");
|
||||
warn!("[ProcessorCleanupGuard] running lock contended");
|
||||
}
|
||||
if let Ok(mut guard) = self.running_count.try_write() {
|
||||
if *guard > 0 {
|
||||
*guard -= 1;
|
||||
if *guard > 0 { *guard -= 1; }
|
||||
}
|
||||
match self.pipeline {
|
||||
PipelineType::Frame => {
|
||||
if let Ok(mut guard) = self.frame_count.try_write() {
|
||||
if *guard > 0 { *guard -= 1; }
|
||||
}
|
||||
}
|
||||
} else {
|
||||
warn!("[ProcessorCleanupGuard] running_count lock contended, skipping cleanup");
|
||||
PipelineType::Time => {
|
||||
if let Ok(mut guard) = self.time_count.try_write() {
|
||||
if *guard > 0 { *guard -= 1; }
|
||||
}
|
||||
}
|
||||
PipelineType::Cross => {} // cross pipeline not tracked in slot counts
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ProcessorHandle {
|
||||
#[allow(dead_code)]
|
||||
processor_type: ProcessorType,
|
||||
cancel_tx: mpsc::Sender<()>,
|
||||
child_pid: Arc<RwLock<Option<i32>>>,
|
||||
}
|
||||
|
||||
use crate::core::config::{OUTPUT_DIR, PYTHON_PATH, SCRIPTS_DIR};
|
||||
use crate::core::db::{
|
||||
MonitorJob, PostgresDb, ProcessorJobStatus, ProcessorType, QdrantDb, RedisClient,
|
||||
MonitorJob, PipelineType, PostgresDb, ProcessorJobStatus, ProcessorType, QdrantDb, RedisClient,
|
||||
};
|
||||
use crate::core::processor;
|
||||
use crate::core::processor::asr::AsrResult;
|
||||
@@ -67,19 +88,19 @@ pub struct ProcessorTask {
|
||||
pub frame_dir: Option<String>,
|
||||
}
|
||||
|
||||
/// Frame pipeline max concurrent processors (hard limit).
|
||||
const FRAME_SLOT_MAX: usize = 2;
|
||||
/// Time pipeline max concurrent processors (audio is heavy, run 1 at a time).
|
||||
const TIME_SLOT_MAX: usize = 1;
|
||||
|
||||
pub struct ProcessorPool {
|
||||
db: Arc<PostgresDb>,
|
||||
redis: Arc<RedisClient>,
|
||||
config_max: usize,
|
||||
running: Arc<RwLock<HashMap<i32, ProcessorHandle>>>,
|
||||
running_count: Arc<RwLock<usize>>,
|
||||
}
|
||||
|
||||
struct ProcessorHandle {
|
||||
#[allow(dead_code)]
|
||||
processor_type: ProcessorType,
|
||||
cancel_tx: mpsc::Sender<()>,
|
||||
child_pid: Arc<RwLock<Option<i32>>>,
|
||||
running_frame_count: Arc<RwLock<usize>>,
|
||||
running_time_count: Arc<RwLock<usize>>,
|
||||
}
|
||||
|
||||
impl ProcessorPool {
|
||||
@@ -90,6 +111,8 @@ impl ProcessorPool {
|
||||
config_max: max_concurrent,
|
||||
running: Arc::new(RwLock::new(HashMap::new())),
|
||||
running_count: Arc::new(RwLock::new(0)),
|
||||
running_frame_count: Arc::new(RwLock::new(0)),
|
||||
running_time_count: Arc::new(RwLock::new(0)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,10 +128,27 @@ impl ProcessorPool {
|
||||
count < max
|
||||
}
|
||||
|
||||
/// 檢查特定產線是否可啟動新的 processor。
|
||||
/// Frame pipeline 最多 FRAME_SLOT_MAX 個,Time pipeline 最多 TIME_SLOT_MAX 個。
|
||||
pub async fn can_start_for(&self, pipeline: PipelineType) -> bool {
|
||||
let count = *self.running_count.read().await;
|
||||
let max = self.current_max().await;
|
||||
if count >= max {
|
||||
return false;
|
||||
}
|
||||
match pipeline {
|
||||
PipelineType::Frame => *self.running_frame_count.read().await < FRAME_SLOT_MAX,
|
||||
PipelineType::Time => *self.running_time_count.read().await < TIME_SLOT_MAX,
|
||||
PipelineType::Cross => false, // cross pipeline = wait until frame+time done
|
||||
}
|
||||
}
|
||||
|
||||
/// 清理 stale running state:若系統中實際運行的 processor 比記錄少,修正 count
|
||||
pub async fn sweep_stale(&self) {
|
||||
let handle_count = self.running.read().await.len();
|
||||
let count = *self.running_count.read().await;
|
||||
let frame_count = *self.running_frame_count.read().await;
|
||||
let time_count = *self.running_time_count.read().await;
|
||||
if handle_count != count {
|
||||
warn!(
|
||||
"[ProcessorPool] Stale count detected: handles={}, count={}, fixing",
|
||||
@@ -117,6 +157,13 @@ impl ProcessorPool {
|
||||
let mut c = self.running_count.write().await;
|
||||
*c = handle_count;
|
||||
}
|
||||
// 若 frame 或 time slot 超出 handle_count,降回合理值
|
||||
if frame_count + time_count > handle_count {
|
||||
let mut fc = self.running_frame_count.write().await;
|
||||
let mut tc = self.running_time_count.write().await;
|
||||
*fc = (*fc).min(handle_count);
|
||||
*tc = (*tc).min(handle_count.saturating_sub(*fc));
|
||||
}
|
||||
|
||||
if handle_count == 0 && count == 0 {
|
||||
if let Err(e) = self
|
||||
@@ -162,6 +209,7 @@ impl ProcessorPool {
|
||||
let job_id = task.job.id;
|
||||
let processor_type = task.processor_type;
|
||||
|
||||
let pipeline = task.processor_type.pipeline();
|
||||
let current_limit = self.current_max().await;
|
||||
{
|
||||
let mut count = self.running_count.write().await;
|
||||
@@ -173,9 +221,17 @@ impl ProcessorPool {
|
||||
}
|
||||
*count += 1;
|
||||
}
|
||||
// 遞增產線專屬 slot
|
||||
match pipeline {
|
||||
PipelineType::Frame => *self.running_frame_count.write().await += 1,
|
||||
PipelineType::Time => *self.running_time_count.write().await += 1,
|
||||
PipelineType::Cross => {} // cross pipeline uses global slot
|
||||
}
|
||||
|
||||
let running = self.running.clone();
|
||||
let running_count = self.running_count.clone();
|
||||
let running_frame_count = self.running_frame_count.clone();
|
||||
let running_time_count = self.running_time_count.clone();
|
||||
let child_pid: Arc<RwLock<Option<i32>>> = Arc::new(RwLock::new(None));
|
||||
running.write().await.insert(
|
||||
job_id,
|
||||
@@ -205,6 +261,9 @@ impl ProcessorPool {
|
||||
job_id,
|
||||
running: running.clone(),
|
||||
running_count: running_count.clone(),
|
||||
frame_count: running_frame_count.clone(),
|
||||
time_count: running_time_count.clone(),
|
||||
pipeline,
|
||||
};
|
||||
|
||||
info!("Starting processor {} for job {}", processor_name, job.uuid);
|
||||
|
||||
Reference in New Issue
Block a user