feat: backup architecture docs, source code, and scripts

2026-04-25 17:15:45 +08:00
parent 59809dae1f
commit 1f84e5469f
368 changed files with 146329 additions and 261 deletions
--- a/src/core/processor/asr_legacy.rs
+++ b/src/core/processor/asr_legacy.rs
@@ -0,0 +1,124 @@
+use anyhow::{Context, Result};
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+
+use super::executor::PythonExecutor;
+use crate::core::config::processor;
+
+/// Transcription output of the ASR script, deserialized from the JSON file it writes.
+#[derive(Debug, Serialize, Deserialize)]
+pub struct AsrResult {
+    /// Language reported for the audio, if any (the tests in this file use codes
+    /// such as "en" and "zh").
+    pub language: Option<String>,
+    /// Score accompanying `language` — presumably a probability in 0.0..=1.0;
+    /// TODO confirm against the script.
+    pub language_probability: Option<f64>,
+    /// Transcribed segments.
+    pub segments: Vec<AsrSegment>,
+}
+
+/// One transcribed span of speech inside an [`AsrResult`].
+#[derive(Debug, Serialize, Deserialize)]
+pub struct AsrSegment {
+    /// Start of the segment — presumably seconds from the start of the media; TODO confirm.
+    pub start: f64,
+    /// End of the segment — presumably in the same unit as `start`; TODO confirm.
+    pub end: f64,
+    /// Transcribed text of the segment.
+    pub text: String,
+}
+
+/// Runs `asr_processor.py` on a video and loads the transcription it produces.
+///
+/// The script receives `video_path` and `output_path` as its arguments. Once it
+/// has finished, the file at `output_path` is read and deserialized into an
+/// [`AsrResult`]. `uuid` is forwarded to the executor unchanged.
+///
+/// # Errors
+/// Fails if the executor cannot be created, if the script run fails, or if the
+/// output file cannot be read or parsed as JSON.
+pub async fn process_asr(
+    video_path: &str,
+    output_path: &str,
+    uuid: Option<&str>,
+) -> Result<AsrResult> {
+    let executor = PythonExecutor::new()?;
+    // Only needed to name the script in the error context below.
+    let script_path = executor.script_path("asr_processor.py");
+
+    tracing::info!("[ASR] Starting ASR processing: {}", video_path);
+
+    let timeout = Duration::from_secs(*processor::ASR_TIMEOUT_SECS);
+    let script_args = [video_path, output_path];
+    let run_outcome = executor
+        .run("asr_processor.py", &script_args, uuid, "ASR", Some(timeout))
+        .await;
+    run_outcome.with_context(|| format!("Failed to run {:?}", script_path))?;
+
+    // The script reports its result through the output file.
+    let raw = std::fs::read_to_string(output_path).context("Failed to read ASR output")?;
+    let parsed =
+        serde_json::from_str::<AsrResult>(&raw).context("Failed to parse ASR output")?;
+
+    let segment_total = parsed.segments.len();
+    tracing::info!(
+        "[ASR] Result: {} segments, language: {:?}",
+        segment_total,
+        parsed.language
+    );
+
+    Ok(parsed)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Serializing a populated result emits the segment text and the language field.
+    #[test]
+    fn test_asr_result_serialization() {
+        let result = AsrResult {
+            language: Some("en".to_string()),
+            language_probability: Some(0.95),
+            segments: vec![
+                AsrSegment {
+                    start: 0.0,
+                    end: 2.5,
+                    text: "Hello world".to_string(),
+                },
+                AsrSegment {
+                    start: 2.5,
+                    end: 5.0,
+                    text: "Test speech".to_string(),
+                },
+            ],
+        };
+
+        let json = serde_json::to_string(&result).unwrap();
+        assert!(json.contains("Hello world"));
+        // Match the whole key/value pair: a bare "en" also occurs inside the
+        // "segments" key, so it would pass even if the language were missing.
+        assert!(json.contains(r#""language":"en""#));
+    }
+
+    /// JSON shaped like the script's output parses into the expected fields,
+    /// including non-ASCII segment text.
+    #[test]
+    fn test_asr_result_deserialization() {
+        let json = r#"{
+            "language": "zh",
+            "language_probability": 0.98,
+            "segments": [
+                {"start": 0.0, "end": 1.5, "text": "測試"}
+            ]
+        }"#;
+
+        let result: AsrResult = serde_json::from_str(json).unwrap();
+        assert_eq!(result.language, Some("zh".to_string()));
+        assert_eq!(result.language_probability, Some(0.98));
+        assert_eq!(result.segments.len(), 1);
+        assert_eq!(result.segments[0].text, "測試");
+    }
+
+    /// A segment with empty text keeps the values it was constructed with.
+    #[test]
+    fn test_asr_segment_default() {
+        let segment = AsrSegment {
+            start: 0.0,
+            end: 1.0,
+            text: String::new(),
+        };
+        assert_eq!(segment.start, 0.0);
+        assert_eq!(segment.end, 1.0);
+        assert!(segment.text.is_empty());
+    }
+
+    /// A result may carry no language and no segments.
+    #[test]
+    fn test_asr_result_empty_segments() {
+        let result = AsrResult {
+            language: None,
+            language_probability: None,
+            segments: vec![],
+        };
+        assert!(result.language.is_none());
+        assert!(result.segments.is_empty());
+    }
+}
--- a/src/core/processor/face_recognition.rs
+++ b/src/core/processor/face_recognition.rs
@@ -0,0 +1,345 @@
+use anyhow::{Context, Result};
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+
+use super::executor::PythonExecutor;
+
+/// Time limit handed to the executor for one `face_recognition_processor.py` run (10800 s).
+const FACE_RECOGNITION_TIMEOUT: Duration = Duration::from_secs(10800); // 3 hours for recognition
+
+/// Top-level output of `face_recognition_processor.py`, deserialized from the
+/// JSON file that script writes.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct FaceRecognitionResult {
+    /// Frame count reported by the script — presumably for the whole video; TODO confirm.
+    pub frame_count: u64,
+    /// Frame rate reported by the script.
+    pub fps: f64,
+    /// Per-frame face detections.
+    pub frames: Vec<FaceRecognitionFrame>,
+    /// Faces aggregated across frames.
+    pub recognized_faces: Vec<RecognizedFace>,
+    /// Groups of faces — presumably only populated when clustering is enabled; TODO confirm.
+    pub face_clusters: Vec<FaceCluster>,
+}
+
+/// Face detections belonging to a single video frame.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct FaceRecognitionFrame {
+    /// Frame number as reported by the script.
+    pub frame: u64,
+    /// Position of the frame in the video — presumably in seconds; TODO confirm.
+    pub timestamp: f64,
+    /// Faces detected in this frame.
+    pub faces: Vec<RecognizedFaceDetection>,
+}
+
+/// A single face found in one frame, with optional recognition details.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct RecognizedFaceDetection {
+    /// Identifier of the face, when one was assigned — presumably matches
+    /// `RecognizedFace::face_id`; TODO confirm.
+    pub face_id: Option<String>,
+    /// Bounding-box origin, x coordinate — presumably pixels from the left edge; TODO confirm.
+    pub x: i32,
+    /// Bounding-box origin, y coordinate — presumably pixels from the top edge; TODO confirm.
+    pub y: i32,
+    /// Bounding-box width.
+    pub width: i32,
+    /// Bounding-box height.
+    pub height: i32,
+    /// Detection confidence reported by the script.
+    pub confidence: f32,
+    /// Face embedding vector, when provided.
+    pub embedding: Option<Vec<f32>>,
+    /// Estimated attributes of the face, when provided.
+    pub attributes: Option<FaceAttributes>,
+    /// Identity the face was matched to, when provided.
+    pub identity: Option<FaceIdentity>,
+}
+
+/// Optional descriptive attributes of a face; every field may be absent.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct FaceAttributes {
+    /// Estimated age — presumably in years; TODO confirm.
+    pub age: Option<u8>,
+    /// Gender label (the tests in this file use "male" and "female").
+    pub gender: Option<String>,
+    /// Emotion label (the tests in this file use "neutral" and "happy").
+    pub emotion: Option<String>,
+    /// Whether glasses were detected.
+    pub glasses: Option<bool>,
+    /// Whether a mask was detected.
+    pub mask: Option<bool>,
+    /// Head pose, when estimated.
+    pub pose: Option<FacePose>,
+}
+
+/// Head orientation of a face. The angular unit is not visible in this file —
+/// TODO confirm whether the script emits degrees or radians.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct FacePose {
+    /// Yaw component.
+    pub yaw: f32,
+    /// Pitch component.
+    pub pitch: f32,
+    /// Roll component.
+    pub roll: f32,
+}
+
+/// An identity that a face was matched against.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct FaceIdentity {
+    /// Name of the matched person, if known.
+    pub name: Option<String>,
+    /// Confidence of the identity match.
+    pub confidence: f32,
+    /// Identifier of the matched entry — presumably a key in the face database; TODO confirm.
+    pub database_id: Option<String>,
+    /// Free-form JSON attached to the identity.
+    pub metadata: Option<serde_json::Value>,
+}
+
+/// A face aggregated over the frames in which it appears.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct RecognizedFace {
+    /// Identifier of the face.
+    pub face_id: String,
+    /// Embedding vector for the face.
+    pub embedding: Vec<f32>,
+    /// First appearance — presumably a timestamp in seconds; TODO confirm.
+    pub first_seen: f64,
+    /// Last appearance — presumably in the same unit as `first_seen`; TODO confirm.
+    pub last_seen: f64,
+    /// Number of appearances reported by the script.
+    pub total_appearances: u32,
+    /// Attributes of the face, when provided.
+    pub attributes: Option<FaceAttributes>,
+    /// Identities associated with the face.
+    pub identities: Vec<FaceIdentity>,
+    /// Cluster this face belongs to, if any — presumably a `FaceCluster::cluster_id`; TODO confirm.
+    pub cluster_id: Option<String>,
+}
+
+/// A group of faces reported together by the script.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct FaceCluster {
+    /// Identifier of the cluster.
+    pub cluster_id: String,
+    /// Identifiers of the faces in the cluster.
+    pub face_ids: Vec<String>,
+    /// Centroid vector — presumably in the same space as the face embeddings; TODO confirm.
+    pub centroid: Vec<f32>,
+    /// Cluster size reported by the script — presumably equals `face_ids.len()`; TODO confirm.
+    pub size: u32,
+    /// Face chosen to represent the cluster, if any.
+    pub representative_face_id: Option<String>,
+    /// Free-form JSON attached to the cluster.
+    pub metadata: Option<serde_json::Value>,
+}
+
+/// Runs `face_recognition_processor.py` on a video and loads its JSON output.
+///
+/// The script receives `video_path`, `output_path`, and the three `enable_*`
+/// switches encoded as `"1"` / `"0"`. `uuid` is forwarded to the executor
+/// unchanged. If the script file does not exist, an empty result is returned
+/// instead of an error.
+///
+/// # Errors
+/// Fails if the executor cannot be created, if the script run fails, or if the
+/// output file cannot be read or parsed as JSON.
+pub async fn process_face_recognition(
+    video_path: &str,
+    output_path: &str,
+    uuid: Option<&str>,
+    enable_recognition: bool,
+    enable_tracking: bool,
+    enable_clustering: bool,
+) -> Result<FaceRecognitionResult> {
+    let executor = PythonExecutor::new()?;
+    let script_path = executor.script_path("face_recognition_processor.py");
+
+    tracing::info!(
+        "[FACE_RECOGNITION] Starting face recognition: {}",
+        video_path
+    );
+
+    // A missing script is treated as "nothing detected", not as a failure.
+    if !script_path.exists() {
+        tracing::warn!("[FACE_RECOGNITION] Script not found, returning empty result");
+        let empty = FaceRecognitionResult {
+            frame_count: 0,
+            fps: 0.0,
+            frames: Vec::new(),
+            recognized_faces: Vec::new(),
+            face_clusters: Vec::new(),
+        };
+        return Ok(empty);
+    }
+
+    // The script takes its switches as "1" / "0" strings.
+    let as_flag = |enabled: bool| if enabled { "1" } else { "0" };
+    let script_args: Vec<&str> = vec![
+        video_path,
+        output_path,
+        as_flag(enable_recognition),
+        as_flag(enable_tracking),
+        as_flag(enable_clustering),
+    ];
+
+    let run_outcome = executor
+        .run(
+            "face_recognition_processor.py",
+            &script_args,
+            uuid,
+            "FACE_RECOGNITION",
+            Some(FACE_RECOGNITION_TIMEOUT),
+        )
+        .await;
+    run_outcome.with_context(|| format!("Failed to run {:?}", script_path))?;
+
+    let raw =
+        std::fs::read_to_string(output_path).context("Failed to read FACE_RECOGNITION output")?;
+    let parsed = serde_json::from_str::<FaceRecognitionResult>(&raw)
+        .context("Failed to parse FACE_RECOGNITION output")?;
+
+    tracing::info!(
+        "[FACE_RECOGNITION] Result: {} frames, {} recognized faces, {} clusters",
+        parsed.frames.len(),
+        parsed.recognized_faces.len(),
+        parsed.face_clusters.len()
+    );
+
+    Ok(parsed)
+}
+
+/// Registers a face from an image by running `face_registration.py`.
+///
+/// The script is invoked as
+/// `<image_path> <output> <name> --database <output> [--metadata <file>]`,
+/// where `<output>` is a uniquely named scratch file in the system temp
+/// directory and `<file>` holds `metadata` serialized as JSON. The scratch
+/// output is then read back and deserialized into a [`FaceRegistrationResult`].
+///
+/// All scratch files are removed when the function exits, whether it succeeds
+/// or fails.
+///
+/// # Errors
+/// Fails if the executor cannot be created, the script file is missing, the
+/// temp path is not valid UTF-8, the metadata cannot be serialized or written,
+/// the script run fails, or its output cannot be read or parsed.
+pub async fn register_face(
+    image_path: &str,
+    name: &str,
+    metadata: Option<serde_json::Value>,
+) -> Result<FaceRegistrationResult> {
+    /// Deletes the tracked files when dropped, so scratch files are cleaned up
+    /// on every exit path (normal return, `?` early return, or the future
+    /// being dropped).
+    struct TempFiles(Vec<String>);
+
+    impl Drop for TempFiles {
+        fn drop(&mut self) {
+            for path in &self.0 {
+                // Best-effort: the file may never have been created.
+                let _ = std::fs::remove_file(path);
+            }
+        }
+    }
+
+    // Builds a path inside the system temp directory as a UTF-8 string, since
+    // it has to be passed to the script as a `&str` argument.
+    let temp_path = |file_name: String| -> Result<String> {
+        std::env::temp_dir()
+            .join(file_name)
+            .into_os_string()
+            .into_string()
+            .map_err(|raw| anyhow::anyhow!("Temporary path is not valid UTF-8: {:?}", raw))
+    };
+
+    let executor = PythonExecutor::new()?;
+    let script_path = executor.script_path("face_registration.py");
+
+    tracing::info!("[FACE_REGISTRATION] Registering face: {}", name);
+
+    if !script_path.exists() {
+        anyhow::bail!("Face registration script not found");
+    }
+
+    let mut temp_files = TempFiles(Vec::new());
+
+    let output_path = temp_path(format!(
+        "face_registration_{}.json",
+        uuid::Uuid::new_v4()
+    ))?;
+    temp_files.0.push(output_path.clone());
+
+    // The script reads metadata from a file, so write it to a scratch file.
+    let meta_path = match metadata.as_ref() {
+        Some(meta) => {
+            let path = temp_path(format!("face_metadata_{}.json", uuid::Uuid::new_v4()))?;
+            temp_files.0.push(path.clone());
+            let payload =
+                serde_json::to_string(meta).context("Failed to serialize face metadata")?;
+            std::fs::write(&path, payload)
+                .with_context(|| format!("Failed to write face metadata to {}", path))?;
+            Some(path)
+        }
+        None => None,
+    };
+
+    // The output path doubles as the database path so the script writes there
+    // (same file for both, for now).
+    let mut args: Vec<&str> = vec![
+        image_path,
+        output_path.as_str(),
+        name,
+        "--database",
+        output_path.as_str(),
+    ];
+    if let Some(path) = meta_path.as_deref() {
+        args.push("--metadata");
+        args.push(path);
+    }
+
+    executor
+        .run(
+            "face_registration.py",
+            &args,
+            None,
+            "FACE_REGISTRATION",
+            Some(Duration::from_secs(300)),
+        )
+        .await
+        .with_context(|| format!("Failed to run {:?}", script_path))?;
+
+    let json_str =
+        std::fs::read_to_string(&output_path).context("Failed to read registration output")?;
+
+    let result: FaceRegistrationResult =
+        serde_json::from_str(&json_str).context("Failed to parse registration output")?;
+
+    tracing::info!("[FACE_REGISTRATION] Registered face: {}", result.face_id);
+
+    // `temp_files` is dropped here, removing the scratch files.
+    Ok(result)
+}
+
+/// Output of `face_registration.py`, deserialized from the JSON file it writes.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct FaceRegistrationResult {
+    /// Identifier assigned to the registered face.
+    pub face_id: String,
+    /// Embedding vector computed for the face.
+    pub embedding: Vec<f32>,
+    /// Attributes of the face, when provided.
+    pub attributes: Option<FaceAttributes>,
+    /// Success flag reported by the script. NOTE(review): `register_face` does
+    /// not inspect this flag; callers presumably must check it themselves.
+    pub success: bool,
+    /// Message reported by the script.
+    pub message: String,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Attribute fixture shared by the per-frame detection and the aggregated face.
+    fn sample_attributes() -> FaceAttributes {
+        FaceAttributes {
+            age: Some(30),
+            gender: Some("male".to_string()),
+            emotion: Some("neutral".to_string()),
+            glasses: Some(false),
+            mask: Some(false),
+            pose: Some(FacePose {
+                yaw: 0.1,
+                pitch: 0.2,
+                roll: 0.3,
+            }),
+        }
+    }
+
+    /// Identity fixture shared by the per-frame detection and the aggregated face.
+    fn sample_identity() -> FaceIdentity {
+        FaceIdentity {
+            name: Some("John Doe".to_string()),
+            confidence: 0.85,
+            database_id: Some("user_123".to_string()),
+            metadata: Some(serde_json::json!({"role": "employee"})),
+        }
+    }
+
+    /// A fully populated result serializes with its face id, identity name and cluster id.
+    #[test]
+    fn test_face_recognition_result_serialization() {
+        let detection = RecognizedFaceDetection {
+            face_id: Some("face_1".to_string()),
+            x: 100,
+            y: 100,
+            width: 50,
+            height: 60,
+            confidence: 0.95,
+            embedding: Some(vec![0.1, 0.2, 0.3]),
+            attributes: Some(sample_attributes()),
+            identity: Some(sample_identity()),
+        };
+        let frame = FaceRecognitionFrame {
+            frame: 0,
+            timestamp: 0.0,
+            faces: vec![detection],
+        };
+        let face = RecognizedFace {
+            face_id: "face_1".to_string(),
+            embedding: vec![0.1, 0.2, 0.3],
+            first_seen: 0.0,
+            last_seen: 10.0,
+            total_appearances: 5,
+            attributes: Some(sample_attributes()),
+            identities: vec![sample_identity()],
+            cluster_id: Some("cluster_1".to_string()),
+        };
+        let cluster = FaceCluster {
+            cluster_id: "cluster_1".to_string(),
+            face_ids: vec!["face_1".to_string()],
+            centroid: vec![0.1, 0.2, 0.3],
+            size: 1,
+            representative_face_id: Some("face_1".to_string()),
+            metadata: Some(serde_json::json!({"description": "main person"})),
+        };
+        let result = FaceRecognitionResult {
+            frame_count: 100,
+            fps: 30.0,
+            frames: vec![frame],
+            recognized_faces: vec![face],
+            face_clusters: vec![cluster],
+        };
+
+        let json = serde_json::to_string(&result).unwrap();
+        for expected in ["face_1", "John Doe", "cluster_1"] {
+            assert!(json.contains(expected));
+        }
+    }
+
+    /// Attributes serialize as compact `"key":value` pairs.
+    #[test]
+    fn test_face_attributes_serialization() {
+        let pose = FacePose {
+            yaw: -0.1,
+            pitch: 0.05,
+            roll: 0.02,
+        };
+        let attributes = FaceAttributes {
+            age: Some(25),
+            gender: Some("female".to_string()),
+            emotion: Some("happy".to_string()),
+            glasses: Some(true),
+            mask: Some(false),
+            pose: Some(pose),
+        };
+
+        let json = serde_json::to_string(&attributes).unwrap();
+        assert!(json.contains("\"age\":25"));
+        assert!(json.contains("\"gender\":\"female\""));
+        assert!(json.contains("\"emotion\":\"happy\""));
+    }
+
+    /// An identity serializes its name, confidence and nested metadata.
+    #[test]
+    fn test_face_identity_serialization() {
+        let metadata = serde_json::json!({
+            "department": "engineering",
+            "position": "senior developer"
+        });
+        let identity = FaceIdentity {
+            name: Some("Alice Smith".to_string()),
+            confidence: 0.92,
+            database_id: Some("employee_456".to_string()),
+            metadata: Some(metadata),
+        };
+
+        let json = serde_json::to_string(&identity).unwrap();
+        assert!(json.contains("Alice Smith"));
+        assert!(json.contains("\"confidence\":0.92"));
+        assert!(json.contains("engineering"));
+    }
+}
--- a/src/core/processor/visual_chunk.rs
+++ b/src/core/processor/visual_chunk.rs
@@ -0,0 +1,562 @@
+//! Visual chunk processor (Phase 2.2)
+//!
+//! Generates visual chunks from YOLO results.
+
+use anyhow::{Context, Result};
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+
+use super::executor::PythonExecutor;
+use super::yolo::{YoloFrame, YoloResult};
+
+/// Time limit handed to the executor for one `visual_chunk_processor.py` run (3600 s = 1 hour).
+const VISUAL_CHUNK_TIMEOUT: Duration = Duration::from_secs(3600);
+
+/// Result of visual chunk processing.
+#[derive(Debug, Serialize, Deserialize, Clone)]
+pub struct VisualChunkResult {
+    /// Number of visual chunks generated.
+    pub chunk_count: u32,
+    /// Total number of frames processed.
+    pub total_frames: u32,
+    /// Total number of objects detected.
+    pub total_objects: u32,
+    /// Number of unique object classes.
+    pub unique_classes: u32,
+    /// The generated visual chunks.
+    pub chunks: Vec<crate::core::chunk::Chunk>,
+}
+
+/// Generates visual chunks from a YOLO result and summarizes the detections.
+///
+/// The frames are grouped with `create_fixed_frame_chunks`; the returned
+/// summary also carries the number of frames, the number of detected objects
+/// and the number of distinct object class names. `video_path` is used for
+/// logging only. An empty YOLO result produces an all-zero summary with no
+/// chunks.
+pub async fn process_visual_chunk(
+    file_id: i32,
+    uuid: String,
+    video_path: &str,
+    yolo_result: &YoloResult,
+    chunk_index_offset: u32,
+    fps: f64,
+) -> Result<VisualChunkResult> {
+    let frames = &yolo_result.frames;
+
+    tracing::info!(
+        "[VisualChunk] Starting visual chunk generation for video: {}, {} frames",
+        video_path,
+        frames.len()
+    );
+
+    if frames.is_empty() {
+        tracing::warn!("[VisualChunk] No YOLO frames to process");
+        let empty = VisualChunkResult {
+            chunk_count: 0,
+            total_frames: 0,
+            total_objects: 0,
+            unique_classes: 0,
+            chunks: Vec::new(),
+        };
+        return Ok(empty);
+    }
+
+    // Strategy 1: fixed-size chunks (one chunk every N frames).
+    let chunks = create_fixed_frame_chunks(file_id, &uuid, yolo_result, chunk_index_offset, fps);
+
+    // Statistics: object total and distinct class names, gathered in one pass.
+    let mut total_objects: u32 = 0;
+    let mut class_names = std::collections::HashSet::new();
+    for frame in frames {
+        total_objects += frame.objects.len() as u32;
+        for object in &frame.objects {
+            class_names.insert(object.class_name.as_str());
+        }
+    }
+    let unique_classes = class_names.len() as u32;
+
+    tracing::info!(
+        "[VisualChunk] Generated {} visual chunks from {} frames, {} total objects, {} unique classes",
+        chunks.len(),
+        frames.len(),
+        total_objects,
+        unique_classes
+    );
+
+    let summary = VisualChunkResult {
+        chunk_count: chunks.len() as u32,
+        total_frames: frames.len() as u32,
+        total_objects,
+        unique_classes,
+        chunks,
+    };
+    Ok(summary)
+}
+
+/// Creates fixed-size visual chunks: one chunk for every run of up to 30
+/// consecutive entries in `yolo_result.frames`.
+///
+/// Chunk indices start at `chunk_index_offset` and increase by one per chunk.
+/// Each chunk spans from the frame number of its first entry up to the frame
+/// number of its last entry plus one (exclusive end). Returns an empty vector
+/// when there are no frames.
+fn create_fixed_frame_chunks(
+    file_id: i32,
+    uuid: &str,
+    yolo_result: &YoloResult,
+    chunk_index_offset: u32,
+    fps: f64,
+) -> Vec<crate::core::chunk::Chunk> {
+    // 30 entries per chunk (about one second when fps is 30).
+    const FRAMES_PER_CHUNK: usize = 30;
+
+    let mut built = Vec::new();
+    let mut next_index = chunk_index_offset;
+
+    for group in yolo_result.frames.chunks(FRAMES_PER_CHUNK) {
+        let (first, last) = match (group.first(), group.last()) {
+            (Some(first), Some(last)) => (first, last),
+            // `chunks` never yields an empty slice; stop defensively if it did.
+            _ => break,
+        };
+
+        let start_frame = first.frame as i64;
+        let end_frame = last.frame as i64 + 1; // exclusive
+
+        built.push(crate::core::chunk::Chunk::from_yolo_frames(
+            file_id,
+            uuid.to_string(),
+            next_index,
+            start_frame,
+            end_frame,
+            fps,
+            group.to_vec(),
+        ));
+
+        next_index += 1;
+    }
+
+    built
+}
+
+/// Creates visual chunks by grouping consecutive frames whose detected object
+/// classes are similar.
+///
+/// Frames are walked in order. A frame joins the current run when its
+/// similarity to the run's last frame (see `calculate_frame_similarity`) is at
+/// least `similarity_threshold`; otherwise the current run is closed and the
+/// frame starts a new one. A closed run becomes a chunk only if it holds at
+/// least `min_frames_per_chunk` frames — shorter runs are discarded and do not
+/// consume a chunk index.
+///
+/// Chunk indices start at `chunk_index_offset`. Each chunk spans from the
+/// frame number of its first frame to that of its last frame plus one
+/// (exclusive end).
+fn create_similarity_based_chunks(
+    file_id: i32,
+    uuid: &str,
+    yolo_result: &YoloResult,
+    chunk_index_offset: u32,
+    fps: f64,
+    similarity_threshold: f32,
+    min_frames_per_chunk: usize,
+) -> Vec<crate::core::chunk::Chunk> {
+    let mut chunks = Vec::new();
+
+    if yolo_result.frames.is_empty() {
+        return chunks;
+    }
+
+    // Turns a finished run into a chunk, or `None` when the run is too short.
+    let build_chunk = |index: u32, frames: Vec<YoloFrame>| -> Option<crate::core::chunk::Chunk> {
+        if frames.len() < min_frames_per_chunk {
+            return None;
+        }
+        let start_frame = frames.first()?.frame as i64;
+        let end_frame = frames.last()?.frame as i64 + 1; // exclusive
+        Some(crate::core::chunk::Chunk::from_yolo_frames(
+            file_id,
+            uuid.to_string(),
+            index,
+            start_frame,
+            end_frame,
+            fps,
+            frames,
+        ))
+    };
+
+    let mut chunk_index = chunk_index_offset;
+    let mut current_run: Vec<YoloFrame> = Vec::new();
+
+    for frame in &yolo_result.frames {
+        // Simplified similarity check: compares the sets of object class names.
+        // An empty run (very first frame) always accepts the frame.
+        let continues_run = current_run.last().map_or(true, |previous| {
+            calculate_frame_similarity(previous, frame) >= similarity_threshold
+        });
+
+        if !continues_run {
+            // Move the finished run out instead of cloning it; the buffer is
+            // left empty for the next run.
+            let finished = std::mem::take(&mut current_run);
+            if let Some(chunk) = build_chunk(chunk_index, finished) {
+                chunks.push(chunk);
+                chunk_index += 1;
+            }
+        }
+
+        current_run.push(frame.clone());
+    }
+
+    // Close the trailing run.
+    if let Some(chunk) = build_chunk(chunk_index, current_run) {
+        chunks.push(chunk);
+    }
+
+    chunks
+}
+
+/// Computes the similarity of two frames from their object class names.
+///
+/// The score is the Jaccard index of the two sets of class names: the number
+/// of names present in both frames divided by the number of names present in
+/// either. Two frames without objects score `1.0`; if exactly one frame has no
+/// objects the score is `0.0`.
+fn calculate_frame_similarity(frame1: &YoloFrame, frame2: &YoloFrame) -> f32 {
+    use std::collections::HashSet;
+
+    match (frame1.objects.is_empty(), frame2.objects.is_empty()) {
+        (true, true) => return 1.0,
+        (true, false) | (false, true) => return 0.0,
+        (false, false) => {}
+    }
+
+    let names1: HashSet<&str> = frame1
+        .objects
+        .iter()
+        .map(|object| object.class_name.as_str())
+        .collect();
+    let names2: HashSet<&str> = frame2
+        .objects
+        .iter()
+        .map(|object| object.class_name.as_str())
+        .collect();
+
+    let shared = names1.intersection(&names2).count();
+    let combined = names1.union(&names2).count();
+
+    if combined == 0 {
+        0.0
+    } else {
+        shared as f32 / combined as f32
+    }
+}
+
+/// Generates visual chunks by running `visual_chunk_processor.py` (advanced variant).
+///
+/// The script receives `video_path` and `output_path`; once it has finished,
+/// the file at `output_path` is read and deserialized into a
+/// [`VisualChunkResult`]. `uuid` is forwarded to the executor unchanged.
+///
+/// If the script file does not exist, an empty result is returned. No fallback
+/// to the basic generator happens here: this function has no YOLO result to
+/// feed it.
+///
+/// # Errors
+/// Fails if the executor cannot be created, if the script run fails, or if the
+/// output file cannot be read or parsed as JSON.
+pub async fn process_visual_chunk_advanced(
+    video_path: &str,
+    output_path: &str,
+    uuid: Option<&str>,
+) -> Result<VisualChunkResult> {
+    let executor = PythonExecutor::new()?;
+    let script_path = executor.script_path("visual_chunk_processor.py");
+
+    tracing::info!(
+        "[VisualChunk] Starting advanced visual chunk generation: {}",
+        video_path
+    );
+
+    if !script_path.exists() {
+        // The message states what actually happens; a fallback to the basic
+        // generator could be added here later.
+        tracing::warn!("[VisualChunk] Script not found, returning empty result");
+        return Ok(VisualChunkResult {
+            chunk_count: 0,
+            total_frames: 0,
+            total_objects: 0,
+            unique_classes: 0,
+            chunks: vec![],
+        });
+    }
+
+    executor
+        .run(
+            "visual_chunk_processor.py",
+            &[video_path, output_path],
+            uuid,
+            "VisualChunk",
+            Some(VISUAL_CHUNK_TIMEOUT),
+        )
+        .await
+        .with_context(|| format!("Failed to run {:?}", script_path))?;
+
+    let json_str =
+        std::fs::read_to_string(output_path).context("Failed to read visual chunk output")?;
+
+    let result: VisualChunkResult =
+        serde_json::from_str(&json_str).context("Failed to parse visual chunk output")?;
+
+    tracing::info!(
+        "[VisualChunk] Advanced generation result: {} chunks, {} frames",
+        result.chunk_count,
+        result.total_frames
+    );
+
+    Ok(result)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::core::processor::yolo::{YoloFrame, YoloObject, YoloResult};
+
+    /// Similarity is 1.0 for identical class sets, 0.0 for disjoint ones, and
+    /// 1.0 for two frames without objects.
+    #[test]
+    fn test_calculate_frame_similarity() {
+        // Field types are inferred from `YoloObject`, so the fixture needs no annotations.
+        let detection = |class_name: &str, class_id, x, y, width, height, confidence| YoloObject {
+            class_name: class_name.to_string(),
+            class_id,
+            x,
+            y,
+            width,
+            height,
+            confidence,
+        };
+
+        let frame1 = YoloFrame {
+            frame: 0,
+            timestamp: 0.0,
+            objects: vec![
+                detection("person", 0, 100, 200, 50, 100, 0.95),
+                detection("car", 2, 300, 150, 80, 60, 0.87),
+            ],
+        };
+
+        let frame2 = YoloFrame {
+            frame: 1,
+            timestamp: 0.033,
+            objects: vec![
+                detection("person", 0, 110, 210, 52, 102, 0.92),
+                detection("car", 2, 310, 155, 82, 62, 0.85),
+            ],
+        };
+
+        let frame3 = YoloFrame {
+            frame: 2,
+            timestamp: 0.066,
+            objects: vec![detection("dog", 16, 150, 250, 40, 60, 0.78)],
+        };
+
+        // Frames with the same object classes are fully similar.
+        let similarity_same = calculate_frame_similarity(&frame1, &frame2);
+        assert!((similarity_same - 1.0).abs() < 0.001);
+
+        // Frames with disjoint object classes are not similar at all.
+        let similarity_diff = calculate_frame_similarity(&frame1, &frame3);
+        assert!((similarity_diff - 0.0).abs() < 0.001);
+
+        // Two frames without objects are fully similar.
+        let empty_frame = YoloFrame {
+            frame: 3,
+            timestamp: 0.1,
+            objects: vec![],
+        };
+        let similarity_empty = calculate_frame_similarity(&empty_frame, &empty_frame);
+        assert!((similarity_empty - 1.0).abs() < 0.001);
+    }
+
+    /// 60 frames at 30 frames per chunk yield two chunks covering [0, 30) and [30, 60).
+    ///
+    /// Plain `#[test]`: the function under test is synchronous, so no async
+    /// runtime is needed.
+    #[test]
+    fn test_create_fixed_frame_chunks() {
+        // 60 frames, each containing one object.
+        let frames: Vec<YoloFrame> = (0..60u64)
+            .map(|i| YoloFrame {
+                frame: i,
+                timestamp: i as f64 / 30.0, // assumes fps = 30
+                objects: vec![YoloObject {
+                    class_name: "person".to_string(),
+                    class_id: 0,
+                    x: 100,
+                    y: 200,
+                    width: 50,
+                    height: 100,
+                    confidence: 0.9,
+                }],
+            })
+            .collect();
+
+        let yolo_result = YoloResult {
+            frame_count: 60,
+            fps: 30.0,
+            frames,
+        };
+
+        let chunks = create_fixed_frame_chunks(1, "test-uuid", &yolo_result, 0, 30.0);
+
+        assert_eq!(chunks.len(), 2);
+
+        // First chunk.
+        let first_chunk = &chunks[0];
+        assert_eq!(
+            first_chunk.chunk_type,
+            crate::core::chunk::ChunkType::Visual
+        );
+        assert_eq!(first_chunk.start_frame, 0);
+        assert_eq!(first_chunk.end_frame, 30); // exclusive
+        assert_eq!(first_chunk.frame_count, 30);
+
+        // Second chunk.
+        let second_chunk = &chunks[1];
+        assert_eq!(
+            second_chunk.chunk_type,
+            crate::core::chunk::ChunkType::Visual
+        );
+        assert_eq!(second_chunk.start_frame, 30);
+        assert_eq!(second_chunk.end_frame, 60); // exclusive
+        assert_eq!(second_chunk.frame_count, 30);
+    }
+
+    /// A change in the detected classes closes the current chunk and starts a new one.
+    #[test]
+    fn test_create_similarity_based_chunks() {
+        // Field types are inferred from `YoloObject`, so the fixture needs no annotations.
+        let detection = |class_name: &str, class_id, x, y, width, height, confidence| YoloObject {
+            class_name: class_name.to_string(),
+            class_id,
+            x,
+            y,
+            width,
+            height,
+            confidence,
+        };
+
+        let frames = vec![
+            // Frames 0 and 1: person + car.
+            YoloFrame {
+                frame: 0,
+                timestamp: 0.0,
+                objects: vec![
+                    detection("person", 0, 100, 200, 50, 100, 0.9),
+                    detection("car", 2, 300, 150, 80, 60, 0.8),
+                ],
+            },
+            YoloFrame {
+                frame: 1,
+                timestamp: 0.033,
+                objects: vec![
+                    detection("person", 0, 110, 210, 52, 102, 0.88),
+                    detection("car", 2, 310, 155, 82, 62, 0.78),
+                ],
+            },
+            // Frames 5 and 6: dog only.
+            YoloFrame {
+                frame: 5,
+                timestamp: 0.166,
+                objects: vec![detection("dog", 16, 150, 250, 40, 60, 0.7)],
+            },
+            YoloFrame {
+                frame: 6,
+                timestamp: 0.2,
+                objects: vec![detection("dog", 16, 155, 255, 42, 62, 0.68)],
+            },
+        ];
+
+        let yolo_result = YoloResult {
+            frame_count: 7,
+            fps: 30.0,
+            frames,
+        };
+
+        let chunks = create_similarity_based_chunks(
+            1,
+            "test-uuid",
+            &yolo_result,
+            0,
+            30.0,
+            0.5, // similarity threshold
+            2,   // min frames per chunk
+        );
+
+        // Two chunks expected: one for person + car, one for dog.
+        assert_eq!(chunks.len(), 2);
+    }
+}