feat: Rule 1 now creates chunks for OCR-only text

Previously, Rule 1 created chunks only from ASRX segments, merging in OCR text wherever the frame ranges overlapped; OCR text that did not overlap any ASRX segment was ignored. Rule 1 now runs in two phases: (1) process ASRX segments, merging OCR text where it overlaps (existing behavior); (2) create chunks for OCR-only text, i.e. OCR frames not covered by an ASRX segment. OCR-only frames that fall within 5 frames of the first frame of a group are merged into that group, which avoids creating many single-frame chunks. Example: 819 ASRX chunks + 4 OCR-only chunks = 823 sentence chunks.
2026-07-05 22:06:35 +08:00
parent 465552f8b2
commit 0b82aa875c
1 changed files with 114 additions and 5 deletions
--- a/src/core/chunk/rule1_ingest.rs
+++ b/src/core/chunk/rule1_ingest.rs
@@ -21,8 +21,8 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
        .await?
        .context("Video not found")?;

-    if asr_segments.is_empty() {
-        info!("Rule 1: no ASR segments for video {}", file_uuid);
+    if asr_segments.is_empty() && ocr_map.is_empty() {
+        info!("Rule 1: no ASR segments or OCR for video {}", file_uuid);
        return Ok(0);
    }

@@ -30,7 +30,11 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
    let mut count = 0;
    let mut tx = pool.begin().await?;

-    for (idx, seg) in asr_segments.iter().enumerate() {
+    // Track which OCR frames were merged into ASRX segments
+    let mut merged_ocr_frames = std::collections::HashSet::new();
+
+    // Phase 1: Process ASRX segments (merge OCR where overlapping)
+    for seg in asr_segments.iter() {
        let ocr_text = collect_ocr_text(seg.start_frame, seg.end_frame, &ocr_map);
        let combined_text = if ocr_text.is_empty() {
            seg.text.clone()
@@ -38,6 +42,13 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
            format!("{} {}", seg.text, ocr_text)
        };

+        // Track merged OCR frames
+        if !ocr_text.is_empty() {
+            for frame in seg.start_frame..=seg.end_frame {
+                merged_ocr_frames.insert(frame);
+            }
+        }
+
        // Skip chunks with no text (empty ASRX and no OCR)
        if combined_text.trim().is_empty() {
            continue;
@@ -78,11 +89,51 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
        }
    }

+    // Phase 2: Create chunks for OCR-only text (not overlapping with ASRX)
+    let ocr_only_chunks = collect_ocr_only_chunks(&ocr_map, &merged_ocr_frames, fps);
+    let ocr_only_count = ocr_only_chunks.len();
+    for (frame, ocr_text) in ocr_only_chunks {
+        if ocr_text.trim().is_empty() {
+            continue;
+        }
+
+        let time = frame as f64 / fps;
+        let metadata = serde_json::json!({
+            "language": "ocr",
+        });
+
+        let content = serde_json::json!({
+            "text": "",
+            "ocr_text": ocr_text,
+        });
+
+        let chunk = Chunk::from_seconds(
+            file_id as i32,
+            file_uuid.to_string(),
+            format!("{}", count),
+            ChunkType::Sentence,
+            ChunkRule::Rule1,
+            time,
+            time + (1.0 / fps),
+            fps,
+            content,
+        )
+        .with_metadata(metadata)
+        .with_text_content(ocr_text);
+
+        db.store_chunk_in_tx(&chunk, &mut tx).await?;
+
+        count += 1;
+    }
+
    tx.commit().await?;

    info!(
-        "Rule 1 completed: {} sentence chunks created for video {}",
-        count, file_uuid
+        "Rule 1 completed: {} sentence chunks created for video {} ({} ASRX + {} OCR-only)",
+        count,
+        file_uuid,
+        count - ocr_only_count,
+        ocr_only_count
    );

    Ok(count)
@@ -251,3 +302,61 @@ fn collect_ocr_text(

    parts.join(" ")
 }
+
+/// Groups OCR text from frames that were not merged into any ASRX segment.
+///
+/// `ocr_map` is walked in ascending frame order, skipping frames listed in
+/// `merged_ocr_frames`. A group opens at the first remaining frame and takes in
+/// every later frame at most `MAX_GROUP_SPAN` frames after that *first* frame
+/// (the window is anchored on the group start and does not slide).
+///
+/// Returns `(first_frame, text)` per group; `text` is the group's distinct strings
+/// in first-seen order joined by single spaces. Blank groups are dropped so the
+/// result length equals the number of chunks a blank-skipping caller will store.
+/// `_fps` is unused and kept only so existing call sites keep compiling.
+fn collect_ocr_only_chunks(
+    ocr_map: &BTreeMap<i64, Vec<String>>,
+    merged_ocr_frames: &std::collections::HashSet<i64>,
+    _fps: f64,
+) -> Vec<(i64, String)> {
+    /// Maximum distance from a group's first frame (~0.2s at 24fps).
+    const MAX_GROUP_SPAN: i64 = 5;
+
+    /// Appends the group to `out` with duplicates removed, unless its text is blank.
+    fn flush(out: &mut Vec<(i64, String)>, start: i64, texts: &[&str]) {
+        let mut seen = std::collections::HashSet::new();
+        let unique: Vec<&str> = texts.iter().copied().filter(|t| seen.insert(*t)).collect();
+        let joined = unique.join(" ");
+        if !joined.trim().is_empty() {
+            out.push((start, joined));
+        }
+    }
+
+    let mut result = Vec::new();
+    // Open group: its first frame plus every text seen so far (duplicates included).
+    let mut open: Option<(i64, Vec<&str>)> = None;
+
+    let unmerged = ocr_map.iter().filter(|(frame, _)| !merged_ocr_frames.contains(*frame));
+    for (&frame, texts) in unmerged {
+        match open.as_mut() {
+            // Keys arrive in ascending order, so `frame - start` is never negative.
+            Some((start, collected)) if frame - *start <= MAX_GROUP_SPAN => {
+                collected.extend(texts.iter().map(String::as_str));
+            }
+            _ => {
+                // Too far from the open group (or none yet): close it, start a new one.
+                if let Some((start, collected)) = open.take() {
+                    flush(&mut result, start, &collected);
+                }
+                open = Some((frame, texts.iter().map(String::as_str).collect()));
+            }
+        }
+    }
+
+    // The final group is still open when the loop ends.
+    if let Some((start, collected)) = open {
+        flush(&mut result, start, &collected);
+    }
+
+    result
+}