From 0b82aa875cf040bd0e49ce1a00d0573628aaf088 Mon Sep 17 00:00:00 2001 From: Accusys Date: Sun, 5 Jul 2026 22:06:35 +0800 Subject: [PATCH] feat: Rule 1 now creates chunks for OCR-only text Previously Rule 1 only created chunks from ASRX segments, merging OCR text where frame ranges overlapped. OCR text that didn't overlap with any ASRX segment was ignored. Now Rule 1 has two phases: 1. Process ASRX segments (merge OCR where overlapping) - existing behavior 2. Create chunks for OCR-only text (frames not covered by ASRX) OCR-only frames are grouped into one chunk when they fall within 5 frames of the group's first frame, to avoid creating too many single-frame chunks. Example: ASRX 819 + OCR-only 4 = 823 sentence chunks --- src/core/chunk/rule1_ingest.rs | 119 +++++++++++++++++++++++++++++++-- 1 file changed, 114 insertions(+), 5 deletions(-) diff --git a/src/core/chunk/rule1_ingest.rs b/src/core/chunk/rule1_ingest.rs index 30644a9..b78b7e7 100644 --- a/src/core/chunk/rule1_ingest.rs +++ b/src/core/chunk/rule1_ingest.rs @@ -21,8 +21,8 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result .await? 
.context("Video not found")?; - if asr_segments.is_empty() { - info!("Rule 1: no ASR segments for video {}", file_uuid); + if asr_segments.is_empty() && ocr_map.is_empty() { + info!("Rule 1: no ASR segments or OCR for video {}", file_uuid); return Ok(0); } @@ -30,7 +30,11 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result let mut count = 0; let mut tx = pool.begin().await?; - for (idx, seg) in asr_segments.iter().enumerate() { + // Track which OCR frames were merged into ASRX segments + let mut merged_ocr_frames = std::collections::HashSet::new(); + + // Phase 1: Process ASRX segments (merge OCR where overlapping) + for seg in asr_segments.iter() { let ocr_text = collect_ocr_text(seg.start_frame, seg.end_frame, &ocr_map); let combined_text = if ocr_text.is_empty() { seg.text.clone() @@ -38,6 +42,13 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result format!("{} {}", seg.text, ocr_text) }; + // Track merged OCR frames + if !ocr_text.is_empty() { + for frame in seg.start_frame..=seg.end_frame { + merged_ocr_frames.insert(frame); + } + } + // Skip chunks with no text (empty ASRX and no OCR) if combined_text.trim().is_empty() { continue; @@ -78,11 +89,51 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result } } + // Phase 2: Create chunks for OCR-only text (not overlapping with ASRX) + let ocr_only_chunks = collect_ocr_only_chunks(&ocr_map, &merged_ocr_frames, fps); + let ocr_only_count = ocr_only_chunks.len(); + for (frame, ocr_text) in ocr_only_chunks { + if ocr_text.trim().is_empty() { + continue; + } + + let time = frame as f64 / fps; + let metadata = serde_json::json!({ + "language": "ocr", + }); + + let content = serde_json::json!({ + "text": "", + "ocr_text": ocr_text, + }); + + let chunk = Chunk::from_seconds( + file_id as i32, + file_uuid.to_string(), + format!("{}", count), + ChunkType::Sentence, + ChunkRule::Rule1, + time, + time + (1.0 / fps), + fps, + content, + 
) + .with_metadata(metadata) + .with_text_content(ocr_text); + + db.store_chunk_in_tx(&chunk, &mut tx).await?; + + count += 1; + } + tx.commit().await?; info!( - "Rule 1 completed: {} sentence chunks created for video {}", - count, file_uuid + "Rule 1 completed: {} sentence chunks created for video {} ({} ASRX + {} OCR-only)", + count, + file_uuid, + count - ocr_only_count, + ocr_only_count ); Ok(count) @@ -251,3 +302,61 @@ fn collect_ocr_text( parts.join(" ") } + +/// Collect OCR text that doesn't overlap with any ASRX segment +/// Returns vec of (frame, combined_ocr_text) for OCR-only chunks +fn collect_ocr_only_chunks( + ocr_map: &BTreeMap>, + merged_ocr_frames: &std::collections::HashSet, + _fps: f64, +) -> Vec<(i64, String)> { + let mut result = Vec::new(); + let mut current_frame: Option = None; + let mut current_texts: Vec = Vec::new(); + + for (frame, texts) in ocr_map.iter() { + // Skip frames already merged into ASRX segments + if merged_ocr_frames.contains(frame) { + continue; + } + + // Start a new group or continue existing group + if current_frame.is_none() { + current_frame = Some(*frame); + current_texts = texts.clone(); + } else { + // Group consecutive OCR frames (within 5 frames = ~0.2s at 24fps) + if *frame - current_frame.unwrap() <= 5 { + current_texts.extend(texts.clone()); + } else { + // Save previous group and start new one + if !current_texts.is_empty() { + let mut seen = std::collections::HashSet::new(); + let unique: Vec = current_texts + .iter() + .filter(|t| seen.insert((*t).clone())) + .cloned() + .collect(); + result.push((current_frame.unwrap(), unique.join(" "))); + } + current_frame = Some(*frame); + current_texts = texts.clone(); + } + } + } + + // Don't forget the last group + if let Some(frame) = current_frame { + if !current_texts.is_empty() { + let mut seen = std::collections::HashSet::new(); + let unique: Vec = current_texts + .iter() + .filter(|t| seen.insert((*t).clone())) + .cloned() + .collect(); + result.push((frame, 
unique.join(" "))); + } + } + + result +}