feat: Rule 1 now creates chunks for OCR-only text
Previously, Rule 1 only created chunks from ASRX segments, merging OCR text where frame ranges overlapped. OCR text that didn't overlap with any ASRX segment was ignored.

Now Rule 1 has two phases:
1. Process ASRX segments (merge OCR where overlapping) - existing behavior.
2. Create chunks for OCR-only text (frames not covered by ASRX).

OCR-only frames are grouped when they fall within 5 frames of the first frame of the current group, to avoid creating too many single-frame chunks.

Example: 819 ASRX chunks + 4 OCR-only chunks = 823 sentence chunks.
This commit is contained in:
@@ -21,8 +21,8 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
|
|||||||
.await?
|
.await?
|
||||||
.context("Video not found")?;
|
.context("Video not found")?;
|
||||||
|
|
||||||
if asr_segments.is_empty() {
|
if asr_segments.is_empty() && ocr_map.is_empty() {
|
||||||
info!("Rule 1: no ASR segments for video {}", file_uuid);
|
info!("Rule 1: no ASR segments or OCR for video {}", file_uuid);
|
||||||
return Ok(0);
|
return Ok(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -30,7 +30,11 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
|
|||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let mut tx = pool.begin().await?;
|
let mut tx = pool.begin().await?;
|
||||||
|
|
||||||
for (idx, seg) in asr_segments.iter().enumerate() {
|
// Track which OCR frames were merged into ASRX segments
|
||||||
|
let mut merged_ocr_frames = std::collections::HashSet::new();
|
||||||
|
|
||||||
|
// Phase 1: Process ASRX segments (merge OCR where overlapping)
|
||||||
|
for seg in asr_segments.iter() {
|
||||||
let ocr_text = collect_ocr_text(seg.start_frame, seg.end_frame, &ocr_map);
|
let ocr_text = collect_ocr_text(seg.start_frame, seg.end_frame, &ocr_map);
|
||||||
let combined_text = if ocr_text.is_empty() {
|
let combined_text = if ocr_text.is_empty() {
|
||||||
seg.text.clone()
|
seg.text.clone()
|
||||||
@@ -38,6 +42,13 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
|
|||||||
format!("{} {}", seg.text, ocr_text)
|
format!("{} {}", seg.text, ocr_text)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Track merged OCR frames
|
||||||
|
if !ocr_text.is_empty() {
|
||||||
|
for frame in seg.start_frame..=seg.end_frame {
|
||||||
|
merged_ocr_frames.insert(frame);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Skip chunks with no text (empty ASRX and no OCR)
|
// Skip chunks with no text (empty ASRX and no OCR)
|
||||||
if combined_text.trim().is_empty() {
|
if combined_text.trim().is_empty() {
|
||||||
continue;
|
continue;
|
||||||
@@ -78,11 +89,51 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 2: Create chunks for OCR-only text (not overlapping with ASRX)
|
||||||
|
let ocr_only_chunks = collect_ocr_only_chunks(&ocr_map, &merged_ocr_frames, fps);
|
||||||
|
let ocr_only_count = ocr_only_chunks.len();
|
||||||
|
for (frame, ocr_text) in ocr_only_chunks {
|
||||||
|
if ocr_text.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let time = frame as f64 / fps;
|
||||||
|
let metadata = serde_json::json!({
|
||||||
|
"language": "ocr",
|
||||||
|
});
|
||||||
|
|
||||||
|
let content = serde_json::json!({
|
||||||
|
"text": "",
|
||||||
|
"ocr_text": ocr_text,
|
||||||
|
});
|
||||||
|
|
||||||
|
let chunk = Chunk::from_seconds(
|
||||||
|
file_id as i32,
|
||||||
|
file_uuid.to_string(),
|
||||||
|
format!("{}", count),
|
||||||
|
ChunkType::Sentence,
|
||||||
|
ChunkRule::Rule1,
|
||||||
|
time,
|
||||||
|
time + (1.0 / fps),
|
||||||
|
fps,
|
||||||
|
content,
|
||||||
|
)
|
||||||
|
.with_metadata(metadata)
|
||||||
|
.with_text_content(ocr_text);
|
||||||
|
|
||||||
|
db.store_chunk_in_tx(&chunk, &mut tx).await?;
|
||||||
|
|
||||||
|
count += 1;
|
||||||
|
}
|
||||||
|
|
||||||
tx.commit().await?;
|
tx.commit().await?;
|
||||||
|
|
||||||
info!(
|
info!(
|
||||||
"Rule 1 completed: {} sentence chunks created for video {}",
|
"Rule 1 completed: {} sentence chunks created for video {} ({} ASRX + {} OCR-only)",
|
||||||
count, file_uuid
|
count,
|
||||||
|
file_uuid,
|
||||||
|
count - ocr_only_count,
|
||||||
|
ocr_only_count
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok(count)
|
Ok(count)
|
||||||
@@ -251,3 +302,61 @@ fn collect_ocr_text(
|
|||||||
|
|
||||||
parts.join(" ")
|
parts.join(" ")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Collect OCR text that doesn't overlap with any ASRX segment.
///
/// Walks `ocr_map` in ascending frame order, skips every frame listed in
/// `merged_ocr_frames`, and groups the remaining frames: a frame joins the
/// current group while it lies at most 5 frames after the group's *first*
/// frame; otherwise the current group is closed and a new one starts.
///
/// Within a group, duplicate strings are dropped (first occurrence wins) and
/// the survivors are joined with a single space.
///
/// Groups whose joined text is empty or whitespace-only are not returned, so
/// the length of the result equals the number of chunks that carry real text.
///
/// # Arguments
/// * `ocr_map` - OCR strings keyed by frame number.
/// * `merged_ocr_frames` - frames already consumed by ASRX segments.
/// * `_fps` - currently unused; kept so the signature stays unchanged.
///
/// # Returns
/// A vec of `(first_frame_of_group, combined_ocr_text)` in ascending frame
/// order.
fn collect_ocr_only_chunks(
    ocr_map: &BTreeMap<i64, Vec<String>>,
    merged_ocr_frames: &std::collections::HashSet<i64>,
    _fps: f64,
) -> Vec<(i64, String)> {
    /// Maximum distance, in frames, between a group's first frame and any
    /// later frame that may still be added to that group.
    const MAX_GROUP_SPAN_FRAMES: i64 = 5;

    /// Deduplicate `texts` (keeping first occurrences), join them with a
    /// space, and push the result unless it contains no visible text.
    fn flush_group(start: i64, texts: &[&str], out: &mut Vec<(i64, String)>) {
        let mut seen = std::collections::HashSet::new();
        let unique: Vec<&str> = texts
            .iter()
            .copied()
            // `insert` returns false for a value already present.
            .filter(|t| seen.insert(*t))
            .collect();
        let joined = unique.join(" ");
        if !joined.trim().is_empty() {
            out.push((start, joined));
        }
    }

    let mut result = Vec::new();
    let mut group_start: Option<i64> = None;
    // Borrowed views into `ocr_map`; nothing is cloned until the final join.
    let mut pending: Vec<&str> = Vec::new();

    for (&frame, texts) in ocr_map {
        // Skip frames already merged into ASRX segments.
        if merged_ocr_frames.contains(&frame) {
            continue;
        }

        match group_start {
            // Still inside the window anchored at the group's first frame.
            Some(start) if frame - start <= MAX_GROUP_SPAN_FRAMES => {}
            // No open group, or the window was exceeded: close and restart.
            _ => {
                if let Some(start) = group_start {
                    flush_group(start, &pending, &mut result);
                }
                pending.clear();
                group_start = Some(frame);
            }
        }
        pending.extend(texts.iter().map(String::as_str));
    }

    // Emit the group that was still open when the map ran out.
    if let Some(start) = group_start {
        flush_group(start, &pending, &mut result);
    }

    result
}
|
||||||
|
|||||||
Reference in New Issue
Block a user