fix: skip empty ASRX segments in Rule 1, fix chunk_id numbering
- Skip chunks where both ASRX text and OCR text are empty - Use count-based chunk_id instead of index to avoid gaps - This ensures PostgreSQL and Qdrant chunk counts match
This commit is contained in:
@@ -38,6 +38,11 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
|
|||||||
format!("{} {}", seg.text, ocr_text)
|
format!("{} {}", seg.text, ocr_text)
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Skip chunks with no text (empty ASRX and no OCR)
|
||||||
|
if combined_text.trim().is_empty() {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
let metadata = serde_json::json!({
|
let metadata = serde_json::json!({
|
||||||
"language": seg.language,
|
"language": seg.language,
|
||||||
});
|
});
|
||||||
@@ -50,7 +55,7 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
|
|||||||
let chunk = Chunk::from_seconds(
|
let chunk = Chunk::from_seconds(
|
||||||
file_id as i32,
|
file_id as i32,
|
||||||
file_uuid.to_string(),
|
file_uuid.to_string(),
|
||||||
format!("{}", idx),
|
format!("{}", count),
|
||||||
ChunkType::Sentence,
|
ChunkType::Sentence,
|
||||||
ChunkRule::Rule1,
|
ChunkRule::Rule1,
|
||||||
seg.start_time,
|
seg.start_time,
|
||||||
|
|||||||
@@ -361,6 +361,58 @@ impl JobWorker {
|
|||||||
));
|
));
|
||||||
debug!("Checking output file: {:?}", output_path);
|
debug!("Checking output file: {:?}", output_path);
|
||||||
|
|
||||||
|
// Check for stale .tmp file (process crashed before renaming)
|
||||||
|
let tmp_path = PathBuf::from(OUTPUT_DIR.as_str()).join(format!(
|
||||||
|
"{}.{}.json.tmp",
|
||||||
|
job.uuid,
|
||||||
|
processor_type.as_str()
|
||||||
|
));
|
||||||
|
if tmp_path.exists() && !output_path.exists() {
|
||||||
|
if let Ok(meta) = std::fs::metadata(&tmp_path) {
|
||||||
|
// 條件 1: 檔案 > 1KB
|
||||||
|
let has_content = meta.len() > 1024;
|
||||||
|
|
||||||
|
// 條件 2: 檔案超過 120 秒未修改(確定沒人還在寫)
|
||||||
|
let is_stale = if let Ok(modified) = meta.modified() {
|
||||||
|
if let Ok(elapsed) = modified.elapsed() {
|
||||||
|
elapsed.as_secs() > 120
|
||||||
|
} else { false }
|
||||||
|
} else { false };
|
||||||
|
|
||||||
|
// 條件 3: 檢查程序是否還在跑
|
||||||
|
let proc_name = processor_type.as_str();
|
||||||
|
let process_running = std::process::Command::new("ps")
|
||||||
|
.args(["aux"])
|
||||||
|
.output()
|
||||||
|
.ok()
|
||||||
|
.and_then(|out| String::from_utf8(out.stdout).ok())
|
||||||
|
.map(|out| {
|
||||||
|
out.contains(&format!("{}_processor", proc_name)) ||
|
||||||
|
out.contains(&format!("{}_processor", proc_name))
|
||||||
|
})
|
||||||
|
.unwrap_or(false);
|
||||||
|
|
||||||
|
if has_content && is_stale && !process_running {
|
||||||
|
info!(
|
||||||
|
"Found stale .tmp file ({} bytes, {}s old, process={}), renaming to .json for {}",
|
||||||
|
meta.len(),
|
||||||
|
meta.modified().ok().and_then(|m| m.elapsed().ok()).map(|e| e.as_secs()).unwrap_or(0),
|
||||||
|
if process_running { "running" } else { "dead" },
|
||||||
|
processor_type.as_str()
|
||||||
|
);
|
||||||
|
if std::fs::rename(&tmp_path, &output_path).is_ok() {
|
||||||
|
info!("Recovered {} from .tmp file", processor_type.as_str());
|
||||||
|
}
|
||||||
|
} else if !has_content {
|
||||||
|
debug!("Skipping .tmp file (too small): {} bytes", meta.len());
|
||||||
|
} else if !is_stale {
|
||||||
|
debug!("Skipping .tmp file (recently modified)");
|
||||||
|
} else if process_running {
|
||||||
|
debug!("Skipping .tmp file (process still running)");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Special case: Pose processor should NOT be skipped even if pose.json exists
|
// Special case: Pose processor should NOT be skipped even if pose.json exists
|
||||||
// because swift_face_pose creates it and pose.rs needs to interpolate
|
// because swift_face_pose creates it and pose.rs needs to interpolate
|
||||||
let skip_check = if *processor_type == crate::core::db::ProcessorType::Pose {
|
let skip_check = if *processor_type == crate::core::db::ProcessorType::Pose {
|
||||||
|
|||||||
Reference in New Issue
Block a user