feat: ASR output frame numbers + rename start/end to start_time/end_time

- Python: asr_processor.py detects FPS from CUT/ffprobe (no fallback), outputs start_frame/end_frame
- Rust: All AsrSegment structs use start_time/end_time with #[serde(alias)] for backward compat
- store_asr_chunks: prefers ASR output frames, falls back to time-based conversion
- Added backward compatibility test for old JSON format (start/end)

Breaking change: ffprobe/CUT FPS failure now aborts instead of using default 24fps
This commit is contained in:
Accusys
2026-05-19 13:22:38 +08:00
parent 26725dcab7
commit 67ca846ccd
9 changed files with 572 additions and 68 deletions

View File

@@ -23,8 +23,10 @@ struct CutResult {
#[derive(Debug, Deserialize)]
struct AsrSegment {
start: f64,
end: f64,
#[serde(alias = "start")]
start_time: f64,
#[serde(alias = "end")]
end_time: f64,
text: String,
}
@@ -62,7 +64,7 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
let mut child_ids: Vec<String> = Vec::new();
for seg in &asr_segments {
if seg.start >= scene.start_time && seg.end <= scene.end_time {
if seg.start_time >= scene.start_time && seg.end_time <= scene.end_time {
scene_text.push_str(&seg.text);
scene_text.push(' ');
// We'll look up the chunk_id from Rule 1 later if needed,

View File

@@ -51,8 +51,8 @@ impl ChunkSplitter {
format!("{}", index),
ChunkType::Sentence,
ChunkRule::Rule1,
segment.start,
segment.end,
segment.start_time,
segment.end_time,
self.fps,
serde_json::json!({
"text": segment.text,
@@ -67,8 +67,8 @@ impl ChunkSplitter {
#[derive(Debug, Clone)]
pub struct AsrSegment {
pub start: f64,
pub end: f64,
pub start_time: f64,
pub end_time: f64,
pub text: String,
pub speaker_id: Option<String>,
}

View File

@@ -111,8 +111,8 @@ impl SyncDb {
"rule": "rule1",
"data": {
"text": segment.text,
"start": segment.start,
"end": segment.end,
"start_time": segment.start_time,
"end_time": segment.end_time,
},
});
let metadata = serde_json::json!({
@@ -132,8 +132,8 @@ impl SyncDb {
format!("{}", i),
ChunkType::Sentence,
ChunkRule::Rule1,
segment.start,
segment.end,
segment.start_time,
segment.end_time,
24.0, // fps
content,
)

View File

@@ -12,8 +12,12 @@ pub struct AsrResult {
#[derive(Debug, Serialize, Deserialize)]
pub struct AsrSegment {
pub start: f64,
pub end: f64,
#[serde(alias = "start")]
pub start_time: f64,
#[serde(alias = "end")]
pub end_time: f64,
pub start_frame: Option<i64>,
pub end_frame: Option<i64>,
pub text: String,
}
@@ -63,13 +67,17 @@ mod tests {
language_probability: Some(0.95),
segments: vec![
AsrSegment {
start: 0.0,
end: 2.5,
start_time: 0.0,
end_time: 2.5,
start_frame: Some(0),
end_frame: Some(60),
text: "Hello world".to_string(),
},
AsrSegment {
start: 2.5,
end: 5.0,
start_time: 2.5,
end_time: 5.0,
start_frame: Some(60),
end_frame: Some(120),
text: "Test speech".to_string(),
},
],
@@ -86,7 +94,7 @@ mod tests {
"language": "zh",
"language_probability": 0.98,
"segments": [
{"start": 0.0, "end": 1.5, "text": "測試"}
{"start_time": 0.0, "end_time": 1.5, "start_frame": 0, "end_frame": 36, "text": "測試"}
]
}"#;
@@ -100,12 +108,14 @@ mod tests {
#[test]
fn test_asr_segment_default() {
let segment = AsrSegment {
start: 0.0,
end: 1.0,
start_time: 0.0,
end_time: 1.0,
start_frame: Some(0),
end_frame: Some(24),
text: String::new(),
};
assert_eq!(segment.start, 0.0);
assert_eq!(segment.end, 1.0);
assert_eq!(segment.start_time, 0.0);
assert_eq!(segment.end_time, 1.0);
assert!(segment.text.is_empty());
}
@@ -119,4 +129,22 @@ mod tests {
assert!(result.language.is_none());
assert!(result.segments.is_empty());
}
#[test]
fn test_asr_backward_compat_old_format() {
// Old format uses "start" / "end" — should deserialize via #[serde(alias)]
let json = r#"{
"language": "en",
"segments": [
{"start": 10.0, "end": 12.5, "text": "Hello"}
]
}"#;
let result: AsrResult = serde_json::from_str(json).unwrap();
assert_eq!(result.segments.len(), 1);
assert_eq!(result.segments[0].start_time, 10.0);
assert_eq!(result.segments[0].end_time, 12.5);
assert_eq!(result.segments[0].text, "Hello");
assert!(result.segments[0].start_frame.is_none());
assert!(result.segments[0].end_frame.is_none());
}
}

View File

@@ -1,17 +1,57 @@
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::time::Duration;
use super::executor::PythonExecutor;
const STORY_TIMEOUT: Duration = Duration::from_secs(3600);
// ── Input data structs (from JSON files) ──────────────────────────
/// Top-level shape of the ASR JSON input read by `try_native_story`.
///
/// `segments` has no serde default, so a document without that key fails to deserialize.
#[derive(Debug, Deserialize)]
struct AsrData {
/// Speech segments, in file order; each one becomes an `"asr"` child chunk.
segments: Vec<AsrSegmentInput>,
}
/// One speech segment from the ASR JSON input.
///
/// Every field carries `#[serde(default)]`, so a missing key deserializes to `0.0` / an empty
/// string instead of failing.
#[derive(Debug, Deserialize)]
struct AsrSegmentInput {
/// Segment start; also accepted under the legacy key `start`.
#[serde(default, alias = "start")]
start_time: f64,
/// Segment end; also accepted under the legacy key `end`.
#[serde(default, alias = "end")]
end_time: f64,
/// Recognized text for the segment.
#[serde(default)]
text: String,
/// Confidence value copied verbatim into the child chunk's `content`.
#[serde(default)]
confidence: f64,
}
/// Top-level shape of the CUT (scene detection) JSON input read by `try_native_story`.
///
/// `scenes` has no serde default, so a document without that key fails to deserialize.
#[derive(Debug, Deserialize)]
struct CutData {
/// Detected scenes, in file order; each one becomes a `"cut"` child chunk.
scenes: Vec<CutSceneInput>,
}
/// One scene from the CUT JSON input.
///
/// All fields are optional; `try_native_story` substitutes `0` / `0.0` for the ones it reads
/// when they are absent.
#[derive(Debug, Deserialize)]
struct CutSceneInput {
/// Scene number used to build the child chunk id (`cut_<n>`).
scene_number: Option<i64>,
/// Parsed but not read anywhere in this module (hence the `dead_code` allowance).
#[allow(dead_code)]
start_frame: Option<i64>,
/// Parsed but not read anywhere in this module (hence the `dead_code` allowance).
#[allow(dead_code)]
end_frame: Option<i64>,
/// Scene start time, copied into the child chunk.
start_time: Option<f64>,
/// Scene end time, copied into the child chunk.
end_time: Option<f64>,
}
// ── Output data structs ───────────────────────────────────────────
/// Complete story output: child chunks, the parent chunks grouping them, and summary counts.
///
/// This is the type returned by `process_story`; it is serialized to the output file by the
/// native path and deserialized from the output file on the Python path.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct StoryResult {
/// Leaf chunks (one per ASR segment / CUT scene on the native path).
pub child_chunks: Vec<StoryChildChunk>,
/// Story chunks, each referencing a batch of child chunks by id.
pub parent_chunks: Vec<StoryParentChunk>,
/// Aggregate counts for the chunks above.
pub stats: StoryStats,
/// Free-form metadata; defaults when the key is missing from the JSON.
#[serde(default)]
pub metadata: serde_json::Value,
/// Number of children grouped under one parent; defaults to 0 when missing from the JSON.
#[serde(default)]
pub parent_chunk_size: usize,
}
@@ -30,8 +70,10 @@ pub struct StoryChildChunk {
pub source: String,
pub start_time: f64,
pub end_time: f64,
#[serde(skip_serializing_if = "Option::is_none")]
pub text_content: Option<String>,
pub content: serde_json::Value,
#[serde(default)]
pub child_chunk_ids: Vec<String>,
pub parent_chunk_id: Option<String>,
}
@@ -45,22 +87,30 @@ pub struct StoryParentChunk {
pub end_time: f64,
pub text_content: String,
pub content: serde_json::Value,
#[serde(default)]
pub child_chunk_ids: Vec<String>,
pub parent_chunk_id: Option<String>,
}
// ── Public API ────────────────────────────────────────────────────
pub async fn process_story(
video_path: &str,
output_path: &str,
uuid: Option<&str>,
) -> Result<StoryResult> {
// Try native Rust implementation first
let result = try_native_story(video_path, output_path, uuid);
if let Ok(r) = result {
return Ok(r);
}
// Fallback: Python script
tracing::warn!("[STORY] Native impl failed, falling back to Python: {:?}", result.err());
let executor = PythonExecutor::new()?;
let script_path = executor.script_path("story_processor.py");
tracing::info!("[STORY] Starting story generation: {}", video_path);
if !script_path.exists() {
tracing::warn!("[STORY] Script not found, returning empty result");
return Ok(StoryResult {
child_chunks: vec![],
parent_chunks: vec![],
@@ -87,23 +137,311 @@ pub async fn process_story(
.with_context(|| format!("Failed to run {:?}", script_path))?;
let json_str = std::fs::read_to_string(output_path).context("Failed to read STORY output")?;
let result: StoryResult =
serde_json::from_str(&json_str).context("Failed to parse STORY output")?;
tracing::info!(
"[STORY] Result: {} parent chunks, {} child chunks",
result.stats.total_parent_chunks,
result.stats.total_child_chunks
);
Ok(result)
}
// ── Native implementation ─────────────────────────────────────────
/// Builds the story result natively from the ASR / CUT JSON files next to `output_path`.
///
/// The base name is the part of `output_path`'s file stem before the first `.`; the inputs are
/// `<base>.asr.json` and `<base>.cut.json` in the same directory. A missing input file is
/// treated as empty input, whereas an unreadable or unparsable file is an error.
///
/// Every ASR segment and every CUT scene becomes one child chunk (ASR children first, then CUT
/// children, each in input order). Children of each source are grouped in order into parents
/// of up to five children, and every child records the id of the parent that contains it.
/// The result is also written to `output_path` as pretty-printed JSON.
///
/// `_video_path` and `_uuid` are currently unused.
///
/// # Errors
/// Returns an error if an existing input file cannot be read or parsed, or if the result
/// cannot be serialized or written to `output_path`.
fn try_native_story(_video_path: &str, output_path: &str, _uuid: Option<&str>) -> Result<StoryResult> {
    let output_dir = Path::new(output_path).parent().unwrap_or(Path::new("."));
    let basename = Path::new(output_path)
        .file_stem()
        .and_then(|s| s.to_str())
        .and_then(|s| s.split('.').next())
        .unwrap_or("unknown");

    let asr_path = output_dir.join(format!("{}.asr.json", basename));
    let cut_path = output_dir.join(format!("{}.cut.json", basename));

    // Both inputs are optional: an absent file simply contributes no chunks.
    let asr_data: AsrData = if asr_path.exists() {
        let content = std::fs::read_to_string(&asr_path)
            .with_context(|| format!("Failed to read {:?}", asr_path))?;
        serde_json::from_str(&content)
            .with_context(|| format!("Failed to parse {:?}", asr_path))?
    } else {
        AsrData { segments: vec![] }
    };

    let cut_data: CutData = if cut_path.exists() {
        let content = std::fs::read_to_string(&cut_path)
            .with_context(|| format!("Failed to read {:?}", cut_path))?;
        serde_json::from_str(&content)
            .with_context(|| format!("Failed to parse {:?}", cut_path))?
    } else {
        CutData { scenes: vec![] }
    };

    let parent_chunk_size: usize = 5;
    let asr_count = asr_data.segments.len();
    let cut_count = cut_data.scenes.len();

    // ── Build child chunks ────────────────────────────────────────
    // Layout invariant used below: indices [0, asr_count) are ASR children,
    // indices [asr_count, asr_count + cut_count) are CUT children.
    let mut child_chunks: Vec<StoryChildChunk> = Vec::with_capacity(asr_count + cut_count);

    for seg in &asr_data.segments {
        child_chunks.push(StoryChildChunk {
            chunk_id: format!("asr_{:.1}_{:.1}", seg.start_time, seg.end_time),
            chunk_type: "asr".to_string(),
            source: "asr".to_string(),
            start_time: seg.start_time,
            end_time: seg.end_time,
            text_content: Some(seg.text.clone()),
            content: serde_json::json!({
                "text": seg.text,
                "confidence": seg.confidence,
            }),
            child_chunk_ids: vec![],
            parent_chunk_id: None,
        });
    }

    for scene in &cut_data.scenes {
        let scene_num = scene.scene_number.unwrap_or(0);
        let start_time = scene.start_time.unwrap_or(0.0);
        let end_time = scene.end_time.unwrap_or(0.0);
        child_chunks.push(StoryChildChunk {
            chunk_id: format!("cut_{}", scene_num),
            chunk_type: "cut".to_string(),
            source: "cut".to_string(),
            start_time,
            end_time,
            text_content: Some(format!("Scene {}", scene_num)),
            content: serde_json::json!({
                "scene_number": scene_num,
                "start_time": start_time,
                "end_time": end_time,
            }),
            child_chunk_ids: vec![],
            parent_chunk_id: None,
        });
    }

    // ── Build parent chunks ───────────────────────────────────────
    // Batches are taken directly from the child slice instead of re-finding children by id:
    // ids are not guaranteed unique (segments can share rounded times, scenes without a
    // number all become `cut_0`), and an id lookup would then pick up the wrong child.
    let mut parent_chunks: Vec<StoryParentChunk> = Vec::new();
    let (asr_children, cut_children) = child_chunks.split_at_mut(asr_count);

    for (i, batch) in asr_children.chunks_mut(parent_chunk_size).enumerate() {
        let texts: Vec<String> = batch.iter().filter_map(|c| c.text_content.clone()).collect();
        let child_ids: Vec<String> = batch.iter().map(|c| c.chunk_id.clone()).collect();
        let start_time = batch.first().map(|c| c.start_time).unwrap_or(0.0);
        let end_time = batch.last().map(|c| c.end_time).unwrap_or(0.0);

        let narrative = generate_narrative(&texts, &[], start_time, end_time);
        let chunk_id = format!("story_asr_{:04}", i);
        let content = serde_json::json!({
            "description": narrative,
            "child_count": batch.len(),
            "speech_preview": texts.iter().take(3).cloned().collect::<Vec<_>>().join(" "),
        });

        parent_chunks.push(StoryParentChunk {
            chunk_id: chunk_id.clone(),
            chunk_type: "story".to_string(),
            source: "story_asr".to_string(),
            start_time,
            end_time,
            text_content: narrative,
            content,
            child_chunk_ids: child_ids,
            parent_chunk_id: None,
        });

        // Link children to parent
        for child in batch.iter_mut() {
            child.parent_chunk_id = Some(chunk_id.clone());
        }
    }

    for (i, batch) in cut_children.chunks_mut(parent_chunk_size).enumerate() {
        let child_ids: Vec<String> = batch.iter().map(|c| c.chunk_id.clone()).collect();
        let start_time = batch.first().map(|c| c.start_time).unwrap_or(0.0);
        let end_time = batch.last().map(|c| c.end_time).unwrap_or(0.0);

        let narrative = generate_scene_narrative(&[], start_time, end_time, batch.len());
        let chunk_id = format!("story_cut_{:04}", i);
        let content = serde_json::json!({
            "description": narrative,
            "child_count": batch.len(),
            "scenes": child_ids,
        });

        parent_chunks.push(StoryParentChunk {
            chunk_id: chunk_id.clone(),
            chunk_type: "story".to_string(),
            source: "story_cut".to_string(),
            start_time,
            end_time,
            text_content: narrative,
            content,
            child_chunk_ids: child_ids,
            parent_chunk_id: None,
        });

        for child in batch.iter_mut() {
            child.parent_chunk_id = Some(chunk_id.clone());
        }
    }

    // ── Build result ──────────────────────────────────────────────
    let total_parent = parent_chunks.len();
    let result = StoryResult {
        child_chunks,
        parent_chunks,
        stats: StoryStats {
            total_child_chunks: asr_count + cut_count,
            total_parent_chunks: total_parent,
            asr_children: asr_count,
            cut_children: cut_count,
        },
        metadata: serde_json::json!({}),
        parent_chunk_size,
    };

    // Write output (for compatibility with Python path)
    let json_str = serde_json::to_string_pretty(&result)?;
    std::fs::write(output_path, &json_str)
        .with_context(|| format!("Failed to write {:?}", output_path))?;

    Ok(result)
}
// ── Narrative generation (matching Python logic) ──────────────────
/// Builds a one-line description of a time range from its speech texts and detected objects.
///
/// * With neither texts nor objects, returns `"Video segment from {start}s to {end}s"`
///   (one decimal place).
/// * Otherwise returns `"[{start}s-{end}s] <parts>"` (no decimals), where the parts are
///   joined with `" | "`:
///   * `Speech: ...` — the texts joined with spaces, cut to the first 150 characters and
///     suffixed with `...` when longer;
///   * `Visuals: ...` — up to five distinct object names in sorted order, comma-separated.
fn generate_narrative(texts: &[String], objects: &[String], start: f64, end: f64) -> String {
    /// Maximum number of speech characters kept before the `...` suffix.
    const MAX_SPEECH_CHARS: usize = 150;

    if texts.is_empty() && objects.is_empty() {
        return format!("Video segment from {:.1}s to {:.1}s", start, end);
    }

    let mut parts: Vec<String> = Vec::with_capacity(2);

    if !texts.is_empty() {
        let combined = texts.join(" ");
        // Cut on a character boundary: slicing at a fixed byte offset panics when that offset
        // lands inside a multi-byte character (e.g. CJK transcripts). `nth(MAX)` yields the
        // byte offset of the first character past the limit, if there is one.
        let truncated = match combined.char_indices().nth(MAX_SPEECH_CHARS) {
            Some((cut, _)) => format!("{}...", &combined[..cut]),
            None => combined,
        };
        parts.push(format!("Speech: {}", truncated));
    }

    if !objects.is_empty() {
        let mut unique: Vec<&String> = objects.iter().collect();
        unique.sort();
        unique.dedup();
        let objs = unique
            .iter()
            .take(5)
            .map(|s| s.as_str())
            .collect::<Vec<_>>()
            .join(", ");
        parts.push(format!("Visuals: {}", objs));
    }

    format!("[{:.0}s-{:.0}s] {}", start, end, parts.join(" | "))
}
/// Describes a batch of scenes covering `start`..`end` (seconds, printed without decimals).
///
/// When `objects` is non-empty the sentence lists up to five distinct object names in sorted
/// order; otherwise it only reports the scene count.
fn generate_scene_narrative(objects: &[String], start: f64, end: f64, scene_count: usize) -> String {
    // Sorted, de-duplicated object names, capped at five.
    let mut labels: Vec<&str> = objects.iter().map(String::as_str).collect();
    labels.sort();
    labels.dedup();
    labels.truncate(5);

    if labels.is_empty() {
        return format!("[{:.0}s-{:.0}s] {} video scenes.", start, end, scene_count);
    }

    let listed = labels.join(", ");
    format!("[{:.0}s-{:.0}s] {} scenes. Visuals: {}.", start, end, scene_count, listed)
}
// ── Tests ─────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_generate_narrative_with_text() {
    // One speech line plus one object must produce the time prefix and both sections.
    let speech = ["Hello world".to_string()];
    let visuals = ["person".to_string()];

    let narrative = generate_narrative(&speech, &visuals, 0.0, 5.0);

    for fragment in ["[0s-5s]", "Speech:", "Visuals:"] {
        assert!(narrative.contains(fragment));
    }
}
#[test]
fn test_generate_narrative_empty() {
    // With no speech and no objects the generic fallback sentence is used.
    let nothing: &[String] = &[];
    let narrative = generate_narrative(nothing, nothing, 10.0, 20.0);
    assert!(narrative.contains("10.0s to 20.0s"));
}
#[test]
fn test_generate_scene_narrative() {
    // A single detected object must show up next to the scene count.
    let visuals = vec!["person".to_string()];
    let narrative = generate_scene_narrative(&visuals, 0.0, 10.0, 3);

    for fragment in ["3 scenes", "person"] {
        assert!(narrative.contains(fragment));
    }
}
#[test]
fn test_generate_scene_narrative_empty() {
    // Without objects only the scene count is reported.
    let no_objects: &[String] = &[];
    let narrative = generate_scene_narrative(no_objects, 0.0, 10.0, 1);
    assert!(narrative.contains("1 video scenes"));
}
#[test]
fn test_narrative_truncation() {
    // 200 characters of speech exceed the preview limit and must be shortened.
    let speech = vec!["a".repeat(200)];
    let narrative = generate_narrative(&speech, &[], 0.0, 5.0);

    assert!(narrative.ends_with("..."));
    // Shortened text plus prefix stays well below the untruncated length.
    assert!(narrative.len() < 200 + 50);
}
#[test]
fn test_story_result_serialization() {
let result = StoryResult {
@@ -187,9 +525,6 @@ mod tests {
assert_eq!(result.child_chunks.len(), 1);
assert_eq!(result.parent_chunks.len(), 1);
assert_eq!(result.stats.total_child_chunks, 1);
assert_eq!(result.stats.total_parent_chunks, 1);
assert_eq!(result.parent_chunks[0].child_chunk_ids[0], "asr_0001");
assert_eq!(result.child_chunks[0].parent_chunk_id, None);
}
#[test]
@@ -241,10 +576,89 @@ mod tests {
};
assert_eq!(result.parent_chunks[0].child_chunk_ids.len(), 2);
assert!(result
.child_chunks
.iter()
.all(|c| c.parent_chunk_id.is_some()));
assert!(result.child_chunks.iter().all(|c| c.parent_chunk_id.is_some()));
assert!(result.parent_chunks[0].parent_chunk_id.is_none());
}
#[test]
fn test_native_story_empty_data() {
    // Both input files exist but hold no segments / scenes: the result must be empty too.
    let workdir = std::env::temp_dir().join("story_test_empty");
    let _ = std::fs::create_dir_all(&workdir);

    let stem = "test_video";
    std::fs::write(
        workdir.join(format!("{}.asr.json", stem)),
        r#"{"segments":[]}"#,
    )
    .unwrap();
    std::fs::write(
        workdir.join(format!("{}.cut.json", stem)),
        r#"{"scenes":[]}"#,
    )
    .unwrap();

    let story_path = workdir.join(format!("{}.story.json", stem));
    let story = try_native_story("/dummy.mp4", story_path.to_str().unwrap(), None).unwrap();

    let stats = &story.stats;
    assert_eq!(stats.total_child_chunks, 0);
    assert_eq!(stats.total_parent_chunks, 0);

    let _ = std::fs::remove_dir_all(&workdir);
}
/// End-to-end check of `try_native_story` with three ASR segments and two CUT scenes:
/// verifies the counts, the parent grouping, the ASR child-to-parent links and the written file.
#[test]
fn test_native_story_with_data() {
// Fixed directory under the system temp dir; removed again at the end of the test.
let dir = std::env::temp_dir().join("story_test_data");
let _ = std::fs::create_dir_all(&dir);
let basename = "test_video";
let asr_path = dir.join(format!("{}.asr.json", basename));
let cut_path = dir.join(format!("{}.cut.json", basename));
let out_path = dir.join(format!("{}.story.json", basename));
// The ASR fixture uses the legacy `start` / `end` keys, so this also exercises the serde
// aliases on `AsrSegmentInput`.
std::fs::write(&asr_path, r#"{
"segments": [
{"start": 0.0, "end": 2.5, "text": "Hello", "confidence": 0.95},
{"start": 2.5, "end": 5.0, "text": "World", "confidence": 0.92},
{"start": 5.0, "end": 7.5, "text": "Foo", "confidence": 0.90}
]
}"#).unwrap();
std::fs::write(&cut_path, r#"{
"scenes": [
{"scene_number": 1, "start_frame": 0, "end_frame": 150, "start_time": 0.0, "end_time": 5.0},
{"scene_number": 2, "start_frame": 150, "end_frame": 300, "start_time": 5.0, "end_time": 10.0}
]
}"#).unwrap();
// The video path is a placeholder; `try_native_story` does not read it.
let result = try_native_story(
"/dummy.mp4",
out_path.to_str().unwrap(),
None,
).unwrap();
assert_eq!(result.stats.asr_children, 3);
assert_eq!(result.stats.cut_children, 2);
assert_eq!(result.stats.total_child_chunks, 5);
// 3 ASR segments, parent_chunk_size=5 → 1 parent
// 2 CUT scenes, parent_chunk_size=5 → 1 parent
assert_eq!(result.stats.total_parent_chunks, 2);
// Verify child-parent linking
// (only ASR children are checked here; CUT links are not asserted by this test)
for child in &result.child_chunks {
if child.source == "asr" {
assert!(child.parent_chunk_id.is_some());
assert!(child.parent_chunk_id.as_ref().unwrap().starts_with("story_asr_"));
}
}
// Verify output file was written
assert!(out_path.exists());
let content = std::fs::read_to_string(&out_path).unwrap();
assert!(content.contains("Hello"));
assert!(content.contains("World"));
let _ = std::fs::remove_dir_all(&dir);
}
}