use anyhow::{Context, Result}; use clap::Parser; use crossterm::event::{self, Event, KeyCode, KeyModifiers}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::io::{self, Write}; use std::path::PathBuf; use std::process::{Child, Command, Stdio}; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; use std::thread; use std::time::Duration; #[derive(Parser, Debug)] #[command(name = "integrated_player")] #[command(about = "Integrated player for ASR, Face, ASRX, and Pose")] struct Args { #[arg(short, long)] video: PathBuf, #[arg(short = 'r', long)] asr: Option, #[arg(short = 'f', long)] face: Option, #[arg(short = 'x', long)] asrx: Option, #[arg(short = 'p', long)] pose: Option, #[arg(short = 's', long, default_value = "0.0")] start: f64, #[arg(long)] speaker_name: Option, #[arg(long)] auto_play_speaker: bool, #[arg(long)] demo: bool, #[arg(long, default_value = "3")] demo_segments_per_speaker: usize, #[arg(long, default_value = "2.0")] demo_speed: f64, #[arg(long)] show_video: bool, #[arg(long, default_value = "800")] video_width: u32, #[arg(long, default_value = "600")] video_height: u32, #[arg(long)] continuous_demo: bool, } #[derive(Debug, Clone, Serialize, Deserialize)] struct AsrSegment { start: f64, end: f64, text: String, } #[derive(Debug, Clone, Serialize, Deserialize)] struct AsrData { language: Option, segments: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] struct FaceDetection { frame: u64, timestamp: f64, x: i32, y: i32, width: i32, height: i32, confidence: f64, } #[derive(Debug, Clone, Serialize, Deserialize)] struct FaceResult { results: FaceResults, } #[derive(Debug, Clone, Serialize, Deserialize)] struct FaceResults { detections: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] struct AsrxSegment { index: usize, start: f64, end: f64, duration: f64, speaker: String, } #[derive(Debug, Clone, Serialize, Deserialize)] struct AsrxData { segments: Vec, speaker_stats: HashMap, } #[derive(Debug, Clone, Serialize, Deserialize)] struct SpeakerStats { count: usize, duration: f64, } #[derive(Debug, Clone, Serialize, Deserialize)] struct Keypoint { name: String, x: f32, y: f32, confidence: f32, } #[derive(Debug, Clone, Serialize, Deserialize)] struct PersonPose { keypoints: Vec, bbox: Bbox, } #[derive(Debug, Clone, Serialize, Deserialize)] struct Bbox { x: i32, y: i32, width: i32, height: i32, } #[derive(Debug, Clone, Serialize, Deserialize)] struct PoseFrame { frame: u64, timestamp: f64, persons: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] struct PoseData { frames: Vec, } #[derive(Debug, Clone)] struct IntegratedSegment { start: f64, end: f64, text: Option, speaker: Option, face: Option, mouth_landmarks: Option>, } struct IntegratedPlayer { asr_data: Option, face_data: Option, asrx_data: Option, pose_data: Option, current_time: f64, is_playing: bool, speaker_names: HashMap, } impl IntegratedPlayer { fn new() -> Self { let mut speaker_names = HashMap::new(); speaker_names.insert( "SPEAKER_0".to_string(), ("Cary Grant".to_string(), "Peter Joshua".to_string()), ); speaker_names.insert( "SPEAKER_1".to_string(), ("Audrey Hepburn".to_string(), "Regina Lampert".to_string()), ); speaker_names.insert( "SPEAKER_2".to_string(), ( "Walter Matthau".to_string(), "Hamilton Bartholomew".to_string(), ), ); speaker_names.insert( "SPEAKER_4".to_string(), ("James Coburn".to_string(), "Tex Panthollow".to_string()), ); Self { asr_data: None, face_data: None, asrx_data: None, pose_data: None, current_time: 0.0, is_playing: false, speaker_names, } } fn load_asr(&mut self, path: &PathBuf) -> Result<()> { let content = std::fs::read_to_string(path) .with_context(|| format!("Failed to read ASR file: {:?}", path))?; self.asr_data = Some(serde_json::from_str(&content)?); println!( "✓ Loaded {} ASR segments", self.asr_data.as_ref().unwrap().segments.len() ); Ok(()) } fn load_face(&mut self, path: &PathBuf) -> Result<()> { let content = std::fs::read_to_string(path) .with_context(|| format!("Failed to read Face file: {:?}", path))?; self.face_data = Some(serde_json::from_str(&content)?); println!( "✓ Loaded {} face detections", self.face_data.as_ref().unwrap().results.detections.len() ); Ok(()) } fn load_asrx(&mut self, path: &PathBuf) -> Result<()> { let content = std::fs::read_to_string(path) .with_context(|| format!("Failed to read ASRX file: {:?}", path))?; self.asrx_data = Some(serde_json::from_str(&content)?); println!( "✓ Loaded {} ASRX segments, {} speakers", self.asrx_data.as_ref().unwrap().segments.len(), self.asrx_data.as_ref().unwrap().speaker_stats.len() ); Ok(()) } fn load_pose(&mut self, path: &PathBuf) -> Result<()> { let content = std::fs::read_to_string(path) .with_context(|| format!("Failed to read Pose file: {:?}", path))?; self.pose_data = Some(serde_json::from_str(&content)?); println!( "✓ Loaded {} pose frames", self.pose_data.as_ref().unwrap().frames.len() ); Ok(()) } fn get_current_segment(&self, time: f64) -> Option { let mut segment = IntegratedSegment { start: 0.0, end: 0.0, text: None, speaker: None, face: None, mouth_landmarks: None, }; if let Some(asr) = &self.asr_data { for seg in &asr.segments { if time >= seg.start && time <= seg.end { segment.start = seg.start; segment.end = seg.end; segment.text = Some(seg.text.clone()); break; } } } if let Some(asrx) = &self.asrx_data { for seg in &asrx.segments { if time >= seg.start && time <= seg.end { segment.start = seg.start; segment.end = seg.end; segment.speaker = Some(seg.speaker.clone()); break; } } } if let Some(face) = &self.face_data { for det in &face.results.detections { if (det.timestamp - time).abs() < 1.0 { segment.face = Some(det.clone()); break; } } } if let Some(pose) = &self.pose_data { for frame in &pose.frames { if (frame.timestamp - time).abs() < 0.5 { if let Some(person) = frame.persons.first() { let mouth_points: Vec = person .keypoints .iter() .filter(|kp| { kp.name.contains("mouth") || kp.name.contains("lip") || kp.name == "nose" }) .cloned() .collect(); if !mouth_points.is_empty() { segment.mouth_landmarks = Some(mouth_points); break; } } } } } if segment.text.is_some() || segment.speaker.is_some() || segment.face.is_some() || segment.mouth_landmarks.is_some() { Some(segment) } else { None } } fn get_speaker_info(&self, speaker_id: &str) -> (String, String) { self.speaker_names .get(speaker_id) .cloned() .unwrap_or_else(|| ("Unknown".to_string(), "Unknown".to_string())) } fn print_segment(&self, segment: &IntegratedSegment) { println!("\n{:=<80}", ""); println!("⏱ Time: {:.2}s - {:.2}s", segment.start, segment.end); if let Some(text) = &segment.text { println!("📝 Text: {}", text); } if let Some(speaker) = &segment.speaker { let (actor, character) = self.get_speaker_info(speaker); println!("🎤 Speaker: {} → {} ({})", speaker, actor, character); } if let Some(face) = &segment.face { println!( "👤 Face: bbox=({},{}) {}x{}, confidence={:.2}", face.x, face.y, face.width, face.height, face.confidence ); } if let Some(landmarks) = &segment.mouth_landmarks { println!("👄 Mouth landmarks: {} points", landmarks.len()); for kp in landmarks.iter().take(3) { println!( " • {}: ({:.1}, {:.1}) conf={:.2}", kp.name, kp.x, kp.y, kp.confidence ); } } println!("{:=<80}", ""); } fn list_speakers(&self) { if let Some(asrx) = &self.asrx_data { println!("\n📊 Speaker Statistics:"); println!("{:-<80}", ""); println!( "{:15} {:20} {:20} {:>10} {:>10}", "Speaker ID", "Actor", "Character", "Segments", "Duration" ); println!("{:-<80}", ""); for (speaker_id, stats) in &asrx.speaker_stats { let (actor, character) = self.get_speaker_info(speaker_id); println!( "{:15} {:20} {:20} {:>10} {:>9.1}s", speaker_id, actor, character, stats.count, stats.duration ); } println!("{:-<80}", ""); } } } fn play_segment(video_path: &PathBuf, start: f64, duration: f64, show_video: bool) -> Result<()> { println!("▶️ Playing {:.2}s - {:.2}s", start, start + duration); let mut cmd = Command::new("ffplay"); if show_video { cmd.args([ "-ss", &format!("{:.2}", start), "-t", &format!("{:.2}", duration), "-autoexit", video_path.to_str().unwrap(), ]); } else { cmd.args([ "-ss", &format!("{:.2}", start), "-t", &format!("{:.2}", duration), "-autoexit", "-nodisp", video_path.to_str().unwrap(), ]); } let _child = cmd .stdout(Stdio::null()) .stderr(Stdio::null()) .spawn() .context("Failed to start ffplay")?; Ok(()) } fn play_speaker_segments( player: &IntegratedPlayer, video_path: &PathBuf, speaker_id: &str, limit: Option, ) -> Result<()> { if let Some(asrx) = &player.asrx_data { let segments: Vec<&AsrxSegment> = asrx .segments .iter() .filter(|s| s.speaker == speaker_id) .collect(); let total = segments.len(); let count = limit.unwrap_or(total).min(total); println!("\n🎬 Playing {} segments for {}", count, speaker_id); for (i, seg) in segments.iter().take(count).enumerate() { println!("\n[{}/{}] Segment {}", i + 1, count, seg.index); if let Some(segment) = player.get_current_segment(seg.start + 0.1) { player.print_segment(&segment); } play_segment(video_path, seg.start, seg.duration, false)?; thread::sleep(Duration::from_millis(500)); } println!("\n✅ Finished playing {} segments", count); } Ok(()) } fn run_demo(player: &IntegratedPlayer, args: &Args) -> Result<()> { } } }); if let Some(asr) = &player.asr_data { let total_segments = asr.segments.len(); for (i, seg) in asr.segments.iter().enumerate() { // 檢查是否退出 if quit.load(Ordering::SeqCst) { println!("\n⏹️ Stopped by user"); break; } // 檢查是否暫停 while paused.load(Ordering::SeqCst) { println!("\r⏸️ Paused - Press SPACE to resume",); std::io::stdout().flush()?; thread::sleep(Duration::from_millis(100)); if quit.load(Ordering::SeqCst) { println!("\n⏹️ Stopped by user"); return Ok(()); } } println!("\n[{}/{}] Segment", i + 1, total_segments); println!("{:=<80}", ""); // 顯示所有信息 if let Some(segment) = player.get_current_segment(seg.start + 0.01) { player.print_segment(&segment); } // 播放音頻/視頻 let duration = seg.end - seg.start; println!( "▶️ Playing: {:.2}s - {:.2}s ({:.2}s)", seg.start, seg.end, duration ); let mut cmd = Command::new("ffplay"); if args.show_video { cmd.args([ "-ss", &format!("{:.2}", seg.start), "-t", &format!("{:.2}", duration), "-autoexit", "-x", &format!("{}", args.video_width), "-y", &format!("{}", args.video_height), args.video.to_str().unwrap(), ]); } else { cmd.args([ "-ss", &format!("{:.2}", seg.start), "-t", &format!("{:.2}", duration), "-autoexit", "-nodisp", args.video.to_str().unwrap(), ]); } let _child = cmd .stdout(Stdio::null()) .stderr(Stdio::null()) .spawn() .context("Failed to start ffplay")?; // 等待播放完成 thread::sleep(Duration::from_millis((duration * 1000.0) as u64 + 100)); } println!("\n{:=<80}", ""); println!("✅ Demo completed! Played {} segments", total_segments); println!("{:=<80}", ""); } else { println!("⚠️ No ASR data loaded"); } Ok(()) } fn run_demo(player: &IntegratedPlayer, args: &Args) -> Result<()> { println!("\n🎬 Auto Demo Mode"); println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); println!("Segments per speaker: {}", args.demo_segments_per_speaker); println!("Demo speed: {:.1}x", args.demo_speed); println!(); if let Some(asrx) = &player.asrx_data { let mut speaker_ids: Vec = asrx.speaker_stats.keys().cloned().collect(); speaker_ids.sort(); for speaker_id in &speaker_ids { let (actor, character) = player.get_speaker_info(speaker_id); println!("\n{:=<80}", ""); println!("🎭 Demo: {} → {} ({})", speaker_id, actor, character); println!("{:=<80}", ""); let segments: Vec<&AsrxSegment> = asrx .segments .iter() .filter(|s| s.speaker == *speaker_id) .collect(); let count = args.demo_segments_per_speaker.min(segments.len()); for (i, seg) in segments.iter().take(count).enumerate() { println!("\n[Segment {}/{}]", i + 1, count); if let Some(segment) = player.get_current_segment(seg.start + 0.1) { player.print_segment(&segment); } println!( "⏳ Playing audio ({:.1}s)...", seg.duration / args.demo_speed ); let _child = Command::new("ffplay") .args([ "-ss", &format!("{:.2}", seg.start), "-t", &format!("{:.2}", seg.duration / args.demo_speed), "-autoexit", "-nodisp", args.video.to_str().unwrap(), ]) .stdout(Stdio::null()) .stderr(Stdio::null()) .spawn() .context("Failed to start ffplay")?; thread::sleep(Duration::from_millis( ((seg.duration / args.demo_speed) * 1000.0) as u64 + 500, )); } println!("\n⏸️ Pausing 2 seconds before next speaker..."); thread::sleep(Duration::from_secs(2)); } println!("\n{:=<80}", ""); println!("✅ Demo completed!"); println!("{:=<80}", ""); } Ok(()) } fn main() -> Result<()> { let args = Args::parse(); if !args.video.exists() { anyhow::bail!("Video file not found: {:?}", args.video); } println!("🎬 Integrated Player for ASR/Face/ASRX/Pose"); println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); println!("Video: {:?}", args.video); let mut player = IntegratedPlayer::new(); if let Some(asr_path) = &args.asr { if asr_path.exists() { player.load_asr(asr_path)?; } } if let Some(face_path) = &args.face { if face_path.exists() { player.load_face(face_path)?; } } if let Some(asrx_path) = &args.asrx { if asrx_path.exists() { player.load_asrx(asrx_path)?; } } if let Some(pose_path) = &args.pose { if pose_path.exists() { player.load_pose(pose_path)?; } } player.list_speakers(); if args.demo { run_demo(&player, &args)?; } else if args.continuous_demo { run_continuous_demo(&player, &args)?; } else if args.auto_play_speaker { if let Some(speaker_id) = &args.speaker_name { play_speaker_segments(&player, &args.video, speaker_id, Some(5))?; } else { println!("\n⚠️ --speaker-name required for --auto-play-speaker"); } } else { println!("\n🎮 Interactive Mode"); println!(" Commands:"); println!(" • Enter time in seconds to seek"); println!(" • 's' to show current segment"); println!(" • 'l' to list speakers"); println!(" • 'p ' to play speaker segments"); println!(" • 'q' to quit"); println!(); loop { print!("> "); std::io::Write::flush(&mut std::io::stdout())?; let mut input = String::new(); std::io::stdin().read_line(&mut input)?; let input = input.trim(); if input == "q" || input == "quit" || input == "exit" { break; } else if input == "s" || input == "show" { if let Some(segment) = player.get_current_segment(player.current_time) { player.print_segment(&segment); } else { println!("No segment at time {:.2}s", player.current_time); } } else if input == "l" || input == "list" { player.list_speakers(); } else if input.starts_with("p ") { let speaker_id = input.strip_prefix("p ").unwrap(); play_speaker_segments(&player, &args.video, speaker_id, Some(3))?; } else if let Ok(time) = input.parse::() { player.current_time = time; println!("Seeked to {:.2}s", time); if let Some(segment) = player.get_current_segment(time) { player.print_segment(&segment); } else { println!("No segment at this time"); } } } } Ok(()) }