Files
momentry_core/src/bin/integrated_player.rs.bak

712 lines
21 KiB
Rust

use anyhow::{Context, Result};
use clap::Parser;
use crossterm::event::{self, Event, KeyCode, KeyModifiers};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::io::{self, Write};
use std::path::PathBuf;
use std::process::{Child, Command, Stdio};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
// Command-line options for the integrated player.
//
// Plain `//` comments are used on purpose: with clap's derive API, `///` doc
// comments are turned into `--help` text, which would change program output.
#[derive(Parser, Debug)]
#[command(
    name = "integrated_player",
    about = "Integrated player for ASR, Face, ASRX, and Pose"
)]
struct Args {
    // Video file handed to ffplay; `main` refuses to run when it does not exist.
    #[arg(short, long)]
    video: PathBuf,

    // Optional ASR JSON file (`-r` / `--asr`).
    #[arg(short = 'r', long)]
    asr: Option<PathBuf>,

    // Optional face-detection JSON file (`-f` / `--face`).
    #[arg(short = 'f', long)]
    face: Option<PathBuf>,

    // Optional ASRX (speaker-labelled segments) JSON file (`-x` / `--asrx`).
    #[arg(short = 'x', long)]
    asrx: Option<PathBuf>,

    // Optional pose JSON file (`-p` / `--pose`).
    #[arg(short = 'p', long)]
    pose: Option<PathBuf>,

    // Start time, defaulting to 0.0; presumably seconds — TODO confirm.
    #[arg(short = 's', long, default_value = "0.0")]
    start: f64,

    // Speaker to play in `--auto-play-speaker` mode. Despite the name, `main`
    // passes this value on as a speaker id.
    #[arg(long)]
    speaker_name: Option<String>,

    // Play segments of the speaker selected with `--speaker-name`.
    #[arg(long)]
    auto_play_speaker: bool,

    // Run the per-speaker demo.
    #[arg(long)]
    demo: bool,

    // Upper bound on the number of segments the demo plays for each speaker.
    #[arg(long, default_value = "3")]
    demo_segments_per_speaker: usize,

    // Divisor applied to each segment's duration in the demo.
    #[arg(long, default_value = "2.0")]
    demo_speed: f64,

    // When set, ffplay is started without `-nodisp`.
    #[arg(long)]
    show_video: bool,

    // Passed to ffplay as `-x` when the video is shown.
    #[arg(long, default_value = "800")]
    video_width: u32,

    // Passed to ffplay as `-y` when the video is shown.
    #[arg(long, default_value = "600")]
    video_height: u32,

    // Run the continuous demo.
    #[arg(long)]
    continuous_demo: bool,
}
/// One transcribed span from the ASR file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AsrSegment {
    /// Start of the span; the player prints it with an `s` (seconds) suffix.
    start: f64,
    /// End of the span; time lookups treat it as inclusive.
    end: f64,
    /// Transcribed text shown for the span.
    text: String,
}
/// Top-level structure of the ASR JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AsrData {
    /// Language field of the file, if it has one.
    language: Option<String>,
    /// Transcribed spans; lookups scan them in file order and stop at the first hit.
    segments: Vec<AsrSegment>,
}
/// A single face detection from the face JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FaceDetection {
    /// Frame number stored with the detection.
    frame: u64,
    /// Time of the detection; compared against the playback time during lookups.
    timestamp: f64,
    /// Bounding-box origin, x component (presumably pixels — TODO confirm).
    x: i32,
    /// Bounding-box origin, y component (presumably pixels — TODO confirm).
    y: i32,
    /// Bounding-box width.
    width: i32,
    /// Bounding-box height.
    height: i32,
    /// Detector confidence, printed with two decimals.
    confidence: f64,
}
/// Top-level structure of the face JSON file: a wrapper around `results`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FaceResult {
    /// Payload holding the detection list.
    results: FaceResults,
}
/// The `results` object of the face JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct FaceResults {
    /// All face detections contained in the file.
    detections: Vec<FaceDetection>,
}
/// One speaker-labelled span from the ASRX file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AsrxSegment {
    /// Segment number, echoed in the playback progress output.
    index: usize,
    /// Start of the span; also used as the ffplay seek position (`-ss`).
    start: f64,
    /// End of the span; time lookups treat it as inclusive.
    end: f64,
    /// Length of the span as stored in the file; used as the ffplay play time (`-t`).
    duration: f64,
    /// Speaker id, e.g. `SPEAKER_0`; matched against the speaker tables.
    speaker: String,
}
/// Top-level structure of the ASRX JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct AsrxData {
    /// Speaker-labelled spans.
    segments: Vec<AsrxSegment>,
    /// Per-speaker totals, keyed by speaker id.
    speaker_stats: HashMap<String, SpeakerStats>,
}
/// Totals for one speaker, as stored in the ASRX file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct SpeakerStats {
    /// Shown in the "Segments" column of the speaker table.
    count: usize,
    /// Shown in the "Duration" column of the speaker table with an `s` suffix.
    duration: f64,
}
/// A named keypoint of a detected person.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Keypoint {
    /// Keypoint label; names containing "mouth" or "lip", or equal to "nose",
    /// are selected as mouth landmarks by the player.
    name: String,
    /// Horizontal coordinate (units not established here — TODO confirm).
    x: f32,
    /// Vertical coordinate (units not established here — TODO confirm).
    y: f32,
    /// Confidence of the keypoint, printed with two decimals.
    confidence: f32,
}
/// Pose information for one person in a frame.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PersonPose {
    /// Keypoints reported for the person.
    keypoints: Vec<Keypoint>,
    /// Bounding box reported for the person.
    bbox: Bbox,
}
/// Integer bounding box given as an origin plus a size.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct Bbox {
    /// Origin, x component.
    x: i32,
    /// Origin, y component.
    y: i32,
    /// Box width.
    width: i32,
    /// Box height.
    height: i32,
}
/// Pose results for a single frame.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PoseFrame {
    /// Frame number stored with the results.
    frame: u64,
    /// Time of the frame; compared against the playback time during lookups.
    timestamp: f64,
    /// Persons found in the frame; the player only looks at the first one.
    persons: Vec<PersonPose>,
}
/// Top-level structure of the pose JSON file.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct PoseData {
    /// Per-frame pose results.
    frames: Vec<PoseFrame>,
}
/// Everything the player found for one point in time, merged from the loaded
/// data sets. Each optional field is `None` when its source had no match.
#[derive(Debug, Clone)]
struct IntegratedSegment {
    /// Start of the matched ASR/ASRX span (0.0 when neither matched).
    start: f64,
    /// End of the matched ASR/ASRX span (0.0 when neither matched).
    end: f64,
    /// Transcribed text from the ASR data.
    text: Option<String>,
    /// Speaker id from the ASRX data.
    speaker: Option<String>,
    /// Face detection matched to the time.
    face: Option<FaceDetection>,
    /// Mouth-related keypoints taken from the pose data.
    mouth_landmarks: Option<Vec<Keypoint>>,
}
/// Holds the loaded analysis data plus the interactive playback state.
struct IntegratedPlayer {
    /// ASR transcript, once loaded.
    asr_data: Option<AsrData>,
    /// Face detections, once loaded.
    face_data: Option<FaceResult>,
    /// Speaker-labelled segments and statistics, once loaded.
    asrx_data: Option<AsrxData>,
    /// Pose frames, once loaded.
    pose_data: Option<PoseData>,
    /// Current position used by the interactive `s` command; updated on seek.
    current_time: f64,
    /// Playback flag; initialised to `false` by `new`.
    is_playing: bool,
    /// Speaker id -> `(actor, character)` display names.
    speaker_names: HashMap<String, (String, String)>,
}
impl IntegratedPlayer {
fn new() -> Self {
let mut speaker_names = HashMap::new();
speaker_names.insert(
"SPEAKER_0".to_string(),
("Cary Grant".to_string(), "Peter Joshua".to_string()),
);
speaker_names.insert(
"SPEAKER_1".to_string(),
("Audrey Hepburn".to_string(), "Regina Lampert".to_string()),
);
speaker_names.insert(
"SPEAKER_2".to_string(),
(
"Walter Matthau".to_string(),
"Hamilton Bartholomew".to_string(),
),
);
speaker_names.insert(
"SPEAKER_4".to_string(),
("James Coburn".to_string(), "Tex Panthollow".to_string()),
);
Self {
asr_data: None,
face_data: None,
asrx_data: None,
pose_data: None,
current_time: 0.0,
is_playing: false,
speaker_names,
}
}
fn load_asr(&mut self, path: &PathBuf) -> Result<()> {
let content = std::fs::read_to_string(path)
.with_context(|| format!("Failed to read ASR file: {:?}", path))?;
self.asr_data = Some(serde_json::from_str(&content)?);
println!(
"✓ Loaded {} ASR segments",
self.asr_data.as_ref().unwrap().segments.len()
);
Ok(())
}
fn load_face(&mut self, path: &PathBuf) -> Result<()> {
let content = std::fs::read_to_string(path)
.with_context(|| format!("Failed to read Face file: {:?}", path))?;
self.face_data = Some(serde_json::from_str(&content)?);
println!(
"✓ Loaded {} face detections",
self.face_data.as_ref().unwrap().results.detections.len()
);
Ok(())
}
fn load_asrx(&mut self, path: &PathBuf) -> Result<()> {
let content = std::fs::read_to_string(path)
.with_context(|| format!("Failed to read ASRX file: {:?}", path))?;
self.asrx_data = Some(serde_json::from_str(&content)?);
println!(
"✓ Loaded {} ASRX segments, {} speakers",
self.asrx_data.as_ref().unwrap().segments.len(),
self.asrx_data.as_ref().unwrap().speaker_stats.len()
);
Ok(())
}
fn load_pose(&mut self, path: &PathBuf) -> Result<()> {
let content = std::fs::read_to_string(path)
.with_context(|| format!("Failed to read Pose file: {:?}", path))?;
self.pose_data = Some(serde_json::from_str(&content)?);
println!(
"✓ Loaded {} pose frames",
self.pose_data.as_ref().unwrap().frames.len()
);
Ok(())
}
fn get_current_segment(&self, time: f64) -> Option<IntegratedSegment> {
let mut segment = IntegratedSegment {
start: 0.0,
end: 0.0,
text: None,
speaker: None,
face: None,
mouth_landmarks: None,
};
if let Some(asr) = &self.asr_data {
for seg in &asr.segments {
if time >= seg.start && time <= seg.end {
segment.start = seg.start;
segment.end = seg.end;
segment.text = Some(seg.text.clone());
break;
}
}
}
if let Some(asrx) = &self.asrx_data {
for seg in &asrx.segments {
if time >= seg.start && time <= seg.end {
segment.start = seg.start;
segment.end = seg.end;
segment.speaker = Some(seg.speaker.clone());
break;
}
}
}
if let Some(face) = &self.face_data {
for det in &face.results.detections {
if (det.timestamp - time).abs() < 1.0 {
segment.face = Some(det.clone());
break;
}
}
}
if let Some(pose) = &self.pose_data {
for frame in &pose.frames {
if (frame.timestamp - time).abs() < 0.5 {
if let Some(person) = frame.persons.first() {
let mouth_points: Vec<Keypoint> = person
.keypoints
.iter()
.filter(|kp| {
kp.name.contains("mouth")
|| kp.name.contains("lip")
|| kp.name == "nose"
})
.cloned()
.collect();
if !mouth_points.is_empty() {
segment.mouth_landmarks = Some(mouth_points);
break;
}
}
}
}
}
if segment.text.is_some()
|| segment.speaker.is_some()
|| segment.face.is_some()
|| segment.mouth_landmarks.is_some()
{
Some(segment)
} else {
None
}
}
fn get_speaker_info(&self, speaker_id: &str) -> (String, String) {
self.speaker_names
.get(speaker_id)
.cloned()
.unwrap_or_else(|| ("Unknown".to_string(), "Unknown".to_string()))
}
fn print_segment(&self, segment: &IntegratedSegment) {
println!("\n{:=<80}", "");
println!("⏱ Time: {:.2}s - {:.2}s", segment.start, segment.end);
if let Some(text) = &segment.text {
println!("📝 Text: {}", text);
}
if let Some(speaker) = &segment.speaker {
let (actor, character) = self.get_speaker_info(speaker);
println!("🎤 Speaker: {}{} ({})", speaker, actor, character);
}
if let Some(face) = &segment.face {
println!(
"👤 Face: bbox=({},{}) {}x{}, confidence={:.2}",
face.x, face.y, face.width, face.height, face.confidence
);
}
if let Some(landmarks) = &segment.mouth_landmarks {
println!("👄 Mouth landmarks: {} points", landmarks.len());
for kp in landmarks.iter().take(3) {
println!(
"{}: ({:.1}, {:.1}) conf={:.2}",
kp.name, kp.x, kp.y, kp.confidence
);
}
}
println!("{:=<80}", "");
}
fn list_speakers(&self) {
if let Some(asrx) = &self.asrx_data {
println!("\n📊 Speaker Statistics:");
println!("{:-<80}", "");
println!(
"{:15} {:20} {:20} {:>10} {:>10}",
"Speaker ID", "Actor", "Character", "Segments", "Duration"
);
println!("{:-<80}", "");
for (speaker_id, stats) in &asrx.speaker_stats {
let (actor, character) = self.get_speaker_info(speaker_id);
println!(
"{:15} {:20} {:20} {:>10} {:>9.1}s",
speaker_id, actor, character, stats.count, stats.duration
);
}
println!("{:-<80}", "");
}
}
}
fn play_segment(video_path: &PathBuf, start: f64, duration: f64, show_video: bool) -> Result<()> {
println!("▶️ Playing {:.2}s - {:.2}s", start, start + duration);
let mut cmd = Command::new("ffplay");
if show_video {
cmd.args([
"-ss",
&format!("{:.2}", start),
"-t",
&format!("{:.2}", duration),
"-autoexit",
video_path.to_str().unwrap(),
]);
} else {
cmd.args([
"-ss",
&format!("{:.2}", start),
"-t",
&format!("{:.2}", duration),
"-autoexit",
"-nodisp",
video_path.to_str().unwrap(),
]);
}
let _child = cmd
.stdout(Stdio::null())
.stderr(Stdio::null())
.spawn()
.context("Failed to start ffplay")?;
Ok(())
}
/// Prints and plays the ASRX segments spoken by `speaker_id`, in file order.
///
/// `limit` caps the number of segments played; `None` plays all of them.
/// A 500 ms pause follows each `play_segment` call. When no ASRX data is
/// loaded, or the speaker has no segments, a notice is printed and nothing is
/// played.
///
/// # Errors
/// Propagates the first error returned by `play_segment`.
fn play_speaker_segments(
    player: &IntegratedPlayer,
    video_path: &PathBuf,
    speaker_id: &str,
    limit: Option<usize>,
) -> Result<()> {
    let asrx = match &player.asrx_data {
        Some(asrx) => asrx,
        None => {
            println!("⚠️ No ASRX data loaded");
            return Ok(());
        }
    };

    let segments: Vec<&AsrxSegment> = asrx
        .segments
        .iter()
        .filter(|s| s.speaker == speaker_id)
        .collect();
    if segments.is_empty() {
        println!("⚠️ No segments found for speaker {}", speaker_id);
        return Ok(());
    }

    let total = segments.len();
    let count = limit.map_or(total, |n| n.min(total));
    println!("\n🎬 Playing {} segments for {}", count, speaker_id);

    for (i, seg) in segments.iter().take(count).enumerate() {
        println!("\n[{}/{}] Segment {}", i + 1, count, seg.index);
        // Probe slightly inside the segment so the boundary shared with the
        // previous segment does not win the lookup.
        if let Some(segment) = player.get_current_segment(seg.start + 0.1) {
            player.print_segment(&segment);
        }
        play_segment(video_path, seg.start, seg.duration, false)?;
        thread::sleep(Duration::from_millis(500));
    }

    println!("\n✅ Finished playing {} segments", count);
    Ok(())
}
fn run_demo(player: &IntegratedPlayer, args: &Args) -> Result<()> {
}
}
});
if let Some(asr) = &player.asr_data {
let total_segments = asr.segments.len();
for (i, seg) in asr.segments.iter().enumerate() {
// 檢查是否退出
if quit.load(Ordering::SeqCst) {
println!("\n⏹️ Stopped by user");
break;
}
// 檢查是否暫停
while paused.load(Ordering::SeqCst) {
println!("\r⏸️ Paused - Press SPACE to resume",);
std::io::stdout().flush()?;
thread::sleep(Duration::from_millis(100));
if quit.load(Ordering::SeqCst) {
println!("\n⏹️ Stopped by user");
return Ok(());
}
}
println!("\n[{}/{}] Segment", i + 1, total_segments);
println!("{:=<80}", "");
// 顯示所有信息
if let Some(segment) = player.get_current_segment(seg.start + 0.01) {
player.print_segment(&segment);
}
// 播放音頻/視頻
let duration = seg.end - seg.start;
println!(
"▶️ Playing: {:.2}s - {:.2}s ({:.2}s)",
seg.start, seg.end, duration
);
let mut cmd = Command::new("ffplay");
if args.show_video {
cmd.args([
"-ss",
&format!("{:.2}", seg.start),
"-t",
&format!("{:.2}", duration),
"-autoexit",
"-x",
&format!("{}", args.video_width),
"-y",
&format!("{}", args.video_height),
args.video.to_str().unwrap(),
]);
} else {
cmd.args([
"-ss",
&format!("{:.2}", seg.start),
"-t",
&format!("{:.2}", duration),
"-autoexit",
"-nodisp",
args.video.to_str().unwrap(),
]);
}
let _child = cmd
.stdout(Stdio::null())
.stderr(Stdio::null())
.spawn()
.context("Failed to start ffplay")?;
// 等待播放完成
thread::sleep(Duration::from_millis((duration * 1000.0) as u64 + 100));
}
println!("\n{:=<80}", "");
println!("✅ Demo completed! Played {} segments", total_segments);
println!("{:=<80}", "");
} else {
println!("⚠️ No ASR data loaded");
}
Ok(())
}
fn run_demo(player: &IntegratedPlayer, args: &Args) -> Result<()> {
println!("\n🎬 Auto Demo Mode");
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
println!("Segments per speaker: {}", args.demo_segments_per_speaker);
println!("Demo speed: {:.1}x", args.demo_speed);
println!();
if let Some(asrx) = &player.asrx_data {
let mut speaker_ids: Vec<String> = asrx.speaker_stats.keys().cloned().collect();
speaker_ids.sort();
for speaker_id in &speaker_ids {
let (actor, character) = player.get_speaker_info(speaker_id);
println!("\n{:=<80}", "");
println!("🎭 Demo: {}{} ({})", speaker_id, actor, character);
println!("{:=<80}", "");
let segments: Vec<&AsrxSegment> = asrx
.segments
.iter()
.filter(|s| s.speaker == *speaker_id)
.collect();
let count = args.demo_segments_per_speaker.min(segments.len());
for (i, seg) in segments.iter().take(count).enumerate() {
println!("\n[Segment {}/{}]", i + 1, count);
if let Some(segment) = player.get_current_segment(seg.start + 0.1) {
player.print_segment(&segment);
}
println!(
"⏳ Playing audio ({:.1}s)...",
seg.duration / args.demo_speed
);
let _child = Command::new("ffplay")
.args([
"-ss",
&format!("{:.2}", seg.start),
"-t",
&format!("{:.2}", seg.duration / args.demo_speed),
"-autoexit",
"-nodisp",
args.video.to_str().unwrap(),
])
.stdout(Stdio::null())
.stderr(Stdio::null())
.spawn()
.context("Failed to start ffplay")?;
thread::sleep(Duration::from_millis(
((seg.duration / args.demo_speed) * 1000.0) as u64 + 500,
));
}
println!("\n⏸️ Pausing 2 seconds before next speaker...");
thread::sleep(Duration::from_secs(2));
}
println!("\n{:=<80}", "");
println!("✅ Demo completed!");
println!("{:=<80}", "");
}
Ok(())
}
fn main() -> Result<()> {
let args = Args::parse();
if !args.video.exists() {
anyhow::bail!("Video file not found: {:?}", args.video);
}
println!("🎬 Integrated Player for ASR/Face/ASRX/Pose");
println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━");
println!("Video: {:?}", args.video);
let mut player = IntegratedPlayer::new();
if let Some(asr_path) = &args.asr {
if asr_path.exists() {
player.load_asr(asr_path)?;
}
}
if let Some(face_path) = &args.face {
if face_path.exists() {
player.load_face(face_path)?;
}
}
if let Some(asrx_path) = &args.asrx {
if asrx_path.exists() {
player.load_asrx(asrx_path)?;
}
}
if let Some(pose_path) = &args.pose {
if pose_path.exists() {
player.load_pose(pose_path)?;
}
}
player.list_speakers();
if args.demo {
run_demo(&player, &args)?;
} else if args.continuous_demo {
run_continuous_demo(&player, &args)?;
} else if args.auto_play_speaker {
if let Some(speaker_id) = &args.speaker_name {
play_speaker_segments(&player, &args.video, speaker_id, Some(5))?;
} else {
println!("\n⚠️ --speaker-name required for --auto-play-speaker");
}
} else {
println!("\n🎮 Interactive Mode");
println!(" Commands:");
println!(" • Enter time in seconds to seek");
println!(" • 's' to show current segment");
println!(" • 'l' to list speakers");
println!(" • 'p <speaker>' to play speaker segments");
println!(" • 'q' to quit");
println!();
loop {
print!("> ");
std::io::Write::flush(&mut std::io::stdout())?;
let mut input = String::new();
std::io::stdin().read_line(&mut input)?;
let input = input.trim();
if input == "q" || input == "quit" || input == "exit" {
break;
} else if input == "s" || input == "show" {
if let Some(segment) = player.get_current_segment(player.current_time) {
player.print_segment(&segment);
} else {
println!("No segment at time {:.2}s", player.current_time);
}
} else if input == "l" || input == "list" {
player.list_speakers();
} else if input.starts_with("p ") {
let speaker_id = input.strip_prefix("p ").unwrap();
play_speaker_segments(&player, &args.video, speaker_id, Some(3))?;
} else if let Ok(time) = input.parse::<f64>() {
player.current_time = time;
println!("Seeked to {:.2}s", time);
if let Some(segment) = player.get_current_segment(time) {
player.print_segment(&segment);
} else {
println!("No segment at this time");
}
}
}
}
Ok(())
}