feat: trace chunks with co-appearance relationships

- New trace_ingest module: creates chunks for each face trace (time + bbox + ASR text)
- Computes pairwise time overlaps between traces -> co_appearances in metadata
- Worker auto-triggers after face trace store + Qdrant sync
- SearchFilters: chunk_type filter (sentence/cut/trace/visual)
- SearchFilters: co_appears_with_trace_id filter
This commit is contained in:
Accusys
2026-05-09 06:18:32 +08:00
parent 9f5afd1b86
commit b902763d45
5 changed files with 373 additions and 6 deletions

View File

@@ -1,9 +1,11 @@
pub mod rule1_ingest;
pub mod rule3_ingest;
pub mod splitter;
pub mod trace_ingest;
pub mod types;
pub use rule1_ingest::execute_rule1;
pub use rule3_ingest::ingest_rule3;
pub use trace_ingest::ingest_traces;
pub use splitter::{AsrSegment, ChunkSplitter};
pub use types::{Chunk, ChunkType};

View File

@@ -0,0 +1,222 @@
use crate::core::chunk::types::{Chunk, ChunkRule, ChunkType};
use crate::core::db::schema;
use crate::core::db::PostgresDb;
use anyhow::{Context, Result};
use sqlx::Row;
use tracing::{error, info};
pub async fn ingest_traces(db: &PostgresDb, file_uuid: &str) -> Result<usize> {
let pool = db.pool();
let face_table = schema::table_name("face_detections");
let pre_table = schema::table_name("pre_chunks");
let video = db
.get_video_by_uuid(file_uuid)
.await?
.context("Video not found")?;
let file_id = video.id as i32;
let fps = video.fps;
let traces = sqlx::query_as::<_, TraceAgg>(&format!(
r#"
SELECT trace_id,
MIN(frame_number) AS first_frame,
MAX(frame_number) AS last_frame,
MIN(timestamp_secs) AS first_time,
MAX(timestamp_secs) AS last_time,
COUNT(*) AS face_count,
AVG(x)::float8 AS avg_x,
AVG(y)::float8 AS avg_y,
AVG(width)::float8 AS avg_w,
AVG(height)::float8 AS avg_h
FROM {}
WHERE file_uuid = $1 AND trace_id IS NOT NULL
GROUP BY trace_id
ORDER BY trace_id
"#,
face_table
))
.bind(file_uuid)
.fetch_all(pool)
.await?;
if traces.is_empty() {
info!("No traces found for {}", file_uuid);
return Ok(0);
}
let asr_segments = sqlx::query_as::<_, AsrSegment>(&format!(
r#"
SELECT start_frame, end_frame, start_time, end_time, data
FROM {}
WHERE file_uuid = $1 AND processor_type = 'asr'
ORDER BY start_frame
"#,
pre_table
))
.bind(file_uuid)
.fetch_all(pool)
.await?;
// 計算 pairwise trace 重疊關係
let overlaps = compute_overlaps(&traces);
let mut count = 0;
for trace in &traces {
let text = collect_overlapping_text(&asr_segments, trace.first_time, trace.last_time);
let bbox = serde_json::json!({
"x": trace.avg_x,
"y": trace.avg_y,
"width": trace.avg_w,
"height": trace.avg_h,
});
// 與此 trace 同框的其他 trace
let co_appearances: Vec<serde_json::Value> = overlaps
.iter()
.filter(|o| o.trace_id == trace.trace_id)
.map(|o| {
serde_json::json!({
"trace_id": o.other_trace_id,
"overlap_frames": o.overlap_frames,
"overlap_secs": (o.overlap_frames as f64 / fps * 100.0).round() / 100.0,
})
})
.collect();
let metadata = serde_json::json!({
"trace_id": trace.trace_id,
"face_count": trace.face_count,
"bbox": bbox,
"co_appearances": co_appearances,
});
let chunk = Chunk::new(
file_id,
file_uuid.to_string(),
(count + 1) as u32,
ChunkType::Trace,
ChunkRule::Rule1,
trace.first_frame as i64,
trace.last_frame as i64,
fps,
metadata.clone(),
)
.with_text_content(text)
.with_metadata(metadata)
.with_frame_count(trace.face_count as i32);
if let Err(e) = db.store_chunk(&chunk).await {
error!("Failed to store trace chunk {}: {}", trace.trace_id, e);
} else {
let preview = chunk.text_content.as_deref().unwrap_or("").chars().take(60).collect::<String>();
let co = chunk.metadata.as_ref()
.and_then(|m| m.get("co_appearances"))
.and_then(|c| c.as_array())
.map(|a| a.len())
.unwrap_or(0);
info!(
"Trace chunk {}: trace_id={} frames={}-{} faces={} co_appear={} text={}",
chunk.chunk_id, trace.trace_id,
trace.first_frame, trace.last_frame,
trace.face_count, co, preview,
);
count += 1;
}
}
info!("Ingested {} trace chunks for {}", count, file_uuid);
Ok(count)
}
/// One direction of an overlapping pair of traces, as produced by
/// `compute_overlaps`.
struct TraceOverlap {
/// Trace this record is reported for.
trace_id: i32,
/// The other trace whose frame range overlaps that of `trace_id`.
other_trace_id: i32,
/// Length of the shared frame range, in frames.
overlap_frames: i64,
}
/// Computes, for every pair of traces, how many frames their frame ranges share.
///
/// A trace covers the inclusive range `first_frame..=last_frame` (both bounds
/// are frames in which the trace was actually detected), so the overlap is
/// counted inclusively: two traces that meet on exactly one frame overlap by
/// one frame. Pairs whose ranges do not intersect are omitted.
///
/// Every overlapping pair is emitted twice, once per direction, so a caller can
/// select all partners of a trace by filtering on `trace_id` alone.
fn compute_overlaps(traces: &[TraceAgg]) -> Vec<TraceOverlap> {
    let mut result = Vec::new();
    for (i, a) in traces.iter().enumerate() {
        // Only look at later traces so each unordered pair is visited once.
        for b in traces.iter().skip(i + 1) {
            let overlap_start = a.first_frame.max(b.first_frame);
            let overlap_end = a.last_frame.min(b.last_frame);
            if overlap_end < overlap_start {
                continue;
            }
            // Both bounds are inclusive, hence the `+ 1`.
            let frames = overlap_end - overlap_start + 1;
            result.push(TraceOverlap {
                trace_id: a.trace_id,
                other_trace_id: b.trace_id,
                overlap_frames: frames,
            });
            result.push(TraceOverlap {
                trace_id: b.trace_id,
                other_trace_id: a.trace_id,
                overlap_frames: frames,
            });
        }
    }
    result
}
/// Joins the text of every ASR segment whose time span touches
/// `[start_time, end_time]`.
///
/// Both window bounds are inclusive. Segments that carry no text are skipped,
/// and the remaining texts are joined with a single space in input order. An
/// empty string is returned when nothing matches.
fn collect_overlapping_text(segments: &[AsrSegment], start_time: f64, end_time: f64) -> String {
    segments
        .iter()
        .filter(|seg| seg.start_time <= end_time && seg.end_time >= start_time)
        .filter_map(|seg| seg.text())
        .collect::<Vec<&str>>()
        .join(" ")
}
/// Per-trace aggregate row read from the face-detections table by
/// `ingest_traces` (one row per distinct `trace_id`).
#[derive(Debug, sqlx::FromRow)]
struct TraceAgg {
/// Identifier of the trace the row was grouped by.
trace_id: i32,
/// Smallest `frame_number` among the trace's detections.
first_frame: i64,
/// Largest `frame_number` among the trace's detections.
last_frame: i64,
/// Smallest `timestamp_secs` among the trace's detections.
first_time: f64,
/// Largest `timestamp_secs` among the trace's detections.
last_time: f64,
/// Number of detection rows belonging to the trace.
face_count: i64,
/// Mean of the detections' `x` column.
avg_x: f64,
/// Mean of the detections' `y` column.
avg_y: f64,
/// Mean of the detections' `width` column.
avg_w: f64,
/// Mean of the detections' `height` column.
avg_h: f64,
}
/// One ASR row read from the pre-chunks table (`processor_type = 'asr'`).
struct AsrSegment {
/// Value of the row's `start_frame` column.
start_frame: i64,
/// Value of the row's `end_frame` column.
end_frame: i64,
/// Value of the row's `start_time` column; compared against trace times.
start_time: f64,
/// Value of the row's `end_time` column; compared against trace times.
end_time: f64,
/// JSON payload of the row; the segment text is looked up in it by `text()`.
data: serde_json::Value,
}
impl<'r> sqlx::FromRow<'r, sqlx::postgres::PgRow> for AsrSegment {
fn from_row(row: &'r sqlx::postgres::PgRow) -> Result<Self, sqlx::Error> {
Ok(Self {
start_frame: row.try_get("start_frame")?,
end_frame: row.try_get("end_frame")?,
start_time: row.try_get("start_time")?,
end_time: row.try_get("end_time")?,
data: row.try_get("data")?,
})
}
}
impl AsrSegment {
    /// Returns the segment's text, if the JSON payload carries one.
    ///
    /// A string at the top-level `"text"` key wins; otherwise the nested
    /// `"data"."text"` string is used. `None` when neither is a string.
    fn text(&self) -> Option<&str> {
        if let Some(top_level) = self.data.get("text").and_then(|v| v.as_str()) {
            return Some(top_level);
        }
        self.data.get("data")?.get("text")?.as_str()
    }
}