feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index
Correction: asr-1.json format, generate/apply scripts
API: 37/37 endpoints fixed and tested
Docs: HANDOVER_V2.0.md for M4
This commit is contained in:
Accusys
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions

View File

@@ -6,6 +6,6 @@ pub mod types;
pub use rule1_ingest::execute_rule1;
pub use rule3_ingest::ingest_rule3;
pub use trace_ingest::ingest_traces;
pub use splitter::{AsrSegment, ChunkSplitter};
pub use trace_ingest::ingest_traces;
pub use types::{Chunk, ChunkType};

View File

@@ -50,7 +50,7 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
let chunk = Chunk::from_seconds(
file_id as i32,
file_uuid.to_string(),
idx as u32,
format!("{}", idx),
ChunkType::Sentence,
ChunkRule::Rule1,
seg.start_time,

View File

@@ -73,7 +73,7 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
// Query the dev.chunk table for Rule 1 sentence chunks
let rule1_rows: Vec<(String,)> = sqlx::query_as(
r#"
SELECT chunk_id FROM chunks
SELECT chunk_id FROM dev.chunk
WHERE file_uuid = $1 AND chunk_type = 'sentence'
AND start_frame >= $2
AND end_frame <= $3
@@ -98,7 +98,7 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
let texts: Vec<String> = sqlx::query_scalar(
r#"
SELECT text_content FROM chunks
SELECT text_content FROM dev.chunk
WHERE file_uuid = $1 AND chunk_type = 'sentence'
AND start_frame >= $2
AND end_frame <= $3
@@ -135,10 +135,11 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
);
// 4. Insert into dev.chunk
let fps_query: Option<f64> = sqlx::query_scalar("SELECT fps FROM videos WHERE file_uuid = $1")
.bind(file_uuid)
.fetch_optional(&mut *tx)
.await?;
let fps_query: Option<f64> =
sqlx::query_scalar("SELECT fps FROM videos WHERE file_uuid = $1")
.bind(file_uuid)
.fetch_optional(&mut *tx)
.await?;
let fps = fps_query.unwrap_or(29.97);
// Prepare metadata JSON
@@ -149,12 +150,12 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
sqlx::query(
r#"
INSERT INTO chunks (
file_uuid, chunk_id, old_chunk_id, chunk_index, chunk_type,
INSERT INTO dev.chunk (
file_uuid, chunk_id, chunk_type,
start_time, end_time, fps, start_frame, end_frame,
content, text_content, summary_text, metadata, child_chunk_ids
) VALUES ($1, $2, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)
ON CONFLICT (file_uuid, old_chunk_id) DO NOTHING
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
ON CONFLICT (file_uuid, chunk_id) DO NOTHING
"#,
)
.bind(file_uuid)

View File

@@ -23,7 +23,7 @@ impl ChunkSplitter {
chunks.push(Chunk::from_seconds(
0, // file_id
uuid.to_string(),
index,
format!("{}", index),
ChunkType::TimeBased,
ChunkRule::Rule1,
current_time,
@@ -48,7 +48,7 @@ impl ChunkSplitter {
chunks.push(Chunk::from_seconds(
0, // file_id
uuid.to_string(),
index as u32,
format!("{}", index),
ChunkType::Sentence,
ChunkRule::Rule1,
segment.start,

View File

@@ -95,7 +95,7 @@ pub async fn ingest_traces(db: &PostgresDb, file_uuid: &str) -> Result<usize> {
let chunk = Chunk::new(
file_id,
file_uuid.to_string(),
(count + 1) as u32,
format!("trace_{}", count + 1),
ChunkType::Trace,
ChunkRule::Rule1,
trace.first_frame as i64,
@@ -110,17 +110,29 @@ pub async fn ingest_traces(db: &PostgresDb, file_uuid: &str) -> Result<usize> {
if let Err(e) = db.store_chunk(&chunk).await {
error!("Failed to store trace chunk {}: {}", trace.trace_id, e);
} else {
let preview = chunk.text_content.as_deref().unwrap_or("").chars().take(60).collect::<String>();
let co = chunk.metadata.as_ref()
let preview = chunk
.text_content
.as_deref()
.unwrap_or("")
.chars()
.take(60)
.collect::<String>();
let co = chunk
.metadata
.as_ref()
.and_then(|m| m.get("co_appearances"))
.and_then(|c| c.as_array())
.map(|a| a.len())
.unwrap_or(0);
info!(
"Trace chunk {}: trace_id={} frames={}-{} faces={} co_appear={} text={}",
chunk.chunk_id, trace.trace_id,
trace.first_frame, trace.last_frame,
trace.face_count, co, preview,
chunk.chunk_id,
trace.trace_id,
trace.first_frame,
trace.last_frame,
trace.face_count,
co,
preview,
);
count += 1;
}
@@ -209,14 +221,11 @@ impl<'r> sqlx::FromRow<'r, sqlx::postgres::PgRow> for AsrSegment {
impl AsrSegment {
fn text(&self) -> Option<&str> {
self.data
.get("text")
.and_then(|v| v.as_str())
.or_else(|| {
self.data
.get("data")
.and_then(|d| d.get("text"))
.and_then(|v| v.as_str())
})
self.data.get("text").and_then(|v| v.as_str()).or_else(|| {
self.data
.get("data")
.and_then(|d| d.get("text"))
.and_then(|v| v.as_str())
})
}
}

View File

@@ -115,7 +115,6 @@ pub struct Chunk {
pub file_id: i32,
pub uuid: String,
pub chunk_id: String,
pub chunk_index: u32,
pub chunk_type: ChunkType,
pub rule: ChunkRule,
/// Frames per second (can be fractional, e.g., 29.97, 23.976)
@@ -140,7 +139,7 @@ impl Chunk {
pub fn new(
file_id: i32,
uuid: String,
chunk_index: u32,
chunk_id: String,
chunk_type: ChunkType,
rule: ChunkRule,
start_frame: i64,
@@ -149,13 +148,11 @@ impl Chunk {
content: serde_json::Value,
) -> Self {
let frame_count = (end_frame - start_frame) as i32;
let chunk_id = format!("{}_{}", uuid, chunk_index);
Self {
file_id,
uuid,
chunk_id,
chunk_index,
chunk_type,
rule,
fps,
@@ -177,7 +174,7 @@ impl Chunk {
pub fn new_visual(
file_id: i32,
uuid: String,
chunk_index: u32,
chunk_id: String,
start_frame: i64,
end_frame: i64,
fps: f64,
@@ -189,7 +186,7 @@ impl Chunk {
Self::new(
file_id,
uuid,
chunk_index,
chunk_id,
ChunkType::Visual,
ChunkRule::Rule2,
start_frame,
@@ -203,7 +200,7 @@ impl Chunk {
pub fn from_yolo_frames(
file_id: i32,
uuid: String,
chunk_index: u32,
chunk_id: String,
start_frame: i64,
end_frame: i64,
fps: f64,
@@ -307,7 +304,7 @@ impl Chunk {
Self::new_visual(
file_id,
uuid,
chunk_index,
chunk_id,
start_frame,
end_frame,
fps,
@@ -334,7 +331,7 @@ impl Chunk {
pub fn from_seconds(
file_id: i32,
uuid: String,
chunk_index: u32,
chunk_id: String,
chunk_type: ChunkType,
rule: ChunkRule,
start_time: f64,
@@ -347,7 +344,7 @@ impl Chunk {
Self::new(
file_id,
uuid,
chunk_index,
chunk_id,
chunk_type,
rule,
start_frame,

View File

@@ -103,7 +103,6 @@ pub struct Chunk {
pub file_id: i32,
pub uuid: String,
pub chunk_id: String,
pub chunk_index: u32,
pub chunk_type: ChunkType,
pub rule: ChunkRule,
/// Frames per second (can be fractional, e.g., 29.97, 23.976)
@@ -128,7 +127,7 @@ impl Chunk {
pub fn new_visual(
file_id: i32,
uuid: String,
chunk_index: u32,
chunk_id: String,
start_frame: i64,
end_frame: i64,
fps: f64,
@@ -140,7 +139,7 @@ impl Chunk {
Self::new(
file_id,
uuid,
chunk_index,
chunk_id,
ChunkType::Visual,
ChunkRule::Rule2,
start_frame,
@@ -154,7 +153,7 @@ impl Chunk {
pub fn from_yolo_result(
file_id: i32,
uuid: String,
chunk_index: u32,
chunk_id: String,
start_frame: i64,
end_frame: i64,
fps: f64,
@@ -263,7 +262,7 @@ impl Chunk {
Self::new_visual(
file_id,
uuid,
chunk_index,
chunk_id,
start_frame,
end_frame,
fps,
@@ -275,7 +274,7 @@ impl Chunk {
pub fn new(
file_id: i32,
uuid: String,
chunk_index: u32,
chunk_id: String,
chunk_type: ChunkType,
rule: ChunkRule,
start_frame: i64,
@@ -284,13 +283,11 @@ impl Chunk {
content: serde_json::Value,
) -> Self {
let frame_count = (end_frame - start_frame) as i32;
let chunk_id = format!("{}_{}", uuid, chunk_index);
Self {
file_id,
uuid,
chunk_id,
chunk_index,
chunk_type,
rule,
fps,

View File

@@ -13,7 +13,6 @@ pub struct MongoDb {
pub struct ChunkDocument {
pub uuid: String,
pub chunk_id: String,
pub chunk_index: u32,
pub chunk_type: String,
pub start_time: f64,
pub end_time: f64,
@@ -34,7 +33,6 @@ impl From<Chunk> for ChunkDocument {
Self {
uuid: chunk.uuid,
chunk_id: chunk.chunk_id,
chunk_index: chunk.chunk_index,
chunk_type: chunk.chunk_type.as_str().to_string(),
start_time,
end_time,
@@ -119,7 +117,7 @@ impl MongoDb {
file_id: 0,
uuid: doc.uuid,
chunk_id: doc.chunk_id,
chunk_index: doc.chunk_index,
chunk_type,
rule: ChunkRule::Rule1,
fps: doc.fps,
@@ -178,7 +176,7 @@ impl MongoDb {
file_id: 0,
uuid: doc.uuid,
chunk_id: doc.chunk_id,
chunk_index: doc.chunk_index,
chunk_type,
rule: ChunkRule::Rule1,
fps: doc.fps,
@@ -234,7 +232,7 @@ impl MongoDb {
file_id: 0,
uuid: doc.uuid,
chunk_id: doc.chunk_id,
chunk_index: doc.chunk_index,
chunk_type,
rule: ChunkRule::Rule1,
fps: doc.fps,

View File

@@ -56,7 +56,7 @@ pub struct CandidateRecord {
#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)]
pub struct FileIdentityRecord {
pub id: i64,
pub id: i32,
pub file_uuid: String,
pub identity_id: i32,
pub name: String,
@@ -116,7 +116,7 @@ pub struct IdentityFaceRecord {
#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)]
pub struct IdentityChunkRecord {
pub id: i64,
pub id: i32,
pub file_uuid: String,
pub chunk_id: String,
pub chunk_type: String,
@@ -788,8 +788,8 @@ impl PostgresDb {
.await?;
// Chunks
sqlx::query("CREATE TABLE IF NOT EXISTS chunks (id SERIAL PRIMARY KEY, file_uuid VARCHAR(32) NOT NULL, chunk_id VARCHAR(64) NOT NULL, chunk_index INTEGER NOT NULL, chunk_type VARCHAR(32) NOT NULL, start_time DOUBLE PRECISION NOT NULL, end_time DOUBLE PRECISION NOT NULL, fps DOUBLE PRECISION DEFAULT 24.0, start_frame BIGINT DEFAULT 0, end_frame BIGINT DEFAULT 0, content JSONB NOT NULL, metadata JSONB, vector_id VARCHAR(64), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, UNIQUE(file_uuid, chunk_id))").execute(pool).await?;
sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_uuid)")
sqlx::query("CREATE TABLE IF NOT EXISTS chunk (id SERIAL PRIMARY KEY, file_uuid VARCHAR(32) NOT NULL, chunk_id VARCHAR(64) NOT NULL, chunk_type VARCHAR(32) NOT NULL, start_time DOUBLE PRECISION NOT NULL, end_time DOUBLE PRECISION NOT NULL, fps DOUBLE PRECISION DEFAULT 24.0, start_frame BIGINT DEFAULT 0, end_frame BIGINT DEFAULT 0, content JSONB NOT NULL, metadata JSONB, vector_id VARCHAR(64), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, UNIQUE(file_uuid, chunk_id))").execute(pool).await?;
sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunk_file ON chunk(file_uuid)")
.execute(pool)
.await?;
sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunks_type ON chunks(chunk_type)")
@@ -845,7 +845,7 @@ impl PostgresDb {
sqlx::query(
"CREATE TRIGGER chunks_search_vector_trigger
BEFORE INSERT OR UPDATE ON chunks
BEFORE INSERT OR UPDATE ON chunk
FOR EACH ROW EXECUTE FUNCTION update_search_vector()",
)
.execute(pool)
@@ -1232,7 +1232,7 @@ impl PostgresDb {
let tx = self.pool.begin().await?;
let chunk_vectors = schema::table_name("chunk_vectors");
let chunks = schema::table_name("chunks");
let chunks = "dev.chunk";
let processor_results = schema::table_name("processor_results");
let videos = schema::table_name("videos");
@@ -1254,6 +1254,11 @@ impl PostgresDb {
.execute(&self.pool)
.await?;
sqlx::query(&format!("DELETE FROM dev.pre_chunks WHERE file_uuid = $1"))
.bind(uuid)
.execute(&self.pool)
.await?;
sqlx::query(&format!("DELETE FROM {} WHERE file_uuid = $1", videos))
.bind(uuid)
.execute(&self.pool)
@@ -1277,7 +1282,7 @@ impl PostgresDb {
}
pub async fn get_chunk_count(&self, uuid: &str) -> Result<(i64, i64)> {
let chunks = schema::table_name("chunks");
let chunks = "dev.chunk";
let sentence_count: i64 = sqlx::query_scalar(&format!(
"SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND chunk_type = 'sentence'",
chunks
@@ -2417,8 +2422,10 @@ impl PostgresDb {
pub async fn get_identity_by_uuid(&self, uuid: &Uuid) -> Result<Option<IdentityDetailRecord>> {
let query = r#"
SELECT id, uuid, name, identity_type, source, status, metadata, reference_data,
voice_embedding, identity_embedding, face_embedding,
tmdb_id, tmdb_profile, created_at, NULL::timestamptz as updated_at
voice_embedding::real[] as voice_embedding,
identity_embedding::real[] as identity_embedding,
face_embedding::real[] as face_embedding,
tmdb_id, tmdb_profile, created_at::timestamptz as created_at, NULL::timestamptz as updated_at
FROM identities
WHERE uuid = $1
"#;
@@ -2497,7 +2504,7 @@ impl PostgresDb {
let query = r#"
SELECT c.id, c.file_uuid, c.chunk_id, c.chunk_type,
c.start_time, c.end_time, c.text_content, c.content
FROM chunks c
FROM dev.chunk c
WHERE c.file_uuid IN (
SELECT DISTINCT fd.file_uuid
FROM face_detections fd
@@ -2538,7 +2545,7 @@ impl PostgresDb {
}
pub async fn store_chunk(&self, chunk: &Chunk) -> Result<()> {
let table = schema::table_name("chunks");
let table = "dev.chunk";
let content_with_rule = serde_json::json!({
"rule": chunk.rule.as_str(),
"data": chunk.content
@@ -2567,9 +2574,9 @@ impl PostgresDb {
sqlx::query(&format!(
r#"
INSERT INTO {} (file_id, file_uuid, chunk_id, old_chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids)
VALUES ($1, $2, $3, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12::jsonb, $13::jsonb, $14, $15, $16, $17, $18)
ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE SET
INSERT INTO {} (file_id, file_uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11::jsonb, $12::jsonb, $13, $14, $15, $16, $17)
ON CONFLICT (file_uuid, chunk_id) DO UPDATE SET
start_time = EXCLUDED.start_time,
end_time = EXCLUDED.end_time,
fps = EXCLUDED.fps,
@@ -2590,7 +2597,6 @@ impl PostgresDb {
.bind(chunk.file_id)
.bind(&chunk.uuid)
.bind(&chunk.chunk_id)
.bind(chunk.chunk_index as i32)
.bind(chunk.chunk_type.as_str())
.bind(chunk.start_time().seconds())
.bind(chunk.end_time().seconds())
@@ -2616,7 +2622,7 @@ impl PostgresDb {
chunk: &Chunk,
tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
) -> Result<()> {
let table = schema::table_name("chunks");
let table = "dev.chunk";
let content_with_rule = serde_json::json!({
"rule": chunk.rule.as_str(),
"data": chunk.content
@@ -2642,9 +2648,9 @@ impl PostgresDb {
sqlx::query(&format!(
r#"
INSERT INTO {} (file_id, file_uuid, chunk_id, old_chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids)
VALUES ($1, $2, $3, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12::jsonb, $13::jsonb, $14, $15, $16, $17, $18)
ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE SET
INSERT INTO {} (file_id, file_uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids)
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11::jsonb, $12::jsonb, $13, $14, $15, $16, $17)
ON CONFLICT (file_uuid, chunk_id) DO UPDATE SET
start_time = EXCLUDED.start_time,
end_time = EXCLUDED.end_time,
fps = EXCLUDED.fps,
@@ -2665,7 +2671,6 @@ impl PostgresDb {
.bind(chunk.file_id)
.bind(&chunk.uuid)
.bind(&chunk.chunk_id)
.bind(chunk.chunk_index as i32)
.bind(chunk.chunk_type.as_str())
.bind(chunk.start_time().seconds())
.bind(chunk.end_time().seconds())
@@ -2687,9 +2692,9 @@ impl PostgresDb {
}
pub async fn get_chunks_by_uuid(&self, uuid: &str) -> Result<Vec<Chunk>> {
let table = schema::table_name("chunks");
let table = "dev.chunk";
let rows = sqlx::query(&format!(
"SELECT COALESCE(file_id, 0) as file_id, file_uuid as uuid, chunk_id, chunk_index, chunk_type, COALESCE(fps, 24.0) as fps, COALESCE(start_frame, 0) as start_frame, COALESCE(end_frame, 0) as end_frame, text_content, content, metadata, vector_id, COALESCE(frame_count, 0) as frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids, visual_stats FROM {} WHERE file_uuid = $1 ORDER BY chunk_index",
"SELECT COALESCE(file_id, 0) as file_id, file_uuid as uuid, chunk_id, chunk_type, COALESCE(fps, 24.0) as fps, COALESCE(start_frame, 0) as start_frame, COALESCE(end_frame, 0) as end_frame, text_content, content, metadata, vector_id, COALESCE(frame_count, 0) as frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids, visual_stats FROM {} WHERE file_uuid = $1 ORDER BY id",
table
))
.bind(uuid)
@@ -2699,8 +2704,7 @@ impl PostgresDb {
let chunks: Vec<Chunk> = rows
.into_iter()
.map(|r| {
let chunk_type_str: String = r.get(4);
let chunk_index: i32 = r.get(3);
let chunk_type_str: String = r.get(3);
let chunk_type = match chunk_type_str.as_str() {
"time" => ChunkType::TimeBased,
"sentence" => ChunkType::Sentence,
@@ -2740,7 +2744,7 @@ impl PostgresDb {
file_id,
uuid: r.get("uuid"),
chunk_id: r.get("chunk_id"),
chunk_index: chunk_index as u32,
chunk_type,
rule,
@@ -2768,9 +2772,9 @@ impl PostgresDb {
chunk_id: &str,
uuid: &str,
) -> Result<Option<Chunk>> {
let table = schema::table_name("chunks");
let table = "dev.chunk";
let row = sqlx::query(&format!(
"SELECT COALESCE(file_id, 0) as file_id, uuid, chunk_id, chunk_index, chunk_type, COALESCE(fps, 24.0) as fps, COALESCE(start_frame, 0) as start_frame, COALESCE(end_frame, 0) as end_frame, text_content, content, metadata, vector_id, COALESCE(frame_count, 0) as frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids, visual_stats FROM {} WHERE chunk_id = $1 AND uuid = $2",
"SELECT COALESCE(file_id, 0) as file_id, uuid, chunk_id, chunk_type, COALESCE(fps, 24.0) as fps, COALESCE(start_frame, 0) as start_frame, COALESCE(end_frame, 0) as end_frame, text_content, content, metadata, vector_id, COALESCE(frame_count, 0) as frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids, visual_stats FROM {} WHERE chunk_id = $1 AND uuid = $2",
table
))
.bind(chunk_id)
@@ -2779,25 +2783,24 @@ impl PostgresDb {
.await?;
if let Some(r) = row {
let chunk_type_str: String = r.get(4);
let chunk_index: i32 = r.get(3);
let chunk_type = match chunk_type_str.as_str() {
"time" => ChunkType::TimeBased,
"sentence" => ChunkType::Sentence,
"cut" => ChunkType::Cut,
"trace" => ChunkType::Trace,
"story" => ChunkType::Story,
_ => ChunkType::TimeBased,
};
let chunk_type_str: String = r.get(3);
let chunk_type = match chunk_type_str.as_str() {
"time" => ChunkType::TimeBased,
"sentence" => ChunkType::Sentence,
"cut" => ChunkType::Cut,
"trace" => ChunkType::Trace,
"story" => ChunkType::Story,
_ => ChunkType::TimeBased,
};
let content: serde_json::Value = r.get(9);
let metadata: Option<serde_json::Value> = r.get(10);
let content: serde_json::Value = r.get(8);
let metadata: Option<serde_json::Value> = r.get(9);
let pre_chunk_ids: Vec<i32> = r.try_get(13).unwrap_or_default();
let parent_chunk_id: Option<String> = r.try_get(14).ok().flatten();
let child_chunk_ids: Vec<String> = r.try_get(15).unwrap_or_default();
let pre_chunk_ids: Vec<i32> = r.try_get(12).unwrap_or_default();
let parent_chunk_id: Option<String> = r.try_get(13).ok().flatten();
let child_chunk_ids: Vec<String> = r.try_get(14).unwrap_or_default();
let (rule, content_data) = if content.get("rule").is_some() {
let (rule, content_data) = if content.get("rule").is_some() {
let rule_str = content
.get("rule")
.and_then(|v| v.as_str())
@@ -2820,7 +2823,7 @@ impl PostgresDb {
file_id,
uuid: r.get("uuid"),
chunk_id: r.get("chunk_id"),
chunk_index: chunk_index as u32,
chunk_type,
rule,
fps: r.get("fps"),
@@ -2996,9 +2999,9 @@ impl PostgresDb {
start_time: f64,
end_time: f64,
) -> Result<Vec<Chunk>> {
let table = schema::table_name("chunks");
let table = "dev.chunk";
let rows = sqlx::query(&format!(
"SELECT file_id, uuid, chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids
"SELECT file_id, uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids
FROM {}
WHERE file_id = $1 AND start_time >= $2 AND end_time <= $3
ORDER BY start_time",
@@ -3013,8 +3016,7 @@ impl PostgresDb {
let chunks: Vec<Chunk> = rows
.into_iter()
.map(|r| {
let chunk_type_str: String = r.get(4);
let chunk_index: i32 = r.get(3);
let chunk_type_str: String = r.get(3);
let chunk_type = match chunk_type_str.as_str() {
"time" => ChunkType::TimeBased,
"sentence" => ChunkType::Sentence,
@@ -3024,12 +3026,12 @@ impl PostgresDb {
_ => ChunkType::TimeBased,
};
let content: serde_json::Value = r.get(11);
let metadata: Option<serde_json::Value> = r.get(12);
let content: serde_json::Value = r.get(10);
let metadata: Option<serde_json::Value> = r.get(11);
let pre_chunk_ids: Vec<i32> = r.try_get(15).unwrap_or_default();
let parent_chunk_id: Option<String> = r.try_get(16).ok().flatten();
let child_chunk_ids: Vec<String> = r.try_get(17).unwrap_or_default();
let pre_chunk_ids: Vec<i32> = r.try_get(14).unwrap_or_default();
let parent_chunk_id: Option<String> = r.try_get(15).ok().flatten();
let child_chunk_ids: Vec<String> = r.try_get(16).unwrap_or_default();
let (rule, content_data) = if content.get("rule").is_some() {
let rule_str = content
@@ -3054,7 +3056,7 @@ impl PostgresDb {
file_id,
uuid: r.get("uuid"),
chunk_id: r.get("chunk_id"),
chunk_index: chunk_index as u32,
chunk_type,
rule,
@@ -3082,9 +3084,9 @@ impl PostgresDb {
return Ok(vec![]);
}
let table = schema::table_name("chunks");
let table = "dev.chunk";
let rows = sqlx::query(&format!(
"SELECT file_id, uuid, chunk_id, chunk_index, chunk_type, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids FROM {} WHERE chunk_id = ANY($1) ORDER BY chunk_index",
"SELECT file_id, uuid, chunk_id, chunk_type, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids FROM {} WHERE chunk_id = ANY($1) ORDER BY id",
table
))
.bind(chunk_ids)
@@ -3094,8 +3096,7 @@ impl PostgresDb {
let chunks: Vec<Chunk> = rows
.into_iter()
.map(|r| {
let chunk_type_str: String = r.get(4);
let chunk_index: i32 = r.get(3);
let chunk_type_str: String = r.get(3);
let chunk_type = match chunk_type_str.as_str() {
"time" => ChunkType::TimeBased,
"sentence" => ChunkType::Sentence,
@@ -3135,7 +3136,7 @@ impl PostgresDb {
file_id,
uuid: r.get("uuid"),
chunk_id: r.get("chunk_id"),
chunk_index: chunk_index as u32,
chunk_type,
rule,
@@ -3192,7 +3193,7 @@ impl PostgresDb {
}
pub async fn update_vector_id(&self, chunk_id: &str, vector_id: &str) -> Result<()> {
let table = schema::table_name("chunks");
let table = "dev.chunk";
sqlx::query(&format!(
"UPDATE {} SET vector_id = $1 WHERE chunk_id = $2",
table
@@ -3214,12 +3215,12 @@ impl PostgresDb {
}
pub async fn search_text(&self, query: &str, chunk_type: Option<&str>) -> Result<Vec<Chunk>> {
let table = schema::table_name("chunks");
let table = "dev.chunk";
let query_pattern = format!("%{}%", query);
let sql = match chunk_type {
Some(_) => &format!("SELECT uuid, chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, content, metadata, vector_id, parent_chunk_id, child_chunk_ids FROM {} WHERE content->>'text' ILIKE $1 AND chunk_type = $2 ORDER BY chunk_index", table),
None => &format!("SELECT uuid, chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, content, metadata, vector_id, parent_chunk_id, child_chunk_ids FROM {} WHERE content->>'text' ILIKE $1 ORDER BY chunk_index", table),
Some(_) => &format!("SELECT uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, content, metadata, vector_id, parent_chunk_id, child_chunk_ids FROM {} WHERE content->>'text' ILIKE $1 AND chunk_type = $2 ORDER BY id", table),
None => &format!("SELECT uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, content, metadata, vector_id, parent_chunk_id, child_chunk_ids FROM {} WHERE content->>'text' ILIKE $1 ORDER BY id", table),
};
let chunks = if let Some(ct) = chunk_type {
@@ -3228,7 +3229,6 @@ impl PostgresDb {
(
String,
String,
i32,
String,
f64,
f64,
@@ -3252,7 +3252,6 @@ impl PostgresDb {
(
String,
String,
i32,
String,
f64,
f64,
@@ -3274,7 +3273,7 @@ impl PostgresDb {
let results: Vec<Chunk> = chunks
.into_iter()
.map(|r| {
let chunk_type = match r.3.as_str() {
let chunk_type = match r.2.as_str() {
"time_based" => ChunkType::TimeBased,
"sentence" => ChunkType::Sentence,
"cut" => ChunkType::Cut,
@@ -3284,29 +3283,29 @@ impl PostgresDb {
};
let content: serde_json::Value =
serde_json::from_str(&r.9).unwrap_or(serde_json::json!({}));
serde_json::from_str(&r.8).unwrap_or(serde_json::json!({}));
let metadata: Option<serde_json::Value> =
r.10.and_then(|m| serde_json::from_str(&m).ok());
r.9.and_then(|m| serde_json::from_str(&m).ok());
Chunk {
file_id: 0,
uuid: r.0,
chunk_id: r.1,
chunk_index: r.2 as u32,
chunk_type,
rule: ChunkRule::Rule1,
fps: r.6,
start_frame: r.7,
end_frame: r.8,
text_content: Some(r.9),
fps: r.5,
start_frame: r.6,
end_frame: r.7,
text_content: Some(r.8),
content,
metadata,
vector_id: r.11,
vector_id: r.10,
frame_count: 0,
pre_chunk_ids: vec![],
parent_chunk_id: r.12,
child_chunk_ids: r.13,
parent_chunk_id: r.11,
child_chunk_ids: r.12,
visual_stats: None,
}
})
@@ -3321,13 +3320,13 @@ impl PostgresDb {
uuid: Option<&str>,
limit: usize,
) -> Result<Vec<Bm25Result>> {
let table = schema::table_name("chunks");
let table = "dev.chunk";
let tsquery = self.prepare_tsquery(query).await?;
let sql = match uuid {
Some(_) => &format!(
r#"
SELECT c.chunk_id, c.file_uuid, c.chunk_index, c.chunk_type, c.start_frame, c.end_frame, c.fps, c.start_time, c.end_time,
SELECT c.chunk_id, c.file_uuid, c.chunk_type, c.start_frame, c.end_frame, c.fps, c.start_time, c.end_time,
c.text_content, GREATEST(ts_rank_cd(c.search_vector, to_tsquery('english', $1)), ts_rank_cd(pc.summary_tsvector, to_tsquery('english', $1))) as bm25_score,
c.visual_stats,
pc.metadata->'structured_summary' as scene_summary,
@@ -3342,7 +3341,7 @@ impl PostgresDb {
),
None => &format!(
r#"
SELECT c.chunk_id, c.file_uuid, c.chunk_index, c.chunk_type, c.start_frame, c.end_frame, c.fps, c.start_time, c.end_time,
SELECT c.chunk_id, c.file_uuid, c.chunk_type, c.start_frame, c.end_frame, c.fps, c.start_time, c.end_time,
c.text_content, GREATEST(ts_rank_cd(c.search_vector, to_tsquery('english', $1)), ts_rank_cd(pc.summary_tsvector, to_tsquery('english', $1))) as bm25_score,
c.visual_stats,
pc.metadata->'structured_summary' as scene_summary,
@@ -3406,7 +3405,7 @@ impl PostgresDb {
Bm25Result {
chunk_id: r.0,
uuid: r.1,
chunk_index: r.2 as u32,
chunk_type: r.3,
start_frame: r.4,
end_frame: r.5,
@@ -3472,7 +3471,7 @@ impl PostgresDb {
HybridSearchResult {
chunk_id: r.chunk_id.clone(),
uuid: r.uuid.clone(),
chunk_index: r.chunk_index,
chunk_type: r.chunk_type.clone(),
start_frame: r.start_frame,
end_frame: r.end_frame,
@@ -3526,7 +3525,7 @@ impl PostgresDb {
HybridSearchResult {
chunk_id: r.chunk_id.clone(),
uuid: r.uuid.clone(),
chunk_index: chunk_data.map(|c| c.chunk_index).unwrap_or(0),
chunk_type: chunk_data
.map(|c| c.chunk_type.as_str().to_string())
.unwrap_or_default(),
@@ -3779,7 +3778,6 @@ pub struct SceneSummary {
pub struct Bm25Result {
pub chunk_id: String,
pub uuid: String,
pub chunk_index: u32,
pub chunk_type: String,
pub start_frame: i64,
pub end_frame: i64,
@@ -3797,7 +3795,6 @@ pub struct Bm25Result {
pub struct HybridSearchResult {
pub uuid: String,
pub chunk_id: String,
pub chunk_index: u32,
pub chunk_type: String,
pub start_frame: i64,
pub end_frame: i64,
@@ -4443,7 +4440,7 @@ impl PostgresDb {
total_frames: u64,
) -> Result<()> {
let table = schema::table_name("videos");
let chunks_table = schema::table_name("chunks");
let chunks_table = "dev.chunk";
let pre_chunks_table = schema::table_name("pre_chunks");
// Query chunks count and frames
@@ -4622,7 +4619,7 @@ impl PostgresDb {
let results = sqlx::query_as::<_, SemanticSearchResult>(
r#"
SELECT
id, chunk_index as scene_order, start_time, end_time,
id as scene_order, start_time, end_time,
COALESCE(summary_text, text_content, '') as summary,
metadata,
(1 - (embedding <=> $1::vector)) as similarity
@@ -4820,7 +4817,7 @@ mod tests {
"file_id": 1,
"uuid": "test",
"chunk_id": "c1",
"chunk_index": 0,
"chunk_type": "time_based",
"rule": "rule1",
"start_time": 0.0,
@@ -4960,7 +4957,7 @@ mod tests {
let result = Bm25Result {
chunk_id: "sentence_001".to_string(),
uuid: "test-uuid".to_string(),
chunk_index: 1,
chunk_type: "sentence".to_string(),
start_frame: 0,
end_frame: 150,
@@ -4985,7 +4982,7 @@ mod tests {
let result = HybridSearchResult {
chunk_id: "sentence_001".to_string(),
uuid: "test-uuid".to_string(),
chunk_index: 1,
chunk_type: "sentence".to_string(),
start_frame: 0,
end_frame: 150,

View File

@@ -120,9 +120,16 @@ impl QdrantDb {
.json(&body)
.send()
.await
.context(format!("Failed to create Qdrant collection: {}", collection))?;
.context(format!(
"Failed to create Qdrant collection: {}",
collection
))?;
tracing::info!("Created Qdrant collection: {} (dim={})", collection, vector_dim);
tracing::info!(
"Created Qdrant collection: {} (dim={})",
collection,
vector_dim
);
Ok(())
}

View File

@@ -129,7 +129,7 @@ impl SyncDb {
let chunk = Chunk::from_seconds(
0, // file_id - will be set later
uuid.to_string(),
i as u32,
format!("{}", i),
ChunkType::Sentence,
ChunkRule::Rule1,
segment.start,

View File

@@ -43,8 +43,7 @@ impl Embedder {
}
fn default_url() -> String {
std::env::var("MOMENTRY_EMBED_URL")
.unwrap_or_else(|_| "http://localhost:11434".to_string())
std::env::var("MOMENTRY_EMBED_URL").unwrap_or_else(|_| "http://localhost:11434".to_string())
}
pub async fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
@@ -91,7 +90,12 @@ impl Embedder {
.await
.context("Failed to parse embedding response")?;
Ok(result.data.into_iter().next().map(|d| d.embedding).unwrap_or_default())
Ok(result
.data
.into_iter()
.next()
.map(|d| d.embedding)
.unwrap_or_default())
} else {
let url = format!("{}/api/embeddings", self.base_url);
let response = self

View File

@@ -1,11 +1,8 @@
use anyhow::{Context, Result};
use serde::{Deserialize, Serialize};
use std::time::Duration;
use super::executor::PythonExecutor;
const ASR_TIMEOUT: Duration = Duration::from_secs(1800); // 30 minutes
#[derive(Debug, Serialize, Deserialize)]
pub struct AsrResult {
pub language: Option<String>,
@@ -36,7 +33,7 @@ pub async fn process_asr(
&[video_path, output_path],
uuid,
"ASR",
Some(ASR_TIMEOUT),
None,
)
.await
.with_context(|| format!("Failed to run {:?}", script_path))?;

View File

@@ -247,7 +247,10 @@ impl PythonExecutor {
let mut partial_path = out.to_path_buf();
partial_path.set_extension("json.partial");
let _ = std::fs::rename(tmp, &partial_path);
tracing::warn!("[Executor] Partial output preserved: {:?}", partial_path);
tracing::warn!(
"[Executor] Partial output preserved: {:?}",
partial_path
);
} else {
let mut err_path = out.to_path_buf();
err_path.set_extension("json.err");

View File

@@ -131,7 +131,7 @@ fn create_fixed_frame_chunks(
let chunk = crate::core::chunk::Chunk::from_yolo_frames(
file_id,
uuid.to_string(),
chunk_index,
format!("vis_{}", chunk_index),
start_frame,
end_frame,
fps,
@@ -190,7 +190,7 @@ fn create_similarity_based_chunks(
let chunk = crate::core::chunk::Chunk::from_yolo_frames(
file_id,
uuid.to_string(),
chunk_index,
format!("vis_{}", chunk_index),
current_start_frame,
end_frame,
fps,
@@ -214,7 +214,7 @@ fn create_similarity_based_chunks(
let chunk = crate::core::chunk::Chunk::from_yolo_frames(
file_id,
uuid.to_string(),
chunk_index,
format!("vis_{}", chunk_index),
current_start_frame,
end_frame,
fps,

View File

@@ -13,11 +13,17 @@ struct TmdbIdentity {
}
/// Computes the cosine similarity between two `f32` vectors.
///
/// Returns `0.0` when the slices differ in length, when they are empty, or
/// when either vector has zero magnitude; otherwise returns
/// `dot(a, b) / (|a| * |b|)`.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Vectors of different (or zero) dimension cannot be compared.
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let na: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let nb: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    // Guard the division: a zero-norm vector would otherwise yield NaN.
    if na == 0.0 || nb == 0.0 {
        0.0
    } else {
        dot / (na * nb)
    }
}
/// Match face detections against TMDb identities using iterative multi-angle propagation.
@@ -42,10 +48,11 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
let fd_rows = sqlx::query_as::<_, (i32, Vec<f32>)>(
"SELECT trace_id, embedding FROM dev.face_detections \
WHERE file_uuid=$1 AND trace_id IS NOT NULL AND embedding IS NOT NULL \
ORDER BY trace_id"
ORDER BY trace_id",
)
.bind(file_uuid)
.fetch_all(pool).await?;
.fetch_all(pool)
.await?;
if fd_rows.is_empty() {
info!("[TKG-MATCH] No face detections for {}", file_uuid);
@@ -77,14 +84,23 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
for (id, name, tmdb_emb) in &tmdb_rows {
for face in faces {
let s = cosine_similarity(face, tmdb_emb);
if s > best_sim { best_sim = s; best_id = *id; best_name = name.clone(); }
if s > best_sim {
best_sim = s;
best_id = *id;
best_name = name.clone();
}
}
}
if best_sim >= TH {
matched.insert(tid, (best_id, best_name));
}
}
info!("[TKG-MATCH] Round 1: {} ({}/{})", matched.len(), matched.len() * 100 / total, total);
info!(
"[TKG-MATCH] Round 1: {} ({}/{})",
matched.len(),
matched.len() * 100 / total,
total
);
// Round 2+: propagate
for round_n in 2..=10 {
@@ -98,7 +114,9 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
let mut new_matches: Vec<(i32, i32, String)> = Vec::new();
for (&tid, faces) in &trace_faces {
if matched.contains_key(&tid) || faces.is_empty() { continue; }
if matched.contains_key(&tid) || faces.is_empty() {
continue;
}
let ref_face = &faces[0];
let mut best_id = 0i32;
let mut best_name = String::new();
@@ -106,13 +124,19 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
for (&id, seed_faces) in &seed_pool {
for seed in seed_faces {
let s = cosine_similarity(ref_face, seed);
if s > best_sim { best_sim = s; best_id = id; }
if s > best_sim {
best_sim = s;
best_id = id;
}
}
}
if best_sim >= TH {
// Look up name for this id
for (id, name, _) in &tmdb_rows {
if *id == best_id { best_name = name.clone(); break; }
if *id == best_id {
best_name = name.clone();
break;
}
}
new_matches.push((tid, best_id, best_name));
}
@@ -121,7 +145,9 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
matched.insert(tid, (id, name));
}
let new = matched.len() - prev;
if new < 5 { break; }
if new < 5 {
break;
}
}
// Step 4: Quality control
@@ -129,41 +155,62 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
let mut after_qc = HashMap::new();
for (&tid, &(id, ref name)) in &matched {
let cnt: i64 = sqlx::query_scalar(
"SELECT COUNT(*) FROM dev.face_detections WHERE file_uuid=$1 AND trace_id=$2"
"SELECT COUNT(*) FROM dev.face_detections WHERE file_uuid=$1 AND trace_id=$2",
)
.bind(file_uuid).bind(tid)
.fetch_one(pool).await.unwrap_or(0);
.bind(file_uuid)
.bind(tid)
.fetch_one(pool)
.await
.unwrap_or(0);
if cnt >= 4 {
after_qc.insert(tid, (id, name.clone()));
} else {
info!("[TKG-QC] trace {} removed: only {} face(s), need >= 4", tid, cnt);
info!(
"[TKG-QC] trace {} removed: only {} face(s), need >= 4",
tid, cnt
);
}
}
let matched = after_qc;
let removed_low = total - matched.len();
if removed_low > 0 {
info!("[TKG-QC] Removed {} low-confidence traces (< 4 faces)", removed_low);
info!(
"[TKG-QC] Removed {} low-confidence traces (< 4 faces)",
removed_low
);
}
// 4b: Temporal collision check
let removed_collisions = quality_check_temporal_collisions(pool, file_uuid).await?;
if removed_collisions > 0 {
info!("[TKG-QC] Resolved {} temporal collisions", removed_collisions);
info!(
"[TKG-QC] Resolved {} temporal collisions",
removed_collisions
);
}
// Step 5: Update DB
let mut updated = 0usize;
for (&tid, &(id, _)) in &matched {
let r = sqlx::query(
"UPDATE dev.face_detections SET identity_id=$1 WHERE file_uuid=$2 AND trace_id=$3"
"UPDATE dev.face_detections SET identity_id=$1 WHERE file_uuid=$2 AND trace_id=$3",
)
.bind(id).bind(file_uuid).bind(tid)
.execute(pool).await?;
if r.rows_affected() > 0 { updated += 1; }
.bind(id)
.bind(file_uuid)
.bind(tid)
.execute(pool)
.await?;
if r.rows_affected() > 0 {
updated += 1;
}
}
info!("[TKG-MATCH] Done: {}/{} traces matched ({}%)",
matched.len(), total, matched.len() * 100 / total);
info!(
"[TKG-MATCH] Done: {}/{} traces matched ({}%)",
matched.len(),
total,
matched.len() * 100 / total
);
Ok(updated)
}
@@ -185,10 +232,11 @@ async fn quality_check_temporal_collisions(pool: &sqlx::PgPool, file_uuid: &str)
AND a.identity_id IS NOT NULL
AND a.identity_id = b.identity_id
ORDER BY a.identity_id, a.frame_number
"#
"#,
)
.bind(file_uuid)
.fetch_all(pool).await?;
.fetch_all(pool)
.await?;
if collisions.is_empty() {
return Ok(0);
@@ -221,10 +269,12 @@ async fn quality_check_temporal_collisions(pool: &sqlx::PgPool, file_uuid: &str)
let victim_cnt = if cnt_a <= cnt_b { cnt_a } else { cnt_b };
sqlx::query(
"UPDATE dev.face_detections SET identity_id=NULL WHERE file_uuid=$1 AND trace_id=$2"
"UPDATE dev.face_detections SET identity_id=NULL WHERE file_uuid=$1 AND trace_id=$2",
)
.bind(file_uuid).bind(victim)
.execute(pool).await?;
.bind(file_uuid)
.bind(victim)
.execute(pool)
.await?;
unbound += 1;
warn!("[TKG-QC] Collision identity={}: trace {} vs trace {} ({} overlap frames). Unbound trace {} ({} detections)",