feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks -> dev.chunk; remove old_chunk_id/chunk_index.
Correction: asr-1.json format; generate/apply scripts.
API: 37/37 endpoints fixed and tested.
Docs: HANDOVER_V2.0.md for M4.
This commit is contained in:
@@ -6,6 +6,6 @@ pub mod types;
|
||||
|
||||
pub use rule1_ingest::execute_rule1;
|
||||
pub use rule3_ingest::ingest_rule3;
|
||||
pub use trace_ingest::ingest_traces;
|
||||
pub use splitter::{AsrSegment, ChunkSplitter};
|
||||
pub use trace_ingest::ingest_traces;
|
||||
pub use types::{Chunk, ChunkType};
|
||||
|
||||
@@ -50,7 +50,7 @@ pub async fn execute_rule1(db: &PostgresDb, file_uuid: &str, fps: f64) -> Result
|
||||
let chunk = Chunk::from_seconds(
|
||||
file_id as i32,
|
||||
file_uuid.to_string(),
|
||||
idx as u32,
|
||||
format!("{}", idx),
|
||||
ChunkType::Sentence,
|
||||
ChunkRule::Rule1,
|
||||
seg.start_time,
|
||||
|
||||
@@ -73,7 +73,7 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
|
||||
// Query chunks table for Rule 1 sentence chunks
|
||||
let rule1_rows: Vec<(String,)> = sqlx::query_as(
|
||||
r#"
|
||||
SELECT chunk_id FROM chunks
|
||||
SELECT chunk_id FROM dev.chunk
|
||||
WHERE file_uuid = $1 AND chunk_type = 'sentence'
|
||||
AND start_frame >= $2
|
||||
AND end_frame <= $3
|
||||
@@ -98,7 +98,7 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
|
||||
|
||||
let texts: Vec<String> = sqlx::query_scalar(
|
||||
r#"
|
||||
SELECT text_content FROM chunks
|
||||
SELECT text_content FROM dev.chunk
|
||||
WHERE file_uuid = $1 AND chunk_type = 'sentence'
|
||||
AND start_frame >= $2
|
||||
AND end_frame <= $3
|
||||
@@ -135,10 +135,11 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
|
||||
);
|
||||
|
||||
// 4. Insert into dev.chunks
|
||||
let fps_query: Option<f64> = sqlx::query_scalar("SELECT fps FROM videos WHERE file_uuid = $1")
|
||||
.bind(file_uuid)
|
||||
.fetch_optional(&mut *tx)
|
||||
.await?;
|
||||
let fps_query: Option<f64> =
|
||||
sqlx::query_scalar("SELECT fps FROM videos WHERE file_uuid = $1")
|
||||
.bind(file_uuid)
|
||||
.fetch_optional(&mut *tx)
|
||||
.await?;
|
||||
let fps = fps_query.unwrap_or(29.97);
|
||||
|
||||
// Prepare metadata JSON
|
||||
@@ -149,12 +150,12 @@ pub async fn ingest_rule3(pool: &PgPool, file_uuid: &str) -> Result<usize> {
|
||||
|
||||
sqlx::query(
|
||||
r#"
|
||||
INSERT INTO chunks (
|
||||
file_uuid, chunk_id, old_chunk_id, chunk_index, chunk_type,
|
||||
INSERT INTO dev.chunk (
|
||||
file_uuid, chunk_id, chunk_type,
|
||||
start_time, end_time, fps, start_frame, end_frame,
|
||||
content, text_content, summary_text, metadata, child_chunk_ids
|
||||
) VALUES ($1, $2, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14)
|
||||
ON CONFLICT (file_uuid, old_chunk_id) DO NOTHING
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13)
|
||||
ON CONFLICT (file_uuid, chunk_id) DO NOTHING
|
||||
"#,
|
||||
)
|
||||
.bind(file_uuid)
|
||||
|
||||
@@ -23,7 +23,7 @@ impl ChunkSplitter {
|
||||
chunks.push(Chunk::from_seconds(
|
||||
0, // file_id
|
||||
uuid.to_string(),
|
||||
index,
|
||||
format!("{}", index),
|
||||
ChunkType::TimeBased,
|
||||
ChunkRule::Rule1,
|
||||
current_time,
|
||||
@@ -48,7 +48,7 @@ impl ChunkSplitter {
|
||||
chunks.push(Chunk::from_seconds(
|
||||
0, // file_id
|
||||
uuid.to_string(),
|
||||
index as u32,
|
||||
format!("{}", index),
|
||||
ChunkType::Sentence,
|
||||
ChunkRule::Rule1,
|
||||
segment.start,
|
||||
|
||||
@@ -95,7 +95,7 @@ pub async fn ingest_traces(db: &PostgresDb, file_uuid: &str) -> Result<usize> {
|
||||
let chunk = Chunk::new(
|
||||
file_id,
|
||||
file_uuid.to_string(),
|
||||
(count + 1) as u32,
|
||||
format!("trace_{}", count + 1),
|
||||
ChunkType::Trace,
|
||||
ChunkRule::Rule1,
|
||||
trace.first_frame as i64,
|
||||
@@ -110,17 +110,29 @@ pub async fn ingest_traces(db: &PostgresDb, file_uuid: &str) -> Result<usize> {
|
||||
if let Err(e) = db.store_chunk(&chunk).await {
|
||||
error!("Failed to store trace chunk {}: {}", trace.trace_id, e);
|
||||
} else {
|
||||
let preview = chunk.text_content.as_deref().unwrap_or("").chars().take(60).collect::<String>();
|
||||
let co = chunk.metadata.as_ref()
|
||||
let preview = chunk
|
||||
.text_content
|
||||
.as_deref()
|
||||
.unwrap_or("")
|
||||
.chars()
|
||||
.take(60)
|
||||
.collect::<String>();
|
||||
let co = chunk
|
||||
.metadata
|
||||
.as_ref()
|
||||
.and_then(|m| m.get("co_appearances"))
|
||||
.and_then(|c| c.as_array())
|
||||
.map(|a| a.len())
|
||||
.unwrap_or(0);
|
||||
info!(
|
||||
"Trace chunk {}: trace_id={} frames={}-{} faces={} co_appear={} text={}",
|
||||
chunk.chunk_id, trace.trace_id,
|
||||
trace.first_frame, trace.last_frame,
|
||||
trace.face_count, co, preview,
|
||||
chunk.chunk_id,
|
||||
trace.trace_id,
|
||||
trace.first_frame,
|
||||
trace.last_frame,
|
||||
trace.face_count,
|
||||
co,
|
||||
preview,
|
||||
);
|
||||
count += 1;
|
||||
}
|
||||
@@ -209,14 +221,11 @@ impl<'r> sqlx::FromRow<'r, sqlx::postgres::PgRow> for AsrSegment {
|
||||
|
||||
impl AsrSegment {
|
||||
fn text(&self) -> Option<&str> {
|
||||
self.data
|
||||
.get("text")
|
||||
.and_then(|v| v.as_str())
|
||||
.or_else(|| {
|
||||
self.data
|
||||
.get("data")
|
||||
.and_then(|d| d.get("text"))
|
||||
.and_then(|v| v.as_str())
|
||||
})
|
||||
self.data.get("text").and_then(|v| v.as_str()).or_else(|| {
|
||||
self.data
|
||||
.get("data")
|
||||
.and_then(|d| d.get("text"))
|
||||
.and_then(|v| v.as_str())
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -115,7 +115,6 @@ pub struct Chunk {
|
||||
pub file_id: i32,
|
||||
pub uuid: String,
|
||||
pub chunk_id: String,
|
||||
pub chunk_index: u32,
|
||||
pub chunk_type: ChunkType,
|
||||
pub rule: ChunkRule,
|
||||
/// Frames per second (can be fractional, e.g., 29.97, 23.976)
|
||||
@@ -140,7 +139,7 @@ impl Chunk {
|
||||
pub fn new(
|
||||
file_id: i32,
|
||||
uuid: String,
|
||||
chunk_index: u32,
|
||||
chunk_id: String,
|
||||
chunk_type: ChunkType,
|
||||
rule: ChunkRule,
|
||||
start_frame: i64,
|
||||
@@ -149,13 +148,11 @@ impl Chunk {
|
||||
content: serde_json::Value,
|
||||
) -> Self {
|
||||
let frame_count = (end_frame - start_frame) as i32;
|
||||
let chunk_id = format!("{}_{}", uuid, chunk_index);
|
||||
|
||||
Self {
|
||||
file_id,
|
||||
uuid,
|
||||
chunk_id,
|
||||
chunk_index,
|
||||
chunk_type,
|
||||
rule,
|
||||
fps,
|
||||
@@ -177,7 +174,7 @@ impl Chunk {
|
||||
pub fn new_visual(
|
||||
file_id: i32,
|
||||
uuid: String,
|
||||
chunk_index: u32,
|
||||
chunk_id: String,
|
||||
start_frame: i64,
|
||||
end_frame: i64,
|
||||
fps: f64,
|
||||
@@ -189,7 +186,7 @@ impl Chunk {
|
||||
Self::new(
|
||||
file_id,
|
||||
uuid,
|
||||
chunk_index,
|
||||
chunk_id,
|
||||
ChunkType::Visual,
|
||||
ChunkRule::Rule2,
|
||||
start_frame,
|
||||
@@ -203,7 +200,7 @@ impl Chunk {
|
||||
pub fn from_yolo_frames(
|
||||
file_id: i32,
|
||||
uuid: String,
|
||||
chunk_index: u32,
|
||||
chunk_id: String,
|
||||
start_frame: i64,
|
||||
end_frame: i64,
|
||||
fps: f64,
|
||||
@@ -307,7 +304,7 @@ impl Chunk {
|
||||
Self::new_visual(
|
||||
file_id,
|
||||
uuid,
|
||||
chunk_index,
|
||||
chunk_id,
|
||||
start_frame,
|
||||
end_frame,
|
||||
fps,
|
||||
@@ -334,7 +331,7 @@ impl Chunk {
|
||||
pub fn from_seconds(
|
||||
file_id: i32,
|
||||
uuid: String,
|
||||
chunk_index: u32,
|
||||
chunk_id: String,
|
||||
chunk_type: ChunkType,
|
||||
rule: ChunkRule,
|
||||
start_time: f64,
|
||||
@@ -347,7 +344,7 @@ impl Chunk {
|
||||
Self::new(
|
||||
file_id,
|
||||
uuid,
|
||||
chunk_index,
|
||||
chunk_id,
|
||||
chunk_type,
|
||||
rule,
|
||||
start_frame,
|
||||
|
||||
@@ -103,7 +103,6 @@ pub struct Chunk {
|
||||
pub file_id: i32,
|
||||
pub uuid: String,
|
||||
pub chunk_id: String,
|
||||
pub chunk_index: u32,
|
||||
pub chunk_type: ChunkType,
|
||||
pub rule: ChunkRule,
|
||||
/// Frames per second (can be fractional, e.g., 29.97, 23.976)
|
||||
@@ -128,7 +127,7 @@ impl Chunk {
|
||||
pub fn new_visual(
|
||||
file_id: i32,
|
||||
uuid: String,
|
||||
chunk_index: u32,
|
||||
chunk_id: String,
|
||||
start_frame: i64,
|
||||
end_frame: i64,
|
||||
fps: f64,
|
||||
@@ -140,7 +139,7 @@ impl Chunk {
|
||||
Self::new(
|
||||
file_id,
|
||||
uuid,
|
||||
chunk_index,
|
||||
chunk_id,
|
||||
ChunkType::Visual,
|
||||
ChunkRule::Rule2,
|
||||
start_frame,
|
||||
@@ -154,7 +153,7 @@ impl Chunk {
|
||||
pub fn from_yolo_result(
|
||||
file_id: i32,
|
||||
uuid: String,
|
||||
chunk_index: u32,
|
||||
chunk_id: String,
|
||||
start_frame: i64,
|
||||
end_frame: i64,
|
||||
fps: f64,
|
||||
@@ -263,7 +262,7 @@ impl Chunk {
|
||||
Self::new_visual(
|
||||
file_id,
|
||||
uuid,
|
||||
chunk_index,
|
||||
chunk_id,
|
||||
start_frame,
|
||||
end_frame,
|
||||
fps,
|
||||
@@ -275,7 +274,7 @@ impl Chunk {
|
||||
pub fn new(
|
||||
file_id: i32,
|
||||
uuid: String,
|
||||
chunk_index: u32,
|
||||
chunk_id: String,
|
||||
chunk_type: ChunkType,
|
||||
rule: ChunkRule,
|
||||
start_frame: i64,
|
||||
@@ -284,13 +283,11 @@ impl Chunk {
|
||||
content: serde_json::Value,
|
||||
) -> Self {
|
||||
let frame_count = (end_frame - start_frame) as i32;
|
||||
let chunk_id = format!("{}_{}", uuid, chunk_index);
|
||||
|
||||
Self {
|
||||
file_id,
|
||||
uuid,
|
||||
chunk_id,
|
||||
chunk_index,
|
||||
chunk_type,
|
||||
rule,
|
||||
fps,
|
||||
|
||||
@@ -13,7 +13,6 @@ pub struct MongoDb {
|
||||
pub struct ChunkDocument {
|
||||
pub uuid: String,
|
||||
pub chunk_id: String,
|
||||
pub chunk_index: u32,
|
||||
pub chunk_type: String,
|
||||
pub start_time: f64,
|
||||
pub end_time: f64,
|
||||
@@ -34,7 +33,6 @@ impl From<Chunk> for ChunkDocument {
|
||||
Self {
|
||||
uuid: chunk.uuid,
|
||||
chunk_id: chunk.chunk_id,
|
||||
chunk_index: chunk.chunk_index,
|
||||
chunk_type: chunk.chunk_type.as_str().to_string(),
|
||||
start_time,
|
||||
end_time,
|
||||
@@ -119,7 +117,7 @@ impl MongoDb {
|
||||
file_id: 0,
|
||||
uuid: doc.uuid,
|
||||
chunk_id: doc.chunk_id,
|
||||
chunk_index: doc.chunk_index,
|
||||
|
||||
chunk_type,
|
||||
rule: ChunkRule::Rule1,
|
||||
fps: doc.fps,
|
||||
@@ -178,7 +176,7 @@ impl MongoDb {
|
||||
file_id: 0,
|
||||
uuid: doc.uuid,
|
||||
chunk_id: doc.chunk_id,
|
||||
chunk_index: doc.chunk_index,
|
||||
|
||||
chunk_type,
|
||||
rule: ChunkRule::Rule1,
|
||||
fps: doc.fps,
|
||||
@@ -234,7 +232,7 @@ impl MongoDb {
|
||||
file_id: 0,
|
||||
uuid: doc.uuid,
|
||||
chunk_id: doc.chunk_id,
|
||||
chunk_index: doc.chunk_index,
|
||||
|
||||
chunk_type,
|
||||
rule: ChunkRule::Rule1,
|
||||
fps: doc.fps,
|
||||
|
||||
@@ -56,7 +56,7 @@ pub struct CandidateRecord {
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)]
|
||||
pub struct FileIdentityRecord {
|
||||
pub id: i64,
|
||||
pub id: i32,
|
||||
pub file_uuid: String,
|
||||
pub identity_id: i32,
|
||||
pub name: String,
|
||||
@@ -116,7 +116,7 @@ pub struct IdentityFaceRecord {
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, sqlx::FromRow)]
|
||||
pub struct IdentityChunkRecord {
|
||||
pub id: i64,
|
||||
pub id: i32,
|
||||
pub file_uuid: String,
|
||||
pub chunk_id: String,
|
||||
pub chunk_type: String,
|
||||
@@ -788,8 +788,8 @@ impl PostgresDb {
|
||||
.await?;
|
||||
|
||||
// Chunks
|
||||
sqlx::query("CREATE TABLE IF NOT EXISTS chunks (id SERIAL PRIMARY KEY, file_uuid VARCHAR(32) NOT NULL, chunk_id VARCHAR(64) NOT NULL, chunk_index INTEGER NOT NULL, chunk_type VARCHAR(32) NOT NULL, start_time DOUBLE PRECISION NOT NULL, end_time DOUBLE PRECISION NOT NULL, fps DOUBLE PRECISION DEFAULT 24.0, start_frame BIGINT DEFAULT 0, end_frame BIGINT DEFAULT 0, content JSONB NOT NULL, metadata JSONB, vector_id VARCHAR(64), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, UNIQUE(file_uuid, chunk_id))").execute(pool).await?;
|
||||
sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_uuid)")
|
||||
sqlx::query("CREATE TABLE IF NOT EXISTS chunk (id SERIAL PRIMARY KEY, file_uuid VARCHAR(32) NOT NULL, chunk_id VARCHAR(64) NOT NULL, chunk_type VARCHAR(32) NOT NULL, start_time DOUBLE PRECISION NOT NULL, end_time DOUBLE PRECISION NOT NULL, fps DOUBLE PRECISION DEFAULT 24.0, start_frame BIGINT DEFAULT 0, end_frame BIGINT DEFAULT 0, content JSONB NOT NULL, metadata JSONB, vector_id VARCHAR(64), created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, UNIQUE(file_uuid, chunk_id))").execute(pool).await?;
|
||||
sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunk_file ON chunk(file_uuid)")
|
||||
.execute(pool)
|
||||
.await?;
|
||||
sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunks_type ON chunks(chunk_type)")
|
||||
@@ -845,7 +845,7 @@ impl PostgresDb {
|
||||
|
||||
sqlx::query(
|
||||
"CREATE TRIGGER chunks_search_vector_trigger
|
||||
BEFORE INSERT OR UPDATE ON chunks
|
||||
BEFORE INSERT OR UPDATE ON chunk
|
||||
FOR EACH ROW EXECUTE FUNCTION update_search_vector()",
|
||||
)
|
||||
.execute(pool)
|
||||
@@ -1232,7 +1232,7 @@ impl PostgresDb {
|
||||
let tx = self.pool.begin().await?;
|
||||
|
||||
let chunk_vectors = schema::table_name("chunk_vectors");
|
||||
let chunks = schema::table_name("chunks");
|
||||
let chunks = "dev.chunk";
|
||||
let processor_results = schema::table_name("processor_results");
|
||||
let videos = schema::table_name("videos");
|
||||
|
||||
@@ -1254,6 +1254,11 @@ impl PostgresDb {
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
|
||||
sqlx::query(&format!("DELETE FROM dev.pre_chunks WHERE file_uuid = $1"))
|
||||
.bind(uuid)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
|
||||
sqlx::query(&format!("DELETE FROM {} WHERE file_uuid = $1", videos))
|
||||
.bind(uuid)
|
||||
.execute(&self.pool)
|
||||
@@ -1277,7 +1282,7 @@ impl PostgresDb {
|
||||
}
|
||||
|
||||
pub async fn get_chunk_count(&self, uuid: &str) -> Result<(i64, i64)> {
|
||||
let chunks = schema::table_name("chunks");
|
||||
let chunks = "dev.chunk";
|
||||
let sentence_count: i64 = sqlx::query_scalar(&format!(
|
||||
"SELECT COUNT(*) FROM {} WHERE file_uuid = $1 AND chunk_type = 'sentence'",
|
||||
chunks
|
||||
@@ -2417,8 +2422,10 @@ impl PostgresDb {
|
||||
pub async fn get_identity_by_uuid(&self, uuid: &Uuid) -> Result<Option<IdentityDetailRecord>> {
|
||||
let query = r#"
|
||||
SELECT id, uuid, name, identity_type, source, status, metadata, reference_data,
|
||||
voice_embedding, identity_embedding, face_embedding,
|
||||
tmdb_id, tmdb_profile, created_at, NULL::timestamptz as updated_at
|
||||
voice_embedding::real[] as voice_embedding,
|
||||
identity_embedding::real[] as identity_embedding,
|
||||
face_embedding::real[] as face_embedding,
|
||||
tmdb_id, tmdb_profile, created_at::timestamptz as created_at, NULL::timestamptz as updated_at
|
||||
FROM identities
|
||||
WHERE uuid = $1
|
||||
"#;
|
||||
@@ -2497,7 +2504,7 @@ impl PostgresDb {
|
||||
let query = r#"
|
||||
SELECT c.id, c.file_uuid, c.chunk_id, c.chunk_type,
|
||||
c.start_time, c.end_time, c.text_content, c.content
|
||||
FROM chunks c
|
||||
FROM dev.chunk c
|
||||
WHERE c.file_uuid IN (
|
||||
SELECT DISTINCT fd.file_uuid
|
||||
FROM face_detections fd
|
||||
@@ -2538,7 +2545,7 @@ impl PostgresDb {
|
||||
}
|
||||
|
||||
pub async fn store_chunk(&self, chunk: &Chunk) -> Result<()> {
|
||||
let table = schema::table_name("chunks");
|
||||
let table = "dev.chunk";
|
||||
let content_with_rule = serde_json::json!({
|
||||
"rule": chunk.rule.as_str(),
|
||||
"data": chunk.content
|
||||
@@ -2567,9 +2574,9 @@ impl PostgresDb {
|
||||
|
||||
sqlx::query(&format!(
|
||||
r#"
|
||||
INSERT INTO {} (file_id, file_uuid, chunk_id, old_chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids)
|
||||
VALUES ($1, $2, $3, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12::jsonb, $13::jsonb, $14, $15, $16, $17, $18)
|
||||
ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE SET
|
||||
INSERT INTO {} (file_id, file_uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11::jsonb, $12::jsonb, $13, $14, $15, $16, $17)
|
||||
ON CONFLICT (file_uuid, chunk_id) DO UPDATE SET
|
||||
start_time = EXCLUDED.start_time,
|
||||
end_time = EXCLUDED.end_time,
|
||||
fps = EXCLUDED.fps,
|
||||
@@ -2590,7 +2597,6 @@ impl PostgresDb {
|
||||
.bind(chunk.file_id)
|
||||
.bind(&chunk.uuid)
|
||||
.bind(&chunk.chunk_id)
|
||||
.bind(chunk.chunk_index as i32)
|
||||
.bind(chunk.chunk_type.as_str())
|
||||
.bind(chunk.start_time().seconds())
|
||||
.bind(chunk.end_time().seconds())
|
||||
@@ -2616,7 +2622,7 @@ impl PostgresDb {
|
||||
chunk: &Chunk,
|
||||
tx: &mut sqlx::Transaction<'_, sqlx::Postgres>,
|
||||
) -> Result<()> {
|
||||
let table = schema::table_name("chunks");
|
||||
let table = "dev.chunk";
|
||||
let content_with_rule = serde_json::json!({
|
||||
"rule": chunk.rule.as_str(),
|
||||
"data": chunk.content
|
||||
@@ -2642,9 +2648,9 @@ impl PostgresDb {
|
||||
|
||||
sqlx::query(&format!(
|
||||
r#"
|
||||
INSERT INTO {} (file_id, file_uuid, chunk_id, old_chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids)
|
||||
VALUES ($1, $2, $3, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12::jsonb, $13::jsonb, $14, $15, $16, $17, $18)
|
||||
ON CONFLICT (file_uuid, old_chunk_id) DO UPDATE SET
|
||||
INSERT INTO {} (file_id, file_uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids)
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11::jsonb, $12::jsonb, $13, $14, $15, $16, $17)
|
||||
ON CONFLICT (file_uuid, chunk_id) DO UPDATE SET
|
||||
start_time = EXCLUDED.start_time,
|
||||
end_time = EXCLUDED.end_time,
|
||||
fps = EXCLUDED.fps,
|
||||
@@ -2665,7 +2671,6 @@ impl PostgresDb {
|
||||
.bind(chunk.file_id)
|
||||
.bind(&chunk.uuid)
|
||||
.bind(&chunk.chunk_id)
|
||||
.bind(chunk.chunk_index as i32)
|
||||
.bind(chunk.chunk_type.as_str())
|
||||
.bind(chunk.start_time().seconds())
|
||||
.bind(chunk.end_time().seconds())
|
||||
@@ -2687,9 +2692,9 @@ impl PostgresDb {
|
||||
}
|
||||
|
||||
pub async fn get_chunks_by_uuid(&self, uuid: &str) -> Result<Vec<Chunk>> {
|
||||
let table = schema::table_name("chunks");
|
||||
let table = "dev.chunk";
|
||||
let rows = sqlx::query(&format!(
|
||||
"SELECT COALESCE(file_id, 0) as file_id, file_uuid as uuid, chunk_id, chunk_index, chunk_type, COALESCE(fps, 24.0) as fps, COALESCE(start_frame, 0) as start_frame, COALESCE(end_frame, 0) as end_frame, text_content, content, metadata, vector_id, COALESCE(frame_count, 0) as frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids, visual_stats FROM {} WHERE file_uuid = $1 ORDER BY chunk_index",
|
||||
"SELECT COALESCE(file_id, 0) as file_id, file_uuid as uuid, chunk_id, chunk_type, COALESCE(fps, 24.0) as fps, COALESCE(start_frame, 0) as start_frame, COALESCE(end_frame, 0) as end_frame, text_content, content, metadata, vector_id, COALESCE(frame_count, 0) as frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids, visual_stats FROM {} WHERE file_uuid = $1 ORDER BY id",
|
||||
table
|
||||
))
|
||||
.bind(uuid)
|
||||
@@ -2699,8 +2704,7 @@ impl PostgresDb {
|
||||
let chunks: Vec<Chunk> = rows
|
||||
.into_iter()
|
||||
.map(|r| {
|
||||
let chunk_type_str: String = r.get(4);
|
||||
let chunk_index: i32 = r.get(3);
|
||||
let chunk_type_str: String = r.get(3);
|
||||
let chunk_type = match chunk_type_str.as_str() {
|
||||
"time" => ChunkType::TimeBased,
|
||||
"sentence" => ChunkType::Sentence,
|
||||
@@ -2740,7 +2744,7 @@ impl PostgresDb {
|
||||
file_id,
|
||||
uuid: r.get("uuid"),
|
||||
chunk_id: r.get("chunk_id"),
|
||||
chunk_index: chunk_index as u32,
|
||||
|
||||
chunk_type,
|
||||
rule,
|
||||
|
||||
@@ -2768,9 +2772,9 @@ impl PostgresDb {
|
||||
chunk_id: &str,
|
||||
uuid: &str,
|
||||
) -> Result<Option<Chunk>> {
|
||||
let table = schema::table_name("chunks");
|
||||
let table = "dev.chunk";
|
||||
let row = sqlx::query(&format!(
|
||||
"SELECT COALESCE(file_id, 0) as file_id, uuid, chunk_id, chunk_index, chunk_type, COALESCE(fps, 24.0) as fps, COALESCE(start_frame, 0) as start_frame, COALESCE(end_frame, 0) as end_frame, text_content, content, metadata, vector_id, COALESCE(frame_count, 0) as frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids, visual_stats FROM {} WHERE chunk_id = $1 AND uuid = $2",
|
||||
"SELECT COALESCE(file_id, 0) as file_id, uuid, chunk_id, chunk_type, COALESCE(fps, 24.0) as fps, COALESCE(start_frame, 0) as start_frame, COALESCE(end_frame, 0) as end_frame, text_content, content, metadata, vector_id, COALESCE(frame_count, 0) as frame_count, pre_chunk_ids, parent_chunk_id, child_chunk_ids, visual_stats FROM {} WHERE chunk_id = $1 AND uuid = $2",
|
||||
table
|
||||
))
|
||||
.bind(chunk_id)
|
||||
@@ -2779,25 +2783,24 @@ impl PostgresDb {
|
||||
.await?;
|
||||
|
||||
if let Some(r) = row {
|
||||
let chunk_type_str: String = r.get(4);
|
||||
let chunk_index: i32 = r.get(3);
|
||||
let chunk_type = match chunk_type_str.as_str() {
|
||||
"time" => ChunkType::TimeBased,
|
||||
"sentence" => ChunkType::Sentence,
|
||||
"cut" => ChunkType::Cut,
|
||||
"trace" => ChunkType::Trace,
|
||||
"story" => ChunkType::Story,
|
||||
_ => ChunkType::TimeBased,
|
||||
};
|
||||
let chunk_type_str: String = r.get(3);
|
||||
let chunk_type = match chunk_type_str.as_str() {
|
||||
"time" => ChunkType::TimeBased,
|
||||
"sentence" => ChunkType::Sentence,
|
||||
"cut" => ChunkType::Cut,
|
||||
"trace" => ChunkType::Trace,
|
||||
"story" => ChunkType::Story,
|
||||
_ => ChunkType::TimeBased,
|
||||
};
|
||||
|
||||
let content: serde_json::Value = r.get(9);
|
||||
let metadata: Option<serde_json::Value> = r.get(10);
|
||||
let content: serde_json::Value = r.get(8);
|
||||
let metadata: Option<serde_json::Value> = r.get(9);
|
||||
|
||||
let pre_chunk_ids: Vec<i32> = r.try_get(13).unwrap_or_default();
|
||||
let parent_chunk_id: Option<String> = r.try_get(14).ok().flatten();
|
||||
let child_chunk_ids: Vec<String> = r.try_get(15).unwrap_or_default();
|
||||
let pre_chunk_ids: Vec<i32> = r.try_get(12).unwrap_or_default();
|
||||
let parent_chunk_id: Option<String> = r.try_get(13).ok().flatten();
|
||||
let child_chunk_ids: Vec<String> = r.try_get(14).unwrap_or_default();
|
||||
|
||||
let (rule, content_data) = if content.get("rule").is_some() {
|
||||
let (rule, content_data) = if content.get("rule").is_some() {
|
||||
let rule_str = content
|
||||
.get("rule")
|
||||
.and_then(|v| v.as_str())
|
||||
@@ -2820,7 +2823,7 @@ impl PostgresDb {
|
||||
file_id,
|
||||
uuid: r.get("uuid"),
|
||||
chunk_id: r.get("chunk_id"),
|
||||
chunk_index: chunk_index as u32,
|
||||
|
||||
chunk_type,
|
||||
rule,
|
||||
fps: r.get("fps"),
|
||||
@@ -2996,9 +2999,9 @@ impl PostgresDb {
|
||||
start_time: f64,
|
||||
end_time: f64,
|
||||
) -> Result<Vec<Chunk>> {
|
||||
let table = schema::table_name("chunks");
|
||||
let table = "dev.chunk";
|
||||
let rows = sqlx::query(&format!(
|
||||
"SELECT file_id, uuid, chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids
|
||||
"SELECT file_id, uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids
|
||||
FROM {}
|
||||
WHERE file_id = $1 AND start_time >= $2 AND end_time <= $3
|
||||
ORDER BY start_time",
|
||||
@@ -3013,8 +3016,7 @@ impl PostgresDb {
|
||||
let chunks: Vec<Chunk> = rows
|
||||
.into_iter()
|
||||
.map(|r| {
|
||||
let chunk_type_str: String = r.get(4);
|
||||
let chunk_index: i32 = r.get(3);
|
||||
let chunk_type_str: String = r.get(3);
|
||||
let chunk_type = match chunk_type_str.as_str() {
|
||||
"time" => ChunkType::TimeBased,
|
||||
"sentence" => ChunkType::Sentence,
|
||||
@@ -3024,12 +3026,12 @@ impl PostgresDb {
|
||||
_ => ChunkType::TimeBased,
|
||||
};
|
||||
|
||||
let content: serde_json::Value = r.get(11);
|
||||
let metadata: Option<serde_json::Value> = r.get(12);
|
||||
let content: serde_json::Value = r.get(10);
|
||||
let metadata: Option<serde_json::Value> = r.get(11);
|
||||
|
||||
let pre_chunk_ids: Vec<i32> = r.try_get(15).unwrap_or_default();
|
||||
let parent_chunk_id: Option<String> = r.try_get(16).ok().flatten();
|
||||
let child_chunk_ids: Vec<String> = r.try_get(17).unwrap_or_default();
|
||||
let pre_chunk_ids: Vec<i32> = r.try_get(14).unwrap_or_default();
|
||||
let parent_chunk_id: Option<String> = r.try_get(15).ok().flatten();
|
||||
let child_chunk_ids: Vec<String> = r.try_get(16).unwrap_or_default();
|
||||
|
||||
let (rule, content_data) = if content.get("rule").is_some() {
|
||||
let rule_str = content
|
||||
@@ -3054,7 +3056,7 @@ impl PostgresDb {
|
||||
file_id,
|
||||
uuid: r.get("uuid"),
|
||||
chunk_id: r.get("chunk_id"),
|
||||
chunk_index: chunk_index as u32,
|
||||
|
||||
chunk_type,
|
||||
rule,
|
||||
|
||||
@@ -3082,9 +3084,9 @@ impl PostgresDb {
|
||||
return Ok(vec![]);
|
||||
}
|
||||
|
||||
let table = schema::table_name("chunks");
|
||||
let table = "dev.chunk";
|
||||
let rows = sqlx::query(&format!(
|
||||
"SELECT file_id, uuid, chunk_id, chunk_index, chunk_type, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids FROM {} WHERE chunk_id = ANY($1) ORDER BY chunk_index",
|
||||
"SELECT file_id, uuid, chunk_id, chunk_type, fps, start_frame, end_frame, text_content, content, metadata, vector_id, frame_count, pre_chunk_ids, parent_chunk_id::text as parent_chunk_id, child_chunk_ids FROM {} WHERE chunk_id = ANY($1) ORDER BY id",
|
||||
table
|
||||
))
|
||||
.bind(chunk_ids)
|
||||
@@ -3094,8 +3096,7 @@ impl PostgresDb {
|
||||
let chunks: Vec<Chunk> = rows
|
||||
.into_iter()
|
||||
.map(|r| {
|
||||
let chunk_type_str: String = r.get(4);
|
||||
let chunk_index: i32 = r.get(3);
|
||||
let chunk_type_str: String = r.get(3);
|
||||
let chunk_type = match chunk_type_str.as_str() {
|
||||
"time" => ChunkType::TimeBased,
|
||||
"sentence" => ChunkType::Sentence,
|
||||
@@ -3135,7 +3136,7 @@ impl PostgresDb {
|
||||
file_id,
|
||||
uuid: r.get("uuid"),
|
||||
chunk_id: r.get("chunk_id"),
|
||||
chunk_index: chunk_index as u32,
|
||||
|
||||
chunk_type,
|
||||
rule,
|
||||
|
||||
@@ -3192,7 +3193,7 @@ impl PostgresDb {
|
||||
}
|
||||
|
||||
pub async fn update_vector_id(&self, chunk_id: &str, vector_id: &str) -> Result<()> {
|
||||
let table = schema::table_name("chunks");
|
||||
let table = "dev.chunk";
|
||||
sqlx::query(&format!(
|
||||
"UPDATE {} SET vector_id = $1 WHERE chunk_id = $2",
|
||||
table
|
||||
@@ -3214,12 +3215,12 @@ impl PostgresDb {
|
||||
}
|
||||
|
||||
pub async fn search_text(&self, query: &str, chunk_type: Option<&str>) -> Result<Vec<Chunk>> {
|
||||
let table = schema::table_name("chunks");
|
||||
let table = "dev.chunk";
|
||||
let query_pattern = format!("%{}%", query);
|
||||
|
||||
let sql = match chunk_type {
|
||||
Some(_) => &format!("SELECT uuid, chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, content, metadata, vector_id, parent_chunk_id, child_chunk_ids FROM {} WHERE content->>'text' ILIKE $1 AND chunk_type = $2 ORDER BY chunk_index", table),
|
||||
None => &format!("SELECT uuid, chunk_id, chunk_index, chunk_type, start_time, end_time, fps, start_frame, end_frame, content, metadata, vector_id, parent_chunk_id, child_chunk_ids FROM {} WHERE content->>'text' ILIKE $1 ORDER BY chunk_index", table),
|
||||
Some(_) => &format!("SELECT uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, content, metadata, vector_id, parent_chunk_id, child_chunk_ids FROM {} WHERE content->>'text' ILIKE $1 AND chunk_type = $2 ORDER BY id", table),
|
||||
None => &format!("SELECT uuid, chunk_id, chunk_type, start_time, end_time, fps, start_frame, end_frame, content, metadata, vector_id, parent_chunk_id, child_chunk_ids FROM {} WHERE content->>'text' ILIKE $1 ORDER BY id", table),
|
||||
};
|
||||
|
||||
let chunks = if let Some(ct) = chunk_type {
|
||||
@@ -3228,7 +3229,6 @@ impl PostgresDb {
|
||||
(
|
||||
String,
|
||||
String,
|
||||
i32,
|
||||
String,
|
||||
f64,
|
||||
f64,
|
||||
@@ -3252,7 +3252,6 @@ impl PostgresDb {
|
||||
(
|
||||
String,
|
||||
String,
|
||||
i32,
|
||||
String,
|
||||
f64,
|
||||
f64,
|
||||
@@ -3274,7 +3273,7 @@ impl PostgresDb {
|
||||
let results: Vec<Chunk> = chunks
|
||||
.into_iter()
|
||||
.map(|r| {
|
||||
let chunk_type = match r.3.as_str() {
|
||||
let chunk_type = match r.2.as_str() {
|
||||
"time_based" => ChunkType::TimeBased,
|
||||
"sentence" => ChunkType::Sentence,
|
||||
"cut" => ChunkType::Cut,
|
||||
@@ -3284,29 +3283,29 @@ impl PostgresDb {
|
||||
};
|
||||
|
||||
let content: serde_json::Value =
|
||||
serde_json::from_str(&r.9).unwrap_or(serde_json::json!({}));
|
||||
serde_json::from_str(&r.8).unwrap_or(serde_json::json!({}));
|
||||
|
||||
let metadata: Option<serde_json::Value> =
|
||||
r.10.and_then(|m| serde_json::from_str(&m).ok());
|
||||
r.9.and_then(|m| serde_json::from_str(&m).ok());
|
||||
|
||||
Chunk {
|
||||
file_id: 0,
|
||||
uuid: r.0,
|
||||
chunk_id: r.1,
|
||||
chunk_index: r.2 as u32,
|
||||
|
||||
chunk_type,
|
||||
rule: ChunkRule::Rule1,
|
||||
fps: r.6,
|
||||
start_frame: r.7,
|
||||
end_frame: r.8,
|
||||
text_content: Some(r.9),
|
||||
fps: r.5,
|
||||
start_frame: r.6,
|
||||
end_frame: r.7,
|
||||
text_content: Some(r.8),
|
||||
content,
|
||||
metadata,
|
||||
vector_id: r.11,
|
||||
vector_id: r.10,
|
||||
frame_count: 0,
|
||||
pre_chunk_ids: vec![],
|
||||
parent_chunk_id: r.12,
|
||||
child_chunk_ids: r.13,
|
||||
parent_chunk_id: r.11,
|
||||
child_chunk_ids: r.12,
|
||||
visual_stats: None,
|
||||
}
|
||||
})
|
||||
@@ -3321,13 +3320,13 @@ impl PostgresDb {
|
||||
uuid: Option<&str>,
|
||||
limit: usize,
|
||||
) -> Result<Vec<Bm25Result>> {
|
||||
let table = schema::table_name("chunks");
|
||||
let table = "dev.chunk";
|
||||
let tsquery = self.prepare_tsquery(query).await?;
|
||||
|
||||
let sql = match uuid {
|
||||
Some(_) => &format!(
|
||||
r#"
|
||||
SELECT c.chunk_id, c.file_uuid, c.chunk_index, c.chunk_type, c.start_frame, c.end_frame, c.fps, c.start_time, c.end_time,
|
||||
SELECT c.chunk_id, c.file_uuid, c.chunk_type, c.start_frame, c.end_frame, c.fps, c.start_time, c.end_time,
|
||||
c.text_content, GREATEST(ts_rank_cd(c.search_vector, to_tsquery('english', $1)), ts_rank_cd(pc.summary_tsvector, to_tsquery('english', $1))) as bm25_score,
|
||||
c.visual_stats,
|
||||
pc.metadata->'structured_summary' as scene_summary,
|
||||
@@ -3342,7 +3341,7 @@ impl PostgresDb {
|
||||
),
|
||||
None => &format!(
|
||||
r#"
|
||||
SELECT c.chunk_id, c.file_uuid, c.chunk_index, c.chunk_type, c.start_frame, c.end_frame, c.fps, c.start_time, c.end_time,
|
||||
SELECT c.chunk_id, c.file_uuid, c.chunk_type, c.start_frame, c.end_frame, c.fps, c.start_time, c.end_time,
|
||||
c.text_content, GREATEST(ts_rank_cd(c.search_vector, to_tsquery('english', $1)), ts_rank_cd(pc.summary_tsvector, to_tsquery('english', $1))) as bm25_score,
|
||||
c.visual_stats,
|
||||
pc.metadata->'structured_summary' as scene_summary,
|
||||
@@ -3406,7 +3405,7 @@ impl PostgresDb {
|
||||
Bm25Result {
|
||||
chunk_id: r.0,
|
||||
uuid: r.1,
|
||||
chunk_index: r.2 as u32,
|
||||
|
||||
chunk_type: r.3,
|
||||
start_frame: r.4,
|
||||
end_frame: r.5,
|
||||
@@ -3472,7 +3471,7 @@ impl PostgresDb {
|
||||
HybridSearchResult {
|
||||
chunk_id: r.chunk_id.clone(),
|
||||
uuid: r.uuid.clone(),
|
||||
chunk_index: r.chunk_index,
|
||||
|
||||
chunk_type: r.chunk_type.clone(),
|
||||
start_frame: r.start_frame,
|
||||
end_frame: r.end_frame,
|
||||
@@ -3526,7 +3525,7 @@ impl PostgresDb {
|
||||
HybridSearchResult {
|
||||
chunk_id: r.chunk_id.clone(),
|
||||
uuid: r.uuid.clone(),
|
||||
chunk_index: chunk_data.map(|c| c.chunk_index).unwrap_or(0),
|
||||
|
||||
chunk_type: chunk_data
|
||||
.map(|c| c.chunk_type.as_str().to_string())
|
||||
.unwrap_or_default(),
|
||||
@@ -3779,7 +3778,6 @@ pub struct SceneSummary {
|
||||
pub struct Bm25Result {
|
||||
pub chunk_id: String,
|
||||
pub uuid: String,
|
||||
pub chunk_index: u32,
|
||||
pub chunk_type: String,
|
||||
pub start_frame: i64,
|
||||
pub end_frame: i64,
|
||||
@@ -3797,7 +3795,6 @@ pub struct Bm25Result {
|
||||
pub struct HybridSearchResult {
|
||||
pub uuid: String,
|
||||
pub chunk_id: String,
|
||||
pub chunk_index: u32,
|
||||
pub chunk_type: String,
|
||||
pub start_frame: i64,
|
||||
pub end_frame: i64,
|
||||
@@ -4443,7 +4440,7 @@ impl PostgresDb {
|
||||
total_frames: u64,
|
||||
) -> Result<()> {
|
||||
let table = schema::table_name("videos");
|
||||
let chunks_table = schema::table_name("chunks");
|
||||
let chunks_table = "dev.chunk";
|
||||
let pre_chunks_table = schema::table_name("pre_chunks");
|
||||
|
||||
// Query chunks count and frames
|
||||
@@ -4622,7 +4619,7 @@ impl PostgresDb {
|
||||
let results = sqlx::query_as::<_, SemanticSearchResult>(
|
||||
r#"
|
||||
SELECT
|
||||
id, chunk_index as scene_order, start_time, end_time,
|
||||
id as scene_order, start_time, end_time,
|
||||
COALESCE(summary_text, text_content, '') as summary,
|
||||
metadata,
|
||||
(1 - (embedding <=> $1::vector)) as similarity
|
||||
@@ -4820,7 +4817,7 @@ mod tests {
|
||||
"file_id": 1,
|
||||
"uuid": "test",
|
||||
"chunk_id": "c1",
|
||||
"chunk_index": 0,
|
||||
|
||||
"chunk_type": "time_based",
|
||||
"rule": "rule1",
|
||||
"start_time": 0.0,
|
||||
@@ -4960,7 +4957,7 @@ mod tests {
|
||||
let result = Bm25Result {
|
||||
chunk_id: "sentence_001".to_string(),
|
||||
uuid: "test-uuid".to_string(),
|
||||
chunk_index: 1,
|
||||
|
||||
chunk_type: "sentence".to_string(),
|
||||
start_frame: 0,
|
||||
end_frame: 150,
|
||||
@@ -4985,7 +4982,7 @@ mod tests {
|
||||
let result = HybridSearchResult {
|
||||
chunk_id: "sentence_001".to_string(),
|
||||
uuid: "test-uuid".to_string(),
|
||||
chunk_index: 1,
|
||||
|
||||
chunk_type: "sentence".to_string(),
|
||||
start_frame: 0,
|
||||
end_frame: 150,
|
||||
|
||||
@@ -120,9 +120,16 @@ impl QdrantDb {
|
||||
.json(&body)
|
||||
.send()
|
||||
.await
|
||||
.context(format!("Failed to create Qdrant collection: {}", collection))?;
|
||||
.context(format!(
|
||||
"Failed to create Qdrant collection: {}",
|
||||
collection
|
||||
))?;
|
||||
|
||||
tracing::info!("Created Qdrant collection: {} (dim={})", collection, vector_dim);
|
||||
tracing::info!(
|
||||
"Created Qdrant collection: {} (dim={})",
|
||||
collection,
|
||||
vector_dim
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -129,7 +129,7 @@ impl SyncDb {
|
||||
let chunk = Chunk::from_seconds(
|
||||
0, // file_id - will be set later
|
||||
uuid.to_string(),
|
||||
i as u32,
|
||||
format!("{}", i),
|
||||
ChunkType::Sentence,
|
||||
ChunkRule::Rule1,
|
||||
segment.start,
|
||||
|
||||
@@ -43,8 +43,7 @@ impl Embedder {
|
||||
}
|
||||
|
||||
/// Returns the base URL of the embedding service.
///
/// Reads the `MOMENTRY_EMBED_URL` environment variable. When the variable is
/// unset (or is not valid Unicode, which `std::env::var` also reports as an
/// error) the fallback `http://localhost:11434` is returned.
fn default_url() -> String {
    std::env::var("MOMENTRY_EMBED_URL").unwrap_or_else(|_| "http://localhost:11434".to_string())
}
|
||||
|
||||
pub async fn embed_text(&self, text: &str) -> Result<Vec<f32>> {
|
||||
@@ -91,7 +90,12 @@ impl Embedder {
|
||||
.await
|
||||
.context("Failed to parse embedding response")?;
|
||||
|
||||
Ok(result.data.into_iter().next().map(|d| d.embedding).unwrap_or_default())
|
||||
Ok(result
|
||||
.data
|
||||
.into_iter()
|
||||
.next()
|
||||
.map(|d| d.embedding)
|
||||
.unwrap_or_default())
|
||||
} else {
|
||||
let url = format!("{}/api/embeddings", self.base_url);
|
||||
let response = self
|
||||
|
||||
@@ -1,11 +1,8 @@
|
||||
use anyhow::{Context, Result};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::Duration;
|
||||
|
||||
use super::executor::PythonExecutor;
|
||||
|
||||
const ASR_TIMEOUT: Duration = Duration::from_secs(1800); // 30 minutes
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct AsrResult {
|
||||
pub language: Option<String>,
|
||||
@@ -36,7 +33,7 @@ pub async fn process_asr(
|
||||
&[video_path, output_path],
|
||||
uuid,
|
||||
"ASR",
|
||||
Some(ASR_TIMEOUT),
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.with_context(|| format!("Failed to run {:?}", script_path))?;
|
||||
|
||||
@@ -247,7 +247,10 @@ impl PythonExecutor {
|
||||
let mut partial_path = out.to_path_buf();
|
||||
partial_path.set_extension("json.partial");
|
||||
let _ = std::fs::rename(tmp, &partial_path);
|
||||
tracing::warn!("[Executor] Partial output preserved: {:?}", partial_path);
|
||||
tracing::warn!(
|
||||
"[Executor] Partial output preserved: {:?}",
|
||||
partial_path
|
||||
);
|
||||
} else {
|
||||
let mut err_path = out.to_path_buf();
|
||||
err_path.set_extension("json.err");
|
||||
|
||||
@@ -131,7 +131,7 @@ fn create_fixed_frame_chunks(
|
||||
let chunk = crate::core::chunk::Chunk::from_yolo_frames(
|
||||
file_id,
|
||||
uuid.to_string(),
|
||||
chunk_index,
|
||||
format!("vis_{}", chunk_index),
|
||||
start_frame,
|
||||
end_frame,
|
||||
fps,
|
||||
@@ -190,7 +190,7 @@ fn create_similarity_based_chunks(
|
||||
let chunk = crate::core::chunk::Chunk::from_yolo_frames(
|
||||
file_id,
|
||||
uuid.to_string(),
|
||||
chunk_index,
|
||||
format!("vis_{}", chunk_index),
|
||||
current_start_frame,
|
||||
end_frame,
|
||||
fps,
|
||||
@@ -214,7 +214,7 @@ fn create_similarity_based_chunks(
|
||||
let chunk = crate::core::chunk::Chunk::from_yolo_frames(
|
||||
file_id,
|
||||
uuid.to_string(),
|
||||
chunk_index,
|
||||
format!("vis_{}", chunk_index),
|
||||
current_start_frame,
|
||||
end_frame,
|
||||
fps,
|
||||
|
||||
@@ -13,11 +13,17 @@ struct TmdbIdentity {
|
||||
}
|
||||
|
||||
/// Computes the cosine similarity between two embedding vectors.
///
/// Returns `0.0` instead of a meaningless or undefined value when:
/// - the slices differ in length,
/// - the slices are empty, or
/// - either vector has zero magnitude (which would otherwise divide by zero).
///
/// Otherwise the result is `dot(a, b) / (|a| * |b|)`.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    let na: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let nb: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    // A zero-length vector has no direction; report "no similarity".
    if na == 0.0 || nb == 0.0 {
        0.0
    } else {
        dot / (na * nb)
    }
}
|
||||
|
||||
/// Match face detections against TMDb identities using iterative multi-angle propagation.
|
||||
@@ -42,10 +48,11 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
|
||||
let fd_rows = sqlx::query_as::<_, (i32, Vec<f32>)>(
|
||||
"SELECT trace_id, embedding FROM dev.face_detections \
|
||||
WHERE file_uuid=$1 AND trace_id IS NOT NULL AND embedding IS NOT NULL \
|
||||
ORDER BY trace_id"
|
||||
ORDER BY trace_id",
|
||||
)
|
||||
.bind(file_uuid)
|
||||
.fetch_all(pool).await?;
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
|
||||
if fd_rows.is_empty() {
|
||||
info!("[TKG-MATCH] No face detections for {}", file_uuid);
|
||||
@@ -77,14 +84,23 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
|
||||
for (id, name, tmdb_emb) in &tmdb_rows {
|
||||
for face in faces {
|
||||
let s = cosine_similarity(face, tmdb_emb);
|
||||
if s > best_sim { best_sim = s; best_id = *id; best_name = name.clone(); }
|
||||
if s > best_sim {
|
||||
best_sim = s;
|
||||
best_id = *id;
|
||||
best_name = name.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
if best_sim >= TH {
|
||||
matched.insert(tid, (best_id, best_name));
|
||||
}
|
||||
}
|
||||
info!("[TKG-MATCH] Round 1: {} ({}/{})", matched.len(), matched.len() * 100 / total, total);
|
||||
info!(
|
||||
"[TKG-MATCH] Round 1: {} ({}/{})",
|
||||
matched.len(),
|
||||
matched.len() * 100 / total,
|
||||
total
|
||||
);
|
||||
|
||||
// Round 2+: propagate
|
||||
for round_n in 2..=10 {
|
||||
@@ -98,7 +114,9 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
|
||||
|
||||
let mut new_matches: Vec<(i32, i32, String)> = Vec::new();
|
||||
for (&tid, faces) in &trace_faces {
|
||||
if matched.contains_key(&tid) || faces.is_empty() { continue; }
|
||||
if matched.contains_key(&tid) || faces.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let ref_face = &faces[0];
|
||||
let mut best_id = 0i32;
|
||||
let mut best_name = String::new();
|
||||
@@ -106,13 +124,19 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
|
||||
for (&id, seed_faces) in &seed_pool {
|
||||
for seed in seed_faces {
|
||||
let s = cosine_similarity(ref_face, seed);
|
||||
if s > best_sim { best_sim = s; best_id = id; }
|
||||
if s > best_sim {
|
||||
best_sim = s;
|
||||
best_id = id;
|
||||
}
|
||||
}
|
||||
}
|
||||
if best_sim >= TH {
|
||||
// Look up name for this id
|
||||
for (id, name, _) in &tmdb_rows {
|
||||
if *id == best_id { best_name = name.clone(); break; }
|
||||
if *id == best_id {
|
||||
best_name = name.clone();
|
||||
break;
|
||||
}
|
||||
}
|
||||
new_matches.push((tid, best_id, best_name));
|
||||
}
|
||||
@@ -121,7 +145,9 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
|
||||
matched.insert(tid, (id, name));
|
||||
}
|
||||
let new = matched.len() - prev;
|
||||
if new < 5 { break; }
|
||||
if new < 5 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 4: Quality control
|
||||
@@ -129,41 +155,62 @@ pub async fn match_faces_against_tmdb(db: &PostgresDb, file_uuid: &str) -> Resul
|
||||
let mut after_qc = HashMap::new();
|
||||
for (&tid, &(id, ref name)) in &matched {
|
||||
let cnt: i64 = sqlx::query_scalar(
|
||||
"SELECT COUNT(*) FROM dev.face_detections WHERE file_uuid=$1 AND trace_id=$2"
|
||||
"SELECT COUNT(*) FROM dev.face_detections WHERE file_uuid=$1 AND trace_id=$2",
|
||||
)
|
||||
.bind(file_uuid).bind(tid)
|
||||
.fetch_one(pool).await.unwrap_or(0);
|
||||
.bind(file_uuid)
|
||||
.bind(tid)
|
||||
.fetch_one(pool)
|
||||
.await
|
||||
.unwrap_or(0);
|
||||
if cnt >= 4 {
|
||||
after_qc.insert(tid, (id, name.clone()));
|
||||
} else {
|
||||
info!("[TKG-QC] trace {} removed: only {} face(s), need >= 4", tid, cnt);
|
||||
info!(
|
||||
"[TKG-QC] trace {} removed: only {} face(s), need >= 4",
|
||||
tid, cnt
|
||||
);
|
||||
}
|
||||
}
|
||||
let matched = after_qc;
|
||||
let removed_low = total - matched.len();
|
||||
if removed_low > 0 {
|
||||
info!("[TKG-QC] Removed {} low-confidence traces (< 4 faces)", removed_low);
|
||||
info!(
|
||||
"[TKG-QC] Removed {} low-confidence traces (< 4 faces)",
|
||||
removed_low
|
||||
);
|
||||
}
|
||||
|
||||
// 4b: Temporal collision check
|
||||
let removed_collisions = quality_check_temporal_collisions(pool, file_uuid).await?;
|
||||
if removed_collisions > 0 {
|
||||
info!("[TKG-QC] Resolved {} temporal collisions", removed_collisions);
|
||||
info!(
|
||||
"[TKG-QC] Resolved {} temporal collisions",
|
||||
removed_collisions
|
||||
);
|
||||
}
|
||||
|
||||
// Step 5: Update DB
|
||||
let mut updated = 0usize;
|
||||
for (&tid, &(id, _)) in &matched {
|
||||
let r = sqlx::query(
|
||||
"UPDATE dev.face_detections SET identity_id=$1 WHERE file_uuid=$2 AND trace_id=$3"
|
||||
"UPDATE dev.face_detections SET identity_id=$1 WHERE file_uuid=$2 AND trace_id=$3",
|
||||
)
|
||||
.bind(id).bind(file_uuid).bind(tid)
|
||||
.execute(pool).await?;
|
||||
if r.rows_affected() > 0 { updated += 1; }
|
||||
.bind(id)
|
||||
.bind(file_uuid)
|
||||
.bind(tid)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
if r.rows_affected() > 0 {
|
||||
updated += 1;
|
||||
}
|
||||
}
|
||||
|
||||
info!("[TKG-MATCH] Done: {}/{} traces matched ({}%)",
|
||||
matched.len(), total, matched.len() * 100 / total);
|
||||
info!(
|
||||
"[TKG-MATCH] Done: {}/{} traces matched ({}%)",
|
||||
matched.len(),
|
||||
total,
|
||||
matched.len() * 100 / total
|
||||
);
|
||||
Ok(updated)
|
||||
}
|
||||
|
||||
@@ -185,10 +232,11 @@ async fn quality_check_temporal_collisions(pool: &sqlx::PgPool, file_uuid: &str)
|
||||
AND a.identity_id IS NOT NULL
|
||||
AND a.identity_id = b.identity_id
|
||||
ORDER BY a.identity_id, a.frame_number
|
||||
"#
|
||||
"#,
|
||||
)
|
||||
.bind(file_uuid)
|
||||
.fetch_all(pool).await?;
|
||||
.fetch_all(pool)
|
||||
.await?;
|
||||
|
||||
if collisions.is_empty() {
|
||||
return Ok(0);
|
||||
@@ -221,10 +269,12 @@ async fn quality_check_temporal_collisions(pool: &sqlx::PgPool, file_uuid: &str)
|
||||
let victim_cnt = if cnt_a <= cnt_b { cnt_a } else { cnt_b };
|
||||
|
||||
sqlx::query(
|
||||
"UPDATE dev.face_detections SET identity_id=NULL WHERE file_uuid=$1 AND trace_id=$2"
|
||||
"UPDATE dev.face_detections SET identity_id=NULL WHERE file_uuid=$1 AND trace_id=$2",
|
||||
)
|
||||
.bind(file_uuid).bind(victim)
|
||||
.execute(pool).await?;
|
||||
.bind(file_uuid)
|
||||
.bind(victim)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
|
||||
unbound += 1;
|
||||
warn!("[TKG-QC] Collision identity={}: trace {} vs trace {} ({} overlap frames). Unbound trace {} ({} detections)",
|
||||
|
||||
Reference in New Issue
Block a user