feat: deploy hybrid search (semantic+keyword+identity) with RRF fusion
- Replace smart_search with hybrid RRF implementation
- Add speaker_detections table for identity-agent binding
- Fix identity queries: direct SQL to avoid type mismatches
- Add debug logs to job_worker for processor debugging
- Deployed to production (3002) successfully

Key changes:
- search.rs: Complete rewrite with 3 strategies + RRF
- postgres_db.rs: speaker_detections table + identity query fixes
- job_worker.rs: Debug logs for output file checks

Tested:
- Hybrid search works with semantic + keyword + identity
- Identity search: 'identity:Charade' returns correct results
- Chinese keyword search: '調光' matches Charade summaries

Bugs found:
- Case mismatch: 'ASRX' vs 'asrx' in processors field
- Missing CUT dependency for ASRX processor
This commit is contained in:
@@ -1008,6 +1008,32 @@ impl PostgresDb {
|
||||
// sqlx::query("CREATE TABLE IF NOT EXISTS chunks_rule1 ...").execute(pool).await?;
|
||||
// sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunks_rule1_asset ...").execute(pool).await?;
|
||||
|
||||
// Speaker Detections
|
||||
sqlx::query(
|
||||
"CREATE TABLE IF NOT EXISTS speaker_detections ( \
|
||||
id SERIAL PRIMARY KEY, \
|
||||
file_uuid VARCHAR(32) NOT NULL, \
|
||||
identity_id INTEGER REFERENCES identities(id) ON DELETE CASCADE, \
|
||||
speaker_id VARCHAR(32), \
|
||||
start_time DOUBLE PRECISION, \
|
||||
end_time DOUBLE PRECISION, \
|
||||
text_content TEXT, \
|
||||
chunk_id VARCHAR(128), \
|
||||
confidence REAL, \
|
||||
metadata JSONB DEFAULT '{}', \
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP)",
|
||||
)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_identity ON speaker_detections(identity_id)")
|
||||
.execute(pool).await?;
|
||||
sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_file ON speaker_detections(file_uuid)")
|
||||
.execute(pool).await?;
|
||||
sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_chunk ON speaker_detections(chunk_id)")
|
||||
.execute(pool).await?;
|
||||
sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_search ON speaker_detections(file_uuid, identity_id)")
|
||||
.execute(pool).await?;
|
||||
|
||||
// Jobs (Legacy/P0)
|
||||
tracing::info!("Creating jobs table...");
|
||||
sqlx::query("CREATE TABLE IF NOT EXISTS jobs (id UUID PRIMARY KEY, file_uuid VARCHAR(32) NOT NULL REFERENCES videos(file_uuid) ON DELETE CASCADE, processor_list TEXT[], assigned_processor_id UUID, rule VARCHAR(20), status VARCHAR(20) DEFAULT 'QUEUED', total_frames BIGINT DEFAULT 0, processed_frames BIGINT DEFAULT 0, error_message TEXT, created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW())").execute(pool).await?;
|
||||
@@ -2181,6 +2207,36 @@ impl PostgresDb {
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Retrieve chunk details by file_uuid and chunk_id for Qdrant result enrichment
|
||||
pub async fn get_chunk_by_file_and_chunk_id(
|
||||
&self,
|
||||
file_uuid: &str,
|
||||
chunk_id: &str,
|
||||
) -> Result<Option<SemanticSearchResult>> {
|
||||
let chunk_table = schema::table_name("chunk");
|
||||
let results = sqlx::query_as::<_, SemanticSearchResult>(
|
||||
&format!(
|
||||
"SELECT \
|
||||
id, file_uuid, id as scene_order, \
|
||||
(start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
|
||||
fps, start_time, end_time, \
|
||||
COALESCE(summary_text, text_content, '') as summary, \
|
||||
metadata, \
|
||||
1.0::float8 as similarity \
|
||||
FROM {} \
|
||||
WHERE file_uuid = $1 AND chunk_id = $2 AND embedding IS NOT NULL \
|
||||
LIMIT 1",
|
||||
chunk_table
|
||||
),
|
||||
)
|
||||
.bind(file_uuid)
|
||||
.bind(chunk_id)
|
||||
.fetch_optional(&self.pool)
|
||||
.await?;
|
||||
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
/// Get children for a list of parent IDs
|
||||
pub async fn get_children_for_parents(
|
||||
&self,
|
||||
@@ -2402,6 +2458,50 @@ impl PostgresDb {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn store_speaker_detections_batch(
|
||||
&self,
|
||||
uuid: &str,
|
||||
segments: &[(String, f64, f64, String, Option<String>, f32)],
|
||||
) -> Result<()> {
|
||||
let table = schema::table_name("speaker_detections");
|
||||
for (speaker_id, start_time, end_time, text, chunk_id, confidence) in segments {
|
||||
sqlx::query(&format!(
|
||||
"INSERT INTO {} (file_uuid, speaker_id, start_time, end_time, text_content, chunk_id, confidence) \
|
||||
VALUES ($1, $2, $3, $4, $5, $6, $7) ON CONFLICT DO NOTHING",
|
||||
table
|
||||
))
|
||||
.bind(uuid)
|
||||
.bind(speaker_id)
|
||||
.bind(start_time)
|
||||
.bind(end_time)
|
||||
.bind(text)
|
||||
.bind(chunk_id)
|
||||
.bind(confidence)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn update_speaker_detection_identity(
|
||||
&self,
|
||||
file_uuid: &str,
|
||||
speaker_id: &str,
|
||||
identity_id: i64,
|
||||
) -> Result<()> {
|
||||
let table = schema::table_name("speaker_detections");
|
||||
sqlx::query(&format!(
|
||||
"UPDATE {} SET identity_id = $1 WHERE file_uuid = $2 AND speaker_id = $3 AND identity_id IS NULL",
|
||||
table
|
||||
))
|
||||
.bind(identity_id)
|
||||
.bind(file_uuid)
|
||||
.bind(speaker_id)
|
||||
.execute(&self.pool)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub async fn store_scene_pre_chunks_batch(
|
||||
&self,
|
||||
uuid: &str,
|
||||
@@ -2761,14 +2861,14 @@ impl PostgresDb {
|
||||
use sqlx::Row;
|
||||
let rows = if let Some(u) = file_uuid {
|
||||
sqlx::query(&format!(
|
||||
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0 as score \
|
||||
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0::float8 as score \
|
||||
FROM {} WHERE file_uuid=$1 AND text_content ILIKE $2 LIMIT $3", table)
|
||||
)
|
||||
.bind(u).bind(&like).bind(limit)
|
||||
.fetch_all(&self.pool).await?
|
||||
} else {
|
||||
sqlx::query(&format!(
|
||||
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0 as score \
|
||||
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0::float8 as score \
|
||||
FROM {} WHERE text_content ILIKE $1 LIMIT $2", table)
|
||||
)
|
||||
.bind(&like).bind(limit)
|
||||
@@ -3118,8 +3218,7 @@ impl PostgresDb {
|
||||
let id_table = schema::table_name("identities");
|
||||
let fd_table = schema::table_name("face_detections");
|
||||
let chunk_table = schema::table_name("chunk");
|
||||
let ib_table = schema::table_name("identity_bindings");
|
||||
let pc_table = schema::table_name("pre_chunks");
|
||||
let sd_table = schema::table_name("speaker_detections");
|
||||
use sqlx::Row;
|
||||
let subq = format!(
|
||||
"SELECT id FROM {} WHERE REPLACE(uuid::text, '-', '') = $1",
|
||||
@@ -3138,22 +3237,16 @@ impl PostgresDb {
|
||||
GROUP BY c.file_uuid, c.chunk_id, c.start_frame, c.end_frame, \
|
||||
c.fps, c.start_time, c.end_time, c.text_content \
|
||||
UNION ALL \
|
||||
SELECT c.file_uuid, c.chunk_id, \
|
||||
c.start_frame::bigint, c.end_frame::bigint, \
|
||||
c.fps, c.start_time, c.end_time, c.text_content, \
|
||||
SELECT sd.file_uuid, COALESCE(c.chunk_id, sd.chunk_id), \
|
||||
COALESCE(c.start_frame, 0)::bigint, COALESCE(c.end_frame, 0)::bigint, \
|
||||
COALESCE(c.fps, 24.0), sd.start_time, sd.end_time, sd.text_content, \
|
||||
'sentence' as chunk_type \
|
||||
FROM {} c \
|
||||
JOIN {} pc ON pc.file_uuid = c.file_uuid \
|
||||
AND pc.processor_type = 'asrx' \
|
||||
AND c.start_time <= (pc.data->>'timestamp')::double precision \
|
||||
AND c.end_time >= (pc.data->>'timestamp')::double precision \
|
||||
JOIN {} ib ON ib.identity_value = pc.data->>'speaker_id' \
|
||||
AND ib.identity_type = 'speaker' \
|
||||
AND ib.file_uuid = pc.file_uuid \
|
||||
WHERE ib.identity_id = ({}) \
|
||||
FROM {} sd \
|
||||
LEFT JOIN {} c ON c.chunk_id = sd.chunk_id \
|
||||
WHERE sd.identity_id = ({}) \
|
||||
ORDER BY start_time \
|
||||
LIMIT $2 OFFSET $3",
|
||||
chunk_table, fd_table, subq, chunk_table, pc_table, ib_table, subq
|
||||
chunk_table, fd_table, subq, sd_table, chunk_table, subq
|
||||
))
|
||||
.bind(uuid_str)
|
||||
.bind(limit)
|
||||
|
||||
Reference in New Issue
Block a user