feat: deploy hybrid search (semantic+keyword+identity) with RRF fusion

- Replace smart_search with hybrid RRF implementation
- Add speaker_detections table for identity-agent binding
- Fix identity queries: use direct SQL to avoid type mismatches
- Add debug logs to job_worker for processor debugging
- Deployed to production (3002) successfully

Key changes:
- search.rs: Complete rewrite with 3 strategies + RRF
- postgres_db.rs: speaker_detections table + identity query fixes
- job_worker.rs: Debug logs for output file checks

Tested:
- Hybrid search works with semantic + keyword + identity
- Identity search: 'identity:Charade' returns correct results
- Chinese keyword search: '調光' matches Charade summaries

Bugs found:
- Case mismatch: 'ASRX' vs 'asrx' in processors field
- Missing CUT dependency for ASRX processor
This commit is contained in:
Accusys
2026-06-01 15:15:17 +08:00
parent 0d58a738a1
commit 874d688987
4 changed files with 549 additions and 74 deletions

View File

@@ -1008,6 +1008,32 @@ impl PostgresDb {
// sqlx::query("CREATE TABLE IF NOT EXISTS chunks_rule1 ...").execute(pool).await?;
// sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunks_rule1_asset ...").execute(pool).await?;
// Speaker Detections
sqlx::query(
"CREATE TABLE IF NOT EXISTS speaker_detections ( \
id SERIAL PRIMARY KEY, \
file_uuid VARCHAR(32) NOT NULL, \
identity_id INTEGER REFERENCES identities(id) ON DELETE CASCADE, \
speaker_id VARCHAR(32), \
start_time DOUBLE PRECISION, \
end_time DOUBLE PRECISION, \
text_content TEXT, \
chunk_id VARCHAR(128), \
confidence REAL, \
metadata JSONB DEFAULT '{}', \
created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP)",
)
.execute(pool)
.await?;
sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_identity ON speaker_detections(identity_id)")
.execute(pool).await?;
sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_file ON speaker_detections(file_uuid)")
.execute(pool).await?;
sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_chunk ON speaker_detections(chunk_id)")
.execute(pool).await?;
sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_search ON speaker_detections(file_uuid, identity_id)")
.execute(pool).await?;
// Jobs (Legacy/P0)
tracing::info!("Creating jobs table...");
sqlx::query("CREATE TABLE IF NOT EXISTS jobs (id UUID PRIMARY KEY, file_uuid VARCHAR(32) NOT NULL REFERENCES videos(file_uuid) ON DELETE CASCADE, processor_list TEXT[], assigned_processor_id UUID, rule VARCHAR(20), status VARCHAR(20) DEFAULT 'QUEUED', total_frames BIGINT DEFAULT 0, processed_frames BIGINT DEFAULT 0, error_message TEXT, created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW())").execute(pool).await?;
@@ -2181,6 +2207,36 @@ impl PostgresDb {
Ok(results)
}
/// Fetch the details of one chunk, addressed by `file_uuid` and `chunk_id`,
/// for Qdrant result enrichment.
///
/// The row is mapped onto [`SemanticSearchResult`]:
/// - `scene_order` is an alias of the row `id`;
/// - `start_frame` / `end_frame` are computed as `time * fps` cast to `bigint`;
/// - `summary` falls back from `summary_text` to `text_content` to `''`;
/// - `similarity` is the constant `1.0`, since no vector distance is
///   computed by this query.
///
/// Only rows whose `embedding` is non-NULL are considered, and at most one
/// row is read (`LIMIT 1`).
///
/// # Returns
/// `Ok(Some(row))` when a matching chunk exists, `Ok(None)` otherwise.
///
/// # Errors
/// Propagates any error returned by the database query.
pub async fn get_chunk_by_file_and_chunk_id(
    &self,
    file_uuid: &str,
    chunk_id: &str,
) -> Result<Option<SemanticSearchResult>> {
    // The table name comes from the schema helper, so it is interpolated
    // into the statement; the lookup keys stay as bind parameters.
    let sql = format!(
        "SELECT \
         id, file_uuid, id as scene_order, \
         (start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
         fps, start_time, end_time, \
         COALESCE(summary_text, text_content, '') as summary, \
         metadata, \
         1.0::float8 as similarity \
         FROM {} \
         WHERE file_uuid = $1 AND chunk_id = $2 AND embedding IS NOT NULL \
         LIMIT 1",
        schema::table_name("chunk")
    );

    let maybe_chunk = sqlx::query_as::<_, SemanticSearchResult>(&sql)
        .bind(file_uuid)
        .bind(chunk_id)
        .fetch_optional(&self.pool)
        .await?;

    Ok(maybe_chunk)
}
/// Get children for a list of parent IDs
pub async fn get_children_for_parents(
&self,
@@ -2402,6 +2458,50 @@ impl PostgresDb {
Ok(())
}
/// Insert a batch of speaker-detection segments for one file.
///
/// Each element of `segments` is a tuple of
/// `(speaker_id, start_time, end_time, text, chunk_id, confidence)`, written
/// to the columns `speaker_id`, `start_time`, `end_time`, `text_content`,
/// `chunk_id` and `confidence` respectively; `uuid` is stored as `file_uuid`.
/// `identity_id` is not set by this method.
///
/// All rows are written inside a single transaction, so a failure part-way
/// through the batch leaves no partial batch behind.
///
/// NOTE(review): `ON CONFLICT DO NOTHING` only skips rows that violate a
/// unique constraint. Confirm that the target table declares one covering
/// these columns; otherwise re-running the same batch inserts duplicates.
///
/// # Errors
/// Propagates any database error from opening the transaction, executing an
/// insert, or committing. On error the transaction is dropped uncommitted.
pub async fn store_speaker_detections_batch(
    &self,
    uuid: &str,
    segments: &[(String, f64, f64, String, Option<String>, f32)],
) -> Result<()> {
    // Nothing to write: skip building the statement and opening a transaction.
    if segments.is_empty() {
        return Ok(());
    }

    // The statement text is identical for every row, so build it once
    // instead of re-formatting it on each loop iteration.
    let insert_sql = format!(
        "INSERT INTO {} (file_uuid, speaker_id, start_time, end_time, text_content, chunk_id, confidence) \
         VALUES ($1, $2, $3, $4, $5, $6, $7) ON CONFLICT DO NOTHING",
        schema::table_name("speaker_detections")
    );

    let mut tx = self.pool.begin().await?;
    for (speaker_id, start_time, end_time, text, chunk_id, confidence) in segments {
        sqlx::query(&insert_sql)
            .bind(uuid)
            .bind(speaker_id)
            .bind(start_time)
            .bind(end_time)
            .bind(text)
            .bind(chunk_id)
            .bind(confidence)
            .execute(&mut *tx)
            .await?;
    }
    tx.commit().await?;

    Ok(())
}
/// Attach an identity to the speaker detections of one speaker in one file.
///
/// Sets `identity_id` on every row matching `file_uuid` and `speaker_id`
/// whose `identity_id` is still NULL; rows that already carry an identity
/// are left untouched. The number of affected rows is not reported.
///
/// # Errors
/// Propagates any error returned by the database while executing the update.
pub async fn update_speaker_detection_identity(
    &self,
    file_uuid: &str,
    speaker_id: &str,
    identity_id: i64,
) -> Result<()> {
    let update_sql = format!(
        "UPDATE {} SET identity_id = $1 WHERE file_uuid = $2 AND speaker_id = $3 AND identity_id IS NULL",
        schema::table_name("speaker_detections")
    );

    // Bind order follows the placeholders: $1 identity, $2 file, $3 speaker.
    let statement = sqlx::query(&update_sql)
        .bind(identity_id)
        .bind(file_uuid)
        .bind(speaker_id);
    statement.execute(&self.pool).await?;

    Ok(())
}
pub async fn store_scene_pre_chunks_batch(
&self,
uuid: &str,
@@ -2761,14 +2861,14 @@ impl PostgresDb {
use sqlx::Row;
let rows = if let Some(u) = file_uuid {
sqlx::query(&format!(
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0 as score \
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0::float8 as score \
FROM {} WHERE file_uuid=$1 AND text_content ILIKE $2 LIMIT $3", table)
)
.bind(u).bind(&like).bind(limit)
.fetch_all(&self.pool).await?
} else {
sqlx::query(&format!(
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0 as score \
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0::float8 as score \
FROM {} WHERE text_content ILIKE $1 LIMIT $2", table)
)
.bind(&like).bind(limit)
@@ -3118,8 +3218,7 @@ impl PostgresDb {
let id_table = schema::table_name("identities");
let fd_table = schema::table_name("face_detections");
let chunk_table = schema::table_name("chunk");
let ib_table = schema::table_name("identity_bindings");
let pc_table = schema::table_name("pre_chunks");
let sd_table = schema::table_name("speaker_detections");
use sqlx::Row;
let subq = format!(
"SELECT id FROM {} WHERE REPLACE(uuid::text, '-', '') = $1",
@@ -3138,22 +3237,16 @@ impl PostgresDb {
GROUP BY c.file_uuid, c.chunk_id, c.start_frame, c.end_frame, \
c.fps, c.start_time, c.end_time, c.text_content \
UNION ALL \
SELECT c.file_uuid, c.chunk_id, \
c.start_frame::bigint, c.end_frame::bigint, \
c.fps, c.start_time, c.end_time, c.text_content, \
SELECT sd.file_uuid, COALESCE(c.chunk_id, sd.chunk_id), \
COALESCE(c.start_frame, 0)::bigint, COALESCE(c.end_frame, 0)::bigint, \
COALESCE(c.fps, 24.0), sd.start_time, sd.end_time, sd.text_content, \
'sentence' as chunk_type \
FROM {} c \
JOIN {} pc ON pc.file_uuid = c.file_uuid \
AND pc.processor_type = 'asrx' \
AND c.start_time <= (pc.data->>'timestamp')::double precision \
AND c.end_time >= (pc.data->>'timestamp')::double precision \
JOIN {} ib ON ib.identity_value = pc.data->>'speaker_id' \
AND ib.identity_type = 'speaker' \
AND ib.file_uuid = pc.file_uuid \
WHERE ib.identity_id = ({}) \
FROM {} sd \
LEFT JOIN {} c ON c.chunk_id = sd.chunk_id \
WHERE sd.identity_id = ({}) \
ORDER BY start_time \
LIMIT $2 OFFSET $3",
chunk_table, fd_table, subq, chunk_table, pc_table, ib_table, subq
chunk_table, fd_table, subq, sd_table, chunk_table, subq
))
.bind(uuid_str)
.bind(limit)