feat: deploy hybrid search (semantic+keyword+identity) with RRF fusion

- Replace smart_search with hybrid RRF implementation
- Add speaker_detections table for identity-agent binding
- Fix identity queries: direct SQL to avoid type mismatches
- Add debug logs to job_worker for processor debugging
- Deployed to production (3002) successfully

Key changes:
- search.rs: Complete rewrite with 3 strategies + RRF
- postgres_db.rs: speaker_detections table + identity query fixes
- job_worker.rs: Debug logs for output file checks

Tested:
- Hybrid search works with semantic + keyword + identity
- Identity search: 'identity:Charade' returns correct results
- Chinese keyword search: '調光' matches Charade summaries

Bugs found:
- Case mismatch: 'ASRX' vs 'asrx' in processors field
- Missing CUT dependency for ASRX processor
2026-06-01 15:15:17 +08:00
parent 0d58a738a1
commit 874d688987
4 changed files with 549 additions and 74 deletions
--- a/src/core/db/postgres_db.rs
+++ b/src/core/db/postgres_db.rs
@@ -1008,6 +1008,32 @@ impl PostgresDb {
        // sqlx::query("CREATE TABLE IF NOT EXISTS chunks_rule1 ...").execute(pool).await?;
        // sqlx::query("CREATE INDEX IF NOT EXISTS idx_chunks_rule1_asset ...").execute(pool).await?;

+        // Speaker Detections
+        sqlx::query(
+            "CREATE TABLE IF NOT EXISTS speaker_detections ( \
+             id SERIAL PRIMARY KEY, \
+             file_uuid VARCHAR(32) NOT NULL, \
+             identity_id INTEGER REFERENCES identities(id) ON DELETE CASCADE, \
+             speaker_id VARCHAR(32), \
+             start_time DOUBLE PRECISION, \
+             end_time DOUBLE PRECISION, \
+             text_content TEXT, \
+             chunk_id VARCHAR(128), \
+             confidence REAL, \
+             metadata JSONB DEFAULT '{}', \
+             created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP)",
+        )
+        .execute(pool)
+        .await?;
+        sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_identity ON speaker_detections(identity_id)")
+            .execute(pool).await?;
+        sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_file ON speaker_detections(file_uuid)")
+            .execute(pool).await?;
+        sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_chunk ON speaker_detections(chunk_id)")
+            .execute(pool).await?;
+        sqlx::query("CREATE INDEX IF NOT EXISTS idx_speaker_detections_search ON speaker_detections(file_uuid, identity_id)")
+            .execute(pool).await?;
+
        // Jobs (Legacy/P0)
        tracing::info!("Creating jobs table...");
        sqlx::query("CREATE TABLE IF NOT EXISTS jobs (id UUID PRIMARY KEY, file_uuid VARCHAR(32) NOT NULL REFERENCES videos(file_uuid) ON DELETE CASCADE, processor_list TEXT[], assigned_processor_id UUID, rule VARCHAR(20), status VARCHAR(20) DEFAULT 'QUEUED', total_frames BIGINT DEFAULT 0, processed_frames BIGINT DEFAULT 0, error_message TEXT, created_at TIMESTAMPTZ DEFAULT NOW(), updated_at TIMESTAMPTZ DEFAULT NOW())").execute(pool).await?;
@@ -2181,6 +2207,36 @@ impl PostgresDb {
        Ok(results)
    }

+    /// Look up a single chunk row by its owning file and chunk identifier,
+    /// for Qdrant result enrichment.
+    ///
+    /// Runs one `SELECT ... LIMIT 1` against the table named by
+    /// `schema::table_name("chunk")`, restricted to rows whose `embedding`
+    /// column is non-NULL. In the selected row, `scene_order` is an alias of
+    /// `id`, the frame bounds are computed as `time * fps` cast to `bigint`,
+    /// `summary` falls back from `summary_text` to `text_content` to the
+    /// empty string, and `similarity` is the constant `1.0`.
+    ///
+    /// Returns `Ok(None)` when no row matches; query errors are propagated.
+    pub async fn get_chunk_by_file_and_chunk_id(
+        &self,
+        file_uuid: &str,
+        chunk_id: &str,
+    ) -> Result<Option<SemanticSearchResult>> {
+        // Only the table name is interpolated; both filter values are bound.
+        let sql = format!(
+            "SELECT \
+                 id, file_uuid, id as scene_order, \
+                 (start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
+                 fps, start_time, end_time, \
+                 COALESCE(summary_text, text_content, '') as summary, \
+                 metadata, \
+                 1.0::float8 as similarity \
+             FROM {} \
+             WHERE file_uuid = $1 AND chunk_id = $2 AND embedding IS NOT NULL \
+             LIMIT 1",
+            schema::table_name("chunk")
+        );
+
+        let row = sqlx::query_as::<_, SemanticSearchResult>(&sql)
+            .bind(file_uuid)
+            .bind(chunk_id)
+            .fetch_optional(&self.pool)
+            .await?;
+
+        Ok(row)
+    }
+
    /// Get children for a list of parent IDs
    pub async fn get_children_for_parents(
        &self,
@@ -2402,6 +2458,50 @@ impl PostgresDb {
        Ok(())
    }

+    /// Insert one speaker-detection row per entry of `segments` for file `uuid`.
+    ///
+    /// Each tuple is `(speaker_id, start_time, end_time, text, chunk_id, confidence)`
+    /// and is bound, after `uuid`, to the columns `speaker_id`, `start_time`,
+    /// `end_time`, `text_content`, `chunk_id` and `confidence`. `identity_id`
+    /// is not written by this method.
+    ///
+    /// Rows are inserted one statement at a time directly on the pool; no
+    /// explicit transaction is opened here, so if a statement fails the error
+    /// is returned immediately and rows inserted before it remain.
+    ///
+    /// NOTE(review): `ON CONFLICT DO NOTHING` only takes effect when a unique
+    /// or exclusion constraint is violated. The `speaker_detections` DDL added
+    /// alongside this method declares only the SERIAL primary key, so calling
+    /// this twice for the same file presumably inserts duplicates — confirm
+    /// whether a unique constraint is intended.
+    pub async fn store_speaker_detections_batch(
+        &self,
+        uuid: &str,
+        segments: &[(String, f64, f64, String, Option<String>, f32)],
+    ) -> Result<()> {
+        // The statement text does not depend on the segment, so build it once
+        // instead of re-formatting it on every loop iteration.
+        let sql = format!(
+            "INSERT INTO {} (file_uuid, speaker_id, start_time, end_time, text_content, chunk_id, confidence) \
+             VALUES ($1, $2, $3, $4, $5, $6, $7) ON CONFLICT DO NOTHING",
+            schema::table_name("speaker_detections")
+        );
+        for (speaker_id, start_time, end_time, text, chunk_id, confidence) in segments {
+            sqlx::query(&sql)
+                .bind(uuid)
+                .bind(speaker_id)
+                .bind(start_time)
+                .bind(end_time)
+                .bind(text)
+                .bind(chunk_id)
+                .bind(confidence)
+                .execute(&self.pool)
+                .await?;
+        }
+        Ok(())
+    }
+
+    /// Attach an identity to the not-yet-bound detections of one speaker in one file.
+    ///
+    /// Sets `identity_id` on every speaker-detection row matching `file_uuid`
+    /// and `speaker_id` whose `identity_id` is still NULL; rows that already
+    /// carry an identity are left untouched. The number of affected rows is
+    /// not reported to the caller. Query errors are propagated.
+    pub async fn update_speaker_detection_identity(
+        &self,
+        file_uuid: &str,
+        speaker_id: &str,
+        identity_id: i64,
+    ) -> Result<()> {
+        // Only the table name is interpolated; all values are bound parameters.
+        let sql = format!(
+            "UPDATE {} SET identity_id = $1 WHERE file_uuid = $2 AND speaker_id = $3 AND identity_id IS NULL",
+            schema::table_name("speaker_detections")
+        );
+
+        sqlx::query(&sql)
+            .bind(identity_id)
+            .bind(file_uuid)
+            .bind(speaker_id)
+            .execute(&self.pool)
+            .await?;
+
+        Ok(())
+    }
+
    pub async fn store_scene_pre_chunks_batch(
        &self,
        uuid: &str,
@@ -2761,14 +2861,14 @@ impl PostgresDb {
        use sqlx::Row;
        let rows = if let Some(u) = file_uuid {
            sqlx::query(&format!(
-                "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0 as score \
+                "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0::float8 as score \
                 FROM {} WHERE file_uuid=$1 AND text_content ILIKE $2 LIMIT $3", table)
            )
            .bind(u).bind(&like).bind(limit)
            .fetch_all(&self.pool).await?
        } else {
            sqlx::query(&format!(
-                "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0 as score \
+                "SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, 1.0::float8 as score \
                 FROM {} WHERE text_content ILIKE $1 LIMIT $2", table)
            )
            .bind(&like).bind(limit)
@@ -3118,8 +3218,7 @@ impl PostgresDb {
        let id_table = schema::table_name("identities");
        let fd_table = schema::table_name("face_detections");
        let chunk_table = schema::table_name("chunk");
-        let ib_table = schema::table_name("identity_bindings");
-        let pc_table = schema::table_name("pre_chunks");
+        let sd_table = schema::table_name("speaker_detections");
        use sqlx::Row;
        let subq = format!(
            "SELECT id FROM {} WHERE REPLACE(uuid::text, '-', '') = $1",
@@ -3138,22 +3237,16 @@ impl PostgresDb {
                 GROUP BY c.file_uuid, c.chunk_id, c.start_frame, c.end_frame, \
                          c.fps, c.start_time, c.end_time, c.text_content \
                 UNION ALL \
-                 SELECT c.file_uuid, c.chunk_id, \
-                 c.start_frame::bigint, c.end_frame::bigint, \
-                 c.fps, c.start_time, c.end_time, c.text_content, \
+                 SELECT sd.file_uuid, COALESCE(c.chunk_id, sd.chunk_id), \
+                 COALESCE(c.start_frame, 0)::bigint, COALESCE(c.end_frame, 0)::bigint, \
+                 COALESCE(c.fps, 24.0), sd.start_time, sd.end_time, sd.text_content, \
                 'sentence' as chunk_type \
-                 FROM {} c \
-                 JOIN {} pc ON pc.file_uuid = c.file_uuid \
-                  AND pc.processor_type = 'asrx' \
-                  AND c.start_time <= (pc.data->>'timestamp')::double precision \
-                  AND c.end_time >= (pc.data->>'timestamp')::double precision \
-                 JOIN {} ib ON ib.identity_value = pc.data->>'speaker_id' \
-                  AND ib.identity_type = 'speaker' \
-                  AND ib.file_uuid = pc.file_uuid \
-                 WHERE ib.identity_id = ({}) \
+                 FROM {} sd \
+                 LEFT JOIN {} c ON c.chunk_id = sd.chunk_id \
+                 WHERE sd.identity_id = ({}) \
                 ORDER BY start_time \
                 LIMIT $2 OFFSET $3",
-            chunk_table, fd_table, subq, chunk_table, pc_table, ib_table, subq
+            chunk_table, fd_table, subq, sd_table, chunk_table, subq
        ))
        .bind(uuid_str)
        .bind(limit)