fix: keyword search - add text_content field and CJK support

- Added text_content field to SearchResult and SemanticSearchResult
- Added get_chunk_by_id_no_embedding for keyword results without embedding requirement
- Fixed search_bm25 to use position-based ranking for CJK/Korean content
- Fixed sqlx column mapping with explicit alias
- Skip text_match filter for keyword-only results
- Use text_content as fallback when summary is empty
This commit is contained in:
Accusys
2026-07-02 21:16:38 +08:00
parent 5a9d4325d8
commit 78364afc51
2 changed files with 119 additions and 47 deletions

View File

@@ -34,6 +34,7 @@ pub struct SearchResult {
pub end_time: f64,
pub raw_text: Option<String>,
pub summary: Option<String>,
pub text_content: Option<String>,
pub metadata: Option<serde_json::Value>,
pub similarity: Option<f64>,
pub file_name: Option<String>,
@@ -82,6 +83,7 @@ async fn enrich_from_pg(
end_time: p.end_time,
raw_text: None,
summary: Some(p.summary),
text_content: p.text_content.clone(),
metadata: p.metadata.clone(),
similarity: Some(qdrant_score as f64),
file_name: None,
@@ -109,6 +111,7 @@ fn pg_result_to_search(p: &SemanticSearchResult) -> SearchResult {
end_time: p.end_time,
raw_text: None,
summary: Some(p.summary.clone()),
text_content: p.text_content.clone(),
metadata: p.metadata.clone(),
similarity: p.similarity,
file_name: None,
@@ -381,43 +384,55 @@ pub async fn smart_search(
let mut final_results = Vec::new();
for mr in ranked.iter().take(limit * 3) {
// 取更多結果以便過濾
if let Some(pg) = db
.get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id)
.await
.ok()
.flatten()
{
// 關鍵字過濾: CJK 用子字串匹配,英文用單詞邊界匹配
let summary_lower = pg.summary.to_lowercase();
let query_words: Vec<String> = query_lower
.split_whitespace()
.map(|s| s.to_string())
.collect();
// Use no_embedding version for keyword results, regular for semantic
let pg_opt = if mr.keyword_score.is_some() && mr.semantic_score.is_none() {
db.get_chunk_by_id_no_embedding(&mr.file_uuid, &mr.chunk_id).await
} else {
db.get_chunk_by_file_and_chunk_id(&mr.file_uuid, &mr.chunk_id).await
};
if let Some(pg) = pg_opt.ok().flatten() {
// 關鍵字結果跳過 text_match 過濾(search_bm25 已經匹配過)
let is_keyword_only = mr.keyword_score.is_some() && mr.semantic_score.is_none();
if !is_keyword_only {
// 關鍵字過濾: CJK 用子字串匹配,英文用單詞邊界匹配
let summary_lower = pg.summary.to_lowercase();
let query_words: Vec<String> = query_lower
.split_whitespace()
.map(|s| s.to_string())
.collect();
let text_match = !pg.summary.is_empty() && {
let has_cjk = |s: &str| -> bool {
s.chars().any(|c| {
('\u{4E00}'..='\u{9FFF}').contains(&c)
|| ('\u{3040}'..='\u{309F}').contains(&c)
|| ('\u{30A0}'..='\u{30FF}').contains(&c)
|| ('\u{AC00}'..='\u{D7AF}').contains(&c)
})
let text_match = !pg.summary.is_empty() && {
let has_cjk = |s: &str| -> bool {
s.chars().any(|c| {
('\u{4E00}'..='\u{9FFF}').contains(&c)
|| ('\u{3040}'..='\u{309F}').contains(&c)
|| ('\u{30A0}'..='\u{30FF}').contains(&c)
|| ('\u{AC00}'..='\u{D7AF}').contains(&c)
})
};
if has_cjk(&query_lower) || has_cjk(&summary_lower) {
query_words.iter().all(|w| summary_lower.contains(w))
} else {
let bordered = format!(" {} ", summary_lower);
query_words
.iter()
.all(|w| bordered.contains(&format!(" {} ", w)))
}
};
if has_cjk(&query_lower) || has_cjk(&summary_lower) {
query_words.iter().all(|w| summary_lower.contains(w))
} else {
let bordered = format!(" {} ", summary_lower);
query_words
.iter()
.all(|w| bordered.contains(&format!(" {} ", w)))
if !text_match {
continue;
}
};
if !text_match && mr.semantic_score.is_none() {
continue;
}
// 使用 text_content 如果 summary 為空
let display_text = if pg.summary.is_empty() {
pg.text_content.clone().unwrap_or_default()
} else {
pg.summary.clone()
};
final_results.push(SearchResult {
id: 0,
file_uuid: pg.file_uuid.clone(),
@@ -430,6 +445,7 @@ pub async fn smart_search(
end_time: pg.end_time,
raw_text: None,
summary: Some(pg.summary),
text_content: pg.text_content.clone(),
metadata: pg.metadata.clone(),
similarity: Some(mr.score),
file_name: None,

View File

@@ -832,6 +832,7 @@ pub struct SemanticSearchResult {
pub start_time: f64,
pub end_time: f64,
pub summary: String,
pub text_content: Option<String>,
pub metadata: Option<serde_json::Value>,
pub similarity: Option<f64>,
}
@@ -2552,6 +2553,7 @@ impl PostgresDb {
(start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
fps, start_time, end_time, \
COALESCE(summary_text, text_content, '') as summary, \
text_content as text_content, \
metadata, \
1.0::float8 as similarity \
FROM {} \
@@ -2568,6 +2570,37 @@ impl PostgresDb {
Ok(results)
}
/// Look up a single chunk row by `file_uuid` and `chunk_id`, used for keyword-search hits.
///
/// The `WHERE` clause filters on `file_uuid` and `chunk_id` only; it places no
/// condition on an embedding column, so rows are returned regardless of whether
/// they have an embedding.
///
/// Column mapping performed by the query:
/// - `scene_order` is populated from `id`.
/// - `start_frame` / `end_frame` are derived as `time * fps`, cast to `bigint`.
/// - `summary` is `summary_text`, falling back to `text_content`, then to `''`.
/// - `text_content` is selected under an explicit alias.
/// - `similarity` is the constant `1.0`.
///
/// # Returns
/// `Ok(Some(row))` for the first matching row, `Ok(None)` when nothing matches.
///
/// # Errors
/// Propagates any error from executing the query.
pub async fn get_chunk_by_id_no_embedding(
    &self,
    file_uuid: &str,
    chunk_id: &str,
) -> Result<Option<SemanticSearchResult>> {
    // Only the table name is interpolated; both lookup keys are bound parameters.
    let sql = format!(
        "SELECT \
id, file_uuid, id as scene_order, \
(start_time * fps)::bigint as start_frame, (end_time * fps)::bigint as end_frame, \
fps, start_time, end_time, \
COALESCE(summary_text, text_content, '') as summary, \
text_content as text_content, \
metadata, \
1.0::float8 as similarity \
FROM {} \
WHERE file_uuid = $1 AND chunk_id = $2 \
LIMIT 1",
        schema::table_name("chunk")
    );

    let row = sqlx::query_as::<_, SemanticSearchResult>(&sql)
        .bind(file_uuid)
        .bind(chunk_id)
        .fetch_optional(&self.pool)
        .await?;

    Ok(row)
}
/// Get children for a list of parent IDs
pub async fn get_children_for_parents(
&self,
@@ -3339,22 +3372,45 @@ impl PostgresDb {
let like = format!("%{}%", query.replace('%', "%%"));
use sqlx::Row;
// Use PostgreSQL full-text search with ts_rank for ranking, fallback to ILIKE for recall
let sql = format!(
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
CASE \
WHEN to_tsvector('english', text_content) @@ plainto_tsquery('english', $1) \
THEN ts_rank(to_tsvector('english', text_content), plainto_tsquery('english', $1))::float8 \
ELSE 0.1::float8 \
END as score \
FROM {} \
WHERE text_content ILIKE $2 AND text_content != '' \
{}\
ORDER BY score DESC \
LIMIT $3",
table,
if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
);
// Check if query contains CJK characters
let has_cjk = query.chars().any(|c| {
('\u{4E00}'..='\u{9FFF}').contains(&c)
|| ('\u{3040}'..='\u{309F}').contains(&c)
|| ('\u{30A0}'..='\u{30FF}').contains(&c)
|| ('\u{AC00}'..='\u{D7AF}').contains(&c)
});
let sql = if has_cjk {
// CJK/Korean: use ILIKE position-based ranking
format!(
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
(1.0 - (POSITION(LOWER($1) IN LOWER(text_content))::float8 / NULLIF(LENGTH(text_content), 0)::float8))::float8 as score \
FROM {} \
WHERE text_content ILIKE $2 AND text_content != '' \
{}\
ORDER BY score DESC \
LIMIT $3",
table,
if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
)
} else {
// English: use PostgreSQL full-text search
format!(
"SELECT chunk_id, file_uuid, chunk_type, text_content, start_time, end_time, \
CASE \
WHEN to_tsvector('english', text_content) @@ plainto_tsquery('english', $1) \
THEN ts_rank(to_tsvector('english', text_content), plainto_tsquery('english', $1))::float8 \
ELSE 0.1::float8 \
END as score \
FROM {} \
WHERE text_content ILIKE $2 AND text_content != '' \
{}\
ORDER BY score DESC \
LIMIT $3",
table,
if file_uuid.is_some() { "AND file_uuid = $4 " } else { "" }
)
};
let rows = if let Some(u) = file_uuid {
sqlx::query(&sql)