feat: fix Chinese text search and duplicate chunk_id bug

- Add helper functions to extract text from nested content structure
- Update SearchResult to include uuid field
- Add PostgreSQL function get_chunk_by_chunk_id_and_uuid to handle duplicate chunk_ids
- Update Qdrant search functions to extract uuid from payload
- Change embedding model to nomic-embed-text-v2-moe:latest
- Update Qdrant collection name to momentry_rule1
- Fix MongoDB authentication and disable cache for development
- Improve error handling in processor.rs
- Update documentation with new embedding model
This commit is contained in:
Warren
2026-03-29 04:44:28 +08:00
parent 82955504f3
commit 2393d81a3f
13 changed files with 355 additions and 106 deletions

View File

@@ -237,6 +237,26 @@ struct HybridSearchResponse {
query: String,
}
fn extract_text_from_content(content: &serde_json::Value) -> String {
content
.get("data")
.and_then(|data| data.get("text"))
.and_then(|v| v.as_str())
.or_else(|| content.get("text").and_then(|v| v.as_str()))
.unwrap_or("")
.to_string()
}
/// Returns the title stored in a chunk's `content` JSON value.
///
/// `content["data"]["title"]` takes precedence; if it is missing or is not a
/// JSON string, `content["title"]` is tried. An empty `String` is returned
/// when neither location holds a string.
fn extract_title_from_content(content: &serde_json::Value) -> String {
    // Nested form: the title lives under the `data` object.
    let nested_title = content
        .get("data")
        .and_then(|inner| inner.get("title"))
        .and_then(|value| value.as_str());
    if let Some(title) = nested_title {
        return title.to_string();
    }

    // Flat form: the title sits directly on the content value.
    match content.get("title").and_then(|value| value.as_str()) {
        Some(title) => title.to_string(),
        None => "".to_string(),
    }
}
#[derive(Debug, Deserialize)]
struct LookupQuery {
path: Option<String>,
@@ -537,10 +557,22 @@ async fn register(
let mut width = 0u32;
let mut height = 0u32;
let mut fps = 0.0;
for stream in &probe_result.streams {
if stream.codec_type.as_deref() == Some("video") {
width = stream.width.unwrap_or(0);
height = stream.height.unwrap_or(0);
// Parse FPS from r_frame_rate (e.g., "60000/1001")
if let Some(frame_rate_str) = &stream.r_frame_rate {
if let Some((num_str, den_str)) = frame_rate_str.split_once('/') {
if let (Ok(num), Ok(den)) = (num_str.parse::<f64>(), den_str.parse::<f64>()) {
if den != 0.0 {
fps = num / den;
}
}
}
}
}
}
@@ -566,7 +598,7 @@ async fn register(
duration,
width,
height,
fps: 0.0,
fps,
probe_json: Some(json_str),
storage: Default::default(),
status: VideoStatus::Pending,
@@ -599,6 +631,17 @@ async fn register(
StatusCode::INTERNAL_SERVER_ERROR
})?;
db.update_monitor_job_video_id(job.id, video_id)
.await
.map_err(|e| {
tracing::error!(
"Failed to update monitor job video_id for job {}: {}",
job.id,
e
);
StatusCode::INTERNAL_SERVER_ERROR
})?;
let _ = state.mongo_cache.invalidate_videos_list().await;
Ok(Json(RegisterResponse {
@@ -771,21 +814,20 @@ async fn search(
.map_err(|e| anyhow::anyhow!("PG init failed: {}", e))?;
let search_results = if let Some(ref uuid) = req.uuid {
let query_f64: Vec<f64> = query_vector.iter().map(|&x| x as f64).collect();
qdrant.search_in_uuid(&query_f64, uuid, limit).await?
qdrant.search_in_uuid(&query_vector, uuid, limit).await?
} else {
qdrant.search(&query_vector, limit).await?
};
let mut results = Vec::new();
for r in search_results {
if let Some(chunk) = pg.get_chunk_by_chunk_id(&r.chunk_id).await.ok().flatten() {
let text = chunk
.content
.get("text")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
if let Some(chunk) = pg
.get_chunk_by_chunk_id_and_uuid(&r.chunk_id, &r.uuid)
.await
.ok()
.flatten()
{
let text = extract_text_from_content(&chunk.content);
results.push(SearchResult {
uuid: chunk.uuid.clone(),
@@ -834,8 +876,7 @@ async fn n8n_search(
.map_err(|e| anyhow::anyhow!("PG init failed: {}", e))?;
let search_results = if let Some(ref uuid) = req.uuid {
let query_f64: Vec<f64> = query_vector.iter().map(|&x| x as f64).collect();
qdrant.search_in_uuid(&query_f64, uuid, limit).await?
qdrant.search_in_uuid(&query_vector, uuid, limit).await?
} else {
qdrant.search(&query_vector, limit).await?
};
@@ -843,20 +884,15 @@ async fn n8n_search(
let mut hits = Vec::new();
for r in search_results {
if let Some(chunk) = pg.get_chunk_by_chunk_id(&r.chunk_id).await.ok().flatten() {
let text = chunk
.content
.get("text")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
if let Some(chunk) = pg
.get_chunk_by_chunk_id_and_uuid(&r.chunk_id, &r.uuid)
.await
.ok()
.flatten()
{
let text = extract_text_from_content(&chunk.content);
let title = chunk
.content
.get("title")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let title = extract_title_from_content(&chunk.content);
let file_path = if chunk.uuid.is_empty() {
None
@@ -1376,7 +1412,7 @@ async fn unregister(
pub async fn start_server(host: &str, port: u16) -> anyhow::Result<()> {
let _ = SERVER_START.set(Instant::now());
let embedder = std::sync::Arc::new(Embedder::new("nomic-embed-text:v1.5".to_string()));
let embedder = std::sync::Arc::new(Embedder::new("nomic-embed-text-v2-moe:latest".to_string()));
let mongo_cache = MongoCache::init().await?;
let redis_cache = RedisCache::new()?;
let db = PostgresDb::init().await?;
@@ -1384,7 +1420,7 @@ pub async fn start_server(host: &str, port: u16) -> anyhow::Result<()> {
let state = AppState {
embedder,
embedder_model: "nomic-embed-text:v1.5".to_string(),
embedder_model: "nomic-embed-text-v2-moe:latest".to_string(),
mongo_cache,
redis_cache,
api_state,