feat: fix Chinese text search and duplicate chunk_id bug
- Add helper functions to extract text from nested content structure
- Update SearchResult to include uuid field
- Add PostgreSQL function get_chunk_by_chunk_id_and_uuid to handle duplicate chunk_ids
- Update Qdrant search functions to extract uuid from payload
- Change embedding model to nomic-embed-text-v2-moe:latest
- Update Qdrant collection name to momentry_rule1
- Fix MongoDB authentication and disable cache for development
- Improve error handling in processor.rs
- Update documentation with new embedding model
This commit is contained in:
@@ -237,6 +237,26 @@ struct HybridSearchResponse {
|
||||
query: String,
|
||||
}
|
||||
|
||||
/// Returns the text stored in a chunk's JSON `content` value.
///
/// Lookup order:
/// 1. the nested `data.text` field, when it is a JSON string;
/// 2. otherwise the top-level `text` field, when it is a JSON string;
/// 3. otherwise an empty string.
fn extract_text_from_content(content: &serde_json::Value) -> String {
    // Nested layout: { "data": { "text": "..." } }
    let nested = content
        .get("data")
        .and_then(|inner| inner.get("text"))
        .and_then(serde_json::Value::as_str);

    // Flat layout is consulted only when the nested lookup yields nothing.
    let resolved = match nested {
        Some(found) => Some(found),
        None => content.get("text").and_then(serde_json::Value::as_str),
    };

    match resolved {
        Some(found) => found.to_string(),
        None => String::new(),
    }
}
|
||||
|
||||
/// Returns the title stored in a chunk's JSON `content` value.
///
/// The nested `data.title` field is preferred; if it is absent or not a
/// JSON string, the top-level `title` field is used instead. When neither
/// yields a string, the result is an empty `String`.
fn extract_title_from_content(content: &serde_json::Value) -> String {
    // Nested layout: { "data": { "title": "..." } }
    if let Some(nested) = content
        .get("data")
        .and_then(|inner| inner.get("title"))
        .and_then(serde_json::Value::as_str)
    {
        return nested.to_string();
    }

    // Flat layout: { "title": "..." }
    if let Some(flat) = content.get("title").and_then(serde_json::Value::as_str) {
        return flat.to_string();
    }

    String::new()
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct LookupQuery {
|
||||
path: Option<String>,
|
||||
@@ -537,10 +557,22 @@ async fn register(
|
||||
let mut width = 0u32;
|
||||
let mut height = 0u32;
|
||||
|
||||
let mut fps = 0.0;
|
||||
for stream in &probe_result.streams {
|
||||
if stream.codec_type.as_deref() == Some("video") {
|
||||
width = stream.width.unwrap_or(0);
|
||||
height = stream.height.unwrap_or(0);
|
||||
|
||||
// Parse FPS from r_frame_rate (e.g., "60000/1001")
|
||||
if let Some(frame_rate_str) = &stream.r_frame_rate {
|
||||
if let Some((num_str, den_str)) = frame_rate_str.split_once('/') {
|
||||
if let (Ok(num), Ok(den)) = (num_str.parse::<f64>(), den_str.parse::<f64>()) {
|
||||
if den != 0.0 {
|
||||
fps = num / den;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -566,7 +598,7 @@ async fn register(
|
||||
duration,
|
||||
width,
|
||||
height,
|
||||
fps: 0.0,
|
||||
fps,
|
||||
probe_json: Some(json_str),
|
||||
storage: Default::default(),
|
||||
status: VideoStatus::Pending,
|
||||
@@ -599,6 +631,17 @@ async fn register(
|
||||
StatusCode::INTERNAL_SERVER_ERROR
|
||||
})?;
|
||||
|
||||
db.update_monitor_job_video_id(job.id, video_id)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
tracing::error!(
|
||||
"Failed to update monitor job video_id for job {}: {}",
|
||||
job.id,
|
||||
e
|
||||
);
|
||||
StatusCode::INTERNAL_SERVER_ERROR
|
||||
})?;
|
||||
|
||||
let _ = state.mongo_cache.invalidate_videos_list().await;
|
||||
|
||||
Ok(Json(RegisterResponse {
|
||||
@@ -771,21 +814,20 @@ async fn search(
|
||||
.map_err(|e| anyhow::anyhow!("PG init failed: {}", e))?;
|
||||
|
||||
let search_results = if let Some(ref uuid) = req.uuid {
|
||||
let query_f64: Vec<f64> = query_vector.iter().map(|&x| x as f64).collect();
|
||||
qdrant.search_in_uuid(&query_f64, uuid, limit).await?
|
||||
qdrant.search_in_uuid(&query_vector, uuid, limit).await?
|
||||
} else {
|
||||
qdrant.search(&query_vector, limit).await?
|
||||
};
|
||||
|
||||
let mut results = Vec::new();
|
||||
for r in search_results {
|
||||
if let Some(chunk) = pg.get_chunk_by_chunk_id(&r.chunk_id).await.ok().flatten() {
|
||||
let text = chunk
|
||||
.content
|
||||
.get("text")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
if let Some(chunk) = pg
|
||||
.get_chunk_by_chunk_id_and_uuid(&r.chunk_id, &r.uuid)
|
||||
.await
|
||||
.ok()
|
||||
.flatten()
|
||||
{
|
||||
let text = extract_text_from_content(&chunk.content);
|
||||
|
||||
results.push(SearchResult {
|
||||
uuid: chunk.uuid.clone(),
|
||||
@@ -834,8 +876,7 @@ async fn n8n_search(
|
||||
.map_err(|e| anyhow::anyhow!("PG init failed: {}", e))?;
|
||||
|
||||
let search_results = if let Some(ref uuid) = req.uuid {
|
||||
let query_f64: Vec<f64> = query_vector.iter().map(|&x| x as f64).collect();
|
||||
qdrant.search_in_uuid(&query_f64, uuid, limit).await?
|
||||
qdrant.search_in_uuid(&query_vector, uuid, limit).await?
|
||||
} else {
|
||||
qdrant.search(&query_vector, limit).await?
|
||||
};
|
||||
@@ -843,20 +884,15 @@ async fn n8n_search(
|
||||
let mut hits = Vec::new();
|
||||
|
||||
for r in search_results {
|
||||
if let Some(chunk) = pg.get_chunk_by_chunk_id(&r.chunk_id).await.ok().flatten() {
|
||||
let text = chunk
|
||||
.content
|
||||
.get("text")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
if let Some(chunk) = pg
|
||||
.get_chunk_by_chunk_id_and_uuid(&r.chunk_id, &r.uuid)
|
||||
.await
|
||||
.ok()
|
||||
.flatten()
|
||||
{
|
||||
let text = extract_text_from_content(&chunk.content);
|
||||
|
||||
let title = chunk
|
||||
.content
|
||||
.get("title")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
let title = extract_title_from_content(&chunk.content);
|
||||
|
||||
let file_path = if chunk.uuid.is_empty() {
|
||||
None
|
||||
@@ -1376,7 +1412,7 @@ async fn unregister(
|
||||
pub async fn start_server(host: &str, port: u16) -> anyhow::Result<()> {
|
||||
let _ = SERVER_START.set(Instant::now());
|
||||
|
||||
let embedder = std::sync::Arc::new(Embedder::new("nomic-embed-text:v1.5".to_string()));
|
||||
let embedder = std::sync::Arc::new(Embedder::new("nomic-embed-text-v2-moe:latest".to_string()));
|
||||
let mongo_cache = MongoCache::init().await?;
|
||||
let redis_cache = RedisCache::new()?;
|
||||
let db = PostgresDb::init().await?;
|
||||
@@ -1384,7 +1420,7 @@ pub async fn start_server(host: &str, port: u16) -> anyhow::Result<()> {
|
||||
|
||||
let state = AppState {
|
||||
embedder,
|
||||
embedder_model: "nomic-embed-text:v1.5".to_string(),
|
||||
embedder_model: "nomic-embed-text-v2-moe:latest".to_string(),
|
||||
mongo_cache,
|
||||
redis_cache,
|
||||
api_state,
|
||||
|
||||
Reference in New Issue
Block a user