feat: fix Chinese text search and duplicate chunk_id bug

- Add helper functions to extract text from nested content structure
- Update SearchResult to include uuid field
- Add PostgreSQL function get_chunk_by_chunk_id_and_uuid to handle duplicate chunk_ids
- Update Qdrant search functions to extract uuid from payload
- Change embedding model to nomic-embed-text-v2-moe:latest
- Update Qdrant collection name to momentry_rule1
- Fix MongoDB authentication and disable cache for development
- Improve error handling in processor.rs
- Update documentation with new embedding model
This commit is contained in:
Warren
2026-03-29 04:44:28 +08:00
parent 82955504f3
commit 2393d81a3f
13 changed files with 355 additions and 106 deletions

View File

@@ -1,6 +1,7 @@
use anyhow::{Context, Result};
use clap::{Parser, Subcommand};
use futures_util::StreamExt;
use std::io::Write;
use std::path::Path;
use std::str;
use std::sync::{Arc, Mutex};
@@ -2226,18 +2227,24 @@ async fn main() -> Result<()> {
.await
.context("Failed to init PostgreSQL")?;
let qdrant = QdrantDb::init().await.context("Failed to init Qdrant")?;
let embedder = Embedder::new("nomic-embed-text:v1.5".to_string());
let target_uuid = if uuid == "all" {
None
} else {
Some(uuid.as_str())
};
let embedder = Embedder::new("nomic-embed-text-v2-moe:latest".to_string());
let mut stored_count = 0usize;
if let Some(target) = target_uuid {
let chunks = pg.get_chunks_by_uuid(target).await?;
// Get list of videos to process
let videos_to_process = if uuid == "all" {
// Get all videos
let videos = pg.list_videos().await?;
videos.into_iter().map(|v| v.uuid).collect::<Vec<_>>()
} else {
// Process single video
vec![uuid.clone()]
};
for target in &videos_to_process {
println!("\n=== Processing video: {} ===", target);
let chunks = pg.get_chunks_by_uuid(target.as_str()).await?;
let sentence_chunks: Vec<_> = chunks
.into_iter()
.filter(|c| c.chunk_type == ChunkType::Sentence)
@@ -2249,21 +2256,32 @@ async fn main() -> Result<()> {
target
);
let mut video_stored_count = 0usize;
for chunk in sentence_chunks {
// Try to extract text from different possible locations
let text = chunk
.content
.get("text")
.get("data") // Try data->text structure first
.and_then(|data| data.get("text"))
.and_then(|v| v.as_str())
.or_else(|| chunk.content.get("text").and_then(|v| v.as_str())) // Try root text structure
.unwrap_or("");
if text.is_empty() {
eprintln!(
"Empty text for chunk {}, content: {:?}",
chunk.chunk_id, chunk.content
);
continue;
}
print!("Embedding chunk {}... ", chunk.chunk_id);
std::io::stdout().flush().unwrap();
match embedder.embed_document(text).await {
Ok(vector) => {
println!("embedding success ({} dims)", vector.len());
let vector_id = format!("{}_{}", chunk.uuid, chunk.chunk_id);
if let Err(e) =
@@ -2295,32 +2313,40 @@ async fn main() -> Result<()> {
}
stored_count += 1;
println!("done ({} dims)", vector.len());
video_stored_count += 1;
println!(
"stored (video: {}, total: {})",
video_stored_count, stored_count
);
}
Err(e) => {
println!("failed: {}", e);
println!("embedding failed: {}", e);
}
}
}
// Only update storage status if vectors were actually stored
if stored_count > 0 {
pg.update_storage_status(target, "pvector_chunk", true)
// Only update storage status if vectors were actually stored for this video
if video_stored_count > 0 {
pg.update_storage_status(target.as_str(), "pvector_chunk", true)
.await?;
pg.update_storage_status(target, "qvector_chunk", true)
pg.update_storage_status(target.as_str(), "qvector_chunk", true)
.await?;
println!(
"\n✓ Vectorize stage completed for {}! ({} vectors stored)",
target, stored_count
"✓ Vectorize stage completed for {}! ({} vectors stored)",
target, video_stored_count
);
} else {
println!(
"\n✗ Vectorize stage failed for {}! (0 vectors stored)",
"✗ Vectorize stage failed for {}! (0 vectors stored)",
target
);
}
} else {
println!("\n✓ Vectorize stage completed for all videos!");
}
println!("\n=== Vectorization Summary ===");
println!("Total vectors stored: {}", stored_count);
if uuid == "all" {
println!("✓ Vectorize stage completed for all videos!");
}
Ok(())
}