feat: fix Chinese text search and duplicate chunk_id bug
- Add helper functions to extract text from nested content structure
- Update SearchResult to include uuid field
- Add PostgreSQL function get_chunk_by_chunk_id_and_uuid to handle duplicate chunk_ids
- Update Qdrant search functions to extract uuid from payload
- Change embedding model to nomic-embed-text-v2-moe:latest
- Update Qdrant collection name to momentry_rule1
- Fix MongoDB authentication and disable cache for development
- Improve error handling in processor.rs
- Update documentation with new embedding model
This commit is contained in:
68
src/main.rs
68
src/main.rs
@@ -1,6 +1,7 @@
|
||||
use anyhow::{Context, Result};
|
||||
use clap::{Parser, Subcommand};
|
||||
use futures_util::StreamExt;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::str;
|
||||
use std::sync::{Arc, Mutex};
|
||||
@@ -2226,18 +2227,24 @@ async fn main() -> Result<()> {
|
||||
.await
|
||||
.context("Failed to init PostgreSQL")?;
|
||||
let qdrant = QdrantDb::init().await.context("Failed to init Qdrant")?;
|
||||
let embedder = Embedder::new("nomic-embed-text:v1.5".to_string());
|
||||
|
||||
let target_uuid = if uuid == "all" {
|
||||
None
|
||||
} else {
|
||||
Some(uuid.as_str())
|
||||
};
|
||||
let embedder = Embedder::new("nomic-embed-text-v2-moe:latest".to_string());
|
||||
|
||||
let mut stored_count = 0usize;
|
||||
|
||||
if let Some(target) = target_uuid {
|
||||
let chunks = pg.get_chunks_by_uuid(target).await?;
|
||||
// Get list of videos to process
|
||||
let videos_to_process = if uuid == "all" {
|
||||
// Get all videos
|
||||
let videos = pg.list_videos().await?;
|
||||
videos.into_iter().map(|v| v.uuid).collect::<Vec<_>>()
|
||||
} else {
|
||||
// Process single video
|
||||
vec![uuid.clone()]
|
||||
};
|
||||
|
||||
for target in &videos_to_process {
|
||||
println!("\n=== Processing video: {} ===", target);
|
||||
|
||||
let chunks = pg.get_chunks_by_uuid(target.as_str()).await?;
|
||||
let sentence_chunks: Vec<_> = chunks
|
||||
.into_iter()
|
||||
.filter(|c| c.chunk_type == ChunkType::Sentence)
|
||||
@@ -2249,21 +2256,32 @@ async fn main() -> Result<()> {
|
||||
target
|
||||
);
|
||||
|
||||
let mut video_stored_count = 0usize;
|
||||
|
||||
for chunk in sentence_chunks {
|
||||
// Try to extract text from different possible locations
|
||||
let text = chunk
|
||||
.content
|
||||
.get("text")
|
||||
.get("data") // Try data->text structure first
|
||||
.and_then(|data| data.get("text"))
|
||||
.and_then(|v| v.as_str())
|
||||
.or_else(|| chunk.content.get("text").and_then(|v| v.as_str())) // Try root text structure
|
||||
.unwrap_or("");
|
||||
|
||||
if text.is_empty() {
|
||||
eprintln!(
|
||||
"Empty text for chunk {}, content: {:?}",
|
||||
chunk.chunk_id, chunk.content
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
print!("Embedding chunk {}... ", chunk.chunk_id);
|
||||
std::io::stdout().flush().unwrap();
|
||||
|
||||
match embedder.embed_document(text).await {
|
||||
Ok(vector) => {
|
||||
println!("embedding success ({} dims)", vector.len());
|
||||
let vector_id = format!("{}_{}", chunk.uuid, chunk.chunk_id);
|
||||
|
||||
if let Err(e) =
|
||||
@@ -2295,32 +2313,40 @@ async fn main() -> Result<()> {
|
||||
}
|
||||
|
||||
stored_count += 1;
|
||||
println!("done ({} dims)", vector.len());
|
||||
video_stored_count += 1;
|
||||
println!(
|
||||
"stored (video: {}, total: {})",
|
||||
video_stored_count, stored_count
|
||||
);
|
||||
}
|
||||
Err(e) => {
|
||||
println!("failed: {}", e);
|
||||
println!("embedding failed: {}", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Only update storage status if vectors were actually stored
|
||||
if stored_count > 0 {
|
||||
pg.update_storage_status(target, "pvector_chunk", true)
|
||||
// Only update storage status if vectors were actually stored for this video
|
||||
if video_stored_count > 0 {
|
||||
pg.update_storage_status(target.as_str(), "pvector_chunk", true)
|
||||
.await?;
|
||||
pg.update_storage_status(target, "qvector_chunk", true)
|
||||
pg.update_storage_status(target.as_str(), "qvector_chunk", true)
|
||||
.await?;
|
||||
println!(
|
||||
"\n✓ Vectorize stage completed for {}! ({} vectors stored)",
|
||||
target, stored_count
|
||||
"✓ Vectorize stage completed for {}! ({} vectors stored)",
|
||||
target, video_stored_count
|
||||
);
|
||||
} else {
|
||||
println!(
|
||||
"\n✗ Vectorize stage failed for {}! (0 vectors stored)",
|
||||
"✗ Vectorize stage failed for {}! (0 vectors stored)",
|
||||
target
|
||||
);
|
||||
}
|
||||
} else {
|
||||
println!("\n✓ Vectorize stage completed for all videos!");
|
||||
}
|
||||
|
||||
println!("\n=== Vectorization Summary ===");
|
||||
println!("Total vectors stored: {}", stored_count);
|
||||
if uuid == "all" {
|
||||
println!("✓ Vectorize stage completed for all videos!");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user