feat: fix Chinese text search and duplicate chunk_id bug

- Add helper functions to extract text from nested content structure
- Update SearchResult to include uuid field
- Add PostgreSQL function get_chunk_by_chunk_id_and_uuid to handle duplicate chunk_ids
- Update Qdrant search functions to extract uuid from payload
- Change embedding model to nomic-embed-text-v2-moe:latest
- Update Qdrant collection name to momentry_rule1
- Fix MongoDB authentication and disable the cache during development
- Improve error handling in processor.rs
- Update documentation to reference the new embedding model
This commit is contained in:
Warren
2026-03-29 04:44:28 +08:00
parent 82955504f3
commit 2393d81a3f
13 changed files with 355 additions and 106 deletions

View File

@@ -17,7 +17,7 @@ const QDRANT_API_KEY: &str = "Test3200Test3200Test3200";
#[allow(dead_code)]
const OLLAMA_URL: &str = "http://localhost:11434";
#[allow(dead_code)]
const MODEL: &str = "nomic-embed-text-v2-moe";
const MODEL: &str = "nomic-embed-text-v2-moe:latest";
#[derive(Debug, Clone)]
#[allow(dead_code)]
@@ -112,8 +112,8 @@ impl ChunkSelector {
return Ok(Vec::new());
}
// Search Qdrant - try both collections (chunks_v3 for multilingual, AccusysDB for others)
let collections = ["chunks_v3", "AccusysDB"];
// Search Qdrant - use momentry_rule1 collection (Rule1 specification)
let collections = ["momentry_rule1"];
for collection in collections {
let vector_str = serde_json::to_string(&embedding)