feat: backup architecture docs, source code, and scripts
This commit is contained in:
27
src/bin/test_tokenizer_debug.rs
Normal file
27
src/bin/test_tokenizer_debug.rs
Normal file
@@ -0,0 +1,27 @@
|
||||
use momentry_core::core::text::tokenizer::{contains_chinese, tokenize_chinese_text};
|
||||
|
||||
fn main() {
|
||||
let texts = ["電腦", "工作", "視頻", "分析", "檔案", "這是一個測試"];
|
||||
for text in texts {
|
||||
let tokens = tokenize_chinese_text(text);
|
||||
println!("Text: '{}' -> Tokens: '{}'", text, tokens);
|
||||
let split: Vec<&str> = tokens.split_whitespace().collect();
|
||||
println!(" Split: {:?}", split);
|
||||
}
|
||||
|
||||
println!("\n=== Testing complex queries ===");
|
||||
let complex = [
|
||||
"(電腦 | 計算機 | 微机)",
|
||||
"(工作 | 任務 | 作業)",
|
||||
"電腦 & 工作",
|
||||
"(電腦:* | 計算機:* | 微机:*)",
|
||||
];
|
||||
|
||||
for query in complex {
|
||||
let tokens = tokenize_chinese_text(query);
|
||||
println!("Query: '{}' -> Tokens: '{}'", query, tokens);
|
||||
let split: Vec<&str> = tokens.split_whitespace().collect();
|
||||
println!(" Split: {:?}", split);
|
||||
println!("---");
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user