Files
momentry_core/src/bin/test_tokenizer_debug.rs

28 lines
958 B
Rust

use momentry_core::core::text::tokenizer::{contains_chinese, tokenize_chinese_text};
/// Debug driver for `tokenize_chinese_text`.
///
/// Runs the tokenizer over a fixed set of sample strings and a fixed set of
/// query-like strings, printing for each input the returned token string and
/// the pieces obtained by splitting that string on whitespace.
fn main() {
    // Short Chinese words followed by one longer sentence.
    const SAMPLE_TEXTS: [&str; 6] = ["電腦", "工作", "視頻", "分析", "檔案", "這是一個測試"];

    // Inputs that mix Chinese terms with `|`, `&`, parentheses and `:*` suffixes.
    const COMPLEX_QUERIES: [&str; 4] = [
        "(電腦 | 計算機 | 微机)",
        "(工作 | 任務 | 作業)",
        "電腦 & 工作",
        "(電腦:* | 計算機:* | 微机:*)",
    ];

    for sample in SAMPLE_TEXTS {
        let tokenized = tokenize_chinese_text(sample);
        println!("Text: '{}' -> Tokens: '{}'", sample, tokenized);
        // Show the individual whitespace-separated pieces of the tokenizer output.
        println!(
            " Split: {:?}",
            tokenized.split_whitespace().collect::<Vec<&str>>()
        );
    }

    println!("\n=== Testing complex queries ===");

    for candidate in COMPLEX_QUERIES {
        let tokenized = tokenize_chinese_text(candidate);
        println!("Query: '{}' -> Tokens: '{}'", candidate, tokenized);
        println!(
            " Split: {:?}",
            tokenized.split_whitespace().collect::<Vec<&str>>()
        );
        // Visual separator between consecutive query reports.
        println!("---");
    }
}