feat: backup architecture docs, source code, and scripts
This commit is contained in:
9
src/core/text/mod.rs
Normal file
9
src/core/text/mod.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
pub mod online_synonym_expander;
|
||||
pub mod synonym;
|
||||
pub mod synonym_expander;
|
||||
pub mod tokenizer;
|
||||
|
||||
pub use online_synonym_expander::{global_online_expander, OnlineSynonymExpander};
|
||||
pub use synonym::{normalize_chinese_query, simplified_to_traditional, traditional_to_simplified};
|
||||
pub use synonym_expander::{global_synonym_expander, SynonymExpander};
|
||||
pub use tokenizer::{contains_chinese, extract_and_tokenize_text, tokenize_chinese_text};
|
||||
242
src/core/text/online_synonym_expander.rs
Normal file
242
src/core/text/online_synonym_expander.rs
Normal file
@@ -0,0 +1,242 @@
|
||||
use anyhow::{Context, Result};
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::Deserialize;
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
|
||||
/// Online Synonym Expander
|
||||
/// Fetches synonyms from LLM (llama.cpp server) on-demand and caches them.
|
||||
///
|
||||
/// Environment variables:
|
||||
/// - `MOMENTRY_ONLINE_SYNONYM` - Enable online synonym expansion (default: false)
|
||||
/// - `MOMENTRY_LLM_SYNONYM_URL` - LLM server URL (default: http://127.0.0.1:8081)
|
||||
/// - `MOMENTRY_LLM_SYNONYM_MODEL` - Model name (default: gemma4)
|
||||
/// - `MOMENTRY_LLM_SYNONYM_TIMEOUT` - Request timeout in seconds (default: 60)
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct LlmResponse {
|
||||
choices: Vec<LlmChoice>,
|
||||
}
|
||||
|
||||
/// One entry of the `choices` array in an `LlmResponse`; only `message` is deserialized.
#[derive(Debug, Deserialize)]
|
||||
struct LlmChoice {
|
||||
/// Message carried by this choice.
message: LlmMessage,
|
||||
}
|
||||
|
||||
/// Message object inside an `LlmChoice`; only its `content` field is deserialized.
#[derive(Debug, Deserialize)]
|
||||
struct LlmMessage {
|
||||
/// Raw text produced by the model.
content: String,
|
||||
}
|
||||
|
||||
/// Synonym expander backed by a local synonym map, an in-memory runtime cache,
/// and an LLM chat-completions endpoint queried for words found in neither.
#[derive(Debug)]
|
||||
pub struct OnlineSynonymExpander {
|
||||
/// Word -> synonyms map loaded from the optional local JSON file
/// (empty when no file was given or it could not be loaded).
|
||||
local_map: HashMap<String, Vec<String>>,
|
||||
/// Word -> synonyms map filled at runtime with results fetched from the LLM.
|
||||
runtime_cache: Arc<Mutex<HashMap<String, Vec<String>>>>,
|
||||
/// Base URL of the LLM server; `/v1/chat/completions` is appended per request.
|
||||
api_url: String,
|
||||
/// Model name sent in the `model` field of each request.
|
||||
model: String,
|
||||
/// Per-request timeout, in seconds.
|
||||
timeout_secs: u64,
|
||||
}
|
||||
|
||||
/// Text sent as the `system` message of every synonym request. It instructs the
/// model to answer with a bare JSON array of same-language synonyms.
static SYSTEM_PROMPT: &str = r#"You are a synonym generation assistant. For each given word, provide 8-12 synonyms in the same language.
|
||||
Rules:
|
||||
1. Return ONLY a JSON array of strings, nothing else
|
||||
2. Synonyms should be contextually relevant for video content search
|
||||
3. Include common words, informal terms, and related concepts
|
||||
4. Do NOT include the input word in the output
|
||||
5. All synonyms must be in the SAME language as the input word
|
||||
6. No explanations, no markdown, just the JSON array
|
||||
|
||||
Example input: "money"
|
||||
Example output: ["cash", "dollar", "currency", "funds", "bucks", "greenbacks", "coins", "wealth", "payment"]"#;
|
||||
|
||||
impl OnlineSynonymExpander {
|
||||
pub fn new(local_file_path: Option<&str>) -> Self {
|
||||
let local_map = if let Some(path) = local_file_path {
|
||||
match Self::load_local_file(path) {
|
||||
Ok(map) => map,
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to load local synonym file {}: {}", path, e);
|
||||
HashMap::new()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
HashMap::new()
|
||||
};
|
||||
|
||||
let api_url = env::var("MOMENTRY_LLM_SYNONYM_URL")
|
||||
.unwrap_or_else(|_| "http://127.0.0.1:8081".to_string());
|
||||
let model = env::var("MOMENTRY_LLM_SYNONYM_MODEL").unwrap_or_else(|_| "gemma4".to_string());
|
||||
let timeout_secs = env::var("MOMENTRY_LLM_SYNONYM_TIMEOUT")
|
||||
.ok()
|
||||
.and_then(|v| v.parse().ok())
|
||||
.unwrap_or(60);
|
||||
|
||||
Self {
|
||||
local_map,
|
||||
runtime_cache: Arc::new(Mutex::new(HashMap::new())),
|
||||
api_url,
|
||||
model,
|
||||
timeout_secs,
|
||||
}
|
||||
}
|
||||
|
||||
fn load_local_file(path: &str) -> Result<HashMap<String, Vec<String>>> {
|
||||
let content = std::fs::read_to_string(path).context("Failed to read local synonym file")?;
|
||||
let map: HashMap<String, Vec<String>> =
|
||||
serde_json::from_str(&content).context("Failed to parse local synonym JSON")?;
|
||||
Ok(map)
|
||||
}
|
||||
|
||||
/// Get synonyms for a word. Checks local map first, then runtime cache, then fetches from LLM.
|
||||
pub async fn expand_word(&self, word: &str) -> String {
|
||||
// 1. Check local map
|
||||
if let Some(syns) = self.local_map.get(word) {
|
||||
if !syns.is_empty() {
|
||||
let mut parts = vec![word.to_string()];
|
||||
parts.extend_from_slice(syns);
|
||||
return format!("({})", parts.join(" | "));
|
||||
}
|
||||
}
|
||||
|
||||
// 2. Check runtime cache
|
||||
let mut cache = self.runtime_cache.lock().await;
|
||||
if let Some(syns) = cache.get(word) {
|
||||
if !syns.is_empty() {
|
||||
let mut parts = vec![word.to_string()];
|
||||
parts.extend_from_slice(syns);
|
||||
return format!("({})", parts.join(" | "));
|
||||
}
|
||||
}
|
||||
drop(cache);
|
||||
|
||||
// 3. Fetch from LLM
|
||||
if let Ok(synonyms) = self.fetch_from_llm(word).await {
|
||||
if !synonyms.is_empty() {
|
||||
// Add to runtime cache
|
||||
let mut cache = self.runtime_cache.lock().await;
|
||||
cache.insert(word.to_string(), synonyms.clone());
|
||||
drop(cache);
|
||||
|
||||
let mut parts = vec![word.to_string()];
|
||||
parts.extend_from_slice(&synonyms);
|
||||
return format!("({})", parts.join(" | "));
|
||||
}
|
||||
}
|
||||
|
||||
// 4. Fallback: return original word
|
||||
word.to_string()
|
||||
}
|
||||
|
||||
async fn fetch_from_llm(&self, word: &str) -> Result<Vec<String>> {
|
||||
let client = reqwest::Client::new();
|
||||
|
||||
let prompt = format!(
|
||||
r#"Give synonyms for: "{}"
|
||||
Return ONLY a JSON array of strings, nothing else. Do NOT include the input word."#,
|
||||
word
|
||||
);
|
||||
|
||||
let payload = serde_json::json!({
|
||||
"model": self.model,
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": SYSTEM_PROMPT
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt
|
||||
}
|
||||
],
|
||||
"temperature": 0.3,
|
||||
"stream": false,
|
||||
"max_tokens": 256,
|
||||
});
|
||||
|
||||
let response = client
|
||||
.post(format!("{}/v1/chat/completions", self.api_url))
|
||||
.json(&payload)
|
||||
.timeout(std::time::Duration::from_secs(self.timeout_secs))
|
||||
.send()
|
||||
.await
|
||||
.context("LLM request failed")?;
|
||||
|
||||
if !response.status().is_success() {
|
||||
anyhow::bail!("LLM request failed with status: {}", response.status());
|
||||
}
|
||||
|
||||
let llm_resp: LlmResponse = response
|
||||
.json()
|
||||
.await
|
||||
.context("Failed to parse LLM response")?;
|
||||
|
||||
let content = &llm_resp
|
||||
.choices
|
||||
.get(0)
|
||||
.context("No choices in LLM response")?
|
||||
.message
|
||||
.content;
|
||||
|
||||
// Extract JSON from response (handle markdown code blocks)
|
||||
let json_str = if let Some(start) = content.find('[') {
|
||||
if let Some(end) = content.rfind(']') {
|
||||
&content[start..=end]
|
||||
} else {
|
||||
anyhow::bail!("No JSON array found in LLM response");
|
||||
}
|
||||
} else {
|
||||
anyhow::bail!("No JSON array found in LLM response");
|
||||
};
|
||||
|
||||
let synonyms: Vec<String> =
|
||||
serde_json::from_str(json_str).context("Failed to parse LLM synonyms JSON")?;
|
||||
|
||||
// Filter and normalize
|
||||
let cleaned: Vec<String> = synonyms
|
||||
.into_iter()
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.filter(|s| !s.is_empty() && !s.contains(' ')) // Filter out multi-word synonyms for to_tsquery compatibility
|
||||
.collect();
|
||||
|
||||
if cleaned.is_empty() {
|
||||
anyhow::bail!("No valid synonyms returned");
|
||||
}
|
||||
|
||||
tracing::info!(
|
||||
"LLM fetched {} synonyms for '{}': {:?}",
|
||||
cleaned.len(),
|
||||
word,
|
||||
cleaned.iter().take(5).collect::<Vec<_>>()
|
||||
);
|
||||
|
||||
Ok(cleaned)
|
||||
}
|
||||
|
||||
/// Get the number of cached synonyms
|
||||
pub async fn cache_size(&self) -> usize {
|
||||
self.runtime_cache.lock().await.len()
|
||||
}
|
||||
}
|
||||
|
||||
/// Global online synonym expander (lazy-loaded)
|
||||
static ONLINE_EXPANDER: Lazy<Option<OnlineSynonymExpander>> = Lazy::new(|| {
|
||||
if env::var("MOMENTRY_ONLINE_SYNONYM").is_ok() {
|
||||
let local_file = env::var("MOMENTRY_SYNONYM_FILE").ok();
|
||||
tracing::info!("Initializing online synonym expander");
|
||||
Some(OnlineSynonymExpander::new(local_file.as_deref()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
});
|
||||
|
||||
/// Get the global online synonym expander (if enabled)
|
||||
pub fn global_online_expander() -> Option<&'static OnlineSynonymExpander> {
|
||||
ONLINE_EXPANDER.as_ref()
|
||||
}
|
||||
71
src/core/text/synonym.rs
Normal file
71
src/core/text/synonym.rs
Normal file
@@ -0,0 +1,71 @@
|
||||
use ferrous_opencc::{config::BuiltinConfig, OpenCC};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
/// Shared converter built from OpenCC's built-in `S2t` configuration on first use.
/// Initialization panics if the converter cannot be constructed.
static OPENCC_S2T: Lazy<OpenCC> = Lazy::new(|| {
|
||||
OpenCC::from_config(BuiltinConfig::S2t)
|
||||
.expect("Failed to initialize OpenCC Simplified to Traditional converter")
|
||||
});
|
||||
|
||||
/// Shared converter built from OpenCC's built-in `T2s` configuration on first use.
/// Initialization panics if the converter cannot be constructed.
static OPENCC_T2S: Lazy<OpenCC> = Lazy::new(|| {
|
||||
OpenCC::from_config(BuiltinConfig::T2s)
|
||||
.expect("Failed to initialize OpenCC Traditional to Simplified converter")
|
||||
});
|
||||
|
||||
/// Converts Simplified Chinese text to Traditional Chinese with the shared
/// `OPENCC_S2T` converter and returns the result as a new `String`.
|
||||
pub fn simplified_to_traditional(text: &str) -> String {
|
||||
OPENCC_S2T.convert(text)
|
||||
}
|
||||
|
||||
/// Converts Traditional Chinese text to Simplified Chinese with the shared
/// `OPENCC_T2S` converter and returns the result as a new `String`.
|
||||
pub fn traditional_to_simplified(text: &str) -> String {
|
||||
OPENCC_T2S.convert(text)
|
||||
}
|
||||
|
||||
/// Normalize Chinese query for search:
|
||||
/// 1. Convert Simplified Chinese to Traditional Chinese (assuming database stores Traditional)
|
||||
/// 2. Return converted text
|
||||
pub fn normalize_chinese_query(text: &str) -> String {
|
||||
simplified_to_traditional(text)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// "Computer" written in Simplified Chinese.
    const SIMPLIFIED: &str = "计算机";
    /// The same word written in Traditional Chinese.
    const TRADITIONAL: &str = "計算機";

    /// Simplified input must change; Traditional input must pass through untouched.
    #[test]
    fn test_simplified_to_traditional() {
        let converted = simplified_to_traditional(SIMPLIFIED);
        // The exact output depends on the dictionary, so only require a
        // non-empty result that differs from the input.
        assert!(!converted.is_empty());
        assert_ne!(converted, SIMPLIFIED);

        let unchanged = simplified_to_traditional(TRADITIONAL);
        assert_eq!(unchanged, TRADITIONAL);
    }

    /// Traditional input must be converted to something non-empty and different.
    #[test]
    fn test_traditional_to_simplified() {
        let converted = traditional_to_simplified(TRADITIONAL);
        assert!(!converted.is_empty());
        assert_ne!(converted, TRADITIONAL);
    }

    /// Query normalization yields Traditional text for either input form.
    #[test]
    fn test_normalize_chinese_query() {
        let from_simplified = normalize_chinese_query(SIMPLIFIED);
        assert_ne!(from_simplified, SIMPLIFIED);

        let from_traditional = normalize_chinese_query(TRADITIONAL);
        assert_eq!(from_traditional, TRADITIONAL);
    }
}
|
||||
247
src/core/text/synonym_expander.rs
Normal file
247
src/core/text/synonym_expander.rs
Normal file
@@ -0,0 +1,247 @@
|
||||
use anyhow::{Context, Result};
|
||||
use once_cell::sync::Lazy;
|
||||
use std::collections::HashMap;
|
||||
use std::env;
|
||||
use std::fs;
|
||||
use std::path::Path;
|
||||
|
||||
/// Synonym expander.
|
||||
/// Holds a custom word -> synonyms mapping loaded from JSON files.
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct SynonymExpander {
|
||||
/// Mapping from a word to its list of synonyms.
|
||||
map: HashMap<String, Vec<String>>,
|
||||
}
|
||||
|
||||
impl SynonymExpander {
|
||||
/// 從 JSON 檔案創建同義詞擴展器
|
||||
pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
|
||||
let content = fs::read_to_string(path).context("Failed to read synonym file")?;
|
||||
let map: HashMap<String, Vec<String>> =
|
||||
serde_json::from_str(&content).context("Failed to parse synonym JSON")?;
|
||||
Ok(Self { map })
|
||||
}
|
||||
|
||||
/// 從多個 JSON 檔案創建同義詞擴展器(後面的檔案會覆蓋前面的)
|
||||
pub fn from_files<P: AsRef<Path>>(paths: &[P]) -> Result<Self> {
|
||||
let mut combined_map = HashMap::new();
|
||||
|
||||
for path in paths {
|
||||
let content = fs::read_to_string(path)
|
||||
.with_context(|| format!("Failed to read synonym file: {:?}", path.as_ref()))?;
|
||||
let map: HashMap<String, Vec<String>> =
|
||||
serde_json::from_str(&content).with_context(|| {
|
||||
format!("Failed to parse synonym JSON from {:?}", path.as_ref())
|
||||
})?;
|
||||
|
||||
// 合併映射,後面的檔案覆蓋前面的
|
||||
for (key, synonyms) in map {
|
||||
combined_map.insert(key, synonyms);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self { map: combined_map })
|
||||
}
|
||||
|
||||
/// 從內建預設資料創建(返回空映射,用戶可通過配置文件添加自定義同義詞)
|
||||
pub fn from_default() -> Self {
|
||||
Self::empty()
|
||||
}
|
||||
|
||||
/// 獲取詞語的同義詞列表(如果存在)
|
||||
pub fn get_synonyms(&self, word: &str) -> Option<&[String]> {
|
||||
self.map.get(word).map(|v| v.as_slice())
|
||||
}
|
||||
|
||||
/// 擴展查詢詞語:將詞語替換為 (詞語 OR 同義詞1 OR 同義詞2 ...)
|
||||
/// 如果沒有同義詞,返回原詞語
|
||||
pub fn expand_word(&self, word: &str) -> String {
|
||||
match self.get_synonyms(word) {
|
||||
Some(syns) if !syns.is_empty() => {
|
||||
let mut parts = vec![word.to_string()];
|
||||
parts.extend_from_slice(syns);
|
||||
format!("({})", parts.join(" | "))
|
||||
}
|
||||
_ => word.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// 擴展整個查詢字符串(空格分隔的詞語)
|
||||
pub fn expand_query(&self, query: &str) -> String {
|
||||
query
|
||||
.split_whitespace()
|
||||
.map(|word| self.expand_word(word))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" & ")
|
||||
}
|
||||
|
||||
/// 對中文查詢進行智能擴展:先匹配已知同義詞,再對剩餘部分進行分詞
|
||||
pub fn expand_chinese_query(&self, query: &str) -> String {
|
||||
// 如果查詢很短,直接嘗試匹配整個查詢
|
||||
if query.chars().count() <= 4 {
|
||||
if let Some(syns) = self.get_synonyms(query) {
|
||||
let mut parts = vec![query.to_string()];
|
||||
parts.extend_from_slice(syns);
|
||||
return format!("({})", parts.join(" | "));
|
||||
}
|
||||
}
|
||||
|
||||
// 嘗試在查詢中尋找已知的同義詞
|
||||
let mut expanded_parts = Vec::new();
|
||||
let mut remaining_query = query;
|
||||
let mut found_synonym = false;
|
||||
|
||||
// 對同義詞鍵按長度降序排序(最長匹配優先)
|
||||
let mut keys: Vec<&String> = self.map.keys().collect();
|
||||
keys.sort_by_key(|b| std::cmp::Reverse(b.chars().count()));
|
||||
|
||||
// 貪婪匹配:尋找最長的同義詞匹配
|
||||
while !remaining_query.is_empty() {
|
||||
let mut matched = false;
|
||||
|
||||
for key in &keys {
|
||||
if remaining_query.starts_with(*key) {
|
||||
// 找到匹配的同義詞
|
||||
expanded_parts.push(self.expand_word(key));
|
||||
remaining_query = &remaining_query[key.len()..];
|
||||
found_synonym = true;
|
||||
matched = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !matched {
|
||||
// 沒有找到同義詞,跳過第一個字符,繼續嘗試
|
||||
let first_char_len = remaining_query.chars().next().map_or(0, |c| c.len_utf8());
|
||||
if first_char_len > 0 {
|
||||
let next_part = &remaining_query[..first_char_len];
|
||||
expanded_parts.push(next_part.to_string());
|
||||
remaining_query = &remaining_query[first_char_len..];
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if found_synonym {
|
||||
// 如果有找到同義詞,使用擴展後的查詢
|
||||
expanded_parts.join(" & ")
|
||||
} else {
|
||||
// 沒有找到同義詞,返回原查詢(稍後會進行分詞)
|
||||
query.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// 創建空的同義詞擴展器(無同義詞映射)
|
||||
pub fn empty() -> Self {
|
||||
Self {
|
||||
map: HashMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// 全局同義詞擴展器(懶加載)
|
||||
static SYNONYM_EXPANDER: Lazy<SynonymExpander> = Lazy::new(|| {
|
||||
// 優先嘗試 MOMENTRY_SYNONYM_FILES(逗號分隔的多個檔案)
|
||||
if let Ok(files_var) = env::var("MOMENTRY_SYNONYM_FILES") {
|
||||
let file_paths: Vec<&str> = files_var
|
||||
.split(',')
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
|
||||
if !file_paths.is_empty() {
|
||||
match SynonymExpander::from_files(&file_paths) {
|
||||
Ok(expander) => {
|
||||
tracing::info!(
|
||||
"Loaded synonym expander from {} files: {:?}",
|
||||
file_paths.len(),
|
||||
file_paths
|
||||
);
|
||||
return expander;
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
"Failed to load synonym expander from files {:?}: {}",
|
||||
file_paths,
|
||||
e
|
||||
);
|
||||
// 繼續嘗試單一檔案或使用預設
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 回退到單一檔案 MOMENTRY_SYNONYM_FILE(向下兼容)
|
||||
if let Ok(file_path) = env::var("MOMENTRY_SYNONYM_FILE") {
|
||||
match SynonymExpander::from_file(&file_path) {
|
||||
Ok(expander) => {
|
||||
tracing::info!("Loaded synonym expander from {}", file_path);
|
||||
expander
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!("Failed to load synonym expander from {}: {}", file_path, e);
|
||||
SynonymExpander::empty()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 使用預設同義詞(示例)
|
||||
SynonymExpander::from_default()
|
||||
}
|
||||
});
|
||||
|
||||
/// 獲取全局同義詞擴展器實例
|
||||
pub fn global_synonym_expander() -> &'static SynonymExpander {
|
||||
&SYNONYM_EXPANDER
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an expander with two entries shared by the expansion tests.
    fn sample_expander() -> SynonymExpander {
        let entries = vec![
            ("電腦", vec!["計算機", "微机"]),
            ("工作", vec!["任務", "作業"]),
        ];
        let map = entries
            .into_iter()
            .map(|(word, syns)| {
                let syns = syns.into_iter().map(|s| s.to_string()).collect();
                (word.to_string(), syns)
            })
            .collect();
        SynonymExpander { map }
    }

    /// Known words become OR-groups; unknown words are returned as-is.
    #[test]
    fn test_expand_word() {
        let expander = sample_expander();

        assert_eq!(expander.expand_word("電腦"), "(電腦 | 計算機 | 微机)");
        assert_eq!(expander.expand_word("工作"), "(工作 | 任務 | 作業)");
        assert_eq!(expander.expand_word("未知"), "未知");
    }

    /// Whitespace-separated words are expanded individually and AND-ed together.
    #[test]
    fn test_expand_query() {
        let expander = sample_expander();

        assert_eq!(
            expander.expand_query("電腦 工作"),
            "(電腦 | 計算機 | 微机) & (工作 | 任務 | 作業)"
        );
        assert_eq!(expander.expand_query("單個詞"), "單個詞");
        assert_eq!(expander.expand_query(""), "");
    }

    /// An empty path list yields an expander with an empty map.
    #[test]
    fn test_from_files_empty() {
        let no_paths: Vec<&str> = Vec::new();
        let expander = SynonymExpander::from_files(&no_paths).unwrap();
        assert!(expander.map.is_empty());
    }
}
|
||||
121
src/core/text/tokenizer.rs
Normal file
121
src/core/text/tokenizer.rs
Normal file
@@ -0,0 +1,121 @@
|
||||
use jieba_rs::Jieba;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
/// Shared jieba tokenizer, constructed with `Jieba::new` on first use.
static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
|
||||
|
||||
/// Reports whether `text` contains at least one Chinese character.
///
/// A character counts as Chinese when it lies in CJK Unified Ideographs
/// (U+4E00-U+9FFF) or CJK Extension A (U+3400-U+4DBF).
pub fn contains_chinese(text: &str) -> bool {
    for c in text.chars() {
        if matches!(c, '\u{3400}'..='\u{4dbf}' | '\u{4e00}'..='\u{9fff}') {
            return true;
        }
    }
    false
}
|
||||
|
||||
/// 對中文文本進行分詞,並用空格連接分詞結果
|
||||
/// 非中文文本保持不變
|
||||
///
|
||||
/// # 示例
|
||||
/// ```
|
||||
/// use momentry_core::core::text::tokenizer::tokenize_chinese_text;
|
||||
///
|
||||
/// assert_eq!(tokenize_chinese_text("這是一個測試"), "這 是 一 個 測 試");
|
||||
/// assert_eq!(tokenize_chinese_text("Hello world"), "Hello world");
|
||||
/// assert_eq!(tokenize_chinese_text("中文English混合"), "中文 English 混合");
|
||||
/// ```
|
||||
pub fn tokenize_chinese_text(text: &str) -> String {
|
||||
if contains_chinese(text) {
|
||||
// 使用精確模式分詞(cut=false)
|
||||
let tokens = JIEBA.cut(text, false);
|
||||
tokens.join(" ")
|
||||
} else {
|
||||
text.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
/// 從 JSON 內容中提取文本並進行分詞
|
||||
/// 支持兩種格式:
|
||||
/// 1. content->'data'->>'text' (中文視頻格式)
|
||||
/// 2. content->'text' (英文視頻格式)
|
||||
pub fn extract_and_tokenize_text(content: &serde_json::Value) -> String {
|
||||
let raw_text = content
|
||||
.get("data")
|
||||
.and_then(|data| data.get("text"))
|
||||
.and_then(|v| v.as_str())
|
||||
.or_else(|| content.get("text").and_then(|v| v.as_str()))
|
||||
.unwrap_or("");
|
||||
|
||||
tokenize_chinese_text(raw_text)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Detection is true only when a CJK ideograph is present.
    #[test]
    fn test_contains_chinese() {
        for &text in &["中文", "這是一個測試", "混合文本 English 中文"] {
            assert!(contains_chinese(text));
        }
        for &text in &["English only", "123", ""] {
            assert!(!contains_chinese(text));
        }
    }

    /// Covers pure Chinese, pure English, mixed text, the empty string, and
    /// digits with punctuation.
    #[test]
    fn test_tokenize_chinese_text() {
        let cases = [
            ("這是一個測試", "這 是 一 個 測 試"),
            ("Hello world", "Hello world"),
            ("中文English混合", "中文 English 混合"),
            ("", ""),
            ("測試123。", "測 試 123 。"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(tokenize_chinese_text(input), expected);
        }
    }

    /// Covers both JSON layouts, their precedence, and missing or non-string text.
    #[test]
    fn test_extract_and_tokenize_text() {
        let cases = vec![
            // Chinese layout: content->'data'->>'text'
            (
                serde_json::json!({
                    "data": {
                        "text": "這是一個測試"
                    }
                }),
                "這 是 一 個 測 試",
            ),
            // English layout: content->'text'
            (
                serde_json::json!({
                    "text": "Hello world"
                }),
                "Hello world",
            ),
            // Both present: data->text takes precedence
            (
                serde_json::json!({
                    "data": {
                        "text": "中文測試"
                    },
                    "text": "English text"
                }),
                "中文 測 試",
            ),
            // No text at all
            (serde_json::json!({}), ""),
            // Text that is not a string
            (
                serde_json::json!({
                    "data": {
                        "text": 123
                    }
                }),
                "",
            ),
        ];

        for (content, expected) in &cases {
            assert_eq!(extract_and_tokenize_text(content), *expected);
        }
    }
}
|
||||
Reference in New Issue
Block a user