feat: backup architecture docs, source code, and scripts

This commit is contained in:
Warren
2026-04-25 17:15:45 +08:00
parent 59809dae1f
commit 1f84e5469f
368 changed files with 146329 additions and 261 deletions

9
src/core/text/mod.rs Normal file
View File

@@ -0,0 +1,9 @@
pub mod online_synonym_expander;
pub mod synonym;
pub mod synonym_expander;
pub mod tokenizer;
pub use online_synonym_expander::{global_online_expander, OnlineSynonymExpander};
pub use synonym::{normalize_chinese_query, simplified_to_traditional, traditional_to_simplified};
pub use synonym_expander::{global_synonym_expander, SynonymExpander};
pub use tokenizer::{contains_chinese, extract_and_tokenize_text, tokenize_chinese_text};

View File

@@ -0,0 +1,242 @@
use anyhow::{Context, Result};
use once_cell::sync::Lazy;
use serde::Deserialize;
use std::collections::HashMap;
use std::env;
use std::sync::Arc;
use tokio::sync::Mutex;
/// Body of the reply returned by the `/v1/chat/completions` endpoint.
/// Only the `choices` array is deserialized.
#[derive(Debug, Deserialize)]
struct LlmResponse {
/// Completions returned by the server; `fetch_from_llm` reads the first one.
choices: Vec<LlmChoice>,
}
/// One entry of `LlmResponse::choices`.
#[derive(Debug, Deserialize)]
struct LlmChoice {
/// The message generated for this choice.
message: LlmMessage,
}
/// Message payload of a single choice.
#[derive(Debug, Deserialize)]
struct LlmMessage {
/// Raw text produced by the model; `fetch_from_llm` extracts a JSON array from it.
content: String,
}
/// Online Synonym Expander
///
/// Fetches synonyms from LLM (llama.cpp server) on-demand and caches them.
///
/// Environment variables:
/// - `MOMENTRY_ONLINE_SYNONYM` - Enable online synonym expansion (default: false)
/// - `MOMENTRY_LLM_SYNONYM_URL` - LLM server URL (default: http://127.0.0.1:8081)
/// - `MOMENTRY_LLM_SYNONYM_MODEL` - Model name (default: gemma4)
/// - `MOMENTRY_LLM_SYNONYM_TIMEOUT` - Request timeout in seconds (default: 60)
/// - `MOMENTRY_SYNONYM_FILE` - Optional path of the local synonym JSON file handed to the
///   global instance
#[derive(Debug)]
pub struct OnlineSynonymExpander {
/// Local synonym map (loaded from file); consulted before the runtime cache.
local_map: HashMap<String, Vec<String>>,
/// Runtime cache for LLM-fetched synonyms
runtime_cache: Arc<Mutex<HashMap<String, Vec<String>>>>,
/// LLM server URL
api_url: String,
/// Model name
model: String,
/// Request timeout in seconds
timeout_secs: u64,
}
/// System message sent with every synonym request. It instructs the model to answer with
/// nothing but a JSON array of same-language synonyms that excludes the input word.
static SYSTEM_PROMPT: &str = r#"You are a synonym generation assistant. For each given word, provide 8-12 synonyms in the same language.
Rules:
1. Return ONLY a JSON array of strings, nothing else
2. Synonyms should be contextually relevant for video content search
3. Include common words, informal terms, and related concepts
4. Do NOT include the input word in the output
5. All synonyms must be in the SAME language as the input word
6. No explanations, no markdown, just the JSON array
Example input: "money"
Example output: ["cash", "dollar", "currency", "funds", "bucks", "greenbacks", "coins", "wealth", "payment"]"#;
impl OnlineSynonymExpander {
    /// Builds an expander, optionally pre-loading a local synonym map.
    ///
    /// `local_file_path` points at a JSON object mapping a word to its synonyms. A file that
    /// cannot be read or parsed is logged as a warning and treated as an empty map.
    ///
    /// The LLM endpoint, model name and request timeout are read from
    /// `MOMENTRY_LLM_SYNONYM_URL`, `MOMENTRY_LLM_SYNONYM_MODEL` and
    /// `MOMENTRY_LLM_SYNONYM_TIMEOUT`, falling back to `http://127.0.0.1:8081`, `gemma4` and
    /// 60 seconds respectively.
    pub fn new(local_file_path: Option<&str>) -> Self {
        let local_map = match local_file_path {
            Some(path) => Self::load_local_file(path).unwrap_or_else(|e| {
                tracing::warn!("Failed to load local synonym file {}: {}", path, e);
                HashMap::new()
            }),
            None => HashMap::new(),
        };
        // Trailing slashes are dropped so the request URL never contains `//v1/...`.
        let api_url = env::var("MOMENTRY_LLM_SYNONYM_URL")
            .unwrap_or_else(|_| "http://127.0.0.1:8081".to_string())
            .trim_end_matches('/')
            .to_string();
        let model = env::var("MOMENTRY_LLM_SYNONYM_MODEL").unwrap_or_else(|_| "gemma4".to_string());
        let timeout_secs = env::var("MOMENTRY_LLM_SYNONYM_TIMEOUT")
            .ok()
            .and_then(|v| v.parse().ok())
            .unwrap_or(60);
        Self {
            local_map,
            runtime_cache: Arc::new(Mutex::new(HashMap::new())),
            api_url,
            model,
            timeout_secs,
        }
    }

    /// Reads and parses the local synonym JSON file (`{"word": ["syn1", "syn2", ...]}`).
    ///
    /// # Errors
    /// Fails when the file cannot be read or is not a JSON object of string arrays.
    fn load_local_file(path: &str) -> Result<HashMap<String, Vec<String>>> {
        let content = std::fs::read_to_string(path).context("Failed to read local synonym file")?;
        let map: HashMap<String, Vec<String>> =
            serde_json::from_str(&content).context("Failed to parse local synonym JSON")?;
        Ok(map)
    }

    /// Renders `word` and its synonyms as a parenthesised OR-group,
    /// e.g. `(money | cash | funds)`.
    fn format_group(word: &str, synonyms: &[String]) -> String {
        let mut parts = Vec::with_capacity(synonyms.len() + 1);
        parts.push(word);
        parts.extend(synonyms.iter().map(String::as_str));
        format!("({})", parts.join(" | "))
    }

    /// Get synonyms for a word. Checks local map first, then runtime cache, then fetches from LLM.
    ///
    /// Returns `(word | syn1 | syn2 ...)` when synonyms are known, otherwise `word` unchanged.
    /// A failed LLM lookup is logged and falls back to the plain word; it never panics or
    /// propagates the error.
    pub async fn expand_word(&self, word: &str) -> String {
        // 1. Local map
        if let Some(syns) = self.local_map.get(word).filter(|s| !s.is_empty()) {
            return Self::format_group(word, syns);
        }

        // 2. Runtime cache. The guard lives only inside this scope so the lock is released
        //    before the (potentially slow) network request below.
        {
            let cache = self.runtime_cache.lock().await;
            if let Some(syns) = cache.get(word).filter(|s| !s.is_empty()) {
                return Self::format_group(word, syns);
            }
        }

        // A blank word cannot have synonyms; skip the round-trip to the LLM.
        if word.trim().is_empty() {
            return word.to_string();
        }

        // 3. Fetch from LLM, caching any non-empty result for later calls.
        match self.fetch_from_llm(word).await {
            Ok(synonyms) if !synonyms.is_empty() => {
                let expanded = Self::format_group(word, &synonyms);
                self.runtime_cache
                    .lock()
                    .await
                    .insert(word.to_string(), synonyms);
                expanded
            }
            // 4. Fallback: return original word
            Ok(_) => word.to_string(),
            Err(e) => {
                tracing::warn!("Online synonym lookup failed for '{}': {:#}", word, e);
                word.to_string()
            }
        }
    }

    /// Returns the slice spanning the first `[` through the last `]` of `content`, which
    /// tolerates replies wrapped in markdown code fences or surrounding prose.
    ///
    /// Returns `None` when either bracket is missing or the last `]` comes before the first
    /// `[`; slicing with such a reversed range would panic.
    fn extract_json_array(content: &str) -> Option<&str> {
        let start = content.find('[')?;
        let end = content.rfind(']')?;
        if end < start {
            return None;
        }
        content.get(start..=end)
    }

    /// Returns `true` when `term` can be embedded verbatim in a `to_tsquery` expression: it is
    /// non-empty and contains neither whitespace nor tsquery operator/quoting characters.
    fn is_tsquery_safe(term: &str) -> bool {
        const RESERVED: &[char] = &['&', '|', '!', '(', ')', ':', '*', '<', '>', '\'', '\\'];
        !term.is_empty() && !term.contains(|c: char| c.is_whitespace() || RESERVED.contains(&c))
    }

    /// Asks the LLM server for synonyms of `word` via `POST {api_url}/v1/chat/completions`.
    ///
    /// The reply is trimmed and lowercased, and every entry that is not safe to embed in a
    /// tsquery (multi-word terms, operator characters) is discarded.
    ///
    /// # Errors
    /// Fails when the request cannot be sent or times out, the server answers with a
    /// non-success status, the body is not the expected JSON shape, no JSON array can be found
    /// in the message, or no usable synonym remains after filtering.
    async fn fetch_from_llm(&self, word: &str) -> Result<Vec<String>> {
        let client = reqwest::Client::new();
        let prompt = format!(
            r#"Give synonyms for: "{}"
Return ONLY a JSON array of strings, nothing else. Do NOT include the input word."#,
            word
        );
        let payload = serde_json::json!({
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": 0.3,
            "stream": false,
            "max_tokens": 256,
        });
        let response = client
            .post(format!("{}/v1/chat/completions", self.api_url))
            .json(&payload)
            .timeout(std::time::Duration::from_secs(self.timeout_secs))
            .send()
            .await
            .context("LLM request failed")?;
        if !response.status().is_success() {
            anyhow::bail!("LLM request failed with status: {}", response.status());
        }
        let llm_resp: LlmResponse = response
            .json()
            .await
            .context("Failed to parse LLM response")?;
        let content = &llm_resp
            .choices
            .first()
            .context("No choices in LLM response")?
            .message
            .content;

        // Extract JSON from response (handle markdown code blocks)
        let json_str =
            Self::extract_json_array(content).context("No JSON array found in LLM response")?;
        let synonyms: Vec<String> =
            serde_json::from_str(json_str).context("Failed to parse LLM synonyms JSON")?;

        // Filter and normalize: the model output is untrusted, so anything that would change
        // the structure of the generated tsquery is dropped rather than escaped.
        let cleaned: Vec<String> = synonyms
            .into_iter()
            .map(|s| s.trim().to_lowercase())
            .filter(|s| Self::is_tsquery_safe(s))
            .collect();
        if cleaned.is_empty() {
            anyhow::bail!("No valid synonyms returned");
        }
        tracing::info!(
            "LLM fetched {} synonyms for '{}': {:?}",
            cleaned.len(),
            word,
            cleaned.iter().take(5).collect::<Vec<_>>()
        );
        Ok(cleaned)
    }

    /// Get the number of words currently held in the runtime (LLM-fetched) cache.
    pub async fn cache_size(&self) -> usize {
        self.runtime_cache.lock().await.len()
    }
}
/// Global online synonym expander (lazy-loaded).
///
/// Holds `Some` only when `MOMENTRY_ONLINE_SYNONYM` is set to an enabling value. An unset
/// variable, an empty value, or one of `0` / `false` / `no` / `off` (case-insensitive) leaves
/// the feature disabled, so `MOMENTRY_ONLINE_SYNONYM=false` no longer switches it on.
/// When enabled, `MOMENTRY_SYNONYM_FILE` (if set) is passed on as the local synonym file.
static ONLINE_EXPANDER: Lazy<Option<OnlineSynonymExpander>> = Lazy::new(|| {
    let enabled = env::var("MOMENTRY_ONLINE_SYNONYM")
        .map(|raw| {
            let flag = raw.trim().to_ascii_lowercase();
            !matches!(flag.as_str(), "" | "0" | "false" | "no" | "off")
        })
        .unwrap_or(false);
    if !enabled {
        return None;
    }
    let local_file = env::var("MOMENTRY_SYNONYM_FILE").ok();
    tracing::info!("Initializing online synonym expander");
    Some(OnlineSynonymExpander::new(local_file.as_deref()))
});
/// Get the global online synonym expander (if enabled)
pub fn global_online_expander() -> Option<&'static OnlineSynonymExpander> {
ONLINE_EXPANDER.as_ref()
}

71
src/core/text/synonym.rs Normal file
View File

@@ -0,0 +1,71 @@
use ferrous_opencc::{config::BuiltinConfig, OpenCC};
use once_cell::sync::Lazy;
/// Lazily-built OpenCC converter using the built-in `S2t` (Simplified to Traditional) config.
///
/// # Panics
/// The first access panics if the converter cannot be constructed.
static OPENCC_S2T: Lazy<OpenCC> = Lazy::new(|| {
    let built = OpenCC::from_config(BuiltinConfig::S2t);
    built.expect("Failed to initialize OpenCC Simplified to Traditional converter")
});
/// Lazily-built OpenCC converter using the built-in `T2s` (Traditional to Simplified) config.
///
/// # Panics
/// The first access panics if the converter cannot be constructed.
static OPENCC_T2S: Lazy<OpenCC> = Lazy::new(|| {
    let built = OpenCC::from_config(BuiltinConfig::T2s);
    built.expect("Failed to initialize OpenCC Traditional to Simplified converter")
});
/// Convert Simplified Chinese text to Traditional Chinese
pub fn simplified_to_traditional(text: &str) -> String {
OPENCC_S2T.convert(text)
}
/// Convert Traditional Chinese text to Simplified Chinese
pub fn traditional_to_simplified(text: &str) -> String {
OPENCC_T2S.convert(text)
}
/// Normalize Chinese query for search:
/// 1. Convert Simplified Chinese to Traditional Chinese (assuming database stores Traditional)
/// 2. Return converted text
pub fn normalize_chinese_query(text: &str) -> String {
simplified_to_traditional(text)
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simplified_to_traditional() {
        // Simplified "计算机" is expected to become Traditional "計算機". The exact output
        // depends on the dictionary, so only non-emptiness and a change from the input are
        // asserted.
        let input = "计算机";
        let output = simplified_to_traditional(input);
        assert!(!output.is_empty());
        assert_ne!(output, input);

        // Text that is already Traditional should pass through unchanged.
        let already_traditional = "計算機";
        let roundtrip = simplified_to_traditional(already_traditional);
        assert_eq!(roundtrip, already_traditional);
    }

    #[test]
    fn test_traditional_to_simplified() {
        let input = "計算機";
        let output = traditional_to_simplified(input);
        assert!(!output.is_empty());
        assert_ne!(output, input);
    }

    #[test]
    fn test_normalize_chinese_query() {
        // Simplified input must come back changed (i.e. as Traditional).
        let from_simplified = "计算机";
        let normalized_simplified = normalize_chinese_query(from_simplified);
        assert_ne!(normalized_simplified, from_simplified);

        // Traditional input must be left as it is.
        let from_traditional = "計算機";
        let normalized_traditional = normalize_chinese_query(from_traditional);
        assert_eq!(normalized_traditional, from_traditional);
    }
}

View File

@@ -0,0 +1,247 @@
use anyhow::{Context, Result};
use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::env;
use std::fs;
use std::path::Path;
/// Synonym expander.
/// Loads a custom synonym mapping from JSON files.
#[derive(Debug, Clone, Default)]
pub struct SynonymExpander {
/// Mapping from a word to its list of synonyms.
map: HashMap<String, Vec<String>>,
}
impl SynonymExpander {
    /// Creates a synonym expander from a single JSON file (`{"word": ["syn1", ...]}`).
    ///
    /// # Errors
    /// Fails when the file cannot be read or parsed; the error context names the path.
    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let map = Self::load_map(path.as_ref())?;
        Ok(Self { map })
    }

    /// Creates a synonym expander from several JSON files. When the same word appears in
    /// more than one file, the entry from the later file replaces the earlier one.
    ///
    /// # Errors
    /// Fails on the first file that cannot be read or parsed.
    pub fn from_files<P: AsRef<Path>>(paths: &[P]) -> Result<Self> {
        let mut combined_map = HashMap::new();
        for path in paths {
            // `extend` overwrites existing keys, so later files win.
            combined_map.extend(Self::load_map(path.as_ref())?);
        }
        Ok(Self { map: combined_map })
    }

    /// Reads one synonym JSON file into a map, attaching the path to any error.
    fn load_map(path: &Path) -> Result<HashMap<String, Vec<String>>> {
        let content = fs::read_to_string(path)
            .with_context(|| format!("Failed to read synonym file: {:?}", path))?;
        serde_json::from_str::<HashMap<String, Vec<String>>>(&content)
            .with_context(|| format!("Failed to parse synonym JSON from {:?}", path))
    }

    /// Creates the built-in default expander. It is an empty mapping; custom synonyms are
    /// supplied through configuration files.
    pub fn from_default() -> Self {
        Self::empty()
    }

    /// Returns the synonyms registered for `word`, if the word is a key of the mapping.
    pub fn get_synonyms(&self, word: &str) -> Option<&[String]> {
        self.map.get(word).map(|v| v.as_slice())
    }

    /// Renders `word` and its synonyms as `(word | syn1 | syn2 ...)`.
    fn format_group(word: &str, synonyms: &[String]) -> String {
        let mut parts = Vec::with_capacity(synonyms.len() + 1);
        parts.push(word);
        parts.extend(synonyms.iter().map(String::as_str));
        format!("({})", parts.join(" | "))
    }

    /// Expands a single word into `(word | syn1 | syn2 ...)`.
    /// Returns the word unchanged when it has no synonyms.
    pub fn expand_word(&self, word: &str) -> String {
        match self.get_synonyms(word) {
            Some(syns) if !syns.is_empty() => Self::format_group(word, syns),
            _ => word.to_string(),
        }
    }

    /// Expands a whole whitespace-separated query: every word is expanded with
    /// [`expand_word`](Self::expand_word) and the results are joined with ` & `.
    pub fn expand_query(&self, query: &str) -> String {
        query
            .split_whitespace()
            .map(|word| self.expand_word(word))
            .collect::<Vec<_>>()
            .join(" & ")
    }

    /// Expands a Chinese query that has no word separators.
    ///
    /// A query of at most four characters that is itself a key is expanded as a whole.
    /// Otherwise the query is scanned left to right, greedily matching the longest known key
    /// at each position; characters that start no key are emitted one by one. All parts are
    /// joined with ` & `.
    ///
    /// Returns the original query unchanged when no key matched anywhere, so the caller can
    /// fall back to regular tokenization.
    pub fn expand_chinese_query(&self, query: &str) -> String {
        // Short query: try to match it as a whole first.
        if query.chars().count() <= 4 {
            if let Some(syns) = self.get_synonyms(query) {
                return Self::format_group(query, syns);
            }
        }

        // Longest keys first so the greedy scan prefers the longest match. Empty keys are
        // dropped: every string starts with "", so such a key would match forever without
        // consuming any input.
        let mut keys: Vec<&str> = self
            .map
            .keys()
            .map(String::as_str)
            .filter(|key| !key.is_empty())
            .collect();
        keys.sort_by_key(|key| std::cmp::Reverse(key.chars().count()));

        let mut expanded_parts = Vec::new();
        let mut remaining = query;
        let mut found_synonym = false;
        while let Some(first) = remaining.chars().next() {
            if let Some(key) = keys.iter().copied().find(|key| remaining.starts_with(*key)) {
                expanded_parts.push(self.expand_word(key));
                remaining = &remaining[key.len()..];
                found_synonym = true;
                continue;
            }
            // No key starts here: consume exactly one character and keep scanning.
            let (head, tail) = remaining.split_at(first.len_utf8());
            // Whitespace only separates terms; emitting it would leave an empty operand
            // between two `&` operators.
            if !first.is_whitespace() {
                expanded_parts.push(head.to_string());
            }
            remaining = tail;
        }

        if found_synonym {
            expanded_parts.join(" & ")
        } else {
            // Nothing matched: hand back the original query (it is tokenized later).
            query.to_string()
        }
    }

    /// Creates an expander with no synonym mappings.
    pub fn empty() -> Self {
        Self {
            map: HashMap::new(),
        }
    }
}
/// Global synonym expander (lazy-loaded).
///
/// Sources are tried in this order:
/// 1. `MOMENTRY_SYNONYM_FILES` — comma-separated list of JSON files;
/// 2. `MOMENTRY_SYNONYM_FILE` — a single JSON file (kept for backward compatibility);
/// 3. the built-in default expander.
static SYNONYM_EXPANDER: Lazy<SynonymExpander> = Lazy::new(|| {
    // Preferred source: the multi-file variable.
    if let Ok(raw_list) = env::var("MOMENTRY_SYNONYM_FILES") {
        let file_paths: Vec<&str> = raw_list
            .split(',')
            .map(str::trim)
            .filter(|entry| !entry.is_empty())
            .collect();
        if !file_paths.is_empty() {
            match SynonymExpander::from_files(&file_paths) {
                Ok(expander) => {
                    tracing::info!(
                        "Loaded synonym expander from {} files: {:?}",
                        file_paths.len(),
                        file_paths
                    );
                    return expander;
                }
                // A failure here is not final: fall through to the single-file variable.
                Err(e) => {
                    tracing::warn!(
                        "Failed to load synonym expander from files {:?}: {}",
                        file_paths,
                        e
                    );
                }
            }
        }
    }

    // Fallback: the single-file variable, then the default expander.
    match env::var("MOMENTRY_SYNONYM_FILE") {
        Ok(file_path) => match SynonymExpander::from_file(&file_path) {
            Ok(expander) => {
                tracing::info!("Loaded synonym expander from {}", file_path);
                expander
            }
            Err(e) => {
                tracing::warn!("Failed to load synonym expander from {}: {}", file_path, e);
                SynonymExpander::empty()
            }
        },
        Err(_) => SynonymExpander::from_default(),
    }
});
/// 獲取全局同義詞擴展器實例
pub fn global_synonym_expander() -> &'static SynonymExpander {
&SYNONYM_EXPANDER
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the two-entry expander shared by the expansion tests.
    fn sample_expander() -> SynonymExpander {
        let map: HashMap<String, Vec<String>> = [
            ("電腦", ["計算機", "微机"]),
            ("工作", ["任務", "作業"]),
        ]
        .iter()
        .map(|(word, syns)| {
            let owned: Vec<String> = syns.iter().map(|s| s.to_string()).collect();
            (word.to_string(), owned)
        })
        .collect();
        SynonymExpander { map }
    }

    #[test]
    fn test_expand_word() {
        let expander = sample_expander();
        assert_eq!(expander.expand_word("電腦"), "(電腦 | 計算機 | 微机)");
        assert_eq!(expander.expand_word("工作"), "(工作 | 任務 | 作業)");
        assert_eq!(expander.expand_word("未知"), "未知");
    }

    #[test]
    fn test_expand_query() {
        let expander = sample_expander();
        assert_eq!(
            expander.expand_query("電腦 工作"),
            "(電腦 | 計算機 | 微机) & (工作 | 任務 | 作業)"
        );
        assert_eq!(expander.expand_query("單個詞"), "單個詞");
        assert_eq!(expander.expand_query(""), "");
    }

    #[test]
    fn test_from_files_empty() {
        let no_paths: Vec<&str> = Vec::new();
        let expander = SynonymExpander::from_files(&no_paths).unwrap();
        assert!(expander.map.is_empty());
    }
}

121
src/core/text/tokenizer.rs Normal file
View File

@@ -0,0 +1,121 @@
use jieba_rs::Jieba;
use once_cell::sync::Lazy;
/// Shared Jieba segmenter, constructed with `Jieba::new` on first use.
static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
/// Reports whether `text` contains at least one Chinese character.
///
/// Covers CJK Unified Ideographs (U+4E00-U+9FFF) and CJK Extension A (U+3400-U+4DBF).
pub fn contains_chinese(text: &str) -> bool {
    for ch in text.chars() {
        if matches!(ch, '\u{4e00}'..='\u{9fff}' | '\u{3400}'..='\u{4dbf}') {
            return true;
        }
    }
    false
}
/// Segments text that contains Chinese into words and joins the words with single spaces.
/// Text without any Chinese character is returned unchanged.
///
/// # Examples
/// ```
/// use momentry_core::core::text::tokenizer::tokenize_chinese_text;
///
/// assert_eq!(tokenize_chinese_text("這是一個測試"), "這 是 一 個 測 試");
/// assert_eq!(tokenize_chinese_text("Hello world"), "Hello world");
/// assert_eq!(tokenize_chinese_text("中文English混合"), "中文 English 混合");
/// ```
pub fn tokenize_chinese_text(text: &str) -> String {
    if !contains_chinese(text) {
        return text.to_string();
    }
    // Segment with the second argument of `cut` set to `false`.
    // NOTE(review): in jieba-rs that parameter appears to be the HMM switch rather than a
    // cut-mode flag — confirm against the crate documentation.
    let segments = JIEBA.cut(text, false);
    segments.join(" ")
}
/// Extracts the text from a JSON payload and tokenizes it.
///
/// Two layouts are supported, tried in this order:
/// 1. `content->'data'->>'text'` (Chinese video format)
/// 2. `content->'text'` (English video format)
///
/// When neither location holds a string, the empty string is tokenized.
pub fn extract_and_tokenize_text(content: &serde_json::Value) -> String {
    let nested = content
        .get("data")
        .and_then(|data| data.get("text"))
        .and_then(|value| value.as_str());
    let raw_text = match nested {
        Some(text) => text,
        None => content
            .get("text")
            .and_then(|value| value.as_str())
            .unwrap_or(""),
    };
    tokenize_chinese_text(raw_text)
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_contains_chinese() {
        for sample in ["中文", "這是一個測試", "混合文本 English 中文"].iter() {
            assert!(contains_chinese(sample));
        }
        for sample in ["English only", "123", ""].iter() {
            assert!(!contains_chinese(sample));
        }
    }

    #[test]
    fn test_tokenize_chinese_text() {
        // Pure Chinese
        let pure_chinese = tokenize_chinese_text("這是一個測試");
        assert_eq!(pure_chinese, "這 是 一 個 測 試");
        // Pure English
        let pure_english = tokenize_chinese_text("Hello world");
        assert_eq!(pure_english, "Hello world");
        // Mixed Chinese and English
        let mixed = tokenize_chinese_text("中文English混合");
        assert_eq!(mixed, "中文 English 混合");
        // Empty string
        let empty = tokenize_chinese_text("");
        assert_eq!(empty, "");
        // Digits and punctuation
        let with_digits = tokenize_chinese_text("測試123。");
        assert_eq!(with_digits, "測 試 123 。");
    }

    #[test]
    fn test_extract_and_tokenize_text() {
        // Chinese layout: content->'data'->>'text'
        let nested = serde_json::json!({ "data": { "text": "這是一個測試" } });
        assert_eq!(extract_and_tokenize_text(&nested), "這 是 一 個 測 試");

        // English layout: content->'text'
        let flat = serde_json::json!({ "text": "Hello world" });
        assert_eq!(extract_and_tokenize_text(&flat), "Hello world");

        // Both present: data->text takes precedence
        let both = serde_json::json!({
            "data": { "text": "中文測試" },
            "text": "English text"
        });
        assert_eq!(extract_and_tokenize_text(&both), "中文 測 試");

        // No text at all
        let missing = serde_json::json!({});
        assert_eq!(extract_and_tokenize_text(&missing), "");

        // Text field that is not a string
        let non_string = serde_json::json!({ "data": { "text": 123 } });
        assert_eq!(extract_and_tokenize_text(&non_string), "");
    }
}