feat: backup architecture docs, source code, and scripts

2026-04-25 17:15:45 +08:00
parent 59809dae1f
commit 1f84e5469f
368 changed files with 146329 additions and 261 deletions
--- a/src/core/text/mod.rs
+++ b/src/core/text/mod.rs
@@ -0,0 +1,9 @@
+pub mod online_synonym_expander;
+pub mod synonym;
+pub mod synonym_expander;
+pub mod tokenizer;
+
+pub use online_synonym_expander::{global_online_expander, OnlineSynonymExpander};
+pub use synonym::{normalize_chinese_query, simplified_to_traditional, traditional_to_simplified};
+pub use synonym_expander::{global_synonym_expander, SynonymExpander};
+pub use tokenizer::{contains_chinese, extract_and_tokenize_text, tokenize_chinese_text};
--- a/src/core/text/online_synonym_expander.rs
+++ b/src/core/text/online_synonym_expander.rs
@@ -0,0 +1,242 @@
+use anyhow::{Context, Result};
+use once_cell::sync::Lazy;
+use serde::Deserialize;
+use std::collections::HashMap;
+use std::env;
+use std::sync::Arc;
+use tokio::sync::Mutex;
+
+// Online synonym expander.
+//
+// Fetches synonyms from an LLM (llama.cpp server) on demand and caches them.
+//
+// NOTE: this overview is intentionally a plain `//` comment. Written as `///`
+// it attaches to the next item (`LlmResponse`) even across a blank line, and
+// `//!` is not permitted here because `use` items precede it.
+//
+// Environment variables:
+// - `MOMENTRY_ONLINE_SYNONYM` - Enable online synonym expansion (disabled when unset)
+// - `MOMENTRY_LLM_SYNONYM_URL` - LLM server URL (default: http://127.0.0.1:8081)
+// - `MOMENTRY_LLM_SYNONYM_MODEL` - Model name (default: gemma4)
+// - `MOMENTRY_LLM_SYNONYM_TIMEOUT` - Request timeout in seconds (default: 60)
+// - `MOMENTRY_SYNONYM_FILE` - Optional local synonym JSON file loaded by the global expander
+
+/// Top-level JSON body returned by the `/v1/chat/completions` request.
+///
+/// Only the fields this module reads are declared.
+#[derive(Debug, Deserialize)]
+struct LlmResponse {
+    /// Completion candidates; `fetch_from_llm` reads the first entry.
+    choices: Vec<LlmChoice>,
+}
+
+/// One entry of the `choices` array in the LLM response.
+#[derive(Debug, Deserialize)]
+struct LlmChoice {
+    /// The message generated for this choice.
+    message: LlmMessage,
+}
+
+/// Message payload of a single LLM choice.
+#[derive(Debug, Deserialize)]
+struct LlmMessage {
+    /// Raw text produced by the model; expected to contain a JSON array of strings.
+    content: String,
+}
+
+/// Expands a search word into an OR-group of synonyms.
+///
+/// Lookups consult the file-backed `local_map` first, then the in-memory
+/// `runtime_cache`, and finally ask the configured LLM server.
+#[derive(Debug)]
+pub struct OnlineSynonymExpander {
+    /// Local synonym map (loaded once from a JSON file; empty if none was given or loading failed)
+    local_map: HashMap<String, Vec<String>>,
+    /// Runtime cache for LLM-fetched synonyms, keyed by the looked-up word
+    runtime_cache: Arc<Mutex<HashMap<String, Vec<String>>>>,
+    /// LLM server base URL (`/v1/chat/completions` is appended per request)
+    api_url: String,
+    /// Model name sent in the request payload
+    model: String,
+    /// Per-request timeout, in seconds
+    timeout_secs: u64,
+}
+
+/// System message sent with every synonym request.
+///
+/// It instructs the model to answer with a bare JSON array of same-language
+/// synonyms; `fetch_from_llm` still extracts the array defensively because
+/// models may wrap it in extra text.
+static SYSTEM_PROMPT: &str = r#"You are a synonym generation assistant. For each given word, provide 8-12 synonyms in the same language.
+Rules:
+1. Return ONLY a JSON array of strings, nothing else
+2. Synonyms should be contextually relevant for video content search
+3. Include common words, informal terms, and related concepts
+4. Do NOT include the input word in the output
+5. All synonyms must be in the SAME language as the input word
+6. No explanations, no markdown, just the JSON array
+
+Example input: "money"
+Example output: ["cash", "dollar", "currency", "funds", "bucks", "greenbacks", "coins", "wealth", "payment"]"#;
+
+impl OnlineSynonymExpander {
+    /// Creates an expander, optionally seeded from a local synonym JSON file.
+    ///
+    /// `local_file_path`, when given, must point at a JSON object mapping a
+    /// word to an array of synonyms. A file that cannot be read or parsed is
+    /// logged and treated as empty instead of failing construction.
+    ///
+    /// The server URL and model name come from `MOMENTRY_LLM_SYNONYM_URL` and
+    /// `MOMENTRY_LLM_SYNONYM_MODEL` (defaults: `http://127.0.0.1:8081`,
+    /// `gemma4`). The timeout comes from `MOMENTRY_LLM_SYNONYM_TIMEOUT` and
+    /// falls back to 60 seconds when unset or not a valid integer.
+    pub fn new(local_file_path: Option<&str>) -> Self {
+        let local_map = local_file_path
+            .map(|path| {
+                Self::load_local_file(path).unwrap_or_else(|e| {
+                    tracing::warn!("Failed to load local synonym file {}: {}", path, e);
+                    HashMap::new()
+                })
+            })
+            .unwrap_or_default();
+
+        let api_url = env::var("MOMENTRY_LLM_SYNONYM_URL")
+            .unwrap_or_else(|_| "http://127.0.0.1:8081".to_string());
+        let model = env::var("MOMENTRY_LLM_SYNONYM_MODEL").unwrap_or_else(|_| "gemma4".to_string());
+        let timeout_secs = env::var("MOMENTRY_LLM_SYNONYM_TIMEOUT")
+            .ok()
+            .and_then(|v| v.parse().ok())
+            .unwrap_or(60);
+
+        Self {
+            local_map,
+            runtime_cache: Arc::new(Mutex::new(HashMap::new())),
+            api_url,
+            model,
+            timeout_secs,
+        }
+    }
+
+    /// Reads `path` and parses it as a JSON object of `word -> [synonyms]`.
+    ///
+    /// # Errors
+    /// Fails when the file cannot be read or is not valid JSON of that shape.
+    fn load_local_file(path: &str) -> Result<HashMap<String, Vec<String>>> {
+        let content = std::fs::read_to_string(path).context("Failed to read local synonym file")?;
+        let map: HashMap<String, Vec<String>> =
+            serde_json::from_str(&content).context("Failed to parse local synonym JSON")?;
+        Ok(map)
+    }
+
+    /// Renders `word` and its synonyms as the OR-group `(word | syn1 | syn2)`.
+    fn format_expansion(word: &str, synonyms: &[String]) -> String {
+        let mut parts: Vec<&str> = Vec::with_capacity(synonyms.len() + 1);
+        parts.push(word);
+        parts.extend(synonyms.iter().map(String::as_str));
+        format!("({})", parts.join(" | "))
+    }
+
+    /// Returns `true` when `term` can be embedded in a tsquery expression as a
+    /// single operand: no whitespace and none of the query operator characters.
+    fn is_tsquery_safe(term: &str) -> bool {
+        !term.chars().any(|c| {
+            c.is_whitespace()
+                || matches!(
+                    c,
+                    '&' | '|' | '!' | '(' | ')' | ':' | '*' | '\'' | '<' | '>' | '\\'
+                )
+        })
+    }
+
+    /// Extracts and cleans the synonym list from the raw model output.
+    ///
+    /// The JSON array is taken from the first `[` to the last `]` *after* it,
+    /// so surrounding prose or markdown fences are tolerated and a stray `]`
+    /// before the array can no longer produce an inverted (panicking) slice.
+    /// Entries are trimmed and lowercased; empty entries, the input word
+    /// itself, duplicates and entries that are not tsquery-safe are dropped.
+    ///
+    /// # Errors
+    /// Fails when no array is found, the array is not valid JSON, or nothing
+    /// survives the cleaning step.
+    fn parse_synonyms(content: &str, word: &str) -> Result<Vec<String>> {
+        let json_str = content
+            .find('[')
+            .and_then(|start| {
+                content[start..]
+                    .rfind(']')
+                    .map(|offset| &content[start..=start + offset])
+            })
+            .context("No JSON array found in LLM response")?;
+
+        let raw: Vec<String> =
+            serde_json::from_str(json_str).context("Failed to parse LLM synonyms JSON")?;
+
+        let input = word.trim().to_lowercase();
+        let mut cleaned: Vec<String> = Vec::with_capacity(raw.len());
+        for candidate in raw {
+            let candidate = candidate.trim().to_lowercase();
+            let keep = !candidate.is_empty()
+                && candidate != input
+                && Self::is_tsquery_safe(&candidate)
+                // The list is a dozen entries at most, so a linear scan is fine.
+                && !cleaned.contains(&candidate);
+            if keep {
+                cleaned.push(candidate);
+            }
+        }
+
+        if cleaned.is_empty() {
+            anyhow::bail!("No valid synonyms returned");
+        }
+        Ok(cleaned)
+    }
+
+    /// Get synonyms for a word. Checks local map first, then runtime cache, then fetches from LLM.
+    ///
+    /// Returns `(word | syn1 | ...)` when synonyms are available and the
+    /// unchanged `word` otherwise. LLM failures are logged and never
+    /// propagated, so a search can always proceed with the original word.
+    pub async fn expand_word(&self, word: &str) -> String {
+        // 1. File-backed map.
+        if let Some(syns) = self.local_map.get(word).filter(|s| !s.is_empty()) {
+            return Self::format_expansion(word, syns);
+        }
+
+        // 2. Runtime cache. The guard lives only in this scope so the lock is
+        //    not held during the (slow) network round trip below.
+        {
+            let cache = self.runtime_cache.lock().await;
+            if let Some(syns) = cache.get(word).filter(|s| !s.is_empty()) {
+                return Self::format_expansion(word, syns);
+            }
+        }
+
+        // 3. LLM lookup; 4. fall back to the original word.
+        match self.fetch_from_llm(word).await {
+            Ok(synonyms) if !synonyms.is_empty() => {
+                let expansion = Self::format_expansion(word, &synonyms);
+                self.runtime_cache
+                    .lock()
+                    .await
+                    .insert(word.to_string(), synonyms);
+                expansion
+            }
+            Ok(_) => word.to_string(),
+            Err(e) => {
+                tracing::warn!("Online synonym lookup failed for '{}': {:#}", word, e);
+                word.to_string()
+            }
+        }
+    }
+
+    /// Requests synonyms for `word` from the chat-completions endpoint.
+    ///
+    /// # Errors
+    /// Fails on transport errors or timeout, a non-success HTTP status, a
+    /// response body of unexpected shape, or when no usable synonym remains
+    /// after cleaning (see `parse_synonyms`).
+    async fn fetch_from_llm(&self, word: &str) -> Result<Vec<String>> {
+        let client = reqwest::Client::new();
+
+        let prompt = format!(
+            r#"Give synonyms for: "{}"
+Return ONLY a JSON array of strings, nothing else. Do NOT include the input word."#,
+            word
+        );
+
+        let payload = serde_json::json!({
+            "model": self.model,
+            "messages": [
+                {
+                    "role": "system",
+                    "content": SYSTEM_PROMPT
+                },
+                {
+                    "role": "user",
+                    "content": prompt
+                }
+            ],
+            "temperature": 0.3,
+            "stream": false,
+            "max_tokens": 256,
+        });
+
+        let response = client
+            .post(format!("{}/v1/chat/completions", self.api_url))
+            .json(&payload)
+            .timeout(std::time::Duration::from_secs(self.timeout_secs))
+            .send()
+            .await
+            .context("LLM request failed")?;
+
+        if !response.status().is_success() {
+            anyhow::bail!("LLM request failed with status: {}", response.status());
+        }
+
+        let llm_resp: LlmResponse = response
+            .json()
+            .await
+            .context("Failed to parse LLM response")?;
+
+        let content = llm_resp
+            .choices
+            .first()
+            .context("No choices in LLM response")?
+            .message
+            .content
+            .as_str();
+
+        let cleaned = Self::parse_synonyms(content, word)?;
+
+        tracing::info!(
+            "LLM fetched {} synonyms for '{}': {:?}",
+            cleaned.len(),
+            word,
+            cleaned.iter().take(5).collect::<Vec<_>>()
+        );
+
+        Ok(cleaned)
+    }
+
+    /// Get the number of words currently held in the runtime (LLM) cache
+    pub async fn cache_size(&self) -> usize {
+        self.runtime_cache.lock().await.len()
+    }
+}
+
+/// Global online synonym expander (lazy-loaded)
+///
+/// Built on first access. The feature is an opt-in switch controlled by
+/// `MOMENTRY_ONLINE_SYNONYM`: it stays off when the variable is unset, empty,
+/// or one of `0`, `false`, `no`, `off` (case-insensitive); any other value
+/// turns it on. Previously the mere presence of the variable enabled it, so
+/// `MOMENTRY_ONLINE_SYNONYM=false` switched the feature *on*.
+///
+/// When enabled, `MOMENTRY_SYNONYM_FILE` (if set) seeds the local synonym map.
+static ONLINE_EXPANDER: Lazy<Option<OnlineSynonymExpander>> = Lazy::new(|| {
+    let enabled = env::var("MOMENTRY_ONLINE_SYNONYM")
+        .map(|raw| {
+            !matches!(
+                raw.trim().to_ascii_lowercase().as_str(),
+                "" | "0" | "false" | "no" | "off"
+            )
+        })
+        .unwrap_or(false);
+    if !enabled {
+        return None;
+    }
+
+    let local_file = env::var("MOMENTRY_SYNONYM_FILE").ok();
+    tracing::info!("Initializing online synonym expander");
+    Some(OnlineSynonymExpander::new(local_file.as_deref()))
+});
+
+/// Returns the process-wide online synonym expander, or `None` when the
+/// feature was not enabled at first access.
+pub fn global_online_expander() -> Option<&'static OnlineSynonymExpander> {
+    let slot: &'static Option<OnlineSynonymExpander> = Lazy::force(&ONLINE_EXPANDER);
+    slot.as_ref()
+}
--- a/src/core/text/synonym.rs
+++ b/src/core/text/synonym.rs
@@ -0,0 +1,71 @@
+use ferrous_opencc::{config::BuiltinConfig, OpenCC};
+use once_cell::sync::Lazy;
+
+/// Shared OpenCC converter built from the built-in `S2t` configuration
+/// (Simplified -> Traditional).
+///
+/// Constructed on first use; panics with the message below if the built-in
+/// configuration cannot be loaded.
+static OPENCC_S2T: Lazy<OpenCC> = Lazy::new(|| {
+    OpenCC::from_config(BuiltinConfig::S2t)
+        .expect("Failed to initialize OpenCC Simplified to Traditional converter")
+});
+
+/// Shared OpenCC converter built from the built-in `T2s` configuration
+/// (Traditional -> Simplified).
+///
+/// Constructed on first use; panics with the message below if the built-in
+/// configuration cannot be loaded.
+static OPENCC_T2S: Lazy<OpenCC> = Lazy::new(|| {
+    OpenCC::from_config(BuiltinConfig::T2s)
+        .expect("Failed to initialize OpenCC Traditional to Simplified converter")
+});
+
+/// Returns `text` converted from Simplified to Traditional Chinese using the
+/// shared `OPENCC_S2T` converter.
+pub fn simplified_to_traditional(text: &str) -> String {
+    let converter: &OpenCC = &OPENCC_S2T;
+    converter.convert(text)
+}
+
+/// Returns `text` converted from Traditional to Simplified Chinese using the
+/// shared `OPENCC_T2S` converter.
+pub fn traditional_to_simplified(text: &str) -> String {
+    let converter: &OpenCC = &OPENCC_T2S;
+    converter.convert(text)
+}
+
+/// Normalizes a Chinese search query before it is matched against stored text.
+///
+/// The query is converted from Simplified to Traditional Chinese (the
+/// database is assumed to store Traditional text) and the converted string is
+/// returned.
+pub fn normalize_chinese_query(text: &str) -> String {
+    // Same conversion as `simplified_to_traditional`, applied directly.
+    OPENCC_S2T.convert(text)
+}
+
+/// Unit tests for the Simplified/Traditional conversion helpers.
+///
+/// The assertions are deliberately loose (non-empty / changed / unchanged)
+/// so they do not depend on the exact phrase dictionary shipped with OpenCC.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Simplified input must change; Traditional input must pass through.
+    #[test]
+    fn test_simplified_to_traditional() {
+        // Example: Simplified "计算机" -> Traditional "計算機"
+        let simplified = "计算机";
+        let traditional = simplified_to_traditional(simplified);
+        // The exact output depends on the dictionary, so only check that the
+        // result is non-empty and differs from the input.
+        assert!(!traditional.is_empty());
+        assert_ne!(traditional, simplified);
+
+        // Input that is already Traditional should come back unchanged.
+        let traditional_input = "計算機";
+        let converted = simplified_to_traditional(traditional_input);
+        assert_eq!(converted, traditional_input);
+    }
+
+    /// Traditional input must be rewritten to something different.
+    #[test]
+    fn test_traditional_to_simplified() {
+        let traditional = "計算機";
+        let simplified = traditional_to_simplified(traditional);
+        assert!(!simplified.is_empty());
+        assert_ne!(simplified, traditional);
+    }
+
+    /// Query normalization converts Simplified input and leaves Traditional input as is.
+    #[test]
+    fn test_normalize_chinese_query() {
+        let simplified = "计算机";
+        let normalized = normalize_chinese_query(simplified);
+        // Simplified input is converted, so the output differs.
+        assert_ne!(normalized, simplified);
+
+        let traditional = "計算機";
+        let normalized2 = normalize_chinese_query(traditional);
+        // Traditional input is already in the target form.
+        assert_eq!(normalized2, traditional);
+    }
+}
--- a/src/core/text/synonym_expander.rs
+++ b/src/core/text/synonym_expander.rs
@@ -0,0 +1,247 @@
+use anyhow::{Context, Result};
+use once_cell::sync::Lazy;
+use std::collections::HashMap;
+use std::env;
+use std::fs;
+use std::path::Path;
+
+/// Synonym expander.
+///
+/// Holds a custom word -> synonyms mapping loaded from JSON files.
+#[derive(Debug, Clone, Default)]
+pub struct SynonymExpander {
+    /// Mapping from a word to its list of synonyms
+    map: HashMap<String, Vec<String>>,
+}
+
+impl SynonymExpander {
+    /// Reads one synonym file and parses it as a JSON object of
+    /// `word -> [synonyms]`. Errors name the offending path.
+    fn load_map(path: &Path) -> Result<HashMap<String, Vec<String>>> {
+        let content = fs::read_to_string(path)
+            .with_context(|| format!("Failed to read synonym file: {:?}", path))?;
+        serde_json::from_str(&content)
+            .with_context(|| format!("Failed to parse synonym JSON from {:?}", path))
+    }
+
+    /// Renders `word` and its synonyms as the OR-group `(word | syn1 | syn2)`.
+    fn or_group(word: &str, synonyms: &[String]) -> String {
+        let mut parts: Vec<&str> = Vec::with_capacity(synonyms.len() + 1);
+        parts.push(word);
+        parts.extend(synonyms.iter().map(String::as_str));
+        format!("({})", parts.join(" | "))
+    }
+
+    /// Creates a synonym expander from a single JSON file.
+    ///
+    /// # Errors
+    /// Fails when the file cannot be read or is not a JSON object mapping
+    /// strings to arrays of strings.
+    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
+        Ok(Self {
+            map: Self::load_map(path.as_ref())?,
+        })
+    }
+
+    /// Creates a synonym expander from several JSON files. Files are merged
+    /// in order, so an entry in a later file replaces the same key from an
+    /// earlier one. An empty `paths` slice yields an empty expander.
+    ///
+    /// # Errors
+    /// Fails on the first file that cannot be read or parsed.
+    pub fn from_files<P: AsRef<Path>>(paths: &[P]) -> Result<Self> {
+        let mut combined_map = HashMap::new();
+
+        for path in paths {
+            // `extend` overwrites existing keys: later files win.
+            combined_map.extend(Self::load_map(path.as_ref())?);
+        }
+
+        Ok(Self { map: combined_map })
+    }
+
+    /// Creates the built-in default expander. The default mapping is empty;
+    /// users add custom synonyms through configuration files.
+    pub fn from_default() -> Self {
+        Self::empty()
+    }
+
+    /// Returns the synonym list registered for `word`, if any.
+    pub fn get_synonyms(&self, word: &str) -> Option<&[String]> {
+        self.map.get(word).map(Vec::as_slice)
+    }
+
+    /// Expands a single word into `(word | syn1 | syn2 ...)`.
+    /// Returns the word unchanged when it has no (or an empty) synonym list.
+    pub fn expand_word(&self, word: &str) -> String {
+        match self.get_synonyms(word) {
+            Some(syns) if !syns.is_empty() => Self::or_group(word, syns),
+            _ => word.to_string(),
+        }
+    }
+
+    /// Expands a whole whitespace-separated query: every word is expanded
+    /// with [`Self::expand_word`] and the results are joined with ` & `.
+    pub fn expand_query(&self, query: &str) -> String {
+        query
+            .split_whitespace()
+            .map(|word| self.expand_word(word))
+            .collect::<Vec<_>>()
+            .join(" & ")
+    }
+
+    /// Expands a Chinese query that has no word separators.
+    ///
+    /// A short query (at most four characters) that is itself a key is
+    /// expanded directly. Otherwise the query is scanned left to right: at
+    /// each position the longest key that prefixes the remaining text is
+    /// expanded, and characters no key matches are emitted one by one. The
+    /// pieces are joined with ` & `.
+    ///
+    /// If no key matches anywhere, the original query is returned untouched
+    /// so the caller can tokenize it later.
+    pub fn expand_chinese_query(&self, query: &str) -> String {
+        // Fast path: the whole (short) query is a known key.
+        if query.chars().count() <= 4 {
+            if let Some(syns) = self.get_synonyms(query) {
+                return Self::or_group(query, syns);
+            }
+        }
+
+        // Candidate keys, longest first, so the first prefix hit is the
+        // longest match. Empty keys are excluded: `starts_with("")` is always
+        // true and consumes nothing, which previously made the loop below
+        // spin forever when the synonym file contained an empty key.
+        let mut keys: Vec<&str> = self
+            .map
+            .keys()
+            .map(String::as_str)
+            .filter(|key| !key.is_empty())
+            .collect();
+        keys.sort_by_key(|key| std::cmp::Reverse(key.chars().count()));
+
+        let mut expanded_parts: Vec<String> = Vec::new();
+        let mut remaining = query;
+        let mut found_synonym = false;
+
+        while let Some(first_char) = remaining.chars().next() {
+            if let Some(key) = keys.iter().copied().find(|key| remaining.starts_with(*key)) {
+                expanded_parts.push(self.expand_word(key));
+                remaining = &remaining[key.len()..];
+                found_synonym = true;
+                continue;
+            }
+
+            // No key starts here: consume exactly one character.
+            let (head, tail) = remaining.split_at(first_char.len_utf8());
+            // Whitespace only separates terms; emitting it would leave an
+            // empty operand between two `&` in the joined expression.
+            if !first_char.is_whitespace() {
+                expanded_parts.push(head.to_string());
+            }
+            remaining = tail;
+        }
+
+        if found_synonym {
+            expanded_parts.join(" & ")
+        } else {
+            // Nothing matched: hand back the original query for later tokenization.
+            query.to_string()
+        }
+    }
+
+    /// Creates an expander with no synonym mappings.
+    pub fn empty() -> Self {
+        Self::default()
+    }
+}
+
+/// Process-wide synonym expander, built on first access from the environment.
+///
+/// Resolution order:
+/// 1. `MOMENTRY_SYNONYM_FILES` - comma-separated list of JSON files, merged in order.
+/// 2. `MOMENTRY_SYNONYM_FILE` - a single JSON file (kept for backward compatibility).
+/// 3. Neither is set: `SynonymExpander::from_default()`.
+///
+/// A load failure in step 1 is logged and falls through to step 2; a load
+/// failure in step 2 is logged and yields an empty expander.
+static SYNONYM_EXPANDER: Lazy<SynonymExpander> = Lazy::new(|| {
+    // An unset variable behaves like an empty list: every entry is filtered out.
+    let files_var = env::var("MOMENTRY_SYNONYM_FILES").unwrap_or_default();
+    let file_paths: Vec<&str> = files_var
+        .split(',')
+        .map(str::trim)
+        .filter(|entry| !entry.is_empty())
+        .collect();
+
+    if !file_paths.is_empty() {
+        match SynonymExpander::from_files(&file_paths) {
+            Ok(expander) => {
+                tracing::info!(
+                    "Loaded synonym expander from {} files: {:?}",
+                    file_paths.len(),
+                    file_paths
+                );
+                return expander;
+            }
+            // Not fatal: fall through to the single-file variable / default.
+            Err(e) => tracing::warn!(
+                "Failed to load synonym expander from files {:?}: {}",
+                file_paths,
+                e
+            ),
+        }
+    }
+
+    match env::var("MOMENTRY_SYNONYM_FILE") {
+        Err(_) => SynonymExpander::from_default(),
+        Ok(file_path) => match SynonymExpander::from_file(&file_path) {
+            Err(e) => {
+                tracing::warn!("Failed to load synonym expander from {}: {}", file_path, e);
+                SynonymExpander::empty()
+            }
+            Ok(expander) => {
+                tracing::info!("Loaded synonym expander from {}", file_path);
+                expander
+            }
+        },
+    }
+});
+
+/// Returns the process-wide synonym expander, initializing it on first call.
+pub fn global_synonym_expander() -> &'static SynonymExpander {
+    Lazy::force(&SYNONYM_EXPANDER)
+}
+
+/// Unit tests for `SynonymExpander` using small in-memory maps.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Known words become OR-groups; unknown words are returned unchanged.
+    #[test]
+    fn test_expand_word() {
+        let mut map = HashMap::new();
+        map.insert(
+            "電腦".to_string(),
+            vec!["計算機".to_string(), "微机".to_string()],
+        );
+        map.insert(
+            "工作".to_string(),
+            vec!["任務".to_string(), "作業".to_string()],
+        );
+        let expander = SynonymExpander { map };
+
+        assert_eq!(expander.expand_word("電腦"), "(電腦 | 計算機 | 微机)");
+        assert_eq!(expander.expand_word("工作"), "(工作 | 任務 | 作業)");
+        assert_eq!(expander.expand_word("未知"), "未知");
+    }
+
+    /// Whitespace-separated words are expanded individually and AND-ed together.
+    #[test]
+    fn test_expand_query() {
+        let mut map = HashMap::new();
+        map.insert(
+            "電腦".to_string(),
+            vec!["計算機".to_string(), "微机".to_string()],
+        );
+        map.insert(
+            "工作".to_string(),
+            vec!["任務".to_string(), "作業".to_string()],
+        );
+        let expander = SynonymExpander { map };
+
+        assert_eq!(
+            expander.expand_query("電腦 工作"),
+            "(電腦 | 計算機 | 微机) & (工作 | 任務 | 作業)"
+        );
+        // A single unknown word and the empty query pass through unchanged.
+        assert_eq!(expander.expand_query("單個詞"), "單個詞");
+        assert_eq!(expander.expand_query(""), "");
+    }
+
+    /// Loading from an empty list of files succeeds with an empty map.
+    #[test]
+    fn test_from_files_empty() {
+        let paths: Vec<&str> = vec![];
+        let expander = SynonymExpander::from_files(&paths).unwrap();
+        assert!(expander.map.is_empty());
+    }
+}
--- a/src/core/text/tokenizer.rs
+++ b/src/core/text/tokenizer.rs
@@ -0,0 +1,121 @@
+use jieba_rs::Jieba;
+use once_cell::sync::Lazy;
+
+/// Shared jieba segmenter, constructed with `Jieba::new` on first use.
+static JIEBA: Lazy<Jieba> = Lazy::new(Jieba::new);
+
+/// Reports whether `text` contains at least one Chinese character.
+///
+/// A character counts as Chinese when it lies in CJK Unified Ideographs
+/// (U+4E00-U+9FFF) or CJK Extension A (U+3400-U+4DBF).
+pub fn contains_chinese(text: &str) -> bool {
+    for ch in text.chars() {
+        if matches!(ch, '\u{3400}'..='\u{4dbf}' | '\u{4e00}'..='\u{9fff}') {
+            return true;
+        }
+    }
+    false
+}
+
+/// Segments text that contains Chinese and joins the tokens with single spaces.
+/// Text without any Chinese character is returned unchanged.
+///
+/// # Examples
+/// ```
+/// use momentry_core::core::text::tokenizer::tokenize_chinese_text;
+///
+/// assert_eq!(tokenize_chinese_text("這是一個測試"), "這 是 一 個 測 試");
+/// assert_eq!(tokenize_chinese_text("Hello world"), "Hello world");
+/// assert_eq!(tokenize_chinese_text("中文English混合"), "中文 English 混合");
+/// ```
+pub fn tokenize_chinese_text(text: &str) -> String {
+    if !contains_chinese(text) {
+        return text.to_string();
+    }
+
+    // The second argument of `Jieba::cut` is its `hmm` flag; `false` turns off
+    // HMM-based discovery of words missing from the dictionary.
+    JIEBA.cut(text, false).join(" ")
+}
+
+/// Extracts the text of a JSON content value and tokenizes it.
+///
+/// Two layouts are supported, tried in this order:
+/// 1. `content->'data'->>'text'` (Chinese video format)
+/// 2. `content->'text'` (English video format)
+///
+/// A missing or non-string value at both locations is treated as empty text.
+pub fn extract_and_tokenize_text(content: &serde_json::Value) -> String {
+    let nested = content
+        .get("data")
+        .and_then(|data| data.get("text"))
+        .and_then(|value| value.as_str());
+
+    let raw_text = match nested {
+        Some(text) => text,
+        // Fall back to the top-level field when the nested one is unusable.
+        None => content
+            .get("text")
+            .and_then(|value| value.as_str())
+            .unwrap_or(""),
+    };
+
+    tokenize_chinese_text(raw_text)
+}
+
+/// Unit tests for Chinese detection, segmentation and JSON text extraction.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Detection is true only when a character from the supported CJK ranges is present.
+    #[test]
+    fn test_contains_chinese() {
+        assert!(contains_chinese("中文"));
+        assert!(contains_chinese("這是一個測試"));
+        assert!(contains_chinese("混合文本 English 中文"));
+        assert!(!contains_chinese("English only"));
+        assert!(!contains_chinese("123"));
+        assert!(!contains_chinese(""));
+    }
+
+    /// Segmentation output for Chinese, English, mixed, empty and punctuated input.
+    #[test]
+    fn test_tokenize_chinese_text() {
+        // Pure Chinese
+        assert_eq!(tokenize_chinese_text("這是一個測試"), "這 是 一 個 測 試");
+
+        // Pure English
+        assert_eq!(tokenize_chinese_text("Hello world"), "Hello world");
+
+        // Mixed Chinese and English
+        assert_eq!(
+            tokenize_chinese_text("中文English混合"),
+            "中文 English 混合"
+        );
+
+        // Empty string
+        assert_eq!(tokenize_chinese_text(""), "");
+
+        // Digits and punctuation
+        assert_eq!(tokenize_chinese_text("測試123。"), "測 試 123 。");
+    }
+
+    /// Extraction prefers `data.text`, falls back to `text`, and tolerates bad input.
+    #[test]
+    fn test_extract_and_tokenize_text() {
+        // Chinese format: content->'data'->>'text'
+        let content1 = serde_json::json!({
+            "data": {
+                "text": "這是一個測試"
+            }
+        });
+        assert_eq!(extract_and_tokenize_text(&content1), "這 是 一 個 測 試");
+
+        // English format: content->'text'
+        let content2 = serde_json::json!({
+            "text": "Hello world"
+        });
+        assert_eq!(extract_and_tokenize_text(&content2), "Hello world");
+
+        // Both present: data->text takes priority
+        let content3 = serde_json::json!({
+            "data": {
+                "text": "中文測試"
+            },
+            "text": "English text"
+        });
+        assert_eq!(extract_and_tokenize_text(&content3), "中文 測 試");
+
+        // No text at all
+        let content4 = serde_json::json!({});
+        assert_eq!(extract_and_tokenize_text(&content4), "");
+
+        // Text field that is not a string
+        let content5 = serde_json::json!({
+            "data": {
+                "text": 123
+            }
+        });
+        assert_eq!(extract_and_tokenize_text(&content5), "");
+    }
+}