#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Language routing tool.

Routes a detected language code to the matching synonym dictionary file,
falling back to related or configured languages when no direct match exists.
"""

import argparse
import copy
import json
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional


class LanguageRouter:
    """Map detected language codes onto configured synonym dictionaries."""

    # Regional variants known for each primary language subtag (lower-case).
    _VARIANT_MAPPING: Dict[str, List[str]] = {
        "zh": ["zh-CN", "zh-TW", "zh-HK", "zh-SG", "zh-MO"],
        "en": ["en-US", "en-GB", "en-CA", "en-AU", "en-NZ"],
        "ja": ["ja-JP"],
        "ko": ["ko-KR"],
        "fr": ["fr-FR", "fr-CA", "fr-BE", "fr-CH"],
        "de": ["de-DE", "de-AT", "de-CH"],
        "es": ["es-ES", "es-MX", "es-AR", "es-CO"],
        "pt": ["pt-BR", "pt-PT"],
        "ru": ["ru-RU"],
        "ar": ["ar-SA", "ar-EG", "ar-AE"],
    }

    def __init__(self, config_file: Optional[str] = None):
        """Initialise the router.

        Args:
            config_file: Optional path to a JSON configuration file. When it
                is omitted or cannot be loaded, the built-in defaults apply.
        """
        self.config = self.load_config(config_file)
        self.language_mappings = self.config.get("language_mappings", {})
        self.default_language = self.config.get("default_language", "zh-CN")
        self.fallback_language = self.config.get("fallback_language", "en-US")

    def load_config(self, config_file: Optional[str]) -> Dict[str, Any]:
        """Load the configuration, layering user settings over the defaults.

        If the file's top-level object has a ``language_routing`` key, only
        that section is used. Loading is best-effort: on any read or parse
        problem a warning is written to stderr and the defaults are returned.

        Args:
            config_file: Path to a JSON configuration file, or None/empty.

        Returns:
            The effective configuration dictionary.
        """
        default_config = {
            "default_language": "zh-CN",
            "fallback_language": "en-US",
            "language_mappings": {
                "zh-CN": {
                    "synonym_file": "synonyms_zh_CN.json",
                    "description": "簡體中文同義詞庫",
                },
                "zh-TW": {
                    "synonym_file": "synonyms_zh_TW.json",
                    "description": "繁體中文同義詞庫",
                },
                "en-US": {
                    "synonym_file": "synonyms_en_US.json",
                    "description": "美式英文同義詞庫",
                },
                "ja-JP": {
                    "synonym_file": "synonyms_ja_JP.json",
                    "description": "日文同義詞庫",
                },
                "ko-KR": {
                    "synonym_file": "synonyms_ko_KR.json",
                    "description": "韓文同義詞庫",
                },
            },
            "cross_language_fallback": {
                "enabled": True,
                "fallback_order": ["zh-CN", "zh-TW", "en-US", "ja-JP", "ko-KR"],
            },
        }

        if not config_file:
            return default_config

        try:
            with open(config_file, "r", encoding="utf-8") as f:
                user_config = json.load(f)

            # The routing settings may be nested inside a larger config file.
            if isinstance(user_config, dict) and "language_routing" in user_config:
                user_config = user_config["language_routing"]

            # Only a JSON object can be merged over the defaults.
            if not isinstance(user_config, dict):
                raise ValueError("配置內容必須是 JSON 物件")

            return self.deep_merge(default_config, user_config)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            print(f"警告: 無法加載配置文件 {config_file}: {e}", file=sys.stderr)
            print("使用默認配置", file=sys.stderr)
            return default_config

    def deep_merge(self, base: Dict, update: Dict) -> Dict:
        """Recursively merge ``update`` into a copy of ``base``.

        Where both sides hold a dict under the same key the dicts are merged
        recursively; otherwise the value from ``update`` wins. Neither input
        is modified, and the result shares no mutable objects with them.

        Args:
            base: Dictionary providing the default values.
            update: Dictionary whose values take precedence.

        Returns:
            A new, independent merged dictionary.
        """
        # Deep copy so that later mutation of the result cannot leak back
        # into ``base`` through shared nested containers.
        result = copy.deepcopy(base)
        for key, value in update.items():
            if isinstance(result.get(key), dict) and isinstance(value, dict):
                result[key] = self.deep_merge(result[key], value)
            else:
                result[key] = copy.deepcopy(value)
        return result

    @staticmethod
    def _normalize_code(lang_code: Any) -> str:
        """Return a comparison key for a language code ('' if not a string).

        Underscore separators are treated like hyphens and case is ignored,
        so ``en_US``, ``EN-us`` and ``en-US`` all compare equal.
        """
        if not isinstance(lang_code, str):
            return ""
        return lang_code.strip().replace("_", "-").lower()

    def _route_to(
        self, result: Dict[str, Any], language: str, reason: Optional[str] = None
    ) -> Dict[str, Any]:
        """Fill ``result`` for ``language``; a ``reason`` marks it a fallback."""
        result["routed_language"] = language
        # .get: an entry without "synonym_file" yields None instead of raising.
        result["synonym_file"] = self.language_mappings[language].get("synonym_file")
        if reason is not None:
            result["fallback_used"] = True
            result["fallback_reason"] = reason
        return result

    def route_language(
        self, detected_lang: str, confidence: float = 0.0
    ) -> Dict[str, Any]:
        """Route a detected language to a configured synonym dictionary.

        Candidates are tried in this order: exact match, the same code in a
        different spelling (case / ``_`` vs ``-``), regional variants of the
        same primary language, the cross-language fallback order (if
        enabled), the default language, and finally the fallback language.

        Args:
            detected_lang: Detected language code, e.g. ``zh-TW`` or ``en``.
            confidence: Detection confidence; reported back unchanged.

        Returns:
            A dictionary with ``detected_language``, ``confidence``,
            ``routed_language``, ``synonym_file``, ``fallback_used`` and
            ``available_languages``. ``fallback_reason`` is added when a
            fallback was used, and ``error`` when nothing could be routed.
        """
        result = {
            "detected_language": detected_lang,
            "confidence": confidence,
            "routed_language": None,
            "synonym_file": None,
            "fallback_used": False,
            "available_languages": list(self.language_mappings.keys()),
        }

        # Exact match against the configured mappings.
        if detected_lang in self.language_mappings:
            return self._route_to(result, detected_lang)

        # Same language written differently (e.g. "zh_tw" for "zh-TW"); this
        # is still a direct hit, so it is not reported as a fallback.
        wanted = self._normalize_code(detected_lang)
        if wanted:
            for known in self.language_mappings:
                if self._normalize_code(known) == wanted:
                    return self._route_to(result, known)

        # Regional variants sharing the same primary language.
        for variant in self.get_language_variants(detected_lang):
            if variant in self.language_mappings:
                return self._route_to(
                    result, variant, f"使用變體 {variant} 替代 {detected_lang}"
                )

        # Cross-language fallback; "or {}" tolerates a null config entry.
        cross_fallback = self.config.get("cross_language_fallback") or {}
        if cross_fallback.get("enabled", True):
            for fallback_lang in cross_fallback.get("fallback_order") or []:
                if fallback_lang in self.language_mappings:
                    return self._route_to(
                        result, fallback_lang, f"使用跨語言回退到 {fallback_lang}"
                    )

        # Configured default language.
        if self.default_language in self.language_mappings:
            return self._route_to(
                result,
                self.default_language,
                f"使用默認語言 {self.default_language}",
            )

        # Configured last-resort language.
        if self.fallback_language in self.language_mappings:
            return self._route_to(
                result,
                self.fallback_language,
                f"使用回退語言 {self.fallback_language}",
            )

        # Nothing usable is configured.
        result["error"] = "沒有可用的語言映射"
        return result

    def get_language_variants(self, lang_code: str) -> List[str]:
        """Return the known regional variants for a language code.

        Only the primary language subtag is considered; it is matched
        case-insensitively and ``_`` is accepted as a separator, so ``EN``,
        ``en_GB`` and ``en-AU`` all yield the English variants.

        Args:
            lang_code: Language code with or without a region part.

        Returns:
            A new list of variant codes; empty when the language is unknown.
        """
        primary = self._normalize_code(lang_code).split("-", 1)[0]
        # Copy so callers cannot mutate the shared class-level table.
        return list(self._VARIANT_MAPPING.get(primary, ()))

    def get_synonym_file_path(
        self, routed_result: Dict[str, Any], base_dir: str = "."
    ) -> Optional[Path]:
        """Locate the synonym file named in a routing result.

        The file is looked up directly under ``base_dir`` first and then in
        the ``synonyms``, ``data/synonyms``, ``config/synonyms`` and
        ``../synonyms`` directories relative to it.

        Args:
            routed_result: Result dictionary produced by ``route_language``.
            base_dir: Directory the search starts from.

        Returns:
            Path of the first existing regular file, or None when the result
            names no file or the file cannot be found.
        """
        file_name = routed_result.get("synonym_file")
        if not file_name:
            return None

        base = Path(base_dir)
        candidates = [
            base / file_name,
            base / "synonyms" / file_name,
            base / "data" / "synonyms" / file_name,
            base / "config" / "synonyms" / file_name,
            base / ".." / "synonyms" / file_name,
        ]
        for candidate in candidates:
            # is_file() rather than exists(): a directory is not a usable file.
            if candidate.is_file():
                return candidate
        return None


def main():
    """Command-line entry point: route a language code and print the result."""
    parser = argparse.ArgumentParser(description="語言路由工具")
    parser.add_argument("language", help="檢測到的語言代碼")
    parser.add_argument(
        "-c", "--confidence", type=float, default=0.0, help="檢測置信度"
    )
    parser.add_argument("-j", "--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument("-v", "--verbose", action="store_true", help="詳細輸出")
    parser.add_argument("--config", help="配置文件路徑")
    parser.add_argument("--base-dir", default=".", help="基礎目錄路徑")

    args = parser.parse_args()

    router = LanguageRouter(args.config)
    result = router.route_language(args.language, args.confidence)

    # Resolve the synonym file on disk and record the outcome in the result.
    file_path = router.get_synonym_file_path(result, args.base_dir)
    result["file_path"] = str(file_path) if file_path else None
    result["file_exists"] = file_path is not None and file_path.exists()

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    elif args.verbose:
        print("語言路由結果:")
        print(f" 檢測到的語言: {result['detected_language']}")
        print(f" 置信度: {result['confidence']:.2%}")
        print(f" 路由到的語言: {result['routed_language']}")
        print(f" 同義詞檔案: {result['synonym_file']}")
        print(f" 檔案路徑: {result['file_path']}")
        print(f" 檔案存在: {result['file_exists']}")

        if result.get("fallback_used"):
            print(" 使用了回退: 是")
            print(f" 回退原因: {result.get('fallback_reason', '未知')}")
        else:
            print(" 使用了回退: 否")

        print(f" 可用語言: {', '.join(result['available_languages'])}")
    elif result["file_exists"]:
        print(f"{result['routed_language']}:{result['synonym_file']}")
    else:
        print(
            f"{result['routed_language']}:{result['synonym_file']} (檔案不存在)"
        )


if __name__ == "__main__":
    main()