- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
316 lines
11 KiB
Python
316 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
語言路由工具
|
|
根據語言檢測結果路由到相應的同義詞庫
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import argparse
|
|
from typing import Dict, List, Optional, Any
|
|
from pathlib import Path
|
|
|
|
|
|
class LanguageRouter:
    """Route a detected language code to the synonym dictionary that serves it.

    :meth:`route_language` resolves a code in this order:

    1. a mapping entry for the detected code itself (exact match first, then
       a match that ignores letter case and ``_`` vs ``-``),
    2. a regional variant of the same primary language,
    3. the configured cross-language fallback order (when enabled),
    4. ``default_language``,
    5. ``fallback_language``.

    Attributes:
        config: Effective configuration (built-in defaults merged with the
            optional user file).
        language_mappings: Language code -> entry dict carrying the
            ``synonym_file`` name.
        default_language: Code tried after the cross-language fallback.
        fallback_language: Code tried last.
    """

    # Regional variants tried, in order, when the detected code has no
    # mapping entry of its own. Keyed by lower-case primary language subtag.
    _VARIANT_MAPPING = {
        "zh": ("zh-CN", "zh-TW", "zh-HK", "zh-SG", "zh-MO"),
        "en": ("en-US", "en-GB", "en-CA", "en-AU", "en-NZ"),
        "ja": ("ja-JP",),
        "ko": ("ko-KR",),
        "fr": ("fr-FR", "fr-CA", "fr-BE", "fr-CH"),
        "de": ("de-DE", "de-AT", "de-CH"),
        "es": ("es-ES", "es-MX", "es-AR", "es-CO"),
        "pt": ("pt-BR", "pt-PT"),
        "ru": ("ru-RU",),
        "ar": ("ar-SA", "ar-EG", "ar-AE"),
    }

    def __init__(self, config_file: Optional[str] = None):
        """Initialise the router.

        Args:
            config_file: Path to a JSON configuration file. When omitted (or
                unreadable) the built-in defaults are used.
        """
        self.config = self.load_config(config_file)
        mappings = self.config.get("language_mappings", {})
        # A user file may replace the table with a non-object value (e.g.
        # null); treat that as "no mappings" instead of crashing later.
        self.language_mappings = mappings if isinstance(mappings, dict) else {}
        self.default_language = self.config.get("default_language", "zh-CN")
        self.fallback_language = self.config.get("fallback_language", "en-US")

    def load_config(self, config_file: Optional[str]) -> Dict[str, Any]:
        """Build the effective configuration.

        The user file, if given, is deep-merged over the built-in defaults.
        If the file wraps its settings in a top-level ``"language_routing"``
        object, only that object is merged.

        Args:
            config_file: Path to a JSON configuration file, or ``None``.

        Returns:
            The merged configuration; the plain defaults when no file is
            given or the file cannot be read/parsed (a warning is written to
            stderr in the latter case).
        """
        default_config = {
            "default_language": "zh-CN",
            "fallback_language": "en-US",
            "language_mappings": {
                "zh-CN": {
                    "synonym_file": "synonyms_zh_CN.json",
                    "description": "簡體中文同義詞庫",
                },
                "zh-TW": {
                    "synonym_file": "synonyms_zh_TW.json",
                    "description": "繁體中文同義詞庫",
                },
                "en-US": {
                    "synonym_file": "synonyms_en_US.json",
                    "description": "美式英文同義詞庫",
                },
                "ja-JP": {
                    "synonym_file": "synonyms_ja_JP.json",
                    "description": "日文同義詞庫",
                },
                "ko-KR": {
                    "synonym_file": "synonyms_ko_KR.json",
                    "description": "韓文同義詞庫",
                },
            },
            "cross_language_fallback": {
                "enabled": True,
                "fallback_order": ["zh-CN", "zh-TW", "en-US", "ja-JP", "ko-KR"],
            },
        }

        if not config_file:
            return default_config

        try:
            with open(config_file, "r", encoding="utf-8") as f:
                user_config = json.load(f)

            # Settings may live under a dedicated section of a larger file.
            if isinstance(user_config, dict) and "language_routing" in user_config:
                user_config = user_config["language_routing"]

            if not isinstance(user_config, dict):
                raise ValueError("配置內容必須是 JSON 物件")

            return self.deep_merge(default_config, user_config)
        # OSError: unreadable file; ValueError: bad JSON / bad encoding /
        # wrong top-level type; RecursionError: pathologically nested JSON.
        except (OSError, ValueError, RecursionError) as e:
            print(f"警告: 無法加載配置文件 {config_file}: {e}", file=sys.stderr)
            print("使用默認配置", file=sys.stderr)
            return default_config

    def deep_merge(self, base: Dict, update: Dict) -> Dict:
        """Recursively merge ``update`` over ``base`` without mutating either.

        Keys whose values are dicts on both sides are merged recursively; any
        other key in ``update`` replaces the value from ``base``.

        Args:
            base: Dictionary providing the initial values.
            update: Dictionary whose values take precedence.

        Returns:
            A new dictionary. Nested values that ``update`` does not touch
            are shared with ``base`` (the top level is a shallow copy).
        """
        result = base.copy()

        for key, value in update.items():
            current = result.get(key)
            if isinstance(current, dict) and isinstance(value, dict):
                result[key] = self.deep_merge(current, value)
            else:
                result[key] = value

        return result

    @staticmethod
    def _normalize_code(lang_code: Any) -> str:
        """Return ``lang_code`` lower-cased with ``_`` replaced by ``-``.

        Non-string input yields an empty string.
        """
        if not isinstance(lang_code, str):
            return ""
        return lang_code.strip().replace("_", "-").lower()

    def _synonym_file_for(self, language: Any) -> Optional[str]:
        """Return the synonym file mapped to ``language``, or ``None``.

        ``None`` is returned when the language has no entry, the entry is not
        a dict, or it carries no (non-empty) ``synonym_file``.
        """
        entry = self.language_mappings.get(language)
        if isinstance(entry, dict):
            return entry.get("synonym_file") or None
        return None

    def _try_route(
        self, result: Dict[str, Any], language: Any, reason: Optional[str] = None
    ) -> bool:
        """Fill ``result`` for ``language`` if it has a usable mapping.

        Args:
            result: Routing result dict, updated in place on success.
            language: Candidate language code.
            reason: Fallback explanation; when given, the result is flagged
                as a fallback.

        Returns:
            ``True`` if ``result`` was filled, ``False`` otherwise.
        """
        synonym_file = self._synonym_file_for(language)
        if synonym_file is None:
            return False

        result["routed_language"] = language
        result["synonym_file"] = synonym_file
        if reason is not None:
            result["fallback_used"] = True
            result["fallback_reason"] = reason
        return True

    def route_language(
        self, detected_lang: str, confidence: float = 0.0
    ) -> Dict[str, Any]:
        """Pick the synonym dictionary for a detected language.

        Args:
            detected_lang: Language code reported by the detector.
            confidence: Detection confidence; copied into the result as-is.

        Returns:
            A dict with ``detected_language``, ``confidence``,
            ``routed_language``, ``synonym_file``, ``fallback_used`` and
            ``available_languages``; plus ``fallback_reason`` when a fallback
            was taken, or ``error`` when nothing could be routed.
        """
        result = {
            "detected_language": detected_lang,
            "confidence": confidence,
            "routed_language": None,
            "synonym_file": None,
            "fallback_used": False,
            "available_languages": list(self.language_mappings),
        }

        # 1. The detected code itself: exact match first ...
        if self._try_route(result, detected_lang):
            return result

        # ... then ignoring case and "_" vs "-" (e.g. "zh-tw", "en_US"), so
        # such codes reach their own dictionary instead of a sibling variant.
        normalized = self._normalize_code(detected_lang)
        if normalized:
            for known in self.language_mappings:
                if self._normalize_code(known) == normalized and self._try_route(
                    result, known
                ):
                    return result

        # 2. A regional variant of the same primary language.
        for variant in self.get_language_variants(detected_lang):
            if self._try_route(
                result, variant, f"使用變體 {variant} 替代 {detected_lang}"
            ):
                return result

        # 3. Cross-language fallback. The section is looked up once so that a
        # missing section cannot raise KeyError after the "enabled" check.
        cross = self.config.get("cross_language_fallback")
        if not isinstance(cross, dict):
            cross = {}
        if cross.get("enabled", True):
            for fallback_lang in cross.get("fallback_order") or []:
                if self._try_route(
                    result, fallback_lang, f"使用跨語言回退到 {fallback_lang}"
                ):
                    return result

        # 4. Default language.
        if self._try_route(
            result, self.default_language, f"使用默認語言 {self.default_language}"
        ):
            return result

        # 5. Fallback language.
        if self._try_route(
            result, self.fallback_language, f"使用回退語言 {self.fallback_language}"
        ):
            return result

        # Nothing usable is configured.
        result["error"] = "沒有可用的語言映射"
        return result

    def get_language_variants(self, lang_code: str) -> List[str]:
        """List the known regional variants of a language.

        Only the primary subtag of ``lang_code`` is considered, compared
        case-insensitively and accepting ``_`` as separator.

        Args:
            lang_code: Language code such as ``"zh"``, ``"en-GB"`` or
                ``"pt_BR"``.

        Returns:
            A new list of variant codes in preference order; empty when the
            primary language is unknown or ``lang_code`` is empty/``None``.
        """
        primary = self._normalize_code(lang_code).split("-")[0]
        return list(self._VARIANT_MAPPING.get(primary, ()))

    def get_synonym_file_path(
        self, routed_result: Dict[str, Any], base_dir: str = "."
    ) -> Optional[Path]:
        """Locate the synonym file named in a routing result.

        The file is searched directly under ``base_dir`` and then under
        ``synonyms/``, ``data/synonyms/``, ``config/synonyms/`` and
        ``../synonyms/`` relative to it.

        Args:
            routed_result: Result dict from :meth:`route_language`.
            base_dir: Directory the search starts from.

        Returns:
            The first existing candidate path, or ``None`` when the result
            names no file or no candidate exists.
        """
        file_name = routed_result.get("synonym_file")
        if not file_name:
            return None

        root = Path(base_dir)
        candidates = (
            root / file_name,
            root / "synonyms" / file_name,
            root / "data" / "synonyms" / file_name,
            root / "config" / "synonyms" / file_name,
            root / ".." / "synonyms" / file_name,
        )
        return next((path for path in candidates if path.exists()), None)
|
|
|
|
|
|
def main():
    """Command-line entry point: route one language code and print the outcome.

    Parses the command line, routes the given language through a
    ``LanguageRouter``, looks up the synonym file on disk and prints the
    result as JSON (``--json``), as a verbose report (``--verbose``) or as a
    single ``language:file`` line.
    """
    cli = argparse.ArgumentParser(description="語言路由工具")
    cli.add_argument("language", help="檢測到的語言代碼")
    cli.add_argument("-c", "--confidence", type=float, default=0.0, help="檢測置信度")
    cli.add_argument("-j", "--json", action="store_true", help="輸出 JSON 格式")
    cli.add_argument("-v", "--verbose", action="store_true", help="詳細輸出")
    cli.add_argument("--config", help="配置文件路徑")
    cli.add_argument("--base-dir", default=".", help="基礎目錄路徑")
    options = cli.parse_args()

    # Route the language, then enrich the result with on-disk information.
    router = LanguageRouter(options.config)
    outcome = router.route_language(options.language, options.confidence)

    located = router.get_synonym_file_path(outcome, options.base_dir)
    outcome["file_path"] = None if not located else str(located)
    outcome["file_exists"] = located.exists() if located is not None else False

    # Machine-readable output.
    if options.json:
        print(json.dumps(outcome, ensure_ascii=False, indent=2))
        return

    # Compact one-line output.
    if not options.verbose:
        summary = f"{outcome['routed_language']}:{outcome['synonym_file']}"
        if not outcome["file_exists"]:
            summary = f"{summary} (檔案不存在)"
        print(summary)
        return

    # Human-readable report.
    print("語言路由結果:")
    print(f" 檢測到的語言: {outcome['detected_language']}")
    print(f" 置信度: {outcome['confidence']:.2%}")
    print(f" 路由到的語言: {outcome['routed_language']}")
    print(f" 同義詞檔案: {outcome['synonym_file']}")
    print(f" 檔案路徑: {outcome['file_path']}")
    print(f" 檔案存在: {outcome['file_exists']}")
    if not outcome.get("fallback_used"):
        print(" 使用了回退: 否")
    else:
        print(" 使用了回退: 是")
        print(f" 回退原因: {outcome.get('fallback_reason', '未知')}")
    print(f" 可用語言: {', '.join(outcome['available_languages'])}")


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|