feat: update Python processors and add utility scripts
- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
This commit is contained in:
315
scripts/language_router.py
Normal file
315
scripts/language_router.py
Normal file
@@ -0,0 +1,315 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
語言路由工具
|
||||
根據語言檢測結果路由到相應的同義詞庫
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
from typing import Dict, List, Optional, Any
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class LanguageRouter:
    """Route a detected language code to a synonym dictionary file.

    Routing is driven by a configuration dictionary (built-in defaults,
    optionally deep-merged with a user JSON file).  A detected code is
    resolved in this order: direct match, regional variant of the same
    primary language, cross-language fallback order, default language,
    fallback language.
    """

    # Regional variants tried, in order, when the detected code itself has
    # no mapping.  Keys are lower-case primary language subtags.
    _VARIANT_MAPPING: Dict[str, List[str]] = {
        "zh": ["zh-CN", "zh-TW", "zh-HK", "zh-SG", "zh-MO"],
        "en": ["en-US", "en-GB", "en-CA", "en-AU", "en-NZ"],
        "ja": ["ja-JP"],
        "ko": ["ko-KR"],
        "fr": ["fr-FR", "fr-CA", "fr-BE", "fr-CH"],
        "de": ["de-DE", "de-AT", "de-CH"],
        "es": ["es-ES", "es-MX", "es-AR", "es-CO"],
        "pt": ["pt-BR", "pt-PT"],
        "ru": ["ru-RU"],
        "ar": ["ar-SA", "ar-EG", "ar-AE"],
    }

    def __init__(self, config_file: Optional[str] = None):
        """Initialise the router.

        Args:
            config_file: Path to a JSON configuration file, or None to use
                the built-in defaults.
        """
        self.config = self.load_config(config_file)
        mappings = self.config.get("language_mappings", {})
        # A user config may replace the section with a non-dict value
        # (e.g. null); treat that as "no mappings" instead of crashing later.
        self.language_mappings = mappings if isinstance(mappings, dict) else {}
        self.default_language = self.config.get("default_language", "zh-CN")
        self.fallback_language = self.config.get("fallback_language", "en-US")

    def load_config(self, config_file: Optional[str]) -> Dict[str, Any]:
        """Load the routing configuration.

        The user file is deep-merged over the built-in defaults.  If the
        file has a top-level "language_routing" key, only that section is
        used.  Any read/parse problem is reported on stderr and the
        defaults are returned instead.

        Args:
            config_file: Path to a JSON configuration file, or None.

        Returns:
            The effective configuration dictionary.
        """
        default_config = {
            "default_language": "zh-CN",
            "fallback_language": "en-US",
            "language_mappings": {
                "zh-CN": {
                    "synonym_file": "synonyms_zh_CN.json",
                    "description": "簡體中文同義詞庫",
                },
                "zh-TW": {
                    "synonym_file": "synonyms_zh_TW.json",
                    "description": "繁體中文同義詞庫",
                },
                "en-US": {
                    "synonym_file": "synonyms_en_US.json",
                    "description": "美式英文同義詞庫",
                },
                "ja-JP": {
                    "synonym_file": "synonyms_ja_JP.json",
                    "description": "日文同義詞庫",
                },
                "ko-KR": {
                    "synonym_file": "synonyms_ko_KR.json",
                    "description": "韓文同義詞庫",
                },
            },
            "cross_language_fallback": {
                "enabled": True,
                "fallback_order": ["zh-CN", "zh-TW", "en-US", "ja-JP", "ko-KR"],
            },
        }

        if not config_file:
            return default_config

        try:
            with open(config_file, "r", encoding="utf-8") as f:
                user_config = json.load(f)

            # The routing settings may live in a sub-section of a larger file.
            if isinstance(user_config, dict) and "language_routing" in user_config:
                user_config = user_config["language_routing"]

            if not isinstance(user_config, dict):
                raise ValueError("配置內容必須是 JSON 物件")

            return self.deep_merge(default_config, user_config)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError
            # subclasses; OSError covers missing/unreadable files.
            print(f"警告: 無法加載配置文件 {config_file}: {e}", file=sys.stderr)
            print("使用默認配置", file=sys.stderr)
            return default_config

    def deep_merge(self, base: Dict, update: Dict) -> Dict:
        """Recursively merge ``update`` into a copy of ``base``.

        Nested dictionaries present on both sides are merged key by key;
        any other value from ``update`` replaces the one in ``base``.

        Args:
            base: Dictionary providing the default values.
            update: Dictionary whose values take precedence.

        Returns:
            A new merged dictionary; ``base`` itself is not modified.
        """
        result = base.copy()

        for key, value in update.items():
            current = result.get(key)
            if isinstance(current, dict) and isinstance(value, dict):
                result[key] = self.deep_merge(current, value)
            else:
                result[key] = value

        return result

    @staticmethod
    def _normalize_code(code: str) -> str:
        """Return ``code`` lower-cased with ``_`` separators turned into ``-``."""
        return code.strip().replace("_", "-").lower()

    def _has_synonym_file(self, key: str) -> bool:
        """Return True if mapping ``key`` is a dict with a non-empty synonym_file."""
        entry = self.language_mappings.get(key)
        return isinstance(entry, dict) and bool(entry.get("synonym_file"))

    def _resolve_mapping_key(self, code: Any) -> Optional[str]:
        """Return the configured mapping key matching ``code``, or None.

        An exact match wins; otherwise keys are compared case-insensitively
        with ``_`` and ``-`` treated alike, so ``zh-tw`` or ``zh_TW`` find
        ``zh-TW``.  Entries without a synonym file are never returned.
        """
        if not isinstance(code, str) or not code:
            return None

        if code in self.language_mappings and self._has_synonym_file(code):
            return code

        wanted = self._normalize_code(code)
        for key in self.language_mappings:
            if (
                isinstance(key, str)
                and self._normalize_code(key) == wanted
                and self._has_synonym_file(key)
            ):
                return key
        return None

    def _apply_route(
        self, result: Dict[str, Any], key: str, reason: Optional[str] = None
    ) -> Dict[str, Any]:
        """Fill ``result`` with the route to ``key``; a ``reason`` marks a fallback."""
        result["routed_language"] = key
        result["synonym_file"] = self.language_mappings[key]["synonym_file"]
        if reason is not None:
            result["fallback_used"] = True
            result["fallback_reason"] = reason
        return result

    def route_language(
        self, detected_lang: str, confidence: float = 0.0
    ) -> Dict[str, Any]:
        """Route a detected language code to a configured synonym dictionary.

        Args:
            detected_lang: Detected language code (e.g. "zh-TW", "en").
                Matching ignores case and accepts "_" as a separator.
            confidence: Detection confidence; echoed in the result and not
                used for the routing decision.

        Returns:
            A dictionary with "detected_language", "confidence",
            "routed_language", "synonym_file", "fallback_used" and
            "available_languages".  "fallback_reason" is added when a
            fallback was used, and "error" when nothing could be routed.
        """
        result = {
            "detected_language": detected_lang,
            "confidence": confidence,
            "routed_language": None,
            "synonym_file": None,
            "fallback_used": False,
            "available_languages": list(self.language_mappings),
        }

        # 1. The detected language itself is configured.
        direct_key = self._resolve_mapping_key(detected_lang)
        if direct_key is not None:
            return self._apply_route(result, direct_key)

        # 2. A regional variant of the same primary language is configured.
        for variant in self.get_language_variants(detected_lang):
            variant_key = self._resolve_mapping_key(variant)
            if variant_key is not None:
                return self._apply_route(
                    result,
                    variant_key,
                    f"使用變體 {variant_key} 替代 {detected_lang}",
                )

        # 3. Cross-language fallback, in the configured order.
        cross = self.config.get("cross_language_fallback")
        if not isinstance(cross, dict):
            cross = {}
        if cross.get("enabled", True):
            for fallback_lang in cross.get("fallback_order") or []:
                fallback_key = self._resolve_mapping_key(fallback_lang)
                if fallback_key is not None:
                    return self._apply_route(
                        result, fallback_key, f"使用跨語言回退到 {fallback_key}"
                    )

        # 4. Configured default language.
        default_key = self._resolve_mapping_key(self.default_language)
        if default_key is not None:
            return self._apply_route(
                result, default_key, f"使用默認語言 {self.default_language}"
            )

        # 5. Configured fallback language.
        last_resort_key = self._resolve_mapping_key(self.fallback_language)
        if last_resort_key is not None:
            return self._apply_route(
                result, last_resort_key, f"使用回退語言 {self.fallback_language}"
            )

        # Nothing usable is configured.
        result["error"] = "沒有可用的語言映射"
        return result

    def get_language_variants(self, lang_code: str) -> List[str]:
        """Return the known regional variants for a language code.

        Args:
            lang_code: Language code with or without a region part, e.g.
                "zh", "zh-HK" or "zh_hk".

        Returns:
            Variant codes for the primary language in preference order, or
            an empty list when the language is unknown.
        """
        if not isinstance(lang_code, str):
            return []

        # Keep only the primary language subtag (drop the region part).
        primary = self._normalize_code(lang_code).split("-", 1)[0]
        # Return a copy so callers cannot mutate the shared table.
        return list(self._VARIANT_MAPPING.get(primary, []))

    def get_synonym_file_path(
        self, routed_result: Dict[str, Any], base_dir: str = "."
    ) -> Optional[Path]:
        """Locate the synonym file named in a routing result.

        The file is looked up directly under ``base_dir`` first, then under
        ``synonyms``, ``data/synonyms``, ``config/synonyms`` and
        ``../synonyms`` relative to it.

        Args:
            routed_result: Result dictionary from :meth:`route_language`.
            base_dir: Directory the search starts from.

        Returns:
            The first existing candidate path, or None when the result has
            no synonym file or no candidate exists.
        """
        file_name = routed_result.get("synonym_file")
        if not file_name:
            return None

        root = Path(base_dir)
        candidates = [
            root / file_name,
            root / "synonyms" / file_name,
            root / "data" / "synonyms" / file_name,
            root / "config" / "synonyms" / file_name,
            root / ".." / "synonyms" / file_name,
        ]

        for candidate in candidates:
            if candidate.exists():
                return candidate

        return None
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: route a language code and print the result.

    Parses the arguments, routes the given language through a
    LanguageRouter, resolves the synonym file path under the base
    directory, and prints the outcome as JSON, as a verbose report, or as
    a single ``language:file`` line.
    """
    parser = argparse.ArgumentParser(description="語言路由工具")
    parser.add_argument("language", help="檢測到的語言代碼")
    parser.add_argument(
        "-c", "--confidence", type=float, default=0.0, help="檢測置信度"
    )
    parser.add_argument("-j", "--json", action="store_true", help="輸出 JSON 格式")
    parser.add_argument("-v", "--verbose", action="store_true", help="詳細輸出")
    parser.add_argument("--config", help="配置文件路徑")
    parser.add_argument("--base-dir", default=".", help="基礎目錄路徑")
    options = parser.parse_args()

    # Route the language and attach the resolved file information.
    router = LanguageRouter(options.config)
    result = router.route_language(options.language, options.confidence)

    resolved = router.get_synonym_file_path(result, options.base_dir)
    result["file_path"] = None if resolved is None else str(resolved)
    result["file_exists"] = resolved is not None and resolved.exists()

    # JSON output takes precedence over the other formats.
    if options.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return

    # Compact single-line output.
    if not options.verbose:
        summary = f"{result['routed_language']}:{result['synonym_file']}"
        if not result["file_exists"]:
            summary += " (檔案不存在)"
        print(summary)
        return

    # Verbose report, one field per line.
    report = [
        "語言路由結果:",
        f" 檢測到的語言: {result['detected_language']}",
        f" 置信度: {result['confidence']:.2%}",
        f" 路由到的語言: {result['routed_language']}",
        f" 同義詞檔案: {result['synonym_file']}",
        f" 檔案路徑: {result['file_path']}",
        f" 檔案存在: {result['file_exists']}",
    ]
    if result.get("fallback_used"):
        report.append(" 使用了回退: 是")
        report.append(f" 回退原因: {result.get('fallback_reason', '未知')}")
    else:
        report.append(" 使用了回退: 否")
    report.append(f" 可用語言: {', '.join(result['available_languages'])}")
    print("\n".join(report))
|
||||
|
||||
|
||||
# Run the command-line interface only when executed as a script, so the
# module can be imported without side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user