#!/usr/bin/env python3
"""Terminology manager.

Maintains the canonical mapping between design-level chunk terminology and
the names used by the implementation, persists that mapping as JSON, and
reports how the terms are used across the architecture Markdown documents.
"""

import json
import os
import re
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple


@dataclass
class TerminologyEntry:
    """A single terminology record."""

    design_concept: str  # design concept (human-readable name)
    design_value: str  # value used in the design documents
    actual_value: str  # value used by the actual implementation
    status: str  # implementation status marker
    description: str  # free-text description
    last_updated: str  # ISO-8601 timestamp of the last update
    source_files: List[str]  # documents that use this term


@dataclass
class TerminologyMapping:
    """The persisted terminology table plus its bookkeeping metadata."""

    mapping: Dict[str, TerminologyEntry]  # keyed by design term
    version: str
    created_at: str
    updated_at: str


class TerminologyManager:
    """Create, persist, load and analyse the terminology mapping."""

    # Directory scanned by generate_report() when no directory is given.
    DEFAULT_ARCH_DIR = Path("docs_v1.0/ARCHITECTURE")

    def __init__(self, data_dir: Path = Path("data/terminology")):
        """Set up storage paths and the standard terminology table.

        Args:
            data_dir: Directory holding the JSON files; created if missing.

        Side effects:
            Creates ``data_dir`` and, when no mapping file exists yet, writes
            the standard mapping to disk (see ``initialize``).
        """
        self.data_dir = data_dir
        self.data_dir.mkdir(parents=True, exist_ok=True)

        self.mapping_file = data_dir / "terminology_mapping.json"
        self.usage_file = data_dir / "terminology_usage.json"

        # Standard design-term -> implementation-term reference table.
        self.standard_terminology = {
            "sentence": TerminologyEntry(
                design_concept="句子級分片",
                design_value="sentence",
                actual_value="ChunkType::Sentence",
                status="✅ 完整實現",
                description="基於 ASR 轉錄結果的單句級別分片",
                last_updated=datetime.now().isoformat(),
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_1_SENTENCE.md"],
            ),
            "visual": TerminologyEntry(
                design_concept="視覺物件級分片",
                design_value="visual",
                actual_value="未實現",
                status="❌ 未實現",
                description="基於 YOLO 物件檢測的視覺分片",
                last_updated=datetime.now().isoformat(),
                source_files=["CHUNK_DESIGN.md"],
            ),
            "scene": TerminologyEntry(
                design_concept="場景級分片",
                design_value="scene",
                actual_value="ChunkType::Cut",
                status="⚠️ 部分實現",
                description="基於 CUT 場景檢測算法的分片",
                last_updated=datetime.now().isoformat(),
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_3_SCENE.md"],
            ),
            "summary": TerminologyEntry(
                design_concept="摘要級分片",
                design_value="summary",
                actual_value="ChunkType::Story",
                status="⚠️ 概念調整",
                description="基於分片聚合的敘事總結分片",
                last_updated=datetime.now().isoformat(),
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_4_SUMMARY.md"],
            ),
            "time": TerminologyEntry(
                design_concept="時間基準分片",
                design_value="time",
                actual_value="ChunkType::TimeBased",
                status="✅ 完整實現",
                description="固定時間間隔的分片",
                last_updated=datetime.now().isoformat(),
                source_files=["CHUNK_DESIGN.md"],
            ),
            "trace": TerminologyEntry(
                design_concept="軌跡追蹤分片",
                design_value="trace",
                actual_value="ChunkType::Trace",
                status="✅ 完整實現",
                description="物件或人物的時空軌跡分片",
                last_updated=datetime.now().isoformat(),
                source_files=["CHUNK_DESIGN.md"],
            ),
        }

        self.initialize()

    def initialize(self):
        """Write the standard mapping to disk if no mapping file exists."""
        if not self.mapping_file.exists():
            self.save_mapping()

    def save_mapping(self):
        """Serialize the standard terminology table to ``mapping_file``."""
        # One timestamp so created_at and updated_at agree exactly.
        now = datetime.now().isoformat()
        mapping_data = TerminologyMapping(
            mapping=self.standard_terminology,
            version="1.0",
            created_at=now,
            updated_at=now,
        )

        with open(self.mapping_file, "w", encoding="utf-8") as f:
            json.dump(asdict(mapping_data), f, ensure_ascii=False, indent=2)

        print(f"✓ 術語映射表已保存: {self.mapping_file}")

    def load_mapping(self) -> TerminologyMapping:
        """Load the terminology mapping from ``mapping_file``.

        Returns:
            A ``TerminologyMapping`` whose ``mapping`` values are
            ``TerminologyEntry`` instances.

        Raises:
            OSError: If the mapping file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            TypeError: If the stored fields do not match the dataclasses.
        """
        with open(self.mapping_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # asdict() flattened each entry to a plain dict when saving; rebuild
        # the dataclass instances so callers can use attribute access.
        entries = {
            term: TerminologyEntry(**fields)
            for term, fields in data.get("mapping", {}).items()
        }
        return TerminologyMapping(**{**data, "mapping": entries})

    def find_terminology_in_files(
        self, pattern: str, directory: Path
    ) -> Dict[str, List[Tuple[str, int]]]:
        """Search every ``.md`` file under ``directory`` for ``pattern``.

        Args:
            pattern: Regular expression, matched case-insensitively.
            directory: Root directory, walked recursively.

        Returns:
            Mapping of file path (as ``str``) to a list of
            ``(matched_text, start_offset)`` tuples. Files without a match
            are omitted.
        """
        regex = re.compile(pattern, re.IGNORECASE)
        results: Dict[str, List[Tuple[str, int]]] = {}

        for root, dirs, files in os.walk(directory):
            dirs.sort()  # deterministic traversal order
            for file in sorted(files):
                if not file.endswith(".md"):
                    continue
                file_path = Path(root) / file
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                hits = [(m.group(), m.start()) for m in regex.finditer(content)]
                if hits:
                    results[str(file_path)] = hits

        return results

    def generate_report(self, arch_dir: Optional[Path] = None) -> Dict[str, Any]:
        """Build a usage report for every term in the stored mapping.

        Args:
            arch_dir: Directory of Markdown documents to scan. Defaults to
                ``DEFAULT_ARCH_DIR`` when omitted.

        Returns:
            A dict with ``metadata``, ``terminology_usage`` and ``summary``
            sections. In ``summary``, ``total_files_scanned`` is the number
            of distinct files containing at least one term and
            ``unique_terms_used`` is the number of terms found at least once.
        """
        mapping = self.load_mapping()
        if arch_dir is None:
            arch_dir = self.DEFAULT_ARCH_DIR

        usage = {
            design_term: self.find_terminology_in_files(
                re.escape(entry.design_value), arch_dir
            )
            for design_term, entry in mapping.mapping.items()
        }

        # A file may contain several terms; count each file only once.
        matched_files = {path for per_file in usage.values() for path in per_file}
        terms_used = sum(1 for per_file in usage.values() if per_file)

        report = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "version": mapping.version,
                "total_terms": len(mapping.mapping),
            },
            "terminology_usage": usage,
            "summary": {
                "total_files_scanned": len(matched_files),
                "unique_terms_used": terms_used,
                "consistency_score": self.calculate_consistency_score(usage),
            },
        }

        return report

    def calculate_consistency_score(self, usage: Dict[str, Any]) -> float:
        """Compute a terminology consistency score in the range [0, 1].

        Each match scores 1.0 when it equals the term's design value
        (ignoring case) and 0.5 otherwise. Matches for terms missing from
        ``standard_terminology`` score 0. The sum is divided by the total
        number of matches.

        Args:
            usage: ``{design_term: {file_path: [(match_text, offset), ...]}}``
                as produced by ``find_terminology_in_files`` per term.

        Returns:
            The normalized score, or 1.0 when there are no matches at all.
        """
        total_matches = 0
        score = 0.0

        for design_term, occurrences in usage.items():
            entry = self.standard_terminology.get(design_term)
            for matches in occurrences.values():
                # Normalize by matches, not files, so the score stays <= 1.
                total_matches += len(matches)
                if entry is None:
                    continue
                expected = entry.design_value.lower()
                for text, _ in matches:
                    # Exact (case-insensitive) use scores full; anything
                    # else is a partial or incorrect use.
                    score += 1.0 if text.lower() == expected else 0.5

        if total_matches == 0:
            return 1.0
        return score / total_matches


def main():
    """Print the usage report and the terminology reference table."""
    print("術語管理器 - 統一管理架構文檔術語")
    print("=" * 60)

    manager = TerminologyManager()

    # Generate the report.
    report = manager.generate_report()

    print("\n術語使用報告:")
    print(f"版本: {report['metadata']['version']}")
    print(f"生成時間: {report['metadata']['generated_at']}")
    print(f"一致性分數: {report['summary']['consistency_score']:.2f}")
    print(f"使用術語總數: {report['summary']['unique_terms_used']}")

    print("\n術語對照表:")
    for term, entry in manager.standard_terminology.items():
        print(f"{term:10} → {entry.actual_value:30} [{entry.status}]")

    print("\n建議:")
    print("1. 在設計文檔中保留設計值說明")
    print("2. 在實現文檔中使用實際值")
    print("3. 定期檢查術語一致性")
    print("4. 更新代碼註釋中的術語")


if __name__ == "__main__":
    main()