feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)
Phase 2.6.1: co_occurrence_edges migration
- build_co_occurrence_edges_from_qdrant()
- Qdrant embeddings → frame grouping → YOLO objects
- Result: 6679 edges (vs 6701 PostgreSQL)

Phase 2.6.2: face_face_edges migration
- build_face_face_edges_from_qdrant()
- Qdrant embeddings → frame grouping → face pairs
- mutual_gaze detection preserved
- Result: 6 edges (exact match)

Phase 2.6.3: speaker_face_edges migration
- build_speaker_face_edges_from_qdrant()
- Qdrant embeddings → trace_id frame ranges
- SPEAKS_AS edge creation

Architecture:
- All edges use Qdrant payload (no face_detections queries)
- PostgreSQL fallback for empty Qdrant
- Estimated 3.6x performance improvement

Testing:
- Playground (3003): ✓ All Phase 2.6 logs verified
- Edge counts: ✓ Close match with PostgreSQL
- Fallback: ✓ Working

Docs:
- docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md
- docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
239
v1.1/scripts/terminology_manager_v1.11.py
Normal file
239
v1.1/scripts/terminology_manager_v1.11.py
Normal file
@@ -0,0 +1,239 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
術語管理器 - 用於統一管理和更新架構文檔中的術語
|
||||
"""
|
||||
|
||||
import json
import os
import re
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Tuple
|
||||
|
||||
|
||||
@dataclass
class TerminologyEntry:
    """One row of the terminology table: a design term and its implementation.

    All fields are required and are stored exactly as given; no validation
    or normalisation is performed.
    """

    design_concept: str  # human-readable name of the design concept
    design_value: str  # term used in the design documents
    actual_value: str  # term/identifier used by the actual implementation
    status: str  # implementation status marker
    description: str  # free-text description of the term
    last_updated: str  # timestamp of the last update, kept as a string
    source_files: List[str]  # documents that use this term
|
||||
|
||||
|
||||
@dataclass
class TerminologyMapping:
    """The complete terminology table together with its version metadata.

    ``mapping`` is keyed by design term. Values supplied as plain dicts
    (which is what ``json.load`` produces for a table previously written
    with ``dataclasses.asdict``) are converted to ``TerminologyEntry``
    instances on construction, so attribute access such as
    ``entry.design_value`` works after a save/load round trip.
    """

    mapping: "Dict[str, TerminologyEntry]"  # design term -> entry
    version: str  # version string of the table
    created_at: str  # creation timestamp, kept as a string
    updated_at: str  # last-update timestamp, kept as a string

    def __post_init__(self):
        """Coerce dict-shaped entries in ``mapping`` into ``TerminologyEntry``."""
        # Values that are not dicts (already-built entries) are kept as-is.
        self.mapping = {
            term: TerminologyEntry(**entry) if isinstance(entry, dict) else entry
            for term, entry in self.mapping.items()
        }
|
||||
|
||||
|
||||
class TerminologyManager:
    """Maintain the terminology table and report how its terms are used in docs.

    The manager holds a built-in table (``standard_terminology``) mapping each
    design term to its implementation counterpart, persists it as JSON under
    ``data_dir``, scans Markdown files for occurrences of the design terms and
    derives a simple consistency score from those occurrences.
    """

    def __init__(self, data_dir: Path = Path("data/terminology")):
        """Create the data directory, build the standard table and persist it.

        Args:
            data_dir: Directory that holds the JSON files. It is created
                (including parents) if it does not exist.

        Side effects:
            Creates ``data_dir`` and, through ``initialize()``, writes the
            mapping file when it is not already present.
        """
        self.data_dir = data_dir
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.mapping_file = data_dir / "terminology_mapping.json"
        # NOTE(review): usage_file is assigned but never read or written by
        # any method of this class.
        self.usage_file = data_dir / "terminology_usage.json"

        # One timestamp shared by every entry of the freshly built table.
        now = datetime.now().isoformat()

        # Standard terminology table: design term -> entry.
        self.standard_terminology = {
            "sentence": TerminologyEntry(
                design_concept="句子級分片",
                design_value="sentence",
                actual_value="ChunkType::Sentence",
                status="✅ 完整實現",
                description="基於 ASR 轉錄結果的單句級別分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_1_SENTENCE.md"],
            ),
            "visual": TerminologyEntry(
                design_concept="視覺物件級分片",
                design_value="visual",
                actual_value="未實現",
                status="❌ 未實現",
                description="基於 YOLO 物件檢測的視覺分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
            "scene": TerminologyEntry(
                design_concept="場景級分片",
                design_value="scene",
                actual_value="ChunkType::Cut",
                status="⚠️ 部分實現",
                description="基於 CUT 場景檢測算法的分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_3_SCENE.md"],
            ),
            "summary": TerminologyEntry(
                design_concept="摘要級分片",
                design_value="summary",
                actual_value="ChunkType::Story",
                status="⚠️ 概念調整",
                description="基於分片聚合的敘事總結分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md", "CHUNK_RULE_4_SUMMARY.md"],
            ),
            "time": TerminologyEntry(
                design_concept="時間基準分片",
                design_value="time",
                actual_value="ChunkType::TimeBased",
                status="✅ 完整實現",
                description="固定時間間隔的分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
            "trace": TerminologyEntry(
                design_concept="軌跡追蹤分片",
                design_value="trace",
                actual_value="ChunkType::Trace",
                status="✅ 完整實現",
                description="物件或人物的時空軌跡分片",
                last_updated=now,
                source_files=["CHUNK_DESIGN.md"],
            ),
        }

        self.initialize()

    def initialize(self):
        """Write the mapping file if it does not exist yet."""
        if not self.mapping_file.exists():
            self.save_mapping()

    def save_mapping(self):
        """Serialise ``standard_terminology`` to ``mapping_file`` as JSON.

        Both ``created_at`` and ``updated_at`` are set to the time of the
        call, and a confirmation line is printed after the file is written.
        """
        now = datetime.now().isoformat()
        mapping_data = TerminologyMapping(
            mapping=self.standard_terminology,
            version="1.0",
            created_at=now,
            updated_at=now,
        )

        with open(self.mapping_file, "w", encoding="utf-8") as f:
            json.dump(asdict(mapping_data), f, ensure_ascii=False, indent=2)

        print(f"✓ 術語映射表已保存: {self.mapping_file}")

    def load_mapping(self) -> "TerminologyMapping":
        """Read ``mapping_file`` and rebuild the typed mapping.

        Returns:
            A ``TerminologyMapping`` whose ``mapping`` values are
            ``TerminologyEntry`` instances.

        Raises:
            OSError: If the mapping file cannot be opened.
            json.JSONDecodeError: If the file is not valid JSON.
            KeyError / TypeError: If the JSON does not have the expected shape.
        """
        with open(self.mapping_file, "r", encoding="utf-8") as f:
            data = json.load(f)

        # JSON only stores plain dicts; rebuild the entries so callers can use
        # attribute access (entry.design_value) on the loaded table.
        data["mapping"] = {
            term: TerminologyEntry(**raw) for term, raw in data["mapping"].items()
        }
        return TerminologyMapping(**data)

    def find_terminology_in_files(
        self, pattern: str, directory: Path
    ) -> Dict[str, List[Tuple[str, int]]]:
        """Search every ``.md`` file below ``directory`` for ``pattern``.

        Args:
            pattern: Regular expression, matched case-insensitively.
            directory: Root of the tree to walk.

        Returns:
            A dict mapping each file path (as ``str``) that has at least one
            match to a list of ``(matched_text, start_offset)`` tuples, in
            order of appearance. Files without matches are omitted.

        Raises:
            re.error: If ``pattern`` is not a valid regular expression.
            OSError / UnicodeDecodeError: If a Markdown file cannot be read
                as UTF-8.
        """
        # Compile once instead of once per file.
        regex = re.compile(pattern, re.IGNORECASE)
        results: Dict[str, List[Tuple[str, int]]] = {}

        for root, dirs, files in os.walk(directory):
            # Sort in place so traversal (and the result order) is stable.
            dirs.sort()
            for name in sorted(files):
                if not name.endswith(".md"):
                    continue

                file_path = Path(root) / name
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()

                hits = [(m.group(), m.start()) for m in regex.finditer(content)]
                if hits:
                    results[str(file_path)] = hits

        return results

    def generate_report(
        self, arch_dir: Path = Path("docs_v1.0/ARCHITECTURE")
    ) -> Dict[str, Any]:
        """Build a usage report for every term of the stored mapping.

        Args:
            arch_dir: Directory whose Markdown files are scanned. Defaults to
                the architecture docs folder used by the original script.

        Returns:
            A dict with three keys:

            * ``metadata`` – generation time, mapping version, term count.
            * ``terminology_usage`` – per term, the result of
              ``find_terminology_in_files`` for that term's design value.
            * ``summary`` – ``total_files_scanned`` (number of distinct files
              containing at least one term; files without matches are not
              tracked), ``unique_terms_used`` (terms found at least once) and
              ``consistency_score``.
        """
        mapping = self.load_mapping()

        usage = {}
        for design_term, entry in mapping.mapping.items():
            # Escape so the design value is searched literally.
            pattern = re.escape(entry.design_value)
            usage[design_term] = self.find_terminology_in_files(pattern, arch_dir)

        # A file matched by several terms must be counted once.
        matched_files = {path for per_file in usage.values() for path in per_file}

        report = {
            "metadata": {
                "generated_at": datetime.now().isoformat(),
                "version": mapping.version,
                "total_terms": len(mapping.mapping),
            },
            "terminology_usage": usage,
            "summary": {
                "total_files_scanned": len(matched_files),
                "unique_terms_used": sum(1 for per_file in usage.values() if per_file),
                "consistency_score": self.calculate_consistency_score(usage),
            },
        }

        return report

    def calculate_consistency_score(self, usage: Dict[str, Any]) -> float:
        """Score how consistently the design terms are written, in [0.5, 1.0].

        Each occurrence whose text equals the term's design value
        (case-insensitively) scores 1.0; any other occurrence scores 0.5.
        The result is the mean over all scored occurrences.

        Args:
            usage: ``{design_term: {file_path: [(matched_text, offset), ...]}}``
                as produced by ``generate_report``.

        Returns:
            The mean score, or 1.0 when there is nothing to score (no
            occurrences, or only terms absent from ``standard_terminology``).
        """
        scored = 0
        total = 0.0

        for design_term, occurrences in usage.items():
            entry = self.standard_terminology.get(design_term)
            if not entry:
                # Unknown terms cannot be judged; leave them out entirely so
                # they do not dilute the mean.
                continue

            expected = entry.design_value.lower()
            for matches in occurrences.values():
                for text, _ in matches:
                    # Exact term scores full marks; partial match or wrong
                    # usage scores half.
                    total += 1.0 if text.lower() == expected else 0.5
                    scored += 1

        if scored == 0:
            return 1.0

        # Normalise by the number of occurrences actually scored (not by the
        # number of files), which keeps the result within [0.5, 1.0].
        return total / scored
|
||||
|
||||
|
||||
def main():
    """Run the terminology report and print it to standard output.

    Builds a ``TerminologyManager`` with its default data directory (which
    creates that directory and, if missing, the mapping file), generates the
    usage report, then prints the report summary, the term table and a fixed
    list of recommendations.
    """
    print("術語管理器 - 統一管理架構文檔術語")
    print("=" * 60)

    manager = TerminologyManager()

    # Generate the report.
    report = manager.generate_report()

    print("\n術語使用報告:")
    print(f"版本: {report['metadata']['version']}")
    print(f"生成時間: {report['metadata']['generated_at']}")
    print(f"一致性分數: {report['summary']['consistency_score']:.2f}")
    print(f"使用術語總數: {report['summary']['unique_terms_used']}")

    # Term table: design term -> implementation value [status].
    print("\n術語對照表:")
    for term, entry in manager.standard_terminology.items():
        print(f"{term:10} → {entry.actual_value:30} [{entry.status}]")

    # Fixed recommendations.
    print("\n建議:")
    print("1. 在設計文檔中保留設計值說明")
    print("2. 在實現文檔中使用實際值")
    print("3. 定期檢查術語一致性")
    print("4. 更新代碼註釋中的術語")
|
||||
|
||||
|
||||
# Run the report only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user