Files
momentry_core/scripts/compare_asr_content.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

180 lines
5.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/opt/homebrew/bin/python3.11
"""
ASR方案内容对比分析
对比三个成功方案的输出差异:
- 方案A: faster-whisper small (77 segments)
- 方案B: whisper small (74 segments)
- 方案D: whisper medium (74 segments)
"""
import json
from pathlib import Path
from difflib import SequenceMatcher
def load_segments(json_path):
    """Load the ASR segment list from a benchmark result JSON file.

    Args:
        json_path: Path (``str`` or path-like) of a JSON file whose top-level
            object holds an ``asr_output`` object with a ``segments`` entry.

    Returns:
        The value stored at ``data['asr_output']['segments']``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If ``asr_output`` or ``segments`` is missing.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() uses
    # the platform locale, which can garble or reject non-ASCII transcripts.
    with open(json_path, encoding='utf-8') as f:
        data = json.load(f)
    return data['asr_output']['segments']
def _closest_by_start(segments, start):
    """Return the segment whose ``start`` is nearest to *start* (None if empty)."""
    # min() keeps the first of several equally-near candidates, matching a
    # strict "<" linear scan.
    return min(segments, key=lambda s: abs(s['start'] - start), default=None)


def compare_segments(seg_a, seg_b, name_a, name_b, *,
                     time_tolerance=3.0, max_shown=10):
    """Print a comparison of two ASR segment lists and return summary metrics.

    Each segment is a mapping with ``start``, ``end`` and ``text`` keys.
    The report covers segment counts, summed segment durations, the overall
    text similarity and per-segment text differences.

    Args:
        seg_a: Segments of the first scheme.
        seg_b: Segments of the second scheme.
        name_a: Display name of the first scheme.
        name_b: Display name of the second scheme.
        time_tolerance: A segment of ``seg_b`` is paired with a segment of
            ``seg_a`` only when their start values differ by less than this
            amount.
        max_shown: Maximum number of differences printed in detail
            (non-negative); the remainder is summarised in one line.

    Returns:
        A dict with keys ``segments_diff`` (count of A minus count of B),
        ``time_diff`` (summed duration of A minus that of B), ``similarity``
        (``SequenceMatcher`` ratio of the space-joined texts) and
        ``text_diffs`` (number of paired segments whose text differs).
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"对比: {name_a} vs {name_b}")
    print(rule)

    # Segment counts.
    count_delta = len(seg_a) - len(seg_b)
    print("\n【数量对比】")
    print(f" {name_a}: {len(seg_a)} segments")
    print(f" {name_b}: {len(seg_b)} segments")
    print(f" 差异: {count_delta} segments")

    # Summed per-segment durations (end - start) for each scheme.
    total_time_a = sum(s['end'] - s['start'] for s in seg_a)
    total_time_b = sum(s['end'] - s['start'] for s in seg_b)
    print("\n【时间覆盖】")
    print(f" {name_a}: {total_time_a:.2f}")
    print(f" {name_b}: {total_time_b:.2f}")
    print(f" 差异: {total_time_a - total_time_b:.2f}")

    # Overall similarity of the concatenated transcripts.
    text_a_full = ' '.join(s['text'] for s in seg_a)
    text_b_full = ' '.join(s['text'] for s in seg_b)
    similarity = SequenceMatcher(None, text_a_full, text_b_full).ratio()
    print("\n【文本相似度】")
    print(f" 相似度: {similarity*100:.1f}%")

    # Pair every segment of A with the B segment nearest in start value and
    # record the pairs whose text differs.
    print("\n【详细差异】")
    matched_diffs = []
    for seg in seg_a:
        start_a = seg['start']
        text_a = seg['text']
        closest = _closest_by_start(seg_b, start_a)
        if closest is None or abs(closest['start'] - start_a) >= time_tolerance:
            continue  # nothing in B close enough to count as the same segment
        text_b = closest['text']
        if text_a == text_b:
            continue
        matched_diffs.append({
            'time': start_a,
            'text_a': text_a,
            'text_b': text_b,
            'similarity': SequenceMatcher(None, text_a, text_b).ratio(),
        })

    if matched_diffs:
        print(f" 发现 {len(matched_diffs)} 处文本差异:")
        # Only the first max_shown differences are printed in detail.
        for index, diff in enumerate(matched_diffs[:max_shown], start=1):
            print(f"\n [{index}] 时间: {diff['time']:.2f}")
            print(f" {name_a}: \"{diff['text_a']}\"")
            print(f" {name_b}: \"{diff['text_b']}\"")
            print(f" 相似度: {diff['similarity']*100:.1f}%")
        hidden = len(matched_diffs) - max_shown
        if hidden > 0:
            print(f"\n ... 还有 {hidden} 处差异")
    else:
        print(" ✓ 无显著文本差异")

    return {
        'segments_diff': count_delta,
        'time_diff': total_time_a - total_time_b,
        'similarity': similarity,
        'text_diffs': len(matched_diffs),
    }
def main(output_dir=None):
    """Load the three benchmark results and print the comparison report.

    Args:
        output_dir: Directory that contains the ``exasan_pcie`` folder with
            the ``scheme_*.json`` result files. When omitted, the benchmark
            directory this script was written against is used.
    """
    if output_dir is None:
        output_dir = '/Users/accusys/momentry_core_0.1/output/benchmark'
    result_dir = Path(output_dir) / 'exasan_pcie'

    # Load the three schemes that produced output.
    seg_a = load_segments(result_dir / 'scheme_A_faster-whisper_small_cpu.json')
    seg_b = load_segments(result_dir / 'scheme_B_whisper_small_cpu.json')
    seg_d = load_segments(result_dir / 'scheme_D_whisper_medium_cpu.json')

    rule = "=" * 60
    print(rule)
    print("ASR方案内容对比分析报告")
    print(rule)
    print()

    # Scheme overview.
    print("【测试方案】")
    print(" 方案A: faster-whisper small CPU")
    print(" 方案B: OpenAI whisper small CPU")
    print(" 方案D: OpenAI whisper medium CPU")
    print(" 方案C/E: MPS失败不支持")
    print()

    # Pairwise comparisons.
    results = {
        'A_vs_B': compare_segments(seg_a, seg_b, '方案A', '方案B'),
        'A_vs_D': compare_segments(seg_a, seg_d, '方案A', '方案D'),
        'B_vs_D': compare_segments(seg_b, seg_d, '方案B', '方案D'),
    }

    # Summary.
    print()
    print(rule)
    print("对比总结")
    print(rule)

    # Segment counts come from the loaded data so the summary cannot
    # contradict the per-comparison sections printed above.
    counts = {'方案A': len(seg_a), '方案B': len(seg_b), '方案D': len(seg_d)}
    count_a, count_b, count_d = counts['方案A'], counts['方案B'], counts['方案D']
    top = max(counts.values())
    single_top = sum(1 for c in counts.values() if c == top) == 1
    print("\n【Segments数量】")
    for label, count in counts.items():
        # Mark a scheme as "most" only when it is the unique maximum.
        suffix = " (最多)" if single_top and count == top else ""
        print(f" {label}: {count} segments{suffix}")
    surplus = count_a - count_b
    if surplus > 0:
        print(f" 结论: faster-whisper分割更细+{surplus} segments")
    else:
        print(f" 结论: faster-whisper未产生更多segments (差异 {surplus})")

    # NOTE(review): the "结论" lines below are fixed editorial text; they are
    # not derived from the computed similarity values.
    print("\n【文本相似度】")
    for key, label in (('A_vs_B', 'A vs B'), ('A_vs_D', 'A vs D'), ('B_vs_D', 'B vs D')):
        print(f" {label}: {results[key]['similarity']*100:.1f}%")
    print(" 结论: 三个方案文本高度相似")

    print("\n【文本差异统计】")
    for key, label in (('A_vs_B', 'A vs B'), ('A_vs_D', 'A vs D'), ('B_vs_D', 'B vs D')):
        print(f" {label}: {results[key]['text_diffs']}处差异")

    # Header previously lacked its closing bracket.
    print("\n【方案D (medium) vs 方案B (small)】")
    if count_d == count_b:
        print(f" Segments数量相同: {count_b}条")
    else:
        print(f" Segments数量: {count_d} vs {count_b}")
    print(f" 文本相似度: {results['B_vs_D']['similarity']*100:.1f}%")
    print(" 结论: medium模型无明显提升")

    print()
    print(rule)
    print("推荐方案")
    print(rule)
    print()
    # NOTE(review): the recommendation is fixed text; the speed and memory
    # figures are not computed anywhere in this script — confirm their source.
    print("✅ 推荐: 方案A (faster-whisper small CPU)")
    print("理由:")
    print(f" 1. Segments更多{count_a} vs {count_b}- 分割更细致")
    print(" 2. 文本相似度与其他方案一致")
    print(" 3. 处理速度最快6x faster")
    print(" 4. 内存占用最低4x less")
    print()
# Produce the report only when run as a script, not when imported.
if __name__ == "__main__":
    main()