Files
momentry_core/scripts/asr_face_stats.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

142 lines
4.4 KiB
Python

#!/usr/bin/python3.11
"""
ASR x Face Combination Statistics
For each ASR segment, count unique faces (person_ids) appearing during that segment.
Then aggregate: how many segments have 1 face, 2 faces, 3 faces, etc.
"""
import json
import os
from bisect import bisect_left, bisect_right
from collections import defaultdict
# Identifier that names both the working directory and the input files
# ("<UUID>.asr.json", "<UUID>.face_clustered.json") read by this script.
UUID = "384b0ff44aaaa1f1"

# Directory holding the inputs above; the generated stats file is written here too.
BASE_DIR = "output/" + UUID
def load_json(filepath):
    """Read and parse the JSON document stored at *filepath*.

    The file is decoded as UTF-8 explicitly rather than with the platform
    default encoding, so non-ASCII text loads the same way on every platform.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)
def _collect_faces_by_time(face_frames):
    """Map each frame timestamp to the set of person_ids seen at that time."""
    face_by_time = {}
    for frame in face_frames:
        ts = frame.get("timestamp", 0)
        # Merge instead of overwrite so frames sharing a timestamp all count.
        pids = face_by_time.setdefault(ts, set())
        for face in frame.get("faces", []):
            pid = face.get("person_id")
            # Skip faces without an assigned id, but keep a legitimate id of 0.
            if pid is not None and pid != "":
                pids.add(pid)
    return face_by_time


def _faces_in_range(face_by_time, sorted_times, start, end):
    """Return every person_id whose frame timestamp lies in [start, end]."""
    # Binary search on the sorted timestamps avoids scanning all frames
    # for every segment; both bounds are inclusive.
    lo = bisect_left(sorted_times, start)
    hi = bisect_right(sorted_times, end)
    found = set()
    for ts in sorted_times[lo:hi]:
        found.update(face_by_time[ts])
    return found


def build_asr_face_stats():
    """Count the unique faces visible during each ASR segment.

    Reads ``{UUID}.asr.json`` (key ``"segments"``) and
    ``{UUID}.face_clustered.json`` (key ``"frames"``) from ``BASE_DIR``.
    For every segment, collects the person_ids of all face frames whose
    timestamp falls inside the segment's ``start``..``end`` range (inclusive).

    Returns:
        A 3-tuple ``(distribution, segment_details, total_segments)``:

        * ``distribution`` -- dict mapping face count -> number of segments.
        * ``segment_details`` -- one dict per segment with the keys
          ``"start"``, ``"end"``, ``"text"`` (first 80 characters),
          ``"face_count"`` and ``"person_ids"`` (at most 5 ids).
        * ``total_segments`` -- number of ASR segments processed.
    """
    print(f"📊 Building ASR x Face combination statistics for {UUID}...")

    # Load data
    asr_data = load_json(os.path.join(BASE_DIR, f"{UUID}.asr.json"))
    face_data = load_json(os.path.join(BASE_DIR, f"{UUID}.face_clustered.json"))
    segments = asr_data.get("segments", [])
    face_frames = face_data.get("frames", [])

    face_by_time = _collect_faces_by_time(face_frames)
    sorted_times = sorted(face_by_time)

    # Analyze each ASR segment
    face_count_dist = defaultdict(int)
    segment_details = []
    for seg in segments:
        start = seg.get("start", 0)
        end = seg.get("end", 0)
        text = seg.get("text", "")

        pids = _faces_in_range(face_by_time, sorted_times, start, end)
        face_count = len(pids)
        face_count_dist[face_count] += 1

        segment_details.append(
            {
                "start": start,
                "end": end,
                "text": text[:80],
                "face_count": face_count,
                # Sets have no stable order; sort so the sample of up to
                # five ids is reproducible between runs.
                "person_ids": sorted(pids, key=str)[:5],
            }
        )

    return dict(face_count_dist), segment_details, len(segments)
def print_stats(dist, total_segments, segment_details=None):
    """Print the face-count distribution, a summary and example segments.

    Args:
        dist: Mapping of face count -> number of segments with that count.
        total_segments: Total number of ASR segments. May be 0, in which
            case every percentage is reported as 0.0.
        segment_details: Optional list of per-segment dicts with the keys
            ``"start"``, ``"end"``, ``"text"``, ``"face_count"`` and
            ``"person_ids"``. When omitted, the example headings are still
            printed but no segments are listed under them.
    """
    details = segment_details if segment_details is not None else []

    def percentage(count):
        # An empty transcript must not crash the report with a division by zero.
        return count / total_segments * 100 if total_segments > 0 else 0.0

    def examples_with(face_count):
        # First three segments having exactly `face_count` faces.
        return [s for s in details if s["face_count"] == face_count][:3]

    print("\n" + "=" * 60)
    print("📈 ASR x Face Combination Statistics")
    print("=" * 60)
    print(f"\nTotal ASR segments: {total_segments}")
    print(f"\n{'Face Count':<12} {'Segments':>10} {'Percentage':>12}")
    print("-" * 40)

    for fc, count in sorted(dist.items(), key=lambda x: x[0]):
        pct = percentage(count)
        print(f" {fc:>2} faces {count:>8} {pct:>6.1f}%")

    # Summary
    total_faces_sum = sum(fc * count for fc, count in dist.items())
    avg_faces = total_faces_sum / total_segments if total_segments > 0 else 0
    max_faces = max(dist.keys()) if dist else 0
    zero_faces = dist.get(0, 0)
    one_face = dist.get(1, 0)

    print("\n📊 Summary:")
    print(f" Average faces per segment: {avg_faces:.1f}")
    print(f" Max faces in a segment: {max_faces}")
    print(f" Segments with 0 faces: {zero_faces} ({percentage(zero_faces):.1f}%)")
    print(f" Segments with 1 face: {one_face} ({percentage(one_face):.1f}%)")
    print(f" Segments with 2+ faces: {total_segments - zero_faces - one_face}")

    # Show some example segments
    print("\n🔍 Example Segments:")
    print(" 0 faces:")
    for ex in examples_with(0):
        print(f" [{ex['start']:.0f}s-{ex['end']:.0f}s] {ex['text']}...")

    print(" 1 face:")
    for ex in examples_with(1):
        print(
            f" [{ex['start']:.0f}s-{ex['end']:.0f}s] {ex['person_ids'][0]}: {ex['text']}..."
        )

    print(" 3 faces:")
    for ex in examples_with(3):
        # str() each id so non-string person_ids do not break the join.
        pids = ", ".join(str(pid) for pid in ex["person_ids"])
        print(f" [{ex['start']:.0f}s-{ex['end']:.0f}s] [{pids}] {ex['text']}...")


def main():
    """Build the statistics, print the report and save it next to the inputs."""
    dist, segment_details, total = build_asr_face_stats()
    print_stats(dist, total, segment_details)

    # Save
    output_path = os.path.join(BASE_DIR, "asr_face_stats.json")
    # Explicit UTF-8 plus ensure_ascii=False keeps non-ASCII transcript text
    # readable in the saved file regardless of the platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(
            {"distribution": dist, "segments": segment_details},
            f,
            indent=2,
            ensure_ascii=False,
        )
    print(f"\n💾 Saved: {output_path}")


if __name__ == "__main__":
    main()