feat: update Python processors and add utility scripts

- Update ASR, face, OCR, pose processors
- Add release pre-flight check script
- Add synonym generation, chunk processing scripts
- Add face recognition, stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/chunk_statistics.py
+++ b/scripts/chunk_statistics.py
@@ -0,0 +1,219 @@
+#!/opt/bin/python3.11
+"""
+Chunk-based statistics for ASR, Face, and Speaker combinations.
+Generates a comprehensive report of each chunk's content.
+"""
+
+import json
+import os
+import sys
+
+# Identifier of the processed video; used both as the output sub-directory
+# name and as the filename prefix of the ASR / face JSON inputs.
+UUID = "384b0ff44aaaa1f1"
+# Directory (relative to the current working directory) holding the inputs
+# and receiving the generated chunk_statistics.json.
+BASE_DIR = f"output/{UUID}"
+CHUNK_DURATION = 60  # seconds per chunk
+
+
+def load_json(filepath):
+    """Read and parse a JSON document from disk.
+
+    Args:
+        filepath: Path of the JSON file to read.
+
+    Returns:
+        The decoded JSON value, exactly as produced by ``json.load``.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If the content is not valid JSON.
+    """
+    # Decode as UTF-8 explicitly instead of relying on the platform's locale
+    # encoding, which can garble or reject non-ASCII text in the file.
+    with open(filepath, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def build_chunk_stats(uuid=None, base_dir=None, chunk_duration=None):
+    """Bucket ASR segments and face detections into fixed-length time chunks.
+
+    Reads ``<base_dir>/<uuid>.asr.json`` (expects a ``"segments"`` list whose
+    items may carry ``"start"``, ``"end"`` and ``"text"``) and
+    ``<base_dir>/<uuid>.face_clustered.json`` (expects a ``"frames"`` list
+    whose items may carry ``"timestamp"`` and ``"faces"``; each face may carry
+    ``"person_id"``).
+
+    Args:
+        uuid: Video identifier. Defaults to the module-level ``UUID``.
+        base_dir: Directory holding the input files. Defaults to ``BASE_DIR``
+            for the default uuid, otherwise to ``output/<uuid>``.
+        chunk_duration: Chunk length in seconds (must be positive). Defaults
+            to the module-level ``CHUNK_DURATION``.
+
+    Returns:
+        A ``(chunks, video_duration)`` tuple. ``chunks`` is a list of
+        JSON-serializable dicts, one per half-open interval ``[start, end)``.
+        ``video_duration`` is the largest ASR end time or face-frame
+        timestamp seen (never below 0).
+
+    Raises:
+        ValueError: If ``chunk_duration`` is not positive.
+    """
+    from collections import Counter
+
+    uuid = UUID if uuid is None else uuid
+    if base_dir is None:
+        base_dir = BASE_DIR if uuid == UUID else f"output/{uuid}"
+    if chunk_duration is None:
+        chunk_duration = CHUNK_DURATION
+    if chunk_duration <= 0:
+        raise ValueError(f"chunk_duration must be positive, got {chunk_duration!r}")
+
+    print(f"📊 Building chunk statistics for {uuid}...")
+    print(f"   Chunk duration: {chunk_duration}s")
+
+    # Load data
+    asr_data = load_json(os.path.join(base_dir, f"{uuid}.asr.json"))
+    face_data = load_json(os.path.join(base_dir, f"{uuid}.face_clustered.json"))
+    segments = asr_data.get("segments", [])
+    face_frames = face_data.get("frames", [])
+
+    # The inputs carry no explicit duration, so estimate it from the latest
+    # timestamp in either stream. Using ASR alone would silently drop face
+    # frames that occur after the last spoken segment.
+    video_duration = max(
+        [0]
+        + [seg.get("end", 0) for seg in segments]
+        + [frame.get("timestamp", 0) for frame in face_frames]
+    )
+    print(f"   Video duration: {video_duration:.0f}s ({video_duration / 60:.1f} min)")
+
+    # Chunk i covers [i * chunk_duration, (i + 1) * chunk_duration). The "+ 1"
+    # guarantees a chunk exists for a timestamp equal to video_duration.
+    num_chunks = int(video_duration // chunk_duration) + 1
+    chunks = [
+        {
+            "chunk_id": i,
+            "start": i * chunk_duration,
+            "end": (i + 1) * chunk_duration,
+            "asr_count": 0,
+            "asr_text_len": 0,
+            "face_count": 0,
+            "has_speech": False,
+            "has_faces": False,
+        }
+        for i in range(num_chunks)
+    ]
+    # Per-chunk appearance counts per person; kept outside the chunk dicts
+    # because Counter objects are not part of the serialized output.
+    person_counts = [Counter() for _ in range(num_chunks)]
+
+    # A segment is credited to every chunk it overlaps, so one segment that
+    # spans a boundary contributes to asr_count in more than one chunk.
+    for seg in segments:
+        text_len = len(seg.get("text", ""))
+        # Clamp to the valid index range: a negative index would otherwise
+        # wrap around and be credited to the last chunk.
+        first = max(int(seg.get("start", 0) // chunk_duration), 0)
+        last = min(int(seg.get("end", 0) // chunk_duration), num_chunks - 1)
+        for chunk in chunks[first : last + 1]:
+            chunk["asr_count"] += 1
+            chunk["asr_text_len"] += text_len
+            chunk["has_speech"] = True
+
+    # Count faces per chunk
+    for frame in face_frames:
+        faces = frame.get("faces", [])
+        chunk_idx = int(frame.get("timestamp", 0) // chunk_duration)
+        if not 0 <= chunk_idx < num_chunks:
+            continue  # negative timestamp: not attributable to any chunk
+
+        chunk = chunks[chunk_idx]
+        chunk["face_count"] += len(faces)
+        # Only ever switch the flag on; a later frame without faces must not
+        # erase the fact that earlier frames in this chunk had some.
+        if faces:
+            chunk["has_faces"] = True
+
+        # Falsy ids (None, "", 0) are skipped, i.e. treated as "no identity".
+        person_counts[chunk_idx].update(
+            face["person_id"] for face in faces if face.get("person_id")
+        )
+
+    # Reduce the counters to JSON-friendly fields.
+    for chunk, counts in zip(chunks, person_counts):
+        chunk["unique_person_count"] = len(counts)
+        # Ten most frequently seen persons, most frequent first; ties keep
+        # first-seen order, so the result is stable between runs.
+        chunk["top_persons"] = [pid for pid, _ in counts.most_common(10)]
+
+    return chunks, video_duration
+
+
+def _format_id_ranges(ids):
+    """Render integer IDs as comma-separated runs, e.g. ``0-3, 7, 9-12``."""
+    runs = []
+    for cid in sorted(ids):
+        if runs and cid == runs[-1][1] + 1:
+            runs[-1][1] = cid  # extends the current contiguous run
+        else:
+            runs.append([cid, cid])
+    return ", ".join(str(lo) if lo == hi else f"{lo}-{hi}" for lo, hi in runs)
+
+
+def print_summary(chunks):
+    """Print a human-readable report of per-chunk statistics to stdout.
+
+    Args:
+        chunks: List of chunk dicts carrying the keys ``chunk_id``, ``start``,
+            ``end``, ``asr_count``, ``face_count``, ``has_speech``,
+            ``has_faces``, ``unique_person_count`` and ``top_persons``.
+            Each chunk is treated as the half-open interval ``[start, end)``.
+
+    Returns:
+        None. An empty ``chunks`` list prints the banner and a short notice.
+    """
+    print("\n" + "=" * 80)
+    print("📈 CHUNK STATISTICS SUMMARY")
+    print("=" * 80)
+
+    if not chunks:
+        # Nothing to divide by below; report and stop instead of crashing.
+        print("\n   No chunks to summarize.")
+        return
+
+    total = len(chunks)
+
+    def share(count):
+        """Format ``count`` followed by its percentage of all chunks."""
+        return f"{count} ({count / total * 100:.0f}%)"
+
+    def person_list(chunk, limit):
+        """Join up to ``limit`` person ids; ``str()`` tolerates non-string ids."""
+        return ", ".join(str(pid) for pid in chunk["top_persons"][:limit])
+
+    # Overall stats
+    # asr_count is per chunk, so a segment spanning several chunks is
+    # included once for each of them in this sum.
+    total_asr = sum(c["asr_count"] for c in chunks)
+    total_faces = sum(c["face_count"] for c in chunks)
+    total_speech_chunks = sum(1 for c in chunks if c["has_speech"])
+    total_face_chunks = sum(1 for c in chunks if c["has_faces"])
+    chunks_with_both = sum(1 for c in chunks if c["has_speech"] and c["has_faces"])
+    chunks_with_neither = sum(
+        1 for c in chunks if not c["has_speech"] and not c["has_faces"]
+    )
+
+    print("\n📊 Overview:")
+    print(f"   Total chunks:        {total}")
+    print(f"   Chunks with speech:  {share(total_speech_chunks)}")
+    print(f"   Chunks with faces:   {share(total_face_chunks)}")
+    print(f"   Both speech+faces:   {share(chunks_with_both)}")
+    print(f"   Neither:             {share(chunks_with_neither)}")
+    print(f"   Total ASR segments:  {total_asr}")
+    # face_count sums detected faces, not frames, hence the label.
+    print(f"   Total faces found:   {total_faces}")
+
+    # Combination breakdown
+    print("\n🎯 ASR/Face Combination Breakdown:")
+
+    combos = {}
+    for c in chunks:
+        combos.setdefault((c["has_speech"], c["has_faces"]), []).append(c["chunk_id"])
+
+    for (has_speech, has_faces), chunk_ids in sorted(combos.items()):
+        speech_str = "🎤 Speech" if has_speech else "  No Speech"
+        face_str = "👤 Faces" if has_faces else "  No Faces"
+        # List real contiguous runs; a bare min-max would wrongly suggest
+        # that every chunk in between belongs to this combination.
+        print(
+            f"   {speech_str} + {face_str}: {len(chunk_ids)} chunks "
+            f"(IDs: {_format_id_ranges(chunk_ids)})"
+        )
+
+    # Top chunks by activity
+    print("\n🔥 Top 10 Most Active Chunks (by ASR+Faces):")
+    ranked = sorted(
+        chunks, key=lambda c: c["asr_count"] + c["face_count"], reverse=True
+    )
+    for c in ranked[:10]:
+        print(
+            f"   Chunk {c['chunk_id']:3d} ({c['start']:5}-{c['end']:5}s): "
+            f"ASR={c['asr_count']:3d}, Faces={c['face_count']:4d}, "
+            f"Persons={c['unique_person_count']:2d} ({person_list(c, 3)})"
+        )
+
+    # Hand-picked moments of the analysed video:
+    # (label, time in seconds, number of persons to list or None for no details)
+    print("\n🔍 Special Interest Chunks:")
+    scenes = [
+        ("🎯 Stamp scene", 5730, 5),
+        ("🔍 Magnifier scene", 5727, None),
+        ("🏺 Vase scene", 300, 3),
+        ("🏺 Vase scene", 660, 3),
+        ("🏺 Vase scene", 3720, 3),
+    ]
+    for label, scene_time, person_limit in scenes:
+        for c in chunks:
+            # Half-open test: a time on a chunk boundary belongs only to the
+            # chunk that starts there, not also to the one that ends there.
+            if not c["start"] <= scene_time < c["end"]:
+                continue
+            print(f"   {label} chunk: {c['chunk_id']} ({c['start']}-{c['end']}s)")
+            if person_limit is not None:
+                print(
+                    f"      ASR={c['asr_count']}, Faces={c['face_count']}, "
+                    f"Persons={c['unique_person_count']} "
+                    f"({person_list(c, person_limit)})"
+                )
+
+
+if __name__ == "__main__":
+    # Script entry point: compute the per-chunk statistics, show the report
+    # on stdout, then persist the raw numbers next to the input files.
+    stats, total_seconds = build_chunk_stats()
+    print_summary(stats)
+
+    # Assemble the document first so the file write below is a single step.
+    report = {
+        "uuid": UUID,
+        "duration": total_seconds,
+        "chunk_duration": CHUNK_DURATION,
+        "chunks": stats,
+    }
+    output_path = os.path.join(BASE_DIR, "chunk_statistics.json")
+    with open(output_path, "w") as out_file:
+        out_file.write(json.dumps(report, indent=2))
+
+    print(f"\n💾 Saved detailed stats to: {output_path}")