# momentry_core/scripts/story_processor.py

#!/opt/homebrew/bin/python3.11
"""
Story Processor - Generate parent-child chunk hierarchy for RAG
Uses LOCAL video analysis (ASR, YOLO, OCR, Scene) to create parent chunks.
NO cloud API calls - fully offline processing
"""

import argparse
import json
import os
import sys
from typing import Any, Dict, List, Optional

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher


def extract_video_metadata(video_path: str) -> Dict[str, Any]:
    """Probe a video with ffprobe and return its parsed JSON report.

    This is best-effort: if ffprobe cannot be launched, exits with a
    non-zero status, or prints something that is not valid JSON, an empty
    dict is returned instead of raising.

    Args:
        video_path: Path of the video file handed to ffprobe.

    Returns:
        The decoded ffprobe JSON output (format and stream sections were
        requested), or ``{}`` on any failure.
    """
    import subprocess

    ffprobe_cmd = ["ffprobe", "-v", "quiet", "-print_format", "json"]
    ffprobe_cmd += ["-show_format", "-show_streams", video_path]

    try:
        probe = subprocess.run(ffprobe_cmd, capture_output=True, text=True)
        if probe.returncode != 0:
            return {}
        return json.loads(probe.stdout)
    except Exception:
        # Metadata is optional; any failure degrades to "no metadata".
        return {}


def _objects_in_range(
    frames: List[Dict], start_time: float, end_time: float
) -> List[str]:
    """Return class names of all objects in frames timestamped within [start_time, end_time]."""
    return [
        obj.get("class_name", "unknown")
        for frame in frames
        if start_time <= frame.get("timestamp", 0) <= end_time
        for obj in frame.get("objects", [])
    ]


def _unique_in_order(items: List[str], limit: int) -> List[str]:
    """Return the first ``limit`` distinct items, preserving first-seen order."""
    return list(dict.fromkeys(items))[:limit]


def generate_parent_child_chunks(
    asr_data: Dict,
    cut_data: Dict,
    yolo_data: Dict,
    ocr_data: Dict,
    scene_data: Dict,
    parent_chunk_size: int = 5,
    max_yolo_frames: Optional[int] = 50,
) -> Dict:
    """
    Generate parent-child chunk hierarchy using LOCAL data only.
    No LLM/API calls - uses template-based narrative generation.

    One child chunk is created per ASR segment and per CUT scene. Children
    of each source are then grouped, in input order, into "story" parent
    chunks of up to ``parent_chunk_size`` children. Each parent spans from
    its first child's start time to its last child's end time and is
    annotated with the YOLO object classes seen in that time range.

    Args:
        asr_data: Dict with a ``"segments"`` list (``start``, ``end``,
            ``text``, ``confidence`` are read from each segment).
        cut_data: Dict with a ``"scenes"`` list (``scene_number``,
            ``start_time``, ``end_time``, ``duration`` are read).
        yolo_data: Dict with a ``"frames"`` list; each frame's
            ``timestamp`` and ``objects[*].class_name`` are read.
        ocr_data: Accepted for interface compatibility; not read here.
        scene_data: Accepted for interface compatibility; not read here.
        parent_chunk_size: Maximum number of children per parent chunk.
        max_yolo_frames: Only this many leading YOLO frames are scanned for
            objects. With the default of 50, parents whose time range lies
            beyond those frames get no detected objects; pass ``None`` to
            scan every frame.

    Returns:
        Dict with ``"child_chunks"``, ``"parent_chunks"`` and ``"stats"``
        (child/parent totals and per-source child counts).
    """
    child_chunks = []
    parent_chunks = []

    # Create child chunks from ASR
    for seg in asr_data.get("segments", []):
        child_chunks.append(
            {
                "chunk_id": f"asr_{seg.get('start', 0):.1f}_{seg.get('end', 0):.1f}",
                "chunk_type": "asr",
                "source": "asr",
                "start_time": seg.get("start", 0),
                "end_time": seg.get("end", 0),
                "text_content": seg.get("text", ""),
                "content": {
                    "text": seg.get("text", ""),
                    "confidence": seg.get("confidence", 0),
                },
                "child_chunk_ids": [],
                "parent_chunk_id": None,
            }
        )

    # Create child chunks from CUT scenes
    for scene in cut_data.get("scenes", []):
        child_chunks.append(
            {
                "chunk_id": f"cut_{scene.get('scene_number', 0)}",
                "chunk_type": "cut",
                "source": "cut",
                "start_time": scene.get("start_time", 0),
                "end_time": scene.get("end_time", 0),
                "text_content": f"Scene {scene.get('scene_number', 0)}",
                "content": {
                    "scene_number": scene.get("scene_number", 0),
                    "duration": scene.get("duration", 0),
                },
                "child_chunk_ids": [],
                "parent_chunk_id": None,
            }
        )

    # Work with the child dicts themselves rather than their ids: looking
    # children up by id is quadratic and, when ids collide, resolves every
    # duplicate to the first match (wrong times, unlinked children).
    asr_children = [c for c in child_chunks if c["source"] == "asr"]
    cut_children = [c for c in child_chunks if c["source"] == "cut"]

    yolo_frames = yolo_data.get("frames", [])[:max_yolo_frames]

    # Group ASR segments into parent chunks
    for i in range(0, len(asr_children), parent_chunk_size):
        batch = asr_children[i : i + parent_chunk_size]
        batch_ids = [child["chunk_id"] for child in batch]
        batch_texts = [c["text_content"] for c in batch if c["text_content"]]

        start_time = batch[0]["start_time"]
        end_time = batch[-1]["end_time"]

        # De-duplicate in first-seen order so output is reproducible
        # across runs (set iteration order depends on string hashing).
        unique_objects = _unique_in_order(
            _objects_in_range(yolo_frames, start_time, end_time), 5
        )

        narrative = generate_narrative(
            batch_texts, unique_objects, start_time, end_time
        )

        parent_chunk = {
            "chunk_id": f"story_asr_{i // parent_chunk_size:04d}",
            "chunk_type": "story",
            "source": "story_asr",
            "start_time": start_time,
            "end_time": end_time,
            "text_content": narrative,
            "content": {
                "description": narrative,
                "child_count": len(batch),
                "speech_preview": " ".join(batch_texts[:3]) if batch_texts else None,
                "detected_objects": unique_objects,
            },
            "child_chunk_ids": batch_ids,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

        for child in batch:
            child["parent_chunk_id"] = parent_chunk["chunk_id"]

    # Group CUT scenes into parent chunks
    for i in range(0, len(cut_children), parent_chunk_size):
        batch = cut_children[i : i + parent_chunk_size]
        batch_ids = [child["chunk_id"] for child in batch]

        start_time = batch[0]["start_time"]
        end_time = batch[-1]["end_time"]

        unique_objects = _unique_in_order(
            _objects_in_range(yolo_frames, start_time, end_time), 5
        )

        narrative = generate_scene_narrative(
            unique_objects, start_time, end_time, len(batch)
        )

        parent_chunk = {
            "chunk_id": f"story_cut_{i // parent_chunk_size:04d}",
            "chunk_type": "story",
            "source": "story_cut",
            "start_time": start_time,
            "end_time": end_time,
            "text_content": narrative,
            "content": {
                "description": narrative,
                "child_count": len(batch),
                "scenes": batch_ids,
                "detected_objects": unique_objects,
            },
            "child_chunk_ids": batch_ids,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

        for child in batch:
            child["parent_chunk_id"] = parent_chunk["chunk_id"]

    return {
        "child_chunks": child_chunks,
        "parent_chunks": parent_chunks,
        "stats": {
            "total_child_chunks": len(child_chunks),
            "total_parent_chunks": len(parent_chunks),
            "asr_children": len(asr_children),
            "cut_children": len(cut_children),
        },
    }


def generate_narrative(
    texts: List[str], objects: List[str], start: float, end: float
) -> str:
    """Generate narrative description from LOCAL text snippets and objects.

    Args:
        texts: Speech snippets; the first five are joined with spaces and
            the result is truncated to 150 characters plus ``"..."``.
        objects: Detected object class names; duplicates are dropped and
            the first five distinct names (in first-seen order) are listed.
        start: Segment start time in seconds.
        end: Segment end time in seconds.

    Returns:
        ``"[<start>s-<end>s] Speech: ... | Visuals: ..."`` with whichever
        parts are available, or a generic ``"Video segment from ..."``
        line when both ``texts`` and ``objects`` are empty.
    """
    if not texts and not objects:
        return f"Video segment from {start:.1f}s to {end:.1f}s"

    parts = []
    if texts:
        combined = " ".join(texts[:5])
        if len(combined) > 150:
            combined = combined[:150] + "..."
        parts.append(f"Speech: {combined}")

    if objects:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # same input always yields the same text (set order varies per run).
        unique_objs = list(dict.fromkeys(objects))[:5]
        parts.append(f"Visuals: {', '.join(unique_objs)}")

    return f"[{start:.0f}s-{end:.0f}s] {' | '.join(parts)}"


def generate_scene_narrative(
    objects: List[str], start: float, end: float, scene_count: int
) -> str:
    """Generate scene narrative from LOCAL detected objects.

    Args:
        objects: Detected object class names; duplicates are dropped and
            the first five distinct names (in first-seen order) are listed.
        start: Start time of the scene group in seconds.
        end: End time of the scene group in seconds.
        scene_count: Number of scenes covered by this narrative.

    Returns:
        A one-line description prefixed with the ``[start-end]`` range,
        including a ``Visuals:`` list when any objects were detected.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # same input always yields the same text (set order varies per run).
    unique_objects = list(dict.fromkeys(objects))[:5]

    if unique_objects:
        obj_str = ", ".join(unique_objects)
        return f"[{start:.0f}s-{end:.0f}s] {scene_count} scenes. Visuals: {obj_str}."
    return f"[{start:.0f}s-{end:.0f}s] {scene_count} video scenes."


def run_story(
    video_path: str, output_path: str, uuid: str = "", parent_chunk_size: int = 5
):
    """Build the story chunk hierarchy for one video and write it as JSON.

    Analysis inputs are looked up next to ``output_path`` as
    ``<name>.<kind>.json`` for each kind in asr, cut, yolo, ocr and scene,
    where ``<name>`` is the output file name up to its first dot. Missing
    files are skipped and treated as empty results.

    Args:
        video_path: Video file passed to ``extract_video_metadata``.
        output_path: Destination of the resulting JSON document.
        uuid: When non-empty, progress is reported through a
            ``RedisPublisher`` created for this id.
        parent_chunk_size: Number of child chunks per parent chunk.

    Returns:
        The result dict that was written to ``output_path``.
    """
    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("story", "STORY_START")

    base_path = os.path.dirname(output_path)
    uuid_name = os.path.basename(output_path).split(".")[0]

    asr_data = {"segments": []}
    cut_data = {"scenes": []}
    yolo_data = {"frames": []}
    ocr_data = {"frames": []}
    scene_data = {"scenes": []}

    for name, data_var in [
        ("asr", asr_data),
        ("cut", cut_data),
        ("yolo", yolo_data),
        ("ocr", ocr_data),
        ("scene", scene_data),
    ]:
        path = os.path.join(base_path, f"{uuid_name}.{name}.json")
        if os.path.exists(path):
            # JSON is UTF-8 by specification; do not depend on the locale's
            # default encoding when transcripts contain non-ASCII text.
            with open(path, encoding="utf-8") as f:
                data_var.update(json.load(f))

    result = generate_parent_child_chunks(
        asr_data, cut_data, yolo_data, ocr_data, scene_data, parent_chunk_size
    )

    result["video_metadata"] = extract_video_metadata(video_path)
    result["processing"] = {
        "method": "local_aggregation",
        "cloud_api_used": False,
        "parent_chunk_size": parent_chunk_size,
    }

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly or the write can fail on other locales.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    if publisher:
        publisher.complete(
            "story",
            f"{result['stats']['total_parent_chunks']} parent, {result['stats']['total_child_chunks']} child chunks (LOCAL)",
        )

    return result


if __name__ == "__main__":
    # Command-line entry point: parse arguments, run the story processor,
    # and print a one-line summary of what was generated.
    parser = argparse.ArgumentParser(
        description="Story Processor - Parent-Child Chunk Hierarchy (LOCAL ONLY)"
    )

    # Required positional paths.
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")

    # Optional settings.
    parser.add_argument("--uuid", default="", help="UUID for progress tracking")
    parser.add_argument(
        "--parent-chunk-size",
        default=5,
        type=int,
        help="Number of child chunks per parent",
    )

    args = parser.parse_args()

    result = run_story(
        video_path=args.video_path,
        output_path=args.output_path,
        uuid=args.uuid,
        parent_chunk_size=args.parent_chunk_size,
    )

    print(
        f"Story generated: {result['stats']['total_parent_chunks']} parent, "
        f"{result['stats']['total_child_chunks']} child chunks (LOCAL)"
    )