chore: backup before migration to new repo

2026-04-23 16:46:02 +08:00
parent 13dd3b30f3
commit 59809dae1f
40 changed files with 5566 additions and 1783 deletions
--- a/scripts/story_processor.py
+++ b/scripts/story_processor.py
@@ -1,12 +1,8 @@
 #!/opt/homebrew/bin/python3.11
 """
 Story Processor - Generate parent-child chunk hierarchy for RAG
-Uses video analysis (ASR, YOLO, OCR) to create parent chunks that summarize child chunks.
-
-Parent-Child Chunk Strategy:
- Parent chunks: Summarize multiple scenes/segments with narrative description
- Child chunks: Individual ASR segments, OCR texts, detected objects
- When embedding: Parent description + Child content for better retrieval
+Uses LOCAL video analysis (ASR, YOLO, OCR, Scene) to create parent chunks.
+No cloud API calls are made; processing is fully offline.
 """

 import sys
@@ -47,57 +43,59 @@ def generate_parent_child_chunks(
    cut_data: Dict,
    yolo_data: Dict,
    ocr_data: Dict,
+    scene_data: Dict,
    parent_chunk_size: int = 5,
-) -> Dict[str, Any]:
+) -> Dict:
    """
-    Generate parent-child chunk hierarchy.
-
-    Parent chunks summarize multiple child chunks for better RAG retrieval.
-    Child chunks are individual segments from ASR, scenes from CUT, etc.
+    Generate parent-child chunk hierarchy using LOCAL data only.
+    No LLM/API calls - uses template-based narrative generation.
    """
-
    child_chunks = []
    parent_chunks = []

-    # Get source data
-    asr_segments = asr_data.get("segments", [])
-    cut_scenes = cut_data.get("scenes", [])
-    yolo_frames = yolo_data.get("frames", [])
-    _ocr_frames = ocr_data.get("frames", [])
-
-    # Create child chunks from ASR segments
-    asr_child_ids = []
-    for i, seg in enumerate(asr_segments):
-        child_chunk = {
-            "chunk_id": f"asr_{i:04d}",
-            "chunk_type": "sentence",
-            "source": "asr",
-            "start_time": seg.get("start", 0),
-            "end_time": seg.get("end", 0),
-            "text_content": seg.get("text", ""),
-            "content": seg,
-            "child_chunk_ids": [],
-            "parent_chunk_id": None,
-        }
-        child_chunks.append(child_chunk)
-        asr_child_ids.append(child_chunk["chunk_id"])
+    # Create child chunks from ASR
+    for seg in asr_data.get("segments", []):
+        child_chunks.append(
+            {
+                "chunk_id": f"asr_{seg.get('start', 0):.1f}_{seg.get('end', 0):.1f}",
+                "chunk_type": "asr",
+                "source": "asr",
+                "start_time": seg.get("start", 0),
+                "end_time": seg.get("end", 0),
+                "text_content": seg.get("text", ""),
+                "content": {
+                    "text": seg.get("text", ""),
+                    "confidence": seg.get("confidence", 0),
+                },
+                "child_chunk_ids": [],
+                "parent_chunk_id": None,
+            }
+        )

    # Create child chunks from CUT scenes
-    cut_child_ids = []
-    for i, scene in enumerate(cut_scenes):
-        child_chunk = {
-            "chunk_id": f"cut_{i:04d}",
-            "chunk_type": "cut",
-            "source": "cut",
-            "start_time": scene.get("start_time", scene.get("start", 0)),
-            "end_time": scene.get("end_time", scene.get("end", 0)),
-            "text_content": None,
-            "content": scene,
-            "child_chunk_ids": [],
-            "parent_chunk_id": None,
-        }
-        child_chunks.append(child_chunk)
-        cut_child_ids.append(child_chunk["chunk_id"])
+    for scene in cut_data.get("scenes", []):
+        child_chunks.append(
+            {
+                "chunk_id": f"cut_{scene.get('scene_number', 0)}",
+                "chunk_type": "cut",
+                "source": "cut",
+                "start_time": scene.get("start_time", 0),
+                "end_time": scene.get("end_time", 0),
+                "text_content": f"Scene {scene.get('scene_number', 0)}",
+                "content": {
+                    "scene_number": scene.get("scene_number", 0),
+                    "duration": scene.get("duration", 0),
+                },
+                "child_chunk_ids": [],
+                "parent_chunk_id": None,
+            }
+        )
+
+    asr_child_ids = [c["chunk_id"] for c in child_chunks if c["source"] == "asr"]
+    cut_child_ids = [c["chunk_id"] for c in child_chunks if c["source"] == "cut"]
+
+    yolo_frames = yolo_data.get("frames", [])
+    ocr_frames = ocr_data.get("frames", [])

    # Group ASR segments into parent chunks
    for i in range(0, len(asr_child_ids), parent_chunk_size):
@@ -105,7 +103,6 @@ def generate_parent_child_chunks(
        if not batch:
            continue

-        # Collect text from child chunks
        batch_texts = []
        batch_objects = []
        batch_times = []
@@ -118,11 +115,16 @@ def generate_parent_child_chunks(
                    batch_times.append((child["start_time"], child["end_time"]))
                    break

-        # Create parent chunk with narrative description
        start_time = batch_times[0][0] if batch_times else 0
        end_time = batch_times[-1][1] if batch_times else 0

-        # Generate narrative description
+        # Find objects in this time range (only the first 50 YOLO frames are scanned)
+        for frame in yolo_frames[:50]:
+            ts = frame.get("timestamp", 0)
+            if start_time <= ts <= end_time:
+                for obj in frame.get("objects", []):
+                    batch_objects.append(obj.get("class_name", "unknown"))
+
        narrative = generate_narrative(batch_texts, batch_objects, start_time, end_time)

        parent_chunk = {
@@ -136,13 +138,13 @@ def generate_parent_child_chunks(
                "description": narrative,
                "child_count": len(batch),
                "speech_preview": " ".join(batch_texts[:3]) if batch_texts else None,
+                "detected_objects": list(set(batch_objects))[:5],
            },
            "child_chunk_ids": batch,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

-        # Update child chunks with parent reference
        for child_id in batch:
            for child in child_chunks:
                if child["chunk_id"] == child_id:
@@ -167,14 +169,12 @@ def generate_parent_child_chunks(
        start_time = batch_times[0][0] if batch_times else 0
        end_time = batch_times[-1][1] if batch_times else 0

-        # Find objects in this time range from YOLO
-        for frame in yolo_frames[:100]:  # Sample frames
+        for frame in yolo_frames[:50]:
            ts = frame.get("timestamp", 0)
            if start_time <= ts <= end_time:
                for obj in frame.get("objects", []):
                    batch_objects.append(obj.get("class_name", "unknown"))

-        # Generate scene narrative
        narrative = generate_scene_narrative(
            batch_objects, start_time, end_time, len(batch)
        )
@@ -190,14 +190,13 @@ def generate_parent_child_chunks(
                "description": narrative,
                "child_count": len(batch),
                "scenes": batch,
-                "detected_objects": list(set(batch_objects))[:10],
+                "detected_objects": list(set(batch_objects))[:5],
            },
            "child_chunk_ids": batch,
            "parent_chunk_id": None,
        }
        parent_chunks.append(parent_chunk)

-        # Update child chunks with parent reference
        for child_id in batch:
            for child in child_chunks:
                if child["chunk_id"] == child_id:
@@ -219,27 +218,33 @@ def generate_parent_child_chunks(
 def generate_narrative(
    texts: List[str], objects: List[str], start: float, end: float
 ) -> str:
-    """Generate narrative description from text snippets"""
-    if not texts:
+    """Generate narrative description from LOCAL text snippets and objects"""
+    if not texts and not objects:
        return f"Video segment from {start:.1f}s to {end:.1f}s"

-    # Combine and summarize
-    combined = " ".join(texts)
-    if len(combined) > 200:
-        combined = combined[:200] + "..."
+    parts = []
+    if texts:
+        combined = " ".join(texts[:5])
+        if len(combined) > 150:
+            combined = combined[:150] + "..."
+        parts.append(f"Speech: {combined}")

-    return f"[{start:.0f}s-{end:.0f}s] {combined}"
+    if objects:
+        unique_objs = list(set(objects))[:5]
+        parts.append(f"Visuals: {', '.join(unique_objs)}")
+
+    return f"[{start:.0f}s-{end:.0f}s] {' | '.join(parts)}"


 def generate_scene_narrative(
    objects: List[str], start: float, end: float, scene_count: int
 ) -> str:
-    """Generate scene narrative from detected objects"""
+    """Generate scene narrative from LOCAL detected objects"""
    unique_objects = list(set(objects))[:5]

    if unique_objects:
        obj_str = ", ".join(unique_objects)
-        return f"[{start:.0f}s-{end:.0f}s] Scenes {scene_count} segments. Visual: {obj_str}."
+        return f"[{start:.0f}s-{end:.0f}s] {scene_count} scenes. Visuals: {obj_str}."
    else:
        return f"[{start:.0f}s-{end:.0f}s] {scene_count} video scenes."

@@ -251,70 +256,45 @@ def run_story(
    if publisher:
        publisher.info("story", "STORY_START")

-    # Load existing JSON files
    base_path = os.path.dirname(output_path)
    uuid_name = os.path.basename(output_path).split(".")[0]

-    # Load analysis data
    asr_data = {"segments": []}
    cut_data = {"scenes": []}
    yolo_data = {"frames": []}
    ocr_data = {"frames": []}
+    scene_data = {"scenes": []}

-    # Load ASR
-    asr_path = os.path.join(base_path, f"{uuid_name}.asr.json")
-    if os.path.exists(asr_path):
-        with open(asr_path) as f:
-            asr_data = json.load(f)
-        if publisher:
-            publisher.info(
-                "story", f"Loaded ASR: {len(asr_data.get('segments', []))} segments"
-            )
+    for name, data_var in [
+        ("asr", asr_data),
+        ("cut", cut_data),
+        ("yolo", yolo_data),
+        ("ocr", ocr_data),
+        ("scene", scene_data),
+    ]:
+        path = os.path.join(base_path, f"{uuid_name}.{name}.json")
+        if os.path.exists(path):
+            with open(path) as f:
+                data_var.update(json.load(f))

-    # Load CUT
-    cut_path = os.path.join(base_path, f"{uuid_name}.cut.json")
-    if os.path.exists(cut_path):
-        with open(cut_path) as f:
-            cut_data = json.load(f)
-        if publisher:
-            publisher.info(
-                "story", f"Loaded CUT: {len(cut_data.get('scenes', []))} scenes"
-            )
-
-    # Load YOLO
-    yolo_path = os.path.join(base_path, f"{uuid_name}.yolo.json")
-    if os.path.exists(yolo_path):
-        with open(yolo_path) as f:
-            yolo_data = json.load(f)
-
-    # Load OCR
-    ocr_path = os.path.join(base_path, f"{uuid_name}.ocr.json")
-    if os.path.exists(ocr_path):
-        with open(ocr_path) as f:
-            ocr_data = json.load(f)
-
-    # Load metadata
-    metadata = extract_video_metadata(video_path)
-
-    if publisher:
-        publisher.info("story", "Generating parent-child chunks...")
-
-    # Generate parent-child hierarchy
    result = generate_parent_child_chunks(
-        asr_data, cut_data, yolo_data, ocr_data, parent_chunk_size
+        asr_data, cut_data, yolo_data, ocr_data, scene_data, parent_chunk_size
    )

-    result["metadata"] = metadata
-    result["parent_chunk_size"] = parent_chunk_size
+    result["video_metadata"] = extract_video_metadata(video_path)
+    result["processing"] = {
+        "method": "local_aggregation",
+        "cloud_api_used": False,
+        "parent_chunk_size": parent_chunk_size,
+    }

    with open(output_path, "w") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    if publisher:
-        stats = result["stats"]
        publisher.complete(
            "story",
-            f"{stats['total_parent_chunks']} parents, {stats['total_child_chunks']} children",
+            f"{result['stats']['total_parent_chunks']} parent, {result['stats']['total_child_chunks']} child chunks (LOCAL)",
        )

    return result
@@ -322,7 +302,7 @@ def run_story(

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description="Video Story Generator - Parent-Child Chunks"
+        description="Story Processor - Parent-Child Chunk Hierarchy (LOCAL ONLY)"
    )
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
@@ -331,7 +311,7 @@ if __name__ == "__main__":
        "--parent-chunk-size",
        type=int,
        default=5,
-        help="Number of child chunks per parent chunk",
+        help="Number of child chunks per parent",
    )

    args = parser.parse_args()
@@ -340,6 +320,6 @@ if __name__ == "__main__":
        args.video_path, args.output_path, args.uuid, args.parent_chunk_size
    )
    print(
-        f"Story generated: {result['stats']['total_parent_chunks']} parent chunks, "
-        f"{result['stats']['total_child_chunks']} child chunks"
+        f"Story generated: {result['stats']['total_parent_chunks']} parent, "
+        f"{result['stats']['total_child_chunks']} child chunks (LOCAL)"
    )