326 lines
10 KiB
Python
Executable File
326 lines
10 KiB
Python
Executable File
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Story Processor - Generate parent-child chunk hierarchy for RAG
|
|
Uses LOCAL video analysis (ASR, YOLO, OCR, Scene) to create parent chunks.
|
|
NO cloud API calls - fully offline processing
|
|
"""
|
|
|
|
import sys
|
|
import json
|
|
import os
|
|
import argparse
|
|
from typing import Dict, List, Any
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
from redis_publisher import RedisPublisher
|
|
|
|
|
|
def extract_video_metadata(video_path: str) -> Dict[str, Any]:
    """Return ffprobe's JSON description of *video_path*, or ``{}`` on failure.

    Runs ``ffprobe -v quiet -print_format json -show_format -show_streams``
    on the file and decodes its stdout. Metadata extraction is best-effort:
    if ffprobe cannot be launched, exits with a non-zero status, or prints
    something that is not a JSON object, a warning is logged and an empty
    dict is returned instead of raising.

    Args:
        video_path: Path of the media file handed to ffprobe.

    Returns:
        The JSON object printed by ffprobe, or an empty dict.
    """
    import logging
    import subprocess

    log = logging.getLogger(__name__)
    cmd = [
        "ffprobe",
        "-v",
        "quiet",
        "-print_format",
        "json",
        "-show_format",
        "-show_streams",
        video_path,
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # OSError: ffprobe is missing or not executable.
        # ValueError: invalid argument (e.g. NUL byte) or undecodable output.
        log.warning("ffprobe could not be run for %r: %s", video_path, exc)
        return {}

    if result.returncode != 0:
        log.warning(
            "ffprobe exited with status %d for %r", result.returncode, video_path
        )
        return {}

    try:
        metadata = json.loads(result.stdout)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        log.warning("ffprobe output for %r is not valid JSON: %s", video_path, exc)
        return {}

    # Callers expect a mapping; anything else is treated as "no metadata".
    return metadata if isinstance(metadata, dict) else {}
|
|
|
|
|
|
def _asr_child_chunk(seg: Dict) -> Dict:
    """Build the child chunk describing one ASR segment."""
    return {
        "chunk_id": f"asr_{seg.get('start', 0):.1f}_{seg.get('end', 0):.1f}",
        "chunk_type": "asr",
        "source": "asr",
        "start_time": seg.get("start", 0),
        "end_time": seg.get("end", 0),
        "text_content": seg.get("text", ""),
        "content": {
            "text": seg.get("text", ""),
            "confidence": seg.get("confidence", 0),
        },
        "child_chunk_ids": [],
        "parent_chunk_id": None,
    }


def _cut_child_chunk(scene: Dict) -> Dict:
    """Build the child chunk describing one CUT scene."""
    return {
        "chunk_id": f"cut_{scene.get('scene_number', 0)}",
        "chunk_type": "cut",
        "source": "cut",
        "start_time": scene.get("start_time", 0),
        "end_time": scene.get("end_time", 0),
        "text_content": f"Scene {scene.get('scene_number', 0)}",
        "content": {
            "scene_number": scene.get("scene_number", 0),
            "duration": scene.get("duration", 0),
        },
        "child_chunk_ids": [],
        "parent_chunk_id": None,
    }


def _objects_in_range(frames: List[Dict], start_time, end_time) -> List[str]:
    """List class names of objects in frames with start_time <= timestamp <= end_time."""
    names = []
    for frame in frames:
        if start_time <= frame.get("timestamp", 0) <= end_time:
            for obj in frame.get("objects", []):
                names.append(obj.get("class_name", "unknown"))
    return names


def generate_parent_child_chunks(
    asr_data: Dict,
    cut_data: Dict,
    yolo_data: Dict,
    ocr_data: Dict,
    scene_data: Dict,
    parent_chunk_size: int = 5,
    max_yolo_frames: int = 50,
) -> Dict:
    """
    Generate parent-child chunk hierarchy using LOCAL data only.
    No LLM/API calls - uses template-based narrative generation.

    One child chunk is created per entry of ``asr_data["segments"]`` and per
    entry of ``cut_data["scenes"]`` (ASR children first). Consecutive children
    of the same source are then grouped, ``parent_chunk_size`` at a time, into
    "story" parent chunks. A parent spans from the start of its first child to
    the end of its last child, and its text comes from ``generate_narrative``
    (ASR) or ``generate_scene_narrative`` (CUT). Every child receives the id
    of its parent in ``parent_chunk_id``.

    Children are grouped by position rather than looked up by ``chunk_id``, so
    two children that happen to share an id are each counted once and each
    get a parent.

    Args:
        asr_data: Mapping with an optional ``"segments"`` list; each segment
            may carry ``start``, ``end``, ``text`` and ``confidence``.
        cut_data: Mapping with an optional ``"scenes"`` list; each scene may
            carry ``scene_number``, ``start_time``, ``end_time``, ``duration``.
        yolo_data: Mapping with an optional ``"frames"`` list; each frame may
            carry ``timestamp`` and an ``objects`` list of ``class_name`` dicts.
        ocr_data: Accepted for interface compatibility; not read here.
        scene_data: Accepted for interface compatibility; not read here.
        parent_chunk_size: Number of children per parent; must be >= 1.
        max_yolo_frames: Only this many leading YOLO frames are scanned for
            objects (default 50, the historical limit). A negative value
            removes the cap.

    Returns:
        Dict with ``child_chunks``, ``parent_chunks`` and ``stats`` (counts of
        children, parents, ASR children and CUT children).

    Raises:
        ValueError: If ``parent_chunk_size`` is smaller than 1.
    """
    if parent_chunk_size < 1:
        raise ValueError(
            f"parent_chunk_size must be >= 1, got {parent_chunk_size}"
        )

    asr_children = [_asr_child_chunk(seg) for seg in asr_data.get("segments", [])]
    cut_children = [_cut_child_chunk(scene) for scene in cut_data.get("scenes", [])]
    # Same dict objects as in the per-source lists, so parent links set below
    # are visible through child_chunks as well.
    child_chunks = asr_children + cut_children
    parent_chunks = []

    yolo_frames = yolo_data.get("frames", [])
    if max_yolo_frames >= 0:
        yolo_frames = yolo_frames[:max_yolo_frames]

    # Group ASR segments into parent chunks
    for offset in range(0, len(asr_children), parent_chunk_size):
        batch = asr_children[offset : offset + parent_chunk_size]
        batch_ids = [child["chunk_id"] for child in batch]
        batch_texts = [
            child["text_content"] for child in batch if child["text_content"]
        ]
        start_time = batch[0]["start_time"]
        end_time = batch[-1]["end_time"]

        batch_objects = _objects_in_range(yolo_frames, start_time, end_time)
        narrative = generate_narrative(batch_texts, batch_objects, start_time, end_time)

        parent_id = f"story_asr_{offset // parent_chunk_size:04d}"
        parent_chunks.append(
            {
                "chunk_id": parent_id,
                "chunk_type": "story",
                "source": "story_asr",
                "start_time": start_time,
                "end_time": end_time,
                "text_content": narrative,
                "content": {
                    "description": narrative,
                    "child_count": len(batch),
                    "speech_preview": " ".join(batch_texts[:3]) if batch_texts else None,
                    # dict.fromkeys de-duplicates while keeping first-seen
                    # order, so the output is stable across runs.
                    "detected_objects": list(dict.fromkeys(batch_objects))[:5],
                },
                "child_chunk_ids": batch_ids,
                "parent_chunk_id": None,
            }
        )
        for child in batch:
            child["parent_chunk_id"] = parent_id

    # Group CUT scenes into parent chunks
    for offset in range(0, len(cut_children), parent_chunk_size):
        batch = cut_children[offset : offset + parent_chunk_size]
        batch_ids = [child["chunk_id"] for child in batch]
        start_time = batch[0]["start_time"]
        end_time = batch[-1]["end_time"]

        batch_objects = _objects_in_range(yolo_frames, start_time, end_time)
        narrative = generate_scene_narrative(
            batch_objects, start_time, end_time, len(batch)
        )

        parent_id = f"story_cut_{offset // parent_chunk_size:04d}"
        parent_chunks.append(
            {
                "chunk_id": parent_id,
                "chunk_type": "story",
                "source": "story_cut",
                "start_time": start_time,
                "end_time": end_time,
                "text_content": narrative,
                "content": {
                    "description": narrative,
                    "child_count": len(batch),
                    # Separate copy so "scenes" and "child_chunk_ids" are not
                    # the same list object.
                    "scenes": list(batch_ids),
                    "detected_objects": list(dict.fromkeys(batch_objects))[:5],
                },
                "child_chunk_ids": batch_ids,
                "parent_chunk_id": None,
            }
        )
        for child in batch:
            child["parent_chunk_id"] = parent_id

    return {
        "child_chunks": child_chunks,
        "parent_chunks": parent_chunks,
        "stats": {
            "total_child_chunks": len(child_chunks),
            "total_parent_chunks": len(parent_chunks),
            "asr_children": len(asr_children),
            "cut_children": len(cut_children),
        },
    }
|
|
|
|
|
|
def generate_narrative(
    texts: List[str], objects: List[str], start: float, end: float
) -> str:
    """Generate narrative description from LOCAL text snippets and objects.

    Args:
        texts: Speech snippets; the first five are joined with spaces and the
            result is cut to 150 characters plus ``"..."`` when longer.
        objects: Detected object names; duplicates are dropped and the first
            five distinct names are listed in first-seen order.
        start: Start of the described time range, in seconds.
        end: End of the described time range, in seconds.

    Returns:
        ``"[<start>s-<end>s] Speech: ... | Visuals: ..."`` with whichever
        parts are available, or ``"Video segment from <start>s to <end>s"``
        when both ``texts`` and ``objects`` are empty.
    """
    if not texts and not objects:
        return f"Video segment from {start:.1f}s to {end:.1f}s"

    parts = []
    if texts:
        combined = " ".join(texts[:5])
        if len(combined) > 150:
            combined = combined[:150] + "..."
        parts.append(f"Speech: {combined}")

    if objects:
        # dict.fromkeys keeps first-seen order; a set would make both the
        # order and the choice of five names vary between runs.
        unique_objs = list(dict.fromkeys(objects))[:5]
        parts.append(f"Visuals: {', '.join(unique_objs)}")

    return f"[{start:.0f}s-{end:.0f}s] {' | '.join(parts)}"
|
|
|
|
|
|
def generate_scene_narrative(
    objects: List[str], start: float, end: float, scene_count: int
) -> str:
    """Generate scene narrative from LOCAL detected objects.

    Args:
        objects: Detected object names; duplicates are dropped and the first
            five distinct names are listed in first-seen order.
        start: Start of the described time range, in seconds.
        end: End of the described time range, in seconds.
        scene_count: Number of scenes covered by the range.

    Returns:
        ``"[<start>s-<end>s] N scenes. Visuals: a, b."`` when any objects are
        given, otherwise ``"[<start>s-<end>s] N video scenes."``.
    """
    # dict.fromkeys keeps first-seen order; a set would make both the order
    # and the choice of five names vary between runs.
    unique_objects = list(dict.fromkeys(objects))[:5]

    if not unique_objects:
        return f"[{start:.0f}s-{end:.0f}s] {scene_count} video scenes."

    obj_str = ", ".join(unique_objects)
    return f"[{start:.0f}s-{end:.0f}s] {scene_count} scenes. Visuals: {obj_str}."
|
|
|
|
|
|
def run_story(
    video_path: str, output_path: str, uuid: str = "", parent_chunk_size: int = 5
) -> Dict[str, Any]:
    """Build the story chunk hierarchy for one video and write it as JSON.

    Analysis results are read from sibling files of ``output_path`` named
    ``<stem>.<kind>.json`` for ``kind`` in asr, cut, yolo, ocr and scene,
    where ``<stem>`` is the output file name up to its first dot. Missing
    files are skipped and leave the corresponding input empty. The result of
    ``generate_parent_child_chunks`` is extended with ``video_metadata`` and a
    ``processing`` section, then written to ``output_path``.

    All JSON files are read and written as UTF-8. The output is dumped with
    ``ensure_ascii=False``, so relying on the locale's default encoding could
    fail on non-ASCII text.

    Args:
        video_path: Video file passed to ``extract_video_metadata``.
        output_path: Destination of the generated JSON document.
        uuid: When non-empty, progress is reported through a
            ``RedisPublisher`` created with this value.
        parent_chunk_size: Number of child chunks grouped under each parent.

    Returns:
        The dict that was written to ``output_path``.
    """
    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("story", "STORY_START")

    base_path = os.path.dirname(output_path)
    uuid_name = os.path.basename(output_path).split(".")[0]

    asr_data = {"segments": []}
    cut_data = {"scenes": []}
    yolo_data = {"frames": []}
    ocr_data = {"frames": []}
    scene_data = {"scenes": []}

    for name, data_var in [
        ("asr", asr_data),
        ("cut", cut_data),
        ("yolo", yolo_data),
        ("ocr", ocr_data),
        ("scene", scene_data),
    ]:
        path = os.path.join(base_path, f"{uuid_name}.{name}.json")
        if os.path.exists(path):
            with open(path, encoding="utf-8") as f:
                # Merge into the defaults so keys absent from the file keep
                # their empty-list value.
                data_var.update(json.load(f))

    result = generate_parent_child_chunks(
        asr_data, cut_data, yolo_data, ocr_data, scene_data, parent_chunk_size
    )

    result["video_metadata"] = extract_video_metadata(video_path)
    result["processing"] = {
        "method": "local_aggregation",
        "cloud_api_used": False,
        "parent_chunk_size": parent_chunk_size,
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    if publisher:
        publisher.complete(
            "story",
            f"{result['stats']['total_parent_chunks']} parent, {result['stats']['total_child_chunks']} child chunks (LOCAL)",
        )

    return result
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, run the story pipeline
    # once, and print a one-line summary of the generated chunk counts.
    cli = argparse.ArgumentParser(
        description="Story Processor - Parent-Child Chunk Hierarchy (LOCAL ONLY)"
    )
    cli.add_argument("video_path", help="Path to video file")
    cli.add_argument("output_path", help="Output JSON path")
    cli.add_argument("--uuid", default="", help="UUID for progress tracking")
    cli.add_argument(
        "--parent-chunk-size",
        default=5,
        type=int,
        help="Number of child chunks per parent",
    )
    opts = cli.parse_args()

    story = run_story(
        opts.video_path, opts.output_path, opts.uuid, opts.parent_chunk_size
    )

    counts = story["stats"]
    parent_total = counts["total_parent_chunks"]
    child_total = counts["total_child_chunks"]
    print(f"Story generated: {parent_total} parent, {child_total} child chunks (LOCAL)")
|