feat: Initial v0.9 release with API Key authentication
## v0.9.20260325_144654 ### Features - API Key Authentication System - Job Worker System - V2 Backup Versioning ### Bug Fixes - get_processor_results_by_job column mapping Co-authored-by: OpenCode
This commit is contained in:
305
scripts/caption_processor.py
Normal file
305
scripts/caption_processor.py
Normal file
@@ -0,0 +1,305 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Caption Processor - Generate image captions
|
||||
Uses AI vision models to analyze video frames and generate descriptions
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
import subprocess
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
|
||||
"""Extract frames from video at regular intervals"""
|
||||
|
||||
# Get video duration
|
||||
cmd = [
|
||||
"ffprobe",
|
||||
"-v",
|
||||
"quiet",
|
||||
"-print_format",
|
||||
"json",
|
||||
"-show_format",
|
||||
video_path,
|
||||
]
|
||||
try:
|
||||
result = subprocess.run(cmd, capture_output=True, text=True)
|
||||
if result.returncode == 0:
|
||||
data = json.loads(result.stdout)
|
||||
duration = float(data.get("format", {}).get("duration", 0))
|
||||
else:
|
||||
duration = 60 # Default fallback
|
||||
except Exception:
|
||||
duration = 60
|
||||
|
||||
if duration <= 0:
|
||||
duration = 60
|
||||
|
||||
# Calculate frame interval
|
||||
interval = max(duration / max_frames, 1.0)
|
||||
|
||||
frames = []
|
||||
temp_dir = os.path.join(os.path.dirname(video_path), ".caption_frames")
|
||||
os.makedirs(temp_dir, exist_ok=True)
|
||||
|
||||
for i in range(max_frames):
|
||||
timestamp = i * interval
|
||||
output_file = os.path.join(temp_dir, f"frame_{i:04d}.jpg")
|
||||
|
||||
cmd = [
|
||||
"ffmpeg",
|
||||
"-y",
|
||||
"-ss",
|
||||
str(timestamp),
|
||||
"-i",
|
||||
video_path,
|
||||
"-vframes",
|
||||
"1",
|
||||
"-q:v",
|
||||
"2",
|
||||
output_file,
|
||||
]
|
||||
|
||||
try:
|
||||
subprocess.run(cmd, capture_output=True, check=False)
|
||||
if os.path.exists(output_file):
|
||||
frames.append({"index": i, "timestamp": timestamp, "path": output_file})
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return frames
|
||||
|
||||
|
||||
def generate_caption_with_llava(
|
||||
image_path: str, prompt: str = "Describe this image in detail."
|
||||
) -> Optional[str]:
|
||||
"""Generate caption using LLaVA model"""
|
||||
try:
|
||||
# Try to use transformers with LLaVA
|
||||
from transformers import AutoProcessor, AutoModelForVision2Seq
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
# Note: This requires llava-hf/llava-1.5-7b-hf or similar
|
||||
# For now, return a placeholder
|
||||
return f"[LLaVA caption for {os.path.basename(image_path)}]"
|
||||
except ImportError:
|
||||
return None
|
||||
|
||||
|
||||
def generate_caption_with_gpt4v(image_path: str, api_key: str = None) -> Optional[str]:
|
||||
"""Generate caption using GPT-4V via OpenAI API"""
|
||||
import base64
|
||||
|
||||
if not api_key:
|
||||
api_key = os.environ.get("OPENAI_API_KEY")
|
||||
|
||||
if not api_key:
|
||||
return None
|
||||
|
||||
try:
|
||||
from openai import OpenAI
|
||||
|
||||
client = OpenAI(api_key=api_key)
|
||||
|
||||
# Encode image
|
||||
with open(image_path, "rb") as f:
|
||||
img_data = base64.b64encode(f.read()).decode()
|
||||
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4o", # or gpt-4-turbo for vision
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": f"data:image/jpeg;base64,{img_data}"},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Describe what you see in this image in one sentence.",
|
||||
},
|
||||
],
|
||||
}
|
||||
],
|
||||
max_tokens=100,
|
||||
)
|
||||
|
||||
return response.choices[0].message.content
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
def generate_caption_fallback(image_path: str, existing_data: Dict = None) -> str:
|
||||
"""Generate a basic caption using available metadata"""
|
||||
|
||||
caption_parts = []
|
||||
|
||||
# Check YOLO data for objects
|
||||
if existing_data and existing_data.get("objects"):
|
||||
objects = list(set([o["class"] for o in existing_data["objects"]]))[:5]
|
||||
if objects:
|
||||
caption_parts.append(f"Contains: {', '.join(objects)}")
|
||||
|
||||
# Check OCR data for text
|
||||
if existing_data and existing_data.get("texts"):
|
||||
texts = [t["text"] for t in existing_data["texts"] if t.get("text")]
|
||||
if texts:
|
||||
caption_parts.append(f"On-screen text: {' '.join(texts[:3])}")
|
||||
|
||||
if caption_parts:
|
||||
return " | ".join(caption_parts)
|
||||
|
||||
return "Video frame at timestamp"
|
||||
|
||||
|
||||
def process_frame(
|
||||
frame_info: Dict, yolo_data: List = None, ocr_data: List = None
|
||||
) -> Dict:
|
||||
"""Process a single frame and generate caption"""
|
||||
|
||||
frame_path = frame_info["path"]
|
||||
timestamp = frame_info["timestamp"]
|
||||
|
||||
caption = None
|
||||
source = "unknown"
|
||||
|
||||
# Try GPT-4V first
|
||||
caption = generate_caption_with_gpt4v(frame_path)
|
||||
if caption:
|
||||
source = "gpt-4v"
|
||||
else:
|
||||
# Try LLaVA
|
||||
caption = generate_caption_with_llava(frame_path)
|
||||
if caption:
|
||||
source = "llava"
|
||||
else:
|
||||
# Use fallback with YOLO/OCR data
|
||||
combined_data = {"objects": [], "texts": []}
|
||||
if yolo_data:
|
||||
combined_data["objects"] = [
|
||||
o for o in yolo_data if o.get("timestamp") == timestamp
|
||||
]
|
||||
if ocr_data:
|
||||
combined_data["texts"] = [
|
||||
t for t in ocr_data if t.get("timestamp") == timestamp
|
||||
]
|
||||
caption = generate_caption_fallback(frame_path, combined_data)
|
||||
source = "metadata"
|
||||
|
||||
return {
|
||||
"index": frame_info["index"],
|
||||
"timestamp": timestamp,
|
||||
"caption": caption,
|
||||
"source": source,
|
||||
}
|
||||
|
||||
|
||||
def run_caption(
|
||||
video_path: str, output_path: str, uuid: str = "", max_frames: int = 30
|
||||
):
|
||||
publisher = RedisPublisher(uuid) if uuid else None
|
||||
if publisher:
|
||||
publisher.info("caption", "CAPTION_START")
|
||||
|
||||
if publisher:
|
||||
publisher.info("caption", "Extracting frames from video...")
|
||||
|
||||
# Extract frames
|
||||
frames = extract_frames(video_path, max_frames)
|
||||
|
||||
if publisher:
|
||||
publisher.info("caption", f"Extracted {len(frames)} frames")
|
||||
|
||||
# Load YOLO and OCR data for context
|
||||
base_path = os.path.dirname(output_path)
|
||||
uuid_name = os.path.basename(output_path).split(".")[0]
|
||||
|
||||
yolo_objects = []
|
||||
ocr_texts = []
|
||||
|
||||
yolo_path = os.path.join(base_path, f"{uuid_name}.yolo.json")
|
||||
if os.path.exists(yolo_path):
|
||||
with open(yolo_path) as f:
|
||||
yolo_data = json.load(f)
|
||||
# Flatten objects from all frames
|
||||
for frame in yolo_data.get("frames", []):
|
||||
for obj in frame.get("objects", []):
|
||||
obj["timestamp"] = frame.get("timestamp", 0)
|
||||
yolo_objects.append(obj)
|
||||
|
||||
ocr_path = os.path.join(base_path, f"{uuid_name}.ocr.json")
|
||||
if os.path.exists(ocr_path):
|
||||
with open(ocr_path) as f:
|
||||
ocr_data = json.load(f)
|
||||
for frame in ocr_data.get("frames", []):
|
||||
for text in frame.get("texts", []):
|
||||
text["timestamp"] = frame.get("timestamp", 0)
|
||||
ocr_texts.append(text)
|
||||
|
||||
# Process each frame
|
||||
captions = []
|
||||
for i, frame in enumerate(frames):
|
||||
if publisher and i % 5 == 0:
|
||||
publisher.progress(
|
||||
"caption", i, len(frames), f"Frame {i + 1}/{len(frames)}"
|
||||
)
|
||||
|
||||
caption_data = process_frame(frame, yolo_objects, ocr_texts)
|
||||
captions.append(caption_data)
|
||||
|
||||
# Cleanup temp frame
|
||||
try:
|
||||
os.remove(frame["path"])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Cleanup temp directory
|
||||
temp_dir = os.path.join(os.path.dirname(video_path), ".caption_frames")
|
||||
try:
|
||||
os.rmdir(temp_dir)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
result = {
|
||||
"video_path": video_path,
|
||||
"total_frames": len(frames),
|
||||
"captions": captions,
|
||||
"summary": {
|
||||
"avg_caption_length": sum(len(c.get("caption", "")) for c in captions)
|
||||
/ max(len(captions), 1),
|
||||
"gpt4v_count": sum(1 for c in captions if c.get("source") == "gpt-4v"),
|
||||
"llava_count": sum(1 for c in captions if c.get("source") == "llava"),
|
||||
"metadata_count": sum(1 for c in captions if c.get("source") == "metadata"),
|
||||
},
|
||||
}
|
||||
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
if publisher:
|
||||
publisher.complete("caption", f"{len(captions)} frames captioned")
|
||||
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Video Caption Generator")
|
||||
parser.add_argument("video_path", help="Path to video file")
|
||||
parser.add_argument("output_path", help="Output JSON path")
|
||||
parser.add_argument("--uuid", help="UUID for progress tracking", default="")
|
||||
parser.add_argument(
|
||||
"--max-frames", type=int, default=30, help="Maximum frames to caption"
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
result = run_caption(args.video_path, args.output_path, args.uuid, args.max_frames)
|
||||
print(f"Caption generated: {result['total_frames']} frames")
|
||||
Reference in New Issue
Block a user