feat: Initial v0.9 release with API Key authentication

## v0.9.20260325_144654 ### Features - API Key Authentication System - Job Worker System - V2 Backup Versioning ### Bug Fixes - get_processor_results_by_job column mapping Co-authored-by: OpenCode
2026-03-25 14:52:51 +08:00
parent 47e86b696f
commit 383201cacd
193 changed files with 40268 additions and 422 deletions
--- a/scripts/caption_processor.py
+++ b/scripts/caption_processor.py
@@ -0,0 +1,305 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Caption Processor - Generate image captions
+Uses AI vision models to analyze video frames and generate descriptions
+"""
+
+import sys
+import json
+import os
+import argparse
+import subprocess
+from typing import Dict, List, Optional
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+from redis_publisher import RedisPublisher
+
+
+def _probe_duration(video_path: str) -> Optional[float]:
+    """Ask ffprobe for the container duration of *video_path*, in seconds.
+
+    Returns:
+        The positive, finite duration reported by ffprobe, or ``None`` when
+        ffprobe cannot be run, exits non-zero, or reports no usable duration.
+    """
+    cmd = [
+        "ffprobe",
+        "-v",
+        "quiet",
+        "-print_format",
+        "json",
+        "-show_format",
+        video_path,
+    ]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True)
+        if result.returncode != 0:
+            return None
+        data = json.loads(result.stdout)
+        duration = float(data.get("format", {}).get("duration", 0))
+    except (
+        OSError,  # ffprobe missing or not executable
+        subprocess.SubprocessError,
+        ValueError,  # malformed JSON, undecodable output or non-numeric duration
+        TypeError,
+        AttributeError,  # JSON document does not have the expected shape
+    ):
+        return None
+
+    # Rejects zero, negative, NaN and infinite values in one comparison.
+    if 0 < duration < float("inf"):
+        return duration
+    return None
+
+
+def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
+    """Extract up to *max_frames* JPEG frames from a video at regular intervals.
+
+    Frames are written by ffmpeg to a ``.caption_frames`` directory next to
+    the video; the caller is responsible for deleting them afterwards.
+
+    Args:
+        video_path: Path to the input video.
+        max_frames: Upper bound on the number of frames to extract. Values
+            less than or equal to zero yield an empty list.
+
+    Returns:
+        One dict per frame that ffmpeg actually produced, with keys
+        ``index`` (position in the sampling sequence), ``timestamp``
+        (seek position in seconds) and ``path`` (the JPEG file).
+    """
+    # Guard first: a non-positive count would otherwise divide by zero below.
+    if max_frames <= 0:
+        return []
+
+    probed = _probe_duration(video_path)
+    # When the real duration is unknown, assume 60 seconds for spacing only.
+    duration = probed if probed is not None else 60.0
+
+    # Never sample more densely than one frame per second.
+    interval = max(duration / max_frames, 1.0)
+
+    frames = []
+    temp_dir = os.path.join(os.path.dirname(video_path), ".caption_frames")
+    os.makedirs(temp_dir, exist_ok=True)
+
+    for i in range(max_frames):
+        timestamp = i * interval
+
+        # With a known duration, seeking at or past the end cannot yield a
+        # frame, so stop instead of spawning ffmpeg for nothing.
+        if probed is not None and timestamp >= probed:
+            break
+
+        output_file = os.path.join(temp_dir, f"frame_{i:04d}.jpg")
+
+        # Remove leftovers from an earlier run; otherwise a failed ffmpeg call
+        # would be mistaken for success by the existence check below.
+        try:
+            os.remove(output_file)
+        except FileNotFoundError:
+            pass
+        except OSError:
+            # Cannot guarantee the file is fresh, so skip this frame.
+            continue
+
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-ss",
+            str(timestamp),
+            "-i",
+            video_path,
+            "-vframes",
+            "1",
+            "-q:v",
+            "2",
+            output_file,
+        ]
+
+        try:
+            subprocess.run(cmd, capture_output=True, check=False)
+        except (OSError, subprocess.SubprocessError):
+            # ffmpeg is missing or could not be started: best effort, move on.
+            continue
+
+        if os.path.exists(output_file):
+            frames.append({"index": i, "timestamp": timestamp, "path": output_file})
+
+    return frames
+
+
+def generate_caption_with_llava(
+    image_path: str, prompt: str = "Describe this image in detail."
+) -> Optional[str]:
+    """Return a LLaVA caption for *image_path*, or ``None`` if unavailable.
+
+    This is currently a stub: no model is loaded or run. When ``transformers``,
+    ``torch`` and ``PIL`` can all be imported, a placeholder string containing
+    the image's file name is returned; otherwise ``None``. *prompt* is accepted
+    but not used yet.
+    """
+    try:
+        # Importing is only an availability check for the LLaVA stack.
+        from transformers import AutoProcessor, AutoModelForVision2Seq
+        import torch
+        from PIL import Image
+    except ImportError:
+        return None
+
+    # Real inference would need llava-hf/llava-1.5-7b-hf or a similar
+    # checkpoint; until that is wired in, hand back a placeholder.
+    file_name = os.path.basename(image_path)
+    return f"[LLaVA caption for {file_name}]"
+
+
+def generate_caption_with_gpt4v(image_path: str, api_key: str = None) -> Optional[str]:
+    """Caption an image with one sentence from the OpenAI chat completions API.
+
+    Args:
+        image_path: Path of the image file; it is sent as a base64 JPEG data URL.
+        api_key: OpenAI API key. When empty, the ``OPENAI_API_KEY`` environment
+            variable is used instead.
+
+    Returns:
+        The message content of the first choice, or ``None`` when no key is
+        available or anything fails (missing ``openai`` package, unreadable
+        file, API error).
+    """
+    import base64
+
+    key = api_key or os.environ.get("OPENAI_API_KEY")
+    if not key:
+        return None
+
+    try:
+        from openai import OpenAI
+
+        client = OpenAI(api_key=key)
+
+        # Inline the image as a data URL so no separate upload is needed.
+        with open(image_path, "rb") as image_file:
+            encoded = base64.b64encode(image_file.read()).decode()
+
+        image_part = {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{encoded}"},
+        }
+        text_part = {
+            "type": "text",
+            "text": "Describe what you see in this image in one sentence.",
+        }
+
+        completion = client.chat.completions.create(
+            model="gpt-4o",  # or gpt-4-turbo for vision
+            messages=[{"role": "user", "content": [image_part, text_part]}],
+            max_tokens=100,
+        )
+        return completion.choices[0].message.content
+    except Exception:
+        # Best effort: any failure makes the caller fall back to another source.
+        return None
+
+
+def generate_caption_fallback(
+    image_path: str, existing_data: Optional[Dict] = None
+) -> str:
+    """Build a basic caption from already-available detection metadata.
+
+    Args:
+        image_path: Path of the frame. Currently unused; kept so the signature
+            matches the other caption generators.
+        existing_data: Optional dict that may hold ``"objects"`` (dicts with a
+            ``"class"`` entry) and ``"texts"`` (dicts with a ``"text"`` entry).
+
+    Returns:
+        Up to five distinct object classes and up to three text snippets,
+        joined with ``" | "``, or a generic placeholder when neither exists.
+    """
+    data = existing_data or {}
+    caption_parts = []
+
+    # dict.fromkeys de-duplicates while keeping first-seen order, so the
+    # caption is identical from run to run (a set would reorder the names).
+    # Entries without a "class" value are skipped rather than raising.
+    class_names = (o.get("class") for o in data.get("objects") or [])
+    objects = list(dict.fromkeys(name for name in class_names if name))[:5]
+    if objects:
+        caption_parts.append(f"Contains: {', '.join(objects)}")
+
+    # Only non-empty OCR snippets are worth showing.
+    texts = [t["text"] for t in data.get("texts") or [] if t.get("text")]
+    if texts:
+        caption_parts.append(f"On-screen text: {' '.join(texts[:3])}")
+
+    if caption_parts:
+        return " | ".join(caption_parts)
+
+    return "Video frame at timestamp"
+
+
+def process_frame(
+    frame_info: Dict, yolo_data: List = None, ocr_data: List = None
+) -> Dict:
+    """Caption one extracted frame, trying the best source first.
+
+    The sources are tried in order: GPT-4V, then LLaVA, then a caption built
+    from the YOLO/OCR entries recorded for this frame's timestamp.
+
+    Args:
+        frame_info: Dict with ``"index"``, ``"timestamp"`` and ``"path"``.
+        yolo_data: Optional flat list of object dicts carrying ``"timestamp"``.
+        ocr_data: Optional flat list of text dicts carrying ``"timestamp"``.
+
+    Returns:
+        Dict with ``index``, ``timestamp``, ``caption`` and ``source``, where
+        ``source`` is ``"gpt-4v"``, ``"llava"`` or ``"metadata"``.
+    """
+    image_file = frame_info["path"]
+    timestamp = frame_info["timestamp"]
+
+    caption = generate_caption_with_gpt4v(image_file)
+    source = "gpt-4v"
+
+    if not caption:
+        caption = generate_caption_with_llava(image_file)
+        source = "llava"
+
+    if not caption:
+        # NOTE(review): entries are matched by exact timestamp equality, so
+        # detections only contribute when they were sampled at precisely the
+        # same instant as this frame - confirm that is intended.
+        context = {"objects": [], "texts": []}
+        if yolo_data:
+            context["objects"] = [
+                entry for entry in yolo_data if entry.get("timestamp") == timestamp
+            ]
+        if ocr_data:
+            context["texts"] = [
+                entry for entry in ocr_data if entry.get("timestamp") == timestamp
+            ]
+        caption = generate_caption_fallback(image_file, context)
+        source = "metadata"
+
+    return {
+        "index": frame_info["index"],
+        "timestamp": timestamp,
+        "caption": caption,
+        "source": source,
+    }
+
+
+def _load_timestamped_items(json_path: str, item_key: str) -> List[Dict]:
+    """Flatten the per-frame *item_key* lists of a detector JSON file.
+
+    Each returned item is tagged with the ``"timestamp"`` of the frame it came
+    from (0 when the frame has none). A missing file yields an empty list.
+    """
+    if not os.path.exists(json_path):
+        return []
+
+    with open(json_path, encoding="utf-8") as f:
+        payload = json.load(f)
+
+    items = []
+    for frame in payload.get("frames", []):
+        frame_timestamp = frame.get("timestamp", 0)
+        for item in frame.get(item_key, []):
+            item["timestamp"] = frame_timestamp
+            items.append(item)
+    return items
+
+
+def run_caption(
+    video_path: str, output_path: str, uuid: str = "", max_frames: int = 30
+) -> Dict:
+    """Caption sampled frames of a video and write the result as JSON.
+
+    Sibling ``<name>.yolo.json`` and ``<name>.ocr.json`` files located next to
+    *output_path* are used as fallback context when they exist, where
+    ``<name>`` is the output file name up to its first dot.
+
+    Args:
+        video_path: Path to the input video.
+        output_path: Destination of the JSON report.
+        uuid: When non-empty, progress is published through ``RedisPublisher``.
+        max_frames: Maximum number of frames to extract and caption.
+
+    Returns:
+        The report dict that was written: ``video_path``, ``total_frames``,
+        ``captions`` and a ``summary`` with per-source counts.
+    """
+    publisher = RedisPublisher(uuid) if uuid else None
+    if publisher:
+        publisher.info("caption", "CAPTION_START")
+        publisher.info("caption", "Extracting frames from video...")
+
+    frames = extract_frames(video_path, max_frames)
+    temp_dir = os.path.join(os.path.dirname(video_path), ".caption_frames")
+
+    captions = []
+    try:
+        if publisher:
+            publisher.info("caption", f"Extracted {len(frames)} frames")
+
+        # Load YOLO and OCR data for context
+        base_path = os.path.dirname(output_path)
+        uuid_name = os.path.basename(output_path).split(".")[0]
+        yolo_objects = _load_timestamped_items(
+            os.path.join(base_path, f"{uuid_name}.yolo.json"), "objects"
+        )
+        ocr_texts = _load_timestamped_items(
+            os.path.join(base_path, f"{uuid_name}.ocr.json"), "texts"
+        )
+
+        for i, frame in enumerate(frames):
+            # Report every fifth frame to keep the progress channel quiet.
+            if publisher and i % 5 == 0:
+                publisher.progress(
+                    "caption", i, len(frames), f"Frame {i + 1}/{len(frames)}"
+                )
+            captions.append(process_frame(frame, yolo_objects, ocr_texts))
+    finally:
+        # Always clean up the extracted frames, even when captioning fails,
+        # so they cannot leak into a later run.
+        for frame in frames:
+            try:
+                os.remove(frame["path"])
+            except OSError:
+                pass
+        try:
+            # Only succeeds when the directory is empty, which is intended.
+            os.rmdir(temp_dir)
+        except OSError:
+            pass
+
+    # "or ''" guards against a caption that is None rather than missing.
+    total_caption_chars = sum(len(c.get("caption") or "") for c in captions)
+
+    result = {
+        "video_path": video_path,
+        "total_frames": len(frames),
+        "captions": captions,
+        "summary": {
+            "avg_caption_length": total_caption_chars / max(len(captions), 1),
+            "gpt4v_count": sum(1 for c in captions if c.get("source") == "gpt-4v"),
+            "llava_count": sum(1 for c in captions if c.get("source") == "llava"),
+            "metadata_count": sum(1 for c in captions if c.get("source") == "metadata"),
+        },
+    }
+
+    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
+    # must be pinned to UTF-8 instead of depending on the platform locale.
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, indent=2, ensure_ascii=False)
+
+    if publisher:
+        publisher.complete("caption", f"{len(captions)} frames captioned")
+
+    return result
+
+
+if __name__ == "__main__":
+    # Command-line entry point: caption a video and report the frame count.
+    cli = argparse.ArgumentParser(description="Video Caption Generator")
+    cli.add_argument("video_path", help="Path to video file")
+    cli.add_argument("output_path", help="Output JSON path")
+    cli.add_argument("--uuid", default="", help="UUID for progress tracking")
+    cli.add_argument(
+        "--max-frames", default=30, type=int, help="Maximum frames to caption"
+    )
+
+    options = cli.parse_args()
+
+    report = run_caption(
+        options.video_path, options.output_path, options.uuid, options.max_frames
+    )
+    print(f"Caption generated: {report['total_frames']} frames")