chore: backup before migration to new repo

2026-04-23 16:46:02 +08:00
parent 13dd3b30f3
commit 59809dae1f
40 changed files with 5566 additions and 1783 deletions
--- a/scripts/caption_processor.py
+++ b/scripts/caption_processor.py
@@ -1,7 +1,8 @@
 #!/opt/homebrew/bin/python3.11
 """
-Caption Processor - Generate image captions
-Uses AI vision models to analyze video frames and generate descriptions
+Caption Processor - Generate image captions (LOCAL ONLY)
+Uses Moondream2 (local VLM) for image captioning
+No cloud inference API calls - runs locally (model weights download from Hugging Face Hub on first use)
 """

 import sys
@@ -18,7 +19,6 @@ from redis_publisher import RedisPublisher
 def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
    """Extract frames from video at regular intervals"""

-    # Get video duration
    cmd = [
        "ffprobe",
        "-v",
@@ -34,14 +34,13 @@ def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
            data = json.loads(result.stdout)
            duration = float(data.get("format", {}).get("duration", 0))
        else:
-            duration = 60  # Default fallback
+            duration = 60
    except Exception:
        duration = 60

    if duration <= 0:
        duration = 60

-    # Calculate frame interval
    interval = max(duration / max_frames, 1.0)

    frames = []
@@ -76,94 +75,73 @@ def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
    return frames


-def generate_caption_with_llava(
+def generate_caption_with_moondream(
    image_path: str, prompt: str = "Describe this image in detail."
 ) -> Optional[str]:
-    """Generate caption using LLaVA model"""
+    """Generate caption using Moondream2 (local VLM)"""
    try:
-        # Try to use transformers with LLaVA
-        from transformers import AutoProcessor, AutoModelForVision2Seq  # noqa: F401
-        import torch  # noqa: F401
-        from PIL import Image  # noqa: F401
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+        from PIL import Image
+        import torch

-        # Note: This requires llava-hf/llava-1.5-7b-hf or similar
-        # For now, return a placeholder
-        return f"[LLaVA caption for {os.path.basename(image_path)}]"
+        model_id = "vikhyatk/moondream2"
+        revision = "2025-01-09"
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_id, revision=revision, trust_remote_code=True
+        )
+        moondream = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            revision=revision,
+            trust_remote_code=True,
+            torch_dtype=torch.float16,
+        ).to("mps" if torch.backends.mps.is_available() else "cpu")
+
+        moondream.eval()
+
+        image = Image.open(image_path)
+        enc_image = moondream.encode_image(image)
+        caption = moondream.answer_question(enc_image, prompt, tokenizer)
+
+        return caption if caption else None
    except ImportError:
        return None
-
-
-def generate_caption_with_gpt4v(image_path: str, api_key: str = None) -> Optional[str]:
-    """Generate caption using GPT-4V via OpenAI API"""
-    import base64
-
-    if not api_key:
-        api_key = os.environ.get("OPENAI_API_KEY")
-
-    if not api_key:
-        return None
-
-    try:
-        from openai import OpenAI
-
-        client = OpenAI(api_key=api_key)
-
-        # Encode image
-        with open(image_path, "rb") as f:
-            img_data = base64.b64encode(f.read()).decode()
-
-        response = client.chat.completions.create(
-            model="gpt-4o",  # or gpt-4-turbo for vision
-            messages=[
-                {
-                    "role": "user",
-                    "content": [
-                        {
-                            "type": "image_url",
-                            "image_url": {"url": f"data:image/jpeg;base64,{img_data}"},
-                        },
-                        {
-                            "type": "text",
-                            "text": "Describe what you see in this image in one sentence.",
-                        },
-                    ],
-                }
-            ],
-            max_tokens=100,
-        )
-
-        return response.choices[0].message.content
-    except Exception:
+    except Exception as e:
+        print(f"[CAPTION] Moondream error: {e}")
        return None


-def generate_caption_fallback(image_path: str, existing_data: Dict = None) -> str:
-    """Generate a basic caption using available metadata"""
+def generate_caption_from_metadata(image_path: str, existing_data: Dict = None) -> str:
+    """Generate caption using YOLO/OCR/scene metadata (fallback)"""

    caption_parts = []

-    # Check YOLO data for objects
    if existing_data and existing_data.get("objects"):
        objects = list(set([o["class"] for o in existing_data["objects"]]))[:5]
        if objects:
-            caption_parts.append(f"Contains: {', '.join(objects)}")
+            caption_parts.append(f"Objects: {', '.join(objects)}")

-    # Check OCR data for text
    if existing_data and existing_data.get("texts"):
        texts = [t["text"] for t in existing_data["texts"] if t.get("text")]
        if texts:
-            caption_parts.append(f"On-screen text: {' '.join(texts[:3])}")
+            caption_parts.append(f"Text: {' '.join(texts[:3])}")
+
+    if existing_data and existing_data.get("scene_type"):
+        caption_parts.append(f"Scene: {existing_data['scene_type']}")

    if caption_parts:
        return " | ".join(caption_parts)

-    return "Video frame at timestamp"
+    return "Video frame"


 def process_frame(
-    frame_info: Dict, yolo_data: List = None, ocr_data: List = None
+    frame_info: Dict,
+    yolo_data: List = None,
+    ocr_data: List = None,
+    scene_data: Dict = None,
 ) -> Dict:
-    """Process a single frame and generate caption"""
+    """Process a single frame and generate caption (LOCAL ONLY)"""

    frame_path = frame_info["path"]
    timestamp = frame_info["timestamp"]
@@ -171,28 +149,34 @@ def process_frame(
    caption = None
    source = "unknown"

-    # Try GPT-4V first
-    caption = generate_caption_with_gpt4v(frame_path)
+    # Try Moondream2 (local VLM)
+    caption = generate_caption_with_moondream(frame_path)
    if caption:
-        source = "gpt-4v"
+        source = "moondream2"
    else:
-        # Try LLaVA
-        caption = generate_caption_with_llava(frame_path)
-        if caption:
-            source = "llava"
-        else:
-            # Use fallback with YOLO/OCR data
-            combined_data = {"objects": [], "texts": []}
-            if yolo_data:
-                combined_data["objects"] = [
-                    o for o in yolo_data if o.get("timestamp") == timestamp
-                ]
-            if ocr_data:
-                combined_data["texts"] = [
-                    t for t in ocr_data if t.get("timestamp") == timestamp
-                ]
-            caption = generate_caption_fallback(frame_path, combined_data)
-            source = "metadata"
+        # Fallback: Use metadata from YOLO/OCR/Scene
+        combined_data = {"objects": [], "texts": [], "scene_type": ""}
+
+        if yolo_data:
+            combined_data["objects"] = [
+                o for o in yolo_data if o.get("timestamp") == timestamp
+            ]
+
+        if ocr_data:
+            combined_data["texts"] = [
+                t for t in ocr_data if t.get("timestamp") == timestamp
+            ]
+
+        if scene_data:
+            for scene in scene_data.get("scenes", []):
+                if scene.get("start_time", 0) <= timestamp <= scene.get("end_time", 0):
+                    combined_data["scene_type"] = scene.get(
+                        "scene_type_zh"
+                    ) or scene.get("scene_type", "")
+                    break
+
+        caption = generate_caption_from_metadata(frame_path, combined_data)
+        source = "metadata"

    return {
        "index": frame_info["index"],
@@ -212,24 +196,22 @@ def run_caption(
    if publisher:
        publisher.info("caption", "Extracting frames from video...")

-    # Extract frames
    frames = extract_frames(video_path, max_frames)

    if publisher:
        publisher.info("caption", f"Extracted {len(frames)} frames")

-    # Load YOLO and OCR data for context
    base_path = os.path.dirname(output_path)
    uuid_name = os.path.basename(output_path).split(".")[0]

    yolo_objects = []
    ocr_texts = []
+    scene_info = {}

    yolo_path = os.path.join(base_path, f"{uuid_name}.yolo.json")
    if os.path.exists(yolo_path):
        with open(yolo_path) as f:
            yolo_data = json.load(f)
-            # Flatten objects from all frames
            for frame in yolo_data.get("frames", []):
                for obj in frame.get("objects", []):
                    obj["timestamp"] = frame.get("timestamp", 0)
@@ -244,7 +226,11 @@ def run_caption(
                    text["timestamp"] = frame.get("timestamp", 0)
                    ocr_texts.append(text)

-    # Process each frame
+    scene_path = os.path.join(base_path, f"{uuid_name}.scene.json")
+    if os.path.exists(scene_path):
+        with open(scene_path) as f:
+            scene_info = json.load(f)
+
    captions = []
    for i, frame in enumerate(frames):
        if publisher and i % 5 == 0:
@@ -252,16 +238,14 @@ def run_caption(
                "caption", i, len(frames), f"Frame {i + 1}/{len(frames)}"
            )

-        caption_data = process_frame(frame, yolo_objects, ocr_texts)
+        caption_data = process_frame(frame, yolo_objects, ocr_texts, scene_info)
        captions.append(caption_data)

-        # Cleanup temp frame
        try:
            os.remove(frame["path"])
        except Exception:
            pass

-    # Cleanup temp directory
    temp_dir = os.path.join(os.path.dirname(video_path), ".caption_frames")
    try:
        os.rmdir(temp_dir)
@@ -275,9 +259,11 @@ def run_caption(
        "summary": {
            "avg_caption_length": sum(len(c.get("caption", "")) for c in captions)
            / max(len(captions), 1),
-            "gpt4v_count": sum(1 for c in captions if c.get("source") == "gpt-4v"),
-            "llava_count": sum(1 for c in captions if c.get("source") == "llava"),
+            "moondream_count": sum(
+                1 for c in captions if c.get("source") == "moondream2"
+            ),
            "metadata_count": sum(1 for c in captions if c.get("source") == "metadata"),
+            "cloud_api_count": 0,
        },
    }

@@ -285,13 +271,13 @@ def run_caption(
        json.dump(result, f, indent=2, ensure_ascii=False)

    if publisher:
-        publisher.complete("caption", f"{len(captions)} frames captioned")
+        publisher.complete("caption", f"{len(captions)} frames captioned (LOCAL)")

    return result


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Video Caption Generator")
+    parser = argparse.ArgumentParser(description="Video Caption Generator (LOCAL ONLY)")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", help="UUID for progress tracking", default="")
@@ -302,4 +288,4 @@ if __name__ == "__main__":
    args = parser.parse_args()

    result = run_caption(args.video_path, args.output_path, args.uuid, args.max_frames)
-    print(f"Caption generated: {result['total_frames']} frames")
+    print(f"Caption generated: {result['total_frames']} frames (LOCAL)")