chore: backup before migration to new repo

This commit is contained in:
Warren
2026-04-23 16:46:02 +08:00
parent 13dd3b30f3
commit 59809dae1f
40 changed files with 5566 additions and 1783 deletions

View File

@@ -1,7 +1,8 @@
#!/opt/homebrew/bin/python3.11
"""
Caption Processor - Generate image captions
Uses AI vision models to analyze video frames and generate descriptions
Caption Processor - Generate image captions (LOCAL ONLY)
Uses Moondream2 (local VLM) for image captioning
No cloud API calls - fully offline processing
"""
import sys
@@ -18,7 +19,6 @@ from redis_publisher import RedisPublisher
def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
"""Extract frames from video at regular intervals"""
# Get video duration
cmd = [
"ffprobe",
"-v",
@@ -34,14 +34,13 @@ def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
data = json.loads(result.stdout)
duration = float(data.get("format", {}).get("duration", 0))
else:
duration = 60 # Default fallback
duration = 60
except Exception:
duration = 60
if duration <= 0:
duration = 60
# Calculate frame interval
interval = max(duration / max_frames, 1.0)
frames = []
@@ -76,94 +75,73 @@ def extract_frames(video_path: str, max_frames: int = 30) -> List[Dict]:
return frames
def generate_caption_with_llava(
def generate_caption_with_moondream(
image_path: str, prompt: str = "Describe this image in detail."
) -> Optional[str]:
"""Generate caption using LLaVA model"""
"""Generate caption using Moondream2 (local VLM)"""
try:
# Try to use transformers with LLaVA
from transformers import AutoProcessor, AutoModelForVision2Seq # noqa: F401
import torch # noqa: F401
from PIL import Image # noqa: F401
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import torch
# Note: This requires llava-hf/llava-1.5-7b-hf or similar
# For now, return a placeholder
return f"[LLaVA caption for {os.path.basename(image_path)}]"
model_id = "vikhyatk/moondream2"
revision = "2025-01-09"
tokenizer = AutoTokenizer.from_pretrained(
model_id, revision=revision, trust_remote_code=True
)
moondream = AutoModelForCausalLM.from_pretrained(
model_id,
revision=revision,
trust_remote_code=True,
torch_dtype=torch.float16,
).to("mps" if torch.backends.mps.is_available() else "cpu")
moondream.eval()
image = Image.open(image_path)
enc_image = moondream.encode_image(image)
caption = moondream.answer_question(enc_image, prompt, tokenizer)
return caption if caption else None
except ImportError:
return None
def generate_caption_with_gpt4v(image_path: str, api_key: str = None) -> Optional[str]:
"""Generate caption using GPT-4V via OpenAI API"""
import base64
if not api_key:
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
return None
try:
from openai import OpenAI
client = OpenAI(api_key=api_key)
# Encode image
with open(image_path, "rb") as f:
img_data = base64.b64encode(f.read()).decode()
response = client.chat.completions.create(
model="gpt-4o", # or gpt-4-turbo for vision
messages=[
{
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {"url": f"data:image/jpeg;base64,{img_data}"},
},
{
"type": "text",
"text": "Describe what you see in this image in one sentence.",
},
],
}
],
max_tokens=100,
)
return response.choices[0].message.content
except Exception:
except Exception as e:
print(f"[CAPTION] Moondream error: {e}")
return None
def generate_caption_fallback(image_path: str, existing_data: Dict = None) -> str:
"""Generate a basic caption using available metadata"""
def generate_caption_from_metadata(image_path: str, existing_data: Dict = None) -> str:
"""Generate caption using YOLO/OCR metadata (fallback)"""
caption_parts = []
# Check YOLO data for objects
if existing_data and existing_data.get("objects"):
objects = list(set([o["class"] for o in existing_data["objects"]]))[:5]
if objects:
caption_parts.append(f"Contains: {', '.join(objects)}")
caption_parts.append(f"Objects: {', '.join(objects)}")
# Check OCR data for text
if existing_data and existing_data.get("texts"):
texts = [t["text"] for t in existing_data["texts"] if t.get("text")]
if texts:
caption_parts.append(f"On-screen text: {' '.join(texts[:3])}")
caption_parts.append(f"Text: {' '.join(texts[:3])}")
if existing_data and existing_data.get("scene_type"):
caption_parts.append(f"Scene: {existing_data['scene_type']}")
if caption_parts:
return " | ".join(caption_parts)
return "Video frame at timestamp"
return "Video frame"
def process_frame(
frame_info: Dict, yolo_data: List = None, ocr_data: List = None
frame_info: Dict,
yolo_data: List = None,
ocr_data: List = None,
scene_data: Dict = None,
) -> Dict:
"""Process a single frame and generate caption"""
"""Process a single frame and generate caption (LOCAL ONLY)"""
frame_path = frame_info["path"]
timestamp = frame_info["timestamp"]
@@ -171,28 +149,34 @@ def process_frame(
caption = None
source = "unknown"
# Try GPT-4V first
caption = generate_caption_with_gpt4v(frame_path)
# Try Moondream2 (local VLM)
caption = generate_caption_with_moondream(frame_path)
if caption:
source = "gpt-4v"
source = "moondream2"
else:
# Try LLaVA
caption = generate_caption_with_llava(frame_path)
if caption:
source = "llava"
else:
# Use fallback with YOLO/OCR data
combined_data = {"objects": [], "texts": []}
if yolo_data:
combined_data["objects"] = [
o for o in yolo_data if o.get("timestamp") == timestamp
]
if ocr_data:
combined_data["texts"] = [
t for t in ocr_data if t.get("timestamp") == timestamp
]
caption = generate_caption_fallback(frame_path, combined_data)
source = "metadata"
# Fallback: Use metadata from YOLO/OCR/Scene
combined_data = {"objects": [], "texts": [], "scene_type": ""}
if yolo_data:
combined_data["objects"] = [
o for o in yolo_data if o.get("timestamp") == timestamp
]
if ocr_data:
combined_data["texts"] = [
t for t in ocr_data if t.get("timestamp") == timestamp
]
if scene_data:
for scene in scene_data.get("scenes", []):
if scene.get("start_time", 0) <= timestamp <= scene.get("end_time", 0):
combined_data["scene_type"] = scene.get(
"scene_type_zh"
) or scene.get("scene_type", "")
break
caption = generate_caption_from_metadata(frame_path, combined_data)
source = "metadata"
return {
"index": frame_info["index"],
@@ -212,24 +196,22 @@ def run_caption(
if publisher:
publisher.info("caption", "Extracting frames from video...")
# Extract frames
frames = extract_frames(video_path, max_frames)
if publisher:
publisher.info("caption", f"Extracted {len(frames)} frames")
# Load YOLO and OCR data for context
base_path = os.path.dirname(output_path)
uuid_name = os.path.basename(output_path).split(".")[0]
yolo_objects = []
ocr_texts = []
scene_info = {}
yolo_path = os.path.join(base_path, f"{uuid_name}.yolo.json")
if os.path.exists(yolo_path):
with open(yolo_path) as f:
yolo_data = json.load(f)
# Flatten objects from all frames
for frame in yolo_data.get("frames", []):
for obj in frame.get("objects", []):
obj["timestamp"] = frame.get("timestamp", 0)
@@ -244,7 +226,11 @@ def run_caption(
text["timestamp"] = frame.get("timestamp", 0)
ocr_texts.append(text)
# Process each frame
scene_path = os.path.join(base_path, f"{uuid_name}.scene.json")
if os.path.exists(scene_path):
with open(scene_path) as f:
scene_info = json.load(f)
captions = []
for i, frame in enumerate(frames):
if publisher and i % 5 == 0:
@@ -252,16 +238,14 @@ def run_caption(
"caption", i, len(frames), f"Frame {i + 1}/{len(frames)}"
)
caption_data = process_frame(frame, yolo_objects, ocr_texts)
caption_data = process_frame(frame, yolo_objects, ocr_texts, scene_info)
captions.append(caption_data)
# Cleanup temp frame
try:
os.remove(frame["path"])
except Exception:
pass
# Cleanup temp directory
temp_dir = os.path.join(os.path.dirname(video_path), ".caption_frames")
try:
os.rmdir(temp_dir)
@@ -275,9 +259,11 @@ def run_caption(
"summary": {
"avg_caption_length": sum(len(c.get("caption", "")) for c in captions)
/ max(len(captions), 1),
"gpt4v_count": sum(1 for c in captions if c.get("source") == "gpt-4v"),
"llava_count": sum(1 for c in captions if c.get("source") == "llava"),
"moondream_count": sum(
1 for c in captions if c.get("source") == "moondream2"
),
"metadata_count": sum(1 for c in captions if c.get("source") == "metadata"),
"cloud_api_count": 0,
},
}
@@ -285,13 +271,13 @@ def run_caption(
json.dump(result, f, indent=2, ensure_ascii=False)
if publisher:
publisher.complete("caption", f"{len(captions)} frames captioned")
publisher.complete("caption", f"{len(captions)} frames captioned (LOCAL)")
return result
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Video Caption Generator")
parser = argparse.ArgumentParser(description="Video Caption Generator (LOCAL ONLY)")
parser.add_argument("video_path", help="Path to video file")
parser.add_argument("output_path", help="Output JSON path")
parser.add_argument("--uuid", help="UUID for progress tracking", default="")
@@ -302,4 +288,4 @@ if __name__ == "__main__":
args = parser.parse_args()
result = run_caption(args.video_path, args.output_path, args.uuid, args.max_frames)
print(f"Caption generated: {result['total_frames']} frames")
print(f"Caption generated: {result['total_frames']} frames (LOCAL)")