# momentry_core/scripts/ocr_processor.py

#!/opt/homebrew/bin/python3.11
"""
OCR Processor - Text Recognition
Uses EasyOCR (local model)
"""

import sys
import json
import argparse
import os
import signal

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher


def signal_handler(signum, frame):
    """Announce the received signal on stdout and terminate with exit status 1.

    Args:
        signum: Number of the signal that was delivered.
        frame: Current stack frame supplied by the signal machinery (unused).

    Raises:
        SystemExit: Always, with exit code 1.
    """
    notice = f"OCR: Received signal {signum}, exiting..."
    print(notice)
    raise SystemExit(1)


def _write_result(output_path: str, result: dict) -> None:
    """Serialize *result* as indented JSON to *output_path*."""
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)


def _report_error(publisher, message: str) -> None:
    """Send *message* to the publisher, or to stderr when there is none."""
    if publisher:
        publisher.error("ocr", message)
    else:
        print(f"OCR: {message}", file=sys.stderr)


def _detection_to_entry(detection) -> "dict | None":
    """Convert one (bbox, text, confidence) detection into an output entry.

    Returns None when the recognized text is empty or whitespace only.
    """
    bbox, text, confidence = tuple(detection)[:3]
    text = str(text)
    if not text.strip():
        return None

    # Collapse the polygon's corner points into an axis-aligned box.
    xs = [float(point[0]) for point in bbox]
    ys = [float(point[1]) for point in bbox]
    x = int(min(xs))
    y = int(min(ys))
    return {
        "text": text,
        "x": x,
        "y": y,
        "width": int(max(xs) - x),
        "height": int(max(ys) - y),
        "confidence": float(confidence),
    }


def process_ocr(
    video_path: str, output_path: str, uuid: str = "", sample_interval: int = 30
) -> dict:
    """Run EasyOCR over sampled frames of a video and write the result as JSON.

    Every ``sample_interval``-th frame that is read (the 30th, 60th, ... with
    the default) is converted to RGB and passed to EasyOCR. Frames in which
    non-blank text was found are recorded with their zero-based frame index,
    timestamp in seconds (rounded to 3 decimals) and one bounding-box entry
    per detection.

    Installs SIGTERM/SIGINT handlers as a side effect. When ``uuid`` is
    non-empty, status, progress and errors are sent through a
    ``RedisPublisher``; otherwise errors are printed to stderr.

    Args:
        video_path: Path of the video file to scan.
        output_path: Path of the JSON file to write.
        uuid: Identifier passed to ``RedisPublisher``; empty disables publishing.
        sample_interval: Run OCR on every N-th frame. Must be at least 1.

    Returns:
        The dict that was written to ``output_path``, with keys
        ``frame_count``, ``fps`` and ``frames``. If easyocr is not installed
        or the video cannot be opened, the result contains no frames.

    Raises:
        ValueError: If ``sample_interval`` is smaller than 1.
    """
    if sample_interval < 1:
        raise ValueError("sample_interval must be >= 1")

    # Set up signal handlers
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("ocr", "OCR_START")

    try:
        import easyocr
    except ImportError:
        _report_error(publisher, "easyocr not installed")
        result = {"frame_count": 0, "fps": 0.0, "frames": []}
        # Write the file before announcing completion so that a consumer
        # reacting to the completion message always finds the output.
        _write_result(output_path, result)
        if publisher:
            publisher.complete("ocr", "0 frames")
        return result

    if publisher:
        publisher.info("ocr", "OCR_LOADING_MODEL")

    # Load EasyOCR reader
    # languages: add more like 'fr', 'de', 'ja', 'ko', etc.
    # gpu: set to True if GPU available
    reader = easyocr.Reader(["en"], gpu=False, verbose=False)

    if publisher:
        publisher.info("ocr", "OCR_MODEL_LOADED")

    import cv2

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Keep going: the loop below reads nothing and an empty result is
            # written, but the failure is no longer silent.
            _report_error(publisher, f"Cannot open video: {video_path}")

        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        expected_samples = total_frames // sample_interval

        if publisher:
            publisher.info("ocr", f"fps={fps}, frames={total_frames}")
            publisher.progress("ocr", 0, total_frames, "Starting")

        frame_count = 0
        processed = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame_count += 1

            # Sample frames
            if frame_count % sample_interval != 0:
                continue

            processed += 1
            timestamp = (frame_count - 1) / fps if fps > 0 else 0

            # Convert BGR to RGB
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Run OCR; a failure on one frame must not abort the whole video.
            try:
                detections = reader.readtext(
                    frame_rgb, text_threshold=0.5, low_text=0.3, link_threshold=0.3
                )
            except Exception as e:
                _report_error(publisher, f"Frame {frame_count}: {e}")
                detections = []

            texts = [
                entry
                for entry in map(_detection_to_entry, detections)
                if entry is not None
            ]

            # Only add frames with text
            if texts:
                frames.append(
                    {
                        "frame": frame_count - 1,
                        "timestamp": round(timestamp, 3),
                        "texts": texts,
                    }
                )

            # Report progress for every sampled frame, not only those with
            # text, so progress keeps moving through text-free stretches.
            if publisher:
                publisher.progress(
                    "ocr",
                    processed,
                    expected_samples,
                    f"Frame {frame_count}",
                )
    finally:
        # Release the capture even if decoding or parsing raised.
        cap.release()

    result = {"frame_count": total_frames, "fps": fps, "frames": frames}

    _write_result(output_path, result)

    if publisher:
        publisher.complete("ocr", f"{len(frames)} frames with text")

    return result


if __name__ == "__main__":
    # Command-line entry point: two positional paths plus an optional UUID.
    cli = argparse.ArgumentParser(description="OCR Text Recognition")
    for positional, description in (
        ("video_path", "Path to video file"),
        ("output_path", "Output JSON path"),
    ):
        cli.add_argument(positional, help=description)
    cli.add_argument("--uuid", "-u", default="", help="UUID for Redis progress")
    options = cli.parse_args()

    process_ocr(options.video_path, options.output_path, options.uuid)