feat: update Python processors and add utility scripts

- Update ASR, face, OCR, and pose processors
- Add release pre-flight check script
- Add synonym generation and chunk processing scripts
- Add face recognition and stamp search utilities
2026-04-30 15:07:49 +08:00
parent f4697396e4
commit 8f05a7c188
256 changed files with 60505 additions and 299 deletions
--- a/scripts/ocr_processor.py
+++ b/scripts/ocr_processor.py
@@ -1,7 +1,12 @@
 #!/opt/homebrew/bin/python3.11
 """
-OCR Processor - Text Recognition
+OCR Processor - Text Recognition with Resume Support
 Uses EasyOCR (local model)
+
+Resume Feature:
+- Auto-detect existing results and resume from last frame
+- Auto-save at configurable intervals (default: 30 seconds)
+- Ctrl+C gracefully saves and exits
 """

 import sys
@@ -9,70 +14,112 @@ import json
 import argparse
 import os
 import signal
+import time
+from datetime import datetime

 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
 from redis_publisher import RedisPublisher
+from resume_framework import ResumeFramework, format_time, print_progress


-def signal_handler(signum, frame):
-    print(f"OCR: Received signal {signum}, exiting...")
-    sys.exit(1)
+def process_ocr(
+    video_path: str,
+    output_path: str,
+    uuid: str = "",
+    auto_save_interval: int = 30,
+    auto_save_frames: int = 300,
+    force_restart: bool = False,
+    sample_interval: int = 30,
+):
+    """Process video for OCR using EasyOCR with resume support"""

+    framework = ResumeFramework(
+        output_path=output_path,
+        processor_name="ocr",
+        uuid=uuid,
+        auto_save_interval=auto_save_interval,
+        auto_save_frames=auto_save_frames,
+        force_restart=force_restart,
+    )

-def process_ocr(video_path: str, output_path: str, uuid: str = ""):
-    """Process video for OCR using EasyOCR"""
-
-    # Set up signal handlers
-    signal.signal(signal.SIGTERM, signal_handler)
-    signal.signal(signal.SIGINT, signal_handler)
-
-    publisher = RedisPublisher(uuid) if uuid else None
-    if publisher:
-        publisher.info("ocr", "OCR_START")
+    framework.publish_info("OCR_START")

    try:
        import easyocr
    except ImportError:
-        if publisher:
-            publisher.error("ocr", "easyocr not installed")
-        result = {"frame_count": 0, "fps": 0.0, "frames": []}
-        if publisher:
-            publisher.complete("ocr", "0 frames")
+        framework.publish_error("easyocr not installed")
+        result = {
+            "metadata": {"status": "error", "error": "easyocr not installed"},
+            "frames": {},
+        }
        with open(output_path, "w") as f:
            json.dump(result, f, indent=2)
+        framework.publish_progress(0, 0, "0 frames")
        return result

-    if publisher:
-        publisher.info("ocr", "OCR_LOADING_MODEL")
+    framework.publish_info("OCR_LOADING_MODEL")

-    # Load EasyOCR reader
-    # languages: add more like 'fr', 'de', 'ja', 'ko', etc.
-    # gpu: set to True if GPU available
    reader = easyocr.Reader(["en"], gpu=False, verbose=False)

-    if publisher:
-        publisher.info("ocr", "OCR_MODEL_LOADED")
+    framework.publish_info("OCR_MODEL_LOADED")

-    # Get video info
    import cv2

    cap = cv2.VideoCapture(video_path)
+
+    if not cap.isOpened():
+        print(f"Error: Cannot open video: {video_path}")
+        return {"metadata": {"status": "error"}, "frames": {}}
+
    fps = cap.get(cv2.CAP_PROP_FPS)
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    total_duration = total_frames / fps if fps > 0 else 0
    cap.release()

-    if publisher:
-        publisher.info("ocr", f"fps={fps}, frames={total_frames}")
-        publisher.progress("ocr", 0, total_frames, "Starting")
+    framework.publish_info(f"fps={fps}, frames={total_frames}")

-    # Process every N frames to speed up
-    sample_interval = 30  # Process every 30 frames
+    existing_data, last_checkpoint = framework.load_existing_data()
+    resume_mode = existing_data is not None and last_checkpoint > 0 and not force_restart

-    frames = []
-    frame_count = 0
-    processed = 0
+    if resume_mode:
+        print(f"\nFound existing data: {output_path}")
+        print(f"Last processed frame: {last_checkpoint}")
+        print(f"Will resume from frame {last_checkpoint + 1}")

-    cap = cv2.VideoCapture(video_path)
+    if resume_mode and existing_data:
+        ocr_data = existing_data
+        frame_count = last_checkpoint
+        processed_frames = set(int(k) for k in existing_data.get("frames", {}).keys())
+        cap = cv2.VideoCapture(video_path)
+        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
+    else:
+        ocr_data = {
+            "metadata": framework.init_metadata(
+                video_path=video_path,
+                fps=fps,
+                width=width,
+                height=height,
+                total_frames=total_frames,
+                total_duration=total_duration,
+                extra={"sample_interval": sample_interval},
+            ),
+            "frames": {},
+        }
+        frame_count = 0
+        processed_frames = set()
+        cap = cv2.VideoCapture(video_path)
+
+    framework.set_data(ocr_data)
+
+    start_time = time.time()
+    framework.last_save_time = start_time
+
+    print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
+    print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
+    print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
+    print()

    while True:
        ret, frame = cap.read()
@@ -80,25 +127,22 @@ def process_ocr(video_path: str, output_path: str, uuid: str = ""):
            break

        frame_count += 1
+        current_time = (frame_count - 1) / fps if fps > 0 else 0
+
+        if frame_count in processed_frames:
+            continue

-        # Sample frames
        if frame_count % sample_interval != 0:
            continue

-        processed += 1
-        timestamp = (frame_count - 1) / fps if fps > 0 else 0
-
-        # Convert BGR to RGB
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

-        # Run OCR
        try:
            detections = reader.readtext(
                frame_rgb, text_threshold=0.5, low_text=0.3, link_threshold=0.3
            )
        except Exception as e:
-            if publisher:
-                publisher.error("ocr", f"Frame {frame_count}: {e}")
+            framework.publish_error(f"Frame {frame_count}: {e}")
            detections = []

        texts = []
@@ -110,8 +154,8 @@ def process_ocr(video_path: str, output_path: str, uuid: str = ""):

            x = int(min(float(p[0]) for p in bbox))
            y = int(min(float(p[1]) for p in bbox))
-            width = int(max(float(p[0]) for p in bbox) - x)
-            height = int(max(float(p[1]) for p in bbox) - y)
+            w = int(max(float(p[0]) for p in bbox) - x)
+            h = int(max(float(p[1]) for p in bbox) - y)

            if text.strip():
                texts.append(
@@ -119,47 +163,84 @@ def process_ocr(video_path: str, output_path: str, uuid: str = ""):
                        "text": text,
                        "x": x,
                        "y": y,
-                        "width": width,
-                        "height": height,
+                        "width": w,
+                        "height": h,
                        "confidence": confidence,
                    }
                )

-        # Only add frames with text
        if texts:
-            frames.append(
-                {
-                    "frame": frame_count - 1,
-                    "timestamp": round(timestamp, 3),
-                    "texts": texts,
-                }
-            )
-            if publisher:
-                publisher.progress(
-                    "ocr",
-                    processed,
-                    total_frames // sample_interval,
-                    f"Frame {frame_count}",
-                )
+            ocr_data["frames"][str(frame_count)] = {
+                "frame_number": frame_count,
+                "time_seconds": round(current_time, 3),
+                "time_formatted": format_time(current_time),
+                "texts": texts,
+            }
+            processed_frames.add(frame_count)
+
+        if frame_count % 500 == 0:
+            elapsed = time.time() - start_time
+            print_progress(frame_count, total_frames, elapsed, f"{len(texts)} texts")
+            framework.publish_progress(frame_count, total_frames, f"frame {frame_count}")
+
+        if framework.should_auto_save(frame_count):
+            framework.save_progress(frame_count, silent=True)

    cap.release()

-    result = {"frame_count": total_frames, "fps": fps, "frames": frames}
+    total_processed = len(processed_frames)

-    with open(output_path, "w") as f:
-        json.dump(result, f, indent=2)
+    framework.finalize(
+        total_processed=total_processed,
+        extra_metadata={"sample_interval": sample_interval},
+    )

-    if publisher:
-        publisher.complete("ocr", f"{len(frames)} frames with text")
+    print(f"\nOCR completed: {total_processed} frames processed")
+    print(f"Frames with text: {len(ocr_data['frames'])}")

-    return result
+    return ocr_data


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="OCR Text Recognition")
+    parser = argparse.ArgumentParser(description="OCR Text Recognition with Resume Support")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
+    parser.add_argument(
+        "--auto-save-interval",
+        "-a",
+        help="Auto-save interval in seconds",
+        type=int,
+        default=30,
+    )
+    parser.add_argument(
+        "--auto-save-frames",
+        "-f",
+        help="Auto-save interval in frames",
+        type=int,
+        default=300,
+    )
+    parser.add_argument(
+        "--force-restart",
+        "-r",
+        help="Force restart (ignore existing data)",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--sample-interval",
+        "-s",
+        help="Frame sample interval",
+        type=int,
+        default=30,
+    )
    args = parser.parse_args()

-    process_ocr(args.video_path, args.output_path, args.uuid)
+    process_ocr(
+        args.video_path,
+        args.output_path,
+        args.uuid,
+        args.auto_save_interval,
+        args.auto_save_frames,
+        args.force_restart,
+        args.sample_interval,
+    )