feat: media API (video/bbox/thumbnail), UUID unification, dot matrix text, portal fixes, API dictionary V1.3

This commit is contained in:
Warren
2026-05-06 13:34:49 +08:00
parent e75c4d6f07
commit 74b6182eba
197 changed files with 17511 additions and 8759 deletions

View File

@@ -1,243 +1,89 @@
#!/opt/homebrew/bin/python3.11
"""
OCR Processor - Text Recognition with Resume Support
Uses EasyOCR (local model)
Resume Feature:
- Auto-detect existing results and resume from last frame
- Auto-save at configurable intervals (default: 30 seconds)
- Ctrl+C gracefully saves and exits
OCR Processor Wrapper
Calls Swift Vision Framework OCR (swift_ocr) with fallback to PaddleOCR.
"""
import sys
import json
import argparse
import os
import time
import subprocess
import argparse
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from resume_framework import ResumeFramework, format_time, print_progress
# Directory that contains this script; both candidate locations of the
# swift_ocr binary are resolved relative to it.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Primary location of the swift_ocr binary.
SWIFT_OCR_PATH = os.path.join(
    _SCRIPT_DIR,
    "swift_processors/.build/debug/swift_ocr",
)

# Alternate location of the swift_ocr binary (arm64-apple-macosx build dir).
SWIFT_OCR_ALT = os.path.join(
    _SCRIPT_DIR,
    "swift_processors/.build/arm64-apple-macosx/debug/swift_ocr",
)
def process_ocr(
    video_path: str,
    output_path: str,
    uuid: str = "",
    sample_interval: int = 30,
    recognition_level: str = "accurate",
) -> dict:
    """Run OCR on a video via the Swift binary, falling back to PaddleOCR.

    The swift_ocr binary is looked up at SWIFT_OCR_PATH and then at
    SWIFT_OCR_ALT. It is invoked with the video path, the output path and
    the sampling / recognition / uuid options, and is expected to write its
    result as JSON to ``output_path``.

    Args:
        video_path: Path of the video file passed to the OCR backend.
        output_path: Path of the JSON file the backend writes.
        uuid: Value forwarded to the backend as ``--uuid``.
        sample_interval: Value forwarded as ``--sample-interval``.
        recognition_level: Value forwarded as ``--recognition-level``.

    Returns:
        The parsed JSON written by swift_ocr, or whatever ``_fallback``
        returns when the binary is missing, cannot be run, times out,
        exits non-zero, or leaves no readable JSON at ``output_path``.
    """
    swift_bin = next(
        (path for path in (SWIFT_OCR_PATH, SWIFT_OCR_ALT) if os.path.exists(path)),
        None,
    )
    if swift_bin is None:
        print("[OCR] Swift binary not found, using PaddleOCR", file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)

    cmd = [
        swift_bin, video_path, output_path,
        "--sample-interval", str(sample_interval),
        "--recognition-level", recognition_level,
        "--uuid", uuid,
    ]
    print("[OCR] Running Swift OCR", file=sys.stderr)
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=7200)
    except (subprocess.TimeoutExpired, OSError) as exc:
        # A hung or non-executable binary is a Swift failure like any other:
        # take the same fallback path instead of crashing the caller.
        print(f"[OCR] Swift OCR could not complete ({exc}), falling back to PaddleOCR",
              file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)

    # Relay the child's output on stderr so stdout stays free for the caller.
    if result.stdout:
        print(result.stdout.strip(), file=sys.stderr)
    if result.stderr:
        print(result.stderr.strip(), file=sys.stderr)

    if result.returncode != 0 or not os.path.exists(output_path):
        print("[OCR] Swift OCR failed, falling back to PaddleOCR", file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)

    try:
        with open(output_path, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        print(f"[OCR] Swift OCR output unreadable ({exc}), falling back to PaddleOCR",
              file=sys.stderr)
        return _fallback(video_path, output_path, uuid, sample_interval)
def _fallback(video_path, output_path, uuid, sample_interval):
"""Fallback to original PaddleOCR implementation"""
import importlib
spec = importlib.util.spec_from_file_location(
"paddle_ocr",
os.path.join(os.path.dirname(__file__), "ocr_paddle.py")
)
framework.publish_info("OCR_START")
try:
import easyocr
except ImportError:
framework.publish_error("easyocr not installed")
result = {
"metadata": {"status": "error", "error": "easyocr not installed"},
"frames": {},
}
with open(output_path, "w") as f:
json.dump(result, f, indent=2)
framework.publish_progress(0, 0, "0 frames")
return result
framework.publish_info("OCR_LOADING_MODEL")
reader = easyocr.Reader(["en"], gpu=False, verbose=False)
framework.publish_info("OCR_MODEL_LOADED")
import cv2
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
print(f"Error: Cannot open video: {video_path}")
return {"metadata": {"status": "error"}, "frames": {}}
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
total_duration = total_frames / fps if fps > 0 else 0
cap.release()
framework.publish_info(f"fps={fps}, frames={total_frames}")
existing_data, last_checkpoint = framework.load_existing_data()
resume_mode = existing_data is not None and last_checkpoint > 0 and not force_restart
if resume_mode:
print(f"\nFound existing data: {output_path}")
print(f"Last processed frame: {last_checkpoint}")
print(f"Will resume from frame {last_checkpoint + 1}")
if resume_mode and existing_data:
ocr_data = existing_data
frame_count = last_checkpoint
processed_frames = set(int(k) for k in existing_data.get("frames", {}).keys())
cap = cv2.VideoCapture(video_path)
cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count)
else:
ocr_data = {
"metadata": framework.init_metadata(
video_path=video_path,
fps=fps,
width=width,
height=height,
total_frames=total_frames,
total_duration=total_duration,
extra={"sample_interval": sample_interval},
),
"frames": {},
}
frame_count = 0
processed_frames = set()
cap = cv2.VideoCapture(video_path)
framework.set_data(ocr_data)
start_time = time.time()
framework.last_save_time = start_time
print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
print()
while True:
ret, frame = cap.read()
if not ret:
break
frame_count += 1
current_time = (frame_count - 1) / fps if fps > 0 else 0
if frame_count in processed_frames:
continue
if frame_count % sample_interval != 0:
continue
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
try:
detections = reader.readtext(
frame_rgb, text_threshold=0.5, low_text=0.3, link_threshold=0.3
)
except Exception as e:
framework.publish_error(f"Frame {frame_count}: {e}")
detections = []
texts = []
for detection in detections:
det: tuple = tuple(detection)
bbox = list(det[0])
text: str = str(det[1])
confidence: float = float(det[2])
x = int(min(float(p[0]) for p in bbox))
y = int(min(float(p[1]) for p in bbox))
w = int(max(float(p[0]) for p in bbox) - x)
h = int(max(float(p[1]) for p in bbox) - y)
if text.strip():
texts.append(
{
"text": text,
"x": x,
"y": y,
"width": w,
"height": h,
"confidence": confidence,
}
)
if texts:
ocr_data["frames"][str(frame_count)] = {
"frame_number": frame_count,
"time_seconds": round(current_time, 3),
"time_formatted": format_time(current_time),
"texts": texts,
}
processed_frames.add(frame_count)
if frame_count % 500 == 0:
elapsed = time.time() - start_time
print_progress(frame_count, total_frames, elapsed, f"{len(texts)} texts")
framework.publish_progress(frame_count, total_frames, f"frame {frame_count}")
if framework.should_auto_save(frame_count):
framework.save_progress(frame_count, silent=True)
cap.release()
total_processed = len(processed_frames)
framework.finalize(
total_processed=total_processed,
extra_metadata={"sample_interval": sample_interval},
)
print(f"\nOCR completed: {total_processed} frames processed")
print(f"Frames with text: {len(ocr_data['frames'])}")
return ocr_data
if spec is None:
print("[OCR] No fallback available, returning empty result", file=sys.stderr)
return {"frame_count": 0, "fps": 0, "frames": []}
paddle = importlib.util.module_from_spec(spec)
spec.loader.exec_module(paddle)
return paddle.process_ocr(video_path, output_path, uuid, sample_interval=sample_interval)
if __name__ == "__main__":
    # Command-line entry point: parse arguments, run OCR, write the result
    # as JSON to output_path and print a one-line summary.
    parser = argparse.ArgumentParser(description="OCR Processor (Swift Vision)")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", default="", help="UUID for Redis progress")
    # "-s" is kept as a short alias so invocations written for the earlier
    # CLI keep working.
    parser.add_argument(
        "--sample-interval",
        "-s",
        type=int,
        default=30,
        help="Frame sample interval",
    )
    parser.add_argument(
        "--recognition-level",
        choices=["fast", "accurate"],
        default="accurate",
        help="Recognition level passed to swift_ocr",
    )
    args = parser.parse_args()

    result = process_ocr(
        args.video_path,
        args.output_path,
        args.uuid,
        sample_interval=args.sample_interval,
        recognition_level=args.recognition_level,
    )

    # Always (re)write the result so output_path holds what was returned,
    # including the empty result produced when no backend is available.
    with open(args.output_path, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2)
    print(f"OCR: {len(result.get('frames', []))} frames with text")