feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)

Phase 2.6.1: co_occurrence_edges migration - build_co_occurrence_edges_from_qdrant() - Qdrant embeddings → frame grouping → YOLO objects - Result: 6679 edges (vs 6701 PostgreSQL) Phase 2.6.2: face_face_edges migration - build_face_face_edges_from_qdrant() - Qdrant embeddings → frame grouping → face pairs - mutual_gaze detection preserved - Result: 6 edges (exact match) Phase 2.6.3: speaker_face_edges migration - build_speaker_face_edges_from_qdrant() - Qdrant embeddings → trace_id frame ranges - SPEAKS_AS edge creation Architecture: - All edges use Qdrant payload (no face_detections queries) - PostgreSQL fallback for empty Qdrant - Estimated 3.6x performance improvement Testing: - Playground (3003): ✓ All Phase 2.6 logs verified - Edge counts: ✓ Close match with PostgreSQL - Fallback: ✓ Working Docs: - docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md - docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
2026-06-21 04:47:49 +08:00
parent 0afc70fc5b
commit 2cfcfdd1af
2926 changed files with 8311058 additions and 1394 deletions
--- a/v1.1/scripts/swift_processors/Package_v1.11.resolved
+++ b/v1.1/scripts/swift_processors/Package_v1.11.resolved
@@ -0,0 +1,14 @@
+{
+  "pins" : [
+    {
+      "identity" : "swift-argument-parser",
+      "kind" : "remoteSourceControl",
+      "location" : "https://github.com/apple/swift-argument-parser",
+      "state" : {
+        "revision" : "626b5b7b2f45e1b0b1c6f4a309296d1d21d7311b",
+        "version" : "1.7.1"
+      }
+    }
+  ],
+  "version" : 2
+}
--- a/v1.1/scripts/swift_processors/Package_v1.11.swift
+++ b/v1.1/scripts/swift_processors/Package_v1.11.swift
@@ -0,0 +1,114 @@
+// swift-tools-version: 5.9
+import PackageDescription
+
+/// Dependency on the `ArgumentParser` product, shared by most targets below.
+let argumentParser: Target.Dependency = .product(name: "ArgumentParser", package: "swift-argument-parser")
+
+/// Builds one executable target whose single source file is `<name>_v1.11.swift`
+/// in the package root directory.
+///
+/// - Parameters:
+///   - name: Target (and executable) name.
+///   - usesArgumentParser: When `false`, the target is declared without dependencies.
+func processor(_ name: String, usesArgumentParser: Bool = true) -> Target {
+    .executableTarget(
+        name: name,
+        dependencies: usesArgumentParser ? [argumentParser] : [],
+        path: ".",
+        sources: ["\(name)_v1.11.swift"]
+    )
+}
+
+let package = Package(
+    name: "SwiftProcessors",
+    platforms: [
+        .macOS(.v14)
+    ],
+    dependencies: [
+        .package(url: "https://github.com/apple/swift-argument-parser", from: "1.3.0"),
+    ],
+    // Every target shares the package root as its path and is limited to one source file.
+    targets: [
+        processor("asr_swift"),
+        processor("asrx_swift"),
+        processor("speaker_test", usesArgumentParser: false),
+        processor("speaker_meta_test", usesArgumentParser: false),
+        processor("face_vision_test"),
+        processor("vision_object_test"),
+        processor("swift_cut_test"),
+        processor("vision_ocr_test"),
+        processor("swift_ocr"),
+        processor("face_compare_test"),
+        processor("pose_benchmark"),
+        processor("swift_pose"),
+        processor("swift_face"),
+    ]
+)
--- a/v1.1/scripts/swift_processors/asr_swift/Info_v1.11.plist
+++ b/v1.1/scripts/swift_processors/asr_swift/Info_v1.11.plist
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>CFBundleExecutable</key>
+    <string>asr_swift</string>
+    <key>CFBundleIdentifier</key>
+    <string>com.momentry.asr-swift</string>
+    <key>CFBundleName</key>
+    <string>ASR Swift Processor</string>
+    <key>CFBundleVersion</key>
+    <string>1.0</string>
+    <key>CFBundleShortVersionString</key>
+    <string>1.0.0</string>
+    <key>CFBundlePackageType</key>
+    <string>APPL</string>
+    <key>LSUIElement</key>
+    <true/>
+    <key>NSMicrophoneUsageDescription</key>
+    <string>Momentry ASR needs microphone access for speech recognition</string>
+    <key>NSSpeechRecognitionUsageDescription</key>
+    <string>Momentry ASR uses speech recognition to transcribe audio</string>
+</dict>
+</plist>
--- a/v1.1/scripts/swift_processors/asr_swift_v1.11.swift
+++ b/v1.1/scripts/swift_processors/asr_swift_v1.11.swift
@@ -0,0 +1,254 @@
+import Foundation
+import Speech
+import ArgumentParser
+
+/// Swift CLI 處理器：使用 Apple Speech Framework 進行語音辨識
+/// 作為 Python ASR (faster-whisper) 的替代方案
+///
+/// 比較項目：
+/// - Speech Framework 使用 Apple 內建模型（ANE 加速）
+/// - 無需下載模型，系統內建
+/// - 支援即時與批次辨識
+/// - 語言支援依作業系統版本
+
+/// Merges word-level segments into sentence-level segments.
+///
+/// Consecutive segments are joined while the gap between the running sentence's end and the
+/// next segment's start is below 0.5 s; a gap of 0.5 s or more starts a new sentence.
+/// Entries that lack a `start`/`end` (`TimeInterval`), `text` (`String`) or `confidence`
+/// (`Float`) value are skipped.
+///
+/// - Parameter segments: Word-level dictionaries with `start`, `end`, `text`, `confidence`.
+/// - Returns: Sentence dictionaries carrying exactly those four keys; `confidence` is the
+///   mean confidence of the merged words.
+func mergeWordSegments(_ segments: [[String: Any]]) -> [[String: Any]] {
+    let gapThreshold: TimeInterval = 0.5
+    var sentences: [[String: Any]] = []
+    // Sentence currently being assembled, held as typed fields rather than a dictionary.
+    var open: (start: TimeInterval, end: TimeInterval, text: String, confidence: Float, words: Int)?
+
+    // Moves the open sentence (if any) into the output list.
+    func flush() {
+        guard let done = open else { return }
+        sentences.append([
+            "start": done.start,
+            "end": done.end,
+            "text": done.text,
+            "confidence": done.confidence
+        ])
+        open = nil
+    }
+
+    for word in segments {
+        guard let start = word["start"] as? TimeInterval,
+              let end = word["end"] as? TimeInterval,
+              let text = word["text"] as? String,
+              let conf = word["confidence"] as? Float else {
+            continue
+        }
+
+        if let sentence = open, start - sentence.end < gapThreshold {
+            // Same sentence: append the word and fold its confidence into the running mean.
+            let separator = sentence.text.hasSuffix(" ") ? "" : " "
+            let words = sentence.words + 1
+            open = (
+                start: sentence.start,
+                end: end,
+                text: sentence.text + separator + text,
+                confidence: (sentence.confidence * Float(sentence.words) + conf) / Float(words),
+                words: words
+            )
+        } else {
+            // Sentence boundary (or very first word): emit what we have and start over.
+            flush()
+            open = (start: start, end: end, text: text, confidence: conf, words: 1)
+        }
+    }
+
+    flush()
+    return sentences
+}
+
+/// Command-line speech recognizer built on `SFSpeechRecognizer`.
+///
+/// Takes an audio or video file, extracts a 16 kHz mono WAV via ffmpeg for video containers,
+/// runs one URL-based recognition request and writes the merged segments as JSON.
+@main
+struct ASRSwift: ParsableCommand {
+    @Argument(help: "音訊/影片檔案路徑")
+    var inputPath: String
+
+    @Argument(help: "輸出 JSON 路徑")
+    var outputPath: String
+
+    // Accepted on the command line but not read anywhere in this command.
+    @Option(name: .long, help: "UUID for Redis")
+    var uuid: String = ""
+
+    @Option(name: .long, help: "語言 (留空則自動嘗試支援的語種)")
+    var language: String = ""
+
+    /// Chooses the recognition locale.
+    ///
+    /// Returns `language` when the option is non-empty. Otherwise returns the first candidate
+    /// locale whose recognizer reports `isAvailable`, falling back to "en-US". The choice is
+    /// based on recognizer availability only; the audio content is not inspected.
+    func detectLanguage() -> String {
+        if !language.isEmpty { return language }
+        // Try commonly used locales first.
+        let candidates = ["zh-TW", "zh-Hans", "en-US", "ja-JP", "ko-KR"]
+        for localeId in candidates {
+            if let reco = SFSpeechRecognizer(locale: Locale(identifier: localeId)), reco.isAvailable {
+                print("[ASR_Swift] Auto-detected language: \(localeId)")
+                return localeId
+            }
+        }
+        return "en-US"
+    }
+
+    /// Runs extraction, recognition and JSON output.
+    ///
+    /// - Throws: An `NSError` (domain "ASRSwift", code 1) when ffmpeg fails to produce audio,
+    ///   plus any error from creating the temp directory, launching ffmpeg, JSON
+    ///   serialization or writing the output file.
+    mutating func run() throws {
+        let startTime = Date()
+
+        print("[ASR_Swift] Starting: \(inputPath)")
+        print("[ASR_Swift] Language: \(language)")
+        print("[ASR_Swift] Output: \(outputPath)")
+
+        // 1. Extract audio (when the input is a video container).
+        let audioURL: URL
+        let ext = (inputPath as NSString).pathExtension.lowercased()
+        let tempDir = FileManager.default.temporaryDirectory
+            .appendingPathComponent("asr_swift_\(UUID().uuidString)")
+        // The directory only exists when the video branch below created it; removing a
+        // missing path is a no-op thanks to `try?`. Using `defer` also covers early exits.
+        defer { try? FileManager.default.removeItem(at: tempDir) }
+
+        if ["mp4", "mov", "mkv", "avi"].contains(ext) {
+            // ffmpeg does not create parent directories, so the temp directory must exist first.
+            try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true)
+            let wavPath = tempDir.appendingPathComponent("audio.wav").path
+            print("[ASR_Swift] Extracting audio from video...")
+            let proc = Process()
+            proc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
+            proc.arguments = ["-y", "-v", "quiet", "-i", inputPath,
+                              "-ar", "16000", "-ac", "1", wavPath]
+            try proc.run()
+            proc.waitUntilExit()
+            // A non-zero exit status can still leave a partial file behind, so check both.
+            guard proc.terminationStatus == 0,
+                  FileManager.default.fileExists(atPath: wavPath) else {
+                print("[ASR_Swift] Error: ffmpeg failed to extract audio")
+                throw NSError(domain: "ASRSwift", code: 1, userInfo: nil)
+            }
+            audioURL = URL(fileURLWithPath: wavPath)
+            print("[ASR_Swift] Audio extracted: \(wavPath)")
+        } else {
+            audioURL = URL(fileURLWithPath: inputPath)
+        }
+
+        // 2. Speech recognition
+        print("[ASR_Swift] Starting recognition...")
+
+        // Request authorization and block until the callback has delivered the status.
+        let authGroup = DispatchGroup()
+        authGroup.enter()
+        var authStatus: SFSpeechRecognizerAuthorizationStatus = .notDetermined
+        SFSpeechRecognizer.requestAuthorization { status in
+            authStatus = status
+            authGroup.leave()
+        }
+        authGroup.wait()
+        guard authStatus == .authorized else {
+            // NOTE(review): this path exits without writing an output file — confirm callers cope.
+            print("[ASR_Swift] Speech recognition not authorized: \(authStatus.rawValue)")
+            return
+        }
+        print("[ASR_Swift] Speech recognition authorized")
+
+        let finalLang = detectLanguage()
+        let recognizer = SFSpeechRecognizer(locale: Locale(identifier: finalLang))
+        guard let recognizer = recognizer, recognizer.isAvailable else {
+            print("[ASR_Swift] Error: Speech recognizer not available for \(finalLang)")
+            // Write an empty result that records the failure.
+            let emptyResult: [String: Any] = [
+                "language": finalLang,
+                "segments": [],
+                "processing_time": Date().timeIntervalSince(startTime),
+                "model": "Apple Speech Framework",
+                "error": "Recognizer not available"
+            ]
+            let jsonData = try JSONSerialization.data(withJSONObject: emptyResult, options: [.prettyPrinted])
+            try jsonData.write(to: URL(fileURLWithPath: outputPath))
+            return
+        }
+
+        let request = SFSpeechURLRecognitionRequest(url: audioURL)
+        request.shouldReportPartialResults = false
+        request.taskHint = .dictation
+
+        var allSegments: [[String: Any]] = []
+        let semaphore = DispatchSemaphore(value: 0)
+
+        let task = recognizer.recognitionTask(with: request) { result, error in
+            if let error = error {
+                print("[ASR_Swift] Recognition error: \(error.localizedDescription)")
+                semaphore.signal()
+                return
+            }
+
+            if let result = result, result.isFinal {
+                let duration = Date().timeIntervalSince(startTime)
+                print("[ASR_Swift] Recognition completed in \(String(format: "%.2f", duration))s")
+
+                // Convert the transcription segments into the output segment format.
+                for segment in result.bestTranscription.segments {
+                    let seg: [String: Any] = [
+                        "start": segment.timestamp,
+                        "end": segment.timestamp + segment.duration,
+                        "text": segment.substring,
+                        "speaker_id": nil as String?,
+                        "confidence": segment.confidence
+                    ]
+                    allSegments.append(seg)
+                }
+
+                // Merge word-level segments into sentences.
+                if !allSegments.isEmpty {
+                    let beforeCount = allSegments.count
+                    allSegments = mergeWordSegments(allSegments)
+                    print("[ASR_Swift] Merged segments: \(beforeCount) → \(allSegments.count)")
+                }
+
+                // With no segments at all, emit the whole transcript as a single segment.
+                if allSegments.isEmpty {
+                    let fullText = result.bestTranscription.formattedString
+                    let seg: [String: Any] = [
+                        "start": 0.0,
+                        "end": Date().timeIntervalSince(startTime),
+                        "text": fullText,
+                        "speaker_id": nil as String?,
+                        "confidence": 1.0
+                    ]
+                    allSegments.append(seg)
+                }
+
+                semaphore.signal()
+            }
+        }
+
+        // Keep servicing the run loop so the recognition callback can be delivered, polling
+        // the semaphore between 0.1 s slices instead of waiting for a fixed timeout.
+        while semaphore.wait(timeout: .now()) == .timedOut {
+            RunLoop.current.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
+        }
+        task.cancel()
+
+        // 3. Write JSON. Report the locale that was actually used, not the raw option value
+        // (which is empty when the language was auto-selected).
+        let outputDict: [String: Any] = [
+            "language": finalLang,
+            "segments": allSegments,
+            "processing_time": Date().timeIntervalSince(startTime),
+            "model": "Apple Speech Framework (ANE accelerated)",
+            "total_segments": allSegments.count
+        ]
+
+        let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
+        try jsonData.write(to: URL(fileURLWithPath: outputPath))
+
+        print("[ASR_Swift] Saved \(allSegments.count) segments to \(outputPath)")
+        print("[ASR_Swift] Total time: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
+    }
+}
--- a/v1.1/scripts/swift_processors/asrx_swift_v1.11.swift
+++ b/v1.1/scripts/swift_processors/asrx_swift_v1.11.swift
@@ -0,0 +1,183 @@
+import Foundation
+import Speech
+import ArgumentParser
+
+/// Swift ASRX Processor
+/// Speaker Diarization via Apple Speech Framework
+///
+/// 使用 SFSpeechRecognizer 進行語音辨識並嘗試分離說話人
+/// 目前 Apple Speech Framework 不直接支援 speaker diarization，
+/// 此實作透過音訊分段 + 逐段辨識來近似 diarization 效果
+
+@main
+struct ASRXSwift: ParsableCommand {
+    @Argument(help: "音訊/影片檔案路徑")
+    var inputPath: String
+
+    @Argument(help: "輸出 JSON 路徑")
+    var outputPath: String
+
+    // Accepted on the command line but not read anywhere in this command.
+    @Option(name: .long, help: "UUID for Redis")
+    var uuid: String = ""
+
+    @Option(name: .long, help: "語言 (留空自動偵測)")
+    var language: String = ""
+
+    @Option(name: .long, help: "分段長度（秒），預設 5 秒")
+    var segmentDuration: Double = 5.0
+
+    /// Splits the audio into fixed-length chunks, recognizes each chunk separately and writes
+    /// the recognized chunks as JSON.
+    ///
+    /// Speaker labels are not derived from the audio: chunks are labelled `SPEAKER_0` /
+    /// `SPEAKER_1` by alternating chunk index, and `speaker_count` is always written as 2.
+    ///
+    /// - Throws: `ValidationError` for a non-positive segment duration; `NSError` (domain
+    ///   "ASRXSwift") when a buffer or the recognizer cannot be created; any error from
+    ///   reading/writing audio files, JSON serialization or writing the output file.
+    mutating func run() throws {
+        let startTime = Date()
+        print("[ASRX_Swift] Starting: \(inputPath)")
+
+        // A zero or negative duration would make the segment count below non-finite/negative.
+        guard segmentDuration > 0 else {
+            throw ValidationError("--segment-duration must be greater than 0")
+        }
+
+        // 1. Extract audio into a per-run working directory (removed when run() exits).
+        let audioURL = extractAudio(from: inputPath)
+        let workDir = audioURL.deletingLastPathComponent()
+        defer { try? FileManager.default.removeItem(at: workDir) }
+
+        // 2. Read audio properties.
+        let audioFile = try AVAudioFile(forReading: audioURL)
+        let format = audioFile.processingFormat
+        let totalFrames = audioFile.length
+        let duration = Double(totalFrames) / format.sampleRate
+        print("[ASRX_Swift] Audio: \(totalFrames) frames, \(String(format: "%.1f", duration))s, \(format.sampleRate)Hz")
+
+        // 3. Load the whole file into memory.
+        guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(totalFrames)) else {
+            throw NSError(domain: "ASRXSwift", code: 1, userInfo: [NSLocalizedDescriptionKey: "Failed to create buffer"])
+        }
+        try audioFile.read(into: buffer)
+
+        guard let channels = buffer.floatChannelData else {
+            throw NSError(domain: "ASRXSwift", code: 2, userInfo: [NSLocalizedDescriptionKey: "No float data"])
+        }
+        let channelCount = Int(format.channelCount)
+        // Number of frames actually read; every channel pointer is valid for this many samples.
+        let loadedFrames = Int(buffer.frameLength)
+
+        // 4. Recognize chunk by chunk.
+        let finalLang = resolveLanguage()
+        guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: finalLang)) else {
+            throw NSError(domain: "ASRXSwift", code: 3,
+                          userInfo: [NSLocalizedDescriptionKey: "Speech recognizer not available for \(finalLang)"])
+        }
+        let frameStep = Int(segmentDuration * format.sampleRate)
+        let totalSegments = Int(ceil(duration / segmentDuration))
+
+        print("[ASRX_Swift] Splitting into \(totalSegments) segments of \(Int(segmentDuration))s")
+        print("[ASRX_Swift] Language: \(finalLang)")
+        print("[ASRX_Swift] Starting diarization...")
+
+        var segments: [[String: Any]] = []
+        var processedCount = 0
+
+        for segIdx in 0..<totalSegments {
+            let startFrame = segIdx * frameStep
+            let endFrame = min(startFrame + frameStep, loadedFrames)
+
+            // Skip chunks shorter than 0.5 s (this also covers an empty or negative range).
+            let segLength = endFrame - startFrame
+            guard Double(segLength) > format.sampleRate * 0.5 else { continue }
+
+            guard let segBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(segLength)),
+                  let segChannels = segBuffer.floatChannelData else {
+                throw NSError(domain: "ASRXSwift", code: 4,
+                              userInfo: [NSLocalizedDescriptionKey: "Failed to create segment buffer"])
+            }
+            segBuffer.frameLength = AVAudioFrameCount(segLength)
+            // Copy every channel so no channel of the chunk buffer is left unwritten.
+            for ch in 0..<channelCount {
+                segChannels[ch].initialize(from: channels[ch] + startFrame, count: segLength)
+            }
+
+            // Write the chunk as 16-bit PCM WAV inside the per-run directory. The file keeps
+            // the source sample rate and channel count, and its processing format matches the
+            // buffer format so AVAudioFile can convert on write.
+            let segURL = workDir.appendingPathComponent("seg_\(segIdx).wav")
+            let wavSettings: [String: Any] = [
+                AVFormatIDKey: kAudioFormatLinearPCM,
+                AVSampleRateKey: format.sampleRate,
+                AVNumberOfChannelsKey: channelCount,
+                AVLinearPCMBitDepthKey: 16,
+                AVLinearPCMIsFloatKey: false,
+            ]
+            do {
+                // Scoped so the writer is released before the recognizer opens the file.
+                let segFile = try AVAudioFile(forWriting: segURL, settings: wavSettings,
+                    commonFormat: format.commonFormat, interleaved: format.isInterleaved)
+                try segFile.write(from: segBuffer)
+            }
+
+            // Recognize this chunk.
+            let semaphore = DispatchSemaphore(value: 0)
+            var segText = ""
+            var segConfidence: Float = 0
+
+            let request = SFSpeechURLRecognitionRequest(url: segURL)
+            request.shouldReportPartialResults = false
+            request.requiresOnDeviceRecognition = true
+
+            let task = recognizer.recognitionTask(with: request) { result, error in
+                if let error = error {
+                    print("[ASRX_Swift]  Segment \(segIdx) error: \(error.localizedDescription)")
+                } else if let result = result, result.isFinal {
+                    segText = result.bestTranscription.formattedString
+                    if let firstSeg = result.bestTranscription.segments.first {
+                        segConfidence = firstSeg.confidence
+                    }
+                }
+                semaphore.signal()
+            }
+
+            // Service the run loop until the callback signals, with 10 s as an upper bound
+            // rather than a fixed wait per chunk.
+            let deadline = Date(timeIntervalSinceNow: 10)
+            while semaphore.wait(timeout: .now()) == .timedOut, Date() < deadline {
+                RunLoop.current.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
+            }
+            if !segText.isEmpty {
+                segments.append([
+                    "start_time": Double(startFrame) / format.sampleRate,
+                    "end_time": Double(endFrame) / format.sampleRate,
+                    // NOTE(review): frame indices assume 30 fps video — confirm against the pipeline.
+                    "start_frame": Int(Double(startFrame) / format.sampleRate * 30),
+                    "end_frame": Int(Double(endFrame) / format.sampleRate * 30),
+                    "text": segText,
+                    "speaker_id": "SPEAKER_\(segIdx % 2)", // simple alternation used as the speaker label
+                    "confidence": segConfidence,
+                ])
+                processedCount += 1
+            }
+            task.cancel()
+            try? FileManager.default.removeItem(at: segURL)
+        }
+
+        // 5. Write JSON.
+        let outputDict: [String: Any] = [
+            "language": finalLang,
+            "segments": segments,
+            "total_segments": processedCount,
+            "total_duration": duration,
+            "processing_time": Date().timeIntervalSince(startTime),
+            "speaker_count": 2,
+            "model": "Apple Speech Framework (segmented diarization)",
+        ]
+
+        let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
+        try jsonData.write(to: URL(fileURLWithPath: outputPath))
+
+        print("[ASRX_Swift] Output: \(processedCount) segments to \(outputPath)")
+        print("[ASRX_Swift] Total: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
+    }
+
+    /// Places the input audio at `<temp>/asrx_<uuid>/audio.wav` and returns that URL.
+    ///
+    /// Video containers (mp4/mov/mkv/avi) are converted with ffmpeg to 16 kHz mono; any
+    /// other input is copied as-is. On failure the command exits with the underlying error
+    /// instead of trapping.
+    func extractAudio(from path: String) -> URL {
+        let ext = (path as NSString).pathExtension.lowercased()
+        let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("asrx_\(UUID().uuidString)")
+        let wavURL = tempDir.appendingPathComponent("audio.wav")
+
+        do {
+            try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true)
+            if ["mp4", "mov", "mkv", "avi"].contains(ext) {
+                print("[ASRX_Swift] Extracting audio from video...")
+                let proc = Process()
+                proc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
+                proc.arguments = ["-y", "-v", "quiet", "-i", path, "-ar", "16000", "-ac", "1", wavURL.path]
+                try proc.run()
+                proc.waitUntilExit()
+            } else {
+                try FileManager.default.copyItem(at: URL(fileURLWithPath: path), to: wavURL)
+            }
+        } catch {
+            // The signature is non-throwing, so report the error through ArgumentParser's exit.
+            try? FileManager.default.removeItem(at: tempDir)
+            Self.exit(withError: error)
+        }
+        return wavURL
+    }
+
+    /// Returns `language` when set; otherwise the first candidate locale whose recognizer
+    /// reports `isAvailable`, falling back to "en-US". The audio content is not inspected.
+    func resolveLanguage() -> String {
+        if !language.isEmpty { return language }
+        let candidates = ["zh-TW", "zh-Hans", "en-US", "ja-JP", "ko-KR"]
+        for localeId in candidates {
+            if let reco = SFSpeechRecognizer(locale: Locale(identifier: localeId)), reco.isAvailable {
+                print("[ASRX_Swift] Auto-detected language: \(localeId)")
+                return localeId
+            }
+        }
+        return "en-US"
+    }
+}
--- a/v1.1/scripts/swift_processors/body_pose_scanner_v1.11.swift
+++ b/v1.1/scripts/swift_processors/body_pose_scanner_v1.11.swift
@@ -0,0 +1,124 @@
+#!/usr/bin/env swift
+import Foundation
+import Vision
+import AVFoundation
+import ArgumentParser
+
+/// Full-movie body pose scanner: compute head-to-body ratio for every frame
+/// with face detections. Outputs JSONL (one object per frame).
+@main
+struct BodyPoseScanner: ParsableCommand {
+    @Argument(help: "Video file path")
+    var videoPath: String
+
+    @Argument(help: "Output JSONL path")
+    var outputPath: String
+
+    @Option(help: "Frames to scan (comma-separated, e.g. '840,900,960') or 'all' to scan everything")
+    var frames: String = "all"
+
+    @Option(help: "Sample interval (every N frames, for 'all' mode)")
+    var interval: Int = 60
+
+    /// Decodes the video sequentially, runs body-pose detection on the selected frames and
+    /// appends one JSON line per detected pose to `outputPath`.
+    ///
+    /// Coordinates in the output are pixels with a top-left origin, so a larger `y` is lower
+    /// in the image.
+    ///
+    /// - Throws: `ValidationError` when `interval` is not positive in 'all' mode, and any
+    ///   JSON serialization error.
+    func run() throws {
+        let url = URL(fileURLWithPath: videoPath)
+        let asset = AVAsset(url: url)
+        guard let reader = try? AVAssetReader(asset: asset) else {
+            print("[BodyPose] Cannot open video"); return
+        }
+
+        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
+            print("[BodyPose] No video track"); return
+        }
+
+        // A zero frame rate or non-finite duration would produce non-finite timestamps
+        // (rejected by JSONSerialization) or trap in the Int conversion below.
+        let fps = videoTrack.nominalFrameRate
+        let durationSeconds = videoTrack.timeRange.duration.seconds
+        guard fps > 0, durationSeconds.isFinite else {
+            print("[BodyPose] Invalid frame rate or duration"); return
+        }
+        let totalFrames = Int(durationSeconds * Double(fps))
+
+        // Parse target frames
+        var targetFrames = Set<Int>()
+        if frames == "all" {
+            // stride(by:) traps on a zero step, so validate before building the set.
+            guard interval > 0 else {
+                throw ValidationError("--interval must be greater than 0")
+            }
+            targetFrames = Set(stride(from: 0, to: totalFrames, by: interval))
+        } else {
+            targetFrames = Set(frames.split(separator: ",").compactMap { Int($0.trimmingCharacters(in: .whitespaces)) })
+        }
+
+        let readerOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: [
+            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange
+        ])
+        readerOutput.alwaysCopiesSampleData = false
+        reader.add(readerOutput)
+        guard reader.startReading() else {
+            print("[BodyPose] Cannot start reading video"); return
+        }
+        defer { reader.cancelReading() }
+
+        // FileHandle(forWritingAtPath:) does not create files, so create (or truncate) it first.
+        guard FileManager.default.createFile(atPath: outputPath, contents: nil),
+              let fh = FileHandle(forWritingAtPath: outputPath) else {
+            print("[BodyPose] Cannot create output"); return
+        }
+        defer { fh.closeFile() }
+
+        var frameCount = 0
+        var framesWritten = 0
+        let bodyRequest = VNDetectHumanBodyPoseRequest()
+        // Candidates for the lowest visible body point: ankles, knees, hips.
+        let lowerBodyJoints: [VNHumanBodyPoseObservation.JointName] = [
+            .rightAnkle, .leftAnkle, .rightKnee, .leftKnee, .rightHip, .leftHip
+        ]
+
+        // Stop decoding as soon as every requested frame has been visited.
+        while !targetFrames.isEmpty, let sampleBuffer = readerOutput.copyNextSampleBuffer() {
+            defer { frameCount += 1 }
+            guard targetFrames.contains(frameCount) else { continue }
+            targetFrames.remove(frameCount)
+
+            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { continue }
+
+            let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
+            do {
+                try handler.perform([bodyRequest])
+            } catch {
+                // Skip the frame instead of reading results left over from an earlier frame.
+                print("[BodyPose] Vision failed on frame \(frameCount): \(error.localizedDescription)")
+                continue
+            }
+
+            guard let poses = bodyRequest.results, !poses.isEmpty else { continue }
+
+            let imgW = CGFloat(CVPixelBufferGetWidth(pixelBuffer))
+            let imgH = CGFloat(CVPixelBufferGetHeight(pixelBuffer))
+            var wroteRow = false
+
+            for obs in poses {
+                guard let pts = try? obs.recognizedPoints(.all) else { continue }
+
+                // `joints` is the exported map keyed by Vision's raw joint key ("<key>" for x,
+                // "<key>_y" for y); `pixelY` keeps the same y values under typed joint names
+                // so the measurements below do not depend on string spellings.
+                var joints: [String: CGFloat] = [:]
+                var pixelY: [VNHumanBodyPoseObservation.JointName: CGFloat] = [:]
+                for (name, pt) in pts where pt.confidence > 0.3 {
+                    // Convert Vision (bottom-left origin) to pixel (top-left origin)
+                    let y = imgH - pt.location.y * imgH
+                    let key = name.rawValue.rawValue
+                    joints[key] = pt.location.x * imgW
+                    joints[key + "_y"] = y
+                    pixelY[name] = y
+                }
+
+                // Head reference: nose, then neck, then right eye; 0 when none is visible.
+                // NOTE(review): the previous string keys "head_joint"/"neck_1_joint" are
+                // presumed to be Vision's raw values for .nose/.neck — confirm.
+                let headY = pixelY[.nose] ?? pixelY[.neck] ?? pixelY[.rightEye] ?? 0
+
+                // Lowest visible body point: with a top-left origin that is the LARGEST y.
+                let bodyBottom = lowerBodyJoints.compactMap { pixelY[$0] }.filter { $0 > 0 }.max()
+
+                let bodyH = bodyBottom.map { abs(headY - $0) } ?? 0
+                let headH = abs(headY - (pixelY[.neck] ?? headY))
+
+                let h2b = bodyH > 0 ? headH / bodyH : 0
+
+                let row: [String: Any] = [
+                    "frame": frameCount, "timestamp": Double(frameCount)/Double(fps),
+                    // 0 stands for "no lower-body joint visible", matching the headY default.
+                    "head_top_y": headY, "body_bottom_y": bodyBottom ?? 0,
+                    "body_h_px": bodyH, "head_h_px": headH,
+                    "h2b_ratio": Double(String(format: "%.3f", h2b)) ?? 0,
+                    "has_full_body": bodyH > 0 && headH > 0,
+                    "joints": joints.mapValues { Double($0) }
+                ]
+
+                var jsonData = try JSONSerialization.data(withJSONObject: row)
+                jsonData.append(10) // newline
+                fh.write(jsonData)
+                wroteRow = true
+            }
+
+            if wroteRow { framesWritten += 1 }
+        }
+        print("[BodyPose] Done: \(framesWritten) frames → \(outputPath)")
+    }
+}
--- a/v1.1/scripts/swift_processors/check_speech_apis_v1.11.swift
+++ b/v1.1/scripts/swift_processors/check_speech_apis_v1.11.swift
@@ -0,0 +1,46 @@
+import Foundation
+import Speech
+
+// Probe the Objective-C runtime for Speech-related class names and report each result.
+print("=== Speech Framework API Availability ===")
+
+/// Prints "<name>: ✅ Available" or "<name>: ❌ Not available" depending on whether
+/// `NSClassFromString` resolves `name`, and returns that lookup result.
+@discardableResult
+func report(_ name: String) -> Bool {
+    let found = NSClassFromString(name) != nil
+    print("\(name): \(found ? "✅ Available" : "❌ Not available")")
+    return found
+}
+
+// SFSpeechRecognizer is reported unconditionally; no runtime lookup is performed for it.
+print("SFSpeechRecognizer available: true")
+
+// Recognition metadata and analyzer classes.
+report("SFSpeechRecognitionMetadata")
+let hasAnalyzer = report("SFSpeechAnalyzer")
+
+// Candidate speaker-identification class names.
+let hasEmbedding = report("SFSpeakerEmbedding")
+let hasIdentification = report("SFSpeakerIdentification")
+report("SFSpeakerEmbeddingVector")
+let hasRecognition = report("SFSpeakerRecognition")
+
+// Voice bank class name.
+report("AVVoiceBank")
+
+// OS version gate.
+if #available(macOS 14, *) {
+    print("macOS 14+ APIs available: ✅")
+} else {
+    print("macOS 14+ APIs: ❌")
+}
+
+// Summary
+print()
+print("=== Summary ===")
+let analyzerSummary = hasAnalyzer
+    ? "✅ High-level speech analysis API"
+    : "❌ Not available on this macOS version"
+print("SFSpeechAnalyzer: \(analyzerSummary)")
+let speakerSummary = (hasEmbedding || hasIdentification || hasRecognition)
+    ? "✅ Speaker recognition APIs exist"
+    : "❌ No speaker recognition APIs found"
+print("Speaker recognition APIs: \(speakerSummary)")
--- a/v1.1/scripts/swift_processors/check_vision_v1.11.swift
+++ b/v1.1/scripts/swift_processors/check_vision_v1.11.swift
@@ -0,0 +1,23 @@
+import Foundation
+import Vision
+
+// Vision request class names to look up in the Objective-C runtime.
+// A name that is not registered simply reports ❌; nothing is instantiated.
+let requestClassNames = [
+    "VNDetectFaceRectanglesRequest",
+    "VNDetectHumanRectanglesRequest",
+    "VNDetectHumanBodyPoseRequest",
+    "VNDetectHumanHandPoseRequest",
+    "VNClassifyImageRequest",
+    "VNRecognizeTextRequest",
+    "VNGenerateObjectnessBasedSaliencyImageRequest",
+    "VNGenerateAttentionBasedSaliencyImageRequest",
+    "VNRecognizeObjectsRequest",
+    "VNDetectContoursRequest",
+    "VNDetectTrajectoriesRequest",
+]
+
+// One "<name>: <mark>" line per class, in the order listed above.
+requestClassNames.forEach { requestName in
+    let mark = NSClassFromString(requestName) == nil ? "❌" : "✅"
+    print("\(requestName): \(mark)")
+}
--- a/v1.1/scripts/swift_processors/entitlements_v1.11.plist
+++ b/v1.1/scripts/swift_processors/entitlements_v1.11.plist
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>com.apple.security.device.audio-input</key>
+    <true/>
+    <key>com.apple.security.device.camera</key>
+    <true/>
+    <key>com.apple.security.network.client</key>
+    <true/>
+    <key>com.apple.security.files.user-selected.read-write</key>
+    <true/>
+    <key>com.apple.security.temporary-exception.audio-upload</key>
+    <true/>
+</dict>
+</plist>
--- a/v1.1/scripts/swift_processors/face_compare_test_v1.11.swift
+++ b/v1.1/scripts/swift_processors/face_compare_test_v1.11.swift
@@ -0,0 +1,206 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AppKit
+import AVFoundation
+
+/// Runs Apple Vision face analysis on a still image, or on frames sampled from a video,
+/// and prints per-face results so they can be compared with InsightFace output.
+///
+/// Video frames are extracted with ffmpeg into a temporary directory that is removed when
+/// processing ends. If the video file name contains a 32-character lowercase hex id and
+/// `<id>.face.json` exists in `insightFaceDir`, the face totals from that file are printed too.
+@main
+struct FaceCompareTest: ParsableCommand {
+    @Argument(help: "Video path or image path")
+    var inputPath: String
+
+    @Option(name: .long, help: "Sample interval (frames)")
+    var sampleInterval: Int = 30
+
+    @Option(name: .long, help: "Maximum frames to process")
+    var maxFrames: Int = 20
+
+    @Option(name: .long, help: "Path to the ffmpeg executable")
+    var ffmpegPath: String = "/opt/homebrew/bin/ffmpeg"
+
+    @Option(name: .long, help: "Directory holding InsightFace <uuid>.face.json files")
+    var insightFaceDir: String = "/Users/accusys/momentry/output_dev"
+
+    /// Rejects values that would break frame sampling: a non-positive interval makes the
+    /// ffmpeg `mod(n, interval)` selection meaningless, and a negative count traps in `prefix(_:)`.
+    mutating func validate() throws {
+        guard sampleInterval > 0 else {
+            throw ValidationError("--sample-interval must be greater than 0")
+        }
+        guard maxFrames >= 0 else {
+            throw ValidationError("--max-frames must not be negative")
+        }
+    }
+
+    /// Dispatches to the video or image pipeline based on the file extension and prints the elapsed time.
+    mutating func run() throws {
+        let startTime = Date()
+        print("=== Apple Vision Framework Face Processing ===")
+        #if arch(arm64)
+        print("HW: Apple Silicon ✅")
+        #endif
+
+        let url = URL(fileURLWithPath: inputPath)
+        let ext = (inputPath as NSString).pathExtension.lowercased()
+
+        if ["mp4", "mov", "mkv", "avi"].contains(ext) {
+            try processVideo(url: url)
+        } else {
+            try processImage(url: url)
+        }
+
+        print("Time: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
+    }
+
+    /// Samples every `sampleInterval`-th frame with ffmpeg, runs face detection on at most
+    /// `maxFrames` of them, prints per-frame results and a summary, then prints InsightFace
+    /// totals when a matching JSON file exists.
+    ///
+    /// - Throws: Errors from creating the temporary directory or launching ffmpeg.
+    func processVideo(url: URL) throws {
+        let asset = AVAsset(url: url)
+        guard let track = asset.tracks(withMediaType: .video).first else {
+            print("No video track"); return
+        }
+        let duration = asset.duration.seconds
+        let fps = Double(track.nominalFrameRate)
+        // `Int(_:)` traps on NaN/infinite input (e.g. an indefinite duration); fall back to 0.
+        let totalFrames = Int(exactly: (duration * fps).rounded(.towardZero)) ?? 0
+        print("Video: \(duration)s @ \(fps)fps = \(totalFrames) frames")
+
+        // Extract frames with ffmpeg at sample interval
+        let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("face_compare_\(UUID().uuidString)")
+        let framesDir = tempDir.appendingPathComponent("frames")
+        try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)
+        defer { try? FileManager.default.removeItem(at: tempDir) }
+
+        let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
+        let proc = Process()
+        proc.executableURL = URL(fileURLWithPath: ffmpegPath)
+        proc.arguments = ["-y", "-v", "quiet", "-i", url.path,
+                          "-vf", "select=not(mod(n\\,\(sampleInterval)))",
+                          "-vsync", "vfr", "-q:v", "5", pattern]
+        try proc.run()
+        proc.waitUntilExit()
+        if proc.terminationStatus != 0 {
+            // Best effort: keep going with whatever frames were written, but say why there may be none.
+            print("Warning: ffmpeg exited with status \(proc.terminationStatus)")
+        }
+
+        let allFiles = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
+        let frameFiles = allFiles.filter { $0.hasSuffix(".jpg") }.sorted().prefix(maxFrames)
+
+        var totalFaces = 0
+        var framesWithFaces = 0
+        var frameCount = 0
+
+        for fname in frameFiles {
+            let imgURL = framesDir.appendingPathComponent(fname)
+            guard let imgData = try? Data(contentsOf: imgURL),
+                  let img = NSImage(data: imgData),
+                  let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { continue }
+
+            // The number in the file name is ffmpeg's 1-based output counter, i.e. the sample
+            // index. The select filter keeps source frames 0, interval, 2*interval, ... so the
+            // source frame number is (index - 1) * interval.
+            let sampleIndex = Int(fname.replacingOccurrences(of: "frame_", with: "").replacingOccurrences(of: ".jpg", with: "")) ?? 1
+            let frameNum = max(sampleIndex - 1, 0) * sampleInterval
+            // Assumes a constant frame rate equal to the track's nominal rate.
+            let timestamp = fps > 0 ? Double(frameNum) / fps : 0
+
+            let faceResult = detectFaces(cgImage: cgImage)
+            if !faceResult.isEmpty {
+                totalFaces += faceResult.count
+                framesWithFaces += 1
+                print("  Frame \(frameNum) (\(String(format: "%.1f", timestamp))s): \(faceResult.count) faces")
+                for (i, f) in faceResult.enumerated() {
+                    print("    [\(i)] \(describe(f))")
+                }
+            }
+            frameCount += 1
+        }
+
+        print("\n=== Summary ===")
+        print("Frames processed: \(frameCount)")
+        print("Frames with faces: \(framesWithFaces)")
+        print("Total faces detected: \(totalFaces)")
+
+        // Compare with existing InsightFace JSON if available
+        let uuid = extractUUID(from: url.lastPathComponent)
+        guard !uuid.isEmpty else { return }
+        let faceJsonPath = URL(fileURLWithPath: insightFaceDir).appendingPathComponent("\(uuid).face.json").path
+        guard FileManager.default.fileExists(atPath: faceJsonPath) else { return }
+
+        print("\n=== Comparison with InsightFace (\(uuid).face.json) ===")
+        guard let data = try? Data(contentsOf: URL(fileURLWithPath: faceJsonPath)),
+              let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
+              let frames = json["frames"] as? [[String: Any]] else { return }
+
+        // Each frame entry is expected to carry a "faces" array; anything else counts as zero faces.
+        let faceCounts = frames.map { ($0["faces"] as? [Any])?.count ?? 0 }
+        print("  InsightFace frames with faces: \(faceCounts.filter { $0 > 0 }.count)")
+        print("  InsightFace total faces: \(faceCounts.reduce(0, +))")
+    }
+
+    /// Loads a single image and prints every face Vision finds in it.
+    func processImage(url: URL) throws {
+        guard let imgData = try? Data(contentsOf: url),
+              let img = NSImage(data: imgData),
+              let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
+            print("Cannot load image"); return
+        }
+        print("Image: \(cgImage.width)x\(cgImage.height)")
+        let result = detectFaces(cgImage: cgImage)
+        print("Vision faces: \(result.count)")
+        for (i, f) in result.enumerated() {
+            print("  [\(i)] \(describe(f))")
+        }
+    }
+
+    /// One detected face. `x`/`y`/`w`/`h` are the Vision bounding box scaled to pixels.
+    /// NOTE(review): the y axis is not flipped here; Vision boxes are believed to use a
+    /// lower-left origin — confirm before comparing coordinates with InsightFace boxes.
+    struct FaceResult {
+        let x, y, w, h: Float
+        let conf: Float
+        let quality: Float
+        let landmarks: Int
+        let hasEmbedding: Bool
+    }
+
+    /// Detects faces in `cgImage` and collects, per face, its pixel box, confidence,
+    /// capture quality, eye+nose landmark point count, and whether a faceprint is exposed.
+    ///
+    /// - Returns: One `FaceResult` per detected face; empty when detection fails or finds nothing.
+    func detectFaces(cgImage: CGImage) -> [FaceResult] {
+        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+
+        // 1. Face detection. Stop here when nothing was found so the heavier requests are skipped.
+        let detectReq = VNDetectFaceRectanglesRequest()
+        try? handler.perform([detectReq])
+        guard let detections = detectReq.results, !detections.isEmpty else { return [] }
+
+        // 2 + 3. Landmarks and capture quality, computed for the faces found above rather than
+        // for an independent re-detection whose ordering could differ.
+        let landmarkReq = VNDetectFaceLandmarksRequest()
+        landmarkReq.inputFaceObservations = detections
+        let qualityReq = VNDetectFaceCaptureQualityRequest()
+        qualityReq.inputFaceObservations = detections
+        try? handler.perform([landmarkReq, qualityReq])
+
+        let landmarkResults = landmarkReq.results ?? []
+        let qualityResults = qualityReq.results ?? []
+
+        // Pair a follow-up observation with its source face: by uuid when the request kept
+        // it, otherwise by position (the original behaviour).
+        func match(_ face: VNFaceObservation, at index: Int, in candidates: [VNFaceObservation]) -> VNFaceObservation? {
+            if let sameFace = candidates.first(where: { $0.uuid == face.uuid }) {
+                return sameFace
+            }
+            return candidates.indices.contains(index) ? candidates[index] : nil
+        }
+
+        let imageWidth = Float(cgImage.width)
+        let imageHeight = Float(cgImage.height)
+        var results: [FaceResult] = []
+        results.reserveCapacity(detections.count)
+
+        for (i, face) in detections.enumerated() {
+            let bb = face.boundingBox
+
+            // Landmark count covers both eyes and the nose only.
+            let lms = match(face, at: i, in: landmarkResults)?.landmarks
+            let lmCount = [lms?.leftEye, lms?.rightEye, lms?.nose]
+                .compactMap { $0?.pointCount }
+                .reduce(0, +)
+
+            // Public property; 0 when the quality request produced no score for this face.
+            let quality = match(face, at: i, in: qualityResults)?.faceCaptureQuality ?? 0
+
+            // Check for faceprint (embedding) via KVC.
+            // NOTE(review): "faceprint" and "data" are undocumented keys; value(forKey:) raises
+            // an Objective-C exception if a key is ever undefined — confirm on new OS releases.
+            var hasEmbedding = false
+            if #available(macOS 14, *) {
+                if let fp = face.value(forKey: "faceprint") as? NSObject {
+                    hasEmbedding = (fp.value(forKey: "data") as? Data) != nil
+                }
+            }
+
+            results.append(FaceResult(x: Float(bb.origin.x) * imageWidth,
+                                      y: Float(bb.origin.y) * imageHeight,
+                                      w: Float(bb.size.width) * imageWidth,
+                                      h: Float(bb.size.height) * imageHeight,
+                                      conf: face.confidence,
+                                      quality: quality,
+                                      landmarks: lmCount,
+                                      hasEmbedding: hasEmbedding))
+        }
+
+        return results
+    }
+
+    /// Returns the first run of 32 lowercase hex characters in `filename`, or `""` if there is none.
+    func extractUUID(from filename: String) -> String {
+        guard let regex = try? NSRegularExpression(pattern: "[a-f0-9]{32}") else { return "" }
+        // NSRange counts UTF-16 units; building it from the String keeps the whole name in
+        // range even when it contains non-ASCII characters.
+        let searchRange = NSRange(filename.startIndex..<filename.endIndex, in: filename)
+        guard let match = regex.firstMatch(in: filename, range: searchRange),
+              let matchRange = Range(match.range, in: filename) else { return "" }
+        return String(filename[matchRange])
+    }
+
+    /// Formats one face as the single-line summary shared by the video and image reports.
+    private func describe(_ face: FaceResult) -> String {
+        func fmt(_ value: Float, _ spec: String) -> String { String(format: spec, value) }
+        return "bbox=(\(fmt(face.x, "%.0f")),\(fmt(face.y, "%.0f"))) "
+            + "size=\(fmt(face.w, "%.0f"))x\(fmt(face.h, "%.0f")) "
+            + "conf=\(fmt(face.conf, "%.3f")) quality=\(fmt(face.quality, "%.3f")) "
+            + "landmarks=\(face.landmarks) embedding=\(face.hasEmbedding ? "✅" : "❌")"
+    }
+}
--- a/v1.1/scripts/swift_processors/face_vision_test_v1.11.swift
+++ b/v1.1/scripts/swift_processors/face_vision_test_v1.11.swift
@@ -0,0 +1,98 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AppKit
+
+/// POC: runs Apple Vision face requests on a single image and prints what each one returns —
+/// face rectangles, eye/nose landmark point counts, capture quality, and a KVC probe for a
+/// faceprint on the first detected face.
+@main
+struct FaceVisionTest: ParsableCommand {
+    @Argument(help: "Input image path")
+    var inputPath: String
+
+    /// Loads the image, runs the four probes in order, and prints the elapsed time.
+    ///
+    /// - Throws: Any error raised by `VNImageRequestHandler.perform(_:)`.
+    mutating func run() throws {
+        let startTime = Date()
+        print("=== Apple Vision Framework Face POC ===")
+        #if arch(arm64)
+        print("HW: Apple Silicon ✅")
+        #else
+        print("HW: Intel")
+        #endif
+
+        guard let image = NSImage(contentsOfFile: inputPath) else {
+            print("Error: cannot load image"); return
+        }
+        guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
+            print("Error: cannot get CGImage"); return
+        }
+        print("Image: \(cgImage.width)x\(cgImage.height)")
+        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+
+        // 1. Detect faces (synchronous). Boxes are printed in Vision's normalized coordinates.
+        print("\n--- Detection ---")
+        let detectReq = VNDetectFaceRectanglesRequest()
+        try handler.perform([detectReq])
+        let faces = detectReq.results ?? []
+        print("Faces: \(faces.count)")
+        for (i, f) in faces.enumerated() {
+            let bb = f.boundingBox
+            print("  [\(i)] bbox=(\(String(format: "%.3f", bb.origin.x)),\(String(format: "%.3f", bb.origin.y))) size=(\(String(format: "%.3f", bb.size.width)),\(String(format: "%.3f", bb.size.height))) conf=\(String(format: "%.2f", f.confidence))")
+        }
+
+        guard !faces.isEmpty else { print("No faces"); return }
+
+        // 2. Landmarks, computed for the faces found in step 1 so that `[i]` is intended to
+        // refer to the same face in every section.
+        print("\n--- Landmarks ---")
+        let lmReq = VNDetectFaceLandmarksRequest()
+        lmReq.inputFaceObservations = faces
+        try handler.perform([lmReq])
+        for (i, f) in (lmReq.results ?? []).enumerated() {
+            guard let lms = f.landmarks else { continue }
+            print("  [\(i)] landmarks: \(lms.leftEye?.pointCount ?? 0)+\(lms.rightEye?.pointCount ?? 0) eye pts, nose=\(lms.nose?.pointCount ?? 0)")
+        }
+
+        // 3. Capture quality, again for the faces from step 1. -1 marks "no score returned".
+        print("\n--- Capture Quality ---")
+        let qualReq = VNDetectFaceCaptureQualityRequest()
+        qualReq.inputFaceObservations = faces
+        try handler.perform([qualReq])
+        for (i, f) in (qualReq.results ?? []).enumerated() {
+            let q = Double(f.faceCaptureQuality ?? -1)
+            print("  [\(i)] quality=\(String(format: "%.4f", q))")
+        }
+
+        // 4. Faceprint (embedding), probed through undocumented KVC keys.
+        // NOTE(review): value(forKey:) raises an Objective-C exception for an undefined key;
+        // confirm these keys still exist when moving to a new OS release.
+        print("\n--- Faceprint ---")
+        if #available(macOS 14, *) {
+            let fpClass: AnyClass? = NSClassFromString("VNFaceprint")
+            print("VNFaceprint class: \(fpClass != nil ? "✅ exists" : "❌ nil")")
+
+            if let first = faces.first {
+                let faceprint = first.value(forKey: "faceprint") as? NSObject
+                print("faceprint KVC: \(faceprint != nil ? "✅" : "❌")")
+                if let faceprint {
+                    let data = faceprint.value(forKey: "data") as? Data
+                    let dataText = data.map { "\($0.count) bytes" } ?? "nil"
+                    print("  data: \(dataText)")
+
+                    let descriptor = faceprint.value(forKey: "descriptor") as? NSObject
+                    let descriptorText = descriptor.map { "✅ class=\(type(of: $0))" } ?? "nil"
+                    print("  descriptor: \(descriptorText)")
+
+                    if let descriptor, let elems = descriptor.value(forKey: "elements") as? [NSNumber] {
+                        print("  elements: \(elems.count) dims")
+                        if !elems.isEmpty {
+                            let preview = elems.prefix(5).map { String(format: "%.4f", $0.doubleValue) }
+                            print("  first 5: \(preview.joined(separator: ", "))")
+                        }
+                    }
+                }
+            }
+        } else {
+            print("macOS 14+ required")
+        }
+
+        print("\nTime: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
+        print("=== Done ===")
+    }
+}
--- a/v1.1/scripts/swift_processors/pose_benchmark_v1.11.swift
+++ b/v1.1/scripts/swift_processors/pose_benchmark_v1.11.swift
@@ -0,0 +1,83 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AppKit
+import AVFoundation
+
+/// Benchmark: measures Apple Vision body-pose detection over frames sampled from a video.
+///
+/// Frames are extracted with ffmpeg into a temporary directory, which is removed on every
+/// exit path, and each frame is run through `VNDetectHumanBodyPoseRequest`.
+/// NOTE(review): the argument help mentions an image directory, but only video input is
+/// handled by this code.
+@main
+struct PoseBenchmark: ParsableCommand {
+    @Argument(help: "Video path or image directory")
+    var inputPath: String
+
+    @Option(name: .long, help: "Sample interval (frames)")
+    var sampleInterval: Int = 30
+
+    @Option(name: .long, help: "Path to the ffmpeg executable")
+    var ffmpegPath: String = "/opt/homebrew/bin/ffmpeg"
+
+    /// A non-positive interval would make the ffmpeg `mod(n, interval)` selection meaningless.
+    mutating func validate() throws {
+        guard sampleInterval > 0 else {
+            throw ValidationError("--sample-interval must be greater than 0")
+        }
+    }
+
+    /// Extracts the sampled frames, runs pose detection on each, and prints the timings.
+    ///
+    /// - Throws: Errors from creating the temporary directory or launching ffmpeg.
+    mutating func run() throws {
+        let start = Date()
+        print("=== Vision Body Pose Benchmark ===")
+        #if arch(arm64)
+        print("HW: Apple Silicon ✅")
+        #endif
+
+        // Only the presence of a video track matters here; duration and frame rate are not used.
+        let url = URL(fileURLWithPath: inputPath)
+        guard AVAsset(url: url).tracks(withMediaType: .video).first != nil else {
+            print("No video track"); return
+        }
+
+        // Extract frames with ffmpeg
+        let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("pose_bench_\(UUID().uuidString)")
+        let framesDir = tempDir.appendingPathComponent("frames")
+        try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)
+        // Registered right after creation so the directory is also removed when ffmpeg fails to launch.
+        defer { try? FileManager.default.removeItem(at: tempDir) }
+
+        let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
+        let extract = Process()
+        extract.executableURL = URL(fileURLWithPath: ffmpegPath)
+        extract.arguments = ["-y", "-v", "quiet", "-i", inputPath,
+                             "-vf", "select=not(mod(n\\,\(sampleInterval)))",
+                             "-vsync", "vfr", "-q:v", "5", pattern]
+        try extract.run()
+        extract.waitUntilExit()
+        if extract.terminationStatus != 0 {
+            // Best effort: report the failure and benchmark whatever frames exist.
+            print("Warning: ffmpeg exited with status \(extract.terminationStatus)")
+        }
+
+        let files = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
+        let frameFiles = files.filter { $0.hasSuffix(".jpg") }.sorted()
+        print("Frames: \(frameFiles.count)")
+
+        // Process all frames in one loop (no subprocess overhead).
+        // The measured time includes reading and decoding each JPEG, not just the Vision request.
+        var totalPoses = 0
+        var framesWithPose = 0
+        let inferenceStart = Date()
+
+        for fname in frameFiles {
+            // Drain per-frame Objective-C temporaries so memory stays flat on long videos.
+            autoreleasepool {
+                let imgURL = framesDir.appendingPathComponent(fname)
+                guard let imgData = try? Data(contentsOf: imgURL),
+                      let img = NSImage(data: imgData),
+                      let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { return }
+
+                let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+                let req = VNDetectHumanBodyPoseRequest()
+                try? handler.perform([req])
+
+                if let poses = req.results, !poses.isEmpty {
+                    framesWithPose += 1
+                    totalPoses += poses.count
+                }
+            }
+        }
+
+        let inferenceTime = Date().timeIntervalSince(inferenceStart)
+        let totalTime = Date().timeIntervalSince(start)
+        // Avoid dividing by zero (which printed "nan") when no frames were extracted.
+        let perFrameMs = frameFiles.isEmpty ? 0 : inferenceTime / Double(frameFiles.count) * 1000
+
+        print("\n=== Results ===")
+        print("Frames: \(frameFiles.count), with poses: \(framesWithPose)")
+        print("Total poses: \(totalPoses)")
+        print("Inference: \(String(format: "%.2f", inferenceTime))s")
+        print("Per frame: \(String(format: "%.0f", perFrameMs))ms")
+        print("Total: \(String(format: "%.2f", totalTime))s")
+    }
+}
--- a/v1.1/scripts/swift_processors/speaker_meta_test_v1.11.swift
+++ b/v1.1/scripts/swift_processors/speaker_meta_test_v1.11.swift
@@ -0,0 +1,106 @@
+import Foundation
+import Speech
+
+/// Test: recognizes a fixed clip with SFSpeechRecognizer and probes the final result and its
+/// transcription segments via KVC for speaker-related metadata.
+///
+/// The main thread runs the main run loop until recognition finishes. Recognition handlers
+/// are delivered on the main queue, so blocking the main thread on a semaphore (as this test
+/// originally did) can prevent them from ever running.
+@main
+struct SpeakerMetaTest {
+    /// Mutable state shared between `main()` and the callbacks; touched on the main thread only.
+    private final class Session {
+        var isFinished = false
+        var task: SFSpeechRecognitionTask?
+    }
+
+    /// Keys probed on the result-level metadata object.
+    private static let resultKeys = ["speakerID", "speakerName", "speakerConfidence", "voiceProfileID", "speaker"]
+    /// Keys probed on each segment-level metadata object.
+    private static let segmentKeys = ["speakerID", "speaker", "voice", "speakerConfidence", "speakerName"]
+    /// Keys probed directly on the first segment.
+    private static let firstSegmentKeys = ["metadata", "speaker", "voice", "recognition", "analysis", "audio"]
+
+    static func main() {
+        print("=== Speaker Metadata Test ===")
+
+        let testFile = "/tmp/test_60s_b.wav"
+        guard FileManager.default.fileExists(atPath: testFile) else {
+            print("Test file not found")
+            return
+        }
+        guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US")) else {
+            print("Recognizer unavailable for en-US")
+            return
+        }
+        // Make the delivery queue explicit: handlers run on the main queue, which the loop below services.
+        recognizer.queue = .main
+
+        let session = Session()
+
+        SFSpeechRecognizer.requestAuthorization { status in
+            // The authorization callback is not guaranteed to arrive on the main thread;
+            // hop there so all session state is mutated from one thread.
+            DispatchQueue.main.async {
+                guard status == .authorized else {
+                    print("Authorization: \(status.rawValue)")
+                    session.isFinished = true
+                    return
+                }
+                print("Authorization: ✅")
+                print(recognizer.supportsOnDeviceRecognition ? "On-device: ✅" : "On-device: ❌ (will use server)")
+
+                let request = SFSpeechURLRecognitionRequest(url: URL(fileURLWithPath: testFile))
+                request.shouldReportPartialResults = false
+                request.requiresOnDeviceRecognition = false
+                request.taskHint = .dictation
+
+                print("Starting recognition...")
+                session.task = recognizer.recognitionTask(with: request) { result, error in
+                    if let error {
+                        print("Error: \(error.localizedDescription)")
+                        session.isFinished = true
+                    } else if let result, result.isFinal {
+                        report(result)
+                        session.isFinished = true
+                    }
+                }
+            }
+        }
+
+        // Service the main run loop (and with it the main queue) until a callback marks completion.
+        while !session.isFinished {
+            RunLoop.main.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
+        }
+        session.task?.cancel()
+        print("Done")
+    }
+
+    /// Reads `key` via KVC only when `object` exposes a getter with that name.
+    ///
+    /// Plain `value(forKey:)` raises an Objective-C exception for an undefined key, which
+    /// Swift cannot catch, so a probe of guessed keys must check first. Keys backed only by
+    /// an instance variable (no getter) are reported as absent.
+    private static func probe(_ object: NSObject, key: String) -> Any? {
+        guard object.responds(to: NSSelectorFromString(key)) else { return nil }
+        return object.value(forKey: key)
+    }
+
+    /// Prints every piece of speaker-related metadata found on the final recognition result.
+    private static func report(_ result: SFSpeechRecognitionResult) {
+        let segments = result.bestTranscription.segments
+        print("Recognition complete: \(segments.count) raw segments")
+
+        // Check for metadata on the result object
+        let resultMetadata = probe(result, key: "metadata") as? NSObject
+        print("Result metadata: \(resultMetadata != nil ? "✅" : "❌")")
+        if let resultMetadata {
+            print("  Result metadata class: \(type(of: resultMetadata))")
+            for key in resultKeys {
+                if let value = probe(resultMetadata, key: key) {
+                    print("  result.\(key) = \(value)")
+                }
+            }
+        }
+
+        // Check each segment for metadata. Every segment is counted; only the first three
+        // are printed, to keep the output short.
+        var speakerCount = 0
+        for (i, seg) in segments.enumerated() {
+            guard let segMetadata = probe(seg, key: "metadata") as? NSObject else { continue }
+            let shouldPrint = i < 3
+            if shouldPrint {
+                print("Seg[\(i)] metadata class: \(type(of: segMetadata))")
+            }
+            for key in segmentKeys {
+                guard let value = probe(segMetadata, key: key) else { continue }
+                speakerCount += 1
+                if shouldPrint {
+                    print("  seg.\(key) = \(value)")
+                }
+            }
+        }
+
+        if speakerCount == 0 {
+            print("No speaker metadata found on any segment")
+        } else {
+            print("Found speaker metadata on segments: ✅")
+        }
+
+        // Also check a fixed list of candidate keys on the first segment
+        if let firstSeg = segments.first {
+            print("\nAll KVC keys on first segment:")
+            for key in firstSegmentKeys {
+                if let value = probe(firstSeg, key: key) {
+                    print("  \(key): \(type(of: value)) = \(value)")
+                }
+            }
+        }
+    }
+}
--- a/v1.1/scripts/swift_processors/speaker_test_v1.11.swift
+++ b/v1.1/scripts/swift_processors/speaker_test_v1.11.swift
@@ -0,0 +1,113 @@
+import Foundation
+import Speech
+
+/// POC: Test SFSpeechAnalyzer + SFSpeechRecognitionMetadata for speaker detection
+/// Goal: Determine if ANE-accelerated speaker diarization is feasible
+///
+/// The main thread runs the main run loop until recognition finishes. Recognition handlers
+/// are delivered on the main queue, so blocking the main thread on a semaphore (as this POC
+/// originally did) can prevent them from ever running.
+@main
+struct SpeakerTest {
+    /// Mutable state shared between `main()` and the callbacks; touched on the main thread only.
+    private final class Session {
+        var isFinished = false
+        var task: SFSpeechRecognitionTask?
+    }
+
+    static func main() {
+        print("=== SFSpeechAnalyzer Speaker Detection POC ===")
+
+        // 1. Check if running on ANE-capable hardware
+        #if arch(arm64)
+        print("Hardware: Apple Silicon ✅ (ANE available)")
+        #else
+        print("Hardware: Intel ❌ (No ANE)")
+        #endif
+
+        // 2. Check SFSpeechRecognizer on-device capability.
+        // The same recognizer instance is reused for the real run in step 4.
+        let locale = Locale(identifier: "en-US")
+        let recognizer = SFSpeechRecognizer(locale: locale)
+        print("On-device recognition: \(recognizer?.supportsOnDeviceRecognition == true ? "✅" : "❌")")
+
+        // 3. Check by name whether the analyzer / metadata classes are registered with the runtime
+        if #available(macOS 14, *) {
+            print("\n=== SFSpeechAnalyzer Analysis ===")
+            let analyzerClass: AnyClass? = NSClassFromString("SFSpeechAnalyzer")
+            print("SFSpeechAnalyzer: \(analyzerClass != nil ? "✅ Available" : "❌ Not available")")
+
+            let mdClass: AnyClass? = NSClassFromString("SFSpeechRecognitionMetadata")
+            print("SFSpeechRecognitionMetadata: \(mdClass != nil ? "✅ Available" : "❌ Not available")")
+        }
+
+        // 4. Test: Run ASR and probe the segments for speaker metadata
+        print("\n=== Real-world Test ===")
+        let testFile = "/tmp/test_60s_b.wav"
+        guard FileManager.default.fileExists(atPath: testFile) else {
+            print("Test file not found: \(testFile)")
+            return
+        }
+
+        print("Running ASR with speaker detection on 60s clip...")
+
+        let session = Session()
+
+        SFSpeechRecognizer.requestAuthorization { status in
+            // The authorization callback is not guaranteed to arrive on the main thread;
+            // hop there so all session state is mutated from one thread.
+            DispatchQueue.main.async {
+                guard status == .authorized else {
+                    print("Authorization denied")
+                    session.isFinished = true
+                    return
+                }
+                guard let recognizer, recognizer.isAvailable else {
+                    print("Recognizer not available")
+                    session.isFinished = true
+                    return
+                }
+                // Make the delivery queue explicit: handlers run on the main queue, which the loop below services.
+                recognizer.queue = .main
+
+                let request = SFSpeechURLRecognitionRequest(url: URL(fileURLWithPath: testFile))
+                request.shouldReportPartialResults = false
+                request.requiresOnDeviceRecognition = true
+                request.taskHint = .dictation
+
+                session.task = recognizer.recognitionTask(with: request) { result, error in
+                    if let error {
+                        print("Recognition error: \(error.localizedDescription)")
+                        session.isFinished = true
+                    } else if let result, result.isFinal {
+                        reportSpeakers(in: result)
+                        session.isFinished = true
+                    }
+                }
+            }
+        }
+
+        // Service the main run loop (and with it the main queue) until a callback marks completion.
+        while !session.isFinished {
+            RunLoop.main.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
+        }
+        session.task?.cancel()
+    }
+
+    /// Reads `key` via KVC only when `object` exposes a getter with that name.
+    ///
+    /// Plain `value(forKey:)` raises an Objective-C exception for an undefined key, which
+    /// Swift cannot catch, so a probe of guessed keys must check first. Keys backed only by
+    /// an instance variable (no getter) are reported as absent.
+    private static func probe(_ object: NSObject, key: String) -> Any? {
+        guard object.responds(to: NSSelectorFromString(key)) else { return nil }
+        return object.value(forKey: key)
+    }
+
+    /// Prints the transcript summary and any speaker ids found on the segments' metadata.
+    private static func reportSpeakers(in result: SFSpeechRecognitionResult) {
+        let transcription = result.bestTranscription
+        print("Text: \(transcription.formattedString.prefix(200))")
+        print("Segments: \(transcription.segments.count)")
+
+        guard #available(macOS 14, *) else {
+            print("macOS 14+ required for speaker metadata")
+            return
+        }
+
+        // Check each segment for speaker metadata. The keys are guesses at undocumented API.
+        var detectedSpeakers: Set<String> = []
+        for (i, seg) in transcription.segments.enumerated() {
+            guard let md = probe(seg, key: "metadata") as? NSObject,
+                  let sid = probe(md, key: "speakerID") as? String else { continue }
+            detectedSpeakers.insert(sid)
+
+            // Print the first five segments and then every 20th to keep the output short.
+            if i < 5 || i % 20 == 0 {
+                let speakerName = probe(md, key: "speakerName") as? String
+                let confidence = probe(md, key: "speakerConfidence") as? Double
+                print("  Seg[\(i)] speaker=\(sid) name=\(speakerName ?? "?") conf=\(confidence ?? 0) text=\"\(seg.substring.prefix(40))\"")
+            }
+        }
+
+        print("\nUnique speakers detected: \(detectedSpeakers)")
+        if detectedSpeakers.isEmpty {
+            print("⚠️ No speaker metadata found in recognition results")
+        }
+    }
+}
--- a/v1.1/scripts/swift_processors/swift_cut_test_v1.11.swift
+++ b/v1.1/scripts/swift_processors/swift_cut_test_v1.11.swift
@@ -0,0 +1,191 @@
+import Foundation
+import AVFoundation
+import ArgumentParser
+import Accelerate
+
+/// POC: Swift-based scene cut detection using AVFoundation histogram analysis
+/// Compared against Python PySceneDetect ContentDetector (threshold=27)
+///
+/// Every `sampleInterval`-th decoded frame (decoded as 320x180 BGRA) is reduced to a
+/// normalized 64-bin luminance histogram. A cut is recorded whenever the L1 distance
+/// between two consecutive sampled histograms exceeds `threshold`.
+@main
+struct SwiftCutTest: ParsableCommand {
+    @Argument(help: "Video file path")
+    var inputPath: String
+
+    @Argument(help: "Output JSON path (optional)")
+    var outputPath: String?
+
+    @Option(name: .long, help: "Detection threshold (higher= fewer cuts, default 0.3)")
+    var threshold: Double = 0.3
+
+    @Option(name: .long, help: "Sample interval in frames (default=1)")
+    var sampleInterval: Int = 1
+
+    /// Rejects sample intervals for which `frameIndex % sampleInterval` would trap (0)
+    /// or be meaningless (negative values).
+    func validate() throws {
+        guard sampleInterval > 0 else {
+            throw ValidationError("--sample-interval must be a positive number of frames")
+        }
+    }
+
+    /// Decodes the video, detects cuts, prints a summary and optionally writes a JSON report.
+    ///
+    /// - Throws: File-system errors from writing the report, or `ExitCode.failure` when the
+    ///   collected results cannot be represented as JSON (e.g. non-finite numbers).
+    mutating func run() throws {
+        let startTime = Date()
+        print("=== Swift Scene Cut Detection POC ===")
+        #if arch(arm64)
+        print("HW: Apple Silicon ✅ (ANE available)")
+        #endif
+
+        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
+
+        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
+            print("Error: No video track found"); return
+        }
+
+        let duration = asset.duration.seconds
+        let fps = videoTrack.nominalFrameRate
+        // `Int(_:)` traps on NaN/infinity (e.g. an indefinite asset duration), so the
+        // estimate is sanitized before conversion; 0 means "frame count unknown".
+        let estimatedFrames = duration * Double(fps)
+        let totalFrames = (estimatedFrames.isFinite && estimatedFrames > 0) ? Int(estimatedFrames) : 0
+        print("Video: \(inputPath)")
+        print("Duration: \(String(format: "%.1f", duration))s")
+        print("FPS: \(String(format: "%.1f", fps))")
+        print("Total frames: \(totalFrames)")
+        print("Threshold: \(String(format: "%.2f", threshold))")
+        print("Sample interval: \(sampleInterval)")
+
+        // Read frame histogram data using AVAssetReader
+        guard let reader = try? AVAssetReader(asset: asset) else {
+            print("Error: Cannot create asset reader"); return
+        }
+
+        let settings: [String: Any] = [
+            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA,
+            kCVPixelBufferWidthKey as String: 320,  // downscale for speed
+            kCVPixelBufferHeightKey as String: 180,
+        ]
+        let trackOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: settings)
+        // `add(_:)` raises an Objective-C exception for an output the reader cannot accept.
+        guard reader.canAdd(trackOutput) else {
+            print("Error: Cannot add video track output to asset reader"); return
+        }
+        reader.add(trackOutput)
+        guard reader.startReading() else {
+            print("Error: Cannot start reading: \(reader.error?.localizedDescription ?? "unknown error")"); return
+        }
+
+        var frameIndex = 0
+        var prevHistogram: [Float]?
+        var scenes: [(start: Double, end: Double)] = []
+        var sceneStart: Double = 0
+        var diffs: [(frame: Int, diff: Float)] = []
+        var lastPrint = 0
+
+        while reader.status == .reading {
+            guard let sampleBuffer = trackOutput.copyNextSampleBuffer() else { break }
+            // Runs at the end of every iteration, including the `continue` below.
+            defer { CMSampleBufferInvalidate(sampleBuffer) }
+
+            // Buffers without image data are skipped and do not advance the frame index.
+            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { continue }
+
+            if frameIndex % sampleInterval == 0 {
+                let timestamp = CMTimeGetSeconds(CMSampleBufferGetPresentationTimeStamp(sampleBuffer))
+                let histogram = computeLuminanceHistogram(pixelBuffer: pixelBuffer)
+
+                // An empty histogram means the pixel data was unreadable; such a frame is
+                // ignored entirely so it can neither create nor hide a cut.
+                if !histogram.isEmpty {
+                    if let prev = prevHistogram {
+                        let diff = histogramDifference(prev, histogram)
+
+                        if diff > Float(threshold) {
+                            // Scene cut detected: close the running scene at this timestamp.
+                            scenes.append((start: sceneStart, end: timestamp))
+                            sceneStart = timestamp
+                            diffs.append((frame: frameIndex, diff: diff))
+
+                            if scenes.count % 50 == 0 {
+                                print("  Scenes so far: \(scenes.count) at frame \(frameIndex)/\(totalFrames)")
+                            }
+                        }
+                    }
+                    prevHistogram = histogram
+                }
+            }
+
+            frameIndex += 1
+
+            // Progress every 5%; skipped when the total is unknown to avoid dividing by zero.
+            if totalFrames > 0 {
+                let pct = frameIndex * 100 / totalFrames
+                if pct >= lastPrint + 5 {
+                    print("  Progress: \(pct)% (\(frameIndex)/\(totalFrames) frames)")
+                    lastPrint = pct
+                }
+            }
+        }
+
+        if reader.status == .failed {
+            fputs("Warning: reading stopped early: \(reader.error?.localizedDescription ?? "unknown error")\n", stderr)
+        }
+
+        // Add last scene
+        if sceneStart < duration {
+            scenes.append((start: sceneStart, end: duration))
+        }
+
+        let elapsed = Date().timeIntervalSince(startTime)
+        // Real-time factor is undefined for a zero or non-finite duration.
+        let rtf: Double? = (duration.isFinite && duration > 0) ? elapsed / duration : nil
+
+        print("\n=== Results ===")
+        print("Scenes detected: \(scenes.count)")
+        print("Time: \(String(format: "%.2f", elapsed))s")
+        if let rtf {
+            print("RTF: \(String(format: "%.3f", rtf))x")
+        }
+        print("Last 5 cuts:")
+        for s in scenes.suffix(5) {
+            print("  \(String(format: "%.1f", s.start))s - \(String(format: "%.1f", s.end))s")
+        }
+
+        // Output JSON if requested
+        if let outPath = outputPath {
+            let rtfValue: Any
+            if let rtf {
+                rtfValue = rtf
+            } else {
+                rtfValue = NSNull()
+            }
+            let metadata: [String: Any] = [
+                "video_path": inputPath,
+                "duration": duration,
+                "fps": fps,
+                "total_frames": totalFrames,
+                "threshold": threshold,
+                "sample_interval": sampleInterval,
+                "processing_time": elapsed,
+                "rtf": rtfValue,
+            ]
+            let outputDict: [String: Any] = [
+                "scenes": scenes.map { ["start_time": $0.start, "end_time": $0.end] },
+                "metadata": metadata,
+                "diffs": diffs.map { entry -> [String: Any] in
+                    ["frame": entry.frame, "diff": String(format: "%.4f", entry.diff)]
+                },
+            ]
+            // JSONSerialization raises an uncatchable Objective-C exception for NaN/infinity,
+            // so the payload is validated first and reported as an ordinary failure.
+            guard JSONSerialization.isValidJSONObject(outputDict) else {
+                fputs("Error: Results contain values that cannot be encoded as JSON\n", stderr)
+                throw ExitCode.failure
+            }
+            let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
+            try jsonData.write(to: URL(fileURLWithPath: outPath))
+            print("Output written to: \(outPath)")
+        }
+    }
+
+    /// Builds a normalized 64-bin luminance histogram from a 32BGRA pixel buffer.
+    ///
+    /// - Parameter pixelBuffer: Frame whose rows are read as 4 bytes per pixel in B, G, R, A order.
+    /// - Returns: 64 bin fractions that sum to 1, or an empty array when the buffer has no
+    ///   pixels or its base address cannot be obtained.
+    func computeLuminanceHistogram(pixelBuffer: CVPixelBuffer) -> [Float] {
+        CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly)
+        defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) }
+
+        let width = CVPixelBufferGetWidth(pixelBuffer)
+        let height = CVPixelBufferGetHeight(pixelBuffer)
+        let bytesPerRow = CVPixelBufferGetBytesPerRow(pixelBuffer)
+        guard width > 0, height > 0,
+              let baseAddress = CVPixelBufferGetBaseAddress(pixelBuffer) else {
+            return []
+        }
+
+        let binCount = 64  // 64 bins for speed
+        var histogram = [Float](repeating: 0, count: binCount)
+
+        for y in 0..<height {
+            // Rows may be padded, so each row starts at a multiple of bytesPerRow.
+            let pixels = baseAddress.advanced(by: y * bytesPerRow).assumingMemoryBound(to: UInt8.self)
+            for x in 0..<width {
+                let offset = x * 4
+                let b = Float(pixels[offset])
+                let g = Float(pixels[offset + 1])
+                let r = Float(pixels[offset + 2])
+                let luminance = 0.299 * r + 0.587 * g + 0.114 * b
+                let bin = min(Int(luminance / 256.0 * Float(binCount)), binCount - 1)
+                histogram[bin] += 1
+            }
+        }
+
+        // Normalize so histograms are comparable regardless of frame size.
+        let total = Float(width * height)
+        return histogram.map { $0 / total }
+    }
+
+    /// L1 distance between two histograms, summed over the bins both of them have.
+    func histogramDifference(_ a: [Float], _ b: [Float]) -> Float {
+        zip(a, b).reduce(0) { $0 + abs($1.0 - $1.1) }
+    }
+}
--- a/v1.1/scripts/swift_processors/swift_face_v1.11.swift
+++ b/v1.1/scripts/swift_processors/swift_face_v1.11.swift
@@ -0,0 +1,291 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AVFoundation
+
+/// Swift Face Processor - Apple Vision Framework for face detection + pose
+/// Uses AVAssetImageGenerator for reliable frame access (no AVAssetReader corruption).
+///
+/// Writes `{ "frame_count", "fps", "frames": [{ "frame", "timestamp", "faces": [...] }] }`
+/// where only frames with at least one accepted face are listed.
+@main
+struct SwiftFace: ParsableCommand {
+    @Argument(help: "Video file path")
+    var inputPath: String
+
+    @Argument(help: "Output JSON path")
+    var outputPath: String
+
+    @Option(name: .long, help: "Sample interval (frames, default=30)")
+    var sampleInterval: Int = 30
+
+    @Option(name: .long, help: "UUID for logging")
+    var uuid: String = ""
+
+    /// Rejects sample intervals that would divide by zero when counting samples.
+    func validate() throws {
+        guard sampleInterval > 0 else {
+            throw ValidationError("--sample-interval must be a positive number of frames")
+        }
+    }
+
+    /// Samples the video every `sampleInterval` frames, runs Vision face-rectangle and
+    /// face-landmark detection on each sample, and writes the accepted faces as JSON.
+    ///
+    /// - Throws: File-system errors from writing the output, or `ExitCode.failure` when
+    ///   the results cannot be represented as JSON.
+    mutating func run() throws {
+        let startTime = Date()
+        print("[SwiftFace] Vision-based face detection: \(inputPath)")
+
+        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
+
+        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
+            print("[SwiftFace] No video track found")
+            return
+        }
+
+        let fps = videoTrack.nominalFrameRate
+        let duration = CMTimeGetSeconds(asset.duration)
+        // `Int(_:)` traps on NaN/infinity (e.g. an indefinite duration); treat that as 0 frames.
+        let estimatedFrames = duration * Double(fps)
+        let totalFrames = (estimatedFrames.isFinite && estimatedFrames > 0) ? Int(estimatedFrames) : 0
+        let width = Int(videoTrack.naturalSize.width)
+        let height = Int(videoTrack.naturalSize.height)
+        print("[SwiftFace] Video: \(width)x\(height), \(String(format: "%.1f", fps))fps, \(totalFrames) frames")
+
+        let generator = AVAssetImageGenerator(asset: asset)
+        generator.requestedTimeToleranceBefore = .zero
+        generator.requestedTimeToleranceAfter = .zero
+        generator.appliesPreferredTrackTransform = true
+
+        // Both are mutated from the generator's completion handler and only read here after
+        // the semaphore has been signalled once per requested frame.
+        var allFrames: [[String: Any]] = []
+        var processedCount = 0
+
+        let frameInterval = TimeInterval(sampleInterval) / Double(fps)
+
+        // Quality filtering thresholds shared by landmark and rectangle observations.
+        let minConfidence = 0.6
+        let minFaceSize = 20
+
+        // Process in batches of 1000 frames to avoid memory pressure
+        let batchSize = 1000
+        let totalSamples = totalFrames / sampleInterval
+
+        for batchStart in stride(from: 0, to: totalSamples, by: batchSize) {
+            let batchEnd = min(batchStart + batchSize, totalSamples)
+            let times = (batchStart..<batchEnd).map { sampleIndex -> NSValue in
+                NSValue(time: CMTime(seconds: Double(sampleIndex) * frameInterval, preferredTimescale: 1000))
+            }
+
+            let semaphore = DispatchSemaphore(value: 0)
+
+            generator.generateCGImagesAsynchronously(forTimes: times) { requestedTime, cgImage, actualTime, result, error in
+                // Exactly one signal per requested time, whatever path the handler takes.
+                defer { semaphore.signal() }
+
+                guard result == .succeeded, let cgImage else {
+                    if let error {
+                        fputs("[SwiftFace] Frame error at \(CMTimeGetSeconds(requestedTime)): \(error.localizedDescription)\n", stderr)
+                    }
+                    return
+                }
+
+                guard let pixelBuffer = SwiftFace.makePixelBuffer(from: cgImage) else {
+                    fputs("[SwiftFace] Could not create pixel buffer at \(CMTimeGetSeconds(requestedTime))\n", stderr)
+                    return
+                }
+
+                let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
+                let detectReq = VNDetectFaceRectanglesRequest()
+                let lmReq = VNDetectFaceLandmarksRequest()
+
+                do {
+                    try handler.perform([detectReq, lmReq])
+                } catch {
+                    fputs("[SwiftFace] Vision error at \(CMTimeGetSeconds(requestedTime)): \(error.localizedDescription)\n", stderr)
+                    return
+                }
+
+                let faceObservations = detectReq.results ?? []
+                let landmarkObservations = lmReq.results ?? []
+                guard !faceObservations.isEmpty || !landmarkObservations.isEmpty else {
+                    return
+                }
+
+                let seconds = CMTimeGetSeconds(actualTime)
+                let frameNumber = Int(seconds * Double(fps))
+                var frameFaces: [[String: Any]] = []
+
+                // Use actual CGImage size (may differ from naturalSize after transform)
+                let imageSize = CGSize(width: cgImage.width, height: cgImage.height)
+
+                // Process landmark observations FIRST (each has bbox + landmarks, self-consistent)
+                for lmObs in landmarkObservations {
+                    let confidence = Double(lmObs.confidence)
+                    if confidence < minConfidence { continue }
+                    guard let bbox = SwiftFace.pixelBox(for: lmObs.boundingBox, imageSize: imageSize,
+                                                        minSize: minFaceSize) else { continue }
+
+                    var faceData: [String: Any] = ["bbox": bbox, "confidence": confidence]
+                    if let pose = SwiftFace.poseInfo(of: lmObs) {
+                        faceData["pose"] = pose
+                    }
+
+                    if let lms = lmObs.landmarks {
+                        // Only regions that produced points get a key under "landmarks".
+                        let regions: [(String, VNFaceLandmarkRegion2D?)] = [
+                            ("left_eye", lms.leftEye), ("right_eye", lms.rightEye), ("nose", lms.nose),
+                        ]
+                        var lm: [String: [[Double]]] = [:]
+                        for (key, region) in regions {
+                            let points = SwiftFace.topLeftPoints(of: region, imageSize: imageSize)
+                            if !points.isEmpty { lm[key] = points }
+                        }
+                        if !lm.isEmpty {
+                            faceData["landmarks"] = lm
+                        }
+
+                        // Lips are emitted as a pair as soon as either contour has points.
+                        let outer = SwiftFace.topLeftPoints(of: lms.outerLips, imageSize: imageSize)
+                        let inner = SwiftFace.topLeftPoints(of: lms.innerLips, imageSize: imageSize)
+                        if !outer.isEmpty || !inner.isEmpty {
+                            faceData["lips"] = ["outer_lips": outer, "inner_lips": inner]
+                        }
+                    }
+
+                    frameFaces.append(faceData)
+                }
+
+                // Output face rect observations that the landmark detector missed.
+                // Match against ALL landmark observations via IoU to avoid duplicates
+                // (this includes landmark observations rejected by the quality filter).
+                for faceObs in faceObservations {
+                    let fBB = faceObs.boundingBox
+                    let matched = landmarkObservations.contains {
+                        SwiftFace.intersectionOverUnion(fBB, $0.boundingBox) > 0.3
+                    }
+                    if matched { continue }
+
+                    let confidence = Double(faceObs.faceCaptureQuality ?? faceObs.confidence)
+                    if confidence < minConfidence { continue }
+                    guard let bbox = SwiftFace.pixelBox(for: fBB, imageSize: imageSize,
+                                                        minSize: minFaceSize) else { continue }
+
+                    // Unmatched face rect: output without landmarks
+                    var faceData: [String: Any] = ["bbox": bbox, "confidence": confidence]
+                    if let pose = SwiftFace.poseInfo(of: faceObs) {
+                        faceData["pose"] = pose
+                    }
+                    frameFaces.append(faceData)
+                }
+
+                if !frameFaces.isEmpty {
+                    allFrames.append([
+                        "frame": frameNumber,
+                        "timestamp": seconds,
+                        "faces": frameFaces,
+                    ])
+                    processedCount += 1
+                }
+            }
+
+            // Wait for batch to complete
+            for _ in batchStart..<batchEnd {
+                semaphore.wait()
+            }
+
+            let elapsed = Date().timeIntervalSince(startTime)
+            let pct = Int(Double(min(batchEnd, totalSamples)) / Double(totalSamples) * 100)
+            print("[SwiftFace] \(processedCount) frames with faces, \(pct)% complete, \(Int(elapsed))s elapsed")
+            fflush(stdout)
+        }
+
+        generator.cancelAllCGImageGeneration()
+
+        let output: [String: Any] = [
+            "frame_count": allFrames.count,
+            "fps": Double(fps),
+            "frames": allFrames,
+        ]
+
+        // JSONSerialization raises an uncatchable Objective-C exception for NaN/infinity,
+        // so validate first and exit non-zero instead of silently producing no output.
+        guard JSONSerialization.isValidJSONObject(output) else {
+            fputs("[SwiftFace] Failed to serialize JSON\n", stderr)
+            throw ExitCode.failure
+        }
+        let jsonData = try JSONSerialization.data(withJSONObject: output, options: [])
+        try jsonData.write(to: URL(fileURLWithPath: outputPath))
+
+        let elapsed = Date().timeIntervalSince(startTime)
+        print("[SwiftFace] Done: \(allFrames.count) frames, \(String(format: "%.1f", elapsed))s → \(outputPath)")
+    }
+
+    // The helpers below are static because the escaping completion handler inside the
+    // mutating `run()` is not allowed to capture `self`.
+
+    /// Renders `image` into a newly created 32BGRA pixel buffer for Vision.
+    /// Returns nil when the buffer or the drawing context cannot be created.
+    private static func makePixelBuffer(from image: CGImage) -> CVPixelBuffer? {
+        let attrs: [CFString: Any] = [
+            kCVPixelBufferCGImageCompatibilityKey: true,
+            kCVPixelBufferCGBitmapContextCompatibilityKey: true,
+            kCVPixelBufferWidthKey: image.width,
+            kCVPixelBufferHeightKey: image.height,
+        ]
+        var created: CVPixelBuffer?
+        let status = CVPixelBufferCreate(kCFAllocatorDefault, image.width, image.height,
+                                         kCVPixelFormatType_32BGRA, attrs as CFDictionary, &created)
+        guard status == kCVReturnSuccess, let buffer = created else { return nil }
+
+        // The lock is only needed while the CPU draws into the buffer's memory.
+        CVPixelBufferLockBaseAddress(buffer, [])
+        defer { CVPixelBufferUnlockBaseAddress(buffer, []) }
+
+        guard let context = CGContext(
+            data: CVPixelBufferGetBaseAddress(buffer),
+            width: image.width, height: image.height,
+            bitsPerComponent: 8, bytesPerRow: CVPixelBufferGetBytesPerRow(buffer),
+            space: CGColorSpaceCreateDeviceRGB(),
+            bitmapInfo: CGImageAlphaInfo.noneSkipFirst.rawValue | CGBitmapInfo.byteOrder32Little.rawValue
+        ) else { return nil }
+        context.draw(image, in: CGRect(x: 0, y: 0, width: image.width, height: image.height))
+        return buffer
+    }
+
+    /// Converts a normalized, bottom-left-origin Vision box into top-left pixel coordinates.
+    /// Returns nil when the box is narrower or shorter than `minSize` pixels.
+    private static func pixelBox(for normalized: CGRect, imageSize: CGSize, minSize: Int) -> [String: Int]? {
+        let boxWidth = Int(normalized.size.width * imageSize.width)
+        let boxHeight = Int(normalized.size.height * imageSize.height)
+        guard boxWidth >= minSize, boxHeight >= minSize else { return nil }
+
+        let x = Int(normalized.origin.x * imageSize.width)
+        let y = Int((1.0 - normalized.origin.y - normalized.size.height) * imageSize.height)
+        return ["x": max(0, x), "y": max(0, y), "width": boxWidth, "height": boxHeight]
+    }
+
+    /// Roll/yaw (plus pitch when present) of a face, or nil when roll or yaw is missing.
+    private static func poseInfo(of face: VNFaceObservation) -> [String: Any]? {
+        guard let yaw = face.yaw?.doubleValue, let roll = face.roll?.doubleValue else { return nil }
+        var pose: [String: Any] = ["roll": roll, "yaw": yaw]
+        if let pitch = face.pitch?.doubleValue {
+            pose["pitch"] = pitch
+        }
+        return pose
+    }
+
+    /// Landmark points in image pixels with the Y axis flipped to a top-left origin.
+    /// Returns an empty array for a missing region.
+    private static func topLeftPoints(of region: VNFaceLandmarkRegion2D?, imageSize: CGSize) -> [[Double]] {
+        guard let region else { return [] }
+        return region.pointsInImage(imageSize: imageSize).map {
+            [Double($0.x), Double(imageSize.height - $0.y)]
+        }
+    }
+
+    /// Intersection-over-union of two rectangles; 0 when they do not overlap.
+    private static func intersectionOverUnion(_ a: CGRect, _ b: CGRect) -> CGFloat {
+        let overlapWidth = min(a.maxX, b.maxX) - max(a.origin.x, b.origin.x)
+        let overlapHeight = min(a.maxY, b.maxY) - max(a.origin.y, b.origin.y)
+        guard overlapWidth > 0, overlapHeight > 0 else { return 0 }
+        let intersection = overlapWidth * overlapHeight
+        let union = a.width * a.height + b.width * b.height - intersection
+        return intersection / union
+    }
+}
--- a/v1.1/scripts/swift_processors/swift_ocr_v1.11.swift
+++ b/v1.1/scripts/swift_processors/swift_ocr_v1.11.swift
@@ -0,0 +1,204 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AVFoundation
+import AppKit
+
+/// Swift OCR Processor - replaces Python PaddleOCR
+/// Uses Apple Vision Framework (VNRecognizeTextRequest) with ANE acceleration
+///
+/// Output format (compatible with OcrResult Rust struct):
+/// {
+///   "frame_count": N,
+///   "fps": 30.0,
+///   "frames": [
+///     { "frame": 0, "timestamp": 0.0, "texts": [{ "text": "...", "x": 0, "y": 0, "width": 0, "height": 0, "confidence": 0.0 }] }
+///   ]
+/// }
+///
+/// Text boxes are expressed in pixels of the image that was actually analysed, with a
+/// top-left origin.
+@main
+struct SwiftOCR: ParsableCommand {
+    @Argument(help: "Video file path")
+    var inputPath: String
+
+    @Argument(help: "Output JSON path")
+    var outputPath: String
+
+    @Option(name: .long, help: "Frames to skip between OCR (default=30)")
+    var sampleInterval: Int = 30
+
+    @Option(name: .long, help: "Video FPS (auto-detect if 0)")
+    var fps: Double = 0
+
+    @Option(name: .long, help: "UUID for logging")
+    var uuid: String = ""
+
+    @Option(name: .long, help: "Recognition level: fast or accurate (default=accurate)")
+    var recognitionLevel: String = "accurate"
+
+    /// Rejects sample intervals that would produce an invalid ffmpeg `mod(n, 0)` filter.
+    func validate() throws {
+        guard sampleInterval > 0 else {
+            throw ValidationError("--sample-interval must be a positive number of frames")
+        }
+    }
+
+    /// Runs OCR over sampled frames (from the shared frame cache, or extracted with ffmpeg
+    /// into a temporary directory) and writes the recognized text as JSON.
+    ///
+    /// - Throws: Errors from creating the temp directory, launching ffmpeg or writing the
+    ///   output, or `ExitCode.failure` when the results cannot be represented as JSON.
+    mutating func run() throws {
+        let startTime = Date()
+        print("[SwiftOCR] Starting: \(inputPath)")
+        print("[SwiftOCR] Sample interval: \(sampleInterval)")
+
+        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
+
+        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
+            print("[SwiftOCR] Error: No video track"); return
+        }
+
+        let duration = asset.duration.seconds
+        let detectedFps = fps > 0 ? fps : Double(videoTrack.nominalFrameRate)
+        // `Int(_:)` traps on NaN/infinity (e.g. an indefinite duration); report 0 instead.
+        let estimatedFrames = duration * detectedFps
+        let totalFrames = (estimatedFrames.isFinite && estimatedFrames > 0) ? Int(estimatedFrames) : 0
+        print("[SwiftOCR] Duration: \(String(format: "%.1f", duration))s, FPS: \(String(format: "%.1f", detectedFps)), Frames: \(totalFrames)")
+
+        let frameStep = sampleInterval
+        let fileManager = FileManager.default
+
+        // Use shared frame cache if available (set by FrameManager)
+        let sharedCacheDir = ProcessInfo.processInfo.environment["MOMENTRY_FRAME_DIR"]
+        let framesDir: URL
+        var ownedTempDir: URL?
+        // Clean up the temp dir we created (never the shared cache), even when a later
+        // step throws.
+        defer {
+            if let ownedTempDir {
+                try? fileManager.removeItem(at: ownedTempDir)
+            }
+        }
+
+        if let sharedCacheDir {
+            framesDir = URL(fileURLWithPath: sharedCacheDir)
+            print("[SwiftOCR] Using shared frame cache: \(sharedCacheDir)")
+        } else {
+            let tempDir = fileManager.temporaryDirectory.appendingPathComponent("swift_ocr_\(UUID().uuidString)")
+            ownedTempDir = tempDir
+            framesDir = tempDir.appendingPathComponent("frames")
+            try fileManager.createDirectory(at: framesDir, withIntermediateDirectories: true)
+
+            let framePattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
+            print("[SwiftOCR] Extracting frames with ffmpeg (interval=\(frameStep))...")
+
+            let extractProc = Process()
+            extractProc.executableURL = SwiftOCR.ffmpegExecutableURL()
+            extractProc.arguments = ["-y", "-v", "quiet", "-i", inputPath,
+                                      "-vf", "select=not(mod(n\\,\(frameStep))),scale=320:-2",
+                                      "-vsync", "vfr", "-q:v", "15", framePattern]
+            let startExtract = Date()
+            try extractProc.run()
+            extractProc.waitUntilExit()
+            if extractProc.terminationStatus != 0 {
+                // Best effort: whatever frames were written are still processed.
+                fputs("[SwiftOCR] Warning: ffmpeg exited with status \(extractProc.terminationStatus)\n", stderr)
+            }
+            let extractTime = Date().timeIntervalSince(startExtract)
+            print("[SwiftOCR] Frame extraction complete: \(String(format: "%.1f", extractTime))s")
+        }
+
+        // Sort extracted frame files
+        let allFiles = (try? fileManager.contentsOfDirectory(atPath: framesDir.path)) ?? []
+        let frameFiles = allFiles
+            .filter { $0.hasPrefix("frame_") && $0.hasSuffix(".jpg") }
+            .sorted()
+
+        let level: VNRequestTextRecognitionLevel = (recognitionLevel == "fast") ? .fast : .accurate
+        var ocrFrames: [[String: Any]] = []
+        var lastProgress = 0
+        let sampleCount = frameFiles.count
+
+        for (i, frameName) in frameFiles.enumerated() {
+            // A command-line tool has no run loop draining autoreleased objects, so each
+            // frame's image data is released before the next frame is loaded.
+            autoreleasepool {
+                let imageURL = framesDir.appendingPathComponent(frameName)
+                guard let imgData = try? Data(contentsOf: imageURL),
+                      let img = NSImage(data: imgData),
+                      let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
+                    return
+                }
+
+                // Number embedded in the file name ("frame_00012.jpg" -> 12).
+                let fileNumber = Int(frameName.dropFirst("frame_".count).dropLast(".jpg".count))
+                let frameNumber: Int
+                if sharedCacheDir != nil {
+                    // NOTE(review): assumes shared-cache file names carry the source frame
+                    // number — confirm against FrameManager's naming scheme.
+                    frameNumber = fileNumber ?? (i * frameStep)
+                } else {
+                    // ffmpeg numbers its output images 1, 2, 3, ... in output order, so the
+                    // k-th image is source frame (k - 1) * frameStep.
+                    frameNumber = ((fileNumber ?? (i + 1)) - 1) * frameStep
+                }
+                let timestamp = Double(frameNumber) / detectedFps
+
+                let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+                let request = SwiftOCR.makeTextRequest(level: level)
+                guard (try? handler.perform([request])) != nil,
+                      let results = request.results else { return }
+
+                let texts = SwiftOCR.textItems(from: results,
+                                               imageWidth: CGFloat(cgImage.width),
+                                               imageHeight: CGFloat(cgImage.height),
+                                               minConfidence: 0.3)
+                if !texts.isEmpty {
+                    ocrFrames.append([
+                        "frame": frameNumber,
+                        "timestamp": timestamp,
+                        "texts": texts
+                    ])
+                }
+            }
+
+            let pct = i * 100 / sampleCount
+            if pct >= lastProgress + 5 {
+                print("[SwiftOCR] Progress: \(pct)% (\(i)/\(sampleCount) samples, \(ocrFrames.count) with text)")
+                lastProgress = pct
+            }
+        }
+
+        // Write output
+        let outputDict: [String: Any] = [
+            "frame_count": ocrFrames.count,
+            "fps": detectedFps,
+            "frames": ocrFrames
+        ]
+
+        // JSONSerialization raises an uncatchable Objective-C exception for NaN/infinity
+        // (e.g. timestamps computed with an undetected FPS of 0), so validate first.
+        guard JSONSerialization.isValidJSONObject(outputDict) else {
+            fputs("[SwiftOCR] Error: Results contain values that cannot be encoded as JSON\n", stderr)
+            throw ExitCode.failure
+        }
+        let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
+        try jsonData.write(to: URL(fileURLWithPath: outputPath))
+
+        let elapsed = Date().timeIntervalSince(startTime)
+        print("[SwiftOCR] Complete: \(ocrFrames.count) frames with text, \(String(format: "%.1f", elapsed))s")
+        if duration > 0 {
+            print("[SwiftOCR] RTF: \(String(format: "%.3f", elapsed / duration))x")
+        }
+    }
+
+    /// Recognizes text in a pixel buffer and returns one dictionary per accepted line
+    /// (`text`, `x`, `y`, `width`, `height`, `confidence`), keeping candidates whose
+    /// confidence is above 0.2. Returns an empty array when recognition fails.
+    func recognizeText(pixelBuffer: CVPixelBuffer, level: VNRequestTextRecognitionLevel) -> [[String: Any]] {
+        let request = SwiftOCR.makeTextRequest(level: level)
+        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
+        guard (try? handler.perform([request])) != nil,
+              let results = request.results else { return [] }
+
+        return SwiftOCR.textItems(from: results,
+                                  imageWidth: CGFloat(CVPixelBufferGetWidth(pixelBuffer)),
+                                  imageHeight: CGFloat(CVPixelBufferGetHeight(pixelBuffer)),
+                                  minConfidence: 0.2)
+    }
+
+    /// Text recognition request configured the way both OCR entry points use it.
+    private static func makeTextRequest(level: VNRequestTextRecognitionLevel) -> VNRecognizeTextRequest {
+        let request = VNRecognizeTextRequest()
+        request.recognitionLevel = level
+        request.usesLanguageCorrection = true
+        request.preferBackgroundProcessing = true
+        return request
+    }
+
+    /// Converts Vision text observations into output dictionaries, keeping the top
+    /// candidate of each observation when its confidence exceeds `minConfidence`.
+    /// Boxes are converted from normalized bottom-left-origin to top-left pixel coordinates.
+    private static func textItems(from observations: [VNRecognizedTextObservation],
+                                  imageWidth: CGFloat,
+                                  imageHeight: CGFloat,
+                                  minConfidence: VNConfidence) -> [[String: Any]] {
+        observations.compactMap { observation -> [String: Any]? in
+            guard let candidate = observation.topCandidates(1).first,
+                  candidate.confidence > minConfidence else { return nil }
+            let bb = observation.boundingBox
+            return [
+                "text": candidate.string,
+                "x": Int(bb.origin.x * imageWidth),
+                "y": Int((1.0 - bb.origin.y - bb.size.height) * imageHeight),
+                "width": Int(bb.size.width * imageWidth),
+                "height": Int(bb.size.height * imageHeight),
+                "confidence": candidate.confidence
+            ]
+        }
+    }
+
+    /// Locates ffmpeg in the usual install locations, preferring the Apple Silicon
+    /// Homebrew path. Falls back to that path when none is found, so launching fails
+    /// with the same error as before.
+    private static func ffmpegExecutableURL() -> URL {
+        let candidates = ["/opt/homebrew/bin/ffmpeg", "/usr/local/bin/ffmpeg", "/usr/bin/ffmpeg"]
+        let path = candidates.first { FileManager.default.isExecutableFile(atPath: $0) } ?? candidates[0]
+        return URL(fileURLWithPath: path)
+    }
+}
--- a/v1.1/scripts/swift_processors/swift_pose_v1.11.swift
+++ b/v1.1/scripts/swift_processors/swift_pose_v1.11.swift
@@ -0,0 +1,222 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AppKit
+import AVFoundation
+
+/// Swift Pose Processor - replaces YOLOv8 Pose / MediaPipe Pose.
+/// Runs `VNDetectHumanBodyPoseRequest` (Vision framework) on sampled video frames and
+/// writes the detected poses as JSON.
+///
+/// Frames are read from the directory named by the `MOMENTRY_FRAME_DIR` environment
+/// variable when it is set. Otherwise every `sampleInterval`-th frame is extracted with
+/// ffmpeg into a temporary directory that is removed when `run()` exits.
+///
+/// Output format (compatible with PoseResult Rust struct):
+/// {
+///   "frame_count": N, "fps": 30.0,
+///   "frames": [
+///     { "frame": 0, "timestamp": 0.0, "persons": [
+///       { "keypoints": [{"name":"nose","x":100,"y":200,"confidence":0.95}],
+///         "bbox": {"x":0,"y":0,"width":100,"height":200}
+///       }
+///     ]}
+///   ]
+/// }
+///
+/// `frame_count` is the number of entries in `frames` (frames with at least one person).
+/// Keypoint `x`/`y` are floating-point pixel coordinates with Vision's Y axis flipped;
+/// `bbox` values are integers covering the keypoints whose confidence exceeds 0.1.
+@main
+struct SwiftPose: ParsableCommand {
+    @Argument(help: "Video file path")
+    var inputPath: String
+
+    @Argument(help: "Output JSON path")
+    var outputPath: String
+
+    @Option(name: .long, help: "Sample interval (frames, default=30)")
+    var sampleInterval: Int = 30
+
+    @Option(name: .long, help: "UUID for logging")
+    var uuid: String = ""
+
+    /// Joints queried for every pose, in output order, paired with the name written to the
+    /// JSON. Keying on `JointName` avoids parsing `String(describing:)` output, whose format
+    /// is an implementation detail.
+    private static let jointTable: [(joint: VNHumanBodyPoseObservation.JointName, name: String)] = [
+        (.nose, "nose"),
+        (.leftEye, "left_eye"),
+        (.rightEye, "right_eye"),
+        (.leftEar, "left_ear"),
+        (.rightEar, "right_ear"),
+        (.neck, "neck"),
+        (.root, "root"),
+        (.leftShoulder, "left_shoulder"),
+        (.rightShoulder, "right_shoulder"),
+        (.leftElbow, "left_elbow"),
+        (.rightElbow, "right_elbow"),
+        (.leftWrist, "left_wrist"),
+        (.rightWrist, "right_wrist"),
+        (.leftHip, "left_hip"),
+        (.rightHip, "right_hip"),
+        (.leftKnee, "left_knee"),
+        (.rightKnee, "right_knee"),
+        (.leftAnkle, "left_ankle"),
+        (.rightAnkle, "right_ankle"),
+    ]
+
+    /// Locations probed for the ffmpeg binary; the first entry is the historical default
+    /// and is used as-is when none of them is executable.
+    private static let ffmpegCandidates = [
+        "/opt/homebrew/bin/ffmpeg",
+        "/usr/local/bin/ffmpeg",
+        "/usr/bin/ffmpeg",
+    ]
+
+    /// Extracts (or reuses) sampled frames, detects body poses in each, and writes the JSON
+    /// result to `outputPath`.
+    ///
+    /// - Throws: Errors from creating the temporary directory, launching ffmpeg, serializing
+    ///   the result, or writing the output file. A video without a video track is reported
+    ///   on stdout and the method returns without writing output.
+    mutating func run() throws {
+        let startTime = Date()
+        print("[SwiftPose] Starting: \(inputPath)")
+
+        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
+        guard let track = asset.tracks(withMediaType: .video).first else {
+            print("[SwiftPose] Error: No video track"); return
+        }
+        let duration = asset.duration.seconds
+        let fps = Double(track.nominalFrameRate)
+        print("[SwiftPose] Duration: \(String(format: "%.1f", duration))s, FPS: \(String(format: "%.1f", fps))")
+
+        // A non-positive interval would produce an invalid ffmpeg `mod` filter and
+        // meaningless frame numbers, so fall back to sampling every frame.
+        let step = max(sampleInterval, 1)
+        if step != sampleInterval {
+            print("[SwiftPose] Warning: sample interval \(sampleInterval) is not positive, using \(step)")
+        }
+
+        // Set only when this run created its own temp dir; the shared cache is never deleted.
+        // `defer` guarantees cleanup on every exit path, including thrown errors.
+        var ownedTempDir: URL?
+        defer {
+            if let dir = ownedTempDir {
+                try? FileManager.default.removeItem(at: dir)
+            }
+        }
+
+        // Extract frames (use shared cache or ffmpeg)
+        let framesDir: URL
+        if let cacheDir = ProcessInfo.processInfo.environment["MOMENTRY_FRAME_DIR"] {
+            framesDir = URL(fileURLWithPath: cacheDir)
+            print("[SwiftPose] Using shared frame cache: \(cacheDir)")
+        } else {
+            let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("swift_pose_\(UUID().uuidString)")
+            ownedTempDir = tempDir
+            framesDir = tempDir.appendingPathComponent("frames")
+            try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)
+            try Self.extractFrames(from: inputPath, into: framesDir, every: step)
+        }
+        let extractedHere = ownedTempDir != nil
+
+        let files = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
+        let frameFiles = files.filter { $0.hasSuffix(".jpg") }.sorted()
+        print("[SwiftPose] Extracted \(frameFiles.count) frames")
+
+        var poseFrames: [[String: Any]] = []
+        var lastProgress = 0
+
+        for (i, fname) in frameFiles.enumerated() {
+            let persons = Self.detectPersons(inImageAt: framesDir.appendingPathComponent(fname))
+
+            if !persons.isEmpty {
+                let frameNum = Self.sourceFrameNumber(fileName: fname,
+                                                      index: i,
+                                                      sampleInterval: step,
+                                                      sequentialNaming: extractedHere)
+                // JSONSerialization rejects NaN/infinity, so never divide by a zero frame rate.
+                let timestamp = fps > 0 ? Double(frameNum) / fps : 0.0
+                poseFrames.append([
+                    "frame": frameNum,
+                    "timestamp": timestamp,
+                    "persons": persons,
+                ])
+            }
+
+            let pct = Int(Float(i) / Float(frameFiles.count) * 100)
+            if pct >= lastProgress + 10 {
+                print("[SwiftPose] Progress: \(pct)% (\(i)/\(frameFiles.count), \(persons.count) poses)")
+                lastProgress = pct
+            }
+        }
+
+        // Write output. Serialization errors are propagated instead of being swallowed, so a
+        // missing output file is never reported as success.
+        let outputDict: [String: Any] = [
+            "frame_count": poseFrames.count,
+            "fps": fps,
+            "frames": poseFrames,
+        ]
+        let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
+        try jsonData.write(to: URL(fileURLWithPath: outputPath))
+
+        let elapsed = Date().timeIntervalSince(startTime)
+        print("[SwiftPose] Complete: \(poseFrames.count) frames, \(String(format: "%.1f", elapsed))s")
+        if duration > 0 {
+            print("[SwiftPose] RTF: \(String(format: "%.3f", elapsed / duration))x")
+        }
+    }
+
+    /// Maps a frame file name to the index of the frame in the source video.
+    ///
+    /// - Parameters:
+    ///   - fileName: File name of the form `frame_<number>.jpg`.
+    ///   - index: Position of the file in the sorted listing; used when the number cannot be parsed.
+    ///   - sampleInterval: Distance in source frames between two extracted frames.
+    ///   - sequentialNaming: `true` when the files were written by this tool's own ffmpeg call,
+    ///     whose `frame_%05d.jpg` pattern numbers outputs 1, 2, 3, ... regardless of which
+    ///     source frame was selected.
+    /// - Returns: The source frame index.
+    static func sourceFrameNumber(fileName: String, index: Int, sampleInterval: Int, sequentialNaming: Bool) -> Int {
+        let digits = fileName
+            .replacingOccurrences(of: "frame_", with: "")
+            .replacingOccurrences(of: ".jpg", with: "")
+        guard let number = Int(digits) else { return index * sampleInterval }
+        if sequentialNaming {
+            // Output k (1-based) holds source frame (k - 1) * interval.
+            return max(number - 1, 0) * sampleInterval
+        }
+        // NOTE(review): the naming scheme of the shared frame cache is not visible here; the
+        // number is used verbatim, as before. Confirm against the cache producer.
+        return number
+    }
+
+    /// Returns the first executable ffmpeg candidate, or the historical default path.
+    private static func ffmpegPath() -> String {
+        ffmpegCandidates.first { FileManager.default.isExecutableFile(atPath: $0) } ?? ffmpegCandidates[0]
+    }
+
+    /// Extracts every `interval`-th frame of the video into `framesDir` as `frame_%05d.jpg`.
+    ///
+    /// A non-zero ffmpeg exit status is logged but not treated as fatal; the caller then
+    /// simply finds fewer (or no) frames.
+    private static func extractFrames(from videoPath: String, into framesDir: URL, every interval: Int) throws {
+        let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
+        print("[SwiftPose] Extracting frames...")
+        let extract = Process()
+        extract.executableURL = URL(fileURLWithPath: ffmpegPath())
+        extract.arguments = ["-y", "-v", "quiet", "-i", videoPath,
+                             "-vf", "select=not(mod(n\\,\(interval)))",
+                             "-vsync", "vfr", "-q:v", "15", pattern]
+        try extract.run()
+        extract.waitUntilExit()
+        if extract.terminationStatus != 0 {
+            print("[SwiftPose] Warning: ffmpeg exited with status \(extract.terminationStatus)")
+        }
+    }
+
+    /// Loads one frame image and returns a person dictionary for each detected pose.
+    /// Returns an empty array when the image cannot be decoded or the request fails.
+    private static func detectPersons(inImageAt imageURL: URL) -> [[String: Any]] {
+        guard let imgData = try? Data(contentsOf: imageURL),
+              let img = NSImage(data: imgData),
+              let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { return [] }
+
+        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+        let req = VNDetectHumanBodyPoseRequest()
+        guard (try? handler.perform([req])) != nil, let poses = req.results else { return [] }
+
+        let w = CGFloat(cgImage.width)
+        let h = CGFloat(cgImage.height)
+        return poses.map { Self.person(from: $0, width: w, height: h) }
+    }
+
+    /// Converts one pose observation into the `{"keypoints": [...], "bbox": {...}}` dictionary.
+    private static func person(from pose: VNHumanBodyPoseObservation, width w: CGFloat, height h: CGFloat) -> [String: Any] {
+        var keypoints: [[String: Any]] = []
+        var minX = CGFloat.greatestFiniteMagnitude
+        var minY = CGFloat.greatestFiniteMagnitude
+        var maxX: CGFloat = 0
+        var maxY: CGFloat = 0
+
+        for (joint, name) in jointTable {
+            guard let point = try? pose.recognizedPoint(joint) else { continue }
+            // Scale the normalized location to pixels and flip the Y axis.
+            let px = point.location.x * w
+            let py = h - point.location.y * h
+            keypoints.append([
+                "name": name,
+                "x": px,
+                "y": py,
+                "confidence": point.confidence,
+            ])
+            // Only reasonably confident joints contribute to the bounding box.
+            if point.confidence > 0.1 {
+                minX = min(minX, px)
+                minY = min(minY, py)
+                maxX = max(maxX, px)
+                maxY = max(maxY, py)
+            }
+        }
+
+        var bbox: [String: Any] = [
+            "x": 0, "y": 0, "width": 0, "height": 0
+        ]
+        if maxX > minX {
+            bbox = [
+                "x": Int(minX),
+                "y": Int(minY),
+                "width": Int(maxX - minX),
+                "height": Int(maxY - minY),
+            ]
+        }
+
+        return ["keypoints": keypoints, "bbox": bbox]
+    }
+}
--- a/v1.1/scripts/swift_processors/vision_object_test_v1.11.swift
+++ b/v1.1/scripts/swift_processors/vision_object_test_v1.11.swift
@@ -0,0 +1,102 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AppKit
+
+/// POC: Test Apple Vision Framework for object detection (YOLO replacement)
+///
+/// Loads one image and prints the results of five Vision requests in sequence:
+/// scene classification, human rectangles, body pose, hand pose and objectness saliency.
+@main
+struct VisionObjectTest: ParsableCommand {
+    @Argument(help: "Input image path")
+    var inputPath: String
+
+    /// Runs every probe against the image at `inputPath` and prints the elapsed time.
+    /// Returns early (after printing an error) when the image cannot be loaded.
+    func run() throws {
+        let startedAt = Date()
+        print("=== Apple Vision Framework Object Detection POC ===")
+        #if arch(arm64)
+        print("HW: Apple Silicon ✅ (ANE available)")
+        #endif
+
+        guard let nsImage = NSImage(contentsOfFile: inputPath) else {
+            print("Error: cannot load image")
+            return
+        }
+        guard let cgImage = nsImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
+            print("Error: cannot get CGImage")
+            return
+        }
+        print("Image: \(cgImage.width)x\(cgImage.height)")
+        let requestHandler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+
+        try reportClassifications(using: requestHandler)
+        try reportHumanRectangles(using: requestHandler)
+        try reportBodyPoses(using: requestHandler)
+        try reportHandPoses(using: requestHandler)
+        try reportSalientObjects(using: requestHandler)
+
+        let seconds = Date().timeIntervalSince(startedAt)
+        print("\nTime: \(String(format: "%.2f", seconds))s")
+        print("=== Done ===")
+    }
+
+    /// Formats a single numeric value with a printf-style pattern.
+    private func fmt(_ pattern: String, _ value: CVarArg) -> String {
+        String(format: pattern, value)
+    }
+
+    /// 1. VNClassifyImageRequest (scene classification - replaces scene_classifier)
+    private func reportClassifications(using requestHandler: VNImageRequestHandler) throws {
+        print("\n--- VNClassifyImageRequest ---")
+        let request = VNClassifyImageRequest()
+        try requestHandler.perform([request])
+        guard let labels = request.results else { return }
+        print("Top classifications:")
+        for label in labels.prefix(10) {
+            print("  \(label.identifier): conf=\(fmt("%.3f", label.confidence))")
+        }
+    }
+
+    /// 2. VNDetectHumanRectanglesRequest (person detection - YOLO replacement for 'person')
+    private func reportHumanRectangles(using requestHandler: VNImageRequestHandler) throws {
+        print("\n--- VNDetectHumanRectanglesRequest ---")
+        let request = VNDetectHumanRectanglesRequest()
+        try requestHandler.perform([request])
+        guard let people = request.results else { return }
+        print("Humans: \(people.count)")
+        for (index, person) in people.enumerated() {
+            let box = person.boundingBox
+            let origin = "(\(fmt("%.3f", box.origin.x)),\(fmt("%.3f", box.origin.y)))"
+            let size = "(\(fmt("%.3f", box.size.width)),\(fmt("%.3f", box.size.height)))"
+            print("  [\(index)] bbox=\(origin) size=\(size) conf=\(fmt("%.2f", person.confidence))")
+        }
+    }
+
+    /// 3. VNDetectHumanBodyPoseRequest (pose estimation - MediaPipe replacement)
+    private func reportBodyPoses(using requestHandler: VNImageRequestHandler) throws {
+        print("\n--- VNDetectHumanBodyPoseRequest ---")
+        let request = VNDetectHumanBodyPoseRequest()
+        try requestHandler.perform([request])
+        guard let bodies = request.results else { return }
+        print("Body poses: \(bodies.count)")
+
+        // Key joints shown for every detected body.
+        let keyJoints: [VNHumanBodyPoseObservation.JointName] = [
+            .neck, .leftShoulder, .rightShoulder, .leftWrist, .rightWrist, .root,
+        ]
+        for (index, body) in bodies.enumerated() {
+            let available = body.availableJointNames
+            print("  [\(index)] \(available.count) joints detected")
+            for joint in keyJoints {
+                guard let pt = try? body.recognizedPoint(joint) else { continue }
+                let location = "(\(fmt("%.3f", pt.location.x)), \(fmt("%.3f", pt.location.y)))"
+                print("    \(joint.rawValue): \(location) conf=\(fmt("%.2f", pt.confidence))")
+            }
+        }
+    }
+
+    /// 4. VNDetectHumanHandPoseRequest (hand pose)
+    private func reportHandPoses(using requestHandler: VNImageRequestHandler) throws {
+        print("\n--- VNDetectHumanHandPoseRequest ---")
+        let request = VNDetectHumanHandPoseRequest()
+        try requestHandler.perform([request])
+        guard let hands = request.results else { return }
+        print("Hands: \(hands.count)")
+        for (index, hand) in hands.enumerated() {
+            print("  [\(index)] confidence=\(fmt("%.2f", hand.confidence))")
+        }
+    }
+
+    /// 5. VNGenerateObjectnessBasedSaliencyImageRequest (object detection without labels)
+    private func reportSalientObjects(using requestHandler: VNImageRequestHandler) throws {
+        print("\n--- VNGenerateObjectnessBasedSaliencyImageRequest ---")
+        let request = VNGenerateObjectnessBasedSaliencyImageRequest()
+        try requestHandler.perform([request])
+        guard let salient = request.results?.first?.salientObjects else { return }
+        print("Salient objects: \(salient.count)")
+        for (index, object) in salient.enumerated().prefix(10) {
+            let box = object.boundingBox
+            let coords = [box.origin.x, box.origin.y, box.size.width, box.size.height]
+                .map { fmt("%.3f", $0) }
+                .joined(separator: ",")
+            print("  [\(index)] bbox=(\(coords)) conf=\(fmt("%.2f", object.confidence))")
+        }
+    }
+}
--- a/v1.1/scripts/swift_processors/vision_ocr_test_v1.11.swift
+++ b/v1.1/scripts/swift_processors/vision_ocr_test_v1.11.swift
@@ -0,0 +1,71 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AppKit
+
+/// POC: Test Apple Vision Framework OCR (VNRecognizeTextRequest) vs PaddleOCR
+///
+/// Loads one image, runs text recognition at the requested level, prints up to the first
+/// 20 text blocks, and lists the languages supported by the configured request.
+@main
+struct VisionOCRTest: ParsableCommand {
+    @Argument(help: "Input image path")
+    var inputPath: String
+
+    // The help text previously advertised ".fast"/".accurate" while only the literal
+    // "fast" was recognised; both spellings are now accepted (see `resolvedLevel()`).
+    @Option(name: .long, help: "Recognition level: fast or accurate (default: accurate)")
+    var level: String = "accurate"
+
+    /// Runs the OCR probe and prints its findings.
+    ///
+    /// - Throws: Any error raised by `VNImageRequestHandler.perform`.
+    mutating func run() throws {
+        let startTime = Date()
+        print("=== Apple Vision Framework OCR POC ===")
+        #if arch(arm64)
+        print("HW: Apple Silicon ✅")
+        #endif
+
+        guard let image = NSImage(contentsOfFile: inputPath) else {
+            print("Error: cannot load image"); return
+        }
+        guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
+            print("Error: cannot get CGImage"); return
+        }
+        print("Image: \(cgImage.width)x\(cgImage.height)")
+        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
+
+        // VNRecognizeTextRequest
+        print("\n--- VNRecognizeTextRequest ---")
+        let req = VNRecognizeTextRequest()
+        req.recognitionLevel = resolvedLevel()
+        req.usesLanguageCorrection = true
+        req.preferBackgroundProcessing = true
+
+        try handler.perform([req])
+
+        guard let results = req.results else {
+            print("No OCR results"); return
+        }
+
+        print("Text blocks: \(results.count)")
+        var totalChars = 0
+        for (i, obs) in results.enumerated() {
+            guard let candidate = obs.topCandidates(1).first else { continue }
+            let text = candidate.string
+            let conf = candidate.confidence
+            let bb = obs.boundingBox
+            totalChars += text.count
+            // Only the first 20 blocks are echoed; the rest still count towards the totals.
+            if i < 20 {
+                print("  [\(i)] conf=\(String(format: "%.3f", conf)) bbox=(\(String(format: "%.3f", bb.origin.x)),\(String(format: "%.3f", bb.origin.y)),\(String(format: "%.3f", bb.size.width)),\(String(format: "%.3f", bb.size.height))) \"\(text.prefix(80))\"")
+            }
+        }
+        print("  ... \(results.count) total, \(totalChars) chars")
+
+        // Check language support for the request as configured, so `--level fast` reports
+        // the fast-path languages instead of always listing the accurate ones.
+        print("\n--- Language Support ---")
+        let supported = (try? req.supportedRecognitionLanguages()) ?? []
+        let shown = supported.prefix(10).joined(separator: ", ")
+        // The ellipsis is printed only when the list was actually truncated.
+        let suffix = supported.count > 10 ? "..." : ""
+        print("Supported languages (\(supported.count)): \(shown)\(suffix)")
+
+        let elapsed = Date().timeIntervalSince(startTime)
+        print("\nTime: \(String(format: "%.2f", elapsed))s")
+        print("=== Done ===")
+    }
+
+    /// Translates the `--level` option into a Vision recognition level.
+    ///
+    /// Accepts `fast`/`accurate` in any letter case, with or without a leading dot. Any other
+    /// value keeps the previous behaviour of falling back to `.accurate`, but now says so.
+    private func resolvedLevel() -> VNRequestTextRecognitionLevel {
+        let normalized = level
+            .trimmingCharacters(in: .whitespaces)
+            .lowercased()
+        let name = normalized.hasPrefix(".") ? String(normalized.dropFirst()) : normalized
+        if name == "fast" {
+            return .fast
+        }
+        if name != "accurate" {
+            print("Warning: unknown level \"\(level)\", using accurate")
+        }
+        return .accurate
+    }
+}