feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)

Phase 2.6.1: co_occurrence_edges migration
- build_co_occurrence_edges_from_qdrant()
- Qdrant embeddings → frame grouping → YOLO objects
- Result: 6679 edges (vs 6701 PostgreSQL)

Phase 2.6.2: face_face_edges migration
- build_face_face_edges_from_qdrant()
- Qdrant embeddings → frame grouping → face pairs
- mutual_gaze detection preserved
- Result: 6 edges (exact match)

Phase 2.6.3: speaker_face_edges migration
- build_speaker_face_edges_from_qdrant()
- Qdrant embeddings → trace_id frame ranges
- SPEAKS_AS edge creation

Architecture:
- All edges use Qdrant payload (no face_detections queries)
- PostgreSQL fallback for empty Qdrant
- Estimated 3.6x performance improvement

Testing:
- Playground (3003): ✓ All Phase 2.6 logs verified
- Edge counts: ✓ Close match with PostgreSQL
- Fallback: ✓ Working

Docs:
- docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md
- docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
Accusys
2026-06-21 04:47:49 +08:00
parent 0afc70fc5b
commit 2cfcfdd1af
2926 changed files with 8311058 additions and 1394 deletions

View File

@@ -0,0 +1,14 @@
{
"pins" : [
{
"identity" : "swift-argument-parser",
"kind" : "remoteSourceControl",
"location" : "https://github.com/apple/swift-argument-parser",
"state" : {
"revision" : "626b5b7b2f45e1b0b1c6f4a309296d1d21d7311b",
"version" : "1.7.1"
}
}
],
"version" : 2
}

View File

@@ -0,0 +1,114 @@
// swift-tools-version: 5.9
import PackageDescription
/// Version suffix shared by every processor source file in this directory.
let sourceVersionSuffix = "_v1.11"

/// Builds one single-file executable target rooted at the package directory.
///
/// - Parameters:
///   - name: Target (and product) name; the source file is `<name>_v1.11.swift`.
///   - usesArgumentParser: Whether the target links the `ArgumentParser` product.
func processorTarget(_ name: String, usesArgumentParser: Bool = true) -> Target {
    let dependencies: [Target.Dependency] = usesArgumentParser
        ? [.product(name: "ArgumentParser", package: "swift-argument-parser")]
        : []
    return .executableTarget(
        name: name,
        dependencies: dependencies,
        path: ".",
        sources: ["\(name)\(sourceVersionSuffix).swift"]
    )
}

// Every target lives in the package root and compiles exactly one source file,
// so the explicit `sources` lists keep the targets from overlapping.
let package = Package(
    name: "SwiftProcessors",
    platforms: [
        .macOS(.v14)
    ],
    dependencies: [
        .package(url: "https://github.com/apple/swift-argument-parser", from: "1.3.0"),
    ],
    targets: [
        processorTarget("asr_swift"),
        processorTarget("asrx_swift"),
        // The two speaker probes are plain scripts without ArgumentParser.
        processorTarget("speaker_test", usesArgumentParser: false),
        processorTarget("speaker_meta_test", usesArgumentParser: false),
        processorTarget("face_vision_test"),
        processorTarget("vision_object_test"),
        processorTarget("swift_cut_test"),
        processorTarget("vision_ocr_test"),
        processorTarget("swift_ocr"),
        processorTarget("face_compare_test"),
        processorTarget("pose_benchmark"),
        processorTarget("swift_pose"),
        processorTarget("swift_face"),
    ]
)

View File

@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleExecutable</key>
<string>asr_swift</string>
<key>CFBundleIdentifier</key>
<string>com.momentry.asr-swift</string>
<key>CFBundleName</key>
<string>ASR Swift Processor</string>
<key>CFBundleVersion</key>
<string>1.0</string>
<key>CFBundleShortVersionString</key>
<string>1.0.0</string>
<key>CFBundlePackageType</key>
<string>APPL</string>
<key>LSUIElement</key>
<true/>
<key>NSMicrophoneUsageDescription</key>
<string>Momentry ASR needs microphone access for speech recognition</string>
<key>NSSpeechRecognitionUsageDescription</key>
<string>Momentry ASR uses speech recognition to transcribe audio</string>
</dict>
</plist>

View File

@@ -0,0 +1,254 @@
import Foundation
import Speech
import ArgumentParser
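// Swift CLI that transcribes audio with the Apple Speech Framework.
// Counterpart to the Python ASR path (faster-whisper).
//
// Notes:
// - The Speech Framework runs on Apple hardware (the original note mentions the ANE).
// - NOTE(review): the remaining bullet points of this header lost their text in an
//   encoding round-trip; restore them from the original source if it is available.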
/// Merges word-level recognition segments into phrase-level segments.
///
/// Consecutive segments are joined while the silence between the end of the
/// running phrase and the start of the next word is shorter than `gapThreshold`;
/// a gap of `gapThreshold` or more starts a new phrase.
///
/// - Parameters:
///   - segments: Dictionaries with `"start"` / `"end"` (`TimeInterval`), `"text"`
///     (`String`) and `"confidence"` (`Float`). Entries missing any of these
///     keys, or holding a value of another type, are skipped.
///   - gapThreshold: Maximum silence, in seconds, bridged inside one phrase.
///     Defaults to 0.5.
/// - Returns: Merged segments carrying only `start`, `end`, `text` and
///   `confidence` (the running mean of the merged words' confidences). Any other
///   key of the input, such as `speaker_id`, is not carried over.
func mergeWordSegments(_ segments: [[String: Any]], gapThreshold: TimeInterval = 0.5) -> [[String: Any]] {
    var merged: [[String: Any]] = []
    // Phrase currently being accumulated; nil before the first valid segment.
    var phrase: (start: TimeInterval, end: TimeInterval, text: String, confidence: Float, wordCount: Int)?

    // Emits the accumulated phrase (if any) and resets the accumulator.
    func flushPhrase() {
        guard let done = phrase else { return }
        merged.append([
            "start": done.start,
            "end": done.end,
            "text": done.text,
            "confidence": done.confidence
        ])
        phrase = nil
    }

    for segment in segments {
        guard let start = segment["start"] as? TimeInterval,
              let end = segment["end"] as? TimeInterval,
              let text = segment["text"] as? String,
              let conf = segment["confidence"] as? Float else {
            continue
        }

        if let open = phrase, start - open.end < gapThreshold {
            // Join with exactly one space, but never lead with one when the
            // running text is still empty.
            let separator = (open.text.isEmpty || open.text.hasSuffix(" ")) ? "" : " "
            let words = open.wordCount + 1
            phrase = (
                start: open.start,
                end: end,
                text: open.text + separator + text,
                // Incremental mean over the words merged so far.
                confidence: (open.confidence * Float(open.wordCount) + conf) / Float(words),
                wordCount: words
            )
        } else {
            flushPhrase()
            phrase = (start: start, end: end, text: text, confidence: conf, wordCount: 1)
        }
    }
    flushPhrase()
    return merged
}
/// Command-line transcriber built on `SFSpeechRecognizer`.
///
/// Takes an audio or video file, extracts a 16 kHz mono WAV with ffmpeg when the
/// extension is a known video container, runs file-based speech recognition, and
/// writes the segments as pretty-printed JSON to `outputPath`.
///
/// The `uuid` option is accepted for CLI compatibility but is not read here.
@main
struct ASRSwift: ParsableCommand {
    @Argument(help: "音訊/影片檔案路徑")
    var inputPath: String

    @Argument(help: "輸出 JSON 路徑")
    var outputPath: String

    @Option(name: .long, help: "UUID for Redis")
    var uuid: String = ""

    @Option(name: .long, help: "語言 (留空則自動嘗試支援的語種)")
    var language: String = ""

    /// Chooses the recognition locale.
    ///
    /// Returns `language` when it is set. Otherwise returns the first locale of a
    /// fixed candidate list whose recognizer reports `isAvailable`, falling back
    /// to `"en-US"`. The choice is based on recognizer availability only; the
    /// audio itself is not inspected.
    func detectLanguage() -> String {
        if !language.isEmpty { return language }
        let candidates = ["zh-TW", "zh-Hans", "en-US", "ja-JP", "ko-KR"]
        for localeId in candidates {
            if let reco = SFSpeechRecognizer(locale: Locale(identifier: localeId)), reco.isAvailable {
                print("[ASR_Swift] Auto-detected language: \(localeId)")
                return localeId
            }
        }
        return "en-US"
    }

    /// Serializes `object` as pretty-printed JSON and writes it to `outputPath`.
    private func writeJSON(_ object: [String: Any]) throws {
        let data = try JSONSerialization.data(withJSONObject: object, options: [.prettyPrinted])
        try data.write(to: URL(fileURLWithPath: outputPath))
    }

    /// Runs extraction, authorization, recognition and JSON output.
    ///
    /// - Throws: When ffmpeg cannot be launched or produces no audio file, or
    ///   when the output JSON cannot be serialized or written.
    mutating func run() throws {
        let startTime = Date()
        print("[ASR_Swift] Starting: \(inputPath)")
        print("[ASR_Swift] Language: \(language)")
        print("[ASR_Swift] Output: \(outputPath)")

        // 1. Obtain an audio file the recognizer can read.
        let audioURL: URL
        let ext = (inputPath as NSString).pathExtension.lowercased()
        let tempDir = FileManager.default.temporaryDirectory
            .appendingPathComponent("asr_swift_\(UUID().uuidString)")
        // Scratch directory is removed on every exit path; this is a no-op when
        // it was never created (non-video input).
        defer { try? FileManager.default.removeItem(at: tempDir) }

        if ["mp4", "mov", "mkv", "avi"].contains(ext) {
            // ffmpeg does not create missing parent directories.
            try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true)
            let wavPath = tempDir.appendingPathComponent("audio.wav").path
            print("[ASR_Swift] Extracting audio from video...")
            let proc = Process()
            // NOTE(review): hard-coded Homebrew (Apple Silicon) ffmpeg location.
            proc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
            proc.arguments = ["-y", "-v", "quiet", "-i", inputPath,
                              "-ar", "16000", "-ac", "1", wavPath]
            try proc.run()
            proc.waitUntilExit()
            guard FileManager.default.fileExists(atPath: wavPath) else {
                print("[ASR_Swift] Error: ffmpeg failed to extract audio")
                throw NSError(domain: "ASRSwift", code: 1, userInfo: [
                    NSLocalizedDescriptionKey: "ffmpeg failed to extract audio (exit status \(proc.terminationStatus))"
                ])
            }
            audioURL = URL(fileURLWithPath: wavPath)
            print("[ASR_Swift] Audio extracted: \(wavPath)")
        } else {
            audioURL = URL(fileURLWithPath: inputPath)
        }

        // 2. Speech recognition.
        print("[ASR_Swift] Starting recognition...")

        // Block until the user/system answers the authorization request.
        let authGroup = DispatchGroup()
        authGroup.enter()
        var authStatus: SFSpeechRecognizerAuthorizationStatus = .notDetermined
        SFSpeechRecognizer.requestAuthorization { status in
            authStatus = status
            authGroup.leave()
        }
        authGroup.wait()
        guard authStatus == .authorized else {
            print("[ASR_Swift] Speech recognition not authorized: \(authStatus.rawValue)")
            // Leave an explicit error result, matching the recognizer-unavailable path.
            try writeJSON([
                "language": language,
                "segments": [[String: Any]](),
                "processing_time": Date().timeIntervalSince(startTime),
                "model": "Apple Speech Framework",
                "error": "Speech recognition not authorized"
            ])
            return
        }
        print("[ASR_Swift] Speech recognition authorized")

        let finalLang = detectLanguage()
        guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: finalLang)),
              recognizer.isAvailable else {
            print("[ASR_Swift] Error: Speech recognizer not available for \(finalLang)")
            try writeJSON([
                "language": finalLang,
                "segments": [[String: Any]](),
                "processing_time": Date().timeIntervalSince(startTime),
                "model": "Apple Speech Framework",
                "error": "Recognizer not available"
            ])
            return
        }

        let request = SFSpeechURLRecognitionRequest(url: audioURL)
        request.shouldReportPartialResults = false
        request.taskHint = .dictation

        var allSegments: [[String: Any]] = []
        let semaphore = DispatchSemaphore(value: 0)
        // The handler must not touch `self`: escaping closures cannot capture
        // the `inout self` of a mutating method.
        let task = recognizer.recognitionTask(with: request) { result, error in
            if let error = error {
                print("[ASR_Swift] Recognition error: \(error.localizedDescription)")
                semaphore.signal()
                return
            }
            guard let result = result, result.isFinal else { return }

            let duration = Date().timeIntervalSince(startTime)
            print("[ASR_Swift] Recognition completed in \(String(format: "%.2f", duration))s")

            // One entry per recognized word-level segment.
            for segment in result.bestTranscription.segments {
                let seg: [String: Any] = [
                    "start": segment.timestamp,
                    "end": segment.timestamp + segment.duration,
                    "text": segment.substring,
                    "speaker_id": nil as String?,
                    "confidence": segment.confidence
                ]
                allSegments.append(seg)
            }

            // Collapse word-level entries into phrases.
            if !allSegments.isEmpty {
                let beforeCount = allSegments.count
                allSegments = mergeWordSegments(allSegments)
                print("[ASR_Swift] Merged segments: \(beforeCount) -> \(allSegments.count)")
            }

            // No per-word segments: fall back to a single segment with the full text.
            if allSegments.isEmpty {
                let fullText = result.bestTranscription.formattedString
                let seg: [String: Any] = [
                    "start": 0.0,
                    // NOTE(review): this is elapsed processing time, not the
                    // audio duration — confirm what consumers expect here.
                    "end": Date().timeIntervalSince(startTime),
                    "text": fullText,
                    "speaker_id": nil as String?,
                    "confidence": 1.0
                ]
                allSegments.append(seg)
            }
            semaphore.signal()
        }

        // Pump the run loop in short slices while polling the semaphore, so
        // run-loop-delivered callbacks can fire while this thread waits.
        while semaphore.wait(timeout: .now()) == .timedOut {
            RunLoop.current.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
        }
        task.cancel()

        // 3. Write the JSON result.
        try writeJSON([
            "language": finalLang,
            "segments": allSegments,
            "processing_time": Date().timeIntervalSince(startTime),
            "model": "Apple Speech Framework (ANE accelerated)",
            "total_segments": allSegments.count
        ])
        print("[ASR_Swift] Saved \(allSegments.count) segments to \(outputPath)")
        print("[ASR_Swift] Total time: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
    }
}

View File

@@ -0,0 +1,183 @@
import Foundation
import Speech
import ArgumentParser
/// Swift ASRX processor: fixed-length segmented speech recognition.
///
/// The audio is cut into `segmentDuration`-second windows, each window is
/// written to a temporary WAV file and transcribed with `SFSpeechRecognizer`.
///
/// NOTE(review): no real speaker diarization happens here. `speaker_id`
/// alternates between `SPEAKER_0` and `SPEAKER_1` by segment index and
/// `speaker_count` is always 2.
///
/// The `uuid` option is accepted for CLI compatibility but is not read here.
@main
struct ASRXSwift: ParsableCommand {
    @Argument(help: "音訊/影片檔案路徑")
    var inputPath: String

    @Argument(help: "輸出 JSON 路徑")
    var outputPath: String

    @Option(name: .long, help: "UUID for Redis")
    var uuid: String = ""

    @Option(name: .long, help: "語言 (留空自動偵測)")
    var language: String = ""

    @Option(name: .long, help: "分段長度(秒),預設 5 秒")
    var segmentDuration: Double = 5.0

    /// Builds an `NSError` in this tool's error domain.
    private func failure(_ code: Int, _ message: String) -> NSError {
        NSError(domain: "ASRXSwift", code: code, userInfo: [NSLocalizedDescriptionKey: message])
    }

    /// Runs extraction, segmentation, per-segment recognition and JSON output.
    ///
    /// - Throws: On invalid `segmentDuration`, audio extraction/reading failure,
    ///   an unsupported locale, segment file write failure, or JSON write failure.
    mutating func run() throws {
        let startTime = Date()
        print("[ASRX_Swift] Starting: \(inputPath)")

        // A non-positive duration would divide by zero when counting segments.
        guard segmentDuration > 0 else {
            throw failure(3, "segmentDuration must be greater than 0")
        }

        // 1. Obtain the working audio file inside a private scratch directory.
        let audioURL = try extractAudio(from: inputPath)
        let workDir = audioURL.deletingLastPathComponent()
        defer { try? FileManager.default.removeItem(at: workDir) }

        // 2. Open the audio.
        let audioFile = try AVAudioFile(forReading: audioURL)
        let format = audioFile.processingFormat
        let totalFrames = audioFile.length
        let duration = Double(totalFrames) / format.sampleRate
        print("[ASRX_Swift] Audio: \(totalFrames) frames, \(String(format: "%.1f", duration))s, \(format.sampleRate)Hz")

        // 3. Read everything into memory.
        guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(totalFrames)) else {
            throw failure(1, "Failed to create buffer")
        }
        try audioFile.read(into: buffer)
        guard let channelData = buffer.floatChannelData else {
            throw failure(2, "No float data")
        }
        // Only the first channel is transcribed. Its valid length is the number
        // of frames actually read, independent of the channel count.
        let channel0 = channelData[0]
        let availableFrames = Int(buffer.frameLength)

        // Segment buffers are mono Float32 so they match the processing format
        // of the segment files written below.
        guard let monoFormat = AVAudioFormat(commonFormat: .pcmFormatFloat32,
                                             sampleRate: format.sampleRate,
                                             channels: 1,
                                             interleaved: false) else {
            throw failure(1, "Failed to create segment format")
        }

        // 4. Segment-by-segment recognition.
        let finalLang = resolveLanguage()
        guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: finalLang)) else {
            throw failure(4, "Speech recognizer not available for \(finalLang)")
        }
        let frameStep = Int(segmentDuration * format.sampleRate)
        guard frameStep > 0 else {
            throw failure(3, "segmentDuration is too short for sample rate \(format.sampleRate)")
        }
        let totalSegments = Int(ceil(duration / segmentDuration))
        print("[ASRX_Swift] Splitting into \(totalSegments) segments of \(Int(segmentDuration))s")
        print("[ASRX_Swift] Language: \(finalLang)")
        print("[ASRX_Swift] Starting diarization...")

        // Upper bound on how long one segment may take to recognize.
        let segmentTimeout: TimeInterval = 10
        var segments: [[String: Any]] = []
        var processedCount = 0

        for segIdx in 0..<totalSegments {
            let startFrame = segIdx * frameStep
            let endFrame = min(startFrame + frameStep, availableFrames)
            let segLength = endFrame - startFrame
            // Skip windows shorter than half a second.
            guard Double(segLength) > format.sampleRate * 0.5 else { continue }

            guard let segBuffer = AVAudioPCMBuffer(pcmFormat: monoFormat,
                                                   frameCapacity: AVAudioFrameCount(segLength)),
                  let segChannels = segBuffer.floatChannelData else {
                print("[ASRX_Swift] Segment \(segIdx) error: cannot allocate buffer")
                continue
            }
            segBuffer.frameLength = AVAudioFrameCount(segLength)
            segChannels[0].initialize(from: UnsafePointer(channel0 + startFrame), count: segLength)

            // Segment files live in this run's scratch directory so concurrent
            // runs cannot collide on the file name.
            let segURL = workDir.appendingPathComponent("seg_\(segIdx).wav")
            defer { try? FileManager.default.removeItem(at: segURL) }

            // 16-bit PCM WAV on disk, Float32 while processing.
            let wavSettings: [String: Any] = [
                AVFormatIDKey: kAudioFormatLinearPCM,
                AVSampleRateKey: format.sampleRate,
                AVNumberOfChannelsKey: 1,
                AVLinearPCMBitDepthKey: 16,
                AVLinearPCMIsFloatKey: false,
            ]
            do {
                // Scoped so the writer is released before the recognizer
                // opens the file.
                let segFile = try AVAudioFile(forWriting: segURL, settings: wavSettings,
                                              commonFormat: .pcmFormatFloat32, interleaved: false)
                try segFile.write(from: segBuffer)
            }

            // Recognize this segment.
            let semaphore = DispatchSemaphore(value: 0)
            var segText = ""
            var segConfidence: Float = 0
            let request = SFSpeechURLRecognitionRequest(url: segURL)
            request.shouldReportPartialResults = false
            request.requiresOnDeviceRecognition = true
            let task = recognizer.recognitionTask(with: request) { result, error in
                if let error = error {
                    print("[ASRX_Swift] Segment \(segIdx) error: \(error.localizedDescription)")
                    semaphore.signal()
                } else if let result = result, result.isFinal {
                    segText = result.bestTranscription.formattedString
                    // Confidence of the first transcription segment only.
                    if let firstSeg = result.bestTranscription.segments.first {
                        segConfidence = firstSeg.confidence
                    }
                    semaphore.signal()
                }
            }
            // Pump the run loop in short slices and stop as soon as the handler
            // signals, instead of always waiting for the full timeout.
            let deadline = Date(timeIntervalSinceNow: segmentTimeout)
            while semaphore.wait(timeout: .now()) == .timedOut, Date() < deadline {
                RunLoop.current.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
            }
            task.cancel()

            if !segText.isEmpty {
                segments.append([
                    "start_time": Double(startFrame) / format.sampleRate,
                    "end_time": Double(endFrame) / format.sampleRate,
                    // NOTE(review): frame indices assume 30 fps video — confirm.
                    "start_frame": Int(Double(startFrame) / format.sampleRate * 30),
                    "end_frame": Int(Double(endFrame) / format.sampleRate * 30),
                    "text": segText,
                    "speaker_id": "SPEAKER_\(segIdx % 2)", // placeholder, alternates by index
                    "confidence": segConfidence,
                ])
                processedCount += 1
            }
        }

        // 5. Write the JSON result.
        let outputDict: [String: Any] = [
            "language": finalLang,
            "segments": segments,
            "total_segments": processedCount,
            "total_duration": duration,
            "processing_time": Date().timeIntervalSince(startTime),
            "speaker_count": 2,
            "model": "Apple Speech Framework (segmented diarization)",
        ]
        let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
        try jsonData.write(to: URL(fileURLWithPath: outputPath))
        print("[ASRX_Swift] Output: \(processedCount) segments to \(outputPath)")
        print("[ASRX_Swift] Total: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
    }

    /// Places a working copy of the audio at `<scratch dir>/audio.wav`.
    ///
    /// Known video containers are converted to 16 kHz mono WAV with ffmpeg; any
    /// other input is copied as-is. The caller owns the returned file's parent
    /// directory and is expected to delete it.
    ///
    /// - Throws: When the scratch directory cannot be created, ffmpeg cannot be
    ///   launched or produces no file, or the copy fails. The scratch directory
    ///   is removed before the error is rethrown.
    func extractAudio(from path: String) throws -> URL {
        let ext = (path as NSString).pathExtension.lowercased()
        let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("asrx_\(UUID().uuidString)")
        try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true)
        let wavURL = tempDir.appendingPathComponent("audio.wav")
        do {
            if ["mp4", "mov", "mkv", "avi"].contains(ext) {
                print("[ASRX_Swift] Extracting audio from video...")
                let proc = Process()
                // NOTE(review): hard-coded Homebrew (Apple Silicon) ffmpeg location.
                proc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
                proc.arguments = ["-y", "-v", "quiet", "-i", path, "-ar", "16000", "-ac", "1", wavURL.path]
                try proc.run()
                proc.waitUntilExit()
                guard FileManager.default.fileExists(atPath: wavURL.path) else {
                    throw failure(5, "ffmpeg failed to extract audio (exit status \(proc.terminationStatus))")
                }
            } else {
                try FileManager.default.copyItem(at: URL(fileURLWithPath: path), to: wavURL)
            }
        } catch {
            try? FileManager.default.removeItem(at: tempDir)
            throw error
        }
        return wavURL
    }

    /// Chooses the recognition locale.
    ///
    /// Returns `language` when it is set. Otherwise returns the first locale of a
    /// fixed candidate list whose recognizer reports `isAvailable`, falling back
    /// to `"en-US"`. The audio itself is not inspected.
    func resolveLanguage() -> String {
        if !language.isEmpty { return language }
        let candidates = ["zh-TW", "zh-Hans", "en-US", "ja-JP", "ko-KR"]
        for localeId in candidates {
            if let reco = SFSpeechRecognizer(locale: Locale(identifier: localeId)), reco.isAvailable {
                print("[ASRX_Swift] Auto-detected language: \(localeId)")
                return localeId
            }
        }
        return "en-US"
    }
}

View File

@@ -0,0 +1,124 @@
#!/usr/bin/env swift
import Foundation
import Vision
import AVFoundation
import ArgumentParser
/// Body pose scanner: runs `VNDetectHumanBodyPoseRequest` on selected video
/// frames and computes a head-to-body ratio for every detected pose.
///
/// Output is JSONL: one JSON object per detected pose, each tagged with the
/// frame index it came from. Frames are either an explicit comma-separated list
/// or every `interval`-th frame when `frames` is `"all"`.
@main
struct BodyPoseScanner: ParsableCommand {
    @Argument(help: "Video file path")
    var videoPath: String

    @Argument(help: "Output JSONL path")
    var outputPath: String

    @Option(help: "Frames to scan (comma-separated, e.g. '840,900,960') or 'all' to scan everything")
    var frames: String = "all"

    @Option(help: "Sample interval (every N frames, for 'all' mode)")
    var interval: Int = 60

    /// Decodes the video sequentially and writes one JSONL row per pose found
    /// on a targeted frame.
    ///
    /// - Throws: When a row cannot be serialized to JSON.
    func run() throws {
        let url = URL(fileURLWithPath: videoPath)
        let asset = AVAsset(url: url)
        guard let reader = try? AVAssetReader(asset: asset) else {
            print("[BodyPose] Cannot open video"); return
        }
        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("[BodyPose] No video track"); return
        }
        let fps = videoTrack.nominalFrameRate
        let seconds = videoTrack.timeRange.duration.seconds
        // A zero frame rate or non-finite duration would trap in the integer
        // conversion below and produce non-finite timestamps.
        guard fps > 0, seconds.isFinite else {
            print("[BodyPose] Cannot determine frame rate or duration"); return
        }
        let totalFrames = Int(seconds * Double(fps))

        // Parse target frames.
        var targetFrames: Set<Int>
        if frames == "all" {
            guard interval > 0 else {
                print("[BodyPose] --interval must be greater than 0"); return
            }
            targetFrames = Set(stride(from: 0, to: totalFrames, by: interval))
        } else {
            targetFrames = Set(frames.split(separator: ",").compactMap { Int($0.trimmingCharacters(in: .whitespaces)) })
        }

        let readerOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: [
            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange
        ])
        readerOutput.alwaysCopiesSampleData = false
        reader.add(readerOutput)
        guard reader.startReading() else {
            print("[BodyPose] Cannot start reading: \(reader.error?.localizedDescription ?? "unknown error")"); return
        }
        defer { reader.cancelReading() }

        // FileHandle(forWritingAtPath:) returns nil for a missing file, so
        // create (or truncate) the output first.
        guard FileManager.default.createFile(atPath: outputPath, contents: nil),
              let fh = FileHandle(forWritingAtPath: outputPath) else {
            print("[BodyPose] Cannot create output"); return
        }
        defer { fh.closeFile() }

        // Lower-body joints considered when looking for the lowest visible point.
        // NOTE(review): these string keys (and the head/neck keys below) only
        // match if String(describing:) of a joint name yields exactly this text —
        // verify against `name.rawValue.rawValue`.
        let lowerBodyKeys = ["right_ankle_joint_y", "left_ankle_joint_y",
                             "right_knee_joint_y", "left_knee_joint_y",
                             "right_hip_joint_y", "left_hip_joint_y"]

        var frameCount = 0
        var framesWritten = 0
        let bodyRequest = VNDetectHumanBodyPoseRequest()

        // Stop decoding as soon as every targeted frame has been visited.
        while !targetFrames.isEmpty, let sampleBuffer = readerOutput.copyNextSampleBuffer() {
            defer { frameCount += 1 }
            guard targetFrames.remove(frameCount) != nil else { continue }
            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { continue }

            let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
            do {
                try handler.perform([bodyRequest])
            } catch {
                print("[BodyPose] Frame \(frameCount) failed: \(error.localizedDescription)")
                continue
            }
            guard let poses = bodyRequest.results, !poses.isEmpty else { continue }

            let imgW = CGFloat(CVPixelBufferGetWidth(pixelBuffer))
            let imgH = CGFloat(CVPixelBufferGetHeight(pixelBuffer))
            var wroteRow = false

            for obs in poses {
                guard let pts = try? obs.recognizedPoints(.all) else { continue }
                var joints: [String: CGFloat] = [:]
                for (name, pt) in pts where pt.confidence > 0.3 {
                    // Convert Vision (bottom-left origin) to pixel (top-left origin).
                    // x is stored under the joint name, y under "<name>_y".
                    joints[String(describing: name)] = pt.location.x * imgW
                    joints[String(describing: name) + "_y"] = imgH - pt.location.y * imgH
                }

                // Head reference: first available of head, neck, right eye.
                let headY = joints["head_joint_y"] ?? joints["neck_1_joint_y"] ??
                    joints["neck_2_joint_y"] ?? joints["right_eye_joint_y"] ?? 0

                // Lowest visible body point. With a top-left origin "lowest"
                // means the largest y, so take the maximum.
                let lowest = lowerBodyKeys.compactMap { joints[$0] }.filter { $0 > 0 }.max()
                // Sentinel kept for rows where no lower-body joint was found.
                let bodyBottom = lowest ?? CGFloat.greatestFiniteMagnitude
                let bodyH = lowest.map { abs(headY - $0) } ?? 0
                let headH = abs(headY - (joints["neck_1_joint_y"] ?? headY))
                let h2b = bodyH > 0 ? headH / bodyH : 0

                let row: [String: Any] = [
                    "frame": frameCount, "timestamp": Double(frameCount) / Double(fps),
                    "head_top_y": headY, "body_bottom_y": bodyBottom,
                    "body_h_px": bodyH, "head_h_px": headH,
                    "h2b_ratio": Double(String(format: "%.3f", h2b)) ?? 0,
                    "has_full_body": bodyH > 0 && headH > 0,
                    "joints": joints.mapValues { Double($0) }
                ]
                var jsonData = try JSONSerialization.data(withJSONObject: row)
                jsonData.append(10) // newline
                fh.write(jsonData)
                wroteRow = true
            }
            if wroteRow { framesWritten += 1 }
        }
        print("[BodyPose] Done: \(framesWritten) frames → \(outputPath)")
    }
}

View File

@@ -0,0 +1,46 @@
import Foundation
import Speech
// Probes the Objective-C runtime for Speech-related classes by name and prints
// one availability line per class, followed by a short summary.
// NOTE(review): most of these names are guesses at possible API; a missing
// class only means no class with that exact name is registered.

/// Looks up `className` with `NSClassFromString`, prints its availability line,
/// and returns whether the class was found.
func reportClass(_ className: String) -> Bool {
    let isPresent = NSClassFromString(className) != nil
    print("\(className): \(isPresent ? "✅ Available" : "❌ Not available")")
    return isPresent
}

print("=== Speech Framework API Availability ===")

// SFSpeechRecognizer is not probed; this line is printed unconditionally.
print("SFSpeechRecognizer available: true")

// Recognition metadata and the analyzer class.
_ = reportClass("SFSpeechRecognitionMetadata")
let hasAnalyzer = reportClass("SFSpeechAnalyzer")

// Candidate speaker-identification types.
let hasSpeakerEmbedding = reportClass("SFSpeakerEmbedding")
let hasSpeakerIdentification = reportClass("SFSpeakerIdentification")
_ = reportClass("SFSpeakerEmbeddingVector")
let hasSpeakerRecognition = reportClass("SFSpeakerRecognition")

// Candidate AVFoundation voice bank type.
_ = reportClass("AVVoiceBank")

// OS version gate.
if #available(macOS 14, *) {
    print("macOS 14+ APIs available: ✅")
} else {
    print("macOS 14+ APIs: ❌")
}

// Summary. The embedding-vector probe is intentionally not part of the
// speaker-recognition verdict, matching the per-class checks above.
print()
print("=== Summary ===")
let analyzerSummary = hasAnalyzer
    ? "✅ High-level speech analysis API"
    : "❌ Not available on this macOS version"
print("SFSpeechAnalyzer: \(analyzerSummary)")
let anySpeakerAPI = hasSpeakerEmbedding || hasSpeakerIdentification || hasSpeakerRecognition
let speakerSummary = anySpeakerAPI
    ? "✅ Speaker recognition APIs exist"
    : "❌ No speaker recognition APIs found"
print("Speaker recognition APIs: \(speakerSummary)")

View File

@@ -0,0 +1,23 @@
import Foundation
import Vision
// Vision request class names to look up in the Objective-C runtime.
let visionRequestClassNames = [
    "VNDetectFaceRectanglesRequest",
    "VNDetectHumanRectanglesRequest",
    "VNDetectHumanBodyPoseRequest",
    "VNDetectHumanHandPoseRequest",
    "VNClassifyImageRequest",
    "VNRecognizeTextRequest",
    "VNGenerateObjectnessBasedSaliencyImageRequest",
    "VNGenerateAttentionBasedSaliencyImageRequest",
    "VNRecognizeObjectsRequest",
    "VNDetectContoursRequest",
    "VNDetectTrajectoriesRequest",
]

// Print one line per name: a check mark when a class with that name is
// registered, a cross otherwise.
for className in visionRequestClassNames {
    let mark = NSClassFromString(className) == nil ? "❌" : "✅"
    print("\(className): \(mark)")
}

View File

@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>com.apple.security.device.audio-input</key>
<true/>
<key>com.apple.security.device.camera</key>
<true/>
<key>com.apple.security.network.client</key>
<true/>
<key>com.apple.security.files.user-selected.read-write</key>
<true/>
<key>com.apple.security.temporary-exception.audio-upload</key>
<true/>
</dict>
</plist>

View File

@@ -0,0 +1,206 @@
import Foundation
import Vision
import ArgumentParser
import AppKit
import AVFoundation
/// Runs Apple Vision face analysis (rectangles, landmarks, capture quality,
/// faceprint presence) on an image or on sampled video frames, and prints the
/// totals next to an existing InsightFace JSON result when one is found.
@main
struct FaceCompareTest: ParsableCommand {
    @Argument(help: "Video path or image path")
    var inputPath: String

    @Option(name: .long, help: "Sample interval (frames)")
    var sampleInterval: Int = 30

    @Option(name: .long, help: "Maximum frames to process")
    var maxFrames: Int = 20

    /// Dispatches to video or image processing based on the file extension and
    /// prints the elapsed time.
    mutating func run() throws {
        let startTime = Date()
        print("=== Apple Vision Framework Face Processing ===")
        #if arch(arm64)
        print("HW: Apple Silicon ✅")
        #endif
        let url = URL(fileURLWithPath: inputPath)
        let ext = (inputPath as NSString).pathExtension.lowercased()
        if ["mp4", "mov", "mkv", "avi"].contains(ext) {
            try processVideo(url: url)
        } else {
            try processImage(url: url)
        }
        print("Time: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
    }

    /// Samples every `sampleInterval`-th frame with ffmpeg, runs face analysis on
    /// up to `maxFrames` of them, and prints per-frame and summary statistics.
    ///
    /// - Throws: When the scratch directory cannot be created or ffmpeg cannot
    ///   be launched.
    func processVideo(url: URL) throws {
        // A zero interval would make the ffmpeg select expression divide by zero.
        guard sampleInterval > 0 else {
            print("Sample interval must be greater than 0"); return
        }
        let asset = AVAsset(url: url)
        guard let track = asset.tracks(withMediaType: .video).first else {
            print("No video track"); return
        }
        let duration = asset.duration.seconds
        let fps = Double(track.nominalFrameRate)
        guard duration.isFinite, fps > 0 else {
            print("Cannot determine video duration or frame rate"); return
        }
        let totalFrames = Int(duration * fps)
        print("Video: \(duration)s @ \(fps)fps = \(totalFrames) frames")

        // Extract frames with ffmpeg at the sample interval.
        let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("face_compare_\(UUID().uuidString)")
        let framesDir = tempDir.appendingPathComponent("frames")
        try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)
        defer { try? FileManager.default.removeItem(at: tempDir) }

        let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
        let proc = Process()
        // NOTE(review): hard-coded Homebrew (Apple Silicon) ffmpeg location.
        proc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
        proc.arguments = ["-y", "-v", "quiet", "-i", url.path,
                          "-vf", "select=not(mod(n\\,\(sampleInterval)))",
                          "-vsync", "vfr", "-q:v", "5", pattern]
        try proc.run()
        proc.waitUntilExit()
        if proc.terminationStatus != 0 {
            print("ffmpeg exited with status \(proc.terminationStatus)")
        }

        let allFiles = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
        // A negative limit would trap in prefix(_:).
        let frameFiles = allFiles.filter { $0.hasSuffix(".jpg") }.sorted().prefix(max(maxFrames, 0))

        var totalFaces = 0
        var framesWithFaces = 0
        var frameCount = 0
        for fname in frameFiles {
            let imgPath = framesDir.appendingPathComponent(fname).path
            guard let imgData = try? Data(contentsOf: URL(fileURLWithPath: imgPath)),
                  let img = NSImage(data: imgData),
                  let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { continue }

            // ffmpeg numbers its output files sequentially from 1, so the k-th
            // file holds source frame (k - 1) * sampleInterval.
            let stem = fname.replacingOccurrences(of: "frame_", with: "").replacingOccurrences(of: ".jpg", with: "")
            let sequence = Int(stem) ?? 1
            let frameNum = max(sequence - 1, 0) * sampleInterval
            let timestamp = Double(frameNum) / fps

            let faceResult = detectFaces(cgImage: cgImage)
            if !faceResult.isEmpty {
                totalFaces += faceResult.count
                framesWithFaces += 1
                print(" Frame \(frameNum) (\(String(format: "%.1f", timestamp))s): \(faceResult.count) faces")
                for (i, f) in faceResult.enumerated() {
                    print(describe(f, index: i))
                }
            }
            frameCount += 1
        }

        print("\n=== Summary ===")
        print("Frames processed: \(frameCount)")
        print("Frames with faces: \(framesWithFaces)")
        print("Total faces detected: \(totalFaces)")

        // Compare with an existing InsightFace JSON if one is available.
        let uuid = extractUUID(from: url.lastPathComponent)
        guard !uuid.isEmpty else { return }
        // NOTE(review): machine-specific absolute path; consider making it an option.
        let faceJsonPath = "/Users/accusys/momentry/output_dev/\(uuid).face.json"
        guard FileManager.default.fileExists(atPath: faceJsonPath) else { return }
        print("\n=== Comparison with InsightFace (\(uuid).face.json) ===")
        if let data = try? Data(contentsOf: URL(fileURLWithPath: faceJsonPath)),
           let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
           let frames = json["frames"] as? [[String: Any]] {
            let faceCounts = frames.map { ($0["faces"] as? [Any])?.count ?? 0 }
            let insightFaces = faceCounts.filter { $0 > 0 }.count
            let totalInsightFaces = faceCounts.reduce(0, +)
            print(" InsightFace frames with faces: \(insightFaces)")
            print(" InsightFace total faces: \(totalInsightFaces)")
        }
    }

    /// Runs face analysis on a single image file and prints every face found.
    func processImage(url: URL) throws {
        guard let imgData = try? Data(contentsOf: url),
              let img = NSImage(data: imgData),
              let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            print("Cannot load image"); return
        }
        print("Image: \(cgImage.width)x\(cgImage.height)")
        let result = detectFaces(cgImage: cgImage)
        print("Vision faces: \(result.count)")
        for (i, f) in result.enumerated() {
            print(describe(f, index: i))
        }
    }

    /// Per-face measurements in pixel units. `x`/`y` are the bounding-box origin
    /// as reported by Vision scaled to the image size (not flipped).
    struct FaceResult {
        let x, y, w, h: Float
        let conf: Float
        let quality: Float
        let landmarks: Int
        let hasEmbedding: Bool
    }

    /// Formats one face as the single-line report used by both the video and
    /// the image path.
    func describe(_ f: FaceResult, index i: Int) -> String {
        let bbox = "(\(String(format: "%.0f", f.x)),\(String(format: "%.0f", f.y)))"
        let size = "\(String(format: "%.0f", f.w))x\(String(format: "%.0f", f.h))"
        let conf = String(format: "%.3f", f.conf)
        let quality = String(format: "%.3f", f.quality)
        let embedding = f.hasEmbedding ? "✅" : "❌"
        return " [\(i)] bbox=\(bbox) size=\(size) conf=\(conf) quality=\(quality) landmarks=\(f.landmarks) embedding=\(embedding)"
    }

    /// Detects faces in `cgImage` and collects landmarks count, capture quality
    /// and faceprint presence for each.
    ///
    /// - Returns: One `FaceResult` per detected face; empty when detection fails
    ///   or finds nothing.
    func detectFaces(cgImage: CGImage) -> [FaceResult] {
        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])

        // 1. Face detection.
        let detectReq = VNDetectFaceRectanglesRequest()
        do {
            try handler.perform([detectReq])
        } catch {
            print("Face detection failed: \(error.localizedDescription)")
            return []
        }
        guard let detections = detectReq.results, !detections.isEmpty else { return [] }

        // 2 + 3. Landmarks and capture quality, computed for the faces found
        // above rather than re-detected independently. Results are still paired
        // with detections by index, guarded by the bounds checks below.
        let landmarkReq = VNDetectFaceLandmarksRequest()
        landmarkReq.inputFaceObservations = detections
        let qualityReq = VNDetectFaceCaptureQualityRequest()
        qualityReq.inputFaceObservations = detections
        do {
            try handler.perform([landmarkReq, qualityReq])
        } catch {
            print("Landmark/quality analysis failed: \(error.localizedDescription)")
        }
        let landmarkResults = landmarkReq.results ?? []
        let qualityResults = qualityReq.results ?? []

        let imageW = Float(cgImage.width)
        let imageH = Float(cgImage.height)
        let faceprintSelector = NSSelectorFromString("faceprint")
        let dataSelector = NSSelectorFromString("data")

        var results: [FaceResult] = []
        for (i, face) in detections.enumerated() {
            let bb = face.boundingBox

            // Landmark point count: eyes and nose only.
            var lmCount = 0
            if i < landmarkResults.count, let lms = landmarkResults[i].landmarks {
                lmCount += lms.leftEye?.pointCount ?? 0
                lmCount += lms.rightEye?.pointCount ?? 0
                lmCount += lms.nose?.pointCount ?? 0
            }

            // Capture quality score; 0 when unavailable.
            let quality: Float = i < qualityResults.count ? (qualityResults[i].faceCaptureQuality ?? 0) : 0

            // Faceprint (embedding) presence via KVC on non-public keys. The
            // responds(to:) checks keep an unknown key from raising an
            // Objective-C exception, which Swift cannot catch.
            var hasEmbedding = false
            if face.responds(to: faceprintSelector),
               let fp = face.value(forKey: "faceprint") as? NSObject,
               fp.responds(to: dataSelector) {
                hasEmbedding = (fp.value(forKey: "data") as? Data) != nil
            }

            results.append(FaceResult(
                x: Float(bb.origin.x) * imageW,
                y: Float(bb.origin.y) * imageH,
                w: Float(bb.size.width) * imageW,
                h: Float(bb.size.height) * imageH,
                conf: face.confidence,
                quality: quality,
                landmarks: lmCount,
                hasEmbedding: hasEmbedding
            ))
        }
        return results
    }

    /// Returns the first run of 32 lowercase hex characters in `filename`, or an
    /// empty string when there is none.
    func extractUUID(from filename: String) -> String {
        guard let regex = try? NSRegularExpression(pattern: "[a-f0-9]{32}") else { return "" }
        // NSRange counts UTF-16 units, so derive it from the string's indices.
        let fullRange = NSRange(filename.startIndex..<filename.endIndex, in: filename)
        guard let match = regex.firstMatch(in: filename, range: fullRange),
              let range = Range(match.range, in: filename) else {
            return ""
        }
        return String(filename[range])
    }
}

View File

@@ -0,0 +1,98 @@
import Foundation
import Vision
import ArgumentParser
import AppKit
/// POC: Test Apple Vision Framework for face detection + faceprint extraction.
///
/// Loads a single image and prints, in order: detected face rectangles,
/// eye/nose landmark point counts, capture-quality scores, and the outcome of
/// probing the `faceprint` key on the first face via KVC.
@main
struct FaceVisionTest: ParsableCommand {
    @Argument(help: "Input image path")
    var inputPath: String

    /// Runs the probes against `inputPath` and prints the findings to stdout.
    ///
    /// Returns early (after printing a message) when the image cannot be
    /// loaded or contains no faces.
    ///
    /// - Throws: Any error raised by `VNImageRequestHandler.perform(_:)`.
    mutating func run() throws {
        let startTime = Date()
        print("=== Apple Vision Framework Face POC ===")
        #if arch(arm64)
        print("HW: Apple Silicon ✅")
        #else
        print("HW: Intel")
        #endif

        guard let image = NSImage(contentsOfFile: inputPath) else {
            print("Error: cannot load image"); return
        }
        guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            print("Error: cannot get CGImage"); return
        }
        print("Image: \(cgImage.width)x\(cgImage.height)")
        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])

        // 1. Detect faces (synchronous)
        print("\n--- Detection ---")
        let detectReq = VNDetectFaceRectanglesRequest()
        try handler.perform([detectReq])
        let faces = detectReq.results ?? []
        print("Faces: \(faces.count)")
        for (i, f) in faces.enumerated() {
            let bb = f.boundingBox
            print(" [\(i)] bbox=(\(String(format: "%.3f", bb.origin.x)),\(String(format: "%.3f", bb.origin.y))) size=(\(String(format: "%.3f", bb.size.width)),\(String(format: "%.3f", bb.size.height))) conf=\(String(format: "%.2f", f.confidence))")
        }
        guard !faces.isEmpty else { print("No faces"); return }

        // 2. Landmarks
        print("\n--- Landmarks ---")
        let lmReq = VNDetectFaceLandmarksRequest()
        try handler.perform([lmReq])
        for (i, observation) in (lmReq.results ?? []).enumerated() {
            guard let lms = observation.landmarks else { continue }
            let leftPts = lms.leftEye?.pointCount ?? 0
            let rightPts = lms.rightEye?.pointCount ?? 0
            let nosePts = lms.nose?.pointCount ?? 0
            print(" [\(i)] landmarks: \(leftPts)+\(rightPts) eye pts, nose=\(nosePts)")
        }

        // 3. Capture quality
        print("\n--- Capture Quality ---")
        let qualReq = VNDetectFaceCaptureQualityRequest()
        try handler.perform([qualReq])
        for (i, observation) in (qualReq.results ?? []).enumerated() {
            // Typed public property instead of a string-keyed KVC lookup;
            // -1 marks "no score reported", as before.
            let q = observation.faceCaptureQuality.map { Double($0) } ?? -1
            print(" [\(i)] quality=\(String(format: "%.4f", q))")
        }

        // 4. Faceprint (embedding)
        print("\n--- Faceprint ---")
        if #available(macOS 14, *) {
            let fpClass: AnyClass? = NSClassFromString("VNFaceprint")
            print("VNFaceprint class: \(fpClass != nil ? "✅ exists" : "❌ nil")")
            if let first = faces.first {
                // NOTE(review): "faceprint", "data", "descriptor" and "elements" are not
                // public API; KVC raises an Objective-C exception for keys the object
                // does not support, which Swift cannot catch — confirm on the target OS.
                let fp = first.value(forKey: "faceprint") as? NSObject
                // Both branches used to print an empty string, so the line said nothing.
                print("faceprint KVC: \(fp != nil ? "✅" : "❌")")
                if let fpData = fp {
                    let data = fpData.value(forKey: "data") as? Data
                    print(" data: \(data.map { "\($0.count) bytes" } ?? "nil")")
                    let desc = fpData.value(forKey: "descriptor") as? NSObject
                    print(" descriptor: \(desc.map { "✅ class=\(type(of: $0))" } ?? "nil")")
                    if let d = desc, let elems = d.value(forKey: "elements") as? [NSNumber] {
                        print(" elements: \(elems.count) dims")
                        if !elems.isEmpty {
                            print(" first 5: \(elems.prefix(5).map { String(format: "%.4f", $0.doubleValue) }.joined(separator: ", "))")
                        }
                    }
                }
            }
        } else {
            print("macOS 14+ required")
        }

        print("\nTime: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
        print("=== Done ===")
    }
}

View File

@@ -0,0 +1,83 @@
import Foundation
import Vision
import ArgumentParser
import AppKit
import AVFoundation
/// Benchmark: Apple Vision Framework body pose detection speed.
///
/// Extracts every `sampleInterval`-th frame of the input video to a temporary
/// directory with ffmpeg, runs `VNDetectHumanBodyPoseRequest` on each extracted
/// JPEG, and prints pose counts and timing.
@main
struct PoseBenchmark: ParsableCommand {
    @Argument(help: "Video path or image directory")
    var inputPath: String
    @Option(name: .long, help: "Sample interval (frames)")
    var sampleInterval: Int = 30

    /// Rejects sample intervals that would produce an invalid `mod(n, 0)` ffmpeg filter.
    func validate() throws {
        guard sampleInterval >= 1 else {
            throw ValidationError("--sample-interval must be at least 1")
        }
    }

    /// Runs the benchmark and prints the results to stdout.
    ///
    /// - Throws: Errors from creating the temporary directory or launching ffmpeg.
    mutating func run() throws {
        let start = Date()
        print("=== Vision Body Pose Benchmark ===")
        #if arch(arm64)
        print("HW: Apple Silicon ✅")
        #endif

        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
        guard asset.tracks(withMediaType: .video).first != nil else {
            print("No video track"); return
        }

        // Extract frames with ffmpeg
        let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("pose_bench_\(UUID().uuidString)")
        let framesDir = tempDir.appendingPathComponent("frames")
        // Registered before anything can throw so the directory is removed on every exit path.
        defer { try? FileManager.default.removeItem(at: tempDir) }
        try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)

        let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
        let extract = Process()
        extract.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
        extract.arguments = ["-y", "-v", "quiet", "-i", inputPath,
                             "-vf", "select=not(mod(n\\,\(sampleInterval)))",
                             "-vsync", "vfr", "-q:v", "5", pattern]
        try extract.run()
        extract.waitUntilExit()
        if extract.terminationStatus != 0 {
            fputs("ffmpeg exited with status \(extract.terminationStatus)\n", stderr)
        }

        let files = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
        let frameFiles = files.filter { $0.hasSuffix(".jpg") }.sorted()
        print("Frames: \(frameFiles.count)")

        // Process all frames in one loop (no subprocess overhead)
        var totalPoses = 0
        var framesWithPose = 0
        let inferenceStart = Date()
        for fname in frameFiles {
            let imgURL = framesDir.appendingPathComponent(fname)
            guard let imgData = try? Data(contentsOf: imgURL),
                  let img = NSImage(data: imgData),
                  let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { continue }
            let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
            let req = VNDetectHumanBodyPoseRequest()
            // Best effort: a frame whose request fails simply contributes no poses.
            try? handler.perform([req])
            if let poses = req.results, !poses.isEmpty {
                framesWithPose += 1
                totalPoses += poses.count
            }
        }
        let inferenceTime = Date().timeIntervalSince(inferenceStart)
        let totalTime = Date().timeIntervalSince(start)

        print("\n=== Results ===")
        print("Frames: \(frameFiles.count), with poses: \(framesWithPose)")
        print("Total poses: \(totalPoses)")
        print("Inference: \(String(format: "%.2f", inferenceTime))s")
        if frameFiles.isEmpty {
            // Avoid dividing by zero when ffmpeg produced no frames.
            print("Per frame: n/a")
        } else {
            print("Per frame: \(String(format: "%.0f", inferenceTime / Double(frameFiles.count) * 1000))ms")
        }
        print("Total: \(String(format: "%.2f", totalTime))s")
    }
}

View File

@@ -0,0 +1,106 @@
import Foundation
import Speech
/// Test: Use KVC to check for speaker metadata on SFSpeechRecognitionResult.
///
/// Transcribes a fixed WAV file and prints whatever values the probed KVC keys
/// return on the result and on the first segments.
@main
struct SpeakerMetaTest {
    /// Keeps the recognizer and task alive after the authorization callback returns.
    private final class Retained {
        var recognizer: SFSpeechRecognizer?
        var task: SFSpeechRecognitionTask?
    }

    static func main() {
        print("=== Speaker Metadata Test ===")
        let testFile = "/tmp/test_60s_b.wav"
        guard FileManager.default.fileExists(atPath: testFile) else {
            print("Test file not found")
            return
        }

        let finished = DispatchSemaphore(value: 0)
        let retained = Retained()

        SFSpeechRecognizer.requestAuthorization { status in
            guard status == .authorized else {
                print("Authorization: \(status.rawValue)")
                finished.signal()
                return
            }
            print("Authorization: ✅")
            guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US")) else {
                print("Error: no recognizer for en-US")
                finished.signal()
                return
            }
            retained.recognizer = recognizer
            print(recognizer.supportsOnDeviceRecognition ? "On-device: ✅" : "On-device: ❌ (will use server)")

            let request = SFSpeechURLRecognitionRequest(url: URL(fileURLWithPath: testFile))
            request.shouldReportPartialResults = false
            request.requiresOnDeviceRecognition = false
            request.taskHint = .dictation
            print("Starting recognition...")
            retained.task = recognizer.recognitionTask(with: request) { result, error in
                if let error = error {
                    print("Error: \(error.localizedDescription)")
                } else if let result = result, result.isFinal {
                    dumpMetadata(of: result)
                }
                finished.signal()
            }
        }

        // Recognition handlers are delivered on the recognizer's queue, which is the
        // main queue unless changed. Blocking the main thread on the semaphore would
        // therefore stop them from ever running, so the main run loop is pumped here
        // and the semaphore is only polled.
        while finished.wait(timeout: .now()) == .timedOut {
            RunLoop.main.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
        }
        retained.task?.cancel()
        print("Done")
    }

    /// Prints the values found under the probed keys on `result` and its segments.
    ///
    /// NOTE(review): none of the probed keys are public API. KVC raises an
    /// Objective-C exception for a key the object does not support, which Swift
    /// cannot catch — confirm each key on the target OS before relying on this.
    private static func dumpMetadata(of result: SFSpeechRecognitionResult) {
        let segments = result.bestTranscription.segments
        print("Recognition complete: \(segments.count) raw segments")

        // Check for metadata on the result object
        let resultMetadata = result.value(forKey: "metadata") as? NSObject
        // Both branches used to print an empty string, so the line said nothing.
        print("Result metadata: \(resultMetadata != nil ? "✅" : "❌")")
        if let rm = resultMetadata {
            print(" Result metadata class: \(type(of: rm))")
            for key in ["speakerID", "speakerName", "speakerConfidence", "voiceProfileID", "speaker"] {
                if let val = rm.value(forKey: key) {
                    print(" result.\(key) = \(val)")
                }
            }
        }

        // Only the first three segments are inspected, to keep the output short.
        var speakerCount = 0
        for (i, seg) in segments.enumerated().prefix(3) {
            guard let sm = seg.value(forKey: "metadata") as? NSObject else { continue }
            print("Seg[\(i)] metadata class: \(type(of: sm))")
            for key in ["speakerID", "speaker", "voice", "speakerConfidence", "speakerName"] {
                if let val = sm.value(forKey: key) {
                    print(" seg.\(key) = \(val)")
                    speakerCount += 1
                }
            }
        }
        if speakerCount == 0 {
            print("No speaker metadata found on any segment")
        } else {
            print("Found speaker metadata on segments: ✅")
        }

        // Also check a wider set of keys on the first segment
        if let firstSeg = segments.first {
            print("\nAll KVC keys on first segment:")
            for key in ["metadata", "speaker", "voice", "recognition", "analysis", "audio"] {
                if let val = firstSeg.value(forKey: key) {
                    print(" \(key): \(type(of: val)) = \(val)")
                }
            }
        }
    }
}

View File

@@ -0,0 +1,113 @@
import Foundation
import Speech
/// POC: Test SFSpeechAnalyzer + SFSpeechRecognitionMetadata for speaker detection
/// Goal: Determine if ANE-accelerated speaker diarization is feasible
///
/// Prints hardware / class-availability information, then transcribes a fixed
/// WAV file on-device and reports any speaker identifiers found via KVC on the
/// transcription segments.
@main
struct SpeakerTest {
    /// Keeps the recognizer and task alive after the authorization callback returns.
    private final class Retained {
        var recognizer: SFSpeechRecognizer?
        var task: SFSpeechRecognitionTask?
    }

    static func main() {
        print("=== SFSpeechAnalyzer Speaker Detection POC ===")
        // 1. Check if running on ANE-capable hardware
        #if arch(arm64)
        print("Hardware: Apple Silicon ✅ (ANE available)")
        #else
        print("Hardware: Intel ❌ (No ANE)")
        #endif

        // 2. Check SFSpeechRecognizer on-device capability
        let locale = Locale(identifier: "en-US")
        let recognizerCheck = SFSpeechRecognizer(locale: locale)
        // Both branches used to print an empty string, so the line said nothing.
        print("On-device recognition: \(recognizerCheck?.supportsOnDeviceRecognition == true ? "✅" : "❌")")

        // 3. Check whether the classes of interest exist at runtime
        if #available(macOS 14, *) {
            print("\n=== SFSpeechAnalyzer Analysis ===")
            let analyzerClass: AnyClass? = NSClassFromString("SFSpeechAnalyzer")
            print("SFSpeechAnalyzer: \(analyzerClass != nil ? "✅ Available" : "❌ Not available")")
            let mdClass: AnyClass? = NSClassFromString("SFSpeechRecognitionMetadata")
            print("SFSpeechRecognitionMetadata: \(mdClass != nil ? "✅ Available" : "❌ Not available")")
        }

        // 4. Test: Run ASR and look for speaker metadata
        print("\n=== Real-world Test ===")
        let testFile = "/tmp/test_60s_b.wav"
        guard FileManager.default.fileExists(atPath: testFile) else {
            print("Test file not found: \(testFile)")
            return
        }

        let finished = DispatchSemaphore(value: 0)
        let retained = Retained()
        print("Running ASR with speaker detection on 60s clip...")

        SFSpeechRecognizer.requestAuthorization { status in
            guard status == .authorized else {
                print("Authorization denied")
                finished.signal()
                return
            }
            guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US")),
                  recognizer.isAvailable else {
                print("Recognizer not available")
                finished.signal()
                return
            }
            retained.recognizer = recognizer

            let request = SFSpeechURLRecognitionRequest(url: URL(fileURLWithPath: testFile))
            request.shouldReportPartialResults = false
            request.requiresOnDeviceRecognition = true
            request.taskHint = .dictation
            retained.task = recognizer.recognitionTask(with: request) { result, error in
                if let error = error {
                    print("Recognition error: \(error.localizedDescription)")
                } else if let result = result, result.isFinal {
                    reportSpeakers(in: result)
                }
                finished.signal()
            }
        }

        // The previous version polled the semaphore inside the authorization callback
        // and then waited on it again here, so the single signal was consumed by the
        // poll and this wait never returned. It also blocked the main thread, which
        // is where recognition handlers are delivered by default. Pumping the main
        // run loop and polling the semaphore in one place fixes both.
        while finished.wait(timeout: .now()) == .timedOut {
            RunLoop.main.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
        }
        retained.task?.cancel()
    }

    /// Prints the transcript summary and any speaker identifiers found on the segments.
    ///
    /// NOTE(review): "metadata", "speakerID", "speakerName" and "speakerConfidence"
    /// are not public API. KVC raises an Objective-C exception for a key the object
    /// does not support, which Swift cannot catch — confirm on the target OS.
    private static func reportSpeakers(in result: SFSpeechRecognitionResult) {
        let text = result.bestTranscription.formattedString
        print("Text: \(text.prefix(200))")
        print("Segments: \(result.bestTranscription.segments.count)")

        guard #available(macOS 14, *) else {
            print("macOS 14+ required for speaker metadata")
            return
        }

        var detectedSpeakers: Set<String> = []
        for (i, seg) in result.bestTranscription.segments.enumerated() {
            guard let md = seg.value(forKey: "metadata") as? NSObject else { continue }
            let speakerName = md.value(forKey: "speakerName") as? String
            let confidence = md.value(forKey: "speakerConfidence") as? Double
            guard let sid = md.value(forKey: "speakerID") as? String else { continue }
            detectedSpeakers.insert(sid)
            // Print the first few segments and then every 20th to keep the log short.
            if i < 5 || i % 20 == 0 {
                print(" Seg[\(i)] speaker=\(sid) name=\(speakerName ?? "?") conf=\(confidence ?? 0) text=\"\(seg.substring.prefix(40))\"")
            }
        }
        print("\nUnique speakers detected: \(detectedSpeakers)")
        if detectedSpeakers.isEmpty {
            print("⚠️ No speaker metadata found in recognition results")
        }
    }
}

View File

@@ -0,0 +1,191 @@
import Foundation
import AVFoundation
import ArgumentParser
import Accelerate
/// POC: Swift-based scene cut detection using AVFoundation histogram analysis
/// Compared against Python PySceneDetect ContentDetector (threshold=27)
///
/// Decodes the video at 320x180 BGRA, builds a 64-bin normalised luminance
/// histogram for every sampled frame, and records a cut whenever the summed
/// absolute bin difference to the previous sampled frame exceeds `threshold`.
@main
struct SwiftCutTest: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String
    @Argument(help: "Output JSON path (optional)")
    var outputPath: String?
    @Option(name: .long, help: "Detection threshold (higher= fewer cuts, default 0.3)")
    var threshold: Double = 0.3
    @Option(name: .long, help: "Sample interval in frames (default=1)")
    var sampleInterval: Int = 1

    /// Rejects sample intervals that would make `frameIndex % sampleInterval` trap.
    func validate() throws {
        guard sampleInterval >= 1 else {
            throw ValidationError("--sample-interval must be at least 1")
        }
    }

    /// Detects scene cuts, prints a summary, and optionally writes a JSON report.
    ///
    /// - Throws: Errors from serialising or writing the JSON report.
    mutating func run() throws {
        let startTime = Date()
        print("=== Swift Scene Cut Detection POC ===")
        #if arch(arm64)
        print("HW: Apple Silicon ✅ (ANE available)")
        #endif

        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("Error: No video track found"); return
        }
        // A non-finite duration would trap in Int(...) and make JSONSerialization
        // raise, so it is treated as zero.
        let reportedDuration = asset.duration.seconds
        let duration = reportedDuration.isFinite ? max(reportedDuration, 0) : 0
        let fps = videoTrack.nominalFrameRate
        let frameEstimate = duration * Double(fps)
        let totalFrames = frameEstimate.isFinite ? Int(frameEstimate) : 0
        print("Video: \(inputPath)")
        print("Duration: \(String(format: "%.1f", duration))s")
        print("FPS: \(String(format: "%.1f", fps))")
        print("Total frames: \(totalFrames)")
        print("Threshold: \(String(format: "%.2f", threshold))")
        print("Sample interval: \(sampleInterval)")

        // Read frame histogram data using AVAssetReader
        guard let reader = try? AVAssetReader(asset: asset) else {
            print("Error: Cannot create asset reader"); return
        }
        let settings: [String: Any] = [
            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA,
            kCVPixelBufferWidthKey as String: 320,  // downscale for speed
            kCVPixelBufferHeightKey as String: 180,
        ]
        let trackOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: settings)
        guard reader.canAdd(trackOutput) else {
            print("Error: Cannot add track output to reader"); return
        }
        reader.add(trackOutput)
        guard reader.startReading() else {
            print("Error: Cannot start reading: \(reader.error?.localizedDescription ?? "unknown error")"); return
        }

        var frameIndex = 0
        var prevHistogram: [Float]?
        var scenes: [(start: Double, end: Double)] = []
        var sceneStart: Double = 0
        var diffs: [(frame: Int, diff: Float)] = []
        let frameStep = sampleInterval
        var lastPrint = 0

        while reader.status == .reading {
            guard let sampleBuffer = trackOutput.copyNextSampleBuffer() else { break }
            guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
                CMSampleBufferInvalidate(sampleBuffer); continue
            }
            if frameIndex % frameStep == 0 {
                let timestamp = CMTimeGetSeconds(CMSampleBufferGetPresentationTimeStamp(sampleBuffer))
                let histogram = computeLuminanceHistogram(pixelBuffer: pixelBuffer)
                if let prev = prevHistogram {
                    let diff = histogramDifference(prev, histogram)
                    if diff > Float(threshold) {
                        // Scene cut detected: close the current scene at this timestamp.
                        scenes.append((start: sceneStart, end: timestamp))
                        sceneStart = timestamp
                        diffs.append((frame: frameIndex, diff: diff))
                        if scenes.count % 50 == 0 {
                            print(" Scenes so far: \(scenes.count) at frame \(frameIndex)/\(totalFrames)")
                        }
                    }
                }
                prevHistogram = histogram
            }
            frameIndex += 1
            CMSampleBufferInvalidate(sampleBuffer)

            // Progress every 5%. Skipped when the frame count is unknown, since the
            // float division would yield inf/NaN and trap on conversion to Int.
            if totalFrames > 0 {
                let pct = frameIndex * 100 / totalFrames
                if pct >= lastPrint + 5 {
                    print(" Progress: \(pct)% (\(frameIndex)/\(totalFrames) frames)")
                    lastPrint = pct
                }
            }
        }
        if reader.status == .failed {
            print("Warning: reading stopped early: \(reader.error?.localizedDescription ?? "unknown error")")
        }

        // Add last scene
        if sceneStart < duration {
            scenes.append((start: sceneStart, end: duration))
        }

        let elapsed = Date().timeIntervalSince(startTime)
        // Real-time factor; 0 when the duration is unknown so it stays JSON-encodable.
        let rtf = duration > 0 ? elapsed / duration : 0
        print("\n=== Results ===")
        print("Scenes detected: \(scenes.count)")
        print("Time: \(String(format: "%.2f", elapsed))s")
        if duration > 0 {
            print("RTF: \(String(format: "%.3f", rtf))x")
        }
        print("Last 5 cuts:")
        for s in scenes.suffix(5) {
            print(" \(String(format: "%.1f", s.start))s - \(String(format: "%.1f", s.end))s")
        }

        // Output JSON if requested
        if let outPath = outputPath {
            let outputDict: [String: Any] = [
                "scenes": scenes.map { ["start_time": $0.start, "end_time": $0.end] },
                "metadata": [
                    "video_path": inputPath,
                    "duration": duration,
                    "fps": fps,
                    "total_frames": totalFrames,
                    "threshold": threshold,
                    "sample_interval": sampleInterval,
                    "processing_time": elapsed,
                    "rtf": rtf,
                ],
                "diffs": diffs.map { ["frame": $0.frame, "diff": String(format: "%.4f", $0.diff)] }
            ]
            // Propagate failures instead of silently skipping the report.
            let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
            try jsonData.write(to: URL(fileURLWithPath: outPath))
            print("Output written to: \(outPath)")
        }
    }

    /// Builds a 64-bin luminance histogram of a BGRA pixel buffer, normalised by pixel count.
    ///
    /// - Parameter pixelBuffer: Buffer read as 4 bytes per pixel in B, G, R, A order.
    /// - Returns: 64 bin fractions; all zeros when the buffer has no base address or no pixels.
    func computeLuminanceHistogram(pixelBuffer: CVPixelBuffer) -> [Float] {
        CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly)
        defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) }

        let binCount = 64  // 64 bins for speed
        var histogram = [Float](repeating: 0, count: binCount)
        let width = CVPixelBufferGetWidth(pixelBuffer)
        let height = CVPixelBufferGetHeight(pixelBuffer)
        let bytesPerRow = CVPixelBufferGetBytesPerRow(pixelBuffer)
        guard let baseAddress = CVPixelBufferGetBaseAddress(pixelBuffer),
              width > 0, height > 0 else { return histogram }

        for y in 0..<height {
            // Rows may be padded, so advance by bytesPerRow rather than width * 4.
            let pixels = baseAddress.advanced(by: y * bytesPerRow).assumingMemoryBound(to: UInt8.self)
            for x in 0..<width {
                let b = Float(pixels[x * 4])
                let g = Float(pixels[x * 4 + 1])
                let r = Float(pixels[x * 4 + 2])
                let luminance = 0.299 * r + 0.587 * g + 0.114 * b
                let bin = min(Int(luminance / 256.0 * 64.0), binCount - 1)
                histogram[bin] += 1
            }
        }

        // Normalize
        let total = Float(width * height)
        for i in histogram.indices {
            histogram[i] /= total
        }
        return histogram
    }

    /// Sum of absolute per-bin differences over the bins both histograms share.
    func histogramDifference(_ a: [Float], _ b: [Float]) -> Float {
        zip(a, b).reduce(Float(0)) { $0 + abs($1.0 - $1.1) }
    }
}

View File

@@ -0,0 +1,291 @@
import Foundation
import Vision
import ArgumentParser
import AVFoundation
/// Swift Face Processor - Apple Vision Framework for face detection + pose
/// Uses AVAssetImageGenerator for reliable frame access (no AVAssetReader corruption).
///
/// Samples every `sampleInterval`-th frame, runs face-rectangle and
/// face-landmark requests on it, and writes a JSON document of the form
/// `{ "frame_count": N, "fps": F, "frames": [{ "frame", "timestamp", "faces" }] }`.
/// Only frames with at least one accepted face are emitted.
@main
struct SwiftFace: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String
    @Argument(help: "Output JSON path")
    var outputPath: String
    @Option(name: .long, help: "Sample interval (frames, default=30)")
    var sampleInterval: Int = 30
    @Option(name: .long, help: "UUID for logging")
    var uuid: String = ""

    /// Faces whose confidence is below this value are dropped.
    private static let minConfidence = 0.6
    /// Faces narrower or shorter than this many pixels are dropped.
    private static let minSize = 20
    /// A face rect overlapping a landmark observation by more than this IoU is a duplicate.
    private static let duplicateIoU: CGFloat = 0.3

    /// Rejects sample intervals that would divide by zero when computing the sample count.
    func validate() throws {
        guard sampleInterval >= 1 else {
            throw ValidationError("--sample-interval must be at least 1")
        }
    }

    /// Processes the video and writes the JSON result to `outputPath`.
    ///
    /// - Throws: Errors from writing the output file.
    mutating func run() throws {
        let startTime = Date()
        print("[SwiftFace] Vision-based face detection: \(inputPath)")
        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("[SwiftFace] No video track found")
            return
        }
        let fps = videoTrack.nominalFrameRate
        let duration = CMTimeGetSeconds(asset.duration)
        // An unknown duration is NaN; converting that to Int would trap, so it
        // is treated as "no frames" and an empty result is written.
        let frameEstimate = duration * Double(fps)
        let totalFrames = frameEstimate.isFinite && frameEstimate > 0 ? Int(frameEstimate) : 0
        let width = Int(videoTrack.naturalSize.width)
        let height = Int(videoTrack.naturalSize.height)
        print("[SwiftFace] Video: \(width)x\(height), \(String(format: "%.1f", fps))fps, \(totalFrames) frames")

        let generator = AVAssetImageGenerator(asset: asset)
        generator.requestedTimeToleranceBefore = .zero
        generator.requestedTimeToleranceAfter = .zero
        generator.appliesPreferredTrackTransform = true

        var allFrames: [[String: Any]] = []
        let frameInterval = TimeInterval(sampleInterval) / Double(fps)

        // Process in batches of 1000 frames to avoid memory pressure
        let batchSize = 1000
        let totalSamples = totalFrames / sampleInterval
        for batchStart in stride(from: 0, to: totalSamples, by: batchSize) {
            let batchEnd = min(batchStart + batchSize, totalSamples)
            let times: [NSValue] = (batchStart..<batchEnd).map { index in
                NSValue(time: CMTime(seconds: Double(index) * frameInterval, preferredTimescale: 1000))
            }

            let semaphore = DispatchSemaphore(value: 0)
            generator.generateCGImagesAsynchronously(forTimes: times) { requestedTime, cgImage, actualTime, result, error in
                // One signal per requested time, whatever the outcome.
                defer { semaphore.signal() }
                guard result == .succeeded, let cgImage = cgImage else {
                    if let error = error {
                        fputs("[SwiftFace] Frame error at \(CMTimeGetSeconds(requestedTime)): \(error.localizedDescription)\n", stderr)
                    }
                    return
                }
                if let frame = SwiftFace.analyzeFrame(cgImage, at: actualTime, fps: fps) {
                    allFrames.append(frame)
                }
            }

            // Wait for batch to complete
            for _ in batchStart..<batchEnd {
                semaphore.wait()
            }
            let elapsed = Date().timeIntervalSince(startTime)
            let pct = Int(Double(batchEnd) / Double(totalSamples) * 100)
            print("[SwiftFace] \(allFrames.count) frames with faces, \(pct)% complete, \(Int(elapsed))s elapsed")
            fflush(stdout)
        }
        generator.cancelAllCGImageGeneration()

        let output: [String: Any] = [
            "frame_count": allFrames.count,
            "fps": Double(fps),
            "frames": allFrames,
        ]
        guard let jsonData = try? JSONSerialization.data(withJSONObject: output, options: []),
              let jsonString = String(data: jsonData, encoding: .utf8) else {
            print("[SwiftFace] Failed to serialize JSON")
            return
        }
        try jsonString.write(to: URL(fileURLWithPath: outputPath), atomically: false, encoding: .utf8)
        let elapsed = Date().timeIntervalSince(startTime)
        print("[SwiftFace] Done: \(allFrames.count) frames, \(String(format: "%.1f", elapsed))s → \(outputPath)")
    }

    /// Runs face detection on one image and returns its JSON frame entry.
    ///
    /// - Returns: `nil` when the image cannot be converted, the Vision requests
    ///   fail, or no face passes the confidence/size filters.
    private static func analyzeFrame(_ cgImage: CGImage, at actualTime: CMTime, fps: Float) -> [String: Any]? {
        guard let pixelBuffer = makePixelBuffer(from: cgImage) else { return nil }
        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
        let detectReq = VNDetectFaceRectanglesRequest()
        let lmReq = VNDetectFaceLandmarksRequest()
        do {
            try handler.perform([detectReq, lmReq])
        } catch {
            fputs("[SwiftFace] Vision request failed: \(error.localizedDescription)\n", stderr)
            return nil
        }
        let faceObservations = detectReq.results ?? []
        let landmarkObservations = lmReq.results ?? []
        guard !faceObservations.isEmpty || !landmarkObservations.isEmpty else { return nil }

        // Use actual CGImage size (may differ from naturalSize after transform)
        let imgW = CGFloat(cgImage.width)
        let imgH = CGFloat(cgImage.height)
        var frameFaces: [[String: Any]] = []

        // Landmark observations first: each carries its own bbox and landmarks.
        for lmObs in landmarkObservations {
            guard var faceData = baseEntry(for: lmObs, confidence: Double(lmObs.confidence),
                                           imgW: imgW, imgH: imgH) else { continue }
            if let lms = lmObs.landmarks {
                let imgSize = CGSize(width: imgW, height: imgH)
                let leftEye = lms.leftEye?.pointsInImage(imageSize: imgSize) ?? []
                let rightEye = lms.rightEye?.pointsInImage(imageSize: imgSize) ?? []
                let nose = lms.nose?.pointsInImage(imageSize: imgSize) ?? []
                if !leftEye.isEmpty || !rightEye.isEmpty || !nose.isEmpty {
                    var lm: [String: [[Double]]] = [:]
                    if !leftEye.isEmpty { lm["left_eye"] = flippedY(leftEye, imageHeight: imgH) }
                    if !rightEye.isEmpty { lm["right_eye"] = flippedY(rightEye, imageHeight: imgH) }
                    if !nose.isEmpty { lm["nose"] = flippedY(nose, imageHeight: imgH) }
                    faceData["landmarks"] = lm
                }
                let outer = lms.outerLips?.pointsInImage(imageSize: imgSize) ?? []
                let inner = lms.innerLips?.pointsInImage(imageSize: imgSize) ?? []
                if !outer.isEmpty || !inner.isEmpty {
                    faceData["lips"] = [
                        "outer_lips": flippedY(outer, imageHeight: imgH),
                        "inner_lips": flippedY(inner, imageHeight: imgH),
                    ]
                }
            }
            frameFaces.append(faceData)
        }

        // Face rects the landmark detector missed. They are matched against ALL
        // landmark observations (including filtered-out ones) to avoid duplicates.
        for faceObs in faceObservations {
            let isDuplicate = landmarkObservations.contains {
                overlaps(faceObs.boundingBox, $0.boundingBox)
            }
            if isDuplicate { continue }
            let confidence = Double(faceObs.faceCaptureQuality ?? faceObs.confidence)
            if let faceData = baseEntry(for: faceObs, confidence: confidence, imgW: imgW, imgH: imgH) {
                frameFaces.append(faceData)
            }
        }

        guard !frameFaces.isEmpty else { return nil }
        let seconds = CMTimeGetSeconds(actualTime)
        return [
            "frame": Int(seconds * Double(fps)),
            "timestamp": seconds,
            "faces": frameFaces,
        ]
    }

    /// Renders `cgImage` into a new 32BGRA pixel buffer; `nil` if allocation or context creation fails.
    private static func makePixelBuffer(from cgImage: CGImage) -> CVPixelBuffer? {
        var created: CVPixelBuffer?
        let attrs: [CFString: Any] = [
            kCVPixelBufferCGImageCompatibilityKey: true,
            kCVPixelBufferCGBitmapContextCompatibilityKey: true,
            kCVPixelBufferWidthKey: cgImage.width,
            kCVPixelBufferHeightKey: cgImage.height,
        ]
        CVPixelBufferCreate(kCFAllocatorDefault, cgImage.width, cgImage.height,
                            kCVPixelFormatType_32BGRA, attrs as CFDictionary, &created)
        guard let buffer = created else { return nil }
        CVPixelBufferLockBaseAddress(buffer, [])
        defer { CVPixelBufferUnlockBaseAddress(buffer, []) }
        // Previously force-unwrapped; a failed context now skips the frame instead of crashing.
        guard let context = CGContext(data: CVPixelBufferGetBaseAddress(buffer),
                                      width: cgImage.width, height: cgImage.height,
                                      bitsPerComponent: 8, bytesPerRow: CVPixelBufferGetBytesPerRow(buffer),
                                      space: CGColorSpaceCreateDeviceRGB(),
                                      bitmapInfo: CGImageAlphaInfo.noneSkipFirst.rawValue | CGBitmapInfo.byteOrder32Little.rawValue) else {
            return nil
        }
        context.draw(cgImage, in: CGRect(x: 0, y: 0, width: cgImage.width, height: cgImage.height))
        return buffer
    }

    /// Builds the bbox/confidence/pose part of a face entry, or `nil` if the face is filtered out.
    ///
    /// The bounding box is converted from Vision's normalised bottom-left
    /// coordinates to top-left pixel coordinates.
    private static func baseEntry(for face: VNFaceObservation, confidence: Double,
                                  imgW: CGFloat, imgH: CGFloat) -> [String: Any]? {
        if confidence < minConfidence { return nil }
        let bb = face.boundingBox
        let faceW = Int(bb.size.width * imgW)
        let faceH = Int(bb.size.height * imgH)
        if faceW < minSize || faceH < minSize { return nil }
        let faceX = Int(bb.origin.x * imgW)
        let faceY = Int((1.0 - bb.origin.y - bb.size.height) * imgH)
        var entry: [String: Any] = [
            "bbox": ["x": max(0, faceX), "y": max(0, faceY),
                     "width": faceW, "height": faceH],
            "confidence": confidence,
        ]
        // Pose is only emitted when both yaw and roll are present; pitch is optional.
        if let yaw = face.yaw?.doubleValue, let roll = face.roll?.doubleValue {
            var poseInfo: [String: Any] = ["roll": roll, "yaw": yaw]
            if let pitch = face.pitch?.doubleValue {
                poseInfo["pitch"] = pitch
            }
            entry["pose"] = poseInfo
        }
        return entry
    }

    /// Converts image points from a bottom-left to a top-left origin as `[x, y]` pairs.
    private static func flippedY(_ points: [CGPoint], imageHeight: CGFloat) -> [[Double]] {
        points.map { [Double($0.x), Double(imageHeight - $0.y)] }
    }

    /// Whether the intersection-over-union of two rects exceeds `duplicateIoU`.
    private static func overlaps(_ a: CGRect, _ b: CGRect) -> Bool {
        let ix = max(a.origin.x, b.origin.x)
        let iy = max(a.origin.y, b.origin.y)
        let iw = min(a.maxX, b.maxX) - ix
        let ih = min(a.maxY, b.maxY) - iy
        if iw <= 0 || ih <= 0 { return false }
        let intersection = iw * ih
        let union = a.width * a.height + b.width * b.height - intersection
        return intersection / union > duplicateIoU
    }
}

View File

@@ -0,0 +1,204 @@
import Foundation
import Vision
import ArgumentParser
import AVFoundation
import AppKit
/// Swift OCR Processor - replaces Python PaddleOCR
/// Uses Apple Vision Framework (VNRecognizeTextRequest) with ANE acceleration
///
/// Output format (compatible with OcrResult Rust struct):
/// {
/// "frame_count": N,
/// "fps": 30.0,
/// "frames": [
/// { "frame": 0, "timestamp": 0.0, "texts": [{ "text": "...", "x": 0, "y": 0, "width": 0, "height": 0, "confidence": 0.0 }] }
/// ]
/// }
@main
struct SwiftOCR: ParsableCommand {
@Argument(help: "Video file path")
var inputPath: String
@Argument(help: "Output JSON path")
var outputPath: String
@Option(name: .long, help: "Frames to skip between OCR (default=30)")
var sampleInterval: Int = 30
@Option(name: .long, help: "Video FPS (auto-detect if 0)")
var fps: Double = 0
@Option(name: .long, help: "UUID for logging")
var uuid: String = ""
@Option(name: .long, help: "Recognition level: fast or accurate (default=accurate)")
var recognitionLevel: String = "accurate"
mutating func run() throws {
let startTime = Date()
print("[SwiftOCR] Starting: \(inputPath)")
print("[SwiftOCR] Sample interval: \(sampleInterval)")
let url = URL(fileURLWithPath: inputPath)
let asset = AVAsset(url: url)
guard let videoTrack = asset.tracks(withMediaType: .video).first else {
print("[SwiftOCR] Error: No video track"); return
}
let duration = asset.duration.seconds
let detectedFps = fps > 0 ? fps : Double(videoTrack.nominalFrameRate)
let totalFrames = Int(duration * detectedFps)
print("[SwiftOCR] Duration: \(String(format: "%.1f", duration))s, FPS: \(String(format: "%.1f", detectedFps)), Frames: \(totalFrames)")
let frameStep = sampleInterval
// Use shared frame cache if available (set by FrameManager)
let tempDir: URL
let framesDir: URL
if let cacheDir = ProcessInfo.processInfo.environment["MOMENTRY_FRAME_DIR"] {
framesDir = URL(fileURLWithPath: cacheDir)
tempDir = framesDir // No cleanup needed (managed by FrameManager)
print("[SwiftOCR] Using shared frame cache: \(cacheDir)")
} else {
tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("swift_ocr_\(UUID().uuidString)")
framesDir = tempDir.appendingPathComponent("frames")
try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)
let framePattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
print("[SwiftOCR] Extracting frames with ffmpeg (interval=\(frameStep))...")
let extractProc = Process()
extractProc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
extractProc.arguments = ["-y", "-v", "quiet", "-i", inputPath,
"-vf", "select=not(mod(n\\,\(frameStep))),scale=320:-2",
"-vsync", "vfr", "-q:v", "15", framePattern]
let startExtract = Date()
try extractProc.run()
extractProc.waitUntilExit()
let extractTime = Date().timeIntervalSince(startExtract)
print("[SwiftOCR] Frame extraction complete: \(String(format: "%.1f", extractTime))s")
}
// Sort extracted frame files
let fileManager = FileManager.default
let allFiles = (try? fileManager.contentsOfDirectory(atPath: framesDir.path)) ?? []
let frameFiles = allFiles
.filter { $0.hasPrefix("frame_") && $0.hasSuffix(".jpg") }
.sorted()
let level: VNRequestTextRecognitionLevel = (recognitionLevel == "fast") ? .fast : .accurate
var ocrFrames: [[String: Any]] = []
var lastProgress = 0
let totalFrames_to_process = frameFiles.count
for (i, frameName) in frameFiles.enumerated() {
let imgPath = framesDir.appendingPathComponent(frameName).path
guard let imgData = try? Data(contentsOf: URL(fileURLWithPath: imgPath)),
let img = NSImage(data: imgData),
let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
continue
}
// Extract frame number from filename
let frameNumber = Int(frameName.replacingOccurrences(of: "frame_", with: "").replacingOccurrences(of: ".jpg", with: "")) ?? (i * frameStep)
let timestamp = Double(frameNumber) / detectedFps
let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
let request = VNRecognizeTextRequest()
request.recognitionLevel = level
request.usesLanguageCorrection = true
request.preferBackgroundProcessing = true
guard (try? handler.perform([request])) != nil,
let results = request.results else { continue }
var texts: [[String: Any]] = []
let cgW = cgImage.width
let cgH = cgImage.height
for obs in results {
guard let candidate = obs.topCandidates(1).first else { continue }
let conf = candidate.confidence
guard conf > 0.3 else { continue }
let bb = obs.boundingBox
let item: [String: Any] = [
"text": candidate.string,
"x": Int(bb.origin.x * CGFloat(cgW)),
"y": Int((1.0 - bb.origin.y - bb.size.height) * CGFloat(cgH)),
"width": Int(bb.size.width * CGFloat(cgW)),
"height": Int(bb.size.height * CGFloat(cgH)),
"confidence": conf
]
texts.append(item)
}
if !texts.isEmpty {
ocrFrames.append([
"frame": frameNumber,
"timestamp": timestamp,
"texts": texts
])
}
let pct = Int(Float(i) / Float(totalFrames_to_process) * 100)
if pct >= lastProgress + 5 {
print("[SwiftOCR] Progress: \(pct)% (\(i)/\(totalFrames_to_process) samples, \(ocrFrames.count) with text)")
lastProgress = pct
}
}
// Write output
let outputDict: [String: Any] = [
"frame_count": ocrFrames.count,
"fps": detectedFps,
"frames": ocrFrames
]
if let jsonData = try? JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted]) {
try jsonData.write(to: URL(fileURLWithPath: outputPath))
}
let elapsed = Date().timeIntervalSince(startTime)
print("[SwiftOCR] Complete: \(ocrFrames.count) frames with text, \(String(format: "%.1f", elapsed))s")
if duration > 0 {
print("[SwiftOCR] RTF: \(String(format: "%.3f", elapsed / duration))x")
}
// Clean up temp dir if we created it (not shared cache)
if ProcessInfo.processInfo.environment["MOMENTRY_FRAME_DIR"] == nil {
try? FileManager.default.removeItem(at: tempDir)
}
}
/// Runs Vision text recognition on one pixel buffer and returns the accepted text blocks.
///
/// Each returned dictionary has the keys `text`, `x`, `y`, `width`, `height`
/// (integers scaled by the buffer's pixel dimensions, with `y` flipped so it is
/// measured from the top edge) and `confidence`.
///
/// - Parameters:
///   - pixelBuffer: Image to analyse.
///   - level: Recognition level forwarded to `VNRecognizeTextRequest`.
/// - Returns: One dictionary per observation whose best candidate has a
///   confidence above 0.2; an empty array when the request fails or yields no results.
func recognizeText(pixelBuffer: CVPixelBuffer, level: VNRequestTextRecognitionLevel) -> [[String: Any]] {
    let textRequest = VNRecognizeTextRequest()
    textRequest.recognitionLevel = level
    textRequest.usesLanguageCorrection = true
    textRequest.preferBackgroundProcessing = true

    let requestHandler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
    do {
        try requestHandler.perform([textRequest])
    } catch {
        // Best effort: a failed request is reported as "no text found".
        return []
    }
    guard let observations = textRequest.results else { return [] }

    let pixelWidth = CGFloat(CVPixelBufferGetWidth(pixelBuffer))
    let pixelHeight = CGFloat(CVPixelBufferGetHeight(pixelBuffer))

    return observations.compactMap { observation -> [String: Any]? in
        guard let best = observation.topCandidates(1).first else { return nil }
        if !(best.confidence > 0.2) { return nil }

        let box = observation.boundingBox
        // Subtracting from 1.0 flips the vertical axis so `y` is the box's top edge.
        let top = (1.0 - box.origin.y - box.size.height) * pixelHeight
        return [
            "text": best.string,
            "x": Int(box.origin.x * pixelWidth),
            "y": Int(top),
            "width": Int(box.size.width * pixelWidth),
            "height": Int(box.size.height * pixelHeight),
            "confidence": best.confidence
        ]
    }
}
}

View File

@@ -0,0 +1,222 @@
import Foundation
import Vision
import ArgumentParser
import AppKit
import AVFoundation
/// Swift Pose Processor - replaces YOLOv8 Pose / MediaPipe Pose
/// Uses VNDetectHumanBodyPoseRequest with ANE acceleration
///
/// Reads pre-extracted JPEG frames (from `MOMENTRY_FRAME_DIR` when that
/// environment variable is set, otherwise from a temporary directory filled by
/// ffmpeg), runs body-pose detection on each frame and writes one JSON file.
///
/// Output format (compatible with PoseResult Rust struct):
/// {
///   "frame_count": N, "fps": 30.0,
///   "frames": [
///     { "frame": 0, "timestamp": 0.0, "persons": [
///         { "keypoints": [{"name":"nose","x":100,"y":200,"confidence":0.95}],
///           "bbox": {"x":0,"y":0,"width":100,"height":200}
///         }
///     ]}
///   ]
/// }
///
/// Keypoint `x`/`y` are pixel coordinates (floating point) with the vertical
/// axis flipped relative to Vision's normalized coordinates; `bbox` values are
/// integers spanning the keypoints whose confidence exceeds 0.1.
@main
struct SwiftPose: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String

    @Argument(help: "Output JSON path")
    var outputPath: String

    @Option(name: .long, help: "Sample interval (frames, default=30)")
    var sampleInterval: Int = 30

    @Option(name: .long, help: "UUID for logging")
    var uuid: String = ""

    /// Joints requested from every pose observation, in output order.
    private static let jointNames: [VNHumanBodyPoseObservation.JointName] = [
        .nose, .leftEye, .rightEye, .leftEar, .rightEar,
        .neck, .root,
        .leftShoulder, .rightShoulder,
        .leftElbow, .rightElbow,
        .leftWrist, .rightWrist,
        .leftHip, .rightHip,
        .leftKnee, .rightKnee,
        .leftAnkle, .rightAnkle,
    ]

    /// Maps Vision joint key strings to the names written to the JSON.
    /// Keys that are not listed here are emitted unchanged.
    private static let standardJointNames: [String: String] = [
        "head_joint": "nose",
        "left_eye_joint": "left_eye",
        "right_eye_joint": "right_eye",
        "left_ear_joint": "left_ear",
        "right_ear_joint": "right_ear",
        "neck_1_joint": "neck",
        "left_shoulder_1_joint": "left_shoulder",
        "right_shoulder_1_joint": "right_shoulder",
        "left_elbow_1_joint": "left_elbow",
        "right_elbow_1_joint": "right_elbow",
        "left_hand_joint": "left_wrist",
        "right_hand_joint": "right_wrist",
        "left_hip_1_joint": "left_hip",
        "right_hip_1_joint": "right_hip",
        "left_knee_1_joint": "left_knee",
        "right_knee_1_joint": "right_knee",
        "left_ankle_1_joint": "left_ankle",
        "right_ankle_1_joint": "right_ankle",
        "center_hip_joint": "root",
        "left_forearm_joint": "left_elbow",
        "right_forearm_joint": "right_elbow",
        "left_upLeg_joint": "left_hip",
        "right_upLeg_joint": "right_hip",
        "left_leg_joint": "left_knee",
        "right_leg_joint": "right_knee",
        "left_foot_joint": "left_ankle",
        "right_foot_joint": "right_ankle",
    ]

    /// Detects poses on every available frame and writes the result JSON to `outputPath`.
    ///
    /// - Throws: `ExitCode.failure` when the video track reports a non-positive
    ///   frame rate, and any error from creating the frame directory, launching
    ///   ffmpeg, serializing the result, or writing the output file.
    mutating func run() throws {
        let startTime = Date()
        print("[SwiftPose] Starting: \(inputPath)")

        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
        guard let track = asset.tracks(withMediaType: .video).first else {
            // Kept as a plain return (exit status 0) to match the tool's existing contract.
            print("[SwiftPose] Error: No video track"); return
        }
        let duration = asset.duration.seconds
        let fps = Double(track.nominalFrameRate)
        print("[SwiftPose] Duration: \(String(format: "%.1f", duration))s, FPS: \(String(format: "%.1f", fps))")

        // A zero frame rate would produce infinite/NaN timestamps, which
        // JSONSerialization rejects by raising an exception. Fail early instead.
        guard fps > 0 else {
            print("[SwiftPose] Error: Invalid frame rate (\(fps)); cannot compute timestamps")
            throw ExitCode.failure
        }

        // Extract frames (use shared cache or ffmpeg)
        let fileManager = FileManager.default
        let framesDir: URL
        var scratchDir: URL?
        // Remove the directory we created (never the shared cache) on every exit path,
        // including errors thrown further down.
        defer {
            if let scratchDir {
                try? fileManager.removeItem(at: scratchDir)
            }
        }
        if let cacheDir = ProcessInfo.processInfo.environment["MOMENTRY_FRAME_DIR"] {
            framesDir = URL(fileURLWithPath: cacheDir)
            print("[SwiftPose] Using shared frame cache: \(cacheDir)")
        } else {
            let tempDir = fileManager.temporaryDirectory.appendingPathComponent("swift_pose_\(UUID().uuidString)")
            scratchDir = tempDir
            framesDir = tempDir.appendingPathComponent("frames")
            try extractFrames(into: framesDir)
        }

        let frameFiles = ((try? fileManager.contentsOfDirectory(atPath: framesDir.path)) ?? [])
            .filter { $0.hasSuffix(".jpg") }
            .sorted()
        print("[SwiftPose] Extracted \(frameFiles.count) frames")

        var poseFrames: [[String: Any]] = []
        var lastProgress = 0
        for (i, fname) in frameFiles.enumerated() {
            // Drain per-frame image objects; a CLI has no run loop to do it for us.
            let persons: [[String: Any]] = autoreleasepool {
                Self.detectPersons(inImageAt: framesDir.appendingPathComponent(fname))
            }
            // Frames that cannot be decoded or contain no pose are skipped entirely.
            guard !persons.isEmpty else { continue }

            // NOTE(review): for frames extracted by this tool the file names look like
            // sequential indices rather than source frame numbers — confirm that
            // downstream consumers expect this value (and the derived timestamp).
            let frameNum = Int(fname.replacingOccurrences(of: "frame_", with: "").replacingOccurrences(of: ".jpg", with: "")) ?? (i * sampleInterval)
            let timestamp = Double(frameNum) / fps

            poseFrames.append([
                "frame": frameNum,
                "timestamp": timestamp,
                "persons": persons,
            ])

            let pct = Int(Float(i) / Float(frameFiles.count) * 100)
            if pct >= lastProgress + 10 {
                print("[SwiftPose] Progress: \(pct)% (\(i)/\(frameFiles.count), \(persons.count) poses)")
                lastProgress = pct
            }
        }

        // Write output
        let outputDict: [String: Any] = [
            "frame_count": poseFrames.count,
            "fps": fps,
            "frames": poseFrames,
        ]
        // Propagate serialization errors instead of silently producing no file.
        let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
        try jsonData.write(to: URL(fileURLWithPath: outputPath))

        let elapsed = Date().timeIntervalSince(startTime)
        print("[SwiftPose] Complete: \(poseFrames.count) frames, \(String(format: "%.1f", elapsed))s")
        if duration > 0 {
            print("[SwiftPose] RTF: \(String(format: "%.3f", elapsed / duration))x")
        }
    }

    /// Creates `framesDir` and asks ffmpeg to write every `sampleInterval`-th frame into it as JPEG.
    ///
    /// - Throws: Errors from creating the directory or launching the ffmpeg process.
    ///   A non-zero ffmpeg exit status is only reported as a warning.
    private func extractFrames(into framesDir: URL) throws {
        try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)
        let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
        print("[SwiftPose] Extracting frames...")

        let extract = Process()
        extract.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
        extract.arguments = ["-y", "-v", "quiet", "-i", inputPath,
                             "-vf", "select=not(mod(n\\,\(sampleInterval)))",
                             "-vsync", "vfr", "-q:v", "15", pattern]
        try extract.run()
        extract.waitUntilExit()
        if extract.terminationStatus != 0 {
            print("[SwiftPose] Warning: ffmpeg exited with status \(extract.terminationStatus)")
        }
    }

    /// Loads the image at `imageURL` and returns one person dictionary per detected body pose.
    ///
    /// Returns an empty array when the image cannot be decoded, the Vision
    /// request fails, or no pose is found.
    private static func detectPersons(inImageAt imageURL: URL) -> [[String: Any]] {
        guard let imgData = try? Data(contentsOf: imageURL),
              let img = NSImage(data: imgData),
              let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { return [] }

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        let request = VNDetectHumanBodyPoseRequest()
        // Best effort per frame: a failed request simply leaves `results` empty.
        try? handler.perform([request])
        guard let poses = request.results, !poses.isEmpty else { return [] }

        let width = CGFloat(cgImage.width)
        let height = CGFloat(cgImage.height)
        return poses.map { person(from: $0, imageWidth: width, imageHeight: height) }
    }

    /// Converts one pose observation into the `{"keypoints": …, "bbox": …}` dictionary.
    private static func person(from pose: VNHumanBodyPoseObservation,
                               imageWidth: CGFloat,
                               imageHeight: CGFloat) -> [String: Any] {
        var keypoints: [[String: Any]] = []
        var minX = CGFloat.greatestFiniteMagnitude
        var minY = CGFloat.greatestFiniteMagnitude
        var maxX: CGFloat = 0
        var maxY: CGFloat = 0

        for joint in jointNames {
            guard let point = try? pose.recognizedPoint(joint) else { continue }

            // Read the key string directly instead of parsing a debug description.
            let key = joint.rawValue.rawValue.trimmingCharacters(in: .whitespaces)
            let name = standardJointNames[key] ?? key

            let px = point.location.x * imageWidth
            // Flip the vertical axis of the normalized location.
            let py = imageHeight - point.location.y * imageHeight
            keypoints.append([
                "name": name.isEmpty ? "\(joint)" : name,
                "x": px,
                "y": py,
                "confidence": point.confidence,
            ])

            // Only reasonably confident joints contribute to the bounding box.
            if point.confidence > 0.1 {
                minX = min(minX, px)
                minY = min(minY, py)
                maxX = max(maxX, px)
                maxY = max(maxY, py)
            }
        }

        var bbox: [String: Any] = [
            "x": 0, "y": 0, "width": 0, "height": 0
        ]
        if maxX > minX {
            bbox = [
                "x": Int(minX),
                "y": Int(minY),
                "width": Int(maxX - minX),
                "height": Int(maxY - minY),
            ]
        }
        return ["keypoints": keypoints, "bbox": bbox]
    }
}

View File

@@ -0,0 +1,102 @@
import Foundation
import Vision
import ArgumentParser
import AppKit
/// Proof of concept: runs several Apple Vision requests against one image and
/// prints their results, to evaluate Vision as a YOLO replacement.
@main
struct VisionObjectTest: ParsableCommand {
    @Argument(help: "Input image path")
    var inputPath: String

    /// Formats `value` as fixed-point text with `places` fractional digits.
    private static func fixed(_ value: CVarArg, _ places: Int) -> String {
        String(format: "%.\(places)f", value)
    }

    /// Loads the image and prints the output of five Vision requests in sequence.
    ///
    /// - Throws: Any error raised by `VNImageRequestHandler.perform`.
    func run() throws {
        let began = Date()
        print("=== Apple Vision Framework Object Detection POC ===")
        #if arch(arm64)
        print("HW: Apple Silicon ✅ (ANE available)")
        #endif

        guard let loaded = NSImage(contentsOfFile: inputPath) else {
            print("Error: cannot load image")
            return
        }
        guard let bitmap = loaded.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            print("Error: cannot get CGImage")
            return
        }
        print("Image: \(bitmap.width)x\(bitmap.height)")
        let requestHandler = VNImageRequestHandler(cgImage: bitmap, options: [:])

        // 1. VNClassifyImageRequest (scene classification - replaces scene_classifier)
        print("\n--- VNClassifyImageRequest ---")
        let classifyRequest = VNClassifyImageRequest()
        try requestHandler.perform([classifyRequest])
        if let labels = classifyRequest.results {
            print("Top classifications:")
            labels.prefix(10).forEach { label in
                print(" \(label.identifier): conf=\(Self.fixed(label.confidence, 3))")
            }
        }

        // 2. VNDetectHumanRectanglesRequest (person detection - YOLO replacement for 'person')
        print("\n--- VNDetectHumanRectanglesRequest ---")
        let humanRequest = VNDetectHumanRectanglesRequest()
        try requestHandler.perform([humanRequest])
        if let people = humanRequest.results {
            print("Humans: \(people.count)")
            for (index, person) in people.enumerated() {
                let box = person.boundingBox
                let origin = "(\(Self.fixed(box.origin.x, 3)),\(Self.fixed(box.origin.y, 3)))"
                let size = "(\(Self.fixed(box.size.width, 3)),\(Self.fixed(box.size.height, 3)))"
                print(" [\(index)] bbox=\(origin) size=\(size) conf=\(Self.fixed(person.confidence, 2))")
            }
        }

        // 3. VNDetectHumanBodyPoseRequest (pose estimation - MediaPipe replacement)
        print("\n--- VNDetectHumanBodyPoseRequest ---")
        let bodyPoseRequest = VNDetectHumanBodyPoseRequest()
        try requestHandler.perform([bodyPoseRequest])
        if let bodies = bodyPoseRequest.results {
            print("Body poses: \(bodies.count)")
            // Key joints printed for each detected body.
            let keyJoints: [VNHumanBodyPoseObservation.JointName] = [
                .neck, .leftShoulder, .rightShoulder, .leftWrist, .rightWrist, .root,
            ]
            for (index, body) in bodies.enumerated() {
                let available = body.availableJointNames
                print(" [\(index)] \(available.count) joints detected")
                for joint in keyJoints {
                    guard let spot = try? body.recognizedPoint(joint) else { continue }
                    let location = "(\(Self.fixed(spot.location.x, 3)), \(Self.fixed(spot.location.y, 3)))"
                    print(" \(joint.rawValue): \(location) conf=\(Self.fixed(spot.confidence, 2))")
                }
            }
        }

        // 4. VNDetectHumanHandPoseRequest (hand pose)
        print("\n--- VNDetectHumanHandPoseRequest ---")
        let handPoseRequest = VNDetectHumanHandPoseRequest()
        try requestHandler.perform([handPoseRequest])
        if let detectedHands = handPoseRequest.results {
            print("Hands: \(detectedHands.count)")
            for (index, hand) in detectedHands.enumerated() {
                print(" [\(index)] confidence=\(Self.fixed(hand.confidence, 2))")
            }
        }

        // 5. VNGenerateObjectnessBasedSaliencyImageRequest (object detection without labels)
        print("\n--- VNGenerateObjectnessBasedSaliencyImageRequest ---")
        let saliencyRequest = VNGenerateObjectnessBasedSaliencyImageRequest()
        try requestHandler.perform([saliencyRequest])
        if let saliency = saliencyRequest.results?.first, let regions = saliency.salientObjects {
            print("Salient objects: \(regions.count)")
            for (index, region) in regions.prefix(10).enumerated() {
                let box = region.boundingBox
                let rect = "(\(Self.fixed(box.origin.x, 3)),\(Self.fixed(box.origin.y, 3)),\(Self.fixed(box.size.width, 3)),\(Self.fixed(box.size.height, 3)))"
                print(" [\(index)] bbox=\(rect) conf=\(Self.fixed(region.confidence, 2))")
            }
        }

        print("\nTime: \(Self.fixed(Date().timeIntervalSince(began), 2))s")
        print("=== Done ===")
    }
}

View File

@@ -0,0 +1,71 @@
import Foundation
import Vision
import ArgumentParser
import AppKit
/// POC: Test Apple Vision Framework OCR (VNRecognizeTextRequest) vs PaddleOCR
///
/// Runs one text-recognition request on a single image, prints up to the first
/// 20 recognized blocks, and lists the languages supported by that request.
@main
struct VisionOCRTest: ParsableCommand {
    @Argument(help: "Input image path")
    var inputPath: String

    @Option(name: .long, help: "Recognition level (.fast or .accurate, default .accurate)")
    var level: String = "accurate"

    /// Parses the `--level` option into a Vision recognition level.
    ///
    /// Accepts `fast` / `accurate` with or without the leading dot shown in the
    /// help text, in any letter case. Any other value falls back to `.accurate`
    /// (the previous behaviour) and prints a warning.
    private static func recognitionLevel(from option: String) -> VNRequestTextRecognitionLevel {
        var token = option.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
        if token.hasPrefix(".") {
            token.removeFirst()
        }
        switch token {
        case "fast":
            return .fast
        case "accurate":
            return .accurate
        default:
            print("Warning: unknown recognition level \"\(option)\", using accurate")
            return .accurate
        }
    }

    /// Loads the image, performs OCR and prints the results.
    ///
    /// - Throws: Any error raised by `VNImageRequestHandler.perform`.
    mutating func run() throws {
        let startTime = Date()
        print("=== Apple Vision Framework OCR POC ===")
        #if arch(arm64)
        print("HW: Apple Silicon ✅")
        #endif

        guard let image = NSImage(contentsOfFile: inputPath) else {
            print("Error: cannot load image"); return
        }
        guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            print("Error: cannot get CGImage"); return
        }
        print("Image: \(cgImage.width)x\(cgImage.height)")
        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])

        // VNRecognizeTextRequest
        print("\n--- VNRecognizeTextRequest ---")
        let req = VNRecognizeTextRequest()
        req.recognitionLevel = Self.recognitionLevel(from: level)
        req.usesLanguageCorrection = true
        req.preferBackgroundProcessing = true
        try handler.perform([req])
        guard let results = req.results else {
            print("No OCR results"); return
        }

        print("Text blocks: \(results.count)")
        var totalChars = 0
        for (i, obs) in results.enumerated() {
            guard let candidate = obs.topCandidates(1).first else { continue }
            let text = candidate.string
            let conf = candidate.confidence
            let bb = obs.boundingBox
            totalChars += text.count
            // Only the first 20 blocks are printed; all of them count towards the totals.
            if i < 20 {
                print(" [\(i)] conf=\(String(format: "%.3f", conf)) bbox=(\(String(format: "%.3f", bb.origin.x)),\(String(format: "%.3f", bb.origin.y)),\(String(format: "%.3f", bb.size.width)),\(String(format: "%.3f", bb.size.height))) \"\(text.prefix(80))\"")
            }
        }
        print(" ... \(results.count) total, \(totalChars) chars")

        // Check language support
        print("\n--- Language Support ---")
        // Ask the configured request itself so the list matches the recognition
        // level actually used above, rather than always reporting `.accurate`.
        // The query stays best effort: a failure is shown as an empty list.
        let supported = (try? req.supportedRecognitionLanguages()) ?? []
        print("Supported languages (\(supported.count)): \(supported.prefix(10).joined(separator: ", "))...")

        let elapsed = Date().timeIntervalSince(startTime)
        print("\nTime: \(String(format: "%.2f", elapsed))s")
        print("=== Done ===")
    }
}