feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)
Phase 2.6.1: co_occurrence_edges migration - build_co_occurrence_edges_from_qdrant() - Qdrant embeddings → frame grouping → YOLO objects - Result: 6679 edges (vs 6701 PostgreSQL) Phase 2.6.2: face_face_edges migration - build_face_face_edges_from_qdrant() - Qdrant embeddings → frame grouping → face pairs - mutual_gaze detection preserved - Result: 6 edges (exact match) Phase 2.6.3: speaker_face_edges migration - build_speaker_face_edges_from_qdrant() - Qdrant embeddings → trace_id frame ranges - SPEAKS_AS edge creation Architecture: - All edges use Qdrant payload (no face_detections queries) - PostgreSQL fallback for empty Qdrant - Estimated 3.6x performance improvement Testing: - Playground (3003): ✓ All Phase 2.6 logs verified - Edge counts: ✓ Close match with PostgreSQL - Fallback: ✓ Working Docs: - docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md - docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
14
v1.1/scripts/swift_processors/Package_v1.11.resolved
Normal file
14
v1.1/scripts/swift_processors/Package_v1.11.resolved
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"pins" : [
|
||||
{
|
||||
"identity" : "swift-argument-parser",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://github.com/apple/swift-argument-parser",
|
||||
"state" : {
|
||||
"revision" : "626b5b7b2f45e1b0b1c6f4a309296d1d21d7311b",
|
||||
"version" : "1.7.1"
|
||||
}
|
||||
}
|
||||
],
|
||||
"version" : 2
|
||||
}
|
||||
114
v1.1/scripts/swift_processors/Package_v1.11.swift
Normal file
114
v1.1/scripts/swift_processors/Package_v1.11.swift
Normal file
@@ -0,0 +1,114 @@
|
||||
// swift-tools-version: 5.9
|
||||
import PackageDescription
|
||||
|
||||
/// Dependency on the `ArgumentParser` product, shared by most executables below.
let argumentParser: Target.Dependency = .product(
    name: "ArgumentParser",
    package: "swift-argument-parser"
)

/// Declares one executable target rooted at the package directory.
///
/// Every processor in this package is a single source file named
/// `<name>_v1.11.swift`, so the file name is derived from the target name.
///
/// - Parameters:
///   - name: Target (and executable) name.
///   - usesArgumentParser: Whether the target links `ArgumentParser`.
func processor(_ name: String, usesArgumentParser: Bool = true) -> Target {
    return Target.executableTarget(
        name: name,
        dependencies: usesArgumentParser ? [argumentParser] : [],
        path: ".",
        sources: ["\(name)_v1.11.swift"]
    )
}

let package = Package(
    name: "SwiftProcessors",
    platforms: [
        .macOS(.v14)
    ],
    dependencies: [
        .package(url: "https://github.com/apple/swift-argument-parser", from: "1.3.0"),
    ],
    targets: [
        processor("asr_swift"),
        processor("asrx_swift"),
        // The two speaker tests are the only targets without ArgumentParser.
        processor("speaker_test", usesArgumentParser: false),
        processor("speaker_meta_test", usesArgumentParser: false),
        processor("face_vision_test"),
        processor("vision_object_test"),
        processor("swift_cut_test"),
        processor("vision_ocr_test"),
        processor("swift_ocr"),
        processor("face_compare_test"),
        processor("pose_benchmark"),
        processor("swift_pose"),
        processor("swift_face"),
    ]
)
|
||||
24
v1.1/scripts/swift_processors/asr_swift/Info_v1.11.plist
Normal file
24
v1.1/scripts/swift_processors/asr_swift/Info_v1.11.plist
Normal file
@@ -0,0 +1,24 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>CFBundleExecutable</key>
|
||||
<string>asr_swift</string>
|
||||
<key>CFBundleIdentifier</key>
|
||||
<string>com.momentry.asr-swift</string>
|
||||
<key>CFBundleName</key>
|
||||
<string>ASR Swift Processor</string>
|
||||
<key>CFBundleVersion</key>
|
||||
<string>1.0</string>
|
||||
<key>CFBundleShortVersionString</key>
|
||||
<string>1.0.0</string>
|
||||
<key>CFBundlePackageType</key>
|
||||
<string>APPL</string>
|
||||
<key>LSUIElement</key>
|
||||
<true/>
|
||||
<key>NSMicrophoneUsageDescription</key>
|
||||
<string>Momentry ASR needs microphone access for speech recognition</string>
|
||||
<key>NSSpeechRecognitionUsageDescription</key>
|
||||
<string>Momentry ASR uses speech recognition to transcribe audio</string>
|
||||
</dict>
|
||||
</plist>
|
||||
254
v1.1/scripts/swift_processors/asr_swift_v1.11.swift
Normal file
254
v1.1/scripts/swift_processors/asr_swift_v1.11.swift
Normal file
@@ -0,0 +1,254 @@
|
||||
import Foundation
|
||||
import Speech
|
||||
import ArgumentParser
|
||||
|
||||
/// Swift CLI 處理器:使用 Apple Speech Framework 進行語音辨識
|
||||
/// 作為 Python ASR (faster-whisper) 的替代方案
|
||||
///
|
||||
/// 比較項目:
|
||||
/// - Speech Framework 使用 Apple 內建模型(ANE 加速)
|
||||
/// - 無需下載模型,系統內建
|
||||
/// - 支援即時與批次辨識
|
||||
/// - 語言支援依作業系統版本
|
||||
|
||||
/// Merges word-level segments into sentence-level segments.
///
/// Consecutive segments are joined while the gap between the previous `end`
/// and the next `start` is below 0.5 s; a gap of 0.5 s or more starts a new
/// sentence. Texts are joined with a single space, `end` is taken from the last
/// merged word, and `confidence` is the running mean over the merged words.
///
/// - Parameter segments: Dictionaries with `start`, `end` (seconds), `text`
///   and `confidence`. Numeric values may be boxed as `Double`, `Float`, `Int`
///   or `NSNumber`. Entries missing any of the four fields are skipped.
/// - Returns: Dictionaries holding exactly `start` (`Double`), `end`
///   (`Double`), `text` (`String`) and `confidence` (`Float`). Any other key of
///   the input segments (e.g. `speaker_id`) is not carried over.
func mergeWordSegments(_ segments: [[String: Any]]) -> [[String: Any]] {
    let gapThreshold: TimeInterval = 0.5

    /// Reads a numeric value regardless of how it was boxed into `Any`.
    /// A plain `as? Float` cast fails for a value stored as `Double`, which
    /// used to drop the whole segment silently.
    func number(_ value: Any?) -> Double? {
        guard let value = value else { return nil }
        switch value {
        case let v as Double: return v
        case let v as Float: return Double(v)
        case let v as Int: return Double(v)
        case let v as NSNumber: return v.doubleValue
        default: return nil
        }
    }

    /// Sentence currently being accumulated.
    struct Sentence {
        var start: TimeInterval
        var end: TimeInterval
        var text: String
        var confidence: Float
        var wordCount: Int

        var dictionary: [String: Any] {
            return ["start": start, "end": end, "text": text, "confidence": confidence]
        }
    }

    var merged: [[String: Any]] = []
    var current: Sentence?

    for seg in segments {
        guard let start = number(seg["start"]),
              let end = number(seg["end"]),
              let text = seg["text"] as? String,
              let confValue = number(seg["confidence"]) else {
            continue
        }
        let conf = Float(confValue)

        if var sentence = current, start - sentence.end < gapThreshold {
            // Same sentence: extend the text and fold the word into the mean.
            sentence.text += (sentence.text.hasSuffix(" ") ? "" : " ") + text
            sentence.end = end
            let totalWords = sentence.wordCount + 1
            sentence.confidence =
                (sentence.confidence * Float(sentence.wordCount) + conf) / Float(totalWords)
            sentence.wordCount = totalWords
            current = sentence
        } else {
            // Sentence boundary (or very first word): flush and start over.
            if let finished = current {
                merged.append(finished.dictionary)
            }
            current = Sentence(start: start, end: end, text: text, confidence: conf, wordCount: 1)
        }
    }

    if let last = current {
        merged.append(last.dictionary)
    }
    return merged
}
|
||||
|
||||
/// Command-line entry point: transcribes an audio or video file with Apple's
/// Speech framework and writes the result as JSON.
///
/// The output object contains `language`, `segments`, `processing_time`,
/// `model` and `total_segments`; failure results carry an `error` string and
/// an empty `segments` array instead.
@main
struct ASRSwift: ParsableCommand {
    @Argument(help: "音訊/影片檔案路徑")
    var inputPath: String

    @Argument(help: "輸出 JSON 路徑")
    var outputPath: String

    @Option(name: .long, help: "UUID for Redis")
    var uuid: String = ""

    @Option(name: .long, help: "語言 (留空則自動嘗試支援的語種)")
    var language: String = ""

    /// Resolves the locale identifier to recognize with.
    ///
    /// Returns `language` when it was given. Otherwise returns the first
    /// candidate locale for which a recognizer exists and reports itself
    /// available, falling back to `"en-US"`. Note that this does not inspect
    /// the audio: it only checks recognizer availability.
    func detectLanguage() -> String {
        guard language.isEmpty else { return language }

        // Candidates are tried in this fixed priority order.
        let candidates = ["zh-TW", "zh-Hans", "en-US", "ja-JP", "ko-KR"]
        for localeId in candidates {
            if let reco = SFSpeechRecognizer(locale: Locale(identifier: localeId)), reco.isAvailable {
                print("[ASR_Swift] Auto-detected language: \(localeId)")
                return localeId
            }
        }
        return "en-US"
    }

    /// Serializes `payload` as pretty-printed JSON to `outputPath`.
    private func writeJSON(_ payload: [String: Any]) throws {
        let jsonData = try JSONSerialization.data(withJSONObject: payload, options: [.prettyPrinted])
        try jsonData.write(to: URL(fileURLWithPath: outputPath))
    }

    /// Runs the whole pipeline: optional audio extraction, authorization,
    /// recognition, word-to-sentence merging and JSON output.
    ///
    /// - Throws: File-system, `Process` and JSON errors, and an `NSError`
    ///   (domain `ASRSwift`, code 1) when ffmpeg fails to produce the WAV file.
    mutating func run() throws {
        let startTime = Date()

        print("[ASR_Swift] Starting: \(inputPath)")
        print("[ASR_Swift] Language: \(language)")
        print("[ASR_Swift] Output: \(outputPath)")

        // 1. Extract audio when the input is a video container.
        let audioURL: URL
        let ext = (inputPath as NSString).pathExtension.lowercased()
        let tempDir = FileManager.default.temporaryDirectory
            .appendingPathComponent("asr_swift_\(UUID().uuidString)")
        var createdTempDir = false
        // Remove the scratch directory on every exit path, but only if this
        // run actually created it.
        defer {
            if createdTempDir {
                try? FileManager.default.removeItem(at: tempDir)
            }
        }

        if ["mp4", "mov", "mkv", "avi"].contains(ext) {
            // ffmpeg does not create missing parent directories, so the
            // scratch directory has to exist before it runs.
            try FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true)
            createdTempDir = true

            let wavPath = tempDir.appendingPathComponent("audio.wav").path
            print("[ASR_Swift] Extracting audio from video...")
            let proc = Process()
            proc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
            proc.arguments = ["-y", "-v", "quiet", "-i", inputPath,
                              "-ar", "16000", "-ac", "1", wavPath]
            try proc.run()
            proc.waitUntilExit()
            guard proc.terminationStatus == 0,
                  FileManager.default.fileExists(atPath: wavPath) else {
                print("[ASR_Swift] Error: ffmpeg failed to extract audio")
                throw NSError(domain: "ASRSwift", code: 1, userInfo: nil)
            }
            audioURL = URL(fileURLWithPath: wavPath)
            print("[ASR_Swift] Audio extracted: \(wavPath)")
        } else {
            audioURL = URL(fileURLWithPath: inputPath)
        }

        // 2. Speech recognition.
        print("[ASR_Swift] Starting recognition...")

        // Request authorization and block until the callback delivers it.
        let authGroup = DispatchGroup()
        authGroup.enter()
        var authStatus: SFSpeechRecognizerAuthorizationStatus = .notDetermined
        SFSpeechRecognizer.requestAuthorization { status in
            authStatus = status
            authGroup.leave()
        }
        authGroup.wait()

        let finalLang = detectLanguage()

        guard authStatus == .authorized else {
            print("[ASR_Swift] Speech recognition not authorized: \(authStatus.rawValue)")
            // Leave an explicit failure result instead of exiting with no output.
            try writeJSON([
                "language": finalLang,
                "segments": [],
                "processing_time": Date().timeIntervalSince(startTime),
                "model": "Apple Speech Framework",
                "error": "Speech recognition not authorized"
            ])
            return
        }
        print("[ASR_Swift] Speech recognition authorized")

        guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: finalLang)),
              recognizer.isAvailable else {
            print("[ASR_Swift] Error: Speech recognizer not available for \(finalLang)")
            // Emit an empty result so the caller still gets a JSON file.
            try writeJSON([
                "language": finalLang,
                "segments": [],
                "processing_time": Date().timeIntervalSince(startTime),
                "model": "Apple Speech Framework",
                "error": "Recognizer not available"
            ])
            return
        }

        let request = SFSpeechURLRecognitionRequest(url: audioURL)
        request.shouldReportPartialResults = false
        request.taskHint = .dictation

        var allSegments: [[String: Any]] = []
        let semaphore = DispatchSemaphore(value: 0)

        let task = recognizer.recognitionTask(with: request) { result, error in
            if let error = error {
                print("[ASR_Swift] Recognition error: \(error.localizedDescription)")
                semaphore.signal()
                return
            }

            guard let result = result, result.isFinal else { return }

            let duration = Date().timeIntervalSince(startTime)
            print("[ASR_Swift] Recognition completed in \(String(format: "%.2f", duration))s")

            // Convert the transcription into the segment dictionary format.
            for segment in result.bestTranscription.segments {
                allSegments.append([
                    "start": segment.timestamp,
                    "end": segment.timestamp + segment.duration,
                    "text": segment.substring,
                    "speaker_id": NSNull(),
                    "confidence": segment.confidence
                ])
            }

            // Merge word-level segments into sentences.
            if !allSegments.isEmpty {
                let beforeCount = allSegments.count
                allSegments = mergeWordSegments(allSegments)
                print("[ASR_Swift] Merged segments: \(beforeCount) → \(allSegments.count)")
            }

            // No segments at all: fall back to one segment with the full text.
            // NOTE(review): its "end" is elapsed processing time, not audio
            // duration — confirm that this is what consumers expect.
            if allSegments.isEmpty {
                allSegments.append([
                    "start": 0.0,
                    "end": Date().timeIntervalSince(startTime),
                    "text": result.bestTranscription.formattedString,
                    "speaker_id": NSNull(),
                    "confidence": 1.0
                ])
            }

            semaphore.signal()
        }

        // Keep servicing the run loop while waiting; polling the semaphore
        // ends the wait as soon as the callback fires, without a fixed timeout.
        while semaphore.wait(timeout: .now()) == .timedOut {
            RunLoop.current.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
        }
        task.cancel()

        // 3. Write the JSON result, reporting the locale actually used.
        try writeJSON([
            "language": finalLang,
            "segments": allSegments,
            "processing_time": Date().timeIntervalSince(startTime),
            "model": "Apple Speech Framework (ANE accelerated)",
            "total_segments": allSegments.count
        ])

        print("[ASR_Swift] Saved \(allSegments.count) segments to \(outputPath)")
        print("[ASR_Swift] Total time: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
    }
}
|
||||
183
v1.1/scripts/swift_processors/asrx_swift_v1.11.swift
Normal file
183
v1.1/scripts/swift_processors/asrx_swift_v1.11.swift
Normal file
@@ -0,0 +1,183 @@
|
||||
import Foundation
|
||||
import Speech
|
||||
import ArgumentParser
|
||||
|
||||
/// Swift ASRX Processor
|
||||
/// Speaker Diarization via Apple Speech Framework
|
||||
///
|
||||
/// 使用 SFSpeechRecognizer 進行語音辨識並嘗試分離說話人
|
||||
/// 目前 Apple Speech Framework 不直接支援 speaker diarization,
|
||||
/// 此實作透過音訊分段 + 逐段辨識來近似 diarization 效果
|
||||
|
||||
/// Command-line entry point: recognizes speech in fixed-length chunks and
/// writes the chunks as JSON segments with an alternating speaker label.
///
/// The speaker label is `SPEAKER_<chunk index % 2>`; no acoustic speaker
/// analysis is performed by this code.
@main
struct ASRXSwift: ParsableCommand {
    @Argument(help: "音訊/影片檔案路徑")
    var inputPath: String

    @Argument(help: "輸出 JSON 路徑")
    var outputPath: String

    @Option(name: .long, help: "UUID for Redis")
    var uuid: String = ""

    @Option(name: .long, help: "語言 (留空自動偵測)")
    var language: String = ""

    @Option(name: .long, help: "分段長度(秒),預設 5 秒")
    var segmentDuration: Double = 5.0

    /// Splits the audio into `segmentDuration`-second chunks, recognizes each
    /// chunk separately and writes the combined result to `outputPath`.
    ///
    /// - Throws: `ValidationError` for a non-positive chunk length, audio-file
    ///   and JSON errors, and `NSError` (domain `ASRXSwift`) when a buffer,
    ///   audio format or recognizer cannot be created.
    mutating func run() throws {
        let startTime = Date()
        print("[ASRX_Swift] Starting: \(inputPath)")

        guard segmentDuration > 0 else {
            throw ValidationError("--segment-duration must be greater than 0")
        }

        // 1. Extract (or copy) the audio into a private working directory.
        let audioURL = extractAudio(from: inputPath)
        let workDir = audioURL.deletingLastPathComponent()
        defer { try? FileManager.default.removeItem(at: workDir) }

        // 2. Inspect the audio.
        let audioFile = try AVAudioFile(forReading: audioURL)
        let format = audioFile.processingFormat
        let totalFrames = audioFile.length
        let duration = Double(totalFrames) / format.sampleRate
        print("[ASRX_Swift] Audio: \(totalFrames) frames, \(String(format: "%.1f", duration))s, \(format.sampleRate)Hz")

        // 3. Load the whole file into memory.
        guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(totalFrames)) else {
            throw NSError(domain: "ASRXSwift", code: 1, userInfo: [NSLocalizedDescriptionKey: "Failed to create buffer"])
        }
        try audioFile.read(into: buffer)

        guard let channelData = buffer.floatChannelData else {
            throw NSError(domain: "ASRXSwift", code: 2, userInfo: [NSLocalizedDescriptionKey: "No float data"])
        }
        // Only channel 0 is used. It holds `frameLength` samples, so the
        // channel count must not be multiplied into the readable range.
        let channel0 = channelData[0]
        let loadedFrames = Int(buffer.frameLength)

        // 4. Chunked recognition.
        let finalLang = resolveLanguage()
        guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: finalLang)) else {
            throw NSError(domain: "ASRXSwift", code: 3,
                          userInfo: [NSLocalizedDescriptionKey: "No speech recognizer for locale \(finalLang)"])
        }
        // Chunks are written as mono at the source sample rate. The buffer
        // handed to AVAudioFile.write(from:) has to use the file's processing
        // format, so both sides use Float32 here.
        guard let monoFormat = AVAudioFormat(commonFormat: .pcmFormatFloat32,
                                             sampleRate: format.sampleRate,
                                             channels: 1,
                                             interleaved: false) else {
            throw NSError(domain: "ASRXSwift", code: 4,
                          userInfo: [NSLocalizedDescriptionKey: "Failed to create segment format"])
        }
        // On-disk format of each chunk: 16-bit integer PCM WAV.
        let wavSettings: [String: Any] = [
            AVFormatIDKey: kAudioFormatLinearPCM,
            AVSampleRateKey: format.sampleRate,
            AVNumberOfChannelsKey: 1,
            AVLinearPCMBitDepthKey: 16,
            AVLinearPCMIsFloatKey: false,
        ]

        let frameStep = Int(segmentDuration * format.sampleRate)
        guard frameStep > 0 else {
            throw ValidationError("--segment-duration is too short for this audio")
        }
        let totalSegments = Int(ceil(duration / segmentDuration))

        print("[ASRX_Swift] Splitting into \(totalSegments) segments of \(Int(segmentDuration))s")
        print("[ASRX_Swift] Language: \(finalLang)")
        print("[ASRX_Swift] Starting diarization...")

        var segments: [[String: Any]] = []
        var processedCount = 0

        for segIdx in 0..<totalSegments {
            let startFrame = segIdx * frameStep
            let endFrame = min(startFrame + frameStep, loadedFrames)
            let segLength = endFrame - startFrame
            // Skip chunks shorter than 0.5 s (also covers an empty tail).
            guard Double(segLength) > format.sampleRate * 0.5 else { continue }

            // Copy this chunk of channel 0 into its own mono buffer.
            guard let segBuffer = AVAudioPCMBuffer(pcmFormat: monoFormat,
                                                   frameCapacity: AVAudioFrameCount(segLength)),
                  let segChannels = segBuffer.floatChannelData else {
                print("[ASRX_Swift] Segment \(segIdx) error: could not allocate buffer")
                continue
            }
            segBuffer.frameLength = AVAudioFrameCount(segLength)
            segChannels[0].initialize(from: UnsafePointer(channel0 + startFrame), count: segLength)

            // Write the chunk next to the extracted audio so that concurrent
            // runs cannot overwrite each other's files.
            let segURL = workDir.appendingPathComponent("seg_\(segIdx).wav")
            defer { try? FileManager.default.removeItem(at: segURL) }
            do {
                // Scoped so the file is released before the recognizer reads it.
                let segFile = try AVAudioFile(forWriting: segURL, settings: wavSettings,
                                              commonFormat: .pcmFormatFloat32, interleaved: false)
                try segFile.write(from: segBuffer)
            }

            // Recognize the chunk.
            let semaphore = DispatchSemaphore(value: 0)
            var segText = ""
            var segConfidence: Float = 0

            let request = SFSpeechURLRecognitionRequest(url: segURL)
            request.shouldReportPartialResults = false
            request.requiresOnDeviceRecognition = true

            let task = recognizer.recognitionTask(with: request) { result, error in
                if let error = error {
                    print("[ASRX_Swift] Segment \(segIdx) error: \(error.localizedDescription)")
                } else if let result = result, result.isFinal {
                    segText = result.bestTranscription.formattedString
                    // Confidence is taken from the first transcription segment only.
                    if let firstSeg = result.bestTranscription.segments.first {
                        segConfidence = firstSeg.confidence
                    }
                }
                semaphore.signal()
            }

            // Service the run loop until the callback signals, giving up after
            // 10 s, rather than always sleeping the full 10 s per chunk.
            let deadline = Date(timeIntervalSinceNow: 10)
            while semaphore.wait(timeout: .now()) == .timedOut && Date() < deadline {
                RunLoop.current.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
            }
            task.cancel()

            guard !segText.isEmpty else { continue }
            let startSeconds = Double(startFrame) / format.sampleRate
            let endSeconds = Double(endFrame) / format.sampleRate
            segments.append([
                "start_time": startSeconds,
                "end_time": endSeconds,
                // NOTE(review): frame indices assume 30 fps video — confirm with consumers.
                "start_frame": Int(startSeconds * 30),
                "end_frame": Int(endSeconds * 30),
                "text": segText,
                "speaker_id": "SPEAKER_\(segIdx % 2)", // simple alternation used as the speaker tag
                "confidence": segConfidence,
            ])
            processedCount += 1
        }

        // 5. Write the JSON result.
        let outputDict: [String: Any] = [
            "language": finalLang,
            "segments": segments,
            "total_segments": processedCount,
            "total_duration": duration,
            "processing_time": Date().timeIntervalSince(startTime),
            "speaker_count": 2,
            "model": "Apple Speech Framework (segmented diarization)",
        ]

        let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
        try jsonData.write(to: URL(fileURLWithPath: outputPath))

        print("[ASRX_Swift] Output: \(processedCount) segments to \(outputPath)")
        print("[ASRX_Swift] Total: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
    }

    /// Places the input's audio at `<fresh temp dir>/audio.wav` and returns
    /// that URL.
    ///
    /// Video containers (mp4/mov/mkv/avi) are converted with ffmpeg to 16 kHz
    /// mono; any other file is copied unchanged. The caller owns the returned
    /// file's parent directory and is expected to delete it.
    ///
    /// Traps (`try!`) when the directory cannot be created, ffmpeg cannot be
    /// launched, or the copy fails. The ffmpeg exit status is not checked here;
    /// a failed conversion surfaces when the caller opens the returned file.
    func extractAudio(from path: String) -> URL {
        let tempDir = FileManager.default.temporaryDirectory
            .appendingPathComponent("asrx_\(UUID().uuidString)")
        try! FileManager.default.createDirectory(at: tempDir, withIntermediateDirectories: true)
        let wavURL = tempDir.appendingPathComponent("audio.wav")

        let ext = (path as NSString).pathExtension.lowercased()
        guard ["mp4", "mov", "mkv", "avi"].contains(ext) else {
            try! FileManager.default.copyItem(at: URL(fileURLWithPath: path), to: wavURL)
            return wavURL
        }

        print("[ASRX_Swift] Extracting audio from video...")
        let proc = Process()
        proc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
        proc.arguments = ["-y", "-v", "quiet", "-i", path, "-ar", "16000", "-ac", "1", wavURL.path]
        try! proc.run()
        proc.waitUntilExit()
        return wavURL
    }

    /// Resolves the locale identifier to recognize with.
    ///
    /// Returns `language` when it was given; otherwise the first candidate
    /// locale with an available recognizer, falling back to `"en-US"`. The
    /// audio itself is not inspected.
    func resolveLanguage() -> String {
        guard language.isEmpty else { return language }

        let candidates = ["zh-TW", "zh-Hans", "en-US", "ja-JP", "ko-KR"]
        for localeId in candidates {
            if let reco = SFSpeechRecognizer(locale: Locale(identifier: localeId)), reco.isAvailable {
                print("[ASRX_Swift] Auto-detected language: \(localeId)")
                return localeId
            }
        }
        return "en-US"
    }
}
|
||||
124
v1.1/scripts/swift_processors/body_pose_scanner_v1.11.swift
Normal file
124
v1.1/scripts/swift_processors/body_pose_scanner_v1.11.swift
Normal file
@@ -0,0 +1,124 @@
|
||||
#!/usr/bin/env swift
|
||||
import Foundation
|
||||
import Vision
|
||||
import AVFoundation
|
||||
import ArgumentParser
|
||||
|
||||
/// Full-movie body pose scanner: compute head-to-body ratio for every frame
|
||||
/// with face detections. Outputs JSONL (one object per frame).
|
||||
/// Command-line entry point: runs Vision body-pose detection on selected video
/// frames and writes one JSON object per detected pose (JSONL).
///
/// Each row holds the frame index, timestamp, head and body extents in
/// pixels (top-left origin), the head-to-body ratio and the joint positions
/// that passed the confidence threshold.
@main
struct BodyPoseScanner: ParsableCommand {
    @Argument(help: "Video file path")
    var videoPath: String

    @Argument(help: "Output JSONL path")
    var outputPath: String

    @Option(help: "Frames to scan (comma-separated, e.g. '840,900,960') or 'all' to scan everything")
    var frames: String = "all"

    @Option(help: "Sample interval (every N frames, for 'all' mode)")
    var interval: Int = 60

    /// Decodes the video sequentially, analyses the requested frames and
    /// appends one JSON line per detected pose to `outputPath`.
    ///
    /// Setup failures are reported on stdout and end the run without throwing.
    func run() throws {
        let asset = AVAsset(url: URL(fileURLWithPath: videoPath))
        guard let reader = try? AVAssetReader(asset: asset) else {
            print("[BodyPose] Cannot open video"); return
        }
        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("[BodyPose] No video track"); return
        }

        let fps = Double(videoTrack.nominalFrameRate)
        let seconds = videoTrack.timeRange.duration.seconds
        // A zero frame rate or unknown duration would make the frame count and
        // the per-row timestamps meaningless.
        guard fps > 0, seconds.isFinite else {
            print("[BodyPose] Cannot determine frame rate or duration"); return
        }
        let totalFrames = Int(seconds * fps)

        // Work out which frame indices to analyse.
        var targetFrames: Set<Int>
        if frames == "all" {
            guard interval > 0 else {
                print("[BodyPose] --interval must be greater than 0"); return
            }
            targetFrames = Set(stride(from: 0, to: totalFrames, by: interval))
        } else {
            targetFrames = Set(frames.split(separator: ",").compactMap {
                Int($0.trimmingCharacters(in: .whitespaces))
            })
        }

        // FileHandle(forWritingAtPath:) returns nil for a missing file, so the
        // output is created (or truncated) first.
        guard FileManager.default.createFile(atPath: outputPath, contents: nil),
              let fh = FileHandle(forWritingAtPath: outputPath) else {
            print("[BodyPose] Cannot create output"); return
        }
        defer { fh.closeFile() }

        let readerOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: [
            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_420YpCbCr8BiPlanarVideoRange
        ])
        readerOutput.alwaysCopiesSampleData = false
        reader.add(readerOutput)
        guard reader.startReading() else {
            print("[BodyPose] Cannot start reading video"); return
        }
        defer { reader.cancelReading() }

        // Lower-body joints considered when looking for the body's bottom edge.
        let lowerBodyKeys = ["right_ankle_joint_y", "left_ankle_joint_y",
                             "right_knee_joint_y", "left_knee_joint_y",
                             "right_hip_joint_y", "left_hip_joint_y"]

        var frameCount = 0
        var framesWritten = 0

        while let sampleBuffer = readerOutput.copyNextSampleBuffer() {
            defer { frameCount += 1 }
            guard targetFrames.contains(frameCount) else { continue }
            targetFrames.remove(frameCount)

            if let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) {
                // A fresh request per frame, so a failed perform can never
                // expose the previous frame's results.
                let bodyRequest = VNDetectHumanBodyPoseRequest()
                let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
                let poses: [VNHumanBodyPoseObservation]
                do {
                    try handler.perform([bodyRequest])
                    poses = bodyRequest.results ?? []
                } catch {
                    poses = []
                }

                let imgW = CGFloat(CVPixelBufferGetWidth(pixelBuffer))
                let imgH = CGFloat(CVPixelBufferGetHeight(pixelBuffer))
                var wroteRow = false

                for obs in poses {
                    guard let pts = try? obs.recognizedPoints(.all) else { continue }

                    var joints: [String: CGFloat] = [:]
                    for (name, pt) in pts where pt.confidence > 0.3 {
                        // Key by the joint's raw string so the literal lookups
                        // below ("head_joint_y", ...) can match.
                        let key = name.rawValue.rawValue
                        // Convert Vision (bottom-left origin, normalized) to
                        // pixels with a top-left origin.
                        joints[key] = pt.location.x * imgW
                        joints[key + "_y"] = imgH - pt.location.y * imgH
                    }

                    // Head position, falling back through neck and eye joints.
                    let headY = joints["head_joint_y"] ?? joints["neck_1_joint_y"] ??
                        joints["neck_2_joint_y"] ?? joints["right_eye_joint_y"] ?? 0

                    // Lowest visible body point. With a top-left origin that is
                    // the LARGEST y among the lower-body joints.
                    let bodyBottom = lowerBodyKeys
                        .compactMap { joints[$0] }
                        .filter { $0 > 0 }
                        .max()

                    let bodyH = bodyBottom.map { abs(headY - $0) } ?? 0
                    let headH = abs(headY - (joints["neck_1_joint_y"] ?? headY))
                    let h2b = bodyH > 0 ? headH / bodyH : 0

                    let row: [String: Any] = [
                        "frame": frameCount,
                        "timestamp": Double(frameCount) / fps,
                        "head_top_y": Double(headY),
                        // 0 when no lower-body joint was visible.
                        "body_bottom_y": Double(bodyBottom ?? 0),
                        "body_h_px": Double(bodyH),
                        "head_h_px": Double(headH),
                        // Rounded to three decimals.
                        "h2b_ratio": Double(String(format: "%.3f", h2b)) ?? 0,
                        "has_full_body": bodyH > 0 && headH > 0,
                        "joints": joints.mapValues { Double($0) }
                    ]

                    // Skip rows that cannot be encoded instead of crashing.
                    guard JSONSerialization.isValidJSONObject(row),
                          var jsonData = try? JSONSerialization.data(withJSONObject: row) else {
                        continue
                    }
                    jsonData.append(10) // newline
                    fh.write(jsonData)
                    wroteRow = true
                }

                if wroteRow { framesWritten += 1 }
            }

            // Stop decoding once every requested frame has been visited.
            if targetFrames.isEmpty { break }
        }

        print("[BodyPose] Done: \(framesWritten) frames → \(outputPath)")
    }
}
|
||||
46
v1.1/scripts/swift_processors/check_speech_apis_v1.11.swift
Normal file
46
v1.1/scripts/swift_processors/check_speech_apis_v1.11.swift
Normal file
@@ -0,0 +1,46 @@
|
||||
import Foundation
|
||||
import Speech
|
||||
|
||||
// Probe the Objective-C runtime for Speech-related classes and print a report.

/// Prints "<className>: ✅ Available" or "<className>: ❌ Not available"
/// depending on whether the runtime can resolve the class, and returns that
/// result.
@discardableResult
func reportClass(_ className: String) -> Bool {
    let isPresent = NSClassFromString(className) != nil
    print("\(className): \(isPresent ? "✅ Available" : "❌ Not available")")
    return isPresent
}

print("=== Speech Framework API Availability ===")

// SFSpeechRecognizer is reported unconditionally; it is not probed.
print("SFSpeechRecognizer available: true")

// Recognition metadata and analyzer classes.
reportClass("SFSpeechRecognitionMetadata")
let hasAnalyzer = reportClass("SFSpeechAnalyzer")

// Candidate speaker-identification class names.
let hasSpeakerEmbedding = reportClass("SFSpeakerEmbedding")
let hasSpeakerIdentification = reportClass("SFSpeakerIdentification")
reportClass("SFSpeakerEmbeddingVector")
let hasSpeakerRecognition = reportClass("SFSpeakerRecognition")

// Candidate voice-bank class name.
reportClass("AVVoiceBank")

// OS version gate.
if #available(macOS 14, *) {
    print("macOS 14+ APIs available: ✅")
} else {
    print("macOS 14+ APIs: ❌")
}

// Summary. The speaker line only considers the embedding, identification and
// recognition classes (not the embedding-vector one).
print()
print("=== Summary ===")

let analyzerSummary = hasAnalyzer
    ? "✅ High-level speech analysis API"
    : "❌ Not available on this macOS version"
print("SFSpeechAnalyzer: \(analyzerSummary)")

let anySpeakerAPI = hasSpeakerEmbedding || hasSpeakerIdentification || hasSpeakerRecognition
let speakerSummary = anySpeakerAPI
    ? "✅ Speaker recognition APIs exist"
    : "❌ No speaker recognition APIs found"
print("Speaker recognition APIs: \(speakerSummary)")
|
||||
23
v1.1/scripts/swift_processors/check_vision_v1.11.swift
Normal file
23
v1.1/scripts/swift_processors/check_vision_v1.11.swift
Normal file
@@ -0,0 +1,23 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
|
||||
// Vision request classes to look up in the Objective-C runtime, in report order.
let requestClassNames = [
    "VNDetectFaceRectanglesRequest",
    "VNDetectHumanRectanglesRequest",
    "VNDetectHumanBodyPoseRequest",
    "VNDetectHumanHandPoseRequest",
    "VNClassifyImageRequest",
    "VNRecognizeTextRequest",
    "VNGenerateObjectnessBasedSaliencyImageRequest",
    "VNGenerateAttentionBasedSaliencyImageRequest",
    "VNRecognizeObjectsRequest",
    "VNDetectContoursRequest",
    "VNDetectTrajectoriesRequest",
]

// Print one line per class: a check mark when the class resolves, a cross otherwise.
requestClassNames.forEach { className in
    let mark = NSClassFromString(className) == nil ? "❌" : "✅"
    print("\(className): \(mark)")
}
|
||||
16
v1.1/scripts/swift_processors/entitlements_v1.11.plist
Normal file
16
v1.1/scripts/swift_processors/entitlements_v1.11.plist
Normal file
@@ -0,0 +1,16 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
|
||||
<plist version="1.0">
|
||||
<dict>
|
||||
<key>com.apple.security.device.audio-input</key>
|
||||
<true/>
|
||||
<key>com.apple.security.device.camera</key>
|
||||
<true/>
|
||||
<key>com.apple.security.network.client</key>
|
||||
<true/>
|
||||
<key>com.apple.security.files.user-selected.read-write</key>
|
||||
<true/>
|
||||
<key>com.apple.security.temporary-exception.audio-upload</key>
|
||||
<true/>
|
||||
</dict>
|
||||
</plist>
|
||||
206
v1.1/scripts/swift_processors/face_compare_test_v1.11.swift
Normal file
206
v1.1/scripts/swift_processors/face_compare_test_v1.11.swift
Normal file
@@ -0,0 +1,206 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AppKit
|
||||
import AVFoundation
|
||||
|
||||
/// Full comparison: Apple Vision Framework vs InsightFace for face processing
|
||||
@main
|
||||
struct FaceCompareTest: ParsableCommand {
|
||||
@Argument(help: "Video path or image path")
|
||||
var inputPath: String
|
||||
|
||||
@Option(name: .long, help: "Sample interval (frames)")
|
||||
var sampleInterval: Int = 30
|
||||
|
||||
@Option(name: .long, help: "Maximum frames to process")
|
||||
var maxFrames: Int = 20
|
||||
|
||||
/// Command entry point: routes the input to video or image processing based on its
/// file extension, then prints the elapsed wall-clock time.
///
/// - Throws: Whatever `processVideo(url:)` or `processImage(url:)` throws.
mutating func run() throws {
    let began = Date()
    print("=== Apple Vision Framework Face Processing ===")
    #if arch(arm64)
    print("HW: Apple Silicon ✅")
    #endif

    let inputURL = URL(fileURLWithPath: inputPath)
    let videoExtensions = ["mp4", "mov", "mkv", "avi"]
    let fileExtension = (inputPath as NSString).pathExtension.lowercased()

    // Anything that is not a known video container is treated as a still image.
    if !videoExtensions.contains(fileExtension) {
        try processImage(url: inputURL)
    } else {
        try processVideo(url: inputURL)
    }

    let elapsed = Date().timeIntervalSince(began)
    print("Time: \(String(format: "%.2f", elapsed))s")
}
|
||||
|
||||
/// Samples frames from a video with ffmpeg, runs `detectFaces` on each sampled frame and
/// prints per-frame results, a summary, and InsightFace totals when a matching
/// `<uuid>.face.json` file exists.
///
/// - Parameter url: File URL of the video to analyse.
/// - Throws: Errors from creating the temporary frame directory or from launching ffmpeg.
func processVideo(url: URL) throws {
    // A non-positive interval makes ffmpeg's `mod(n, interval)` selection meaningless,
    // and `prefix(_:)` traps on a negative count.
    guard sampleInterval > 0, maxFrames >= 0 else {
        print("Invalid options: --sample-interval must be > 0 and --max-frames must be >= 0")
        return
    }

    let asset = AVAsset(url: url)
    guard let track = asset.tracks(withMediaType: .video).first else {
        print("No video track"); return
    }
    let duration = asset.duration.seconds
    let fps = Double(track.nominalFrameRate)
    // `Int(_:)` traps on NaN/infinity, which an unknown duration would produce.
    let frameEstimate = duration * fps
    let totalFrames = frameEstimate.isFinite ? Int(frameEstimate) : 0
    print("Video: \(duration)s @ \(fps)fps = \(totalFrames) frames")

    // Extract frames with ffmpeg at sample interval
    let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("face_compare_\(UUID().uuidString)")
    let framesDir = tempDir.appendingPathComponent("frames")
    try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)
    defer { try? FileManager.default.removeItem(at: tempDir) }

    let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
    let proc = Process()
    proc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
    proc.arguments = ["-y", "-v", "quiet", "-i", url.path,
                      "-vf", "select=not(mod(n\\,\(sampleInterval)))",
                      "-vsync", "vfr", "-q:v", "5", pattern]
    try proc.run()
    proc.waitUntilExit()
    if proc.terminationStatus != 0 {
        print("ffmpeg exited with status \(proc.terminationStatus); frame extraction may be incomplete")
    }

    let allFiles = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
    let frameFiles = allFiles.filter { $0.hasSuffix(".jpg") }.sorted().prefix(maxFrames)

    var totalFaces = 0
    var framesWithFaces = 0
    var frameCount = 0

    for fname in frameFiles {
        let frameURL = framesDir.appendingPathComponent(fname)
        guard let imgData = try? Data(contentsOf: frameURL),
              let img = NSImage(data: imgData),
              let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { continue }

        // ffmpeg numbers its output files sequentially from 1, so file k holds source
        // frame (k - 1) * sampleInterval, not source frame k.
        let sequence = Int(fname.replacingOccurrences(of: "frame_", with: "").replacingOccurrences(of: ".jpg", with: "")) ?? 1
        let frameNum = max(sequence - 1, 0) * sampleInterval
        let timestamp = fps > 0 ? Double(frameNum) / fps : 0

        // Run all face detection requests
        let faceResult = detectFaces(cgImage: cgImage)
        if !faceResult.isEmpty {
            totalFaces += faceResult.count
            framesWithFaces += 1
            print(" Frame \(frameNum) (\(String(format: "%.1f", timestamp))s): \(faceResult.count) faces")
            for (i, f) in faceResult.enumerated() {
                print(" [\(i)] bbox=(\(String(format: "%.0f", f.x)),\(String(format: "%.0f", f.y))) size=\(String(format: "%.0f", f.w))x\(String(format: "%.0f", f.h)) conf=\(String(format: "%.3f", f.conf)) quality=\(String(format: "%.3f", f.quality)) landmarks=\(f.landmarks) embedding=\(f.hasEmbedding ? "✅" : "❌")")
            }
        }
        frameCount += 1
    }

    print("\n=== Summary ===")
    print("Frames processed: \(frameCount)")
    print("Frames with faces: \(framesWithFaces)")
    print("Total faces detected: \(totalFaces)")

    // Compare with existing InsightFace JSON if available
    let uuid = extractUUID(from: url.lastPathComponent)
    guard !uuid.isEmpty else { return }

    let faceJsonPath = "/Users/accusys/momentry/output_dev/\(uuid).face.json"
    guard FileManager.default.fileExists(atPath: faceJsonPath) else { return }

    print("\n=== Comparison with InsightFace (\(uuid).face.json) ===")
    if let data = try? Data(contentsOf: URL(fileURLWithPath: faceJsonPath)),
       let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any],
       let frames = json["frames"] as? [[String: Any]] {
        let faceCounts = frames.map { ($0["faces"] as? [Any])?.count ?? 0 }
        let insightFaces = faceCounts.filter { $0 > 0 }.count
        let totalInsightFaces = faceCounts.reduce(0, +)
        print(" InsightFace frames with faces: \(insightFaces)")
        print(" InsightFace total faces: \(totalInsightFaces)")
    }
}
|
||||
|
||||
/// Loads a single still image, runs `detectFaces` on it and prints one line per face.
///
/// - Parameter url: File URL of the image to analyse.
func processImage(url: URL) throws {
    let rawData = try? Data(contentsOf: url)
    let decoded = rawData.flatMap { NSImage(data: $0) }
    guard let cgImage = decoded?.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
        print("Cannot load image")
        return
    }

    print("Image: \(cgImage.width)x\(cgImage.height)")
    let faces = detectFaces(cgImage: cgImage)
    print("Vision faces: \(faces.count)")

    // Small formatter so the report line below stays readable.
    let fixed: (String, Float) -> String = { String(format: $0, $1) }
    var index = 0
    for face in faces {
        let embeddingMark = face.hasEmbedding ? "✅" : "❌"
        print(" [\(index)] bbox=(\(fixed("%.0f", face.x)),\(fixed("%.0f", face.y))) size=\(fixed("%.0f", face.w))x\(fixed("%.0f", face.h)) conf=\(fixed("%.3f", face.conf)) quality=\(fixed("%.3f", face.quality)) landmarks=\(face.landmarks) embedding=\(embeddingMark)")
        index += 1
    }
}
|
||||
|
||||
/// Per-face measurements produced by `detectFaces(cgImage:)`.
struct FaceResult {
    /// Bounding-box origin x, scaled by the image width.
    let x: Float
    /// Bounding-box origin y, scaled by the image height.
    let y: Float
    /// Bounding-box width, scaled by the image width.
    let w: Float
    /// Bounding-box height, scaled by the image height.
    let h: Float
    /// Detection confidence reported for the face observation.
    let conf: Float
    /// Capture-quality score; 0 when none was available.
    let quality: Float
    /// Sum of the left-eye, right-eye and nose landmark point counts.
    let landmarks: Int
    /// Whether faceprint data could be read from the observation.
    let hasEmbedding: Bool
}
|
||||
|
||||
/// Detects faces in an image and collects, per face, a landmark point count, a capture
/// quality score and whether faceprint data can be read.
///
/// - Parameter cgImage: Image to analyse.
/// - Returns: One `FaceResult` per detected face with the bounding box scaled to the image's
///   pixel dimensions; empty when detection fails or finds no face.
func detectFaces(cgImage: CGImage) -> [FaceResult] {
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])

    // 1. Face Detection
    let detectReq = VNDetectFaceRectanglesRequest()
    do {
        try handler.perform([detectReq])
    } catch {
        print("Face detection failed: \(error.localizedDescription)")
        return []
    }
    guard let detections = detectReq.results, !detections.isEmpty else { return [] }

    // 2. Landmarks and 3. capture quality, restricted to the faces found above so that
    // their results describe the same faces rather than an independent detection pass.
    let landmarkReq = VNDetectFaceLandmarksRequest()
    landmarkReq.inputFaceObservations = detections
    let qualityReq = VNDetectFaceCaptureQualityRequest()
    qualityReq.inputFaceObservations = detections
    do {
        try handler.perform([landmarkReq, qualityReq])
    } catch {
        // Best effort: keep the detections and report zero landmarks / quality.
        print("Face landmark/quality analysis failed: \(error.localizedDescription)")
    }
    let landmarkResults = landmarkReq.results ?? []
    let qualityResults = qualityReq.results ?? []

    /// Finds the observation in `candidates` describing `face`: by UUID when one matches,
    /// otherwise by position (the original pairing rule).
    func counterpart(of face: VNFaceObservation, at index: Int,
                     in candidates: [VNFaceObservation]) -> VNFaceObservation? {
        if let sameFace = candidates.first(where: { $0.uuid == face.uuid }) {
            return sameFace
        }
        return candidates.indices.contains(index) ? candidates[index] : nil
    }

    let imageWidth = Float(cgImage.width)
    let imageHeight = Float(cgImage.height)
    var results: [FaceResult] = []
    results.reserveCapacity(detections.count)

    for (i, face) in detections.enumerated() {
        // The bounding box is normalized; scale it to pixel units.
        let bb = face.boundingBox
        let x = Float(bb.origin.x) * imageWidth
        let y = Float(bb.origin.y) * imageHeight
        let fw = Float(bb.size.width) * imageWidth
        let fh = Float(bb.size.height) * imageHeight

        // Get landmarks count
        var lmCount = 0
        if let lms = counterpart(of: face, at: i, in: landmarkResults)?.landmarks {
            lmCount += lms.leftEye?.pointCount ?? 0
            lmCount += lms.rightEye?.pointCount ?? 0
            lmCount += lms.nose?.pointCount ?? 0
        }

        // Get quality score through the typed property instead of KVC.
        let quality = counterpart(of: face, at: i, in: qualityResults)?.faceCaptureQuality ?? 0

        // Check for faceprint (embedding) via KVC. `value(forKey:)` raises an Objective-C
        // exception for an undefined key, so only ask when an accessor exists.
        // NOTE(review): "faceprint" / "data" are undocumented keys — confirm they exist.
        var hasEmbedding = false
        if #available(macOS 14, *) {
            if face.responds(to: NSSelectorFromString("faceprint")),
               let fp = face.value(forKey: "faceprint") as? NSObject,
               fp.responds(to: NSSelectorFromString("data")) {
                hasEmbedding = (fp.value(forKey: "data") as? Data) != nil
            }
        }

        results.append(FaceResult(x: x, y: y, w: fw, h: fh, conf: face.confidence, quality: quality, landmarks: lmCount, hasEmbedding: hasEmbedding))
    }

    return results
}
|
||||
|
||||
/// Extracts the first run of 32 lowercase hexadecimal characters from a file name.
///
/// - Parameter filename: File name to search.
/// - Returns: The 32-character match, or an empty string when there is none.
func extractUUID(from filename: String) -> String {
    guard let regex = try? NSRegularExpression(pattern: "[a-f0-9]{32}") else { return "" }

    // NSRegularExpression measures in UTF-16 units. Building the range from the string
    // (rather than from `count`, which counts Characters) keeps the tail of the name
    // searchable when it contains emoji or other multi-unit characters.
    let searchRange = NSRange(filename.startIndex..<filename.endIndex, in: filename)
    guard let match = regex.firstMatch(in: filename, range: searchRange),
          let matchRange = Range(match.range, in: filename) else {
        return ""
    }
    return String(filename[matchRange])
}
|
||||
}
|
||||
98
v1.1/scripts/swift_processors/face_vision_test_v1.11.swift
Normal file
98
v1.1/scripts/swift_processors/face_vision_test_v1.11.swift
Normal file
@@ -0,0 +1,98 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AppKit
|
||||
|
||||
/// POC: Test Apple Vision Framework for face detection + faceprint extraction
|
||||
@main
|
||||
struct FaceVisionTest: ParsableCommand {
|
||||
@Argument(help: "Input image path")
|
||||
var inputPath: String
|
||||
|
||||
/// Runs the proof of concept on one image: face detection, landmarks, capture quality and
/// a KVC probe for faceprint data, printing each stage's findings.
///
/// - Throws: Errors from `VNImageRequestHandler.perform(_:)`.
mutating func run() throws {
    let startTime = Date()
    print("=== Apple Vision Framework Face POC ===")
    #if arch(arm64)
    print("HW: Apple Silicon ✅")
    #else
    print("HW: Intel")
    #endif

    guard let image = NSImage(contentsOfFile: inputPath) else {
        print("Error: cannot load image"); return
    }
    guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
        print("Error: cannot get CGImage"); return
    }
    print("Image: \(cgImage.width)x\(cgImage.height)")
    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])

    /// KVC read that yields nil instead of raising an Objective-C exception when `object`
    /// exposes no accessor named `key`.
    func probe(_ object: NSObject, _ key: String) -> Any? {
        object.responds(to: NSSelectorFromString(key)) ? object.value(forKey: key) : nil
    }

    // 1. Detect faces (synchronous)
    print("\n--- Detection ---")
    let detectReq = VNDetectFaceRectanglesRequest()
    try handler.perform([detectReq])
    let faces = detectReq.results ?? []
    print("Faces: \(faces.count)")
    for (i, f) in faces.enumerated() {
        let bb = f.boundingBox
        print(" [\(i)] bbox=(\(String(format: "%.3f", bb.origin.x)),\(String(format: "%.3f", bb.origin.y))) size=(\(String(format: "%.3f", bb.size.width)),\(String(format: "%.3f", bb.size.height))) conf=\(String(format: "%.2f", f.confidence))")
    }

    guard !faces.isEmpty else { print("No faces"); return }

    // 2. Landmarks
    print("\n--- Landmarks ---")
    let lmReq = VNDetectFaceLandmarksRequest()
    try handler.perform([lmReq])
    for (i, f) in (lmReq.results ?? []).enumerated() {
        guard let lms = f.landmarks else { continue }
        print(" [\(i)] landmarks: \(lms.leftEye?.pointCount ?? 0)+\(lms.rightEye?.pointCount ?? 0) eye pts, nose=\(lms.nose?.pointCount ?? 0)")
    }

    // 3. Capture quality
    print("\n--- Capture Quality ---")
    let qualReq = VNDetectFaceCaptureQualityRequest()
    try handler.perform([qualReq])
    for (i, f) in (qualReq.results ?? []).enumerated() {
        // Typed property instead of KVC; -1 marks "no score".
        let q = f.faceCaptureQuality.map(Double.init) ?? -1
        print(" [\(i)] quality=\(String(format: "%.4f", q))")
    }

    // 4. Faceprint (embedding)
    // NOTE(review): the keys probed below are undocumented — confirm they exist.
    print("\n--- Faceprint ---")
    if #available(macOS 14, *) {
        let fpClass: AnyClass? = NSClassFromString("VNFaceprint")
        print("VNFaceprint class: \(fpClass != nil ? "✅ exists" : "❌ nil")")

        if let first = faces.first {
            let fp = probe(first, "faceprint") as? NSObject
            print("faceprint KVC: \(fp != nil ? "✅" : "❌")")
            if let fpObject = fp {
                let dataText = (probe(fpObject, "data") as? Data).map { "\($0.count) bytes" } ?? "nil"
                print(" data: \(dataText)")
                let desc = probe(fpObject, "descriptor") as? NSObject
                let descText = desc.map { "✅ class=\(type(of: $0))" } ?? "nil"
                print(" descriptor: \(descText)")
                if let d = desc, let elems = probe(d, "elements") as? [NSNumber] {
                    print(" elements: \(elems.count) dims")
                    if !elems.isEmpty {
                        print(" first 5: \(elems.prefix(5).map { String(format: "%.4f", $0.doubleValue) }.joined(separator: ", "))")
                    }
                }
            }
        }
    } else {
        print("macOS 14+ required")
    }

    print("\nTime: \(String(format: "%.2f", Date().timeIntervalSince(startTime)))s")
    print("=== Done ===")
}
|
||||
}
|
||||
83
v1.1/scripts/swift_processors/pose_benchmark_v1.11.swift
Normal file
83
v1.1/scripts/swift_processors/pose_benchmark_v1.11.swift
Normal file
@@ -0,0 +1,83 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AppKit
|
||||
import AVFoundation
|
||||
|
||||
/// Benchmark: Apple Vision Framework body pose detection speed
|
||||
@main
|
||||
struct PoseBenchmark: ParsableCommand {
|
||||
@Argument(help: "Video path or image directory")
|
||||
var inputPath: String
|
||||
|
||||
@Option(name: .long, help: "Sample interval (frames)")
|
||||
var sampleInterval: Int = 30
|
||||
|
||||
/// Extracts sampled frames from a video with ffmpeg, runs a body-pose request on each
/// frame and prints timing statistics.
///
/// - Throws: Errors from creating the temporary frame directory or from launching ffmpeg.
mutating func run() throws {
    let start = Date()
    print("=== Vision Body Pose Benchmark ===")
    #if arch(arm64)
    print("HW: Apple Silicon ✅")
    #endif

    // A non-positive interval makes ffmpeg's `mod(n, interval)` selection meaningless.
    guard sampleInterval > 0 else {
        print("Invalid option: --sample-interval must be > 0"); return
    }

    let url = URL(fileURLWithPath: inputPath)
    let asset = AVAsset(url: url)
    guard asset.tracks(withMediaType: .video).first != nil else {
        print("No video track"); return
    }

    // Extract frames with ffmpeg
    let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("pose_bench_\(UUID().uuidString)")
    let framesDir = tempDir.appendingPathComponent("frames")
    try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)
    // Clean up on every exit path, including a failed ffmpeg launch.
    defer { try? FileManager.default.removeItem(at: tempDir) }

    let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
    let extract = Process()
    extract.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
    extract.arguments = ["-y", "-v", "quiet", "-i", inputPath,
                         "-vf", "select=not(mod(n\\,\(sampleInterval)))",
                         "-vsync", "vfr", "-q:v", "5", pattern]
    try extract.run()
    extract.waitUntilExit()
    if extract.terminationStatus != 0 {
        print("ffmpeg exited with status \(extract.terminationStatus); frame extraction may be incomplete")
    }

    let files = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
    let frameFiles = files.filter { $0.hasSuffix(".jpg") }.sorted()
    print("Frames: \(frameFiles.count)")

    // Process all frames in one loop (no subprocess overhead)
    var totalPoses = 0
    var framesWithPose = 0
    let inferenceStart = Date()

    for fname in frameFiles {
        let frameURL = framesDir.appendingPathComponent(fname)
        guard let imgData = try? Data(contentsOf: frameURL),
              let img = NSImage(data: imgData),
              let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { continue }

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        let req = VNDetectHumanBodyPoseRequest()
        // Best effort: a frame whose request fails simply contributes no poses.
        try? handler.perform([req])

        if let poses = req.results, !poses.isEmpty {
            framesWithPose += 1
            totalPoses += poses.count
        }
    }

    let inferenceTime = Date().timeIntervalSince(inferenceStart)
    let totalTime = Date().timeIntervalSince(start)
    // Avoid dividing by zero when no frame was extracted.
    let perFrameMs = frameFiles.isEmpty ? 0 : inferenceTime / Double(frameFiles.count) * 1000

    print("\n=== Results ===")
    print("Frames: \(frameFiles.count), with poses: \(framesWithPose)")
    print("Total poses: \(totalPoses)")
    print("Inference: \(String(format: "%.2f", inferenceTime))s")
    print("Per frame: \(String(format: "%.0f", perFrameMs))ms")
    print("Total: \(String(format: "%.2f", totalTime))s")
}
|
||||
}
|
||||
106
v1.1/scripts/swift_processors/speaker_meta_test_v1.11.swift
Normal file
106
v1.1/scripts/swift_processors/speaker_meta_test_v1.11.swift
Normal file
@@ -0,0 +1,106 @@
|
||||
import Foundation
|
||||
import Speech
|
||||
|
||||
/// Test: Use KVC to check for speaker metadata on SFSpeechRecognitionResult
|
||||
@main
|
||||
struct SpeakerMetaTest {
|
||||
/// Transcribes a fixed test clip and reports which speaker-related keys, if any, are
/// exposed on the recognition result and on its transcription segments.
///
/// The main thread pumps the main run loop while it waits. Blocking it on a semaphore
/// would stall recognition handlers that are delivered on the main queue.
static func main() {
    print("=== Speaker Metadata Test ===")

    let testFile = "/tmp/test_60s_b.wav"
    guard FileManager.default.fileExists(atPath: testFile) else {
        print("Test file not found")
        return
    }
    guard let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US")) else {
        print("Recognizer unavailable for en-US")
        return
    }

    /// Shared state, touched only on the main queue.
    final class RunState {
        var isFinished = false
        var task: SFSpeechRecognitionTask?
    }
    let state = RunState()
    let finish: () -> Void = { DispatchQueue.main.async { state.isFinished = true } }

    /// KVC read that yields nil instead of raising an Objective-C exception when `object`
    /// exposes no accessor named `key`.
    func probe(_ object: NSObject, _ key: String) -> Any? {
        object.responds(to: NSSelectorFromString(key)) ? object.value(forKey: key) : nil
    }

    /// Prints every speaker-related value found on the final recognition result.
    func report(_ result: SFSpeechRecognitionResult) {
        let segments = result.bestTranscription.segments
        print("Recognition complete: \(segments.count) raw segments")

        // Check for metadata on the result object
        let resultMetadata = probe(result, "metadata") as? NSObject
        print("Result metadata: \(resultMetadata != nil ? "✅" : "❌")")
        if let rm = resultMetadata {
            print(" Result metadata class: \(type(of: rm))")
            // Dump all KVC values
            for key in ["speakerID", "speakerName", "speakerConfidence", "voiceProfileID", "speaker"] {
                if let val = probe(rm, key) {
                    print(" result.\(key) = \(val)")
                }
            }
        }

        // Check each segment for metadata. Every segment is counted so the summary below
        // is accurate; details are printed for the first three only.
        var speakerCount = 0
        for (i, seg) in segments.enumerated() {
            guard let sm = probe(seg, "metadata") as? NSObject else { continue }
            if i < 3 {
                print("Seg[\(i)] metadata class: \(type(of: sm))")
            }
            // Try common keys
            for key in ["speakerID", "speaker", "voice", "speakerConfidence", "speakerName"] {
                guard let val = probe(sm, key) else { continue }
                speakerCount += 1
                if i < 3 {
                    print(" seg.\(key) = \(val)")
                }
            }
        }

        if speakerCount == 0 {
            print("No speaker metadata found on any segment")
        } else {
            print("Found speaker metadata on segments: ✅")
        }

        // Also check all KVC keys on first segment
        if let firstSeg = segments.first {
            print("\nAll KVC keys on first segment:")
            for key in ["metadata", "speaker", "voice", "recognition", "analysis", "audio"] {
                if let val = probe(firstSeg, key) {
                    print(" \(key): \(type(of: val)) = \(val)")
                }
            }
        }
    }

    SFSpeechRecognizer.requestAuthorization { status in
        guard status == .authorized else {
            print("Authorization: \(status.rawValue)")
            finish()
            return
        }
        print("Authorization: ✅")
        print(recognizer.supportsOnDeviceRecognition ? "On-device: ✅" : "On-device: ❌ (will use server)")

        let request = SFSpeechURLRecognitionRequest(url: URL(fileURLWithPath: testFile))
        request.shouldReportPartialResults = false
        request.requiresOnDeviceRecognition = false
        request.taskHint = .dictation

        print("Starting recognition...")
        let started = recognizer.recognitionTask(with: request) { result, error in
            if let error = error {
                print("Error: \(error.localizedDescription)")
            } else if let result = result, result.isFinal {
                report(result)
            }
            // Stop on an error or on the final result; partial results keep waiting.
            if error != nil || (result?.isFinal ?? true) {
                finish()
            }
        }
        DispatchQueue.main.async { state.task = started }
    }

    // Run loop until done. Pumping the main run loop on the main thread lets main-queue
    // work (the handlers and `finish`) execute.
    withExtendedLifetime(recognizer) {
        while !state.isFinished {
            RunLoop.main.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
        }
    }
    state.task?.cancel()

    print("Done")
}
|
||||
}
|
||||
113
v1.1/scripts/swift_processors/speaker_test_v1.11.swift
Normal file
113
v1.1/scripts/swift_processors/speaker_test_v1.11.swift
Normal file
@@ -0,0 +1,113 @@
|
||||
import Foundation
|
||||
import Speech
|
||||
|
||||
/// POC: Test SFSpeechAnalyzer + SFSpeechRecognitionMetadata for speaker detection
|
||||
/// Goal: Determine if ANE-accelerated speaker diarization is feasible
|
||||
|
||||
@main
|
||||
struct SpeakerTest {
|
||||
/// Reports hardware and framework capabilities, then transcribes a fixed test clip on
/// device and prints any speaker identifiers exposed on the transcription segments.
///
/// The main thread pumps the main run loop while it waits. Blocking it on a semaphore
/// would stall recognition handlers that are delivered on the main queue.
static func main() {
    print("=== SFSpeechAnalyzer Speaker Detection POC ===")

    // 1. Check if running on ANE-capable hardware
    #if arch(arm64)
    print("Hardware: Apple Silicon ✅ (ANE available)")
    #else
    print("Hardware: Intel ❌ (No ANE)")
    #endif

    // 2. Check SFSpeechRecognizer on-device capability
    let locale = Locale(identifier: "en-US")
    let recognizerCheck = SFSpeechRecognizer(locale: locale)
    print("On-device recognition: \(recognizerCheck?.supportsOnDeviceRecognition == true ? "✅" : "❌")")

    // 3. Check SFSpeechAnalyzer capabilities via availability API
    if #available(macOS 14, *) {
        print("\n=== SFSpeechAnalyzer Analysis ===")
        let analyzerClass: AnyClass? = NSClassFromString("SFSpeechAnalyzer")
        print("SFSpeechAnalyzer: \(analyzerClass != nil ? "✅ Available" : "❌ Not available")")

        let mdClass: AnyClass? = NSClassFromString("SFSpeechRecognitionMetadata")
        print("SFSpeechRecognitionMetadata: \(mdClass != nil ? "✅ Available" : "❌ Not available")")
    }

    // 4. Test: Run ASR with SFSpeechRecognitionMetadata
    print("\n=== Real-world Test ===")
    let testFile = "/tmp/test_60s_b.wav"
    guard FileManager.default.fileExists(atPath: testFile) else {
        print("Test file not found: \(testFile)")
        return
    }
    guard let recognizer = recognizerCheck else {
        print("Recognizer not available")
        return
    }

    /// Shared state, touched only on the main queue.
    final class RunState {
        var isFinished = false
        var task: SFSpeechRecognitionTask?
        var detectedSpeakers: Set<String> = []
    }
    let state = RunState()
    let finish: () -> Void = { DispatchQueue.main.async { state.isFinished = true } }

    /// KVC read that yields nil instead of raising an Objective-C exception when `object`
    /// exposes no accessor named `key`.
    func probe(_ object: NSObject, _ key: String) -> Any? {
        object.responds(to: NSSelectorFromString(key)) ? object.value(forKey: key) : nil
    }

    /// Prints the transcript summary and any speaker metadata found on the segments.
    /// Returns the speaker identifiers seen.
    func report(_ result: SFSpeechRecognitionResult) -> Set<String> {
        var speakers: Set<String> = []
        let text = result.bestTranscription.formattedString
        print("Text: \(text.prefix(200))")
        print("Segments: \(result.bestTranscription.segments.count)")

        // Check each segment for speaker metadata
        guard #available(macOS 14, *) else {
            print("macOS 14+ required for speaker metadata")
            return speakers
        }
        for (i, seg) in result.bestTranscription.segments.enumerated() {
            // Access metadata via KVC since it might be a private API
            guard let md = probe(seg, "metadata") as? NSObject,
                  let sid = probe(md, "speakerID") as? String else { continue }
            let speakerName = probe(md, "speakerName") as? String
            let confidence = probe(md, "speakerConfidence") as? Double

            speakers.insert(sid)
            // Print the first few segments, then every 20th, to keep the log short.
            if i < 5 || i % 20 == 0 {
                print(" Seg[\(i)] speaker=\(sid) name=\(speakerName ?? "?") conf=\(confidence ?? 0) text=\"\(seg.substring.prefix(40))\"")
            }
        }

        print("\nUnique speakers detected: \(speakers)")
        if speakers.isEmpty {
            print("⚠️ No speaker metadata found in recognition results")
        }
        return speakers
    }

    print("Running ASR with speaker detection on 60s clip...")

    SFSpeechRecognizer.requestAuthorization { status in
        guard status == .authorized else {
            print("Authorization denied")
            finish()
            return
        }
        guard recognizer.isAvailable else {
            print("Recognizer not available")
            finish()
            return
        }

        let request = SFSpeechURLRecognitionRequest(url: URL(fileURLWithPath: testFile))
        request.shouldReportPartialResults = false
        request.requiresOnDeviceRecognition = true
        request.taskHint = .dictation

        let started = recognizer.recognitionTask(with: request) { result, error in
            if let error = error {
                print("Recognition error: \(error.localizedDescription)")
            } else if let result = result, result.isFinal {
                let speakers = report(result)
                DispatchQueue.main.async { state.detectedSpeakers.formUnion(speakers) }
            }
            // Stop on an error or on the final result; partial results keep waiting.
            if error != nil || (result?.isFinal ?? true) {
                finish()
            }
        }
        DispatchQueue.main.async { state.task = started }
    }

    // Wait. Pumping the main run loop on the main thread lets main-queue work
    // (the handlers and `finish`) execute.
    withExtendedLifetime(recognizer) {
        while !state.isFinished {
            RunLoop.main.run(mode: .default, before: Date(timeIntervalSinceNow: 0.1))
        }
    }
    state.task?.cancel()
}
|
||||
}
|
||||
191
v1.1/scripts/swift_processors/swift_cut_test_v1.11.swift
Normal file
191
v1.1/scripts/swift_processors/swift_cut_test_v1.11.swift
Normal file
@@ -0,0 +1,191 @@
|
||||
import Foundation
|
||||
import AVFoundation
|
||||
import ArgumentParser
|
||||
import Accelerate
|
||||
|
||||
/// POC: Swift-based scene cut detection using AVFoundation histogram analysis
|
||||
/// Compared against Python PySceneDetect ContentDetector (threshold=27)
|
||||
@main
|
||||
struct SwiftCutTest: ParsableCommand {
|
||||
@Argument(help: "Video file path")
|
||||
var inputPath: String
|
||||
|
||||
@Argument(help: "Output JSON path (optional)")
|
||||
var outputPath: String?
|
||||
|
||||
@Option(name: .long, help: "Detection threshold (higher= fewer cuts, default 0.3)")
|
||||
var threshold: Double = 0.3
|
||||
|
||||
@Option(name: .long, help: "Sample interval in frames (default=1)")
|
||||
var sampleInterval: Int = 1
|
||||
|
||||
/// Detects scene cuts by comparing luminance histograms of successive sampled frames and
/// optionally writes the scenes to a JSON file.
///
/// A cut is recorded whenever the histogram difference between two consecutive sampled
/// frames exceeds `threshold`.
///
/// - Throws: Errors from serializing or writing the JSON output.
mutating func run() throws {
    let startTime = Date()
    print("=== Swift Scene Cut Detection POC ===")
    #if arch(arm64)
    print("HW: Apple Silicon ✅ (ANE available)")
    #endif

    // `frameIndex % sampleInterval` traps on zero, so reject it up front.
    guard sampleInterval >= 1 else {
        print("Error: sample interval must be at least 1"); return
    }

    let url = URL(fileURLWithPath: inputPath)
    let asset = AVAsset(url: url)

    guard let videoTrack = asset.tracks(withMediaType: .video).first else {
        print("Error: No video track found"); return
    }

    // An unknown duration is reported as NaN; `Int(_:)` traps on NaN and JSONSerialization
    // rejects it, so fall back to 0.
    let rawDuration = asset.duration.seconds
    let duration = rawDuration.isFinite ? rawDuration : 0
    let fps = videoTrack.nominalFrameRate
    let totalFrames = Int(duration * Double(fps))
    print("Video: \(inputPath)")
    print("Duration: \(String(format: "%.1f", duration))s")
    print("FPS: \(String(format: "%.1f", fps))")
    print("Total frames: \(totalFrames)")
    print("Threshold: \(String(format: "%.2f", threshold))")
    print("Sample interval: \(sampleInterval)")

    // Read frame histogram data using AVAssetReader
    guard let reader = try? AVAssetReader(asset: asset) else {
        print("Error: Cannot create asset reader"); return
    }

    let settings: [String: Any] = [
        kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA,
        kCVPixelBufferWidthKey as String: 320, // downscale for speed
        kCVPixelBufferHeightKey as String: 180,
    ]
    let trackOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: settings)
    guard reader.canAdd(trackOutput) else {
        print("Error: Cannot attach track output to reader"); return
    }
    reader.add(trackOutput)
    guard reader.startReading() else {
        print("Error: Cannot start reading: \(reader.error?.localizedDescription ?? "unknown error")"); return
    }

    var frameIndex = 0
    var prevHistogram: [Float]?
    var scenes: [(start: Double, end: Double)] = []
    var sceneStart: Double = 0
    var diffs: [(frame: Int, diff: Float)] = []

    let frameStep = sampleInterval
    var lastPrint = 0

    while reader.status == .reading {
        guard let sampleBuffer = trackOutput.copyNextSampleBuffer() else { break }
        guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
            CMSampleBufferInvalidate(sampleBuffer); continue
        }

        if frameIndex % frameStep == 0 {
            let timestamp = CMTimeGetSeconds(CMSampleBufferGetPresentationTimeStamp(sampleBuffer))

            // Compute histogram
            let histogram = computeLuminanceHistogram(pixelBuffer: pixelBuffer)

            if let prev = prevHistogram {
                let diff = histogramDifference(prev, histogram)

                if diff > Float(threshold) {
                    // Scene cut detected
                    scenes.append((start: sceneStart, end: timestamp))
                    sceneStart = timestamp
                    diffs.append((frame: frameIndex, diff: diff))

                    if scenes.count % 50 == 0 {
                        print(" Scenes so far: \(scenes.count) at frame \(frameIndex)/\(totalFrames)")
                    }
                }
            }
            prevHistogram = histogram
        }

        frameIndex += 1
        CMSampleBufferInvalidate(sampleBuffer)

        // Progress every 5%. Skipped when the frame total is unknown, because converting
        // an infinite or NaN ratio to Int would trap.
        if totalFrames > 0 {
            let pct = Int(Float(frameIndex) / Float(totalFrames) * 100)
            if pct >= lastPrint + 5 {
                print(" Progress: \(pct)% (\(frameIndex)/\(totalFrames) frames)")
                lastPrint = pct
            }
        }
    }

    if reader.status == .failed {
        print("Warning: reading stopped early: \(reader.error?.localizedDescription ?? "unknown error")")
    }

    // Add last scene
    if sceneStart < duration {
        scenes.append((start: sceneStart, end: duration))
    }

    let elapsed = Date().timeIntervalSince(startTime)
    // Real-time factor; 0 when the duration is unknown so it stays a valid JSON number.
    let rtf = duration > 0 ? elapsed / duration : 0
    print("\n=== Results ===")
    print("Scenes detected: \(scenes.count)")
    print("Time: \(String(format: "%.2f", elapsed))s")
    if totalFrames > 0 {
        print("RTF: \(String(format: "%.3f", rtf))x")
    }
    print("Last 5 cuts:")
    for s in scenes.suffix(5) {
        print(" \(String(format: "%.1f", s.start))s - \(String(format: "%.1f", s.end))s")
    }

    // Output JSON if requested
    if let outPath = outputPath {
        let outputDict: [String: Any] = [
            "scenes": scenes.map { ["start_time": $0.start, "end_time": $0.end] },
            "metadata": [
                "video_path": inputPath,
                "duration": duration,
                "fps": fps,
                "total_frames": totalFrames,
                "threshold": threshold,
                "sample_interval": sampleInterval,
                "processing_time": elapsed,
                "rtf": rtf,
            ] as [String: Any],
            "diffs": diffs.map { entry -> [String: Any] in
                ["frame": entry.frame, "diff": String(format: "%.4f", entry.diff)]
            },
        ]
        // Propagate serialization failures instead of silently skipping the output file.
        let jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
        try jsonData.write(to: URL(fileURLWithPath: outPath))
        print("Output written to: \(outPath)")
    }
}
|
||||
|
||||
/// Builds a normalized 64-bin luminance histogram of a pixel buffer.
///
/// Pixels are read as 4 bytes each in B, G, R order, matching the 32BGRA format requested
/// from the asset reader in `run()`.
///
/// - Parameter pixelBuffer: Frame to analyse.
/// - Returns: 64 bin weights that sum to 1, or all zeros when the buffer is empty or its
///   memory cannot be accessed.
func computeLuminanceHistogram(pixelBuffer: CVPixelBuffer) -> [Float] {
    let binCount = 64 // 64 bins for speed
    var histogram = [Float](repeating: 0, count: binCount)

    CVPixelBufferLockBaseAddress(pixelBuffer, .readOnly)
    defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, .readOnly) }

    let width = CVPixelBufferGetWidth(pixelBuffer)
    let height = CVPixelBufferGetHeight(pixelBuffer)
    let bytesPerRow = CVPixelBufferGetBytesPerRow(pixelBuffer)
    // Bail out instead of trapping on a missing base address or dividing by zero pixels.
    guard width > 0, height > 0,
          let baseAddress = CVPixelBufferGetBaseAddress(pixelBuffer) else {
        return histogram
    }

    for y in 0..<height {
        // Rows can be padded, so advance by bytesPerRow rather than width * 4.
        let pixels = baseAddress.advanced(by: y * bytesPerRow).assumingMemoryBound(to: UInt8.self)
        for x in 0..<width {
            let offset = x * 4
            let b = Float(pixels[offset])
            let g = Float(pixels[offset + 1])
            let r = Float(pixels[offset + 2])
            let luminance = 0.299 * r + 0.587 * g + 0.114 * b
            let bin = min(Int(luminance / 256.0 * Float(binCount)), binCount - 1)
            histogram[bin] += 1
        }
    }

    // Normalize
    let total = Float(width * height)
    return histogram.map { $0 / total }
}
|
||||
|
||||
/// Sums the absolute bin-by-bin differences of two histograms.
///
/// Only the bins present in both inputs are compared; extra bins in the longer one are ignored.
///
/// - Parameters:
///   - a: First histogram.
///   - b: Second histogram.
/// - Returns: The accumulated absolute difference.
func histogramDifference(_ a: [Float], _ b: [Float]) -> Float {
    zip(a, b).reduce(Float(0)) { runningTotal, bins in
        runningTotal + abs(bins.0 - bins.1)
    }
}
|
||||
}
|
||||
291
v1.1/scripts/swift_processors/swift_face_v1.11.swift
Normal file
291
v1.1/scripts/swift_processors/swift_face_v1.11.swift
Normal file
@@ -0,0 +1,291 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AVFoundation
|
||||
|
||||
/// Swift Face Processor - Apple Vision Framework for face detection + pose
/// Uses AVAssetImageGenerator for reliable frame access (no AVAssetReader corruption).
///
/// One frame every `sampleInterval` frames is decoded and passed through
/// `VNDetectFaceRectanglesRequest` and `VNDetectFaceLandmarksRequest`.
/// The result is written as JSON:
/// {
///   "frame_count": N, "fps": 30.0,
///   "frames": [
///     { "frame": 0, "timestamp": 0.0, "faces": [
///         { "bbox": {"x":0,"y":0,"width":0,"height":0}, "confidence": 0.0,
///           "pose": {...}, "landmarks": {...}, "lips": {...} }
///     ]}
///   ]
/// }
/// `pose`, `landmarks` and `lips` are only present when Vision reports them.
/// Frames without any face that passes the quality filters are omitted.
@main
struct SwiftFace: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String

    @Argument(help: "Output JSON path")
    var outputPath: String

    @Option(name: .long, help: "Sample interval (frames, default=30)")
    var sampleInterval: Int = 30

    @Option(name: .long, help: "UUID for logging")
    var uuid: String = ""

    /// Faces whose confidence is below this value are dropped.
    private static let minimumConfidence = 0.6
    /// Faces narrower or shorter than this many pixels are dropped.
    private static let minimumFaceSize = 20
    /// A face rectangle overlapping a landmark observation by more than this
    /// intersection-over-union is treated as the same face.
    private static let duplicateOverlapThreshold: CGFloat = 0.3
    /// Number of sample times handed to the image generator per request.
    private static let batchSize = 1000

    /// Rejects a non-positive sample interval, which would otherwise trap on
    /// the integer division that computes the number of samples.
    func validate() throws {
        guard sampleInterval > 0 else {
            throw ValidationError("--sample-interval must be greater than 0")
        }
    }

    /// Samples the video, detects faces in each sampled frame and writes the JSON report.
    ///
    /// - Throws: Any error raised while writing the output file.
    mutating func run() throws {
        let startTime = Date()
        print("[SwiftFace] Vision-based face detection: \(inputPath)")

        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("[SwiftFace] No video track found")
            return
        }

        let fps = Double(videoTrack.nominalFrameRate)
        let duration = CMTimeGetSeconds(asset.duration)
        // Converting NaN/infinity to Int traps, so an unusable duration or
        // frame rate is treated as "no frames" and yields an empty report.
        let estimatedFrames = duration * fps
        let totalFrames = abs(estimatedFrames) < Double(Int.max) ? Int(estimatedFrames) : 0
        let width = Int(videoTrack.naturalSize.width)
        let height = Int(videoTrack.naturalSize.height)
        print("[SwiftFace] Video: \(width)x\(height), \(String(format: "%.1f", fps))fps, \(totalFrames) frames")

        let generator = AVAssetImageGenerator(asset: asset)
        generator.requestedTimeToleranceBefore = .zero
        generator.requestedTimeToleranceAfter = .zero
        generator.appliesPreferredTrackTransform = true

        var allFrames: [[String: Any]] = []
        var processedCount = 0

        let frameInterval = Double(sampleInterval) / fps
        let totalSamples = totalFrames / sampleInterval

        // Process in batches to avoid memory pressure.
        for batchStart in stride(from: 0, to: totalSamples, by: SwiftFace.batchSize) {
            let batchEnd = min(batchStart + SwiftFace.batchSize, totalSamples)
            let times = (batchStart..<batchEnd).map { sampleIndex in
                NSValue(time: CMTime(seconds: Double(sampleIndex) * frameInterval, preferredTimescale: 1000))
            }

            // The handler signals once per requested time; the wait loop below
            // blocks until every callback of this batch has finished, so the
            // captured accumulators are not read while a callback is running.
            let semaphore = DispatchSemaphore(value: 0)

            generator.generateCGImagesAsynchronously(forTimes: times) { requestedTime, cgImage, actualTime, result, error in
                defer { semaphore.signal() }

                guard result == .succeeded, let cgImage = cgImage else {
                    if let error = error {
                        fputs("[SwiftFace] Frame error at \(CMTimeGetSeconds(requestedTime)): \(error.localizedDescription)\n", stderr)
                    }
                    return
                }

                let seconds = CMTimeGetSeconds(actualTime)
                guard seconds.isFinite else { return }

                let faces = SwiftFace.detectFaces(in: cgImage, atSeconds: seconds)
                guard !faces.isEmpty else { return }

                allFrames.append([
                    // Round instead of truncating: a timestamp a hair below
                    // k / fps would otherwise be reported as frame k - 1.
                    "frame": Int((seconds * fps).rounded()),
                    "timestamp": seconds,
                    "faces": faces,
                ])
                processedCount += 1
            }

            // Wait for batch to complete
            for _ in times {
                semaphore.wait()
            }

            let elapsed = Date().timeIntervalSince(startTime)
            let pct = Int(Double(batchEnd) / Double(totalSamples) * 100)
            print("[SwiftFace] \(processedCount) frames with faces, \(pct)% complete, \(Int(elapsed))s elapsed")
            fflush(stdout)
        }

        generator.cancelAllCGImageGeneration()

        let output: [String: Any] = [
            "frame_count": allFrames.count,
            "fps": fps,
            "frames": allFrames,
        ]

        guard let jsonData = try? JSONSerialization.data(withJSONObject: output, options: []) else {
            print("[SwiftFace] Failed to serialize JSON")
            return
        }
        try jsonData.write(to: URL(fileURLWithPath: outputPath))

        let elapsed = Date().timeIntervalSince(startTime)
        print("[SwiftFace] Done: \(allFrames.count) frames, \(String(format: "%.1f", elapsed))s → \(outputPath)")
    }

    // The helpers below are static so the escaping image-generator callback
    // can call them without capturing the mutating `self`.

    /// Runs face detection on one decoded frame and returns its face records.
    ///
    /// Landmark observations are emitted first (bbox, pose, landmarks, lips).
    /// Face rectangles are then added only when they do not overlap any
    /// landmark observation, so the same face is not reported twice.
    private static func detectFaces(in image: CGImage, atSeconds seconds: Double) -> [[String: Any]] {
        guard let pixelBuffer = makePixelBuffer(from: image) else {
            fputs("[SwiftFace] Could not create pixel buffer at \(seconds)\n", stderr)
            return []
        }

        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
        let rectangleRequest = VNDetectFaceRectanglesRequest()
        let landmarkRequest = VNDetectFaceLandmarksRequest()
        do {
            try handler.perform([rectangleRequest, landmarkRequest])
        } catch {
            fputs("[SwiftFace] Vision error at \(seconds): \(error.localizedDescription)\n", stderr)
            return []
        }

        let rectangleFaces = rectangleRequest.results ?? []
        let landmarkFaces = landmarkRequest.results ?? []

        // Use actual CGImage size (may differ from naturalSize after transform)
        let imageWidth = CGFloat(image.width)
        let imageHeight = CGFloat(image.height)
        let imageSize = CGSize(width: imageWidth, height: imageHeight)
        var faces: [[String: Any]] = []

        for landmarkFace in landmarkFaces {
            guard var record = baseRecord(for: landmarkFace,
                                          confidence: Double(landmarkFace.confidence),
                                          imageWidth: imageWidth,
                                          imageHeight: imageHeight) else { continue }
            if let landmarks = landmarkFace.landmarks {
                record.merge(landmarkFields(from: landmarks, imageSize: imageSize)) { _, new in new }
            }
            faces.append(record)
        }

        for rectangleFace in rectangleFaces {
            // Matched against ALL landmark observations, including ones the
            // quality filter rejected.
            let isDuplicate = landmarkFaces.contains { landmarkFace in
                SwiftFace.overlapRatio(rectangleFace.boundingBox, landmarkFace.boundingBox)
                    > SwiftFace.duplicateOverlapThreshold
            }
            if isDuplicate { continue }

            let confidence = Double(rectangleFace.faceCaptureQuality ?? rectangleFace.confidence)
            if let record = baseRecord(for: rectangleFace,
                                       confidence: confidence,
                                       imageWidth: imageWidth,
                                       imageHeight: imageHeight) {
                faces.append(record)
            }
        }

        return faces
    }

    /// Renders `image` into a newly created 32BGRA pixel buffer.
    ///
    /// - Returns: `nil` when the buffer or the drawing context cannot be created.
    private static func makePixelBuffer(from image: CGImage) -> CVPixelBuffer? {
        let attributes: [CFString: Any] = [
            kCVPixelBufferCGImageCompatibilityKey: true,
            kCVPixelBufferCGBitmapContextCompatibilityKey: true,
            kCVPixelBufferWidthKey: image.width,
            kCVPixelBufferHeightKey: image.height,
        ]
        var created: CVPixelBuffer?
        let status = CVPixelBufferCreate(kCFAllocatorDefault, image.width, image.height,
                                         kCVPixelFormatType_32BGRA, attributes as CFDictionary, &created)
        guard status == kCVReturnSuccess, let pixelBuffer = created else { return nil }

        CVPixelBufferLockBaseAddress(pixelBuffer, [])
        defer { CVPixelBufferUnlockBaseAddress(pixelBuffer, []) }

        guard let context = CGContext(data: CVPixelBufferGetBaseAddress(pixelBuffer),
                                      width: image.width, height: image.height,
                                      bitsPerComponent: 8, bytesPerRow: CVPixelBufferGetBytesPerRow(pixelBuffer),
                                      space: CGColorSpaceCreateDeviceRGB(),
                                      bitmapInfo: CGImageAlphaInfo.noneSkipFirst.rawValue | CGBitmapInfo.byteOrder32Little.rawValue) else {
            return nil
        }
        context.draw(image, in: CGRect(x: 0, y: 0, width: image.width, height: image.height))
        return pixelBuffer
    }

    /// Builds the `bbox` / `confidence` / optional `pose` record for one face.
    ///
    /// - Returns: `nil` when the face fails the confidence or size filter.
    private static func baseRecord(for face: VNFaceObservation,
                                   confidence: Double,
                                   imageWidth: CGFloat,
                                   imageHeight: CGFloat) -> [String: Any]? {
        if confidence < minimumConfidence { return nil }

        let box = face.boundingBox
        let faceWidth = Int(box.size.width * imageWidth)
        let faceHeight = Int(box.size.height * imageHeight)
        if faceWidth < minimumFaceSize || faceHeight < minimumFaceSize { return nil }

        // Vision boxes are normalized with a bottom-left origin; flip Y so the
        // reported box uses a top-left origin in pixels.
        let left = Int(box.origin.x * imageWidth)
        let top = Int((1.0 - box.origin.y - box.size.height) * imageHeight)

        var record: [String: Any] = [
            "bbox": ["x": max(0, left), "y": max(0, top),
                     "width": faceWidth, "height": faceHeight],
            "confidence": confidence,
        ]

        if let yaw = face.yaw?.doubleValue, let roll = face.roll?.doubleValue {
            var pose: [String: Any] = ["roll": roll, "yaw": yaw]
            if let pitch = face.pitch?.doubleValue {
                pose["pitch"] = pitch
            }
            record["pose"] = pose
        }
        return record
    }

    /// Builds the optional `landmarks` (eyes, nose) and `lips` entries.
    ///
    /// Points are converted to pixels and Y-flipped (bottom-left -> top-left).
    private static func landmarkFields(from landmarks: VNFaceLandmarks2D, imageSize: CGSize) -> [String: Any] {
        func flippedPoints(_ region: VNFaceLandmarkRegion2D?) -> [[Double]] {
            let points = region?.pointsInImage(imageSize: imageSize) ?? []
            return points.map { [Double($0.x), Double(imageSize.height - $0.y)] }
        }

        var fields: [String: Any] = [:]

        let featureRegions: [(String, VNFaceLandmarkRegion2D?)] = [
            ("left_eye", landmarks.leftEye),
            ("right_eye", landmarks.rightEye),
            ("nose", landmarks.nose),
        ]
        var features: [String: [[Double]]] = [:]
        for (key, region) in featureRegions {
            let points = flippedPoints(region)
            if !points.isEmpty {
                features[key] = points
            }
        }
        if !features.isEmpty {
            fields["landmarks"] = features
        }

        let outerLips = flippedPoints(landmarks.outerLips)
        let innerLips = flippedPoints(landmarks.innerLips)
        if !outerLips.isEmpty || !innerLips.isEmpty {
            // Both keys are always written together, even if one is empty.
            fields["lips"] = [
                "outer_lips": outerLips,
                "inner_lips": innerLips,
            ]
        }
        return fields
    }

    /// Intersection-over-union of two rectangles; `0` when they do not overlap.
    private static func overlapRatio(_ first: CGRect, _ second: CGRect) -> CGFloat {
        let overlapWidth = min(first.maxX, second.maxX) - max(first.origin.x, second.origin.x)
        let overlapHeight = min(first.maxY, second.maxY) - max(first.origin.y, second.origin.y)
        if overlapWidth <= 0 || overlapHeight <= 0 { return 0 }

        let intersection = overlapWidth * overlapHeight
        let union = first.width * first.height + second.width * second.height - intersection
        return intersection / union
    }
}
|
||||
204
v1.1/scripts/swift_processors/swift_ocr_v1.11.swift
Normal file
204
v1.1/scripts/swift_processors/swift_ocr_v1.11.swift
Normal file
@@ -0,0 +1,204 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AVFoundation
|
||||
import AppKit
|
||||
|
||||
/// Swift OCR Processor - replaces Python PaddleOCR
/// Uses Apple Vision Framework (VNRecognizeTextRequest) with ANE acceleration
///
/// Output format (compatible with OcrResult Rust struct):
/// {
///   "frame_count": N,
///   "fps": 30.0,
///   "frames": [
///     { "frame": 0, "timestamp": 0.0, "texts": [{ "text": "...", "x": 0, "y": 0, "width": 0, "height": 0, "confidence": 0.0 }] }
///   ]
/// }
///
/// Text boxes are expressed in pixels of the image that was analysed. When
/// this tool extracts frames itself they are scaled to 320 px wide, so the
/// coordinates refer to that scaled image.
@main
struct SwiftOCR: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String

    @Argument(help: "Output JSON path")
    var outputPath: String

    @Option(name: .long, help: "Frames to skip between OCR (default=30)")
    var sampleInterval: Int = 30

    @Option(name: .long, help: "Video FPS (auto-detect if 0)")
    var fps: Double = 0

    @Option(name: .long, help: "UUID for logging")
    var uuid: String = ""

    @Option(name: .long, help: "Recognition level: fast or accurate (default=accurate)")
    var recognitionLevel: String = "accurate"

    /// Rejects a non-positive sample interval, which cannot describe a frame step.
    func validate() throws {
        guard sampleInterval > 0 else {
            throw ValidationError("--sample-interval must be greater than 0")
        }
    }

    /// Extracts (or reuses) sampled frames, runs text recognition on each and
    /// writes the JSON report.
    ///
    /// - Throws: Errors from creating the scratch directory, launching ffmpeg,
    ///   or writing the output file.
    mutating func run() throws {
        let startTime = Date()
        print("[SwiftOCR] Starting: \(inputPath)")
        print("[SwiftOCR] Sample interval: \(sampleInterval)")

        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("[SwiftOCR] Error: No video track"); return
        }

        let duration = asset.duration.seconds
        let detectedFps = fps > 0 ? fps : Double(videoTrack.nominalFrameRate)
        // Converting NaN/infinity to Int traps; fall back to 0 for the log line.
        let estimatedFrames = duration * detectedFps
        let totalFrames = abs(estimatedFrames) < Double(Int.max) ? Int(estimatedFrames) : 0
        print("[SwiftOCR] Duration: \(String(format: "%.1f", duration))s, FPS: \(String(format: "%.1f", detectedFps)), Frames: \(totalFrames)")

        let frameStep = sampleInterval

        // Scratch directory created by this run, if any. Removed on every exit
        // path (including thrown errors) so failed runs do not leak frames.
        var scratchDir: URL? = nil
        defer {
            if let scratchDir = scratchDir {
                try? FileManager.default.removeItem(at: scratchDir)
            }
        }

        // Use shared frame cache if available (set by FrameManager)
        let framesDir: URL
        if let cacheDir = ProcessInfo.processInfo.environment["MOMENTRY_FRAME_DIR"] {
            framesDir = URL(fileURLWithPath: cacheDir) // No cleanup needed (managed by FrameManager)
            print("[SwiftOCR] Using shared frame cache: \(cacheDir)")
        } else {
            let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("swift_ocr_\(UUID().uuidString)")
            scratchDir = tempDir
            framesDir = tempDir.appendingPathComponent("frames")
            try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)

            let framePattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
            print("[SwiftOCR] Extracting frames with ffmpeg (interval=\(frameStep))...")

            let extractProc = Process()
            extractProc.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
            extractProc.arguments = ["-y", "-v", "quiet", "-i", inputPath,
                                     "-vf", "select=not(mod(n\\,\(frameStep))),scale=320:-2",
                                     "-vsync", "vfr", "-q:v", "15", framePattern]
            let startExtract = Date()
            try extractProc.run()
            extractProc.waitUntilExit()
            if extractProc.terminationStatus != 0 {
                fputs("[SwiftOCR] Warning: ffmpeg exited with status \(extractProc.terminationStatus)\n", stderr)
            }
            let extractTime = Date().timeIntervalSince(startExtract)
            print("[SwiftOCR] Frame extraction complete: \(String(format: "%.1f", extractTime))s")
        }
        let extractedLocally = scratchDir != nil

        // Sort extracted frame files
        let allFiles = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
        let frameFiles = allFiles
            .filter { $0.hasPrefix("frame_") && $0.hasSuffix(".jpg") }
            .sorted()

        let level: VNRequestTextRecognitionLevel = (recognitionLevel == "fast") ? .fast : .accurate
        var ocrFrames: [[String: Any]] = []
        var lastProgress = 0
        let totalFrames_to_process = frameFiles.count

        for (i, frameName) in frameFiles.enumerated() {
            // Drain per-frame image objects promptly; this tool has no run loop.
            autoreleasepool {
                let imageURL = framesDir.appendingPathComponent(frameName)
                guard let imgData = try? Data(contentsOf: imageURL),
                      let img = NSImage(data: imgData),
                      let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
                    return
                }

                let frameNumber = SwiftOCR.frameNumber(fromFileName: frameName,
                                                       fallbackIndex: i,
                                                       step: frameStep,
                                                       isSequential: extractedLocally)
                // A zero frame rate would give an infinite timestamp, which
                // JSONSerialization cannot encode.
                let timestamp = detectedFps > 0 ? Double(frameNumber) / detectedFps : 0

                let request = SwiftOCR.makeRequest(level: level)
                let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
                do {
                    try handler.perform([request])
                } catch {
                    fputs("[SwiftOCR] Vision error on \(frameName): \(error.localizedDescription)\n", stderr)
                    return
                }
                guard let results = request.results else { return }

                let texts = SwiftOCR.textItems(from: results,
                                               minimumConfidence: 0.3,
                                               imageWidth: CGFloat(cgImage.width),
                                               imageHeight: CGFloat(cgImage.height))
                if !texts.isEmpty {
                    ocrFrames.append([
                        "frame": frameNumber,
                        "timestamp": timestamp,
                        "texts": texts
                    ])
                }

                let pct = Int(Float(i) / Float(totalFrames_to_process) * 100)
                if pct >= lastProgress + 5 {
                    print("[SwiftOCR] Progress: \(pct)% (\(i)/\(totalFrames_to_process) samples, \(ocrFrames.count) with text)")
                    lastProgress = pct
                }
            }
        }

        // Write output
        let outputDict: [String: Any] = [
            "frame_count": ocrFrames.count,
            "fps": detectedFps,
            "frames": ocrFrames
        ]

        let jsonData: Data?
        do {
            jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
        } catch {
            jsonData = nil
            fputs("[SwiftOCR] Failed to serialize JSON: \(error.localizedDescription)\n", stderr)
        }
        if let jsonData = jsonData {
            try jsonData.write(to: URL(fileURLWithPath: outputPath))
        }

        let elapsed = Date().timeIntervalSince(startTime)
        print("[SwiftOCR] Complete: \(ocrFrames.count) frames with text, \(String(format: "%.1f", elapsed))s")
        if duration > 0 {
            print("[SwiftOCR] RTF: \(String(format: "%.3f", elapsed / duration))x")
        }
    }

    /// Recognizes text in a pixel buffer and returns one dictionary per text
    /// block whose top candidate has confidence above 0.2.
    ///
    /// - Parameters:
    ///   - pixelBuffer: Image to analyse; its dimensions are used to scale boxes to pixels.
    ///   - level: Vision recognition level.
    /// - Returns: Text items, or an empty array when recognition fails.
    func recognizeText(pixelBuffer: CVPixelBuffer, level: VNRequestTextRecognitionLevel) -> [[String: Any]] {
        let request = SwiftOCR.makeRequest(level: level)
        let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
        guard (try? handler.perform([request])) != nil,
              let results = request.results else { return [] }

        return SwiftOCR.textItems(from: results,
                                  minimumConfidence: 0.2,
                                  imageWidth: CGFloat(CVPixelBufferGetWidth(pixelBuffer)),
                                  imageHeight: CGFloat(CVPixelBufferGetHeight(pixelBuffer)))
    }

    /// Creates a text-recognition request with the settings shared by every call site.
    private static func makeRequest(level: VNRequestTextRecognitionLevel) -> VNRecognizeTextRequest {
        let request = VNRecognizeTextRequest()
        request.recognitionLevel = level
        request.usesLanguageCorrection = true
        request.preferBackgroundProcessing = true
        return request
    }

    /// Converts observations into output dictionaries, keeping only those whose
    /// top candidate has confidence strictly above `minimumConfidence`.
    ///
    /// Bounding boxes are scaled to pixels and Y-flipped to a top-left origin.
    private static func textItems(from observations: [VNRecognizedTextObservation],
                                  minimumConfidence: VNConfidence,
                                  imageWidth: CGFloat,
                                  imageHeight: CGFloat) -> [[String: Any]] {
        var items: [[String: Any]] = []
        for observation in observations {
            guard let candidate = observation.topCandidates(1).first,
                  candidate.confidence > minimumConfidence else { continue }
            let box = observation.boundingBox
            items.append([
                "text": candidate.string,
                "x": Int(box.origin.x * imageWidth),
                "y": Int((1.0 - box.origin.y - box.size.height) * imageHeight),
                "width": Int(box.size.width * imageWidth),
                "height": Int(box.size.height * imageHeight),
                "confidence": candidate.confidence
            ])
        }
        return items
    }

    /// Derives the source-video frame number for a frame file.
    ///
    /// - Parameters:
    ///   - name: File name of the form `frame_<digits>.jpg`.
    ///   - fallbackIndex: Position in the sorted file list, used when the name has no number.
    ///   - step: Sampling interval in frames.
    ///   - isSequential: `true` for frames written by this tool's own ffmpeg call.
    ///     ffmpeg numbers its outputs 1, 2, 3, ... in selection order, so
    ///     sequence number `s` is source frame `(s - 1) * step`.
    ///     `false` keeps the parsed number unchanged.
    ///     NOTE(review): shared-cache naming is not visible here; confirm
    ///     against FrameManager that those names carry real frame numbers.
    private static func frameNumber(fromFileName name: String,
                                    fallbackIndex: Int,
                                    step: Int,
                                    isSequential: Bool) -> Int {
        let digits = name
            .replacingOccurrences(of: "frame_", with: "")
            .replacingOccurrences(of: ".jpg", with: "")
        guard let parsed = Int(digits) else { return fallbackIndex * step }
        return isSequential ? max(parsed - 1, 0) * step : parsed
    }
}
|
||||
222
v1.1/scripts/swift_processors/swift_pose_v1.11.swift
Normal file
222
v1.1/scripts/swift_processors/swift_pose_v1.11.swift
Normal file
@@ -0,0 +1,222 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AppKit
|
||||
import AVFoundation
|
||||
|
||||
/// Swift Pose Processor - replaces YOLOv8 Pose / MediaPipe Pose
/// Uses VNDetectHumanBodyPoseRequest with ANE acceleration
///
/// Output format (compatible with PoseResult Rust struct):
/// {
///   "frame_count": N, "fps": 30.0,
///   "frames": [
///     { "frame": 0, "timestamp": 0.0, "persons": [
///       { "keypoints": [{"name":"nose","x":100,"y":200,"confidence":0.95}],
///         "bbox": {"x":0,"y":0,"width":100,"height":200}
///       }
///     ]}
///   ]
/// }
@main
struct SwiftPose: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String

    @Argument(help: "Output JSON path")
    var outputPath: String

    @Option(name: .long, help: "Sample interval (frames, default=30)")
    var sampleInterval: Int = 30

    @Option(name: .long, help: "UUID for logging")
    var uuid: String = ""

    /// Joints queried on every detected body, in output order.
    private static let jointNames: [VNHumanBodyPoseObservation.JointName] = [
        .nose, .leftEye, .rightEye, .leftEar, .rightEar,
        .neck, .root,
        .leftShoulder, .rightShoulder,
        .leftElbow, .rightElbow,
        .leftWrist, .rightWrist,
        .leftHip, .rightHip,
        .leftKnee, .rightKnee,
        .leftAnkle, .rightAnkle,
    ]

    /// Map Vision Framework joint names to standard names.
    /// Names missing from this table are written out unchanged.
    private static let standardJointNames: [String: String] = [
        "head_joint": "nose",
        "left_eye_joint": "left_eye",
        "right_eye_joint": "right_eye",
        "left_ear_joint": "left_ear",
        "right_ear_joint": "right_ear",
        "neck_1_joint": "neck",
        "left_shoulder_1_joint": "left_shoulder",
        "right_shoulder_1_joint": "right_shoulder",
        "left_elbow_1_joint": "left_elbow",
        "right_elbow_1_joint": "right_elbow",
        "left_hand_joint": "left_wrist",
        "right_hand_joint": "right_wrist",
        "left_hip_1_joint": "left_hip",
        "right_hip_1_joint": "right_hip",
        "left_knee_1_joint": "left_knee",
        "right_knee_1_joint": "right_knee",
        "left_ankle_1_joint": "left_ankle",
        "right_ankle_1_joint": "right_ankle",
        "center_hip_joint": "root",
        "left_forearm_joint": "left_elbow",
        "right_forearm_joint": "right_elbow",
        "left_upLeg_joint": "left_hip",
        "right_upLeg_joint": "right_hip",
        "left_leg_joint": "left_knee",
        "right_leg_joint": "right_knee",
        "left_foot_joint": "left_ankle",
        "right_foot_joint": "right_ankle",
    ]

    /// Rejects a non-positive sample interval, which cannot describe a frame step.
    func validate() throws {
        guard sampleInterval > 0 else {
            throw ValidationError("--sample-interval must be greater than 0")
        }
    }

    /// Extracts (or reuses) sampled frames, runs body-pose detection on each
    /// and writes the JSON report.
    ///
    /// - Throws: Errors from creating the scratch directory, launching ffmpeg,
    ///   or writing the output file.
    mutating func run() throws {
        let startTime = Date()
        print("[SwiftPose] Starting: \(inputPath)")

        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
        guard let track = asset.tracks(withMediaType: .video).first else {
            print("[SwiftPose] Error: No video track"); return
        }
        let duration = asset.duration.seconds
        let fps = Double(track.nominalFrameRate)
        print("[SwiftPose] Duration: \(String(format: "%.1f", duration))s, FPS: \(String(format: "%.1f", fps))")

        let frameStep = sampleInterval

        // Scratch directory created by this run, if any. Removed on every exit
        // path (including thrown errors) so failed runs do not leak frames.
        var scratchDir: URL? = nil
        defer {
            if let scratchDir = scratchDir {
                try? FileManager.default.removeItem(at: scratchDir)
            }
        }

        // Extract frames (use shared cache or ffmpeg)
        let framesDir: URL
        if let cacheDir = ProcessInfo.processInfo.environment["MOMENTRY_FRAME_DIR"] {
            framesDir = URL(fileURLWithPath: cacheDir)
            print("[SwiftPose] Using shared frame cache: \(cacheDir)")
        } else {
            let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("swift_pose_\(UUID().uuidString)")
            scratchDir = tempDir
            framesDir = tempDir.appendingPathComponent("frames")
            try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)

            let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
            print("[SwiftPose] Extracting frames...")
            let extract = Process()
            extract.executableURL = URL(fileURLWithPath: "/opt/homebrew/bin/ffmpeg")
            extract.arguments = ["-y", "-v", "quiet", "-i", inputPath,
                                 "-vf", "select=not(mod(n\\,\(frameStep)))",
                                 "-vsync", "vfr", "-q:v", "15", pattern]
            try extract.run()
            extract.waitUntilExit()
            if extract.terminationStatus != 0 {
                fputs("[SwiftPose] Warning: ffmpeg exited with status \(extract.terminationStatus)\n", stderr)
            }
        }
        let extractedLocally = scratchDir != nil

        let files = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
        let frameFiles = files.filter { $0.hasSuffix(".jpg") }.sorted()
        print("[SwiftPose] Extracted \(frameFiles.count) frames")

        var poseFrames: [[String: Any]] = []
        var lastProgress = 0

        for (i, fname) in frameFiles.enumerated() {
            // Drain per-frame image objects promptly; this tool has no run loop.
            autoreleasepool {
                let imageURL = framesDir.appendingPathComponent(fname)
                guard let imgData = try? Data(contentsOf: imageURL),
                      let img = NSImage(data: imgData),
                      let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { return }

                let frameNum = SwiftPose.frameNumber(fromFileName: fname,
                                                     fallbackIndex: i,
                                                     step: frameStep,
                                                     isSequential: extractedLocally)
                // A zero frame rate would give an infinite timestamp, which
                // JSONSerialization cannot encode.
                let timestamp = fps > 0 ? Double(frameNum) / fps : 0

                let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
                let req = VNDetectHumanBodyPoseRequest()
                do {
                    try handler.perform([req])
                } catch {
                    fputs("[SwiftPose] Vision error on \(fname): \(error.localizedDescription)\n", stderr)
                    return
                }
                guard let poses = req.results, !poses.isEmpty else { return }

                let persons = poses.map { pose in
                    SwiftPose.personRecord(for: pose, imageWidth: cgImage.width, imageHeight: cgImage.height)
                }

                if !persons.isEmpty {
                    poseFrames.append([
                        "frame": frameNum,
                        "timestamp": timestamp,
                        "persons": persons,
                    ])
                }

                let pct = Int(Float(i) / Float(frameFiles.count) * 100)
                if pct >= lastProgress + 10 {
                    print("[SwiftPose] Progress: \(pct)% (\(i)/\(frameFiles.count), \(persons.count) poses)")
                    lastProgress = pct
                }
            }
        }

        // Write output
        let outputDict: [String: Any] = [
            "frame_count": poseFrames.count,
            "fps": fps,
            "frames": poseFrames,
        ]
        let jsonData: Data?
        do {
            jsonData = try JSONSerialization.data(withJSONObject: outputDict, options: [.prettyPrinted])
        } catch {
            jsonData = nil
            fputs("[SwiftPose] Failed to serialize JSON: \(error.localizedDescription)\n", stderr)
        }
        if let jsonData = jsonData {
            try jsonData.write(to: URL(fileURLWithPath: outputPath))
        }

        let elapsed = Date().timeIntervalSince(startTime)
        print("[SwiftPose] Complete: \(poseFrames.count) frames, \(String(format: "%.1f", elapsed))s")
        if duration > 0 {
            print("[SwiftPose] RTF: \(String(format: "%.3f", elapsed / duration))x")
        }
    }

    /// Builds the `keypoints` + `bbox` record for one detected body.
    ///
    /// Keypoints are converted to pixels with a top-left origin. The bounding
    /// box encloses only keypoints whose confidence exceeds 0.1 and stays
    /// all-zero when no such box with positive width exists.
    private static func personRecord(for pose: VNHumanBodyPoseObservation,
                                     imageWidth: Int,
                                     imageHeight: Int) -> [String: Any] {
        var keypoints: [[String: Any]] = []
        var minX = CGFloat.greatestFiniteMagnitude
        var minY = CGFloat.greatestFiniteMagnitude
        var maxX: CGFloat = 0
        var maxY: CGFloat = 0

        for joint in jointNames {
            guard let point = try? pose.recognizedPoint(joint) else { continue }

            // Read the underlying key string directly rather than parsing the
            // key's debug description.
            let visionName = joint.rawValue.rawValue
            let name = standardJointNames[visionName] ?? visionName

            let px = point.location.x * CGFloat(imageWidth)
            let py = CGFloat(imageHeight) - point.location.y * CGFloat(imageHeight)
            keypoints.append([
                "name": name.isEmpty ? "\(joint)" : name,
                "x": px,
                "y": py,
                "confidence": point.confidence,
            ])

            if point.confidence > 0.1 {
                minX = min(minX, px)
                minY = min(minY, py)
                maxX = max(maxX, px)
                maxY = max(maxY, py)
            }
        }

        var bbox: [String: Any] = [
            "x": 0, "y": 0, "width": 0, "height": 0
        ]
        if maxX > minX {
            bbox = [
                "x": Int(minX),
                "y": Int(minY),
                "width": Int(maxX - minX),
                "height": Int(maxY - minY),
            ]
        }

        return ["keypoints": keypoints, "bbox": bbox]
    }

    /// Derives the source-video frame number for a frame file.
    ///
    /// - Parameters:
    ///   - name: File name of the form `frame_<digits>.jpg`.
    ///   - fallbackIndex: Position in the sorted file list, used when the name has no number.
    ///   - step: Sampling interval in frames.
    ///   - isSequential: `true` for frames written by this tool's own ffmpeg call.
    ///     ffmpeg numbers its outputs 1, 2, 3, ... in selection order, so
    ///     sequence number `s` is source frame `(s - 1) * step`.
    ///     `false` keeps the parsed number unchanged.
    ///     NOTE(review): shared-cache naming is not visible here; confirm
    ///     against FrameManager that those names carry real frame numbers.
    private static func frameNumber(fromFileName name: String,
                                    fallbackIndex: Int,
                                    step: Int,
                                    isSequential: Bool) -> Int {
        let digits = name
            .replacingOccurrences(of: "frame_", with: "")
            .replacingOccurrences(of: ".jpg", with: "")
        guard let parsed = Int(digits) else { return fallbackIndex * step }
        return isSequential ? max(parsed - 1, 0) * step : parsed
    }
}
|
||||
102
v1.1/scripts/swift_processors/vision_object_test_v1.11.swift
Normal file
102
v1.1/scripts/swift_processors/vision_object_test_v1.11.swift
Normal file
@@ -0,0 +1,102 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AppKit
|
||||
|
||||
/// POC: Test Apple Vision Framework for object detection (YOLO replacement)
///
/// Loads a single image and prints the results of five Vision requests in
/// turn: scene classification, human rectangles, body pose, hand pose and
/// objectness-based saliency.
@main
struct VisionObjectTest: ParsableCommand {
    @Argument(help: "Input image path")
    var inputPath: String

    /// Runs every request against the image and prints a summary per request.
    ///
    /// - Throws: The first error raised by a Vision request; later requests are not run.
    func run() throws {
        let clockStart = Date()

        // Fixed-point formatter shared by all coordinate/confidence columns.
        func fixed(_ value: CVarArg, _ digits: Int) -> String {
            return String(format: "%.\(digits)f", value)
        }

        print("=== Apple Vision Framework Object Detection POC ===")
        #if arch(arm64)
        print("HW: Apple Silicon ✅ (ANE available)")
        #endif

        guard let loadedImage = NSImage(contentsOfFile: inputPath) else {
            print("Error: cannot load image")
            return
        }
        guard let bitmap = loadedImage.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            print("Error: cannot get CGImage")
            return
        }
        print("Image: \(bitmap.width)x\(bitmap.height)")
        let requestHandler = VNImageRequestHandler(cgImage: bitmap, options: [:])

        // 1. VNClassifyImageRequest (scene classification - replaces scene_classifier)
        print("\n--- VNClassifyImageRequest ---")
        let classifyRequest = VNClassifyImageRequest()
        try requestHandler.perform([classifyRequest])
        if let labels = classifyRequest.results {
            print("Top classifications:")
            for label in labels.prefix(10) {
                print(" \(label.identifier): conf=\(fixed(label.confidence, 3))")
            }
        }

        // 2. VNDetectHumanRectanglesRequest (person detection - YOLO replacement for 'person')
        print("\n--- VNDetectHumanRectanglesRequest ---")
        let humanRequest = VNDetectHumanRectanglesRequest()
        try requestHandler.perform([humanRequest])
        if let people = humanRequest.results {
            print("Humans: \(people.count)")
            for (index, person) in people.enumerated() {
                let box = person.boundingBox
                let origin = "(\(fixed(box.origin.x, 3)),\(fixed(box.origin.y, 3)))"
                let size = "(\(fixed(box.size.width, 3)),\(fixed(box.size.height, 3)))"
                print(" [\(index)] bbox=\(origin) size=\(size) conf=\(fixed(person.confidence, 2))")
            }
        }

        // 3. VNDetectHumanBodyPoseRequest (pose estimation - MediaPipe replacement)
        print("\n--- VNDetectHumanBodyPoseRequest ---")
        let bodyRequest = VNDetectHumanBodyPoseRequest()
        try requestHandler.perform([bodyRequest])
        if let bodies = bodyRequest.results {
            print("Body poses: \(bodies.count)")
            // Show key joints
            let keyJoints: [VNHumanBodyPoseObservation.JointName] = [
                .neck, .leftShoulder, .rightShoulder, .leftWrist, .rightWrist, .root,
            ]
            for (index, body) in bodies.enumerated() {
                let available = body.availableJointNames
                print(" [\(index)] \(available.count) joints detected")
                for joint in keyJoints {
                    guard let point = try? body.recognizedPoint(joint) else { continue }
                    let location = "(\(fixed(point.location.x, 3)), \(fixed(point.location.y, 3)))"
                    print(" \(joint.rawValue): \(location) conf=\(fixed(point.confidence, 2))")
                }
            }
        }

        // 4. VNDetectHumanHandPoseRequest (hand pose)
        print("\n--- VNDetectHumanHandPoseRequest ---")
        let handRequest = VNDetectHumanHandPoseRequest()
        try requestHandler.perform([handRequest])
        if let hands = handRequest.results {
            print("Hands: \(hands.count)")
            for (index, hand) in hands.enumerated() {
                print(" [\(index)] confidence=\(fixed(hand.confidence, 2))")
            }
        }

        // 5. VNGenerateObjectnessBasedSaliencyImageRequest (object detection without labels)
        print("\n--- VNGenerateObjectnessBasedSaliencyImageRequest ---")
        let saliencyRequest = VNGenerateObjectnessBasedSaliencyImageRequest()
        try requestHandler.perform([saliencyRequest])
        if let saliency = saliencyRequest.results?.first, let salientObjects = saliency.salientObjects {
            print("Salient objects: \(salientObjects.count)")
            for (index, object) in salientObjects.enumerated().prefix(10) {
                let box = object.boundingBox
                let rect = "(\(fixed(box.origin.x, 3)),\(fixed(box.origin.y, 3)),\(fixed(box.size.width, 3)),\(fixed(box.size.height, 3)))"
                print(" [\(index)] bbox=\(rect) conf=\(fixed(object.confidence, 2))")
            }
        }

        let totalSeconds = Date().timeIntervalSince(clockStart)
        print("\nTime: \(fixed(totalSeconds, 2))s")
        print("=== Done ===")
    }
}
|
||||
71
v1.1/scripts/swift_processors/vision_ocr_test_v1.11.swift
Normal file
71
v1.1/scripts/swift_processors/vision_ocr_test_v1.11.swift
Normal file
@@ -0,0 +1,71 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AppKit
|
||||
|
||||
/// POC: Test Apple Vision Framework OCR (VNRecognizeTextRequest) vs PaddleOCR
///
/// Loads one image, runs `VNRecognizeTextRequest` on it, and prints the
/// recognized text blocks (confidence, bounding box, text), the languages the
/// configured request supports, and the elapsed wall-clock time.
@main
struct VisionOCRTest: ParsableCommand {
    @Argument(help: "Input image path")
    var inputPath: String

    @Option(name: .long, help: "Recognition level (.fast or .accurate, default .accurate)")
    var level: String = "accurate"

    /// Rejects an unknown `--level` before any work is done.
    ///
    /// Without this check a typo (or the dotted spelling shown in the help
    /// text) would silently run at `.accurate` and skew the comparison.
    mutating func validate() throws {
        _ = try resolvedRecognitionLevel()
    }

    /// Runs OCR on `inputPath` and prints a report to stdout.
    ///
    /// - Throws: `ExitCode.failure` when the image cannot be loaded or
    ///   converted to a `CGImage`, and any error raised by
    ///   `VNImageRequestHandler.perform(_:)`.
    mutating func run() throws {
        let startTime = Date()
        print("=== Apple Vision Framework OCR POC ===")
        #if arch(arm64)
        print("HW: Apple Silicon ✅")
        #endif

        // Fail with a non-zero exit status so calling scripts can detect it.
        guard let image = NSImage(contentsOfFile: inputPath) else {
            throw failure("Error: cannot load image")
        }
        guard let cgImage = image.cgImage(forProposedRect: nil, context: nil, hints: nil) else {
            throw failure("Error: cannot get CGImage")
        }
        print("Image: \(cgImage.width)x\(cgImage.height)")
        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])

        // VNRecognizeTextRequest
        print("\n--- VNRecognizeTextRequest ---")
        let req = VNRecognizeTextRequest()
        req.recognitionLevel = try resolvedRecognitionLevel()
        req.usesLanguageCorrection = true
        req.preferBackgroundProcessing = true

        try handler.perform([req])

        guard let results = req.results else {
            print("No OCR results"); return
        }

        print("Text blocks: \(results.count)")
        // Only the first blocks are printed in full; every block is still
        // counted towards the character total.
        let maxPrintedBlocks = 20
        var totalChars = 0
        for (i, obs) in results.enumerated() {
            guard let candidate = obs.topCandidates(1).first else { continue }
            let text = candidate.string
            totalChars += text.count
            guard i < maxPrintedBlocks else { continue }
            let conf = candidate.confidence
            let bb = obs.boundingBox
            print("  [\(i)] conf=\(String(format: "%.3f", conf)) bbox=(\(String(format: "%.3f", bb.origin.x)),\(String(format: "%.3f", bb.origin.y)),\(String(format: "%.3f", bb.size.width)),\(String(format: "%.3f", bb.size.height))) \"\(text.prefix(80))\"")
        }
        print("  ... \(results.count) total, \(totalChars) chars")

        // Check language support
        print("\n--- Language Support ---")
        // Ask the configured request itself so the list matches the level that
        // was actually used. This section is informational only, so a lookup
        // failure falls back to an empty list instead of failing the run.
        let supported = (try? req.supportedRecognitionLanguages()) ?? []
        print("Supported languages (\(supported.count)): \(supported.prefix(10).joined(separator: ", "))...")

        let elapsed = Date().timeIntervalSince(startTime)
        print("\nTime: \(String(format: "%.2f", elapsed))s")
        print("=== Done ===")
    }

    /// Maps the `--level` string to a Vision recognition level.
    ///
    /// Accepts both the bare (`fast`) and dotted (`.fast`) spellings, since
    /// the option's help text advertises the dotted form. Matching is
    /// case-insensitive.
    ///
    /// - Throws: `ValidationError` for any other value.
    private func resolvedRecognitionLevel() throws -> VNRequestTextRecognitionLevel {
        switch level.lowercased() {
        case "fast", ".fast":
            return .fast
        case "accurate", ".accurate":
            return .accurate
        default:
            throw ValidationError("Invalid --level '\(level)': expected 'fast' or 'accurate'")
        }
    }

    /// Writes `message` to stderr and returns the exit code to throw.
    private func failure(_ message: String) -> ExitCode {
        FileHandle.standardError.write(Data((message + "\n").utf8))
        return .failure
    }
}
|
||||
Reference in New Issue
Block a user