release: v1.3.0 - TKG node type renaming
Changes:
- Rust: face_trace → face_track (45 occurrences in 8 files)
- Rust: gaze_trace → gaze_track, lip_trace → lip_track
- Python: tkg_builder.py unified + pipeline_checklist.py fixed
- Swift: swift_hand.swift hand state detection (empty vs holding)

Node type changes:
- face_trace → face_track
- person_trace → body_track
- gaze_trace → gaze_track
- lip_trace → lip_track
- hand_trace → hand_track
- speaker → speaker_segment
- object → detected_object
- text_trace → text_region

Migration: PUBLIC schema: 12970 + 892 + 305 rows updated
This commit is contained in:
@@ -110,5 +110,13 @@ let package = Package(
|
||||
path: ".",
|
||||
sources: ["swift_face.swift"]
|
||||
),
|
||||
.executableTarget(
|
||||
name: "swift_hand",
|
||||
dependencies: [
|
||||
.product(name: "ArgumentParser", package: "swift-argument-parser"),
|
||||
],
|
||||
path: ".",
|
||||
sources: ["swift_hand.swift"]
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
299
scripts/swift_processors/swift_hand.swift
Normal file
299
scripts/swift_processors/swift_hand.swift
Normal file
@@ -0,0 +1,299 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AppKit
|
||||
import AVFoundation
|
||||
|
||||
/// Swift Hand Pose Processor
|
||||
/// Uses Apple Vision Framework VNDetectHumanHandPoseRequest for 21 hand landmarks
|
||||
@main
|
||||
struct SwiftHandProcessor: ParsableCommand {
|
||||
@Argument(help: "Input video path")
|
||||
var inputPath: String
|
||||
|
||||
@Argument(help: "Output JSON path")
|
||||
var outputPath: String
|
||||
|
||||
@Option(name: [.short, .long], help: "UUID for the file")
|
||||
var uuid: String = ""
|
||||
|
||||
@Option(name: [.short, .long], help: "Sample interval (frames)")
|
||||
var sampleInterval: Int = 30
|
||||
|
||||
@Option(name: [.long], help: "Minimum confidence threshold")
|
||||
var minConfidence: Double = 0.3
|
||||
|
||||
/// Runs hand-pose detection on sampled frames of the input video and writes the result as JSON.
///
/// Pipeline: read the video track's nominal frame rate, extract every `sampleInterval`-th frame
/// into a temporary directory with ffmpeg, run `VNDetectHumanHandPoseRequest` on each extracted
/// frame, and serialize the frames that contain at least one accepted hand to `outputPath`.
///
/// Invalid input (no video track, unusable frame rate, non-positive sample interval) is reported
/// on stdout and the command returns without writing output.
///
/// - Throws: Errors from creating the temporary directory, launching ffmpeg,
///   JSON serialization, or writing the output file.
func run() throws {
    print("[SwiftHand] Starting: \(inputPath)")

    // The interval is interpolated into ffmpeg's `mod(n, interval)` expression and used to
    // reconstruct frame numbers, so it must be strictly positive.
    guard sampleInterval > 0 else {
        print("[SwiftHand] Error: Sample interval must be greater than 0")
        return
    }

    let url = URL(fileURLWithPath: inputPath)
    let asset = AVURLAsset(url: url)

    guard let track = asset.tracks(withMediaType: AVMediaType.video).first else {
        print("[SwiftHand] Error: No video track"); return
    }

    let duration = asset.duration.seconds
    let fps = Double(track.nominalFrameRate)

    print("[SwiftHand] Duration: \(String(format: "%.1f", duration))s, FPS: \(String(format: "%.1f", fps))")

    // A zero or non-finite frame rate would produce inf/NaN timestamps, which
    // JSONSerialization cannot encode.
    guard fps.isFinite, fps > 0 else {
        print("[SwiftHand] Error: Invalid frame rate")
        return
    }

    // Extract frames using ffmpeg (same approach as swift_pose)
    let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("swift_hand_\(UUID().uuidString)")
    let framesDir = tempDir.appendingPathComponent("frames")
    // Remove the temp directory on every exit path, including thrown errors.
    defer { try? FileManager.default.removeItem(at: tempDir) }
    try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)

    let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path
    print("[SwiftHand] Extracting frames...")
    let extract = Process()
    extract.executableURL = URL(fileURLWithPath: ffmpegExecutablePath())
    extract.arguments = ["-y", "-v", "quiet", "-i", inputPath,
                         "-vf", "select=not(mod(n\\,\(sampleInterval)))",
                         "-vsync", "vfr", "-q:v", "15", pattern]
    try extract.run()
    extract.waitUntilExit()
    if extract.terminationStatus != 0 {
        // Best-effort: keep going with whatever frames were written, but make the failure visible.
        print("[SwiftHand] Warning: ffmpeg exited with status \(extract.terminationStatus)")
    }

    let files = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
    let frameFiles = files.filter { $0.hasSuffix(".jpg") }.sorted()
    print("[SwiftHand] Extracted \(frameFiles.count) frames")

    // Hand joint names (21 landmarks)
    let jointNames: [VNHumanHandPoseObservation.JointName] = [
        .wrist,
        .thumbTip, .thumbIP, .thumbMP, .thumbCMC,
        .indexTip, .indexDIP, .indexPIP, .indexMCP,
        .middleTip, .middleDIP, .middlePIP, .middleMCP,
        .ringTip, .ringDIP, .ringPIP, .ringMCP,
        .littleTip, .littleDIP, .littlePIP, .littleMCP,
    ]

    var handFrames: [[String: Any]] = []
    var lastProgress = 0

    for (i, fname) in frameFiles.enumerated() {
        // Drain per-frame image and Vision temporaries so memory stays bounded on long videos.
        autoreleasepool {
            let imgURL = framesDir.appendingPathComponent(fname)
            guard let imgData = try? Data(contentsOf: imgURL),
                  let img = NSImage(data: imgData),
                  let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { return }

            // ffmpeg numbers the extracted images sequentially starting at 1, so the k-th file
            // is source frame (k - 1) * sampleInterval, not frame k.
            let sequence = Int(fname.replacingOccurrences(of: "frame_", with: "").replacingOccurrences(of: ".jpg", with: "")) ?? (i + 1)
            let frameNum = max(sequence - 1, 0) * sampleInterval
            let timestamp = Double(frameNum) / fps

            let persons = detectHands(in: cgImage, jointNames: jointNames)
            guard !persons.isEmpty else { return }

            handFrames.append([
                "frame": frameNum,
                "timestamp": timestamp,
                "persons": persons
            ])
        }

        // Progress reporting (runs for every frame, including frames without hands)
        let progress = (i + 1) * 100 / frameFiles.count
        if progress > lastProgress && progress % 10 == 0 {
            print("[SwiftHand] Progress: \(progress)% (\(handFrames.count) hand frames)")
            lastProgress = progress
        }
    }

    // Build output JSON
    let outputData: [String: Any] = [
        "frame_count": handFrames.count,
        "fps": fps,
        "frames": handFrames,
        "metadata": [
            "source": "swift_hand",
            "uuid": uuid,
            "landmarks_per_hand": 21,
            "min_confidence": minConfidence,
            "sample_interval": sampleInterval
        ]
    ]

    let jsonData = try JSONSerialization.data(withJSONObject: outputData, options: [.prettyPrinted])
    try jsonData.write(to: URL(fileURLWithPath: outputPath))

    print("[SwiftHand] Complete: \(handFrames.count) frames with hands")
    print("[SwiftHand] Output: \(outputPath)")
}

/// Returns the first executable ffmpeg among common install locations,
/// falling back to the Homebrew (Apple Silicon) path the tool originally hard-coded.
private func ffmpegExecutablePath() -> String {
    let candidates = ["/opt/homebrew/bin/ffmpeg", "/usr/local/bin/ffmpeg", "/usr/bin/ffmpeg"]
    let found = candidates.first(where: { FileManager.default.isExecutableFile(atPath: $0) })
    return found ?? candidates[0]
}

/// Returns "left" or "right" for a detected hand.
///
/// Uses Vision's reported chirality when it is available and known; otherwise falls back to the
/// detection index (first hand "left", any other "right"), which is only a guess.
private func handTypeLabel(for hand: VNHumanHandPoseObservation, index: Int) -> String {
    let fallback = index == 0 ? "left" : "right"
    if #available(macOS 12.0, *) {
        switch hand.chirality {
        case .left: return "left"
        case .right: return "right"
        default: return fallback
        }
    }
    return fallback
}

/// Runs hand-pose detection on one frame and returns one dictionary per accepted hand.
///
/// Hands below `minConfidence` are dropped; landmarks with confidence of 0.1 or less are omitted.
/// Landmark coordinates are converted from Vision's normalized space to pixels with a top-left origin.
private func detectHands(in cgImage: CGImage,
                         jointNames: [VNHumanHandPoseObservation.JointName]) -> [[String: Any]] {
    let w = cgImage.width
    let h = cgImage.height

    let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
    let req = VNDetectHumanHandPoseRequest()
    // Best-effort: a frame whose request fails is treated as containing no hands.
    try? handler.perform([req])

    guard let hands = req.results, !hands.isEmpty else { return [] }

    var persons: [[String: Any]] = []

    for (handIdx, hand) in hands.enumerated() {
        if Float(hand.confidence) < Float(minConfidence) {
            continue
        }

        var landmarks: [[String: Any]] = []

        for joint in jointNames {
            guard let point = try? hand.recognizedPoint(joint) else { continue }
            let conf = Float(point.confidence)
            guard conf > 0.1 else { continue }

            // The joint's key is already a plain string code (e.g. "VNHLKWRI").
            let name = mapJointName(joint.rawValue.rawValue)
            let px = Float(point.location.x) * Float(w)
            let py = Float(h) - Float(point.location.y) * Float(h)  // Y-flip to Top-Left

            landmarks.append([
                "name": name,
                "x": px,
                "y": py,
                "confidence": conf
            ])
        }

        // Gesture detection
        let gesture = detectGesture(hand)

        persons.append([
            "person_id": handIdx,
            "hand_type": handTypeLabel(for: hand, index: handIdx),
            "confidence": Float(hand.confidence),
            "landmarks": landmarks,
            "num_landmarks": landmarks.count,
            "gesture": gesture["gesture"] as? String ?? "unknown",
            "hand_state": gesture["hand_state"] as? String ?? "empty",
            "finger_extensions": gesture["finger_extensions"] as? [String: Bool] ?? [:],
            "num_fingers_extended": gesture["num_fingers_extended"] as? Int ?? 0,
            "num_fingers_curled": gesture["num_fingers_curled"] as? Int ?? 0
        ])
    }

    return persons
}
|
||||
|
||||
/// Maps a Vision hand-joint key code (e.g. "VNHLKWRI") to a readable snake_case landmark name.
///
/// - Parameter rawName: The joint's raw key string.
/// - Returns: The readable name, or `rawName` lowercased when the code is not in the table.
func mapJointName(_ rawName: String) -> String {
    let mapping: [String: String] = [
        "VNHLKWRI": "wrist",
        // Thumb: "TIP" is the interphalangeal (IP) joint; the fingertip is "TTIP".
        "VNHLKTIP": "thumb_ip",
        "VNHLKTTIP": "thumb_tip",
        "VNHLKTMP": "thumb_mp",
        "VNHLKTCMC": "thumb_cmc",
        "VNHLKITIP": "index_tip",
        "VNHLKIDIP": "index_dip",
        "VNHLKIPIP": "index_pip",
        "VNHLKIMCP": "index_mcp",
        "VNHLKMTIP": "middle_tip",
        "VNHLKMDIP": "middle_dip",
        "VNHLKMPIP": "middle_pip",
        "VNHLKMMCP": "middle_mcp",
        "VNHLKRTIP": "ring_tip",
        "VNHLKRDIP": "ring_dip",
        "VNHLKRPIP": "ring_pip",
        "VNHLKRMCP": "ring_mcp",
        "VNHLKPTIP": "little_tip",
        "VNHLKPDIP": "little_dip",
        "VNHLKPPIP": "little_pip",
        "VNHLKPMCP": "little_mcp",
    ]
    return mapping[rawName] ?? rawName.lowercased()
}
|
||||
|
||||
/// Classifies a hand pose into a gesture label and an "empty"/"holding" hand state.
///
/// Each non-thumb finger is judged by comparing its tip and PIP joints in the observation's own
/// normalized coordinates (no Y-flip is applied here): tip.y greater than pip.y counts as extended,
/// tip.y smaller counts as curled. Both points must have confidence above 0.3, otherwise the finger
/// is neither. The thumb counts as extended when its tip and CMC joints are more than 0.05 apart
/// horizontally. The holding check takes precedence over every named gesture.
///
/// - Returns: A dictionary with keys `gesture`, `hand_state`, `finger_extensions`,
///   `num_fingers_extended`, and `num_fingers_curled`.
func detectGesture(_ hand: VNHumanHandPoseObservation) -> [String: Any] {
    typealias Joint = VNHumanHandPoseObservation.JointName

    // Looks up a finger's tip and PIP once and reports both the extended and curled verdicts.
    func fingerPose(tip tipJoint: Joint, pip pipJoint: Joint) -> (extended: Bool, curled: Bool) {
        guard let tipPoint = try? hand.recognizedPoint(tipJoint),
              let pipPoint = try? hand.recognizedPoint(pipJoint),
              tipPoint.confidence > 0.3,
              pipPoint.confidence > 0.3 else {
            return (false, false)
        }
        let tipY = tipPoint.location.y
        let pipY = pipPoint.location.y
        return (tipY > pipY, tipY < pipY)
    }

    // Thumb: horizontal distance between tip and CMC.
    let thumbOut: Bool
    if let tipPoint = try? hand.recognizedPoint(.thumbTip),
       let cmcPoint = try? hand.recognizedPoint(.thumbCMC),
       tipPoint.confidence > 0.3,
       cmcPoint.confidence > 0.3 {
        thumbOut = abs(tipPoint.location.x - cmcPoint.location.x) > 0.05
    } else {
        thumbOut = false
    }

    let indexPose = fingerPose(tip: .indexTip, pip: .indexPIP)
    let middlePose = fingerPose(tip: .middleTip, pip: .middlePIP)
    let ringPose = fingerPose(tip: .ringTip, pip: .ringPIP)
    let littlePose = fingerPose(tip: .littleTip, pip: .littlePIP)

    let indexOut = indexPose.extended
    let middleOut = middlePose.extended
    let ringOut = ringPose.extended
    let littleOut = littlePose.extended

    let extendedCount = [thumbOut, indexOut, middleOut, ringOut, littleOut].filter { $0 }.count
    // The thumb is not part of the curled count.
    let curledCount = [indexPose, middlePose, ringPose, littlePose].filter { $0.curled }.count

    // Holding: three or more fingers curled, or two curled with the thumb not extended.
    let isGripping = curledCount >= 3 || (curledCount >= 2 && !thumbOut)

    // Names the gesture for a hand that is not gripping; checks run in priority order.
    func emptyHandGesture() -> String {
        if extendedCount == 5 { return "open_hand" }
        if extendedCount == 0 { return "fist" }
        if extendedCount == 1 && thumbOut { return "thumbs_up" }
        if extendedCount == 1 && indexOut { return "pointing" }
        if extendedCount == 2 && indexOut && middleOut { return "peace_sign" }
        if thumbOut && indexOut && !ringOut && !littleOut {
            return middleOut ? "three_fingers" : "ok_sign"
        }
        return extendedCount >= 3 ? "partial_open" : "unknown"
    }

    let gestureLabel = isGripping ? "holding_object" : emptyHandGesture()
    let stateLabel = isGripping ? "holding" : "empty"

    let fingerFlags: [String: Bool] = [
        "thumb": thumbOut,
        "index": indexOut,
        "middle": middleOut,
        "ring": ringOut,
        "little": littleOut
    ]

    return [
        "gesture": gestureLabel,
        "hand_state": stateLabel,
        "finger_extensions": fingerFlags,
        "num_fingers_extended": extendedCount,
        "num_fingers_curled": curledCount
    ]
}
|
||||
}
|
||||
Reference in New Issue
Block a user