release: v1.3.0 - TKG node type renaming

Changes:
- Rust: face_trace → face_track (45 occurrences in 8 files)
- Rust: gaze_trace → gaze_track, lip_trace → lip_track
- Python: tkg_builder.py unified + pipeline_checklist.py fixed
- Swift: swift_hand.swift hand state detection (empty vs holding)

Node type changes:
  face_trace    → face_track
  person_trace  → body_track
  gaze_trace    → gaze_track
  lip_trace     → lip_track
  hand_trace    → hand_track
  speaker       → speaker_segment
  object        → detected_object
  text_trace    → text_region

Migration:
  PUBLIC schema: 12970 + 892 + 305 rows updated
This commit is contained in:
Accusys
2026-06-22 07:18:21 +08:00
parent bce9435823
commit 7e548f8b08
35 changed files with 2789 additions and 481 deletions

View File

@@ -110,5 +110,13 @@ let package = Package(
path: ".",
sources: ["swift_face.swift"]
),
.executableTarget(
name: "swift_hand",
dependencies: [
.product(name: "ArgumentParser", package: "swift-argument-parser"),
],
path: ".",
sources: ["swift_hand.swift"]
),
]
)

View File

@@ -0,0 +1,299 @@
import Foundation
import Vision
import ArgumentParser
import AppKit
import AVFoundation
/// Swift Hand Pose Processor
/// Uses Apple Vision Framework VNDetectHumanHandPoseRequest for 21 hand landmarks
@main
struct SwiftHandProcessor: ParsableCommand {
@Argument(help: "Input video path")
var inputPath: String
@Argument(help: "Output JSON path")
var outputPath: String
@Option(name: [.short, .long], help: "UUID for the file")
var uuid: String = ""
@Option(name: [.short, .long], help: "Sample interval (frames)")
var sampleInterval: Int = 30
@Option(name: [.long], help: "Minimum confidence threshold")
var minConfidence: Double = 0.3
/// Runs the hand-pose pipeline for one video.
///
/// Steps:
/// 1. Reads the frame rate of the first video track of `inputPath`.
/// 2. Uses ffmpeg to dump every `sampleInterval`-th frame as JPEG into a temporary directory.
/// 3. Runs `VNDetectHumanHandPoseRequest` on each dumped frame and collects landmarks
///    (pixel coordinates, top-left origin) plus the result of `detectGesture(_:)` per hand.
/// 4. Writes the collected frames and run metadata to `outputPath` as pretty-printed JSON.
///
/// Problems with the input (no video track, non-positive sample interval, unusable frame
/// rate) are reported on stdout and the method returns without writing any output.
///
/// - Throws: Errors from creating the temporary directory, launching ffmpeg,
///   serializing the JSON, or writing the output file.
func run() throws {
    print("[SwiftHand] Starting: \(inputPath)")

    // A non-positive interval makes the ffmpeg `mod(n, interval)` selector meaningless.
    guard sampleInterval > 0 else {
        print("[SwiftHand] Error: Sample interval must be positive (got \(sampleInterval))")
        return
    }

    let url = URL(fileURLWithPath: inputPath)
    let asset = AVURLAsset(url: url)
    guard let track = asset.tracks(withMediaType: AVMediaType.video).first else {
        print("[SwiftHand] Error: No video track"); return
    }
    let duration = asset.duration.seconds
    let fps = Double(track.nominalFrameRate)
    print("[SwiftHand] Duration: \(String(format: "%.1f", duration))s, FPS: \(String(format: "%.1f", fps))")

    // Timestamps are computed as frame / fps. A zero rate would put inf/NaN into the
    // payload, and JSONSerialization does not accept non-finite numbers.
    guard fps > 0 else {
        print("[SwiftHand] Error: Invalid frame rate")
        return
    }

    // Extract frames using ffmpeg (same approach as swift_pose)
    let tempDir = FileManager.default.temporaryDirectory.appendingPathComponent("swift_hand_\(UUID().uuidString)")
    let framesDir = tempDir.appendingPathComponent("frames")
    // Registered before any throwing call so the scratch directory is removed on every exit path.
    defer { try? FileManager.default.removeItem(at: tempDir) }
    try FileManager.default.createDirectory(at: framesDir, withIntermediateDirectories: true)
    let pattern = framesDir.appendingPathComponent("frame_%05d.jpg").path

    // Prefer the original Apple-silicon Homebrew location, then fall back to other
    // common install prefixes. If none is executable, keep the original path so that
    // launching fails with the usual error.
    let ffmpegCandidates = ["/opt/homebrew/bin/ffmpeg", "/usr/local/bin/ffmpeg", "/usr/bin/ffmpeg"]
    let ffmpegPath = ffmpegCandidates.first(where: { FileManager.default.isExecutableFile(atPath: $0) })
        ?? ffmpegCandidates[0]

    print("[SwiftHand] Extracting frames...")
    let extract = Process()
    extract.executableURL = URL(fileURLWithPath: ffmpegPath)
    extract.arguments = ["-y", "-v", "quiet", "-i", inputPath,
                         "-vf", "select=not(mod(n\\,\(sampleInterval)))",
                         "-vsync", "vfr", "-q:v", "15", pattern]
    try extract.run()
    extract.waitUntilExit()
    if extract.terminationStatus != 0 {
        // Best effort: keep going with whatever frames were written, but do not hide the failure.
        print("[SwiftHand] Warning: ffmpeg exited with status \(extract.terminationStatus)")
    }

    // ffmpeg numbers its output files sequentially (frame_00001, frame_00002, ...).
    // Parse that sequence number and sort numerically so ordering does not depend on
    // the zero-padded width of the file name.
    let files = (try? FileManager.default.contentsOfDirectory(atPath: framesDir.path)) ?? []
    let frames: [(index: Int, name: String)] = files.compactMap { name in
        guard name.hasPrefix("frame_"), name.hasSuffix(".jpg"),
              let index = Int(name.dropFirst("frame_".count).dropLast(".jpg".count)) else { return nil }
        return (index: index, name: name)
    }.sorted { $0.index < $1.index }
    print("[SwiftHand] Extracted \(frames.count) frames")

    // Hand joint names (21 landmarks)
    let jointNames: [VNHumanHandPoseObservation.JointName] = [
        .wrist,
        .thumbTip, .thumbIP, .thumbMP, .thumbCMC,
        .indexTip, .indexDIP, .indexPIP, .indexMCP,
        .middleTip, .middleDIP, .middlePIP, .middleMCP,
        .ringTip, .ringDIP, .ringPIP, .ringMCP,
        .littleTip, .littleDIP, .littlePIP, .littleMCP,
    ]
    let handThreshold = Float(minConfidence)

    var handFrames: [[String: Any]] = []
    var lastDecile = 0
    for (position, frame) in frames.enumerated() {
        // Report progress on every exit from this iteration, including the `continue`
        // paths taken for unreadable frames and frames without hands.
        defer {
            let decile = (position + 1) * 10 / frames.count
            if decile > lastDecile {
                print("[SwiftHand] Progress: \(decile * 10)% (\(handFrames.count) hand frames)")
                lastDecile = decile
            }
        }

        let imgPath = framesDir.appendingPathComponent(frame.name).path
        guard let imgData = try? Data(contentsOf: URL(fileURLWithPath: imgPath)),
              let img = NSImage(data: imgData),
              let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { continue }

        // The k-th extracted file (1-based) is source frame (k - 1) * sampleInterval,
        // because the select filter keeps frames 0, N, 2N, ... of the input.
        let frameNum = (frame.index - 1) * sampleInterval
        let timestamp = Double(frameNum) / fps
        let w = cgImage.width
        let h = cgImage.height

        let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
        let req = VNDetectHumanHandPoseRequest()
        do {
            try handler.perform([req])
        } catch {
            print("[SwiftHand] Warning: Hand pose request failed for \(frame.name): \(error.localizedDescription)")
            continue
        }
        guard let hands = req.results, !hands.isEmpty else { continue }

        var persons: [[String: Any]] = []
        for (handIdx, hand) in hands.enumerated() {
            if Float(hand.confidence) < handThreshold {
                continue
            }

            var landmarks: [[String: Any]] = []
            for joint in jointNames {
                // Joints that cannot be read or are barely recognized are left out.
                guard let point = try? hand.recognizedPoint(joint) else { continue }
                let conf = Float(point.confidence)
                guard conf > 0.1 else { continue }

                // The joint's raw key is already a plain string (e.g. "VNHLKWRI").
                let name = mapJointName(joint.rawValue.rawValue)
                let px = Float(point.location.x) * Float(w)
                let py = Float(h) - Float(point.location.y) * Float(h) // Y-flip to Top-Left
                landmarks.append([
                    "name": name,
                    "x": px,
                    "y": py,
                    "confidence": conf
                ])
            }

            // Gesture detection
            let gesture = detectGesture(hand)

            // Use Vision's own left/right estimate when the OS provides one. The old
            // index-based guess remains the fallback so the value is always "left" or "right".
            let fallbackHandType = handIdx == 0 ? "left" : "right"
            let handType: String
            if #available(macOS 12.0, *) {
                switch hand.chirality {
                case .left: handType = "left"
                case .right: handType = "right"
                default: handType = fallbackHandType
                }
            } else {
                handType = fallbackHandType
            }

            persons.append([
                "person_id": handIdx,
                "hand_type": handType,
                "confidence": Float(hand.confidence),
                "landmarks": landmarks,
                "num_landmarks": landmarks.count,
                "gesture": gesture["gesture"] as? String ?? "unknown",
                "hand_state": gesture["hand_state"] as? String ?? "empty",
                "finger_extensions": gesture["finger_extensions"] as? [String: Bool] ?? [:],
                "num_fingers_extended": gesture["num_fingers_extended"] as? Int ?? 0,
                "num_fingers_curled": gesture["num_fingers_curled"] as? Int ?? 0
            ])
        }

        if !persons.isEmpty {
            handFrames.append([
                "frame": frameNum,
                "timestamp": timestamp,
                "persons": persons
            ])
        }
    }

    // Build output JSON
    let outputData: [String: Any] = [
        "frame_count": handFrames.count,
        "fps": fps,
        "frames": handFrames,
        "metadata": [
            "source": "swift_hand",
            "uuid": uuid,
            "landmarks_per_hand": 21,
            "min_confidence": minConfidence,
            "sample_interval": sampleInterval
        ]
    ]
    let jsonData = try JSONSerialization.data(withJSONObject: outputData, options: [.prettyPrinted])
    try jsonData.write(to: URL(fileURLWithPath: outputPath))
    print("[SwiftHand] Complete: \(handFrames.count) frames with hands")
    print("[SwiftHand] Output: \(outputPath)")
}
/// Maps a Vision hand-joint key to the snake_case landmark name used in the output JSON.
///
/// The keys share the prefix `VNHLK` followed by a finger letter and a joint suffix.
/// Judging by the names they map to, the finger letters are T = thumb, I = index,
/// M = middle, R = ring and P = little finger.
///
/// - Parameter rawName: Raw joint key, e.g. `"VNHLKWRI"`.
/// - Returns: The readable name for a known key; any other input is returned lowercased.
func mapJointName(_ rawName: String) -> String {
    // A switch avoids rebuilding a 21-entry dictionary on every call; this function is
    // invoked once per joint, per hand, per frame.
    switch rawName {
    case "VNHLKWRI": return "wrist"

    // Thumb. "VNHLKTIP" is T + IP (the thumb's interphalangeal joint), not the tip;
    // the tip is "VNHLKTTIP". Mapping both to "thumb_tip" produced a duplicate
    // landmark name and no "thumb_ip" entry at all.
    case "VNHLKTTIP": return "thumb_tip"
    case "VNHLKTIP": return "thumb_ip"
    case "VNHLKTMP": return "thumb_mp"
    case "VNHLKTCMC": return "thumb_cmc"

    // Index finger
    case "VNHLKITIP": return "index_tip"
    case "VNHLKIDIP": return "index_dip"
    case "VNHLKIPIP": return "index_pip"
    case "VNHLKIMCP": return "index_mcp"

    // Middle finger
    case "VNHLKMTIP": return "middle_tip"
    case "VNHLKMDIP": return "middle_dip"
    case "VNHLKMPIP": return "middle_pip"
    case "VNHLKMMCP": return "middle_mcp"

    // Ring finger
    case "VNHLKRTIP": return "ring_tip"
    case "VNHLKRDIP": return "ring_dip"
    case "VNHLKRPIP": return "ring_pip"
    case "VNHLKRMCP": return "ring_mcp"

    // Little finger
    case "VNHLKPTIP": return "little_tip"
    case "VNHLKPDIP": return "little_dip"
    case "VNHLKPPIP": return "little_pip"
    case "VNHLKPMCP": return "little_mcp"

    // Unknown keys pass through in lowercase so they remain identifiable downstream.
    default: return rawName.lowercased()
    }
}
/// Classifies a hand observation into a coarse gesture and an "empty"/"holding" state.
///
/// For the four non-thumb fingers, the tip's `location.y` is compared with the PIP joint's
/// `location.y` exactly as Vision reports them (no flip is applied in this function):
/// a larger tip y counts as extended, a smaller tip y counts as curled. The thumb counts
/// as extended when tip and CMC differ by more than 0.05 along x. A joint only takes part
/// when its confidence is above 0.3.
///
/// - Parameter hand: The hand pose observation to classify.
/// - Returns: A dictionary with the keys
///   - `"gesture"`: `String` label (`"holding_object"`, `"open_hand"`, `"fist"`,
///     `"thumbs_up"`, `"pointing"`, `"peace_sign"`, `"ok_sign"`, `"three_fingers"`,
///     `"partial_open"` or `"unknown"`),
///   - `"hand_state"`: `"holding"` or `"empty"`,
///   - `"finger_extensions"`: `[String: Bool]` per finger,
///   - `"num_fingers_extended"`: `Int` (thumb included),
///   - `"num_fingers_curled"`: `Int` (thumb excluded).
///
/// NOTE(review): for any confidently detected finger with tip y != pip y, exactly one of
/// "extended"/"curled" is true. The holding rule is evaluated first, so labels that need
/// several non-extended fingers ("fist", "thumbs_up", "pointing", "peace_sign", "ok_sign")
/// can only be produced when enough of those fingers fall below the confidence threshold.
/// Confirm whether this precedence is intended.
func detectGesture(_ hand: VNHumanHandPoseObservation) -> [String: Any] {
    // Joints at or below this confidence are treated as not observed.
    let jointThreshold: Float = 0.3

    // Reads tip and PIP once and derives both flags from the same pair of points.
    // Any lookup failure or low-confidence joint yields (false, false).
    func fingerState(tip tipJoint: VNHumanHandPoseObservation.JointName,
                     pip pipJoint: VNHumanHandPoseObservation.JointName) -> (extended: Bool, curled: Bool) {
        guard let tipPoint = try? hand.recognizedPoint(tipJoint),
              let pipPoint = try? hand.recognizedPoint(pipJoint),
              tipPoint.confidence > jointThreshold,
              pipPoint.confidence > jointThreshold else {
            return (extended: false, curled: false)
        }
        return (extended: tipPoint.location.y > pipPoint.location.y,
                curled: tipPoint.location.y < pipPoint.location.y)
    }

    // Thumb: horizontal distance between tip and CMC.
    let thumb: Bool
    if let thumbTipPoint = try? hand.recognizedPoint(.thumbTip),
       let thumbBasePoint = try? hand.recognizedPoint(.thumbCMC),
       thumbTipPoint.confidence > jointThreshold,
       thumbBasePoint.confidence > jointThreshold {
        thumb = abs(thumbTipPoint.location.x - thumbBasePoint.location.x) > 0.05
    } else {
        thumb = false
    }

    let indexState = fingerState(tip: .indexTip, pip: .indexPIP)
    let middleState = fingerState(tip: .middleTip, pip: .middlePIP)
    let ringState = fingerState(tip: .ringTip, pip: .ringPIP)
    let littleState = fingerState(tip: .littleTip, pip: .littlePIP)

    let index = indexState.extended
    let middle = middleState.extended
    let ring = ringState.extended
    let little = littleState.extended

    let extensions: [String: Bool] = [
        "thumb": thumb,
        "index": index,
        "middle": middle,
        "ring": ring,
        "little": little
    ]
    let numExtended = [thumb, index, middle, ring, little].filter { $0 }.count
    let numCurled = [indexState.curled, middleState.curled, ringState.curled, littleState.curled]
        .filter { $0 }.count

    let gesture: String
    let handState: String
    if numCurled >= 3 || (numCurled >= 2 && !thumb) {
        // Holding: three or more curled fingers, or two curled fingers without an extended thumb.
        handState = "holding"
        gesture = "holding_object"
    } else {
        // Empty hand: pick a label from how many and which fingers are extended.
        handState = "empty"
        switch numExtended {
        case 5:
            gesture = "open_hand"
        case 0:
            gesture = "fist"
        case 1 where thumb:
            gesture = "thumbs_up"
        case 1 where index:
            gesture = "pointing"
        case 2 where index && middle:
            gesture = "peace_sign"
        case 2 where thumb && index:
            gesture = "ok_sign"
        case 3 where thumb && index && middle:
            gesture = "three_fingers"
        case 3, 4:
            gesture = "partial_open"
        default:
            gesture = "unknown"
        }
    }

    return [
        "gesture": gesture,
        "hand_state": handState,
        "finger_extensions": extensions,
        "num_fingers_extended": numExtended,
        "num_fingers_curled": numCurled
    ]
}
}