feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)
Phase 2.6.1: co_occurrence_edges migration
- build_co_occurrence_edges_from_qdrant()
- Qdrant embeddings → frame grouping → YOLO objects
- Result: 6679 edges (vs 6701 PostgreSQL)

Phase 2.6.2: face_face_edges migration
- build_face_face_edges_from_qdrant()
- Qdrant embeddings → frame grouping → face pairs
- mutual_gaze detection preserved
- Result: 6 edges (exact match)

Phase 2.6.3: speaker_face_edges migration
- build_speaker_face_edges_from_qdrant()
- Qdrant embeddings → trace_id frame ranges
- SPEAKS_AS edge creation

Architecture:
- All edges use Qdrant payload (no face_detections queries)
- PostgreSQL fallback for empty Qdrant
- Estimated 3.6x performance improvement

Testing:
- Playground (3003): ✓ All Phase 2.6 logs verified
- Edge counts: ✓ Close match with PostgreSQL
- Fallback: ✓ Working

Docs:
- docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md
- docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
337
scripts/swift_processors/swift_face_pose.swift
Normal file
337
scripts/swift_processors/swift_face_pose.swift
Normal file
@@ -0,0 +1,337 @@
|
||||
import Foundation
|
||||
import Vision
|
||||
import ArgumentParser
|
||||
import AVFoundation
|
||||
|
||||
/// Swift Face+Pose Processor - one pass, two outputs.
///
/// Runs `VNDetectFaceRectanglesRequest`, `VNDetectFaceLandmarksRequest`
/// and `VNDetectHumanBodyPoseRequest` on each sampled frame, then writes one
/// JSON file with the face results and one with the body-pose results.
/// Uses AVAssetReader sequential read (frame-based), matching cv2 behavior.
@main
struct SwiftFacePose: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String

    @Argument(help: "Output JSON path for face detection")
    var faceOutput: String

    @Argument(help: "Output JSON path for pose detection")
    var poseOutput: String

    @Option(name: .long, help: "Sample interval (frames, default=30)")
    var sampleInterval: Int = 30

    @Option(name: .long, help: "UUID for logging")
    var uuid: String = ""

    // MARK: - Tunables

    /// Faces whose confidence is below this value are dropped.
    private static let minFaceConfidence = 0.6

    /// Faces narrower or shorter than this many pixels are dropped.
    private static let minFaceSize = 20

    /// A face rectangle overlapping a landmark observation by more than this
    /// IoU is treated as the same face and not emitted a second time.
    private static let duplicateFaceIoU: CGFloat = 0.3

    /// Joints at or below this confidence do not contribute to a person's bbox.
    private static let minJointConfidence: VNConfidence = 0.1

    /// Body joints requested from every pose observation, in output order.
    private static let bodyJoints: [VNHumanBodyPoseObservation.JointName] = [
        .nose, .leftEye, .rightEye, .leftEar, .rightEar,
        .neck, .root,
        .leftShoulder, .rightShoulder,
        .leftElbow, .rightElbow,
        .leftWrist, .rightWrist,
        .leftHip, .rightHip,
        .leftKnee, .rightKnee,
        .leftAnkle, .rightAnkle,
    ]

    /// Raw Vision joint key → name written to the pose JSON.
    /// A key that is not listed here is written out unchanged.
    private static let jointDisplayNames: [String: String] = [
        "head_joint": "nose",
        "left_eye_joint": "left_eye",
        "right_eye_joint": "right_eye",
        "left_ear_joint": "left_ear",
        "right_ear_joint": "right_ear",
        "neck_1_joint": "neck",
        "left_shoulder_1_joint": "left_shoulder",
        "right_shoulder_1_joint": "right_shoulder",
        "left_elbow_1_joint": "left_elbow",
        "right_elbow_1_joint": "right_elbow",
        "left_hand_joint": "left_wrist",
        "right_hand_joint": "right_wrist",
        "left_hip_1_joint": "left_hip",
        "right_hip_1_joint": "right_hip",
        "left_knee_1_joint": "left_knee",
        "right_knee_1_joint": "right_knee",
        "left_ankle_1_joint": "left_ankle",
        "right_ankle_1_joint": "right_ankle",
        "center_hip_joint": "root",
    ]

    // MARK: - Command

    /// Rejects a non-positive sample interval, which would otherwise trap on
    /// the `frameIndex % sampleInterval` sampling check.
    func validate() throws {
        guard sampleInterval > 0 else {
            throw ValidationError("--sample-interval must be a positive number of frames (got \(sampleInterval))")
        }
    }

    /// Reads the video sequentially, analyzes every `sampleInterval`-th frame
    /// and writes the face and pose JSON files.
    ///
    /// - Throws: Errors from creating the `AVAssetReader`, from JSON
    ///   serialization, or from writing either output file.
    mutating func run() throws {
        let startTime = Date()
        print("[SwiftFacePose] Vision face+pose detection: \(inputPath)")

        let url = URL(fileURLWithPath: inputPath)
        let asset = AVAsset(url: url)

        // NOTE(review): this path and the reader-start failure below print a
        // message and return normally (exit status 0, no output files). Kept
        // as-is for existing callers; confirm whether a non-zero exit is wanted.
        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("[SwiftFacePose] No video track found")
            return
        }

        let fps = videoTrack.nominalFrameRate
        let duration = CMTimeGetSeconds(asset.duration)
        // Int(_:) traps on NaN/infinity, so only convert a finite estimate.
        let estimatedFrames = duration * Double(fps)
        let totalFrames = estimatedFrames.isFinite ? Int(max(0, estimatedFrames)) : 0
        print("[SwiftFacePose] Video: \(Int(videoTrack.naturalSize.width))x\(Int(videoTrack.naturalSize.height)), \(String(format: "%.1f", fps))fps, \(totalFrames) frames, interval=\(sampleInterval)")

        // read sequentially, matching cv2 frame-by-frame behavior
        let reader = try AVAssetReader(asset: asset)
        let outputSettings: [String: Any] = [
            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA
        ]
        let trackOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: outputSettings)
        trackOutput.alwaysCopiesSampleData = false
        reader.add(trackOutput)
        guard reader.startReading() else {
            print("[SwiftFacePose] Failed to start AVAssetReader: \(reader.error?.localizedDescription ?? "unknown")")
            return
        }

        let interval = sampleInterval
        // Frames 0, interval, 2*interval, ... are sampled: ceil(totalFrames / interval).
        let expectedSamples = max(1, (totalFrames + interval - 1) / interval)

        var faceFrames: [[String: Any]] = []
        var poseFrames: [[String: Any]] = []
        var processedCount = 0
        var frameIndex = 0

        var hasMoreFrames = true
        while hasMoreFrames {
            // A command-line tool has no run loop draining the autorelease
            // pool, so drain it per frame to keep memory flat on long videos.
            hasMoreFrames = autoreleasepool { () -> Bool in
                guard let sampleBuffer = trackOutput.copyNextSampleBuffer() else {
                    return false
                }
                defer { frameIndex += 1 }

                guard frameIndex % interval == 0 else { return true }

                // Every sampled frame counts towards progress, whether or not
                // anything was detected in it.
                defer {
                    processedCount += 1
                    if processedCount % 100 == 0 {
                        let elapsed = Date().timeIntervalSince(startTime)
                        // totalFrames is only an estimate, so clamp at 100.
                        let pct = min(100, processedCount * 100 / expectedSamples)
                        print("[SwiftFacePose] \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(pct)% complete, \(Int(elapsed))s elapsed")
                        fflush(stdout)
                    }
                }

                guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
                    return true
                }

                let imageSize = CGSize(
                    width: CGFloat(CVPixelBufferGetWidth(pixelBuffer)),
                    height: CGFloat(CVPixelBufferGetHeight(pixelBuffer))
                )
                let seconds = Self.timestamp(ofFrame: frameIndex, fps: fps, sampleBuffer: sampleBuffer)

                let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
                let faceReq = VNDetectFaceRectanglesRequest()
                let lmReq = VNDetectFaceLandmarksRequest()
                let bodyReq = VNDetectHumanBodyPoseRequest()

                do {
                    try handler.perform([faceReq, lmReq, bodyReq])
                } catch {
                    Self.warn("[SwiftFacePose] Vision request failed on frame \(frameIndex): \(error.localizedDescription)")
                    return true
                }

                // ── Face output ──
                let faces = Self.faceEntries(
                    rectangles: faceReq.results ?? [],
                    landmarks: lmReq.results ?? [],
                    imageSize: imageSize
                )
                if !faces.isEmpty {
                    faceFrames.append([
                        "frame": frameIndex,
                        "timestamp": seconds,
                        "faces": faces,
                    ])
                }

                // ── Pose output ──
                let persons = (bodyReq.results ?? []).map {
                    Self.personEntry(for: $0, imageSize: imageSize)
                }
                if !persons.isEmpty {
                    poseFrames.append([
                        "frame": frameIndex,
                        "timestamp": seconds,
                        "persons": persons,
                    ])
                }

                return true
            }
        }

        // copyNextSampleBuffer() also returns nil when reading fails part-way.
        if reader.status == .failed {
            Self.warn("[SwiftFacePose] AVAssetReader stopped early: \(reader.error?.localizedDescription ?? "unknown")")
        }
        reader.cancelReading()

        let faceOutputDict: [String: Any] = [
            "frame_count": faceFrames.count,
            "fps": Double(fps),
            "frames": faceFrames,
        ]
        try Self.writeJSON(faceOutputDict, to: faceOutput, options: [])

        let poseOutputDict: [String: Any] = [
            "frame_count": poseFrames.count,
            "fps": Double(fps),
            "frames": poseFrames,
        ]
        try Self.writeJSON(poseOutputDict, to: poseOutput, options: [.prettyPrinted])

        let elapsed = Date().timeIntervalSince(startTime)
        print("[SwiftFacePose] Done: \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(String(format: "%.1f", elapsed))s")
    }

    // MARK: - Face helpers

    /// Builds the `faces` array for one frame.
    ///
    /// Landmark observations are emitted first (with landmarks and lips when
    /// present). Rectangle observations are then added only when they do not
    /// overlap any landmark observation, so a face is not reported twice.
    private static func faceEntries(
        rectangles: [VNFaceObservation],
        landmarks: [VNFaceObservation],
        imageSize: CGSize
    ) -> [[String: Any]] {
        var faces: [[String: Any]] = []

        for observation in landmarks {
            guard var entry = baseFaceEntry(
                for: observation,
                confidence: Double(observation.confidence),
                imageSize: imageSize
            ) else { continue }

            if let regions = observation.landmarks {
                let eyeAndNoseRegions: [(String, VNFaceLandmarkRegion2D?)] = [
                    ("left_eye", regions.leftEye),
                    ("right_eye", regions.rightEye),
                    ("nose", regions.nose),
                ]
                var points: [String: [[Double]]] = [:]
                for (key, region) in eyeAndNoseRegions {
                    let converted = topLeftPoints(of: region, imageSize: imageSize)
                    if !converted.isEmpty {
                        points[key] = converted
                    }
                }
                if !points.isEmpty {
                    entry["landmarks"] = points
                }

                let outer = topLeftPoints(of: regions.outerLips, imageSize: imageSize)
                let inner = topLeftPoints(of: regions.innerLips, imageSize: imageSize)
                if !outer.isEmpty || !inner.isEmpty {
                    entry["lips"] = [
                        "outer_lips": outer,
                        "inner_lips": inner,
                    ]
                }
            }

            faces.append(entry)
        }

        for observation in rectangles {
            // Compared against every landmark observation, including ones that
            // were filtered out above by confidence or size.
            let isDuplicate = landmarks.contains {
                intersectionOverUnion(observation.boundingBox, $0.boundingBox) > duplicateFaceIoU
            }
            if isDuplicate { continue }

            let confidence = Double(observation.faceCaptureQuality ?? observation.confidence)
            if let entry = baseFaceEntry(for: observation, confidence: confidence, imageSize: imageSize) {
                faces.append(entry)
            }
        }

        return faces
    }

    /// Returns the bbox/confidence/pose dictionary for a face, or `nil` when the
    /// face is below the confidence or pixel-size thresholds.
    ///
    /// The bounding box is converted from Vision's normalized, bottom-left
    /// origin rectangle to integer pixels with a top-left origin.
    private static func baseFaceEntry(
        for observation: VNFaceObservation,
        confidence: Double,
        imageSize: CGSize
    ) -> [String: Any]? {
        if confidence < minFaceConfidence { return nil }

        let box = observation.boundingBox
        let width = Int(box.size.width * imageSize.width)
        let height = Int(box.size.height * imageSize.height)
        if width < minFaceSize || height < minFaceSize { return nil }

        let x = Int(box.origin.x * imageSize.width)
        let y = Int((1.0 - box.origin.y - box.size.height) * imageSize.height)

        var entry: [String: Any] = [
            "bbox": ["x": max(0, x), "y": max(0, y), "width": width, "height": height],
            "confidence": confidence,
        ]

        // Pose is only reported when both yaw and roll are available; pitch is optional.
        if let yaw = observation.yaw?.doubleValue,
           let roll = observation.roll?.doubleValue {
            var pose: [String: Any] = ["roll": roll, "yaw": yaw]
            if let pitch = observation.pitch?.doubleValue {
                pose["pitch"] = pitch
            }
            entry["pose"] = pose
        }

        return entry
    }

    /// Converts a landmark region to `[x, y]` pixel pairs with the y axis
    /// flipped (`imageHeight - y`). Returns an empty array for a missing region.
    private static func topLeftPoints(
        of region: VNFaceLandmarkRegion2D?,
        imageSize: CGSize
    ) -> [[Double]] {
        let points = region?.pointsInImage(imageSize: imageSize) ?? []
        return points.map { [Double($0.x), Double(imageSize.height - $0.y)] }
    }

    /// Intersection-over-union of two rectangles; 0 when they do not overlap.
    private static func intersectionOverUnion(_ a: CGRect, _ b: CGRect) -> CGFloat {
        let left = max(a.origin.x, b.origin.x)
        let bottom = max(a.origin.y, b.origin.y)
        let overlapWidth = min(a.maxX, b.maxX) - left
        let overlapHeight = min(a.maxY, b.maxY) - bottom
        if overlapWidth <= 0 || overlapHeight <= 0 { return 0 }

        let intersection = overlapWidth * overlapHeight
        let union = a.width * a.height + b.width * b.height - intersection
        return union > 0 ? intersection / union : 0
    }

    // MARK: - Pose helpers

    /// Builds the keypoints + bbox dictionary for one detected person.
    ///
    /// Keypoints are converted to pixels with a top-left origin. The bbox spans
    /// the joints whose confidence exceeds `minJointConfidence`, and stays
    /// all-zero when those joints do not span a positive width.
    private static func personEntry(
        for pose: VNHumanBodyPoseObservation,
        imageSize: CGSize
    ) -> [String: Any] {
        var keypoints: [[String: Any]] = []
        var minX = CGFloat.greatestFiniteMagnitude
        var minY = CGFloat.greatestFiniteMagnitude
        var maxX: CGFloat = 0
        var maxY: CGFloat = 0

        for joint in bodyJoints {
            guard let point = try? pose.recognizedPoint(joint) else { continue }

            // JointName wraps a VNRecognizedPointKey, which wraps the raw string key.
            let rawName = joint.rawValue.rawValue
            let name = jointDisplayNames[rawName] ?? rawName

            let px = point.location.x * imageSize.width
            let py = imageSize.height - point.location.y * imageSize.height
            keypoints.append([
                "name": name.isEmpty ? "\(joint)" : name,
                "x": px,
                "y": py,
                "confidence": point.confidence,
            ])

            if point.confidence > minJointConfidence {
                minX = min(minX, px)
                minY = min(minY, py)
                maxX = max(maxX, px)
                maxY = max(maxY, py)
            }
        }

        var bbox: [String: Any] = ["x": 0, "y": 0, "width": 0, "height": 0]
        if maxX > minX {
            bbox = [
                "x": Int(minX),
                "y": Int(minY),
                "width": Int(maxX - minX),
                "height": Int(maxY - minY),
            ]
        }

        return ["keypoints": keypoints, "bbox": bbox]
    }

    // MARK: - Utilities

    /// Timestamp in seconds for a frame index (`index / fps`).
    ///
    /// When the track reports no usable frame rate, the division would yield
    /// inf/NaN, which JSON cannot represent; the sample's presentation
    /// timestamp is used instead (0 if that is not finite either).
    private static func timestamp(
        ofFrame index: Int,
        fps: Float,
        sampleBuffer: CMSampleBuffer
    ) -> Double {
        if fps > 0 {
            return Double(index) / Double(fps)
        }
        let presentation = CMTimeGetSeconds(CMSampleBufferGetPresentationTimeStamp(sampleBuffer))
        return presentation.isFinite ? presentation : 0
    }

    /// Serializes `object` and writes it to `path`, propagating any failure
    /// instead of silently skipping the file.
    private static func writeJSON(
        _ object: [String: Any],
        to path: String,
        options: JSONSerialization.WritingOptions
    ) throws {
        let data = try JSONSerialization.data(withJSONObject: object, options: options)
        try data.write(to: URL(fileURLWithPath: path))
    }

    /// Writes a diagnostic line to stderr so it does not mix with stdout progress logs.
    private static func warn(_ message: String) {
        FileHandle.standardError.write(Data((message + "\n").utf8))
    }
}
|
||||
Reference in New Issue
Block a user