feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)

Phase 2.6.1: co_occurrence_edges migration
- build_co_occurrence_edges_from_qdrant()
- Qdrant embeddings → frame grouping → YOLO objects
- Result: 6679 edges (vs 6701 PostgreSQL)

Phase 2.6.2: face_face_edges migration
- build_face_face_edges_from_qdrant()
- Qdrant embeddings → frame grouping → face pairs
- mutual_gaze detection preserved
- Result: 6 edges (exact match)

Phase 2.6.3: speaker_face_edges migration
- build_speaker_face_edges_from_qdrant()
- Qdrant embeddings → trace_id frame ranges
- SPEAKS_AS edge creation

Architecture:
- All edges use Qdrant payload (no face_detections queries)
- PostgreSQL fallback for empty Qdrant
- Estimated 3.6x performance improvement

Testing:
- Playground (3003): ✓ All Phase 2.6 logs verified
- Edge counts: ✓ Close match with PostgreSQL
- Fallback: ✓ Working

Docs:
- docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md
- docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
2026-06-21 04:47:49 +08:00
parent 0afc70fc5b
commit 2cfcfdd1af
2926 changed files with 8311058 additions and 1394 deletions
--- a/scripts/swift_processors/swift_face_pose.swift
+++ b/scripts/swift_processors/swift_face_pose.swift
@@ -0,0 +1,337 @@
+import Foundation
+import Vision
+import ArgumentParser
+import AVFoundation
+
+/// Swift Face+Pose Processor - one pass, two outputs.
+///
+/// Runs VNDetectFaceRectanglesRequest, VNDetectFaceLandmarksRequest,
+/// and VNDetectHumanBodyPoseRequest on each sampled frame and writes one
+/// JSON file per modality (faces, body poses).
+/// Uses AVAssetReader sequential read (frame-based), matching cv2 behavior.
+@main
+struct SwiftFacePose: ParsableCommand {
+    @Argument(help: "Video file path")
+    var inputPath: String
+
+    @Argument(help: "Output JSON path for face detection")
+    var faceOutput: String
+
+    @Argument(help: "Output JSON path for pose detection")
+    var poseOutput: String
+
+    @Option(name: .long, help: "Sample interval (frames, default=30)")
+    var sampleInterval: Int = 30
+
+    @Option(name: .long, help: "UUID for logging")
+    var uuid: String = ""
+
+    /// Face observations with a confidence below this value are dropped.
+    private static let minFaceConfidence = 0.6
+    /// Faces narrower or shorter than this many pixels are dropped.
+    private static let minFaceSize = 20
+    /// A rectangle-only face whose IoU with any landmark face exceeds this is a duplicate.
+    private static let duplicateIoU: CGFloat = 0.3
+    /// Keypoints must exceed this confidence to contribute to a person's bbox.
+    private static let minKeypointConfidence: VNConfidence = 0.1
+
+    /// Exported body joints, in output order, paired with their JSON names.
+    /// Pairing the JointName constants directly with the output names avoids
+    /// parsing Vision's raw key strings, which are not a documented contract.
+    private static let exportedJoints: [(VNHumanBodyPoseObservation.JointName, String)] = [
+        (.nose, "nose"), (.leftEye, "left_eye"), (.rightEye, "right_eye"),
+        (.leftEar, "left_ear"), (.rightEar, "right_ear"),
+        (.neck, "neck"), (.root, "root"),
+        (.leftShoulder, "left_shoulder"), (.rightShoulder, "right_shoulder"),
+        (.leftElbow, "left_elbow"), (.rightElbow, "right_elbow"),
+        (.leftWrist, "left_wrist"), (.rightWrist, "right_wrist"),
+        (.leftHip, "left_hip"), (.rightHip, "right_hip"),
+        (.leftKnee, "left_knee"), (.rightKnee, "right_knee"),
+        (.leftAnkle, "left_ankle"), (.rightAnkle, "right_ankle"),
+    ]
+
+    /// Rejects a non-positive --sample-interval, which would otherwise trap
+    /// on `frameIndex % sampleInterval` (division by zero).
+    func validate() throws {
+        guard sampleInterval >= 1 else {
+            throw ValidationError("--sample-interval must be >= 1 (got \(sampleInterval))")
+        }
+    }
+
+    /// Decodes the video sequentially, runs the three Vision requests on every
+    /// `sampleInterval`-th frame, and writes the face and pose JSON files.
+    ///
+    /// Logs and returns without writing output when there is no video track,
+    /// the nominal frame rate is not positive, or the reader cannot start.
+    /// - Throws: Errors from creating the AVAssetReader, serializing JSON,
+    ///   or writing either output file.
+    mutating func run() throws {
+        let startTime = Date()
+        print("[SwiftFacePose] Vision face+pose detection: \(inputPath)")
+
+        let url = URL(fileURLWithPath: inputPath)
+        let asset = AVAsset(url: url)
+
+        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
+            print("[SwiftFacePose] No video track found")
+            return
+        }
+
+        let fps = videoTrack.nominalFrameRate
+        // Timestamps are frameIndex / fps; a zero rate would produce inf/NaN,
+        // which JSONSerialization cannot encode.
+        guard fps > 0 else {
+            print("[SwiftFacePose] Invalid nominal frame rate: \(fps)")
+            return
+        }
+        // CMTimeGetSeconds can return NaN (e.g. indefinite duration); Int(NaN) traps.
+        let duration = CMTimeGetSeconds(asset.duration)
+        let totalFrames = duration.isFinite ? max(0, Int(duration * Double(fps))) : 0
+        print("[SwiftFacePose] Video: \(Int(videoTrack.naturalSize.width))x\(Int(videoTrack.naturalSize.height)), \(String(format: "%.1f", fps))fps, \(totalFrames) frames, interval=\(sampleInterval)")
+
+        // read sequentially, matching cv2 frame-by-frame behavior
+        let reader = try AVAssetReader(asset: asset)
+        let outputSettings: [String: Any] = [
+            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA
+        ]
+        let trackOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: outputSettings)
+        trackOutput.alwaysCopiesSampleData = false
+        reader.add(trackOutput)
+        guard reader.startReading() else {
+            print("[SwiftFacePose] Failed to start AVAssetReader: \(reader.error?.localizedDescription ?? "unknown")")
+            return
+        }
+
+        let interval = sampleInterval
+        let totalSamples = totalFrames / interval
+        var faceFrames: [[String: Any]] = []
+        var poseFrames: [[String: Any]] = []
+        var processedCount = 0
+        var frameIndex = 0
+        var reachedEnd = false
+
+        while !reachedEnd {
+            // Nothing drains the autorelease pool between frames in a command-line
+            // tool, so drain it per frame to keep memory bounded on long videos.
+            autoreleasepool {
+                guard let sampleBuffer = trackOutput.copyNextSampleBuffer() else {
+                    reachedEnd = true
+                    return
+                }
+                defer { frameIndex += 1 }
+
+                guard frameIndex % interval == 0,
+                      let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else {
+                    return
+                }
+
+                let imgW = CGFloat(CVPixelBufferGetWidth(pixelBuffer))
+                let imgH = CGFloat(CVPixelBufferGetHeight(pixelBuffer))
+                let seconds = Double(frameIndex) / Double(fps)
+
+                let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
+                let faceReq = VNDetectFaceRectanglesRequest()
+                let lmReq = VNDetectFaceLandmarksRequest()
+                let bodyReq = VNDetectHumanBodyPoseRequest()
+
+                do {
+                    try handler.perform([faceReq, lmReq, bodyReq])
+                } catch {
+                    // Best effort: a frame Vision cannot process is skipped.
+                    return
+                }
+
+                // ── Face output ──
+                let faces = Self.faceEntries(
+                    landmarkObservations: lmReq.results ?? [],
+                    rectangleObservations: faceReq.results ?? [],
+                    imgW: imgW, imgH: imgH)
+                if !faces.isEmpty {
+                    faceFrames.append(["frame": frameIndex, "timestamp": seconds, "faces": faces])
+                }
+
+                // ── Pose output ──
+                let persons = Self.personEntries(poses: bodyReq.results ?? [], imgW: imgW, imgH: imgH)
+                if !persons.isEmpty {
+                    poseFrames.append(["frame": frameIndex, "timestamp": seconds, "persons": persons])
+                }
+
+                // Count every analyzed frame (not only frames containing a
+                // person) so the progress percentage reflects real progress.
+                processedCount += 1
+                if processedCount % 100 == 0 {
+                    let elapsed = Date().timeIntervalSince(startTime)
+                    let pct = totalSamples > 0 ? processedCount * 100 / totalSamples : 0
+                    print("[SwiftFacePose] \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(pct)% complete, \(Int(elapsed))s elapsed")
+                    fflush(stdout)
+                }
+            }
+        }
+
+        if reader.status == .failed {
+            print("[SwiftFacePose] Reader failed before end of video: \(reader.error?.localizedDescription ?? "unknown")")
+        }
+        reader.cancelReading()
+
+        // Serialization errors propagate instead of silently skipping the file.
+        try Self.writeJSON(
+            ["frame_count": faceFrames.count, "fps": Double(fps), "frames": faceFrames],
+            to: faceOutput, options: [])
+        try Self.writeJSON(
+            ["frame_count": poseFrames.count, "fps": Double(fps), "frames": poseFrames],
+            to: poseOutput, options: [.prettyPrinted])
+
+        let elapsed = Date().timeIntervalSince(startTime)
+        print("[SwiftFacePose] Done: \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(String(format: "%.1f", elapsed))s")
+    }
+
+    /// Builds the per-frame `faces` array: landmark-based faces first, then
+    /// rectangle-only faces that do not overlap any landmark observation.
+    private static func faceEntries(
+        landmarkObservations: [VNFaceObservation],
+        rectangleObservations: [VNFaceObservation],
+        imgW: CGFloat, imgH: CGFloat
+    ) -> [[String: Any]] {
+        var faces: [[String: Any]] = []
+
+        for lmObs in landmarkObservations {
+            guard var faceData = baseFaceEntry(
+                for: lmObs, confidence: Double(lmObs.confidence), imgW: imgW, imgH: imgH
+            ) else { continue }
+
+            if let lms = lmObs.landmarks {
+                let imgSize = CGSize(width: imgW, height: imgH)
+                let leftEye = lms.leftEye?.pointsInImage(imageSize: imgSize) ?? []
+                let rightEye = lms.rightEye?.pointsInImage(imageSize: imgSize) ?? []
+                let nose = lms.nose?.pointsInImage(imageSize: imgSize) ?? []
+
+                var lm: [String: [[Double]]] = [:]
+                if !leftEye.isEmpty { lm["left_eye"] = topLeftPoints(leftEye, imgH: imgH) }
+                if !rightEye.isEmpty { lm["right_eye"] = topLeftPoints(rightEye, imgH: imgH) }
+                if !nose.isEmpty { lm["nose"] = topLeftPoints(nose, imgH: imgH) }
+                if !lm.isEmpty { faceData["landmarks"] = lm }
+
+                let outer = lms.outerLips?.pointsInImage(imageSize: imgSize) ?? []
+                let inner = lms.innerLips?.pointsInImage(imageSize: imgSize) ?? []
+                if !outer.isEmpty || !inner.isEmpty {
+                    faceData["lips"] = [
+                        "outer_lips": topLeftPoints(outer, imgH: imgH),
+                        "inner_lips": topLeftPoints(inner, imgH: imgH),
+                    ]
+                }
+            }
+
+            faces.append(faceData)
+        }
+
+        for faceObs in rectangleObservations {
+            // Skip rectangles already represented by a landmark observation.
+            let fBB = faceObs.boundingBox
+            let isDuplicate = landmarkObservations.contains {
+                intersectionOverUnion(fBB, $0.boundingBox) > duplicateIoU
+            }
+            if isDuplicate { continue }
+
+            let faceConf = Double(faceObs.faceCaptureQuality ?? faceObs.confidence)
+            if let faceData = baseFaceEntry(for: faceObs, confidence: faceConf, imgW: imgW, imgH: imgH) {
+                faces.append(faceData)
+            }
+        }
+
+        return faces
+    }
+
+    /// Builds the fields common to both face sources: bbox in top-left-origin
+    /// pixels, confidence and, when Vision reports it, head pose.
+    /// - Returns: `nil` when the face fails the confidence or size filter.
+    private static func baseFaceEntry(
+        for observation: VNFaceObservation, confidence: Double, imgW: CGFloat, imgH: CGFloat
+    ) -> [String: Any]? {
+        if confidence < minFaceConfidence { return nil }
+
+        let bb = observation.boundingBox
+        let faceW = Int(bb.size.width * imgW)
+        let faceH = Int(bb.size.height * imgH)
+        if faceW < minFaceSize || faceH < minFaceSize { return nil }
+
+        // The Y flip converts the bottom-left-origin box to top-left pixels.
+        let faceX = Int(bb.origin.x * imgW)
+        let faceY = Int((1.0 - bb.origin.y - bb.size.height) * imgH)
+
+        var faceData: [String: Any] = [
+            "bbox": ["x": max(0, faceX), "y": max(0, faceY),
+                     "width": faceW, "height": faceH],
+            "confidence": confidence,
+        ]
+        if let yaw = observation.yaw?.doubleValue,
+           let roll = observation.roll?.doubleValue {
+            var poseInfo: [String: Any] = ["roll": roll, "yaw": yaw]
+            if let pitch = observation.pitch?.doubleValue {
+                poseInfo["pitch"] = pitch
+            }
+            faceData["pose"] = poseInfo
+        }
+        return faceData
+    }
+
+    /// Converts image-space points to `[x, y]` pairs with a top-left origin.
+    private static func topLeftPoints(_ points: [CGPoint], imgH: CGFloat) -> [[Double]] {
+        points.map { [Double($0.x), Double(imgH - $0.y)] }
+    }
+
+    /// Intersection-over-union of two rectangles; 0 when they do not overlap.
+    private static func intersectionOverUnion(_ a: CGRect, _ b: CGRect) -> CGFloat {
+        let ix = max(a.origin.x, b.origin.x)
+        let iy = max(a.origin.y, b.origin.y)
+        let iw = min(a.maxX, b.maxX) - ix
+        let ih = min(a.maxY, b.maxY) - iy
+        if iw <= 0 || ih <= 0 { return 0 }
+        let intersection = iw * ih
+        return intersection / (a.width * a.height + b.width * b.height - intersection)
+    }
+
+    /// Builds the per-frame `persons` array: keypoints in top-left-origin
+    /// pixels plus a bbox spanning the sufficiently confident keypoints.
+    private static func personEntries(
+        poses: [VNHumanBodyPoseObservation], imgW: CGFloat, imgH: CGFloat
+    ) -> [[String: Any]] {
+        var persons: [[String: Any]] = []
+        for pose in poses {
+            var keypoints: [[String: Any]] = []
+            var minX = CGFloat.greatestFiniteMagnitude
+            var minY = CGFloat.greatestFiniteMagnitude
+            var maxX: CGFloat = 0
+            var maxY: CGFloat = 0
+
+            for (joint, name) in exportedJoints {
+                guard let point = try? pose.recognizedPoint(joint) else { continue }
+                let px = point.location.x * imgW
+                let py = imgH - point.location.y * imgH
+                keypoints.append(["name": name, "x": px, "y": py, "confidence": point.confidence])
+                if point.confidence > minKeypointConfidence {
+                    minX = min(minX, px)
+                    minY = min(minY, py)
+                    maxX = max(maxX, px)
+                    maxY = max(maxY, py)
+                }
+            }
+
+            var bbox: [String: Any] = ["x": 0, "y": 0, "width": 0, "height": 0]
+            if maxX > minX {
+                bbox = ["x": Int(minX), "y": Int(minY),
+                        "width": Int(maxX - minX), "height": Int(maxY - minY)]
+            }
+            persons.append(["keypoints": keypoints, "bbox": bbox])
+        }
+        return persons
+    }
+
+    /// Serializes `object` as JSON and writes it to `path`.
+    /// - Throws: Serialization or file-write errors.
+    private static func writeJSON(
+        _ object: [String: Any], to path: String, options: JSONSerialization.WritingOptions
+    ) throws {
+        let data = try JSONSerialization.data(withJSONObject: object, options: options)
+        try data.write(to: URL(fileURLWithPath: path))
+    }
+}