feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)

Phase 2.6.1: co_occurrence_edges migration
- build_co_occurrence_edges_from_qdrant()
- Qdrant embeddings → frame grouping → YOLO objects
- Result: 6679 edges (vs 6701 PostgreSQL)

Phase 2.6.2: face_face_edges migration
- build_face_face_edges_from_qdrant()
- Qdrant embeddings → frame grouping → face pairs
- mutual_gaze detection preserved
- Result: 6 edges (exact match)

Phase 2.6.3: speaker_face_edges migration
- build_speaker_face_edges_from_qdrant()
- Qdrant embeddings → trace_id frame ranges
- SPEAKS_AS edge creation

Architecture:
- All edges use Qdrant payload (no face_detections queries)
- Falls back to PostgreSQL when Qdrant returns no data
- Estimated 3.6x performance improvement

Testing:
- Playground (3003): ✓ All Phase 2.6 logs verified
- Edge counts: ✓ Close match with PostgreSQL
- Fallback: ✓ Working

Docs:
- docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md
- docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
Accusys
2026-06-21 04:47:49 +08:00
parent 0afc70fc5b
commit 2cfcfdd1af
2926 changed files with 8311058 additions and 1394 deletions

View File

@@ -0,0 +1,337 @@
import Foundation
import Vision
import ArgumentParser
import AVFoundation
/// Swift Face+Pose Processor - one pass, two outputs.
///
/// For every `sampleInterval`-th frame this tool runs
/// `VNDetectFaceRectanglesRequest`, `VNDetectFaceLandmarksRequest` and
/// `VNDetectHumanBodyPoseRequest` on the decoded pixel buffer, then writes one
/// JSON file with the face results and one with the body-pose results.
/// Frames are read sequentially with `AVAssetReader` (frame-based), matching
/// cv2 behavior. All emitted pixel coordinates use a top-left origin.
@main
struct SwiftFacePose: ParsableCommand {
    @Argument(help: "Video file path")
    var inputPath: String

    @Argument(help: "Output JSON path for face detection")
    var faceOutput: String

    @Argument(help: "Output JSON path for pose detection")
    var poseOutput: String

    @Option(name: .long, help: "Sample interval (frames, default=30)")
    var sampleInterval: Int = 30

    @Option(name: .long, help: "UUID for logging")
    var uuid: String = ""

    // MARK: - Tunables

    /// Faces whose confidence is below this value are dropped.
    private static let minFaceConfidence = 0.6
    /// Faces narrower or shorter than this many pixels are dropped.
    private static let minFaceSide = 20
    /// A face rectangle overlapping a landmark face above this IoU is treated as the same face.
    private static let landmarkMatchIoU: CGFloat = 0.3
    /// Keypoints must exceed this confidence to contribute to a person's bounding box.
    private static let minKeypointConfidence: Float = 0.1

    /// Body joints requested from every pose observation, in output order.
    private static let jointNames: [VNHumanBodyPoseObservation.JointName] = [
        .nose, .leftEye, .rightEye, .leftEar, .rightEar,
        .neck, .root,
        .leftShoulder, .rightShoulder,
        .leftElbow, .rightElbow,
        .leftWrist, .rightWrist,
        .leftHip, .rightHip,
        .leftKnee, .rightKnee,
        .leftAnkle, .rightAnkle,
    ]

    /// Maps a joint's underlying key string to the name written to JSON.
    /// A joint whose key is not listed here is emitted under its raw key.
    private static let jointLabels: [String: String] = [
        "head_joint": "nose",
        "left_eye_joint": "left_eye",
        "right_eye_joint": "right_eye",
        "left_ear_joint": "left_ear",
        "right_ear_joint": "right_ear",
        "neck_1_joint": "neck",
        "left_shoulder_1_joint": "left_shoulder",
        "right_shoulder_1_joint": "right_shoulder",
        "left_elbow_1_joint": "left_elbow",
        "right_elbow_1_joint": "right_elbow",
        "left_hand_joint": "left_wrist",
        "right_hand_joint": "right_wrist",
        "left_hip_1_joint": "left_hip",
        "right_hip_1_joint": "right_hip",
        "left_knee_1_joint": "left_knee",
        "right_knee_1_joint": "right_knee",
        "left_ankle_1_joint": "left_ankle",
        "right_ankle_1_joint": "right_ankle",
        "center_hip_joint": "root",
    ]

    // MARK: - Command

    /// Rejects a non-positive sample interval, which would otherwise trap on
    /// the modulo/division performed while sampling frames.
    func validate() throws {
        guard sampleInterval > 0 else {
            throw ValidationError("--sample-interval must be a positive integer (got \(sampleInterval))")
        }
    }

    /// Decodes the video, analyses every sampled frame and writes both JSON files.
    ///
    /// - Throws: Any error from creating the `AVAssetReader`, serializing the
    ///   results, or writing the output files.
    mutating func run() throws {
        let startTime = Date()
        print("[SwiftFacePose] Vision face+pose detection: \(inputPath)")

        let asset = AVAsset(url: URL(fileURLWithPath: inputPath))
        guard let videoTrack = asset.tracks(withMediaType: .video).first else {
            print("[SwiftFacePose] No video track found")
            return
        }

        let fps = videoTrack.nominalFrameRate
        let duration = CMTimeGetSeconds(asset.duration)
        // Int(_:) traps on NaN/infinity, which an unknown duration can produce.
        let estimatedFrames = duration * Double(fps)
        let totalFrames = estimatedFrames.isFinite ? max(0, Int(estimatedFrames)) : 0
        print("[SwiftFacePose] Video: \(Int(videoTrack.naturalSize.width))x\(Int(videoTrack.naturalSize.height)), \(String(format: "%.1f", fps))fps, \(totalFrames) frames, interval=\(sampleInterval)")

        // read sequentially, matching cv2 frame-by-frame behavior
        let reader = try AVAssetReader(asset: asset)
        let outputSettings: [String: Any] = [
            kCVPixelBufferPixelFormatTypeKey as String: kCVPixelFormatType_32BGRA
        ]
        let trackOutput = AVAssetReaderTrackOutput(track: videoTrack, outputSettings: outputSettings)
        trackOutput.alwaysCopiesSampleData = false
        guard reader.canAdd(trackOutput) else {
            print("[SwiftFacePose] Failed to start AVAssetReader: cannot add video track output")
            return
        }
        reader.add(trackOutput)
        guard reader.startReading() else {
            print("[SwiftFacePose] Failed to start AVAssetReader: \(reader.error?.localizedDescription ?? "unknown")")
            return
        }

        let interval = sampleInterval
        // Never zero, so the progress percentage below cannot divide by zero.
        let totalSamples = max(1, totalFrames / interval)

        var faceFrames: [[String: Any]] = []
        var poseFrames: [[String: Any]] = []
        var processedCount = 0
        var failedCount = 0
        var frameIndex = 0

        while let sampleBuffer = trackOutput.copyNextSampleBuffer() {
            defer { frameIndex += 1 }
            guard frameIndex % interval == 0 else { continue }

            // Drain temporaries created by Vision once per sampled frame.
            autoreleasepool {
                guard let pixelBuffer = CMSampleBufferGetImageBuffer(sampleBuffer) else { return }
                let imageSize = CGSize(
                    width: CGFloat(CVPixelBufferGetWidth(pixelBuffer)),
                    height: CGFloat(CVPixelBufferGetHeight(pixelBuffer))
                )
                let seconds = Self.timestamp(forFrame: frameIndex, fps: fps, sampleBuffer: sampleBuffer)

                let handler = VNImageRequestHandler(cvPixelBuffer: pixelBuffer, options: [:])
                let faceRequest = VNDetectFaceRectanglesRequest()
                let landmarkRequest = VNDetectFaceLandmarksRequest()
                let bodyRequest = VNDetectHumanBodyPoseRequest()
                do {
                    try handler.perform([faceRequest, landmarkRequest, bodyRequest])
                } catch {
                    // Best effort: skip this frame, but remember that it failed.
                    failedCount += 1
                    return
                }

                // Face output
                let faces = Self.faceEntries(
                    rectangles: faceRequest.results ?? [],
                    landmarks: landmarkRequest.results ?? [],
                    imageSize: imageSize
                )
                if !faces.isEmpty {
                    faceFrames.append([
                        "frame": frameIndex,
                        "timestamp": seconds,
                        "faces": faces,
                    ])
                }

                // Pose output
                let persons = (bodyRequest.results ?? []).map {
                    Self.personEntry(for: $0, imageSize: imageSize)
                }
                if !persons.isEmpty {
                    poseFrames.append([
                        "frame": frameIndex,
                        "timestamp": seconds,
                        "persons": persons,
                    ])
                }
            }

            // Count every sampled frame, whether or not anything was detected,
            // so the reported percentage tracks real progress through the video.
            processedCount += 1
            if processedCount % 100 == 0 {
                let elapsed = Date().timeIntervalSince(startTime)
                let pct = min(100, Int(Double(processedCount) / Double(totalSamples) * 100))
                print("[SwiftFacePose] \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(pct)% complete, \(Int(elapsed))s elapsed")
                fflush(stdout)
            }
        }

        if reader.status == .failed {
            print("[SwiftFacePose] Warning: reading stopped early: \(reader.error?.localizedDescription ?? "unknown")")
        }
        reader.cancelReading()

        // The face file is written compact and the pose file pretty-printed,
        // as before, so the on-disk format is unchanged.
        try Self.writeJSON(
            ["frame_count": faceFrames.count, "fps": Double(fps), "frames": faceFrames],
            to: faceOutput,
            options: []
        )
        try Self.writeJSON(
            ["frame_count": poseFrames.count, "fps": Double(fps), "frames": poseFrames],
            to: poseOutput,
            options: [.prettyPrinted]
        )

        if failedCount > 0 {
            print("[SwiftFacePose] Warning: Vision failed on \(failedCount) sampled frames")
        }
        let elapsed = Date().timeIntervalSince(startTime)
        print("[SwiftFacePose] Done: \(faceFrames.count) face frames, \(poseFrames.count) pose frames, \(String(format: "%.1f", elapsed))s")
    }

    // MARK: - Helpers

    /// Returns the frame-based timestamp `index / fps`; when the track reports
    /// no frame rate, falls back to the buffer's presentation time (or 0) so
    /// the value is always finite and therefore JSON-serializable.
    private static func timestamp(forFrame index: Int, fps: Float, sampleBuffer: CMSampleBuffer) -> Double {
        if fps > 0 {
            return Double(index) / Double(fps)
        }
        let presentation = CMTimeGetSeconds(CMSampleBufferGetPresentationTimeStamp(sampleBuffer))
        return presentation.isFinite ? presentation : 0
    }

    /// Serializes `object` and writes it to `path`.
    /// Validates first because `JSONSerialization` raises an uncatchable
    /// exception (rather than throwing) for objects it cannot encode.
    private static func writeJSON(
        _ object: [String: Any],
        to path: String,
        options: JSONSerialization.WritingOptions
    ) throws {
        guard JSONSerialization.isValidJSONObject(object) else {
            print("[SwiftFacePose] Cannot serialize results for \(path): not a valid JSON object")
            throw ExitCode.failure
        }
        let data = try JSONSerialization.data(withJSONObject: object, options: options)
        try data.write(to: URL(fileURLWithPath: path))
    }

    /// Builds the per-frame face list: every accepted landmark observation
    /// (with landmarks and lips), followed by accepted rectangle observations
    /// that do not overlap any landmark observation.
    private static func faceEntries(
        rectangles: [VNFaceObservation],
        landmarks: [VNFaceObservation],
        imageSize: CGSize
    ) -> [[String: Any]] {
        var faces: [[String: Any]] = []

        for observation in landmarks {
            guard var entry = baseFaceEntry(
                for: observation,
                confidence: Double(observation.confidence),
                imageSize: imageSize
            ) else { continue }

            if let regions = observation.landmarks {
                let leftEye = regions.leftEye?.pointsInImage(imageSize: imageSize) ?? []
                let rightEye = regions.rightEye?.pointsInImage(imageSize: imageSize) ?? []
                let nose = regions.nose?.pointsInImage(imageSize: imageSize) ?? []

                // Only non-empty regions get a key under "landmarks".
                var eyesAndNose: [String: [[Double]]] = [:]
                if !leftEye.isEmpty {
                    eyesAndNose["left_eye"] = topLeftPoints(leftEye, imageHeight: imageSize.height)
                }
                if !rightEye.isEmpty {
                    eyesAndNose["right_eye"] = topLeftPoints(rightEye, imageHeight: imageSize.height)
                }
                if !nose.isEmpty {
                    eyesAndNose["nose"] = topLeftPoints(nose, imageHeight: imageSize.height)
                }
                if !eyesAndNose.isEmpty {
                    entry["landmarks"] = eyesAndNose
                }

                // "lips" always carries both keys once either contour is present.
                let outer = regions.outerLips?.pointsInImage(imageSize: imageSize) ?? []
                let inner = regions.innerLips?.pointsInImage(imageSize: imageSize) ?? []
                if !outer.isEmpty || !inner.isEmpty {
                    entry["lips"] = [
                        "outer_lips": topLeftPoints(outer, imageHeight: imageSize.height),
                        "inner_lips": topLeftPoints(inner, imageHeight: imageSize.height),
                    ]
                }
            }
            faces.append(entry)
        }

        for observation in rectangles {
            // Compared against all landmark observations, including ones that
            // were filtered out above, so a rejected face is not re-added here.
            let duplicatesLandmarkFace = landmarks.contains {
                intersectionOverUnion(observation.boundingBox, $0.boundingBox) > landmarkMatchIoU
            }
            if duplicatesLandmarkFace { continue }

            let confidence = Double(observation.faceCaptureQuality ?? observation.confidence)
            guard let entry = baseFaceEntry(
                for: observation,
                confidence: confidence,
                imageSize: imageSize
            ) else { continue }
            faces.append(entry)
        }

        return faces
    }

    /// Builds the bbox/confidence/pose part of a face entry, or returns `nil`
    /// when the face is below the confidence or pixel-size thresholds.
    private static func baseFaceEntry(
        for observation: VNFaceObservation,
        confidence: Double,
        imageSize: CGSize
    ) -> [String: Any]? {
        if confidence < minFaceConfidence { return nil }

        let box = observation.boundingBox
        let width = Int(box.size.width * imageSize.width)
        let height = Int(box.size.height * imageSize.height)
        if width < minFaceSide || height < minFaceSide { return nil }

        let x = Int(box.origin.x * imageSize.width)
        // Flip the normalized box vertically to get a top-left origin.
        let y = Int((1.0 - box.origin.y - box.size.height) * imageSize.height)

        var entry: [String: Any] = [
            "bbox": ["x": max(0, x), "y": max(0, y),
                     "width": width, "height": height],
            "confidence": confidence,
        ]
        if let yaw = observation.yaw?.doubleValue,
           let roll = observation.roll?.doubleValue {
            var pose: [String: Any] = ["roll": roll, "yaw": yaw]
            if let pitch = observation.pitch?.doubleValue {
                pose["pitch"] = pitch
            }
            entry["pose"] = pose
        }
        return entry
    }

    /// Converts image-space points to `[x, y]` pairs with `y` flipped to a top-left origin.
    private static func topLeftPoints(_ points: [CGPoint], imageHeight: CGFloat) -> [[Double]] {
        points.map { [Double($0.x), Double(imageHeight - $0.y)] }
    }

    /// Intersection-over-union of two rectangles; 0 when they do not overlap.
    private static func intersectionOverUnion(_ a: CGRect, _ b: CGRect) -> CGFloat {
        let overlapWidth = min(a.maxX, b.maxX) - max(a.origin.x, b.origin.x)
        let overlapHeight = min(a.maxY, b.maxY) - max(a.origin.y, b.origin.y)
        if overlapWidth <= 0 || overlapHeight <= 0 { return 0 }
        let intersection = overlapWidth * overlapHeight
        let union = a.width * a.height + b.width * b.height - intersection
        return intersection / union
    }

    /// Builds one person entry: all retrievable keypoints in pixel coordinates
    /// plus a bounding box spanning the keypoints above the confidence threshold.
    private static func personEntry(
        for pose: VNHumanBodyPoseObservation,
        imageSize: CGSize
    ) -> [String: Any] {
        var keypoints: [[String: Any]] = []
        var minX = CGFloat.greatestFiniteMagnitude
        var minY = CGFloat.greatestFiniteMagnitude
        var maxX: CGFloat = 0
        var maxY: CGFloat = 0

        for joint in jointNames {
            guard let point = try? pose.recognizedPoint(joint) else { continue }

            // JointName wraps a VNRecognizedPointKey, which wraps the key string.
            let key = joint.rawValue.rawValue
            let label = jointLabels[key] ?? key
            let px = point.location.x * imageSize.width
            let py = imageSize.height - point.location.y * imageSize.height
            keypoints.append([
                "name": label.isEmpty ? "\(joint)" : label,
                "x": px,
                "y": py,
                "confidence": point.confidence,
            ])
            if point.confidence > minKeypointConfidence {
                minX = min(minX, px)
                minY = min(minY, py)
                maxX = max(maxX, px)
                maxY = max(maxY, py)
            }
        }

        // Stays all-zero when no keypoint passed the threshold.
        var bbox: [String: Any] = ["x": 0, "y": 0, "width": 0, "height": 0]
        if maxX > minX {
            bbox = [
                "x": Int(minX),
                "y": Int(minY),
                "width": Int(maxX - minX),
                "height": Int(maxY - minY),
            ]
        }
        return ["keypoints": keypoints, "bbox": bbox]
    }
}