#!/usr/bin/env python3
"""Head-to-shoulder ratio age-estimation experiment.

Uses Apple Vision ``VNDetectHumanBodyPoseRequest`` to extract the shoulder
width, then computes the head-to-shoulder ratio from the face width that was
already detected and stored in the ``dev.face_detections`` table.

Pipeline (run via ``main``):
  1. pick the 12 traces with the most face detections for ``FILE_UUID``;
  2. extract the middle frame of every trace from ``VIDEO`` with ffmpeg;
  3. load the face bbox closest to that middle frame;
  4. compile and run a small Swift helper that prints body-pose joints;
  5. write ``head_shoulder_report.json`` into ``OUT_DIR``.
"""
import json
import os
import subprocess
import sys
import tempfile
from contextlib import closing
from pathlib import Path
from typing import NamedTuple
from typing import Optional

VIDEO = "/Users/accusys/test_video/Old_Time_Movie_Show_-_Charade_1963.HD.mov"
DB_URL = "postgresql://accusys@localhost:5432/momentry"
FILE_UUID = "1a04db97be5fa12bd77369831dc141fd"
OUT_DIR = Path("/Users/accusys/momentry/output_dev/experiments/head_shoulder")

# Frame rate used to turn frame numbers into seconds for ffmpeg seeking.
# NOTE(review): the value 24.0 was hard-coded inline; confirm it matches VIDEO.
FPS = 24.0

# Extracted JPEGs at or below this size are treated as failed extractions.
MIN_FRAME_BYTES = 1000

# Swift helper: prints a JSON list with one {joint: x, joint_y: y} dict per
# detected body, or "{}" when nothing could be detected.
SWIFT_SOURCE = '''
import Foundation
import Vision
import AppKit

let args = CommandLine.arguments
guard args.count >= 2 else { exit(1) }
let imagePath = args[1]

guard let image = NSImage(contentsOfFile: imagePath),
      let tiff = image.tiffRepresentation,
      let bitmap = NSBitmapImageRep(data: tiff),
      let cgImage = bitmap.cgImage else {
    print("{}")
    exit(0)
}

let request = VNDetectHumanBodyPoseRequest()
let handler = VNImageRequestHandler(cgImage: cgImage)

do {
    try handler.perform([request])
    guard let results = request.results, !results.isEmpty else {
        print("{}")
        exit(0)
    }

    var output: [[String: Double]] = []
    for obs in results {
        var joints: [String: Double] = [:]
        do {
            let pts = try obs.recognizedPoints(.all)
            // Vision (0,0) = bottom-left, (1,1) = top-right, normalized.
            // Keep the coordinates normalized, but flip Y to a top-left origin.
            for (name, pt) in pts {
                if pt.confidence > 0.3 {
                    let x = pt.location.x
                    let y = 1.0 - pt.location.y  // flip Y in normalized space
                    // 4 decimals: 2 decimals would be 1% of the frame size.
                    joints[String(describing: name)] = round(x * 10000) / 10000
                    joints[String(describing: name) + "_y"] = round(y * 10000) / 10000
                }
            }
        } catch {}
        if !joints.isEmpty { output.append(joints) }
    }

    let jsonData = try JSONSerialization.data(withJSONObject: output, options: [])
    print(String(data: jsonData, encoding: .utf8)!)
} catch {
    print("{}")
}
'''


class FrameSample(NamedTuple):
    """One extracted frame for a face trace."""

    trace_id: int
    face_count: int
    first_frame: int
    mid_frame: int  # carried along so later steps never recompute it
    confidence: object
    frame_path: str


def parse_pose_output(stdout: str) -> list:
    """Parse the Swift helper's stdout into a list of joint dicts.

    The helper prints a JSON list on success and ``{}`` on failure; anything
    that is not a JSON list (including unparsable text) yields ``[]``.
    """
    try:
        parsed = json.loads(stdout.strip())
    except json.JSONDecodeError:
        return []
    if not isinstance(parsed, list):
        return []
    return [body for body in parsed if isinstance(body, dict)]


def find_joint(joints: dict, name: str) -> Optional[tuple]:
    """Return ``(x, y)`` for the joint called *name*, or ``None`` if absent.

    An exact key match is preferred. Otherwise the first key that contains
    *name* (ignoring case and underscores) and is not a ``*_y`` entry is used.
    ``y`` is ``None`` when the matching ``*_y`` entry is missing.

    NOTE(review): the Swift helper builds keys with ``String(describing:)``,
    which presumably does not produce the bare ``left_shoulder`` spelling;
    confirm the real key format on a macOS run. The tolerant match keeps the
    lookup working for either spelling.
    """
    if name in joints:
        x_key = name
    else:
        wanted = name.lower().replace("_", "")
        x_key = next(
            (
                key
                for key in joints
                if not key.endswith("_y")
                and wanted in key.lower().replace("_", "")
            ),
            None,
        )
        if x_key is None:
            return None
    return joints[x_key], joints.get(x_key + "_y")


def shoulder_width(joints: dict) -> float:
    """Return the horizontal shoulder distance, or ``-1`` if a shoulder is missing.

    The result is in the same units as the joint x values (normalized
    coordinates as emitted by the Swift helper).
    """
    left = find_joint(joints, "left_shoulder")
    right = find_joint(joints, "right_shoulder")
    if left is None or right is None:
        return -1
    return abs(left[0] - right[0])


def head_shoulder_ratio(face_w, shoulder_w) -> float:
    """Return ``face_w / shoulder_w``, or ``0`` when no shoulder width is known."""
    return face_w / shoulder_w if shoulder_w > 0 else 0


def fetch_trace_samples() -> list:
    """Return up to 12 ``(trace_id, fc, first_frame, last_frame, avg_conf)`` rows.

    Only traces of ``FILE_UUID`` with at least 5 detections are considered,
    ordered by detection count (same 12 traces as the DeepFace benchmark).
    """
    import psycopg2  # deferred: only needed when the script actually runs

    query = """
        WITH ranked AS (
            SELECT trace_id, COUNT(*) AS fc,
                   MIN(frame_number) AS first_frame,
                   MAX(frame_number) AS last_frame,
                   AVG(confidence) AS avg_conf
            FROM dev.face_detections
            WHERE file_uuid = %s AND trace_id IS NOT NULL
            GROUP BY trace_id
            HAVING COUNT(*) >= 5
        )
        SELECT trace_id, fc, first_frame, last_frame, ROUND(avg_conf::numeric,3)
        FROM ranked
        ORDER BY fc DESC
        LIMIT 12
    """
    # closing() guarantees the handles are released even if the query fails.
    with closing(psycopg2.connect(DB_URL)) as conn, closing(conn.cursor()) as cur:
        cur.execute(query, (FILE_UUID,))
        return cur.fetchall()


def extract_mid_frames(samples: list) -> list:
    """Extract the middle frame of every trace with ffmpeg.

    Traces whose frame could not be written (missing or tiny file) are skipped.
    """
    frames = []
    for trace_id, fc, first, last, conf in samples:
        mid_frame = (first + last) // 2
        mid_sec = mid_frame / FPS
        frame_file = OUT_DIR / f"trace_{trace_id}_frame_{mid_frame}.jpg"
        subprocess.run(
            [
                "ffmpeg", "-y", "-ss", str(mid_sec), "-i", VIDEO,
                "-frames:v", "1", "-q:v", "2", str(frame_file),
            ],
            capture_output=True,
        )
        # ffmpeg may fail without creating the file; stat() would then raise.
        if not frame_file.is_file():
            continue
        if frame_file.stat().st_size > MIN_FRAME_BYTES:
            frames.append(
                FrameSample(trace_id, fc, first, mid_frame, conf, str(frame_file))
            )
            print(f"  trace_{trace_id}: frame {mid_frame} ({mid_sec:.0f}s)")
    return frames


def fetch_face_boxes(frames: list) -> dict:
    """Return ``{trace_id: bbox}`` with the detection nearest each middle frame."""
    import psycopg2  # deferred: only needed when the script actually runs

    face_boxes = {}
    with closing(psycopg2.connect(DB_URL)) as conn, closing(conn.cursor()) as cur:
        for sample in frames:
            # Use the mid frame stored with the sample; recomputing it here
            # from a leftover loop variable would target the wrong frame.
            cur.execute(
                """
                SELECT x, y, width, height, frame_number
                FROM dev.face_detections
                WHERE file_uuid = %s AND trace_id = %s
                ORDER BY ABS(frame_number - %s)
                LIMIT 1
                """,
                (FILE_UUID, sample.trace_id, sample.mid_frame),
            )
            row = cur.fetchone()
            if row:
                face_boxes[sample.trace_id] = {
                    "x": row[0],
                    "y": row[1],
                    "w": row[2],
                    "h": row[3],
                    "frame": row[4],
                }
    return face_boxes


def build_pose_detector() -> Path:
    """Write the Swift helper to ``OUT_DIR``, compile it, return the binary path.

    Raises ``subprocess.CalledProcessError`` when ``swiftc`` fails.
    """
    swift_file = OUT_DIR / "detect_body.swift"
    binary = OUT_DIR / "detect_body"
    swift_file.write_text(SWIFT_SOURCE)
    subprocess.run(["swiftc", "-o", str(binary), str(swift_file)], check=True)
    return binary


def detect_joints(detector: Path, frame_path: str) -> list:
    """Run the compiled helper on one frame and return its joint dicts."""
    completed = subprocess.run(
        [str(detector), frame_path], capture_output=True, text=True
    )
    return parse_pose_output(completed.stdout)


def main() -> None:
    """Run the full head-to-shoulder benchmark and write the JSON report."""
    # Kept from the original script, where it was imported but never used.
    from PIL import Image  # noqa: F401

    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # 1. Get trace samples (same 12 traces from DeepFace benchmark)
    samples = fetch_trace_samples()
    print(f"Selected {len(samples)} traces for head-shoulder ratio benchmark\n")

    # 2. Extract the middle frame for each trace
    frames = extract_mid_frames(samples)

    # 3. Get face bbox from face_detections DB
    face_boxes = fetch_face_boxes(frames)
    print(f"\nFace bboxes loaded: {len(face_boxes)} traces\n")

    # 4. Run Apple Vision body pose detection on each frame via a minimal
    #    Swift program that processes a single image.
    detector = build_pose_detector()

    print("=" * 60)
    print("Head-to-Shoulder Ratio Benchmark")
    print("=" * 60)
    print()

    results = []
    for sample in frames:
        trace_id = sample.trace_id
        joints_list = detect_joints(detector, sample.frame_path)
        face_w = face_boxes.get(trace_id, {"w": 0})["w"]

        if not joints_list:
            print(f"  trace_{trace_id:5d} | face={face_w:4d}px | no body detected")
            continue

        # NOTE(review): the first detected body is used; nothing here checks
        # that it belongs to the same person as the face trace.
        joints = joints_list[0]
        shoulder_w = shoulder_width(joints)  # normalized coords
        ratio = head_shoulder_ratio(face_w, shoulder_w)

        results.append({
            "trace_id": trace_id,
            "faces": sample.face_count,
            "first_sec": round(sample.first_frame / FPS, 1),
            "face_w_px": face_w,
            "shoulder_w_unit": round(shoulder_w, 3),
            "ratio": round(ratio, 2),
            "joints": joints,
        })

        status = "OK" if ratio > 0 else "no shoulder"
        print(f"  trace_{trace_id:5d} | face={face_w:4d}px | shoulder={shoulder_w:.3f} | ratio={ratio:.2f} | {status}")

    # 5. Save results
    report = {
        "method": "Apple Vision Head-to-Shoulder Ratio",
        "video": "Charade (1963)",
        "samples": len(frames),
        "results": results,
        "notes": "Ratio = face_width_px / shoulder_width_normalized. Higher ratio = proportionally larger head (younger).",
    }
    # Explicit encoding: ensure_ascii=False may emit non-ASCII characters.
    with open(OUT_DIR / "head_shoulder_report.json", "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)

    print(f"\nReport saved: {OUT_DIR}/head_shoulder_report.json")
    print("\nNote: Apple Vision body pose returns normalized coordinates.")
    print("Shoulder width is in Vision normalized [0,1] space.")
    print("For meaningful ratio, face_bbox needs to be in same coordinate space.")
    print("Consider using Vision face detection + body pose simultaneously on the same frame.")


if __name__ == "__main__":
    main()