#!/usr/bin/env python3
"""
Scene classification processor.

Classifies the scene shown in a video. A Core ML model is used when one is
supplied and loadable; otherwise the script falls back to a PyTorch ResNet18,
loaded with Places365 weights when the checkpoint file exists and with
torchvision's ImageNet-pretrained weights when it does not.

Backends, in order of preference:
- Core ML model (native)
- PyTorch on MPS when available, otherwise CPU (fallback)
"""

import argparse
import json
import sys
import time
import traceback
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional

# Optional dependency: Core ML.
try:
    import coremltools as ct

    HAS_COREML = True
except ImportError:
    HAS_COREML = False

# Optional dependency: PyTorch + torchvision (fallback backend).
try:
    import torch
    from torchvision import transforms, models

    HAS_TORCH = True
    DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
except ImportError:
    HAS_TORCH = False
    # `torch` may not be bound here, so no torch.device can be built.
    # DEVICE is only read on code paths guarded by HAS_TORCH.
    DEVICE = None

# Optional dependency: Pillow, used for image conversion.
try:
    from PIL import Image

    HAS_PIL = True
except ImportError:
    HAS_PIL = False

# Optional dependency: OpenCV, used for video decoding.
try:
    import cv2

    HAS_CV = True
except ImportError:
    HAS_CV = False

# Default location of the Places365 ResNet18 checkpoint.
DEFAULT_PLACES365_MODEL_PATH = (
    "/Users/accusys/momentry/models/resnet18_places365.pth.tar"
)

# Places365 class table, looked up by str(class_index) in the PyTorch path.
# Loaded from a JSON file next to this script; stays empty when the file is
# missing or unreadable.
PLACES365_CATEGORIES: Dict[str, str] = {}
try:
    categories_path = Path(__file__).parent / "places365_categories.json"
    if categories_path.exists():
        with open(categories_path, "r", encoding="utf-8") as f:
            PLACES365_CATEGORIES = json.load(f)
        print(f"[SCENE] Loaded {len(PLACES365_CATEGORIES)} Places365 categories")
except Exception as e:
    # Best effort: without the table, predictions are labelled "scene_<idx>".
    print(f"[SCENE] Warning: Could not load Places365 categories: {e}")

# Scene type -> Traditional Chinese display name.
SCENE_TYPE_ZH = {
    "hospital_room": "醫院病房",
    "pharmacy": "藥房",
    "classroom": "教室",
    "office": "辦公室",
    "kitchen": "廚房",
    "living_room": "客廳",
    "bedroom": "臥室",
    "bathroom": "浴室",
    "restaurant": "餐廳",
    "gym": "健身房",
    "supermarket": "超市",
    "basketball_court": "籃球場",
    "football_field": "足球場",
    "tennis_court": "網球場",
    "swimming_pool": "游泳池",
    "park": "公園",
    "street": "街道",
    "beach": "海灘",
    "mountain": "山地",
    "forest": "森林",
    "airport": "機場",
    "train_station": "火車站",
    "subway_station": "地鐵站",
    "gas_station": "加油站",
    "parking_lot": "停車場",
    "auditorium": "禮堂",
    "library": "圖書館",
    "laboratory": "實驗室",
    "art_studio": "藝術工作室",
    "music_store": "音樂商店",
    "computer_room": "電腦室",
    "conference_room": "會議室",
    "playground": "遊樂場",
    "ski_slope": "滑雪坡",
    "ice_rink": "溜冰場",
    "boxing_ring": "拳擊場",
    "volleyball_court": "排球場",
    "baseball_field": "棒球場",
}

# Scene categories (a Places365 subset). Same names, in the same order, as
# the keys of SCENE_TYPE_ZH, so it is derived rather than repeated.
SCENE_CATEGORIES = list(SCENE_TYPE_ZH)


class SceneClassifier:
    """Scene classifier backed by Core ML or PyTorch."""

    def __init__(
        self,
        model_path: Optional[str] = None,
        places365_model_path: Optional[str] = None,
    ):
        """
        Initialise the classifier; no model is loaded until load_model().

        Args:
            model_path: Path to a Core ML model (optional).
            places365_model_path: Path to the Places365 ResNet18 checkpoint
                used by the PyTorch fallback. Defaults to
                DEFAULT_PLACES365_MODEL_PATH.
        """
        self.model_path = model_path
        self.places365_model_path = (
            places365_model_path or DEFAULT_PLACES365_MODEL_PATH
        )
        self.model = None
        self.coreml_model = None
        self.transform = None
        self.model_type = "unknown"

        # Preprocessing for the PyTorch path. It can only be built when
        # torchvision imported; without it the PyTorch path is never taken.
        if HAS_TORCH:
            self.transform = transforms.Compose(
                [
                    transforms.Resize((224, 224)),
                    transforms.ToTensor(),
                    transforms.Normalize(
                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                    ),
                ]
            )

    def load_model(self) -> bool:
        """
        Load a model, preferring Core ML over PyTorch.

        Returns:
            bool: True when a model was loaded, False otherwise.
        """
        # Preferred backend: Core ML.
        if HAS_COREML and self.model_path and Path(self.model_path).exists():
            try:
                print(f"[SCENE] Loading Core ML model: {self.model_path}")
                self.coreml_model = ct.models.MLModel(self.model_path)
                self.model_type = "coreml"
                print("[SCENE] Core ML model loaded successfully")
                return True
            except Exception as e:
                print(f"[SCENE] Warning: Failed to load Core ML model: {e}")

        # Fallback backend: PyTorch.
        if HAS_TORCH:
            try:
                print(f"[SCENE] Loading PyTorch model on {DEVICE}")

                if Path(self.places365_model_path).exists():
                    print(
                        f"[SCENE] Loading Places365 model: {self.places365_model_path}"
                    )
                    # NOTE(review): torch.load unpickles the file; only point
                    # this at checkpoints from a trusted source.
                    checkpoint = torch.load(
                        self.places365_model_path, map_location=DEVICE
                    )

                    # ResNet18 with a 365-way head for Places365.
                    model = models.resnet18(num_classes=365)

                    # Accept both a wrapped checkpoint and a bare state dict.
                    if "state_dict" in checkpoint:
                        state_dict = checkpoint["state_dict"]
                    else:
                        state_dict = checkpoint

                    # Strip the "module." prefix left by DataParallel training.
                    prefix = "module."
                    cleaned_state = {
                        (key[len(prefix):] if key.startswith(prefix) else key): value
                        for key, value in state_dict.items()
                    }
                    model.load_state_dict(cleaned_state)
                    model_type = "places365"
                    print("[SCENE] Places365 model loaded successfully (365 classes)")
                else:
                    print("[SCENE] Places365 model not found, using ImageNet pretrained")
                    model = models.resnet18(pretrained=True)
                    model_type = "imagenet"

                model.to(DEVICE)
                model.eval()

                # Publish only after every step succeeded, so a failed load
                # never leaves a half-initialised model on the instance.
                self.model = model
                self.model_type = model_type
                print("[SCENE] PyTorch model loaded successfully")
                return True
            except Exception as e:
                print(f"[SCENE] Warning: Failed to load PyTorch model: {e}")
                traceback.print_exc()

        print("[SCENE] Error: No model available")
        return False

    def _to_pil_image(self, frame: Any) -> Any:
        """Convert a path, a 3-dimensional ndarray, or a PIL image to RGB PIL.

        Returns None (after printing a warning) for unsupported inputs.
        Errors from opening an image path propagate to the caller.
        """
        if isinstance(frame, (str, Path)):
            return Image.open(frame).convert("RGB")
        if HAS_CV and hasattr(frame, "shape") and len(frame.shape) == 3:
            # Treated as an OpenCV frame, i.e. BGR channel order.
            return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        if hasattr(frame, "convert"):
            # PIL image.
            return frame.convert("RGB")
        print(f"[SCENE] Warning: Unknown frame type: {type(frame)}")
        return None

    def _predict_coreml(self, img: Any) -> List[Dict[str, Any]]:
        """Run the Core ML model and return its five highest-scoring labels."""
        try:
            # The model is fed under the "image" key and read from "probs".
            output = self.coreml_model.predict({"image": img})
            probs = output.get("probs", {})
            top_5 = sorted(probs.items(), key=lambda item: item[1], reverse=True)[:5]
            return [
                {"scene_type": label, "confidence": float(conf)}
                for label, conf in top_5
            ]
        except Exception as e:
            print(f"[SCENE] Core ML prediction error: {e}")
            return []

    def _predict_torch(self, img: Any) -> List[Dict[str, Any]]:
        """Run the PyTorch model and return its five highest-scoring classes."""
        try:
            with torch.no_grad():
                input_tensor = self.transform(img).unsqueeze(0).to(DEVICE)
                outputs = self.model(input_tensor)
                probs = torch.nn.functional.softmax(outputs, dim=1)

                top_k = min(5, probs.shape[1])
                top_probs, top_indices = torch.topk(probs, top_k)

            # Indices from the ImageNet fallback do not refer to Places365
            # classes, so the Places365 table is skipped for that model.
            use_places = getattr(self, "model_type", "unknown") != "imagenet"

            results = []
            for prob, idx in zip(top_probs[0].tolist(), top_indices[0].tolist()):
                fallback = f"scene_{idx}"
                if use_places:
                    scene_type = PLACES365_CATEGORIES.get(str(idx), fallback)
                else:
                    scene_type = fallback
                results.append({"scene_type": scene_type, "confidence": prob})
            return results
        except Exception as e:
            print(f"[SCENE] PyTorch prediction error: {e}")
            traceback.print_exc()
            return []

    def predict_frame(self, frame: Any) -> List[Dict[str, Any]]:
        """
        Predict the scene type of a single image.

        Args:
            frame: Image path, OpenCV ndarray, or PIL image.

        Returns:
            List[Dict]: Up to five {"scene_type", "confidence"} entries,
            best first. Empty when no model is loaded, the input type is
            unsupported, or inference fails.
        """
        if self.coreml_model is None and self.model is None:
            print("[SCENE] Warning: No model loaded")
            return []

        if not HAS_PIL:
            print("[SCENE] Warning: Failed to convert to PIL Image")
            return []

        img = self._to_pil_image(frame)
        if img is None:
            return []

        # Core ML takes precedence when both backends are loaded.
        if self.coreml_model is not None:
            return self._predict_coreml(img)
        return self._predict_torch(img)

    def classify_video(
        self,
        video_path: str,
        output_path: str,
        sample_interval: float = 2.0,
        min_scene_duration: float = 3.0,
    ) -> Dict[str, Any]:
        """
        Classify a whole video and write the result as JSON.

        Args:
            video_path: Path to the input video.
            output_path: Path of the JSON file to write.
            sample_interval: Sampling interval in seconds.
            min_scene_duration: Minimum scene duration in seconds; passed on
                to _merge_scenes and recorded in the metadata.

        Returns:
            Dict: The classification result. When OpenCV is missing or the
            video cannot be opened, a stub without "metadata" is returned and
            no file is written.
        """
        if not HAS_CV:
            print("[SCENE] Error: OpenCV not available")
            return {"frame_count": 0, "fps": 0.0, "scenes": []}

        cap = cv2.VideoCapture(video_path)
        if not cap.isOpened():
            cap.release()
            print(f"[SCENE] Error: Cannot open video: {video_path}")
            return {"frame_count": 0, "fps": 0.0, "scenes": []}

        predictions = []
        try:
            fps = cap.get(cv2.CAP_PROP_FPS)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            duration = total_frames / fps if fps > 0 else 0

            print(f"[SCENE] Video: {video_path}")
            print(f"[SCENE] FPS: {fps}, Frames: {total_frames}, Duration: {duration:.1f}s")

            # Number of decoded frames between two sampled frames.
            sample_interval_frames = max(1, int(fps * sample_interval))

            frame_count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_count += 1
                if frame_count % sample_interval_frames != 0:
                    continue

                # A reported FPS of 0 must not abort the run.
                timestamp = frame_count / fps if fps > 0 else 0.0
                pred = self.predict_frame(frame)
                if not pred:
                    continue
                predictions.append({"timestamp": timestamp, "predictions": pred})

                # Progress report every 10 collected samples.
                if len(predictions) % 10 == 0:
                    # The frame count can be reported as 0 as well.
                    if total_frames > 0:
                        progress = (frame_count / total_frames) * 100
                    else:
                        progress = 0.0
                    print(
                        f"[SCENE] Progress: {progress:.1f}% ({len(predictions)} samples)"
                    )
        finally:
            # Release the capture even when decoding or inference raises.
            cap.release()

        print(f"[SCENE] Collected {len(predictions)} predictions")

        scenes = self._merge_scenes(predictions, min_scene_duration, duration)

        result = {
            "frame_count": total_frames,
            "fps": fps,
            "scenes": scenes,
            "metadata": {
                "video_path": video_path,
                "duration": duration,
                "sample_interval": sample_interval,
                "min_scene_duration": min_scene_duration,
                "processed_at": datetime.now().isoformat(),
                "model_type": "coreml"
                if self.coreml_model
                else "pytorch"
                if self.model
                else "none",
            },
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        print(f"[SCENE] Result saved to: {output_path}")
        print(f"[SCENE] Detected {len(scenes)} scenes")

        return result

    def _merge_scenes(
        self, predictions: List[Dict], min_duration: float, total_duration: float
    ) -> List[Dict[str, Any]]:
        """
        Collapse the sampled predictions into a single summary scene.

        The scene type is the most frequent top-1 label across samples (the
        earliest-seen label wins a tie) and the confidence is the mean top-1
        confidence of the samples that have predictions.

        Args:
            predictions: Entries shaped {"timestamp", "predictions"}.
            min_duration: Currently unused.
            total_duration: Currently unused.

        Returns:
            List[Dict]: One scene spanning the first to the last sample, or
            an empty list when no sample carries a prediction.
        """
        scored = [p for p in predictions if p["predictions"]]
        if not scored:
            return []

        scene_counts = Counter(p["predictions"][0]["scene_type"] for p in scored)
        most_common_scene = scene_counts.most_common(1)[0][0]

        # Average over the samples that contributed to the sum.
        avg_confidence = sum(
            p["predictions"][0]["confidence"] for p in scored
        ) / len(scored)

        first_pred = predictions[0]
        last_pred = predictions[-1]

        return [
            {
                "start_time": first_pred["timestamp"],
                "end_time": last_pred["timestamp"],
                "scene_type": most_common_scene,
                "scene_type_zh": SCENE_TYPE_ZH.get(most_common_scene),
                "confidence": avg_confidence,
                "top_5": first_pred["predictions"][:5],
            }
        ]


def main():
    """Command-line entry point."""
    parser = argparse.ArgumentParser(
        description="場景識別處理器 - 使用 Core ML + Places365"
    )
    parser.add_argument("video_path", nargs="?", help="輸入影片路徑")
    parser.add_argument("output_path", nargs="?", help="輸出 JSON 路徑")
    # Parsed but not used anywhere below.
    parser.add_argument("--uuid", help="影片 UUID (用於日誌)", default=None)
    parser.add_argument("--model", help="Core ML 模型路徑", default=None)
    parser.add_argument(
        "--sample-interval", type=float, default=2.0, help="取樣間隔 (秒),預設 2.0"
    )
    parser.add_argument(
        "--min-scene-duration",
        type=float,
        default=3.0,
        help="最小場景持續時間 (秒),預設 3.0",
    )
    parser.add_argument("--check-health", action="store_true", help="檢查環境並退出")

    args = parser.parse_args()

    # Health check: report which optional dependencies imported, then exit.
    if args.check_health:
        print("=== 場景識別處理器健康檢查 ===")
        print(f"Core ML: {'✓ Available' if HAS_COREML else '✗ Not available'}")
        print(f"PyTorch: {'✓ Available' if HAS_TORCH else '✗ Not available'}")
        print(f"PIL: {'✓ Available' if HAS_PIL else '✗ Not available'}")
        print(f"OpenCV: {'✓ Available' if HAS_CV else '✗ Not available'}")
        if HAS_TORCH:
            print(f"Device: {DEVICE}")
        sys.exit(0)

    # Both positional arguments are required outside of --check-health.
    if not args.video_path or not args.output_path:
        parser.print_help()
        sys.exit(1)

    if not HAS_PIL or not HAS_CV:
        print("[SCENE] Error: Missing required dependencies (PIL/OpenCV)")
        sys.exit(1)

    classifier = SceneClassifier(model_path=args.model)

    if not classifier.load_model():
        print("[SCENE] Warning: No model loaded, will return empty results")
        # An empty result file is still written before exiting with status 0.
        result = {
            "frame_count": 0,
            "fps": 0.0,
            "scenes": [],
            "metadata": {
                "video_path": args.video_path,
                "error": "No model available",
                "processed_at": datetime.now().isoformat(),
            },
        }
        with open(args.output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        sys.exit(0)

    start_time = time.time()
    result = classifier.classify_video(
        video_path=args.video_path,
        output_path=args.output_path,
        sample_interval=args.sample_interval,
        min_scene_duration=args.min_scene_duration,
    )
    elapsed = time.time() - start_time

    print(f"[SCENE] Completed in {elapsed:.1f}s")

    # Per-scene summary.
    if result["scenes"]:
        print("\n[SCENE] 場景統計:")
        for scene in result["scenes"]:
            scene_name = scene.get("scene_type_zh") or scene.get("scene_type")
            duration = scene["end_time"] - scene["start_time"]
            conf = scene.get("confidence", 0) * 100
            print(
                f" - {scene_name}: {scene['start_time']:.1f}s - {scene['end_time']:.1f}s ({duration:.1f}s, {conf:.0f}%)"
            )


if __name__ == "__main__":
    main()