Files
momentry_core/scripts/scene_classifier.py
Warren e75c4d6f07 cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
2026-05-04 01:31:21 +08:00

684 lines
21 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
場景識別處理器 (Scene Classification Processor)
使用 Core ML + Places365 模型進行場景識別
支援 Apple Silicon M4 優化
- Core ML 模型 (原生)
- PyTorch + MPS (備案)
"""
import argparse
import json
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any
# Optional backend: Core ML (tried first by SceneClassifier.load_model).
try:
    import coremltools as ct

    HAS_COREML = True
except ImportError:
    HAS_COREML = False

# Optional backend: PyTorch + torchvision (fallback).
try:
    import torch
    from torchvision import transforms, models
except ImportError:
    HAS_TORCH = False
    # `torch` may be unbound at this point, so no torch.device can be built.
    # DEVICE is only read on code paths that already require HAS_TORCH.
    DEVICE = None
else:
    HAS_TORCH = True
    # Use the MPS device when this torch build exposes the backend and reports
    # it as available; otherwise run on the CPU. `getattr` keeps older torch
    # builds (without `torch.backends.mps`) from failing at import time.
    _mps_backend = getattr(torch.backends, "mps", None)
    DEVICE = torch.device(
        "mps" if _mps_backend is not None and _mps_backend.is_available() else "cpu"
    )

# Optional: Pillow, used for image handling.
try:
    from PIL import Image

    HAS_PIL = True
except ImportError:
    HAS_PIL = False

# Optional: OpenCV, used for video decoding.
try:
    import cv2

    HAS_CV = True
except ImportError:
    HAS_CV = False
# Places365 class-index -> category-name table, loaded from
# places365_categories.json located next to this script. It stays empty when
# the file is missing, unreadable, or not valid JSON.
PLACES365_CATEGORIES = {}
try:
    categories_path = Path(__file__).parent / "places365_categories.json"
    if categories_path.exists():
        with open(categories_path, "r", encoding="utf-8") as f:
            _loaded_categories = json.load(f)
        # Lookups elsewhere in this file use `.get(str(index), ...)`, so a
        # plain JSON list of names is normalised to that string-index keying.
        if isinstance(_loaded_categories, list):
            _loaded_categories = {
                str(index): name for index, name in enumerate(_loaded_categories)
            }
        if isinstance(_loaded_categories, dict):
            PLACES365_CATEGORIES = _loaded_categories
            print(f"[SCENE] Loaded {len(PLACES365_CATEGORIES)} Places365 categories")
        else:
            # Any other JSON shape would break the later `.get` lookups.
            print(
                "[SCENE] Warning: Could not load Places365 categories: "
                f"unexpected JSON type {type(_loaded_categories).__name__}"
            )
except (OSError, ValueError) as e:
    # OSError: file access problems; ValueError: malformed JSON / bad encoding.
    print(f"[SCENE] Warning: Could not load Places365 categories: {e}")
# Scene label (English identifier) -> Traditional Chinese display name.
SCENE_TYPE_ZH = {
    "hospital_room": "醫院病房",
    "pharmacy": "藥房",
    "classroom": "教室",
    "office": "辦公室",
    "kitchen": "廚房",
    "living_room": "客廳",
    "bedroom": "臥室",
    "bathroom": "浴室",
    "restaurant": "餐廳",
    "gym": "健身房",
    "supermarket": "超市",
    "basketball_court": "籃球場",
    "football_field": "足球場",
    "tennis_court": "網球場",
    "swimming_pool": "游泳池",
    "park": "公園",
    "street": "街道",
    "beach": "海灘",
    "mountain": "山地",
    "forest": "森林",
    "airport": "機場",
    "train_station": "火車站",
    "subway_station": "地鐵站",
    "gas_station": "加油站",
    "parking_lot": "停車場",
    "auditorium": "禮堂",
    "library": "圖書館",
    "laboratory": "實驗室",
    "art_studio": "藝術工作室",
    "music_store": "音樂商店",
    "computer_room": "電腦室",
    "conference_room": "會議室",
    "playground": "遊樂場",
    "ski_slope": "滑雪坡",
    "ice_rink": "溜冰場",
    "boxing_ring": "拳擊場",
    "volleyball_court": "排球場",
    "baseball_field": "棒球場",
}

# Scene categories (a subset of Places365). Derived from the translation table
# so the two collections cannot drift apart; dict iteration preserves the
# insertion order, so the list order matches the table above.
SCENE_CATEGORIES = list(SCENE_TYPE_ZH)
class SceneClassifier:
    """Scene classifier for single frames and whole videos.

    ``load_model`` tries two backends in order:

    1. A Core ML model, when ``coremltools`` is importable and ``model_path``
       points at an existing file.
    2. A PyTorch ResNet-18: built from the Places365 checkpoint when
       ``places365_model_path`` exists, otherwise torchvision's ImageNet
       pretrained ResNet-18.
    """

    # Default location of the Places365 ResNet-18 checkpoint.
    DEFAULT_PLACES365_MODEL_PATH = (
        "/Users/accusys/momentry/models/resnet18_places365.pth.tar"
    )
    # Number of ranked predictions returned for each frame.
    TOP_K = 5

    def __init__(
        self,
        model_path: Optional[str] = None,
        places365_model_path: Optional[str] = None,
    ):
        """Store configuration; no model is loaded until ``load_model``.

        Args:
            model_path: Path to a Core ML model (optional).
            places365_model_path: Path to the Places365 ResNet-18 checkpoint.
                Defaults to ``DEFAULT_PLACES365_MODEL_PATH``.
        """
        self.model_path = model_path
        self.places365_model_path = (
            places365_model_path or self.DEFAULT_PLACES365_MODEL_PATH
        )
        self.model = None
        self.coreml_model = None
        self.model_type = "unknown"

        # The preprocessing pipeline needs torchvision. Building it only when
        # the import succeeded keeps construction from raising NameError in a
        # Core-ML-only environment.
        self.transform = None
        if HAS_TORCH:
            self.transform = transforms.Compose(
                [
                    transforms.Resize((224, 224)),
                    transforms.ToTensor(),
                    transforms.Normalize(
                        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
                    ),
                ]
            )

    def load_model(self) -> bool:
        """Load the first available backend (Core ML, then PyTorch).

        Returns:
            bool: True when a model was loaded, False otherwise.
        """
        if self._load_coreml_model() or self._load_pytorch_model():
            return True
        print("[SCENE] Error: No model available")
        return False

    def _load_coreml_model(self) -> bool:
        """Try to load the Core ML model at ``self.model_path``."""
        if not (HAS_COREML and self.model_path and Path(self.model_path).exists()):
            return False
        try:
            print(f"[SCENE] Loading Core ML model: {self.model_path}")
            self.coreml_model = ct.models.MLModel(self.model_path)
        except Exception as e:
            # Best effort: the caller falls back to the PyTorch backend.
            print(f"[SCENE] Warning: Failed to load Core ML model: {e}")
            return False
        self.model_type = "coreml"
        print("[SCENE] Core ML model loaded successfully")
        return True

    def _load_pytorch_model(self) -> bool:
        """Try to load the ResNet-18 fallback (Places365, else ImageNet)."""
        if not HAS_TORCH:
            return False
        try:
            print(f"[SCENE] Loading PyTorch model on {DEVICE}")
            if Path(self.places365_model_path).exists():
                print(
                    f"[SCENE] Loading Places365 model: {self.places365_model_path}"
                )
                # NOTE(security): torch.load unpickles the file; only load
                # checkpoints that come from a trusted source.
                checkpoint = torch.load(
                    self.places365_model_path, map_location=DEVICE
                )
                # ResNet-18 with a 365-way head (Places365 has 365 classes).
                model = models.resnet18(num_classes=365)
                # Strip the 'module.' prefix (DataParallel training).
                prefix = "module."
                state_dict = {
                    (key[len(prefix):] if key.startswith(prefix) else key): value
                    for key, value in checkpoint["state_dict"].items()
                }
                model.load_state_dict(state_dict)
                model_type = "places365"
                print("[SCENE] Places365 model loaded successfully (365 classes)")
            else:
                print(
                    "[SCENE] Places365 model not found, using ImageNet pretrained"
                )
                model = models.resnet18(pretrained=True)
                model_type = "imagenet"
            model.to(DEVICE)
            model.eval()
        except Exception as e:
            print(f"[SCENE] Warning: Failed to load PyTorch model: {e}")
            import traceback

            traceback.print_exc()
            return False
        # Publish the model only after it is fully configured, so a failed
        # load never leaves a half-initialised model on the instance.
        self.model = model
        self.model_type = model_type
        print("[SCENE] PyTorch model loaded successfully")
        return True

    def _frame_to_image(self, frame: Any) -> Optional[Any]:
        """Convert a path, an OpenCV BGR ndarray, or a PIL image to RGB PIL.

        Returns None (after logging a warning) for unsupported inputs.
        """
        if isinstance(frame, (str, Path)):
            return Image.open(frame).convert("RGB")
        if HAS_CV and hasattr(frame, "shape") and len(frame.shape) == 3:
            # OpenCV frame (BGR ndarray).
            return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        if hasattr(frame, "convert"):
            # PIL image.
            return frame.convert("RGB")
        print(f"[SCENE] Warning: Unknown frame type: {type(frame)}")
        return None

    def predict_frame(self, frame: Any) -> List[Dict[str, Any]]:
        """Predict the scene type of a single image.

        Args:
            frame: Image path, OpenCV ndarray (BGR), or PIL image.

        Returns:
            List[Dict]: Up to ``TOP_K`` entries of the form
            ``{"scene_type": str, "confidence": float}``, best first.
            Empty when no model is loaded, the input type is unsupported,
            or inference fails.
        """
        if self.coreml_model is None and self.model is None:
            print("[SCENE] Warning: No model loaded")
            return []

        img = self._frame_to_image(frame)
        if img is None:
            return []

        if self.coreml_model is not None:
            return self._predict_coreml(img)
        return self._predict_pytorch(img)

    def _predict_coreml(self, img: Any) -> List[Dict[str, Any]]:
        """Run the Core ML model and return its top-ranked labels."""
        try:
            # NOTE(review): assumes the converted model takes an "image" input
            # and emits a "probs" label->probability mapping — confirm against
            # the actual .mlmodel.
            output = self.coreml_model.predict({"image": img})
            probs = output.get("probs", {})
            ranked = sorted(probs.items(), key=lambda item: item[1], reverse=True)
            return [
                {"scene_type": label, "confidence": float(conf)}
                for label, conf in ranked[: self.TOP_K]
            ]
        except Exception as e:
            print(f"[SCENE] Core ML prediction error: {e}")
            return []

    def _predict_pytorch(self, img: Any) -> List[Dict[str, Any]]:
        """Run the PyTorch model and return its top-ranked labels."""
        try:
            with torch.no_grad():
                batch = self.transform(img).unsqueeze(0).to(DEVICE)
                probs = torch.nn.functional.softmax(self.model(batch), dim=1)[0]
                k = min(self.TOP_K, probs.shape[0])
                top_probs, top_indices = torch.topk(probs, k)

            # Only a Places365-trained head produces indices that are valid
            # keys of PLACES365_CATEGORIES. For the ImageNet fallback the same
            # lookup would attach unrelated scene names, so the neutral
            # "scene_<idx>" label is used instead.
            use_places365_names = self.model_type == "places365"
            results = []
            for prob, idx in zip(top_probs.tolist(), top_indices.tolist()):
                fallback = f"scene_{idx}"
                if use_places365_names:
                    scene_type = PLACES365_CATEGORIES.get(str(idx), fallback)
                else:
                    scene_type = fallback
                results.append({"scene_type": scene_type, "confidence": prob})
            return results
        except Exception as e:
            print(f"[SCENE] PyTorch prediction error: {e}")
            import traceback

            traceback.print_exc()
            return []

    def classify_video(
        self,
        video_path: str,
        output_path: str,
        sample_interval: float = 2.0,
        min_scene_duration: float = 3.0,
    ) -> Dict[str, Any]:
        """Classify a whole video and write the result as JSON.

        Args:
            video_path: Path of the input video.
            output_path: Path of the JSON file to write.
            sample_interval: Seconds between sampled frames.
            min_scene_duration: Minimum scene duration in seconds (passed on
                to ``_merge_scenes``).

        Returns:
            Dict: The result that was written. When OpenCV is missing, the
            video cannot be opened, or it reports no usable FPS, an empty
            result is returned and no file is written.
        """
        empty_result: Dict[str, Any] = {"frame_count": 0, "fps": 0.0, "scenes": []}
        if not HAS_CV:
            print("[SCENE] Error: OpenCV not available")
            return empty_result

        cap = cv2.VideoCapture(video_path)
        try:
            if not cap.isOpened():
                print(f"[SCENE] Error: Cannot open video: {video_path}")
                return empty_result

            fps = cap.get(cv2.CAP_PROP_FPS)
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Timestamps are frame_index / fps; without a positive FPS they
            # cannot be computed (`not fps > 0` also rejects NaN).
            if not fps > 0:
                print(f"[SCENE] Error: Invalid FPS ({fps}) for video: {video_path}")
                return empty_result
            duration = total_frames / fps
            print(f"[SCENE] Video: {video_path}")
            print(
                f"[SCENE] FPS: {fps}, Frames: {total_frames}, Duration: {duration:.1f}s"
            )

            # Classify one frame every `sample_interval` seconds.
            sample_interval_frames = max(1, int(fps * sample_interval))
            predictions = []
            frame_count = 0
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame_count += 1
                if frame_count % sample_interval_frames != 0:
                    continue
                pred = self.predict_frame(frame)
                if not pred:
                    continue
                predictions.append(
                    {"timestamp": frame_count / fps, "predictions": pred}
                )
                # Report progress every 10 collected samples. The reported
                # frame count may be 0, so guard the division.
                if len(predictions) % 10 == 0:
                    progress = (
                        (frame_count / total_frames) * 100 if total_frames > 0 else 0.0
                    )
                    print(
                        f"[SCENE] Progress: {progress:.1f}% ({len(predictions)} samples)"
                    )
        finally:
            # Always release the capture handle, including on early returns
            # and unexpected errors.
            cap.release()

        print(f"[SCENE] Collected {len(predictions)} predictions")
        scenes = self._merge_scenes(predictions, min_scene_duration, duration)

        if self.coreml_model is not None:
            backend = "coreml"
        elif self.model is not None:
            backend = "pytorch"
        else:
            backend = "none"

        result = {
            "frame_count": total_frames,
            "fps": fps,
            "scenes": scenes,
            "metadata": {
                "video_path": video_path,
                "duration": duration,
                "sample_interval": sample_interval,
                "min_scene_duration": min_scene_duration,
                "processed_at": datetime.now().isoformat(),
                "model_type": backend,
            },
        }

        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)
        print(f"[SCENE] Result saved to: {output_path}")
        print(f"[SCENE] Detected {len(scenes)} scenes")
        return result

    def _merge_scenes(
        self, predictions: List[Dict], min_duration: float, total_duration: float
    ) -> List[Dict[str, Any]]:
        """Summarise the sampled predictions as a single scene.

        The whole sampled range is reported as one scene whose type is the
        most frequent top-1 label and whose confidence is the mean top-1
        confidence.

        NOTE: ``min_duration`` and ``total_duration`` are accepted for
        interface compatibility but are not used by the current
        implementation; no per-segment merging is performed.

        Args:
            predictions: Items of the form
                ``{"timestamp": float, "predictions": [{"scene_type", "confidence"}, ...]}``.
            min_duration: Unused (see note above).
            total_duration: Unused (see note above).

        Returns:
            A list with one scene dict, or an empty list when there is no
            sample carrying at least one prediction.
        """
        # Only samples that actually carry predictions take part in the
        # statistics, so the average is divided by the matching count.
        scored = [sample for sample in predictions if sample["predictions"]]
        if not scored:
            return []

        scene_counts: Dict[str, int] = {}
        for sample in scored:
            label = sample["predictions"][0]["scene_type"]
            scene_counts[label] = scene_counts.get(label, 0) + 1
        # On ties the label that was seen first wins (dict insertion order).
        most_common_scene = max(scene_counts, key=scene_counts.get)

        avg_confidence = sum(
            sample["predictions"][0]["confidence"] for sample in scored
        ) / len(scored)

        return [
            {
                "start_time": predictions[0]["timestamp"],
                "end_time": predictions[-1]["timestamp"],
                "scene_type": most_common_scene,
                "scene_type_zh": SCENE_TYPE_ZH.get(most_common_scene),
                "confidence": avg_confidence,
                "top_5": scored[0]["predictions"][:5],
            }
        ]
def main():
    """Command-line entry point.

    Parses the arguments, optionally prints an environment health report,
    loads a model, classifies the video, and prints a per-scene summary.
    When no model can be loaded, an empty result file is written and the
    process exits with status 0.
    """
    parser = argparse.ArgumentParser(
        description="場景識別處理器 - 使用 Core ML + Places365"
    )
    parser.add_argument("video_path", nargs="?", help="輸入影片路徑")
    parser.add_argument("output_path", nargs="?", help="輸出 JSON 路徑")
    parser.add_argument("--uuid", help="影片 UUID (用於日誌)", default=None)
    parser.add_argument("--model", help="Core ML 模型路徑", default=None)
    parser.add_argument(
        "--sample-interval", type=float, default=2.0, help="取樣間隔 (秒),預設 2.0"
    )
    parser.add_argument(
        "--min-scene-duration",
        type=float,
        default=3.0,
        help="最小場景持續時間 (秒),預設 3.0",
    )
    parser.add_argument("--check-health", action="store_true", help="檢查環境並退出")
    options = parser.parse_args()

    # Health check: report which optional dependencies are importable, then exit.
    if options.check_health:
        print("=== 場景識別處理器健康檢查 ===")
        dependency_flags = (
            ("Core ML", HAS_COREML),
            ("PyTorch", HAS_TORCH),
            ("PIL", HAS_PIL),
            ("OpenCV", HAS_CV),
        )
        for label, available in dependency_flags:
            status = "✓ Available" if available else "✗ Not available"
            print(f"{label}: {status}")
        if HAS_TORCH:
            print(f"Device: {DEVICE}")
        sys.exit(0)

    # Both positional arguments are required for an actual run.
    if not (options.video_path and options.output_path):
        parser.print_help()
        sys.exit(1)

    # Pillow and OpenCV are hard requirements for video classification.
    if not (HAS_PIL and HAS_CV):
        print("[SCENE] Error: Missing required dependencies (PIL/OpenCV)")
        sys.exit(1)

    classifier = SceneClassifier(model_path=options.model)

    if not classifier.load_model():
        print("[SCENE] Warning: No model loaded, will return empty results")
        # Write an empty result so the output file exists even without a model.
        metadata = {
            "video_path": options.video_path,
            "error": "No model available",
            "processed_at": datetime.now().isoformat(),
        }
        empty_result = {
            "frame_count": 0,
            "fps": 0.0,
            "scenes": [],
            "metadata": metadata,
        }
        with open(options.output_path, "w", encoding="utf-8") as out_file:
            json.dump(empty_result, out_file, ensure_ascii=False, indent=2)
        sys.exit(0)

    # Run the classification and time it.
    started_at = time.time()
    result = classifier.classify_video(
        video_path=options.video_path,
        output_path=options.output_path,
        sample_interval=options.sample_interval,
        min_scene_duration=options.min_scene_duration,
    )
    elapsed = time.time() - started_at
    print(f"[SCENE] Completed in {elapsed:.1f}s")

    # Per-scene summary (skipped when nothing was detected).
    scenes = result["scenes"]
    if not scenes:
        return
    print("\n[SCENE] 場景統計:")
    for scene in scenes:
        scene_name = scene.get("scene_type_zh") or scene.get("scene_type")
        start = scene["start_time"]
        end = scene["end_time"]
        duration = end - start
        conf = scene.get("confidence", 0) * 100
        print(
            f" - {scene_name}: {start:.1f}s - {end:.1f}s ({duration:.1f}s, {conf:.0f}%)"
        )


if __name__ == "__main__":
    main()