"""MaskFormer judge: scene classification via COCO-Stuff 171 class""" import torch, numpy as np from PIL import Image from transformers import MaskFormerImageProcessor, MaskFormerForInstanceSegmentation MODEL_ID = "facebook/maskformer-resnet50-coco-stuff" _model = None _processor = None _id2label = None INDOOR_STUFF = {"wall", "floor", "ceiling", "door", "window", "curtain", "desk", "table", "furniture", "bed", "chair", "cabinet", "shelf", "carpet", "pillow"} OUTDOOR_STUFF = {"sky", "road", "river", "sea", "grass", "tree", "mountain", "pavement", "sand", "gravel", "snow", "cloud"} def load(): global _model, _processor, _id2label if _model is None: _processor = MaskFormerImageProcessor.from_pretrained(MODEL_ID) _model = MaskFormerForInstanceSegmentation.from_pretrained(MODEL_ID).eval() if torch.backends.mps.is_available(): _model = _model.to("mps") _id2label = {int(k): v for k, v in _model.config.id2label.items()} def score(frames, prompt): load() results = [] for img in frames: w, h = img.size inputs = _processor(images=img, return_tensors="pt") if torch.backends.mps.is_available(): inputs = {k: v.to("mps") for k, v in inputs.items()} with torch.no_grad(): outputs = _model(**inputs) seg = _processor.post_process_semantic_segmentation(outputs, target_sizes=[(h, w)])[0].cpu().numpy() classes, counts = np.unique(seg, return_counts=True) total_px = h * w stuff_found = [] indoor_px = outdoor_px = 0 for cid, cnt in zip(classes, counts): lbl = _id2label.get(int(cid), f"_{cid}") pct = 100 * cnt / total_px if pct > 1.0: stuff_found.append((lbl, pct)) if lbl in INDOOR_STUFF: indoor_px += cnt if lbl in OUTDOOR_STUFF: outdoor_px += cnt is_indoor = bool(indoor_px > outdoor_px) dominant = max(stuff_found, key=lambda x: x[1]) if stuff_found else ("unknown", 0) results.append({ "is_indoor": is_indoor, "dominant_stuff": dominant[0], "dom_pct": round(dominant[1], 1), "top_stuff": stuff_found[:5] }) is_indoor = results[0]["is_indoor"] if results else None return { "agent": "MaskFormer", "score": 100 if is_indoor else 0, "reasoning": f"Scene: {'indoor' if is_indoor else 'outdoor'} (dom={results[0]['dominant_stuff']})", "details": {"frames": results} }