"""MaskFormer judge: scene classification via the 171 COCO-Stuff classes."""
|
|
import torch, numpy as np
|
|
from PIL import Image
|
|
from transformers import MaskFormerImageProcessor, MaskFormerForInstanceSegmentation
|
|
|
|
# Hugging Face checkpoint passed to `from_pretrained` for both the image
# processor and the segmentation model.
MODEL_ID = "facebook/maskformer-resnet50-coco-stuff"

# Lazily populated cache; all three stay None until load() has run.
_model = _processor = _id2label = None

# Labels counted as evidence for an indoor scene. They are compared by exact
# string equality against the names in the model's id2label mapping.
# NOTE(review): verify these strings match the checkpoint's label names
# exactly; a label carrying a suffix or variant spelling would not match.
INDOOR_STUFF = {
    "wall", "floor", "ceiling", "door", "window",
    "curtain", "desk", "table", "furniture", "bed",
    "chair", "cabinet", "shelf", "carpet", "pillow",
}

# Labels counted as evidence for an outdoor scene (same exact-match rule).
OUTDOOR_STUFF = {
    "sky", "road", "river", "sea", "grass", "tree",
    "mountain", "pavement", "sand", "gravel", "snow", "cloud",
}
|
|
|
|
def load():
    """Load the MaskFormer processor and model once and cache them globally.

    Populates the module-level ``_processor``, ``_model`` and ``_id2label``.
    Calls made after a successful load return immediately.

    The model is put in eval mode and moved to the "mps" device when
    ``torch.backends.mps.is_available()`` reports it; otherwise it stays on
    the device ``from_pretrained`` placed it on.

    Everything is built in locals first and published to the globals only
    after every step has succeeded. ``_model`` doubles as the "already
    loaded" flag, so assigning it before the device move or the label-map
    build could leave a half-initialised cache (``_model`` set, ``_id2label``
    still None) that later calls would never repair.
    """
    global _model, _processor, _id2label
    if _model is not None:
        return

    processor = MaskFormerImageProcessor.from_pretrained(MODEL_ID)
    model = MaskFormerForInstanceSegmentation.from_pretrained(MODEL_ID).eval()
    if torch.backends.mps.is_available():
        model = model.to("mps")
    # Keys are normalised with int() so lookups by integer class id work
    # whatever key type the config uses.
    id2label = {int(k): v for k, v in model.config.id2label.items()}

    # Publish last, with `_model` (the loaded flag) at the very end.
    _processor = processor
    _id2label = id2label
    _model = model
|
|
|
|
def score(frames, prompt):
    """Classify frames as indoor or outdoor from MaskFormer semantic segmentation.

    Each frame is segmented and the pixel share of every predicted class is
    computed. Classes covering more than 1% of the frame are recorded, and
    their pixels are added to the indoor or outdoor tally when the label is
    in ``INDOOR_STUFF`` / ``OUTDOOR_STUFF``. A frame is indoor when its indoor
    tally is strictly larger than its outdoor tally.

    Args:
        frames: Iterable of images exposing ``.size`` as ``(width, height)``
            and accepted by the image processor (presumably PIL images;
            confirm against callers).
        prompt: Not used by this judge; kept so the signature is unchanged.

    Returns:
        A dict with:
          * ``"agent"``: always ``"MaskFormer"``.
          * ``"score"``: 100 when the FIRST frame is indoor, otherwise 0.
            Later frames are reported in ``details`` but do not affect it.
          * ``"reasoning"``: one-line summary for the first frame.
          * ``"details"``: ``{"frames": [...]}`` with one entry per frame
            holding ``is_indoor``, ``dominant_stuff``, ``dom_pct`` and
            ``top_stuff`` (up to five ``(label, percent)`` pairs, largest
            coverage first).
        With no frames the score is 0, the reasoning says so, and
        ``details["frames"]`` is empty.
    """
    frames = list(frames)
    if not frames:
        # Nothing to classify: answer without loading the model. Indexing
        # results[0] for the reasoning string would otherwise raise.
        return {
            "agent": "MaskFormer",
            "score": 0,
            "reasoning": "Scene: unknown (no frames)",
            "details": {"frames": []},
        }

    load()
    # Follow whatever device load() placed the model on, so inputs and
    # weights always agree.
    device = _model.device

    results = []
    for img in frames:
        w, h = img.size
        inputs = _processor(images=img, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = _model(**inputs)
        # target_sizes is (height, width); it resizes the label map back to
        # the input frame's resolution.
        seg = _processor.post_process_semantic_segmentation(
            outputs, target_sizes=[(h, w)]
        )[0].cpu().numpy()

        classes, counts = np.unique(seg, return_counts=True)
        total_px = h * w
        stuff_found = []
        indoor_px = outdoor_px = 0
        for cid, cnt in zip(classes, counts):
            # Plain Python numbers keep the result dict free of numpy scalars.
            cid, cnt = int(cid), int(cnt)
            lbl = _id2label.get(cid, f"_{cid}")
            pct = 100 * cnt / total_px
            if pct <= 1.0:
                # Segments covering at most 1% of the frame are ignored.
                continue
            stuff_found.append((lbl, pct))
            # NOTE(review): the tallies are assumed to sit under the >1%
            # filter; confirm against the original layout.
            if lbl in INDOOR_STUFF:
                indoor_px += cnt
            if lbl in OUTDOOR_STUFF:
                outdoor_px += cnt

        # Largest coverage first, so "top_stuff" really is the top five.
        # The sort is stable, so ties keep class-id order.
        stuff_found.sort(key=lambda item: item[1], reverse=True)
        dominant = stuff_found[0] if stuff_found else ("unknown", 0)
        results.append({
            "is_indoor": indoor_px > outdoor_px,
            "dominant_stuff": dominant[0],
            "dom_pct": round(dominant[1], 1),
            "top_stuff": stuff_found[:5],
        })

    # The overall verdict is driven by the first frame only.
    first = results[0]
    is_indoor = first["is_indoor"]
    return {
        "agent": "MaskFormer",
        "score": 100 if is_indoor else 0,
        "reasoning": f"Scene: {'indoor' if is_indoor else 'outdoor'} (dom={first['dominant_stuff']})",
        "details": {"frames": results},
    }
|