Files
momentry_core/scripts/qa/judges/maskformer.py

66 lines
2.5 KiB
Python

"""MaskFormer judge: scene classification via the 171 COCO-Stuff classes."""
import torch, numpy as np
from PIL import Image
from transformers import MaskFormerImageProcessor, MaskFormerForInstanceSegmentation
# Checkpoint identifier passed to from_pretrained() for both the image
# processor and the segmentation model.
MODEL_ID = "facebook/maskformer-resnet50-coco-stuff"

# Lazily populated by load(); all three stay None until the first call.
_model = None
_processor = None
_id2label = None

# Labels counted as evidence of an indoor scene (exact string match against
# the model's id2label values).
INDOOR_STUFF = {
    "bed",
    "cabinet",
    "carpet",
    "ceiling",
    "chair",
    "curtain",
    "desk",
    "door",
    "floor",
    "furniture",
    "pillow",
    "shelf",
    "table",
    "wall",
    "window",
}

# Labels counted as evidence of an outdoor scene (exact string match against
# the model's id2label values).
OUTDOOR_STUFF = {
    "cloud",
    "grass",
    "gravel",
    "mountain",
    "pavement",
    "river",
    "road",
    "sand",
    "sea",
    "sky",
    "snow",
    "tree",
}
def load():
    """Load the MaskFormer processor and model once and cache them globally.

    Subsequent calls return immediately once ``_model`` has been set. The
    model is put in eval mode and moved to the MPS device when
    ``torch.backends.mps.is_available()`` reports it; otherwise it stays on
    the device ``from_pretrained`` placed it on.

    Side effects:
        Sets the module globals ``_processor``, ``_id2label`` and ``_model``.
    """
    global _model, _processor, _id2label
    if _model is not None:
        return

    # Build everything in locals first so that a failure part-way through
    # (download error, device move error, ...) cannot leave the module in a
    # half-initialised state where ``_model`` is set but ``_id2label`` is not.
    processor = MaskFormerImageProcessor.from_pretrained(MODEL_ID)
    model = MaskFormerForInstanceSegmentation.from_pretrained(MODEL_ID).eval()
    if torch.backends.mps.is_available():
        model = model.to("mps")
    # Config keys are normalised to int so lookups by class id work.
    id2label = {int(idx): name for idx, name in model.config.id2label.items()}

    _processor = processor
    _id2label = id2label
    # ``_model`` is the "already loaded" sentinel, so publish it last.
    _model = model
def _summarize_segmentation(seg, id2label):
    """Summarise one per-pixel class-id map into a per-frame result dict.

    Args:
        seg: NumPy array of integer class ids, one entry per pixel.
        id2label: Mapping from integer class id to label string. Ids missing
            from the mapping are reported as ``"_<id>"``.

    Returns:
        dict with keys ``is_indoor`` (bool), ``dominant_stuff`` (label with
        the largest coverage, or ``"unknown"``), ``dom_pct`` (its coverage in
        percent, rounded to one decimal) and ``top_stuff`` (up to five
        ``(label, percent)`` tuples, largest coverage first).
    """
    class_ids, pixel_counts = np.unique(seg, return_counts=True)
    total_px = seg.size
    stuff_found = []
    indoor_px = outdoor_px = 0
    for cid, cnt in zip(class_ids, pixel_counts):
        lbl = id2label.get(int(cid), f"_{cid}")
        # Plain Python numbers keep the result dict free of NumPy scalars.
        pct = float(100 * cnt / total_px)
        if pct > 1.0:
            # Only classes covering more than 1% of the frame are reported.
            stuff_found.append((lbl, pct))
        if lbl in INDOOR_STUFF:
            indoor_px += int(cnt)
        if lbl in OUTDOOR_STUFF:
            outdoor_px += int(cnt)

    # np.unique yields classes in id order; sort by coverage so that
    # "top_stuff" really holds the largest classes. The sort is stable, so
    # ties keep id order and the first entry equals max(..., key=pct).
    stuff_found.sort(key=lambda item: item[1], reverse=True)
    dominant = stuff_found[0] if stuff_found else ("unknown", 0)
    return {
        "is_indoor": indoor_px > outdoor_px,
        "dominant_stuff": dominant[0],
        "dom_pct": round(dominant[1], 1),
        "top_stuff": stuff_found[:5],
    }


def score(frames, prompt):
    """Classify frames as indoor/outdoor with MaskFormer semantic segmentation.

    Args:
        frames: Iterable of images exposing a ``.size`` of ``(width, height)``
            and accepted by the MaskFormer image processor (presumably PIL
            images - confirm against the caller).
        prompt: Not used by this judge; accepted for interface compatibility.

    Returns:
        dict with keys ``agent``, ``score``, ``reasoning`` and ``details``.
        ``score`` is 100 when the FIRST frame is classified as indoor and 0
        otherwise; ``details["frames"]`` holds the per-frame results. With no
        frames the score is 0 and the model is not loaded.
    """
    frames = list(frames)
    if not frames:
        # Previously this path raised IndexError while building the reasoning
        # string; report an explicit "unknown" verdict instead.
        return {
            "agent": "MaskFormer",
            "score": 0,
            "reasoning": "Scene: unknown (no frames)",
            "details": {"frames": []},
        }

    load()
    # Send inputs wherever the model actually lives instead of re-probing
    # MPS availability for every frame.
    device = _model.device
    results = []
    for img in frames:
        w, h = img.size
        inputs = _processor(images=img, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = _model(**inputs)
        # target_sizes is (height, width): the map is resized back to the
        # input frame's resolution.
        seg = _processor.post_process_semantic_segmentation(
            outputs, target_sizes=[(h, w)]
        )[0]
        results.append(_summarize_segmentation(seg.cpu().numpy(), _id2label))

    # The overall verdict is taken from the first frame only.
    first = results[0]
    is_indoor = first["is_indoor"]
    return {
        "agent": "MaskFormer",
        "score": 100 if is_indoor else 0,
        "reasoning": f"Scene: {'indoor' if is_indoor else 'outdoor'} (dom={first['dominant_stuff']})",
        "details": {"frames": results},
    }