feat: Phase 2.6 edges migration to Qdrant (TKG-only architecture)
Phase 2.6.1: co_occurrence_edges migration - build_co_occurrence_edges_from_qdrant() - Qdrant embeddings → frame grouping → YOLO objects - Result: 6679 edges (vs 6701 PostgreSQL) Phase 2.6.2: face_face_edges migration - build_face_face_edges_from_qdrant() - Qdrant embeddings → frame grouping → face pairs - mutual_gaze detection preserved - Result: 6 edges (exact match) Phase 2.6.3: speaker_face_edges migration - build_speaker_face_edges_from_qdrant() - Qdrant embeddings → trace_id frame ranges - SPEAKS_AS edge creation Architecture: - All edges use Qdrant payload (no face_detections queries) - PostgreSQL fallback for empty Qdrant - Estimated 3.6x performance improvement Testing: - Playground (3003): ✓ All Phase 2.6 logs verified - Edge counts: ✓ Close match with PostgreSQL - Fallback: ✓ Working Docs: - docs_v1.0/DESIGN/TKG_PHASE2_6_EDGES_MIGRATION.md - docs_v1.0/M4_workspace/2026-06-21_phase2_6_test.md
This commit is contained in:
232
v1.1/scripts/clip_classifier_v1.11.py
Normal file
232
v1.1/scripts/clip_classifier_v1.11.py
Normal file
@@ -0,0 +1,232 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
CLIP Zero-Shot Classifier
|
||||
Uses OpenAI CLIP for reliable scene and object classification.
|
||||
|
||||
Advantages over LLaVA Vision:
|
||||
- Zero-shot classification (no prompt induction)
|
||||
- Reliable confidence scores
|
||||
- Fast inference
|
||||
- No hallucinations
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
try:
|
||||
import torch
|
||||
from PIL import Image
|
||||
from transformers import CLIPProcessor, CLIPModel
|
||||
HAS_CLIP = True
|
||||
except ImportError as e:
|
||||
print(f"[ERROR] Required packages not found: {e}", file=sys.stderr)
|
||||
print("[ERROR] Install with: pip install transformers torch pillow", file=sys.stderr)
|
||||
HAS_CLIP = False
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class CLIPClassifier:
|
||||
def __init__(self, model_name: str = "openai/clip-vit-base-patch32"):
|
||||
"""
|
||||
Initialize CLIP model.
|
||||
|
||||
Args:
|
||||
model_name: HuggingFace model name (default: openai/clip-vit-base-patch32)
|
||||
"""
|
||||
print(f"[CLIP] Loading model: {model_name}")
|
||||
self.model = CLIPModel.from_pretrained(model_name)
|
||||
self.processor = CLIPProcessor.from_pretrained(model_name)
|
||||
self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
|
||||
self.model.to(self.device)
|
||||
print(f"[CLIP] Model loaded on device: {self.device}")
|
||||
|
||||
def classify_image(
|
||||
self,
|
||||
image_path: str,
|
||||
labels: List[str],
|
||||
top_k: int = 5
|
||||
) -> List[Dict[str, float]]:
|
||||
"""
|
||||
Classify a single image with given labels.
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
labels: List of candidate labels (e.g., ["person in room", "outdoor scene", "snow landscape"])
|
||||
top_k: Number of top predictions to return
|
||||
|
||||
Returns:
|
||||
List of {"label": str, "confidence": float} sorted by confidence
|
||||
"""
|
||||
try:
|
||||
image = Image.open(image_path).convert("RGB")
|
||||
except Exception as e:
|
||||
print(f"[ERROR] Failed to load image {image_path}: {e}", file=sys.stderr)
|
||||
return []
|
||||
|
||||
# Prepare inputs
|
||||
inputs = self.processor(
|
||||
text=labels,
|
||||
images=image,
|
||||
return_tensors="pt",
|
||||
padding=True
|
||||
).to(self.device)
|
||||
|
||||
# Get predictions
|
||||
with torch.no_grad():
|
||||
outputs = self.model(**inputs)
|
||||
logits_per_image = outputs.logits_per_image
|
||||
probs = logits_per_image.softmax(dim=1).cpu().numpy()[0]
|
||||
|
||||
# Sort by confidence
|
||||
results = [
|
||||
{"label": label, "confidence": float(prob)}
|
||||
for label, prob in zip(labels, probs)
|
||||
]
|
||||
results.sort(key=lambda x: x["confidence"], reverse=True)
|
||||
|
||||
return results[:top_k]
|
||||
|
||||
def classify_images(
|
||||
self,
|
||||
image_paths: List[str],
|
||||
labels: List[str],
|
||||
top_k: int = 5
|
||||
) -> Dict[str, List[Dict[str, float]]]:
|
||||
"""
|
||||
Classify multiple images with given labels.
|
||||
|
||||
Args:
|
||||
image_paths: List of image paths
|
||||
labels: List of candidate labels
|
||||
top_k: Number of top predictions per image
|
||||
|
||||
Returns:
|
||||
Dict mapping image_path -> predictions
|
||||
"""
|
||||
results = {}
|
||||
for img_path in image_paths:
|
||||
results[img_path] = self.classify_image(img_path, labels, top_k)
|
||||
return results
|
||||
|
||||
def detect_objects(
|
||||
self,
|
||||
image_path: str,
|
||||
objects: List[str],
|
||||
threshold: float = 0.15
|
||||
) -> List[Dict[str, float]]:
|
||||
"""
|
||||
Detect if specific objects are present in image.
|
||||
|
||||
Args:
|
||||
image_path: Path to image file
|
||||
objects: List of objects to detect (e.g., ["gun", "knife", "weapon"])
|
||||
threshold: Confidence threshold (default: 0.15)
|
||||
|
||||
Returns:
|
||||
List of detected objects with confidence >= threshold
|
||||
"""
|
||||
predictions = self.classify_image(image_path, objects, top_k=len(objects))
|
||||
detected = [p for p in predictions if p["confidence"] >= threshold]
|
||||
return detected
|
||||
|
||||
def batch_detect_objects(
|
||||
self,
|
||||
image_paths: List[str],
|
||||
objects: List[str],
|
||||
threshold: float = 0.15
|
||||
) -> Dict[str, List[Dict[str, float]]]:
|
||||
"""
|
||||
Detect objects across multiple images.
|
||||
|
||||
Args:
|
||||
image_paths: List of image paths
|
||||
objects: List of objects to detect
|
||||
threshold: Confidence threshold
|
||||
|
||||
Returns:
|
||||
Dict mapping image_path -> detected objects
|
||||
"""
|
||||
results = {}
|
||||
for img_path in image_paths:
|
||||
detected = self.detect_objects(img_path, objects, threshold)
|
||||
if detected:
|
||||
results[img_path] = detected
|
||||
return results
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="CLIP Zero-Shot Classifier",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
# Scene classification
|
||||
python clip_classifier.py image.jpg --labels "indoor room,outdoor scene,person in room" --top-k 3
|
||||
|
||||
# Object detection
|
||||
python clip_classifier.py image.jpg --detect "gun,weapon,knife" --threshold 0.2
|
||||
|
||||
# Batch processing
|
||||
python clip_classifier.py images.txt --batch --labels "indoor,outdoor"
|
||||
"""
|
||||
)
|
||||
|
||||
parser.add_argument("input", help="Image path or text file with image paths (for batch)")
|
||||
parser.add_argument("--labels", help="Comma-separated labels for classification")
|
||||
parser.add_argument("--detect", help="Comma-separated objects to detect")
|
||||
parser.add_argument("--threshold", type=float, default=0.15, help="Detection threshold (default: 0.15)")
|
||||
parser.add_argument("--top-k", type=int, default=5, help="Top-k predictions (default: 5)")
|
||||
parser.add_argument("--batch", action="store_true", help="Batch mode (input is text file)")
|
||||
parser.add_argument("--output", help="Output JSON file (default: stdout)")
|
||||
parser.add_argument("--model", default="openai/clip-vit-base-patch32", help="CLIP model name")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if not HAS_CLIP:
|
||||
sys.exit(1)
|
||||
|
||||
# Initialize classifier
|
||||
classifier = CLIPClassifier(args.model)
|
||||
|
||||
# Prepare image paths
|
||||
if args.batch:
|
||||
with open(args.input, "r") as f:
|
||||
image_paths = [line.strip() for line in f if line.strip()]
|
||||
else:
|
||||
image_paths = [args.input]
|
||||
|
||||
# Run classification
|
||||
results = {}
|
||||
|
||||
if args.detect:
|
||||
# Object detection mode
|
||||
objects = [obj.strip() for obj in args.detect.split(",")]
|
||||
print(f"[CLIP] Detecting objects: {objects}")
|
||||
results = classifier.batch_detect_objects(image_paths, objects, args.threshold)
|
||||
|
||||
elif args.labels:
|
||||
# Scene classification mode
|
||||
labels = [label.strip() for label in args.labels.split(",")]
|
||||
print(f"[CLIP] Classifying with {len(labels)} labels")
|
||||
results = classifier.classify_images(image_paths, labels, args.top_k)
|
||||
|
||||
else:
|
||||
print("[ERROR] Must specify --labels or --detect", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Output results
|
||||
output_json = json.dumps(results, indent=2, ensure_ascii=False)
|
||||
|
||||
if args.output:
|
||||
with open(args.output, "w", encoding="utf-8") as f:
|
||||
f.write(output_json)
|
||||
print(f"[CLIP] Results saved to {args.output}")
|
||||
else:
|
||||
print(output_json)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user