#!/bin/bash
# Start Gemma 4 E4B with vMLX (Vision + Audio, 8-bit)
# Per: GEMMA4_E4B_4BIT_SETUP.md

# --- Configuration ---------------------------------------------------------
# Path to the vmlx executable that is invoked with `serve`.
VMLX='/Users/accusys/vmlx/.venv/bin/vmlx'
# Model path handed to `vmlx serve` as its positional argument.
MODEL='/Users/accusys/models/mlx-gemma4-e4b-it-8bit'
# Port used for cleanup, for `--port`, and for the health-check URL.
PORT='8000'
# File that receives the server's stdout and stderr.
LOG_FILE='/Users/accusys/momentry_core/logs/vmlx_8000.log'

# Stop whatever is currently listening on $PORT so the new server can bind it.
# `-t` makes lsof print bare PIDs (no header row to skip), and
# `-s TCP:LISTEN` restricts the match to listening sockets. A plain
# `lsof -i :$PORT` also lists processes that merely hold a connection
# involving that port (i.e. clients), and those must not be killed.
lsof -t -i "TCP:${PORT}" -s TCP:LISTEN | while IFS= read -r pid; do
  kill "$pid"
done
# Fixed grace period for the old process to exit and release the port.
sleep 2

# Start vMLX server (editable install v1.5.59)
# The log directory must exist before the redirection below can open the
# file; if it cannot be created the server would never launch, so fail now
# instead of announcing a start and polling for a server that is not there.
if ! mkdir -p "$(dirname "$LOG_FILE")"; then
  echo "Cannot create log directory for $LOG_FILE" >&2
  exit 1
fi

# Every expansion is quoted so a path containing spaces or glob characters is
# passed as a single argument. The server runs in the background with stdout
# and stderr both written to $LOG_FILE (truncated on each start).
"$VMLX" serve "$MODEL" \
  --host 0.0.0.0 --port "$PORT" \
  --enable-prefix-cache \
  --use-paged-cache \
  --enable-disk-cache \
  --kv-cache-quantization q8 \
  --max-cache-blocks 2048 \
  --timeout 1200 \
  --log-level INFO \
  --served-model-name gemma-4-E4B \
  > "$LOG_FILE" 2>&1 &

echo "vMLX server starting on port $PORT"
echo "Model: Gemma 4 E4B 8bit (MLX) — supports Vision + Audio"
echo "Log: $LOG_FILE"

# Wait for ready
# Poll the health endpoint for up to 30 attempts. Each attempt gives curl at
# most 2 s (-m 2) and is followed by a 2 s pause, so the attempt number is not
# a number of seconds. SECONDS is bash's built-in elapsed-time counter;
# resetting it here lets both messages report real wall-clock time.
SECONDS=0
for ((attempt = 1; attempt <= 30; attempt++)); do
  # -f makes curl exit non-zero on HTTP >= 400, so an error response from the
  # endpoint is not mistaken for a healthy server.
  if curl -sf -m 2 "http://localhost:${PORT}/health" >/dev/null 2>&1; then
    echo "✅ Ready (${SECONDS} s)"
    echo "API: http://localhost:$PORT/v1/chat/completions"
    exit 0
  fi
  sleep 2
done

echo "❌ Not ready after ${SECONDS}s, check log"
exit 1