Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index Correction: asr-1.json format, generate/apply scripts API: 37/37 endpoints fixed and tested Docs: HANDOVER_V2.0.md for M4
79 lines
3.0 KiB
Python
Executable File
79 lines
3.0 KiB
Python
Executable File
"""
|
|
Simple HTTP server, built on the standard library's http.server, for CoreML ANE embedding inference.

Serves as a replacement for the /api/embeddings endpoint that comic_embed.rs calls.
|
|
"""
|
|
import json, os, argparse
|
|
from http.server import HTTPServer, BaseHTTPRequestHandler
|
|
import numpy as np
|
|
from transformers import AutoTokenizer
|
|
|
|
# Default location of the CoreML model package; main() exposes it as the
# default value of the --model command-line option.
MODEL_PATH = "/Users/accusys/models/mxbai-embed-large-v1.mlpackage"

# Process-wide model and tokenizer handles. Both start out unset and are
# assigned by main() before the server begins handling requests.
MODEL = None
TOKENIZER = None
|
|
|
|
class EmbeddingHandler(BaseHTTPRequestHandler):
    """HTTP handler that serves ``POST /api/embeddings``.

    The request body is a JSON object with a ``prompt`` string; a successful
    response is the JSON object ``{"embedding": [...]}``.  Tokenization and
    inference use the module-level ``TOKENIZER`` and ``MODEL`` globals, which
    ``main()`` assigns before the server starts.
    """

    # Prefixes removed from the start of the prompt before tokenization.
    # At most one is stripped, and the document prefix is checked first.
    _PROMPT_PREFIXES = ("search_document: ", "search_query: ")

    def do_POST(self):
        """Handle a POST request.

        Status codes:
            404 -- any path other than ``/api/embeddings`` (empty body).
            400 -- malformed request: invalid or negative ``Content-Length``,
                   body that is not valid JSON, JSON that is not an object,
                   or a ``prompt`` that is not a string.
            500 -- tokenization, inference or serialization raised.
            200 -- ``{"embedding": [...]}``.

        Every 400/500 response carries ``{"error": "<message>"}``.
        """
        if self.path != "/api/embeddings":
            self.send_response(404)
            self.end_headers()
            return

        # Validate the declared length before reading: a non-numeric value
        # would otherwise raise out of the handler, and a negative value
        # would make rfile.read() consume the stream until EOF.
        try:
            length = int(self.headers.get("Content-Length", 0))
        except (TypeError, ValueError):
            self._respond_error(400, "invalid Content-Length header")
            return
        if length < 0:
            self._respond_error(400, "invalid Content-Length header")
            return

        body = self.read(length)

        try:
            data = json.loads(body)
        except ValueError as e:
            # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            self._respond_error(400, str(e))
            return
        if not isinstance(data, dict):
            self._respond_error(400, "request body must be a JSON object")
            return

        prompt = data.get("prompt", "")
        if not isinstance(prompt, str):
            self._respond_error(400, "prompt must be a string")
            return

        try:
            embedding = self._embed(self._strip_prefix(prompt))
            payload = json.dumps({"embedding": embedding}).encode()
        except Exception as e:
            # Request boundary: report any tokenizer/model failure to the
            # client instead of dropping the connection.
            self._respond_error(500, str(e))
            return

        # Written outside the try block so a failed socket write cannot
        # trigger a second response on the same connection.
        self._respond(200, payload)

    def read(self, length):
        """Read up to ``length`` bytes from the request stream."""
        return self.rfile.read(length)

    def _strip_prefix(self, prompt):
        """Return ``prompt`` without a leading search prefix, if it has one."""
        for prefix in self._PROMPT_PREFIXES:
            if prompt.startswith(prefix):
                return prompt[len(prefix):]
        return prompt

    def _embed(self, prompt):
        """Tokenize ``prompt`` and return the model's embedding as a list."""
        # Always pad/truncate to exactly 512 tokens.
        tokens = TOKENIZER(
            prompt,
            return_tensors="np",
            padding="max_length",
            truncation=True,
            max_length=512,
        )
        # NOTE(review): presumably the model's inputs are declared as int32;
        # confirm against the .mlpackage input spec.
        input_ids = tokens["input_ids"].astype(np.int32)
        attention_mask = tokens["attention_mask"].astype(np.int32)
        result = MODEL.predict(
            {"input_ids": input_ids, "attention_mask": attention_mask}
        )
        # Only the first row of the "embedding" output is returned.
        return result["embedding"][0].tolist()

    def _respond(self, status, payload):
        """Send ``payload`` (bytes) as a JSON response with ``status``."""
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()
        self.wfile.write(payload)

    def _respond_error(self, status, message):
        """Send ``{"error": message}`` with the given HTTP ``status``."""
        self._respond(status, json.dumps({"error": message}).encode())
|
|
|
|
def main():
    """Parse CLI options, load the model and tokenizer, then serve forever.

    Options:
        --port   TCP port to listen on (default 11435).
        --model  Path of the CoreML model to load (default ``MODEL_PATH``).

    Assigns the module-level ``MODEL`` and ``TOKENIZER`` globals, binds an
    ``HTTPServer`` to 127.0.0.1 on the chosen port with ``EmbeddingHandler``,
    prints a short usage banner and blocks in ``serve_forever()``.
    """
    global MODEL, TOKENIZER

    cli = argparse.ArgumentParser()
    cli.add_argument("--port", type=int, default=11435)
    cli.add_argument("--model", default=MODEL_PATH)
    args = cli.parse_args()

    # Imported here rather than at module top level.
    import coremltools

    print(f"Loading CoreML model from {args.model}...")
    MODEL = coremltools.models.MLModel(
        args.model,
        compute_units=coremltools.ComputeUnit.ALL,
    )
    print(f"Model loaded (compute: {MODEL.compute_unit})")

    print("Loading tokenizer...")
    TOKENIZER = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")
    print("Tokenizer loaded")

    # Loopback only: the server is not reachable from other hosts.
    bind_address = ("127.0.0.1", args.port)
    httpd = HTTPServer(bind_address, EmbeddingHandler)

    banner = (
        f"ANE Embedding server running on port {args.port}",
        f"API: POST http://127.0.0.1:{args.port}/api/embeddings",
        f" Body: {{\"model\": \"...\", \"prompt\": \"...\"}}",
        f" Response: {{\"embedding\": [...]}}",
    )
    for line in banner:
        print(line)

    httpd.serve_forever()
|
|
|
|
# Start the server only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|