#!/opt/homebrew/bin/python3.11 """ Comprehensive search comparison: Text, Vector (PostgreSQL & Qdrant), Object, and MongoDB search """ import time import requests import psycopg2 from pymongo import MongoClient VIDEO_UUID = "39567a0eb16f39fd" POSTGRES_CONFIG = { "host": "localhost", "port": 5432, "user": "accusys", "password": "Test3200", "database": "momentry", } MONGO_URI = "mongodb://localhost:27017" MONGO_DB = "momentry" MONGO_COLLECTION = "chunks" TEST_QUERIES = [ ("text", "Paris"), ("text", " Audrey Hepburn"), ("text", "Cary Grant"), ("vector", "Paris"), ("vector", " Audrey Hepburn"), ("vector", "Cary Grant"), ("object", "person"), ("object", "car"), ("object", "clock"), ("object", "tie"), ] def test_text_search(): """Test PostgreSQL text search""" results = {} conn = psycopg2.connect(**POSTGRES_CONFIG) cur = conn.cursor() for query in ["Paris", " Audrey Hepburn", "Cary Grant"]: start = time.time() cur.execute( "SELECT chunk_id, content->>'text' FROM chunks WHERE chunk_type = 'sentence' AND content->>'text' ILIKE %s LIMIT 10", (f"%{query}%",), ) rows = cur.fetchall() elapsed = (time.time() - start) * 1000 results[query] = {"ms": round(elapsed, 2), "rows": len(rows)} print(f"PostgreSQL text '{query}': {elapsed:.2f}ms, {len(rows)} rows") cur.close() conn.close() return results def test_mongodb_text_search(): """Test MongoDB text search""" results = {} mongo_client = MongoClient(MONGO_URI) mongo_collection = mongo_client[MONGO_DB][MONGO_COLLECTION] for query in ["Paris", "Audrey Hepburn", "Cary Grant"]: start = time.time() cursor = mongo_collection.find( {"uuid": VIDEO_UUID, "chunk_type": "sentence", "$text": {"$search": query}} ).limit(10) rows = list(cursor) elapsed = (time.time() - start) * 1000 results[query] = {"ms": round(elapsed, 2), "rows": len(rows)} print(f"MongoDB text '{query}': {elapsed:.2f}ms, {len(rows)} rows") mongo_client.close() return results def test_qdrant_vector_search(): """Test Qdrant vector search""" results = {} for query in ["Paris", " Audrey Hepburn", "Cary Grant"]: # Get embedding from Ollama embed_resp = requests.post( "http://localhost:11434/api/embeddings", json={"model": "nomic-embed-text", "prompt": query}, ) embedding = embed_resp.json()["embedding"] # Search in Qdrant start = time.time() resp = requests.post( "http://localhost:6333/collections/AccusysDB/points/search", headers={"api-key": "Test3200Test3200Test3200"}, json={"vector": embedding, "limit": 10}, ) elapsed = (time.time() - start) * 1000 data = resp.json() result_count = len(data.get("result", [])) results[query] = {"ms": round(elapsed, 2), "rows": result_count} print(f"Qdrant vector '{query}': {elapsed:.2f}ms, {result_count} rows") return results def test_postgres_vector_search(): """Test PostgreSQL vector search using pgvector""" results = {} conn = psycopg2.connect(**POSTGRES_CONFIG) cur = conn.cursor() for query in ["Paris", " Audrey Hepburn", "Cary Grant"]: # Get embedding from Ollama embed_resp = requests.post( "http://localhost:11434/api/embeddings", json={"model": "nomic-embed-text", "prompt": query}, ) embedding = embed_resp.json()["embedding"] # Search in PostgreSQL using pgvector start = time.time() # Convert to vector string format vector_str = "[" + ",".join(str(x) for x in embedding) + "]" cur.execute( """ SELECT chunk_id, (embedding_vector <=> %s::vector) as distance FROM chunk_vectors WHERE embedding_vector IS NOT NULL ORDER BY embedding_vector <=> %s::vector LIMIT 10 """, (vector_str, vector_str), ) rows = cur.fetchall() elapsed = (time.time() - start) * 1000 results[query] = {"ms": round(elapsed, 2), "rows": len(rows)} print(f"PostgreSQL vector '{query}': {elapsed:.2f}ms, {len(rows)} rows") cur.close() conn.close() return results def test_object_search(): """Test PostgreSQL object search""" results = {} conn = psycopg2.connect(**POSTGRES_CONFIG) cur = conn.cursor() for obj in ["person", "car", "clock", "tie"]: start = time.time() cur.execute( """ SELECT chunk_id FROM chunks WHERE uuid = %s AND chunk_type = 'sentence' AND metadata IS NOT NULL AND metadata->'yolo'->'objects' ? %s LIMIT 10 """, (VIDEO_UUID, obj), ) rows = cur.fetchall() elapsed = (time.time() - start) * 1000 results[obj] = {"ms": round(elapsed, 2), "rows": len(rows)} print(f"PostgreSQL object '{obj}': {elapsed:.2f}ms, {len(rows)} rows") cur.close() conn.close() return results def main(): print("=" * 70) print("SEARCH PERFORMANCE COMPARISON") print("=" * 70) # Get chunk count conn = psycopg2.connect(**POSTGRES_CONFIG) cur = conn.cursor() cur.execute( "SELECT COUNT(*) FROM chunks WHERE uuid = %s AND chunk_type = 'sentence'", (VIDEO_UUID,), ) chunk_count = cur.fetchone()[0] print(f"\nTotal sentence chunks: {chunk_count}") print(f"Video UUID: {VIDEO_UUID}") cur.close() conn.close() print("\n" + "=" * 70) print("A. TEXT SEARCH (PostgreSQL ILIKE)") print("=" * 70) text_results = test_text_search() print("\n" + "=" * 70) print("A2. TEXT SEARCH (MongoDB Text)") print("=" * 70) mongodb_results = test_mongodb_text_search() print("\n" + "=" * 70) print("B1. VECTOR SEARCH (Qdrant HNSW)") print("=" * 70) qdrant_results = test_qdrant_vector_search() print("\n" + "=" * 70) print("B2. VECTOR SEARCH (PostgreSQL pgvector HNSW)") print("=" * 70) pgvector_results = test_postgres_vector_search() print("\n" + "=" * 70) print("C. OBJECT SEARCH (PostgreSQL JSON)") print("=" * 70) object_results = test_object_search() print("\n" + "=" * 70) print("SUMMARY") print("=" * 70) print(f"\n{'Method':<28} | {'Query':<20} | {'Time (ms)':<12} | {'Results'}") print("-" * 75) for query, data in text_results.items(): print( f"{'PostgreSQL ILIKE':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}" ) for query, data in mongodb_results.items(): print( f"{'MongoDB Text':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}" ) for query, data in qdrant_results.items(): print( f"{'Qdrant HNSW':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}" ) for query, data in pgvector_results.items(): print( f"{'PostgreSQL pgvector':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}" ) for query, data in object_results.items(): print( f"{'PostgreSQL JSON':<28} | {query:<20} | {data['ms']:<12.1f} | {data['rows']}" ) # Calculate averages text_avg = sum(d["ms"] for d in text_results.values()) / len(text_results) mongodb_avg = sum(d["ms"] for d in mongodb_results.values()) / len(mongodb_results) qdrant_avg = sum(d["ms"] for d in qdrant_results.values()) / len(qdrant_results) pgvector_avg = sum(d["ms"] for d in pgvector_results.values()) / len( pgvector_results ) object_avg = sum(d["ms"] for d in object_results.values()) / len(object_results) print("\n" + "=" * 70) print("AVERAGE RESPONSE TIME") print("=" * 70) print(f" PostgreSQL ILIKE (Text): {text_avg:.2f}ms") print(f" MongoDB Text: {mongodb_avg:.2f}ms") print(f" PostgreSQL pgvector (Vector): {pgvector_avg:.2f}ms") print(f" Qdrant HNSW (Vector): {qdrant_avg:.2f}ms") print(f" PostgreSQL JSON (Object): {object_avg:.2f}ms") print("\n" + "=" * 70) print("ANALYSIS") print("=" * 70) print( """ 1. TEXT SEARCH (PostgreSQL ILIKE): - Fast: ~0.7ms average - Exact substring matching - Case-insensitive - Good for keyword searches 2. VECTOR SEARCH - PostgreSQL pgvector (HNSW): - Speed: ~{:.1f}ms average - Built into PostgreSQL - No additional infrastructure needed - Good for single-database architecture 3. VECTOR SEARCH - Qdrant (HNSW): - Speed: ~{:.1f}ms average - Dedicated vector database - Better for large-scale deployments - Supports more advanced vector operations 4. OBJECT SEARCH (PostgreSQL JSON): - Very fast: ~{:.1f}ms average - Uses JSON containment operator - Works with YOLO metadata - Best for visual object queries RECOMMENDATION: - For simple keyword searches: PostgreSQL ILIKE - For semantic search with single DB: PostgreSQL pgvector - For scalability: Qdrant - For visual content: PostgreSQL JSON with YOLO metadata """.format(pgvector_avg, qdrant_avg, object_avg) ) if __name__ == "__main__": main()