# momentry_core/scripts/compare_search.py

#!/opt/homebrew/bin/python3.11
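"""
Search comparison script for PostgreSQL and Qdrant.

Times a PostgreSQL ILIKE text search and a Qdrant vector search for a fixed
set of queries and prints a side-by-side summary. MongoDB is not exercised by
the code in this file.
"""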

import time
import requests

# Search terms run against every backend. Each entry is used verbatim as an
# ILIKE substring, as an embedding prompt, and as a key in the result dicts,
# so entries must not carry accidental leading/trailing whitespace.
TEST_QUERIES = [
    "Charade",
    "Paris",
    "Audrey Hepburn",
    "Cary Grant",
]

# Keyword arguments passed to psycopg2.connect() for the local database.
POSTGRES_CONFIG = dict(
    host="localhost",
    port=5432,
    user="accusys",
    password="Test3200",
    database="momentry",
)


def test_postgres_text_search():
    """Time a case-insensitive substring search in PostgreSQL per test query.

    Runs one ``ILIKE`` query for every entry of ``TEST_QUERIES`` against the
    ``chunks`` table (rows with ``chunk_type = 'sentence'``, at most 10 rows
    each) over a single connection, printing each timing as it goes.

    Returns:
        dict: Maps each query string to a dict with the keys ``"method"``,
        ``"ms"`` (elapsed milliseconds, rounded to 2 decimals) and ``"rows"``
        (number of rows fetched).
    """
    import psycopg2

    sql = (
        "SELECT chunk_id, content->>'text' FROM chunks "
        "WHERE chunk_type = 'sentence' AND content->>'text' ILIKE %s LIMIT 10"
    )

    results = {}
    conn = psycopg2.connect(**POSTGRES_CONFIG)
    try:
        cur = conn.cursor()
        try:
            for query in TEST_QUERIES:
                # perf_counter() is monotonic and high-resolution; time.time()
                # can jump with wall-clock adjustments and skew short timings.
                start = time.perf_counter()
                cur.execute(sql, (f"%{query}%",))
                rows = cur.fetchall()
                elapsed = (time.perf_counter() - start) * 1000

                results[query] = {
                    "method": "PostgreSQL ILIKE",
                    "ms": round(elapsed, 2),
                    "rows": len(rows),
                }
                print(
                    f"PostgreSQL text search '{query}': {elapsed:.2f}ms, {len(rows)} rows"
                )
        finally:
            # Release the cursor even when a query fails.
            cur.close()
    finally:
        # Always return the connection to the server, error or not.
        conn.close()

    return results


def test_qdrant_vector_search():
    """Time a Qdrant vector search for each test query.

    For every entry of ``TEST_QUERIES`` an embedding is requested from the
    Ollama HTTP API (model ``nomic-embed-text``) and then sent as the query
    vector of a limit-10 search on the ``AccusysDB`` collection. Only the
    Qdrant request is timed; the embedding request is excluded.

    Returns:
        dict: Maps each query string to a dict with the keys ``"method"``,
        ``"ms"`` (elapsed milliseconds, rounded to 2 decimals) and ``"rows"``
        (number of entries in the response's ``"result"`` list).

    Raises:
        requests.HTTPError: If either service answers with an error status.
        requests.Timeout: If either service does not answer in time.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive service.
    request_timeout = 60  # seconds

    results = {}

    for query in TEST_QUERIES:
        # Get the query embedding from Ollama (not part of the measured time).
        embed_resp = requests.post(
            "http://localhost:11434/api/embeddings",
            json={"model": "nomic-embed-text", "prompt": query},
            timeout=request_timeout,
        )
        # Fail loudly here rather than with a KeyError on a missing "embedding".
        embed_resp.raise_for_status()
        embedding = embed_resp.json()["embedding"]

        # Search in Qdrant (using AccusysDB collection).
        # perf_counter() is monotonic, unlike time.time().
        start = time.perf_counter()
        resp = requests.post(
            "http://localhost:6333/collections/AccusysDB/points/search",
            headers={"api-key": "Test3200Test3200Test3200"},
            json={"vector": embedding, "limit": 10},
            timeout=request_timeout,
        )
        elapsed = (time.perf_counter() - start) * 1000

        # An error response has no usable "result"; do not report it as 0 rows.
        resp.raise_for_status()
        data = resp.json()
        result_count = len(data.get("result", []))

        results[query] = {
            "method": "Qdrant HNSW",
            "ms": round(elapsed, 2),
            "rows": result_count,
        }
        print(f"Qdrant vector search '{query}': {elapsed:.2f}ms, {result_count} rows")

    return results


def main():
    """Run both search benchmarks and print a per-query comparison table.

    Prints the number of sentence chunks in PostgreSQL, runs the PostgreSQL
    text-search test and the Qdrant vector-search test, then prints one
    summary row per entry of ``TEST_QUERIES``.
    """
    print("=" * 60)
    print("Search Performance Comparison Test")
    print("=" * 60)

    # Get chunk count
    import psycopg2

    conn = psycopg2.connect(**POSTGRES_CONFIG)
    try:
        cur = conn.cursor()
        try:
            cur.execute("SELECT COUNT(*) FROM chunks WHERE chunk_type = 'sentence'")
            count = cur.fetchone()[0]
        finally:
            cur.close()
    finally:
        # Close the connection even if the count query fails.
        conn.close()

    print(f"\nTotal sentence chunks: {count}")
    print("\n" + "=" * 60)
    print("A. Text Search Test (Priority a)")
    print("=" * 60)
    pg_results = test_postgres_text_search()

    print("\n" + "=" * 60)
    print("B. Vector Search Test (Priority b)")
    print("=" * 60)
    qdrant_results = test_qdrant_vector_search()

    print("\n" + "=" * 60)
    print("Summary")
    print("=" * 60)
    header = f"{'Query':<20} | {'PostgreSQL':<25} | {'Qdrant':<25}"
    print(f"\n{header}")
    # Size the rule from the header so the two cannot drift apart.
    print("-" * len(header))
    for query in TEST_QUERIES:
        pg = pg_results.get(query, {})
        qd = qdrant_results.get(query, {})
        # Build each cell first, then pad it to the header's column width so
        # the rows line up under the column titles.
        pg_cell = f"{pg.get('ms', 0):.1f}ms ({pg.get('rows', 0)} rows)"
        qd_cell = f"{qd.get('ms', 0):.1f}ms ({qd.get('rows', 0)} rows)"
        print(f"{query:<20} | {pg_cell:<25} | {qd_cell:<25}")


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()