Files
momentry_core/scripts/natural_language_top10.py
accusys 383201cacd feat: Initial v0.9 release with API Key authentication
## v0.9.20260325_144654

### Features
- API Key Authentication System
- Job Worker System
- V2 Backup Versioning

### Bug Fixes
- get_processor_results_by_job column mapping

Co-authored-by: OpenCode
2026-03-25 14:53:41 +08:00

170 lines
4.4 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Natural Language Vector Search - Show Top 10 Results
"""
import os
import time

import psycopg2
import requests
# NOTE(review): VIDEO_UUID is not referenced anywhere in the visible code;
# confirm whether it is still needed.
VIDEO_UUID = "39567a0eb16f39fd"

# Keyword arguments passed to psycopg2.connect(**POSTGRES_CONFIG).
# Every value can be overridden with the matching standard libpq environment
# variable so that credentials do not have to live in source control; the
# fallbacks reproduce the previous hard-coded development settings.
POSTGRES_CONFIG = {
    "host": os.environ.get("PGHOST", "localhost"),
    "port": int(os.environ.get("PGPORT", "5432")),
    "user": os.environ.get("PGUSER", "accusys"),
    "password": os.environ.get("PGPASSWORD", "Test3200"),
    "database": os.environ.get("PGDATABASE", "momentry"),
}

# Free-text prompts that are embedded and searched against both backends.
NATURAL_LANGUAGE_QUERIES = [
    "a person talking",
    "someone speaking on camera",
    "outdoor scene",
    "indoor setting",
    "walking or moving",
    "dialogue or conversation",
    "looking at something",
    "happy or joyful",
    "serious or dramatic",
    "comedy or funny",
    "wearing a tie",
    "holding an object",
    "sitting on a chair",
    "city or urban",
    "building or room",
    "open space",
]
def get_embedding(text, timeout=60.0):
    """Return the embedding for *text* from the local embeddings endpoint.

    Posts ``text`` as the prompt to ``http://localhost:11434/api/embeddings``
    with the ``nomic-embed-text`` model and returns the ``"embedding"`` field
    of the JSON reply.

    Args:
        text: The string to embed.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled server would hang the script forever.

    Returns:
        The value stored under ``"embedding"`` in the response JSON.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the reply contains no ``"embedding"`` field.
    """
    resp = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": "nomic-embed-text", "prompt": text},
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as an opaque KeyError below.
    resp.raise_for_status()
    return resp.json()["embedding"]
def test_qdrant(queries):
    """Run each query against the Qdrant search endpoint and time the request.

    For every query the text is embedded with ``get_embedding`` and the
    vector is posted to the ``AccusysDB`` collection's ``points/search``
    endpoint with a limit of 10. Only the search request is timed; computing
    the embedding is excluded.

    Args:
        queries: Iterable of query strings.

    Returns:
        A dict mapping each query to ``{"ms": <elapsed milliseconds rounded
        to 2 places>, "results": <the "result" list of the JSON reply, or []
        when that key is absent>}``.

    Raises:
        requests.HTTPError: If the search endpoint answers with a 4xx/5xx
            status, so a failed request is not reported as an empty hit list
            with a valid-looking latency.
        requests.Timeout: If the search request takes longer than 60 seconds.
    """
    results = {}
    for query in queries:
        embedding = get_embedding(query)
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start = time.perf_counter()
        resp = requests.post(
            "http://localhost:6333/collections/AccusysDB/points/search",
            # NOTE(review): API key is hard-coded; consider moving it out of source.
            headers={"api-key": "Test3200Test3200Test3200"},
            json={"vector": embedding, "limit": 10},
            timeout=60,
        )
        elapsed_ms = (time.perf_counter() - start) * 1000
        resp.raise_for_status()
        data = resp.json()
        results[query] = {
            "ms": round(elapsed_ms, 2),
            "results": data.get("result", []),
        }
    return results
def test_pgvector(queries):
    """Run each query as a pgvector distance search and time it.

    Opens one PostgreSQL connection using ``POSTGRES_CONFIG``, and for every
    query embeds the text with ``get_embedding``, then selects the 10 rows of
    ``chunk_vectors`` (joined to ``chunks``) with the smallest ``<=>``
    distance to that vector. The timed span covers ``execute`` plus
    ``fetchall``; computing the embedding is excluded.

    Args:
        queries: Iterable of query strings.

    Returns:
        A dict mapping each query to ``{"ms": <elapsed milliseconds rounded
        to 2 places>, "results": [{"chunk_id", "score", "text"}, ...]}``
        where ``score`` is ``1 - distance``.

    Raises:
        psycopg2.Error: If connecting or executing the query fails.
    """
    results = {}
    conn = psycopg2.connect(**POSTGRES_CONFIG)
    try:
        cur = conn.cursor()
        try:
            for query in queries:
                embedding = get_embedding(query)
                # Text form of a vector literal: "[v1,v2,...]", cast in SQL.
                vector_str = "[" + ",".join(str(x) for x in embedding) + "]"
                # perf_counter is monotonic; time.time() can jump with clock changes.
                start = time.perf_counter()
                cur.execute(
                    """
                    SELECT cv.chunk_id, (cv.embedding_vector <=> %s::vector) as distance,
                    c.content->>'text' as text
                    FROM chunk_vectors cv
                    JOIN chunks c ON cv.chunk_id = c.chunk_id
                    WHERE cv.embedding_vector IS NOT NULL
                    ORDER BY cv.embedding_vector <=> %s::vector
                    LIMIT 10
                    """,
                    (vector_str, vector_str),
                )
                rows = cur.fetchall()
                elapsed_ms = (time.perf_counter() - start) * 1000
                results[query] = {
                    "ms": round(elapsed_ms, 2),
                    "results": [
                        {"chunk_id": row[0], "score": 1 - row[1], "text": row[2]}
                        for row in rows
                    ],
                }
        finally:
            # Release the cursor even when an embedding or query call raises.
            cur.close()
    finally:
        # Always close the connection so a failure does not leak it.
        conn.close()
    return results
def _truncate(text, limit=70):
    """Return *text* shortened to *limit* characters plus "..."; "" if falsy."""
    if not text:
        return ""
    if len(text) > limit:
        return text[:limit] + "..."
    return text


def _average_ms(results):
    """Return the mean of the "ms" values in *results*, or 0.0 when empty."""
    if not results:
        return 0.0
    return sum(entry["ms"] for entry in results.values()) / len(results)


def main():
    """Run every query against both backends and print timings and top hits.

    Executes ``test_qdrant`` and ``test_pgvector`` over
    ``NATURAL_LANGUAGE_QUERIES``, prints the average response time of each
    backend, then prints up to 10 scored results per query for each backend.
    """
    print("=" * 80)
    print("NATURAL LANGUAGE VECTOR SEARCH - TOP 10 RESULTS")
    print("=" * 80)
    print("\nVideo: Charade 1963")
    print("Model: nomic-embed-text\n")

    # Run tests
    print("Running Qdrant searches...")
    qdrant_results = test_qdrant(NATURAL_LANGUAGE_QUERIES)
    print("Running pgvector searches...")
    pgvector_results = test_pgvector(NATURAL_LANGUAGE_QUERIES)

    # Calculate averages (guarded so an empty query list cannot divide by zero)
    qdrant_avg = _average_ms(qdrant_results)
    pgvector_avg = _average_ms(pgvector_results)
    print("\n" + "=" * 80)
    print("AVERAGE RESPONSE TIME")
    print("=" * 80)
    print(f" Qdrant: {qdrant_avg:.2f}ms")
    print(f" pgvector: {pgvector_avg:.2f}ms")

    # Show detailed results for each query
    print("\n" + "=" * 80)
    print("DETAILED RESULTS")
    print("=" * 80)
    for query in NATURAL_LANGUAGE_QUERIES:
        qd = qdrant_results[query]
        pg = pgvector_results[query]
        pg_hits = pg["results"]

        print(f"\n{'=' * 60}")
        print(f'Query: "{query}"')
        print(f"{'=' * 60}")

        print(f"\n[Qdrant] Time: {qd['ms']:.1f}ms")
        print("-" * 60)
        for i, hit in enumerate(qd["results"][:10]):
            # The text shown beside a Qdrant hit is taken from the pgvector
            # hit at the same rank, not from the Qdrant response itself.
            # NOTE(review): this is only accurate if both backends rank the
            # same chunks in the same order; confirm or fetch the text
            # from the Qdrant payload instead.
            text = pg_hits[i]["text"] if i < len(pg_hits) else ""
            print(f" {i + 1:2}. [{hit['score']:.3f}] {_truncate(text)}")

        print(f"\n[pgvector] Time: {pg['ms']:.1f}ms")
        print("-" * 60)
        for i, hit in enumerate(pg_hits[:10]):
            print(f" {i + 1:2}. [{hit['score']:.3f}] {_truncate(hit['text'])}")
        print()


if __name__ == "__main__":
    main()