## v0.9.20260325_144654 ### Features - API Key Authentication System - Job Worker System - V2 Backup Versioning ### Bug Fixes - get_processor_results_by_job column mapping Co-authored-by: OpenCode
157 lines
4.0 KiB
Python
157 lines
4.0 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Vector Search Test with nomic-embed-text:v1.5 using prefixes - detailed results
|
|
"""
|
|
|
|
import time
|
|
import requests
|
|
|
|
|
|
# Video identifier constant. It is not referenced by the code visible in this
# file; presumably kept for ad-hoc filtering -- confirm before removing.
VIDEO_UUID = "39567a0eb16f39fd"

# PostgreSQL connection settings. Not used by the code visible in this file.
# NOTE(review): the password is hard-coded in source; consider moving it out.
POSTGRES_CONFIG = dict(
    host="localhost",
    port=5432,
    user="accusys",
    password="Test3200",
    database="momentry",
)

# Embedding model name sent in the "model" field of embedding requests.
MODEL = "nomic-embed-text:v1.5"

# Name of the Qdrant collection that search requests are issued against.
QDRANT_COLLECTION = "chunks_v2"
|
|
|
|
|
|
def get_embedding(text, prefix="", timeout=30.0):
    """Return the embedding for ``prefix + text`` from the local embeddings API.

    The prompt is POSTed as JSON (together with the module-level ``MODEL``)
    to ``http://localhost:11434/api/embeddings``.

    Args:
        text: Text to embed.
        prefix: String prepended verbatim to ``text`` before embedding
            (``test_queries`` passes ``"search_query: "``).
        timeout: Seconds to wait for the HTTP request. Without a timeout
            ``requests`` would block indefinitely on an unresponsive server.

    Returns:
        The value of the ``"embedding"`` field of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        KeyError: If a successful response carries no ``"embedding"`` field.
    """
    prompt = f"{prefix}{text}"
    resp = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": MODEL, "prompt": prompt},
        timeout=timeout,
    )
    # Surface HTTP failures as such instead of an opaque KeyError on the
    # missing "embedding" field of an error body.
    resp.raise_for_status()
    return resp.json()["embedding"]
|
|
|
|
|
|
def _hit_text(hit, limit):
    """Return at most ``limit`` characters of a hit's payload text ("" if absent)."""
    # The payload (or its text) may be present but null in the JSON response,
    # so fall back explicitly rather than relying on dict.get defaults.
    payload = hit.get("payload") or {}
    return (payload.get("text") or "")[:limit]


def test_queries(queries, use_prefix=True, timeout=30.0):
    """Run each query against the Qdrant collection and print its top hits.

    For every query an embedding is obtained via ``get_embedding`` and a
    search (limit 3, with payload) is sent to the ``QDRANT_COLLECTION``
    collection on ``localhost:6333``. Only the search request is timed; the
    embedding call is excluded from the measurement.

    Args:
        queries: Iterable of query strings.
        use_prefix: When true, ``"search_query: "`` is prepended to each
            query before embedding; otherwise no prefix is used.
        timeout: Seconds to wait for each search request.

    Returns:
        A list with one dict per query, holding ``query``, ``time_ms``
        (search latency in milliseconds), ``top_score`` and ``top_text``
        (first 50 characters of the best hit's text). Queries with no hits,
        or whose search request failed, get a score of 0 and empty text.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    prefix = "search_query: " if use_prefix else ""
    # Loop-invariant request pieces are built once.
    search_url = f"http://localhost:6333/collections/{QDRANT_COLLECTION}/points/search"
    # NOTE(review): the API key is hard-coded in source; consider moving it out.
    headers = {
        "api-key": "Test3200Test3200Test3200",
        "Content-Type": "application/json",
    }
    results_data = []

    for query in queries:
        embedding = get_embedding(query, prefix)

        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() differences can.
        start = time.perf_counter()
        resp = requests.post(
            search_url,
            headers=headers,
            json={"vector": embedding, "limit": 3, "with_payload": True},
            timeout=timeout,
        )
        elapsed = (time.perf_counter() - start) * 1000

        # A failed search is reported explicitly but does not abort the run.
        results = (resp.json().get("result") or []) if resp.ok else []

        print(f"\nQuery: '{query}' ({elapsed:.1f}ms)")
        print("-" * 60)
        if not resp.ok:
            print(f" ! search failed: HTTP {resp.status_code}")
        for rank, hit in enumerate(results, start=1):
            score = hit.get("score", 0)
            text = _hit_text(hit, 60)
            print(f" {rank}. [{score:.3f}] {text}")

        top = results[0] if results else None
        results_data.append(
            {
                "query": query,
                "time_ms": elapsed,
                "top_score": top.get("score", 0) if top is not None else 0,
                "top_text": _hit_text(top, 50) if top is not None else "",
            }
        )

    return results_data
|
|
|
|
|
|
# Query phrases in English, grouped by theme. Order matters: the summary
# table printed by the script shows the first five entries.
ENGLISH_QUERIES = [
    # people and speech
    "a person talking",
    "someone speaking on camera",
    # setting
    "outdoor scene",
    "indoor setting",
    # actions
    "walking or moving",
    "dialogue or conversation",
    "looking at something",
    # mood and genre
    "happy or joyful",
    "serious or dramatic",
    "comedy or funny",
    # appearance, objects and posture
    "wearing a tie",
    "holding an object",
    "sitting on a chair",
    # places
    "city or urban",
    "building or room",
    "open space",
]

# Query phrases in Traditional Chinese covering the same themes as above
# (there is no counterpart for "someone speaking on camera").
CHINESE_QUERIES = [
    # people and speech
    "有人在說話",
    # setting
    "戶外場景",
    "室內場景",
    # actions
    "走路或移動",
    "對話或交談",
    "看著某樣東西",
    # mood and genre
    "快樂或開心",
    "嚴肅或戲劇性",
    "喜劇或有趣",
    # appearance, objects and posture
    "戴著領帶",
    "拿著東西",
    "坐在椅子上",
    # places
    "城市或都市",
    "建築物或房間",
    "開放空間",
]
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the English and Chinese query sets through
    # test_queries, then print average search latency and a table of the
    # first five results from each set.
    rule = "=" * 70

    def _section(title):
        """Print a blank line followed by a ruled section banner."""
        print(f"\n{rule}", title, rule, sep="\n")

    for line in (
        rule,
        f"Testing with {QDRANT_COLLECTION}",
        f"Model: {MODEL}",
        "Prefix for chunks: search_document:",
        "Prefix for queries: search_query:",
        rule,
    ):
        print(line)

    _section("ENGLISH QUERIES")
    en_results = test_queries(ENGLISH_QUERIES)

    _section("CHINESE QUERIES")
    zh_results = test_queries(CHINESE_QUERIES)

    _section("SUMMARY")

    # Both averages are computed before either is printed.
    mean_ms = [
        sum(row["time_ms"] for row in rows) / len(rows)
        for rows in (en_results, zh_results)
    ]
    print(f"\nEnglish avg time: {mean_ms[0]:.1f}ms")
    print(f"Chinese avg time: {mean_ms[1]:.1f}ms")

    print("\nTop results:")
    print(f"\n{'Query':<25} | {'Time':<8} | {'Score':<8} | {'Text'}")
    print("-" * 70)
    for position, rows in enumerate((en_results, zh_results)):
        # A single blank line separates the English rows from the Chinese rows.
        if position:
            print()
        for r in rows[:5]:
            print(
                f"{r['query']:<25} | {r['time_ms']:>5.1f}ms | {r['top_score']:.3f} | {r['top_text']}"
            )
|