Files
momentry_core/scripts/test_multilingual.py
accusys 383201cacd feat: Initial v0.9 release with API Key authentication
## v0.9.20260325_144654

### Features
- API Key Authentication System
- Job Worker System
- V2 Backup Versioning

### Bug Fixes
- get_processor_results_by_job column mapping

Co-authored-by: OpenCode
2026-03-25 14:53:41 +08:00

192 lines
4.9 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
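Multilingual vector search test using the nomic-embed-text-v2-moe embedding model.

Run with the single argument "sync" to embed the video's sentence chunks and
upload them to Qdrant; run with no argument to issue the English and Chinese
test queries against the collection and print the top matches.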
"""
import time
import requests
import psycopg2
import uuid
# Embedding model name sent in the "model" field of embedding requests, and
# the Qdrant collection name interpolated into the Qdrant request URLs.
MODEL = "nomic-embed-text-v2-moe"
QDRANT_COLLECTION = "chunks_v3"

# Value matched against chunks.uuid when selecting the rows to sync.
VIDEO_UUID = "39567a0eb16f39fd"

# Keyword arguments unpacked into psycopg2.connect().
# NOTE(review): credentials are hard-coded in source — consider loading them
# from the environment before this script is shared more widely.
POSTGRES_CONFIG = dict(
    host="localhost",
    port=5432,
    user="accusys",
    password="Test3200",
    database="momentry",
)
def get_embedding(text, prefix="", timeout=120):
    """Return the embedding for ``prefix + text``.

    Posts the prompt to the embeddings endpoint at localhost:11434 using the
    module-level ``MODEL`` and returns the ``"embedding"`` field of the JSON
    reply.

    Args:
        text: Text to embed.
        prefix: String prepended to ``text`` before it is sent (callers in
            this file pass ``"search_document: "`` or ``"search_query: "``).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The value stored under ``"embedding"`` in the response JSON.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: No response arrived within ``timeout`` seconds.
        KeyError: The response JSON has no ``"embedding"`` key.
    """
    resp = requests.post(
        "http://localhost:11434/api/embeddings",
        json={"model": MODEL, "prompt": f"{prefix}{text}"},
        # Without a timeout requests waits forever on a stalled server.
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as an opaque KeyError or
    # JSON decoding error on the error body.
    resp.raise_for_status()
    return resp.json()["embedding"]
def sync_to_qdrant():
    """Embed the video's sentence chunks and upload them to Qdrant.

    Selects every ``chunk_type = 'sentence'`` row whose ``uuid`` equals
    ``VIDEO_UUID`` from the ``chunks`` table, embeds each non-empty text with
    the ``"search_document: "`` prefix, and PUTs the resulting points to the
    ``QDRANT_COLLECTION`` collection in batches of 100. Progress is printed
    to stdout.

    The upload stops at the first batch that does not return HTTP 200: the
    start of the response body is printed and the function returns without
    printing "Done!".
    """
    # Read everything up front and release the database connection before the
    # slow embedding/upload work; try/finally guarantees the cursor and
    # connection are closed even if the query raises.
    conn = psycopg2.connect(**POSTGRES_CONFIG)
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                """
                SELECT chunk_id, content->>'text' as text, start_time, end_time, uuid
                FROM chunks
                WHERE uuid = %s AND chunk_type = 'sentence'
                ORDER BY chunk_index
                """,
                (VIDEO_UUID,),
            )
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    print(f"Syncing {len(rows)} chunks to Qdrant with {MODEL}")

    points = []
    for chunk_id, text, start_time, end_time, vid in rows:
        if not text:
            continue
        # Use search_document: prefix for chunks
        embedding = get_embedding(text, "search_document: ")
        # uuid5 is deterministic, so re-running the sync yields the same point
        # id for the same chunk_id.
        point_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, chunk_id))
        payload = {
            "uuid": vid,
            "chunk_id": chunk_id,
            "chunk_type": "sentence",
            "start_time": float(start_time),
            "end_time": float(end_time),
            # Only the first 200 characters are stored alongside the vector.
            "text": text[:200],
        }
        points.append({"id": point_id, "vector": embedding, "payload": payload})

    # Upload in batches
    batch_size = 100
    total_batches = (len(points) - 1) // batch_size + 1
    for i in range(0, len(points), batch_size):
        batch = points[i : i + batch_size]
        resp = requests.put(
            f"http://localhost:6333/collections/{QDRANT_COLLECTION}/points",
            headers={
                "api-key": "Test3200Test3200Test3200",
                "Content-Type": "application/json",
            },
            json={"points": batch},
            # Without a timeout requests waits forever on a stalled server.
            timeout=60,
        )
        if resp.status_code != 200:
            print(f"Error: {resp.text[:200]}")
            # Do not report success after a failed batch.
            return
        print(f"Uploaded batch {i // batch_size + 1}/{total_batches}")

    print("Done!")
def test_queries(queries, use_prefix=True):
    """Run each query against Qdrant and print its top three hits.

    For every query the text is embedded via ``get_embedding`` and posted to
    the ``points/search`` endpoint of ``QDRANT_COLLECTION`` with ``limit`` 3.
    The query, the search round-trip time in milliseconds (embedding time is
    not included), and each hit's score and first 60 characters of payload
    text are printed to stdout.

    Args:
        queries: Iterable of query strings.
        use_prefix: When true, ``"search_query: "`` is prepended to each query
            before embedding; otherwise no prefix is used.
    """
    prefix = "search_query: " if use_prefix else ""
    for query in queries:
        embedding = get_embedding(query, prefix)

        # perf_counter is monotonic and high-resolution, so the measured
        # latency cannot be skewed by wall-clock adjustments.
        start = time.perf_counter()
        resp = requests.post(
            f"http://localhost:6333/collections/{QDRANT_COLLECTION}/points/search",
            headers={
                "api-key": "Test3200Test3200Test3200",
                "Content-Type": "application/json",
            },
            json={"vector": embedding, "limit": 3, "with_payload": True},
            # Without a timeout requests waits forever on a stalled server.
            timeout=30,
        )
        elapsed = (time.perf_counter() - start) * 1000

        print(f"\nQuery: '{query}' ({elapsed:.1f}ms)")
        print("-" * 60)

        # Report a failed search instead of showing it as "no results".
        if resp.status_code != 200:
            print(f"Error: {resp.text[:200]}")
            continue

        # `or` guards against an explicit null for result/payload/text.
        results = resp.json().get("result") or []
        for rank, hit in enumerate(results, start=1):
            score = hit.get("score", 0)
            payload = hit.get("payload") or {}
            text = (payload.get("text") or "")[:60]
            print(f" {rank}. [{score:.3f}] {text}")
# English-language probe queries passed to test_queries() by the script's
# default (non-sync) mode. Order is preserved because results are printed in
# list order.
ENGLISH_QUERIES = [
    # people / speech
    "a person talking",
    "someone speaking on camera",
    # setting
    "outdoor scene",
    "indoor setting",
    # action
    "walking or moving",
    "dialogue or conversation",
    "looking at something",
    # mood / genre
    "happy or joyful",
    "serious or dramatic",
    "comedy or funny",
    # appearance / posture
    "wearing a tie",
    "holding an object",
    "sitting on a chair",
    # location
    "city or urban",
    "building or room",
    "open space",
]
# Chinese-language probe queries passed to test_queries() by the script's
# default (non-sync) mode. They parallel ENGLISH_QUERIES in order, except that
# "someone speaking on camera" has no counterpart here (15 entries vs. 16).
CHINESE_QUERIES = [
    # people / speech
    "有人在說話",
    # setting
    "戶外場景",
    "室內場景",
    # action
    "走路或移動",
    "對話或交談",
    "看著某樣東西",
    # mood / genre
    "快樂或開心",
    "嚴肅或戲劇性",
    "喜劇或有趣",
    # appearance / posture
    "戴著領帶",
    "拿著東西",
    "坐在椅子上",
    # location
    "城市或都市",
    "建築物或房間",
    "開放空間",
]
if __name__ == "__main__":
    import sys

    # One 60-character rule line reused by every banner below.
    rule = "=" * 60
    # True only when the first command-line argument is exactly "sync".
    sync_requested = sys.argv[1:2] == ["sync"]

    if sync_requested:
        # Sync mode: embed the chunks and upload them to Qdrant.
        print(rule)
        print(f"Syncing vectors to {QDRANT_COLLECTION}")
        print(f"Model: {MODEL}")
        print("Prefix for chunks: search_document:")
        print(rule)
        sync_to_qdrant()
    else:
        # Default mode: run both query sets against the collection.
        print(rule)
        print(f"Testing with {QDRANT_COLLECTION}")
        print(f"Model: {MODEL}")
        print("Prefix for queries: search_query:")
        print(rule)

        for heading, query_set in (
            ("ENGLISH QUERIES", ENGLISH_QUERIES),
            ("CHINESE QUERIES", CHINESE_QUERIES),
        ):
            print("\n" + rule)
            print(heading)
            print(rule)
            test_queries(query_set)