feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
This commit is contained in:
139
scripts/vectorize_4188.py
Normal file
139
scripts/vectorize_4188.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Vectorize 4188 sentence chunks via EmbeddingGemma (768D) + rebuild Qdrant collections.
|
||||
"""
|
||||
import json, sys, time
|
||||
from urllib.request import Request, urlopen
|
||||
import psycopg2
|
||||
import urllib.request
|
||||
|
||||
# --- Job configuration -------------------------------------------------------

# file_uuid of the recording whose sentence chunks are vectorized.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"

# PostgreSQL connection string passed to psycopg2.connect().
# NOTE(review): `host=/tmp` presumably selects a Unix-socket directory — confirm.
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"

# Base URL of the Qdrant REST API.
QDRANT_URL = "http://localhost:6333"

# Embeddings endpoint that receives {"input": ...} JSON bodies.
EMBED_URL = "http://localhost:11436/v1/embeddings"

# Qdrant collections that are dropped, recreated and refilled by this script.
COLLECTIONS = [
    "momentry_dev_v1",
    "sentence_story",
    "sentence_summary",
]
|
||||
|
||||
def call_embed(text):
    """Return the embedding vector for a single piece of text.

    POSTs ``{"input": text}`` as JSON to ``EMBED_URL`` and returns the
    ``embedding`` field of the first entry in the response's ``data`` list.

    Args:
        text: The string to embed.

    Returns:
        The value of ``data[0]["embedding"]`` from the decoded JSON response.

    Raises:
        Any exception raised by ``urlopen`` (network error, HTTP error,
        30-second timeout) or by decoding/indexing the response propagates
        to the caller unchanged.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Use the response as a context manager so the connection is released
    # even if reading or decoding fails.
    with urlopen(req, timeout=30) as resp:
        decoded = json.loads(resp.read())
    return decoded["data"][0]["embedding"]
|
||||
|
||||
print("=== Step 1: Load chunks ===")
# Each fetched row is a tuple laid out as:
#   r[0] chunk_index, r[1] chunk_id, r[2] text_content, r[3] speaker_name,
#   r[4] start_time,  r[5] end_time, r[6] speaker_id
# NOTE(review): the commit message shipped with this file mentions renaming
# dev.chunks -> dev.chunk and removing chunk_index; confirm this query still
# matches the live schema.
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()
    cur.execute("""
        SELECT chunk_index, chunk_id, text_content, metadata->>'speaker_name',
               start_time, end_time, metadata->>'speaker_id'
        FROM dev.chunks
        WHERE file_uuid=%s AND chunk_type='sentence'
        ORDER BY chunk_index
    """, (UUID,))
    chunks = cur.fetchall()
finally:
    # Always release the connection, including when the query fails.
    conn.close()
print(f"Loaded {len(chunks)} chunks")
|
||||
|
||||
print("\n=== Step 2: Vectorize (EmbeddingGemma 768D) ===")
# Build the string sent to the embedder for every chunk: 'Speaker: "text"'.
texts_for_embed = []
for r in chunks:
    spk = r[3] or "Unknown"
    txt = r[2] or ""
    # Remove a leading "[Speaker]" prefix if present. When there is no closing
    # "]", split() returns the whole text, which is kept (only stripped).
    if txt.startswith("["):
        txt = txt.split("]", 1)[-1].strip()
    texts_for_embed.append(f"{spk}: \"{txt}\"")

t0 = time.time()
embeddings = []
batch_size = 50
for start in range(0, len(texts_for_embed), batch_size):
    batch = texts_for_embed[start:start + batch_size]
    # Try to embed the whole batch in one request.
    body = json.dumps({"input": batch}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        with urlopen(req, timeout=60) as http_resp:
            resp = json.loads(http_resp.read())
        # NOTE(review): assumes resp["data"] is in the same order as the
        # inputs — confirm against the embedding server.
        batch_embs = [d["embedding"] for d in resp["data"]]
        if len(batch_embs) != len(batch):
            # A short/long reply would silently misalign vectors and chunks.
            raise ValueError(
                f"expected {len(batch)} embeddings, got {len(batch_embs)}"
            )
    except Exception as exc:
        # Fallback to one request per text. `Exception` (not a bare except)
        # so Ctrl-C / SystemExit still stop the script.
        print(f"  Batch at {start} failed ({exc}); retrying one text at a time")
        batch_embs = [call_embed(t) for t in batch]
    embeddings.extend(batch_embs)

    # Progress line every 10th batch.
    if (start // batch_size) % 10 == 0:
        pct = (start + len(batch)) * 100 // len(texts_for_embed)
        print(f"  {start+len(batch)}/{len(texts_for_embed)} ({pct}%) [{time.time()-t0:.0f}s]")

elapsed = time.time() - t0
# Guard the average against an empty chunk list (would divide by zero).
per_item = elapsed / len(embeddings) if embeddings else 0.0
print(f"  Done: {len(embeddings)} embeddings in {elapsed:.1f}s ({per_item:.2f}s each)")
|
||||
|
||||
print("\n=== Step 3: Rebuild Qdrant collections ===")
# Drop and recreate every collection so it starts empty with 768-dimensional
# cosine vectors. Uses the module-level `time` import; the former
# `import time as time_module` alias was a redundant re-import.
for col in COLLECTIONS:
    # Delete (best-effort: a failure here is reported but does not abort,
    # e.g. when the collection does not exist yet).
    req = Request(f"{QDRANT_URL}/collections/{col}", method="DELETE")
    try:
        with urlopen(req):
            pass
        # NOTE(review): the short pause presumably lets Qdrant settle between
        # delete and create — confirm it is still needed.
        time.sleep(0.3)
    except Exception as exc:
        print(f"  Delete {col} skipped: {exc}")

    # Create. Errors here propagate: the upload step cannot work without it.
    req = Request(
        f"{QDRANT_URL}/collections/{col}",
        data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
        headers={"Content-Type": "application/json"},
        method="PUT",
    )
    with urlopen(req):
        pass
    time.sleep(0.3)
    print(f"  Created {col}")
|
||||
|
||||
# Upload
print("\n=== Step 4: Upload points ===")
# Fail fast if vectors and rows are out of step; otherwise points would be
# uploaded with the wrong vector or the loop would die part-way through.
if len(embeddings) != len(chunks):
    raise RuntimeError(
        f"embedding count {len(embeddings)} != chunk count {len(chunks)}"
    )

# Payload key under which each collection stores the chunk text.
text_field_by_collection = {
    "momentry_dev_v1": "text",
    "sentence_story": "text",
    "sentence_summary": "summary",
}

batch_size = 100
for col in COLLECTIONS:
    text_field = text_field_by_collection.get(col)

    # One point per chunk; row layout is
    # (chunk_index, chunk_id, text_content, speaker_name,
    #  start_time, end_time, speaker_id).
    points = []
    for r, vector in zip(chunks, embeddings):
        idx = r[0]
        payload = {
            "chunk_type": "sentence", "uuid": UUID,
            "chunk_id": r[1], "start_time": r[4], "end_time": r[5],
            "speaker_name": r[3] or "Unknown", "speaker_id": r[6] or "Unknown",
        }
        if text_field is not None:
            payload[text_field] = r[2] or ""

        points.append({
            "id": idx + 1,  # point id is chunk_index + 1
            "vector": vector,
            "payload": payload,
        })

    failed_batches = 0
    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        req = Request(
            f"{QDRANT_URL}/collections/{col}/points?wait=true",
            data=json.dumps({"points": batch}).encode(),
            headers={"Content-Type": "application/json"},
            method="PUT",
        )
        try:
            with urlopen(req):
                pass
        except Exception as e:
            # Keep going with the remaining batches, but remember the failure
            # so the summary line below does not claim a clean upload.
            failed_batches += 1
            print(f"  {col} batch {start}: {e}")
        # Progress line every 5th batch.
        if (start // batch_size) % 5 == 0:
            print(f"  {col}: {start+len(batch)}/{len(points)}")

    if failed_batches:
        print(f"  {col}: done with {failed_batches} failed batch(es)")
    else:
        print(f"  {col}: done")
|
||||
|
||||
# Verify
print("\n=== Verify ===")
# Ask Qdrant for each collection's info and print the point count and the
# configured vector size reported under result.config.params.vectors.
for col in COLLECTIONS:
    # Context manager closes the HTTP response once the body has been read.
    with urlopen(f"{QDRANT_URL}/collections/{col}") as http_resp:
        resp = json.loads(http_resp.read())
    info = resp["result"]
    print(f"  {col}: {info['points_count']} pts, {info['config']['params']['vectors'].get('size','?')}D")

print("\n=== Done ===")
|
||||
Reference in New Issue
Block a user