feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index Correction: asr-1.json format, generate/apply scripts API: 37/37 endpoints fixed and tested Docs: HANDOVER_V2.0.md for M4
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions
--- a/scripts/vectorize_4188.py
+++ b/scripts/vectorize_4188.py
@@ -0,0 +1,139 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Vectorize 4188 sentence chunks via EmbeddingGemma (768D) + rebuild Qdrant collections.
+"""
+import json, sys, time
+from urllib.request import Request, urlopen
+import psycopg2
+import urllib.request
+
+UUID = "aeed71342a899fe4b4c57b7d41bcb692"
+DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
+QDRANT_URL = "http://localhost:6333"
+EMBED_URL = "http://localhost:11436/v1/embeddings"
+COLLECTIONS = ["momentry_dev_v1", "sentence_story", "sentence_summary"]
+
+def call_embed(text):
+    body = json.dumps({"input": text}).encode()
+    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
+    resp = urlopen(req, timeout=30)
+    return json.loads(resp.read())["data"][0]["embedding"]
+
+print("=== Step 1: Load chunks ===")
+conn = psycopg2.connect(DB_URL)
+cur = conn.cursor()
+cur.execute("""
+    SELECT chunk_index, chunk_id, text_content, metadata->>'speaker_name',
+           start_time, end_time, metadata->>'speaker_id'
+    FROM dev.chunks
+    WHERE file_uuid=%s AND chunk_type='sentence'
+    ORDER BY chunk_index
+""", (UUID,))
+chunks = cur.fetchall()
+conn.close()
+print(f"Loaded {len(chunks)} chunks")
+
+print("\n=== Step 2: Vectorize (EmbeddingGemma 768D) ===")
+# Generate cleaned text for embedding: "Speaker: text" format
+texts_for_embed = []
+for r in chunks:
+    spk = r[3] or "Unknown"
+    txt = r[2] or ""
+    # Remove [Speaker] prefix if present
+    if txt.startswith("["):
+        txt = txt.split("]", 1)[-1].strip()
+    texts_for_embed.append(f"{spk}: \"{txt}\"")
+
+t0 = time.time()
+embeddings = []
+batch_size = 50
+for start in range(0, len(texts_for_embed), batch_size):
+    batch = texts_for_embed[start:start+batch_size]
+    # Try batch embed
+    body = json.dumps({"input": batch}).encode()
+    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
+    try:
+        resp = json.loads(urlopen(req, timeout=60).read())
+        batch_embs = [d["embedding"] for d in resp["data"]]
+    except:
+        # Fallback to single
+        batch_embs = []
+        for t in batch:
+            batch_embs.append(call_embed(t))
+    embeddings.extend(batch_embs)
+    
+    if (start // batch_size) % 10 == 0:
+        pct = (start + len(batch)) * 100 // len(texts_for_embed)
+        print(f"  {start+len(batch)}/{len(texts_for_embed)} ({pct}%) [{time.time()-t0:.0f}s]")
+
+elapsed = time.time() - t0
+print(f"  Done: {len(embeddings)} embeddings in {elapsed:.1f}s ({elapsed/len(embeddings):.2f}s each)")
+
+print("\n=== Step 3: Rebuild Qdrant collections ===")
+import time as time_module
+
+for col in COLLECTIONS:
+    # Delete
+    req = Request(f"{QDRANT_URL}/collections/{col}", method="DELETE")
+    try: urlopen(req); time_module.sleep(0.3)
+    except: pass
+    
+    # Create
+    req = Request(f"{QDRANT_URL}/collections/{col}",
+        data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
+        headers={"Content-Type": "application/json"}, method="PUT")
+    urlopen(req)
+    time_module.sleep(0.3)
+    print(f"  Created {col}")
+
+# Upload
+print("\n=== Step 4: Upload points ===")
+batch_size = 100
+for col in COLLECTIONS:
+    points = []
+    for i, r in enumerate(chunks):
+        idx = r[0]
+        cid = r[1]
+        spk_name = r[3] or "Unknown"
+        spk_id = r[6] or "Unknown"
+        txt = r[2] or ""
+        st = r[4]
+        et = r[5]
+        
+        payload = {
+            "chunk_type": "sentence", "uuid": UUID,
+            "chunk_id": cid, "start_time": st, "end_time": et,
+            "speaker_name": spk_name, "speaker_id": spk_id,
+        }
+        if col == "momentry_dev_v1":
+            payload["text"] = txt
+        elif col == "sentence_story":
+            payload["text"] = txt
+        elif col == "sentence_summary":
+            payload["summary"] = txt
+        
+        points.append({
+            "id": idx + 1,
+            "vector": embeddings[i],
+            "payload": payload,
+        })
+    
+    for start in range(0, len(points), batch_size):
+        batch = points[start:start+batch_size]
+        req = Request(f"{QDRANT_URL}/collections/{col}/points?wait=true",
+            data=json.dumps({"points": batch}).encode(),
+            headers={"Content-Type": "application/json"}, method="PUT")
+        try: urlopen(req)
+        except Exception as e: print(f"  {col} batch {start}: {e}")
+        if (start // batch_size) % 5 == 0:
+            print(f"  {col}: {start+len(batch)}/{len(points)}")
+    print(f"  {col}: done")
+
+# Verify
+print("\n=== Verify ===")
+for col in COLLECTIONS:
+    resp = json.loads(urlopen(f"{QDRANT_URL}/collections/{col}").read())
+    info = resp["result"]
+    print(f"  {col}: {info['points_count']} pts, {info['config']['params']['vectors'].get('size','?')}D")
+
+print("\n=== Done ===")