feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks -> dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions
--- a/scripts/clean_sentence_text.py
+++ b/scripts/clean_sentence_text.py
@@ -0,0 +1,173 @@
+#!/opt/homebrew/bin/python3.11
+"""
+LLM-clean all 4188 sentence texts, re-embed, update momentry_dev_v1 + sentence_story.
+"""
+import json, time, os
+from urllib.request import Request, urlopen
+import psycopg2
+
+UUID = "aeed71342a899fe4b4c57b7d41bcb692"
+DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
+QDRANT_URL = "http://localhost:6333"
+LLM_URL = "http://localhost:8082/v1/chat/completions"
+EMBED_URL = "http://localhost:11436/v1/embeddings"
+CHECKPOINT = f"/tmp/sentence_clean_{UUID}.json"
+
+def call_llm(prompt):
+    """Send ``prompt`` to the chat-completions endpoint at LLM_URL.
+
+    The request carries a single user message with temperature 0.1 and
+    max_tokens 80. Returns the message content of the first choice with
+    surrounding whitespace stripped. Network errors, the 30 s timeout and
+    unexpected response shapes propagate to the caller.
+    """
+    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": 0.1, "max_tokens": 80}).encode()
+    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
+    # Close the HTTP response deterministically instead of leaving the
+    # socket to the garbage collector.
+    with urlopen(req, timeout=30) as resp:
+        payload = json.loads(resp.read())
+    return payload["choices"][0]["message"]["content"].strip()
+
+def call_embed(text):
+    """Request an embedding for ``text`` from the endpoint at EMBED_URL.
+
+    Returns the ``embedding`` field of the first item in the response's
+    ``data`` list. Network errors, the 30 s timeout and unexpected
+    response shapes propagate to the caller.
+    """
+    body = json.dumps({"input": text}).encode()
+    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
+    # Close the HTTP response deterministically instead of leaving the
+    # socket to the garbage collector.
+    with urlopen(req, timeout=30) as resp:
+        payload = json.loads(resp.read())
+    return payload["data"][0]["embedding"]
+
+print("=== Step 1: Load all sentences ===")
+# NOTE(review): the commit message accompanying this script mentions a
+# dev.chunks -> dev.chunk rename; confirm the table name in the query below.
+conn = psycopg2.connect(DB_URL)
+try:
+    cur = conn.cursor()
+    # Fetch every sentence-type chunk of the target file, ordered by id.
+    cur.execute("""
+        SELECT id, chunk_id, text_content
+        FROM dev.chunks
+        WHERE file_uuid = %s AND chunk_type = 'sentence'
+        ORDER BY id
+    """, (UUID,))
+    rows = cur.fetchall()
+finally:
+    # Close the connection even if the query raises, so it is never leaked.
+    conn.close()
+print(f"Loaded {len(rows)} sentences")
+
+# Reset checkpoint (incompatible with old chunk_index format)
+if os.path.exists(CHECKPOINT):
+    os.remove(CHECKPOINT)
+    print("Old checkpoint removed (format changed)")
+
+results = []
+errors = 0
+
+print("\n=== Step 2: LLM clean + embed ===")
+# For each sentence: ask the LLM for a cleaned line, embed the cleaned text,
+# and collect both. If either call fails, the original text is kept with an
+# all-zero placeholder vector and the failure is counted in `errors`.
+for i, (cid, chunk_id, text_content) in enumerate(rows):
+    input_text = text_content
+
+    prompt = f"""Clean this movie dialogue line. Fix truncated words, capitalize, add punctuation.
+Return: SPEAKER: "clean text"
+
+Input: [Cary Grant] can't you do something constructive like start
+Return: Cary Grant: "Can't you do something constructive like start?"
+
+Input: [Audrey Hepburn] qui se présente influence d'une manière vitale la proposition l
+Return: Audrey Hepburn: "Qui se présente influence d'une manière vitale la proposition..."
+
+Input: {input_text}
+Return:"""
+
+    try:
+        cleaned = call_llm(prompt)
+        embedding = call_embed(cleaned)
+        time.sleep(0.1)
+    except Exception as e:
+        print(f"  [{i+1}/{len(rows)}] id={cid} chunk={chunk_id} ERROR: {e}")
+        cleaned = input_text
+        # Placeholder with the same length (768) as the collections' vector size.
+        # NOTE(review): the collections use Cosine distance; confirm that an
+        # all-zero vector is acceptable to Qdrant for these failed rows.
+        embedding = [0.0] * 768
+        errors += 1
+
+    entry = {
+        "index": i,
+        "chunk_id": chunk_id,
+        "original": input_text,
+        "cleaned": cleaned,
+        "embedding": embedding,
+    }
+    results.append(entry)
+    # Record the last processed index; the handle is closed explicitly
+    # rather than leaving one open file object per iteration.
+    with open(CHECKPOINT, "w") as ckpt:
+        json.dump({"last": i}, ckpt)
+
+    if (i + 1) % 50 == 0:
+        print(f"  [{i+1}/{len(rows)}] chunk={chunk_id} errors={errors}")
+
+# Entries are appended in index order already; the sort keeps that explicit.
+results.sort(key=lambda x: x["index"])
+
+print(f"\nDone: {len(results)} cleaned, {errors} errors")
+
+print("\n=== Step 3: Rebuild momentry_dev_v1 ===")
+# Delete old (best effort: a failure is reported and the rebuild continues).
+# `except Exception` instead of a bare `except` so Ctrl-C still aborts.
+req = Request(f"{QDRANT_URL}/collections/momentry_dev_v1", method="DELETE")
+try:
+    urlopen(req).close()
+    time.sleep(0.5)
+except Exception as e:
+    print(f"  delete momentry_dev_v1 skipped: {e}")
+
+# Recreate the collection with 768-dimensional vectors and Cosine distance.
+req = Request(f"{QDRANT_URL}/collections/momentry_dev_v1",
+    data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
+    headers={"Content-Type": "application/json"}, method="PUT")
+urlopen(req).close()
+time.sleep(0.5)
+
+batch_size = 100
+# One point per cleaned sentence; ids are 1-based positions in `results`.
+points = [
+    {
+        "id": pi + 1,
+        "vector": r["embedding"],
+        "payload": {
+            "chunk_type": "sentence",
+            "uuid": UUID,
+            "chunk_id": r["chunk_id"],
+            "text": r["cleaned"],
+            "original": r["original"],
+        },
+    }
+    for pi, r in enumerate(results)
+]
+
+# Upload in batches; a failed batch is reported and the remaining ones still run.
+for start in range(0, len(points), batch_size):
+    batch = points[start:start+batch_size]
+    req = Request(f"{QDRANT_URL}/collections/momentry_dev_v1/points?wait=true",
+        data=json.dumps({"points": batch}).encode(),
+        headers={"Content-Type": "application/json"}, method="PUT")
+    try:
+        urlopen(req).close()
+    except Exception as e:
+        print(f"  batch {start}: {e}")
+    if (start // batch_size) % 5 == 0:
+        print(f"  momentry_dev_v1: {start+len(batch)}/{len(points)}")
+
+print("  momentry_dev_v1 done")
+
+print("\n=== Step 4: Rebuild sentence_story ===")
+# Delete old (best effort: a failure is reported and the rebuild continues).
+# `except Exception` instead of a bare `except` so Ctrl-C still aborts.
+req = Request(f"{QDRANT_URL}/collections/sentence_story", method="DELETE")
+try:
+    urlopen(req).close()
+    time.sleep(0.5)
+except Exception as e:
+    print(f"  delete sentence_story skipped: {e}")
+
+# Recreate the collection with 768-dimensional vectors and Cosine distance.
+req = Request(f"{QDRANT_URL}/collections/sentence_story",
+    data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
+    headers={"Content-Type": "application/json"}, method="PUT")
+urlopen(req).close()
+time.sleep(0.5)
+
+# One point per cleaned sentence; ids are 1-based positions in `results`.
+# The payload here carries the cleaned text only (no "original" field).
+story_points = [
+    {
+        "id": pi + 1,
+        "vector": r["embedding"],
+        "payload": {
+            "chunk_type": "sentence",
+            "uuid": UUID,
+            "chunk_id": r["chunk_id"],
+            "text": r["cleaned"],
+        },
+    }
+    for pi, r in enumerate(results)
+]
+
+# Upload in batches; a failed batch is reported and the remaining ones still run.
+for start in range(0, len(story_points), batch_size):
+    batch = story_points[start:start+batch_size]
+    req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
+        data=json.dumps({"points": batch}).encode(),
+        headers={"Content-Type": "application/json"}, method="PUT")
+    try:
+        urlopen(req).close()
+    except Exception as e:
+        print(f"  batch {start}: {e}")
+    if (start // batch_size) % 5 == 0:
+        print(f"  sentence_story: {start+len(batch)}/{len(story_points)}")
+
+print("  sentence_story done")
+
+# Verify: read each collection's info back from Qdrant and print its
+# point count and configured vector size.
+for col in ("momentry_dev_v1", "sentence_story"):
+    raw = urlopen(f"{QDRANT_URL}/collections/{col}").read()
+    info = json.loads(raw)["result"]
+    dims = info['config']['params']['vectors'].get('size', '?')
+    print(f"Verified {col}: {info['points_count']} pts, {dims}D")
+
+print("\n=== Done ===")