feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index Correction: asr-1.json format, generate/apply scripts API: 37/37 endpoints fixed and tested Docs: HANDOVER_V2.0.md for M4
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions
--- a/scripts/rebuild_story_content.py
+++ b/scripts/rebuild_story_content.py
@@ -0,0 +1,320 @@
+#!/opt/homebrew/bin/python3.11
+"""
+Rebuild story chunk text_content and regenerates summaries using new ASRX speaker assignments.
+Then updates Qdrant momentry_dev_stories and sentence_story/sentence_summary collections.
+"""
+
+import json, sys, time, urllib.request
+from urllib.request import Request, urlopen
+import psycopg2
+
+UUID = "aeed71342a899fe4b4c57b7d41bcb692"
+DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
+QDRANT_URL = "http://localhost:6333"
+LLM_URL = "http://localhost:8082/v1/chat/completions"
+EMBED_URL = "http://localhost:11436/v1/embeddings"
+
+def call_llm(dialogue_text):
+    prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
+    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": 0.1, "max_tokens": 100}).encode()
+    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
+    try:
+        resp = urlopen(req, timeout=120)
+        return json.loads(resp.read())["choices"][0]["message"]["content"].strip()
+    except Exception as e:
+        print(f"    LLM error: {e}")
+        return ""
+
+def call_embed(text):
+    body = json.dumps({"input": text}).encode()
+    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
+    try:
+        resp = urlopen(req, timeout=30)
+        return json.loads(resp.read())["data"][0]["embedding"]
+    except Exception as e:
+        print(f"    Embed error: {e}")
+        return [0.0] * 768
+
+print("=== Step 1: Load sentence chunks with new speaker info ===")
+conn = psycopg2.connect(DB_URL)
+cur = conn.cursor()
+
+cur.execute("""
+    SELECT chunk_index, text_content, metadata->>'new_speaker_name',
+           metadata->>'speaker_name', content
+    FROM dev.chunks
+    WHERE file_uuid = %s AND chunk_type = 'sentence'
+    ORDER BY chunk_index
+""", (UUID,))
+sentence_rows = cur.fetchall()
+print(f"Loaded {len(sentence_rows)} sentence chunks")
+
+# Build lookup
+sentences = {}
+for r in sentence_rows:
+    idx, old_text, new_name, old_name, content = r
+    sentences[idx] = {
+        "old_text": old_text or "",
+        "new_name": new_name or old_name or "Unknown",
+        "old_name": old_name or "Unknown",
+        "content": content or {},
+    }
+
+# Rebuild sentence text_content with new speaker names
+print("\n=== Step 2: Rebuild sentence text_content ===")
+updated_sentences = 0
+for r in sentence_rows:
+    idx, old_text, new_name, old_name, content = r
+    new_name = new_name or old_name or "Unknown"
+    
+    # Extract the text part (remove old speaker prefix if exists)
+    raw_text = ""
+    if content and isinstance(content, dict):
+        raw_text = content.get("data", {}).get("text", "")
+    if not raw_text and old_text:
+        # Parse old format: [Speaker] text
+        import re
+        m = re.search(r'\]\s*(.*)', old_text)
+        if m:
+            raw_text = m.group(1)
+        else:
+            raw_text = old_text
+    
+    new_text = f"[{new_name}] {raw_text}"
+    
+    cur.execute("""
+        UPDATE dev.chunks
+        SET text_content = %s, updated_at = NOW()
+        WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
+    """, (new_text, UUID, idx))
+    updated_sentences += 1
+
+conn.commit()
+print(f"Updated {updated_sentences} sentence chunks text_content")
+
+print("\n=== Step 3: Rebuild story chunk text_content ===")
+cur.execute("""
+    SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time,
+           text_content, summary_text
+    FROM dev.chunks
+    WHERE file_uuid = %s AND chunk_type = 'story'
+    ORDER BY chunk_index
+""", (UUID,))
+story_rows = cur.fetchall()
+print(f"Loaded {len(story_rows)} story chunks")
+
+# Build child text per story chunk
+story_dialogue_texts = []
+for r in story_rows:
+    db_id, cid, idx, child_ids, st, et, old_text, old_summary = r
+    
+    dialogue_parts = []
+    for child_cid in (child_ids or []):
+        parts = child_cid.split("_")
+        child_idx = int(parts[-1])
+        if child_idx in sentences:
+            s = sentences[child_idx]
+            raw = ""
+            if s["content"] and isinstance(s["content"], dict):
+                raw = s["content"].get("data", {}).get("text", "")
+            if not raw:
+                import re
+                m = re.search(r'\]\s*(.*)', s["old_text"])
+                if m:
+                    raw = m.group(1)
+                else:
+                    raw = s["old_text"]
+            if raw:
+                dialogue_parts.append(f'({s["new_name"]}) {raw}')
+    
+    dialogue_text = " ".join(dialogue_parts)
+    story_dialogue_texts.append((db_id, cid, idx, st, et, dialogue_text, old_summary))
+
+print(f"Built {len(story_dialogue_texts)} story dialogue texts")
+
+# Update DB with new text_content (dialogue only, not summary yet)
+for item in story_dialogue_texts:
+    db_id, cid, idx, st, et, dialogue_text, old_summary = item
+    cur.execute("""
+        UPDATE dev.chunks
+        SET text_content = %s, updated_at = NOW()
+        WHERE id = %s
+    """, (dialogue_text, db_id))
+
+conn.commit()
+print("Updated story chunk dialogue texts")
+
+print("\n=== Step 4: Generate LLM summaries (all 228 stories) ===")
+summaries = []
+for i, item in enumerate(story_dialogue_texts):
+    db_id, cid, idx, st, et, dialogue_text, old_summary = item
+    
+    if len(dialogue_text) < 10:
+        summary = "[no dialogue]"
+        embedding = [0.0] * 768
+    else:
+        print(f"  [{i+1}/{len(story_dialogue_texts)}] {cid}: {len(dialogue_text)} chars", end="")
+        try:
+            summary = call_llm(dialogue_text[:3000])
+            print(f" -> {len(summary)} chars")
+            time.sleep(0.3)
+            embedding = call_embed(summary)
+        except Exception as e:
+            print(f" ERROR: {e}")
+            summary = "[error]"
+            embedding = [0.0] * 768
+    
+    # Update DB
+    s_esc = summary.replace("'", "''")
+    cur.execute(f"""
+        UPDATE dev.chunks
+        SET summary_text = '{s_esc}', updated_at = NOW()
+        WHERE id = {db_id}
+    """)
+    
+    summaries.append({
+        "db_id": db_id,
+        "chunk_id": cid,
+        "chunk_index": idx,
+        "start_time": st,
+        "end_time": et,
+        "dialogue": dialogue_text,
+        "summary": summary,
+        "embedding": embedding,
+    })
+
+conn.commit()
+print(f"\nGenerated {len(summaries)} summaries")
+
+print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
+# Delete existing
+req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories", method="DELETE")
+try:
+    urlopen(req)
+    time.sleep(0.3)
+except:
+    pass
+
+# Recreate
+req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories",
+    data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
+    headers={"Content-Type": "application/json"}, method="PUT")
+urlopen(req)
+time.sleep(0.3)
+
+# Upload dialogue points (0..227) and summary points (228..455)
+dialogue_points = []
+summary_points = []
+for s in summaries:
+    idx = s["chunk_index"]
+    dialogue_points.append({
+        "id": idx + 1,
+        "vector": [0.0] * 768,
+        "payload": {
+            "chunk_id": s["chunk_id"],
+            "file_uuid": UUID,
+            "start_time": s["start_time"],
+            "end_time": s["end_time"],
+            "type": "story_dialogue",
+            "text": s["dialogue"][:500],
+        }
+    })
+    summary_points.append({
+        "id": idx + 1 + 228,
+        "vector": s["embedding"],
+        "payload": {
+            "chunk_id": s["chunk_id"],
+            "file_uuid": UUID,
+            "start_time": s["start_time"],
+            "end_time": s["end_time"],
+            "type": "story_summary",
+            "summary": s["summary"],
+        }
+    })
+
+all_story_points = dialogue_points + summary_points
+
+batch_size = 100
+for start in range(0, len(all_story_points), batch_size):
+    batch = all_story_points[start:start+batch_size]
+    req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories/points?wait=true",
+        data=json.dumps({"points": batch}).encode(),
+        headers={"Content-Type": "application/json"}, method="PUT")
+    try:
+        urlopen(req)
+    except Exception as e:
+        print(f"  Batch {start}: {e}")
+    if (start // batch_size) % 3 == 0:
+        print(f"  Uploaded {start + len(batch)}/{len(all_story_points)}")
+
+print(f"Uploaded {len(all_story_points)} points to momentry_dev_stories")
+
+print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
+# These are the per-sentence template + summary collections
+# sentence_story: 3417 points, 768D, template payloads
+# sentence_summary: 3417 points, 768D, LLM summary payloads
+
+for col_name in ["sentence_story", "sentence_summary"]:
+    req = Request(f"{QDRANT_URL}/collections/{col_name}", method="DELETE")
+    try:
+        urlopen(req)
+        time.sleep(0.2)
+    except:
+        pass
+    
+    req = Request(f"{QDRANT_URL}/collections/{col_name}",
+        data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
+        headers={"Content-Type": "application/json"}, method="PUT")
+    urlopen(req)
+    time.sleep(0.2)
+
+# Build points for sentence_story and sentence_summary
+story_sentence_points = []
+summary_sentence_points = []
+for idx in sorted(sentences.keys()):
+    s = sentences[idx]
+    raw_text = ""
+    if s["content"] and isinstance(s["content"], dict):
+        raw_text = s["content"].get("data", {}).get("text", "")
+    
+    dialog_line = f'({s["new_name"]}) {raw_text}'
+    
+    story_sentence_points.append({
+        "id": idx + 1,
+        "vector": [0.0] * 768,
+        "payload": {
+            "chunk_id": f"{UUID}_{idx}",
+            "file_uuid": UUID,
+            "start_time": 0,
+            "end_time": 0,
+            "text": dialog_line,
+            "speaker_name": s["new_name"],
+            "chunk_type": "sentence",
+        }
+    })
+
+# Upload sentence_story (dialogue template)
+batch_size = 200
+for start in range(0, len(story_sentence_points), batch_size):
+    batch = story_sentence_points[start:start+batch_size]
+    req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
+        data=json.dumps({"points": batch}).encode(),
+        headers={"Content-Type": "application/json"}, method="PUT")
+    try:
+        urlopen(req)
+    except Exception as e:
+        print(f"  sentence_story batch {start}: {e}")
+    if (start // batch_size) % 5 == 0:
+        print(f"  Uploaded {start + len(batch)}/3417 sentence_story")
+
+print("Uploaded sentence_story points")
+
+# sentence_summary will be populated when we generate per-sentence summaries
+# For now, mark as TODO
+print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")
+
+cur.close()
+conn.close()
+print("\n=== Done ===")