Files
momentry_core/scripts/rebuild_story_content.py
Accusys 39ba5ddf76 feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index
Correction: asr-1.json format, generate/apply scripts
API: 37/37 endpoints fixed and tested
Docs: HANDOVER_V2.0.md for M4
2026-05-11 07:03:22 +08:00

321 lines
10 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Rebuild sentence and story chunk text_content and regenerate story summaries using the new ASRX speaker assignments.
Then rebuild the Qdrant momentry_dev_stories and sentence_story collections; sentence_summary is recreated empty (populating it is still a TODO).
"""
import json, sys, time, urllib.request
from urllib.request import Request, urlopen
import psycopg2
# file_uuid of the single file whose chunks and Qdrant points this script rebuilds.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# Connection string handed to psycopg2.connect().
# NOTE(review): host=/tmp presumably selects a Unix-socket directory — confirm.
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
# Base URL of the Qdrant REST API; collection and point endpoints are built from it.
QDRANT_URL = "http://localhost:6333"
# Chat-completions endpoint that call_llm() posts summary prompts to.
LLM_URL = "http://localhost:8082/v1/chat/completions"
# Embeddings endpoint that call_embed() posts text to; the rest of the script
# treats the returned vectors as 768-dimensional.
EMBED_URL = "http://localhost:11436/v1/embeddings"
def call_llm(dialogue_text):
    """Ask the chat-completions endpoint at LLM_URL for a 50-word summary.

    Args:
        dialogue_text: Dialogue transcript to embed in the prompt.

    Returns:
        The stripped ``content`` of the first choice in the response, or
        ``""`` when the request or the response parsing fails (the error is
        printed, not raised).
    """
    prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response even if reading or
        # JSON decoding fails, so the socket is never leaked.
        with urlopen(req, timeout=120) as resp:
            payload = json.loads(resp.read())
        return payload["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Deliberately best-effort: callers continue with an empty summary.
        print(f" LLM error: {e}")
        return ""
def call_embed(text):
    """Fetch an embedding vector for *text* from the endpoint at EMBED_URL.

    Args:
        text: The string to embed; sent as the ``input`` field of the request.

    Returns:
        The ``embedding`` list of the first item in the response ``data``, or
        a 768-element zero vector when the request or parsing fails (the
        error is printed, not raised).
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response even if reading or
        # JSON decoding fails, so the socket is never leaked.
        with urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
        return payload["data"][0]["embedding"]
    except Exception as e:
        # Deliberately best-effort: callers continue with a placeholder vector.
        print(f" Embed error: {e}")
        return [0.0] * 768
print("=== Step 1: Load sentence chunks with new speaker info ===")
# Open the connection and cursor that the remaining steps of the script reuse.
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()
sentence_query = """
SELECT chunk_index, text_content, metadata->>'new_speaker_name',
metadata->>'speaker_name', content
FROM dev.chunks
WHERE file_uuid = %s AND chunk_type = 'sentence'
ORDER BY chunk_index
"""
cur.execute(sentence_query, (UUID,))
sentence_rows = cur.fetchall()
print(f"Loaded {len(sentence_rows)} sentence chunks")
# Lookup keyed by chunk_index. The new speaker name falls back to the old
# one and finally to "Unknown"; missing text/content become ""/{}.
sentences = {
    chunk_idx: {
        "old_text": prior_text or "",
        "new_name": reassigned_name or previous_name or "Unknown",
        "old_name": previous_name or "Unknown",
        "content": payload or {},
    }
    for chunk_idx, prior_text, reassigned_name, previous_name, payload in sentence_rows
}
# Rebuild sentence text_content with new speaker names
print("\n=== Step 2: Rebuild sentence text_content ===")
import re  # kept local to this step as in the original, but no longer re-run per row

# Old rows look like "[Speaker] text". The pattern is anchored at the start so
# a bracket later in the sentence is not mistaken for the speaker prefix, and
# DOTALL keeps text that spans several lines.
speaker_prefix_re = re.compile(r'\s*\[[^\]]*\]\s*(.*)', re.DOTALL)
sentence_update_sql = """
UPDATE dev.chunks
SET text_content = %s, updated_at = NOW()
WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
"""
updated_sentences = 0
for r in sentence_rows:
    idx, old_text, new_name, old_name, content = r
    new_name = new_name or old_name or "Unknown"
    # Prefer the text stored under content["data"]["text"]; tolerate a
    # missing/null "data" or "text" instead of raising or rendering "None".
    raw_text = ""
    if content and isinstance(content, dict):
        data = content.get("data")
        if isinstance(data, dict):
            raw_text = data.get("text") or ""
    if not raw_text and old_text:
        # Fall back to the previous text_content, minus its speaker prefix
        # when it has one; otherwise keep the text unchanged.
        m = speaker_prefix_re.match(old_text)
        raw_text = m.group(1) if m else old_text
    new_text = f"[{new_name}] {raw_text}"
    cur.execute(sentence_update_sql, (new_text, UUID, idx))
    updated_sentences += 1
# All sentence updates are committed together.
conn.commit()
print(f"Updated {updated_sentences} sentence chunks text_content")
print("\n=== Step 3: Rebuild story chunk text_content ===")
import re  # kept local to this step as in the original, but hoisted out of the loops

cur.execute("""
SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time,
text_content, summary_text
FROM dev.chunks
WHERE file_uuid = %s AND chunk_type = 'story'
ORDER BY chunk_index
""", (UUID,))
story_rows = cur.fetchall()
print(f"Loaded {len(story_rows)} story chunks")

# Old sentence rows look like "[Speaker] text". The pattern is anchored at the
# start so a bracket later in the sentence is not mistaken for the prefix, and
# DOTALL keeps text that spans several lines.
story_prefix_re = re.compile(r'\s*\[[^\]]*\]\s*(.*)', re.DOTALL)

# Build child text per story chunk
story_dialogue_texts = []
for r in story_rows:
    db_id, cid, idx, child_ids, st, et, old_text, old_summary = r
    dialogue_parts = []
    for child_cid in (child_ids or []):
        # The trailing "_<n>" of a child id is used as the sentence
        # chunk_index; a non-numeric suffix raises ValueError as before.
        child_idx = int(child_cid.rsplit("_", 1)[-1])
        s = sentences.get(child_idx)
        if s is None:
            continue
        # Prefer content["data"]["text"]; tolerate a missing/null "data" or
        # "text" instead of raising.
        raw = ""
        if s["content"] and isinstance(s["content"], dict):
            data = s["content"].get("data")
            if isinstance(data, dict):
                raw = data.get("text") or ""
        if not raw:
            # Fall back to the sentence's previous text_content, minus its
            # speaker prefix when it has one.
            m = story_prefix_re.match(s["old_text"])
            raw = m.group(1) if m else s["old_text"]
        if raw:
            dialogue_parts.append(f'({s["new_name"]}) {raw}')
    dialogue_text = " ".join(dialogue_parts)
    story_dialogue_texts.append((db_id, cid, idx, st, et, dialogue_text, old_summary))
print(f"Built {len(story_dialogue_texts)} story dialogue texts")

# Update DB with new text_content (dialogue only, not summary yet)
story_text_update_sql = """
UPDATE dev.chunks
SET text_content = %s, updated_at = NOW()
WHERE id = %s
"""
for item in story_dialogue_texts:
    db_id, cid, idx, st, et, dialogue_text, old_summary = item
    cur.execute(story_text_update_sql, (dialogue_text, db_id))
conn.commit()
print("Updated story chunk dialogue texts")
# The banner reports the real number of stories instead of a hard-coded count.
print(f"\n=== Step 4: Generate LLM summaries (all {len(story_dialogue_texts)} stories) ===")
# Parameterized like every other statement in this script: the driver quotes
# the LLM-produced summary, so quotes, backslashes or other special
# characters in it cannot break or alter the SQL.
summary_update_sql = """
UPDATE dev.chunks
SET summary_text = %s, updated_at = NOW()
WHERE id = %s
"""
summaries = []
for i, item in enumerate(story_dialogue_texts):
    db_id, cid, idx, st, et, dialogue_text, old_summary = item
    if len(dialogue_text) < 10:
        # Too little dialogue to summarize: store a marker and a placeholder vector.
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        print(f" [{i+1}/{len(story_dialogue_texts)}] {cid}: {len(dialogue_text)} chars", end="")
        try:
            # Only the first 3000 characters of the dialogue are sent to the LLM.
            summary = call_llm(dialogue_text[:3000])
            print(f" -> {len(summary)} chars")
            time.sleep(0.3)
            embedding = call_embed(summary)
        except Exception as e:
            print(f" ERROR: {e}")
            summary = "[error]"
            embedding = [0.0] * 768
    # Update DB
    cur.execute(summary_update_sql, (summary, db_id))
    # Kept in memory for the Qdrant rebuild that follows.
    summaries.append({
        "db_id": db_id,
        "chunk_id": cid,
        "chunk_index": idx,
        "start_time": st,
        "end_time": et,
        "dialogue": dialogue_text,
        "summary": summary,
        "embedding": embedding,
    })
# All summary updates are committed together after the loop.
conn.commit()
print(f"\nGenerated {len(summaries)} summaries")
print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
stories_url = f"{QDRANT_URL}/collections/momentry_dev_stories"
# Delete existing. Best-effort (the request may fail, e.g. if the collection
# is absent), but no longer swallows KeyboardInterrupt/SystemExit and the
# reason is reported.
req = Request(stories_url, method="DELETE")
try:
    with urlopen(req):
        pass
    time.sleep(0.3)
except Exception as e:
    print(f" Delete momentry_dev_stories skipped: {e}")
# Recreate; a failure here is fatal because every upload below needs the collection.
req = Request(stories_url,
              data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
              headers={"Content-Type": "application/json"}, method="PUT")
with urlopen(req):
    pass
time.sleep(0.3)
# Dialogue points use id = chunk_index + 1; summary points use the same id
# shifted by an offset. 228 is the historical offset, so existing ids stay
# unchanged; it only grows when a chunk_index would otherwise make a dialogue
# id collide with a summary id.
summary_id_offset = max(228, max((s["chunk_index"] for s in summaries), default=-1) + 1)
dialogue_points = []
summary_points = []
for s in summaries:
    idx = s["chunk_index"]
    dialogue_points.append({
        "id": idx + 1,
        # Dialogue points carry a placeholder zero vector and a truncated text payload.
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_dialogue",
            "text": s["dialogue"][:500],
        }
    })
    summary_points.append({
        "id": idx + 1 + summary_id_offset,
        "vector": s["embedding"],
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_summary",
            "summary": s["summary"],
        }
    })
all_story_points = dialogue_points + summary_points
batch_size = 100
uploaded_points = 0
for start in range(0, len(all_story_points), batch_size):
    batch = all_story_points[start:start+batch_size]
    req = Request(f"{stories_url}/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        with urlopen(req):
            pass
    except Exception as e:
        # A failed batch is reported and skipped; the remaining batches still run.
        print(f" Batch {start}: {e}")
    else:
        uploaded_points += len(batch)
    if (start // batch_size) % 3 == 0:
        print(f" Uploaded {start + len(batch)}/{len(all_story_points)}")
# Report what was actually accepted rather than what was attempted.
print(f"Uploaded {uploaded_points} points to momentry_dev_stories")
print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
import re  # local import; needed for the speaker-prefix fallback below

# These are the per-sentence template + summary collections
# sentence_story: 3417 points, 768D, template payloads
# sentence_summary: 3417 points, 768D, LLM summary payloads
for col_name in ["sentence_story", "sentence_summary"]:
    col_url = f"{QDRANT_URL}/collections/{col_name}"
    # Best-effort delete (the request may fail, e.g. if the collection is
    # absent); no longer swallows KeyboardInterrupt/SystemExit and the reason
    # is reported.
    req = Request(col_url, method="DELETE")
    try:
        with urlopen(req):
            pass
        time.sleep(0.2)
    except Exception as e:
        print(f" Delete {col_name} skipped: {e}")
    # Recreate; a failure here is fatal.
    req = Request(col_url,
                  data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    with urlopen(req):
        pass
    time.sleep(0.2)

# Same "[Speaker] text" parsing as the sentence/story rebuild steps: anchored
# at the start, DOTALL so multi-line text is kept.
sentence_prefix_re = re.compile(r'\s*\[[^\]]*\]\s*(.*)', re.DOTALL)

# Build points for sentence_story and sentence_summary
story_sentence_points = []
# Never filled in this script: sentence_summary is left empty (see the message below).
summary_sentence_points = []
for idx in sorted(sentences.keys()):
    s = sentences[idx]
    # Prefer content["data"]["text"]; tolerate a missing/null "data" or "text".
    raw_text = ""
    if s["content"] and isinstance(s["content"], dict):
        data = s["content"].get("data")
        if isinstance(data, dict):
            raw_text = data.get("text") or ""
    if not raw_text and s["old_text"]:
        # Consistent with the earlier steps: fall back to the previous
        # text_content, minus its speaker prefix when it has one.
        m = sentence_prefix_re.match(s["old_text"])
        raw_text = m.group(1) if m else s["old_text"]
    dialog_line = f'({s["new_name"]}) {raw_text}'
    story_sentence_points.append({
        "id": idx + 1,
        # Placeholder zero vector; only the payload carries information.
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": f"{UUID}_{idx}",
            "file_uuid": UUID,
            # Sentence timings are not loaded by this script, so 0 is stored.
            "start_time": 0,
            "end_time": 0,
            "text": dialog_line,
            "speaker_name": s["new_name"],
            "chunk_type": "sentence",
        }
    })

# Upload sentence_story (dialogue template)
batch_size = 200
for start in range(0, len(story_sentence_points), batch_size):
    batch = story_sentence_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        with urlopen(req):
            pass
    except Exception as e:
        # A failed batch is reported and skipped; the remaining batches still run.
        print(f" sentence_story batch {start}: {e}")
    if (start // batch_size) % 5 == 0:
        # Progress total is the real point count rather than a hard-coded number.
        print(f" Uploaded {start + len(batch)}/{len(story_sentence_points)} sentence_story")
print("Uploaded sentence_story points")

# sentence_summary will be populated when we generate per-sentence summaries
# For now, mark as TODO
print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")

cur.close()
conn.close()
print("\n=== Done ===")