feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index Correction: asr-1.json format, generate/apply scripts API: 37/37 endpoints fixed and tested Docs: HANDOVER_V2.0.md for M4
This commit is contained in:
320
scripts/rebuild_story_content.py
Normal file
320
scripts/rebuild_story_content.py
Normal file
@@ -0,0 +1,320 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Rebuild story chunk text_content and regenerates summaries using new ASRX speaker assignments.
|
||||
Then updates Qdrant momentry_dev_stories and sentence_story/sentence_summary collections.
|
||||
"""
|
||||
|
||||
import json, sys, time, urllib.request
|
||||
from urllib.request import Request, urlopen
|
||||
import psycopg2
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
||||
|
||||
def call_llm(dialogue_text):
|
||||
prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
|
||||
body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"temperature": 0.1, "max_tokens": 100}).encode()
|
||||
req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
|
||||
try:
|
||||
resp = urlopen(req, timeout=120)
|
||||
return json.loads(resp.read())["choices"][0]["message"]["content"].strip()
|
||||
except Exception as e:
|
||||
print(f" LLM error: {e}")
|
||||
return ""
|
||||
|
||||
def call_embed(text):
|
||||
body = json.dumps({"input": text}).encode()
|
||||
req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
|
||||
try:
|
||||
resp = urlopen(req, timeout=30)
|
||||
return json.loads(resp.read())["data"][0]["embedding"]
|
||||
except Exception as e:
|
||||
print(f" Embed error: {e}")
|
||||
return [0.0] * 768
|
||||
|
||||
print("=== Step 1: Load sentence chunks with new speaker info ===")
|
||||
conn = psycopg2.connect(DB_URL)
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute("""
|
||||
SELECT chunk_index, text_content, metadata->>'new_speaker_name',
|
||||
metadata->>'speaker_name', content
|
||||
FROM dev.chunks
|
||||
WHERE file_uuid = %s AND chunk_type = 'sentence'
|
||||
ORDER BY chunk_index
|
||||
""", (UUID,))
|
||||
sentence_rows = cur.fetchall()
|
||||
print(f"Loaded {len(sentence_rows)} sentence chunks")
|
||||
|
||||
# Build lookup
|
||||
sentences = {}
|
||||
for r in sentence_rows:
|
||||
idx, old_text, new_name, old_name, content = r
|
||||
sentences[idx] = {
|
||||
"old_text": old_text or "",
|
||||
"new_name": new_name or old_name or "Unknown",
|
||||
"old_name": old_name or "Unknown",
|
||||
"content": content or {},
|
||||
}
|
||||
|
||||
# Rebuild sentence text_content with new speaker names
|
||||
print("\n=== Step 2: Rebuild sentence text_content ===")
|
||||
updated_sentences = 0
|
||||
for r in sentence_rows:
|
||||
idx, old_text, new_name, old_name, content = r
|
||||
new_name = new_name or old_name or "Unknown"
|
||||
|
||||
# Extract the text part (remove old speaker prefix if exists)
|
||||
raw_text = ""
|
||||
if content and isinstance(content, dict):
|
||||
raw_text = content.get("data", {}).get("text", "")
|
||||
if not raw_text and old_text:
|
||||
# Parse old format: [Speaker] text
|
||||
import re
|
||||
m = re.search(r'\]\s*(.*)', old_text)
|
||||
if m:
|
||||
raw_text = m.group(1)
|
||||
else:
|
||||
raw_text = old_text
|
||||
|
||||
new_text = f"[{new_name}] {raw_text}"
|
||||
|
||||
cur.execute("""
|
||||
UPDATE dev.chunks
|
||||
SET text_content = %s, updated_at = NOW()
|
||||
WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
|
||||
""", (new_text, UUID, idx))
|
||||
updated_sentences += 1
|
||||
|
||||
conn.commit()
|
||||
print(f"Updated {updated_sentences} sentence chunks text_content")
|
||||
|
||||
print("\n=== Step 3: Rebuild story chunk text_content ===")
|
||||
cur.execute("""
|
||||
SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time,
|
||||
text_content, summary_text
|
||||
FROM dev.chunks
|
||||
WHERE file_uuid = %s AND chunk_type = 'story'
|
||||
ORDER BY chunk_index
|
||||
""", (UUID,))
|
||||
story_rows = cur.fetchall()
|
||||
print(f"Loaded {len(story_rows)} story chunks")
|
||||
|
||||
# Build child text per story chunk
|
||||
story_dialogue_texts = []
|
||||
for r in story_rows:
|
||||
db_id, cid, idx, child_ids, st, et, old_text, old_summary = r
|
||||
|
||||
dialogue_parts = []
|
||||
for child_cid in (child_ids or []):
|
||||
parts = child_cid.split("_")
|
||||
child_idx = int(parts[-1])
|
||||
if child_idx in sentences:
|
||||
s = sentences[child_idx]
|
||||
raw = ""
|
||||
if s["content"] and isinstance(s["content"], dict):
|
||||
raw = s["content"].get("data", {}).get("text", "")
|
||||
if not raw:
|
||||
import re
|
||||
m = re.search(r'\]\s*(.*)', s["old_text"])
|
||||
if m:
|
||||
raw = m.group(1)
|
||||
else:
|
||||
raw = s["old_text"]
|
||||
if raw:
|
||||
dialogue_parts.append(f'({s["new_name"]}) {raw}')
|
||||
|
||||
dialogue_text = " ".join(dialogue_parts)
|
||||
story_dialogue_texts.append((db_id, cid, idx, st, et, dialogue_text, old_summary))
|
||||
|
||||
print(f"Built {len(story_dialogue_texts)} story dialogue texts")
|
||||
|
||||
# Update DB with new text_content (dialogue only, not summary yet)
|
||||
for item in story_dialogue_texts:
|
||||
db_id, cid, idx, st, et, dialogue_text, old_summary = item
|
||||
cur.execute("""
|
||||
UPDATE dev.chunks
|
||||
SET text_content = %s, updated_at = NOW()
|
||||
WHERE id = %s
|
||||
""", (dialogue_text, db_id))
|
||||
|
||||
conn.commit()
|
||||
print("Updated story chunk dialogue texts")
|
||||
|
||||
print("\n=== Step 4: Generate LLM summaries (all 228 stories) ===")
|
||||
summaries = []
|
||||
for i, item in enumerate(story_dialogue_texts):
|
||||
db_id, cid, idx, st, et, dialogue_text, old_summary = item
|
||||
|
||||
if len(dialogue_text) < 10:
|
||||
summary = "[no dialogue]"
|
||||
embedding = [0.0] * 768
|
||||
else:
|
||||
print(f" [{i+1}/{len(story_dialogue_texts)}] {cid}: {len(dialogue_text)} chars", end="")
|
||||
try:
|
||||
summary = call_llm(dialogue_text[:3000])
|
||||
print(f" -> {len(summary)} chars")
|
||||
time.sleep(0.3)
|
||||
embedding = call_embed(summary)
|
||||
except Exception as e:
|
||||
print(f" ERROR: {e}")
|
||||
summary = "[error]"
|
||||
embedding = [0.0] * 768
|
||||
|
||||
# Update DB
|
||||
s_esc = summary.replace("'", "''")
|
||||
cur.execute(f"""
|
||||
UPDATE dev.chunks
|
||||
SET summary_text = '{s_esc}', updated_at = NOW()
|
||||
WHERE id = {db_id}
|
||||
""")
|
||||
|
||||
summaries.append({
|
||||
"db_id": db_id,
|
||||
"chunk_id": cid,
|
||||
"chunk_index": idx,
|
||||
"start_time": st,
|
||||
"end_time": et,
|
||||
"dialogue": dialogue_text,
|
||||
"summary": summary,
|
||||
"embedding": embedding,
|
||||
})
|
||||
|
||||
conn.commit()
|
||||
print(f"\nGenerated {len(summaries)} summaries")
|
||||
|
||||
print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
|
||||
# Delete existing
|
||||
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories", method="DELETE")
|
||||
try:
|
||||
urlopen(req)
|
||||
time.sleep(0.3)
|
||||
except:
|
||||
pass
|
||||
|
||||
# Recreate
|
||||
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories",
|
||||
data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
urlopen(req)
|
||||
time.sleep(0.3)
|
||||
|
||||
# Upload dialogue points (0..227) and summary points (228..455)
|
||||
dialogue_points = []
|
||||
summary_points = []
|
||||
for s in summaries:
|
||||
idx = s["chunk_index"]
|
||||
dialogue_points.append({
|
||||
"id": idx + 1,
|
||||
"vector": [0.0] * 768,
|
||||
"payload": {
|
||||
"chunk_id": s["chunk_id"],
|
||||
"file_uuid": UUID,
|
||||
"start_time": s["start_time"],
|
||||
"end_time": s["end_time"],
|
||||
"type": "story_dialogue",
|
||||
"text": s["dialogue"][:500],
|
||||
}
|
||||
})
|
||||
summary_points.append({
|
||||
"id": idx + 1 + 228,
|
||||
"vector": s["embedding"],
|
||||
"payload": {
|
||||
"chunk_id": s["chunk_id"],
|
||||
"file_uuid": UUID,
|
||||
"start_time": s["start_time"],
|
||||
"end_time": s["end_time"],
|
||||
"type": "story_summary",
|
||||
"summary": s["summary"],
|
||||
}
|
||||
})
|
||||
|
||||
all_story_points = dialogue_points + summary_points
|
||||
|
||||
batch_size = 100
|
||||
for start in range(0, len(all_story_points), batch_size):
|
||||
batch = all_story_points[start:start+batch_size]
|
||||
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories/points?wait=true",
|
||||
data=json.dumps({"points": batch}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
try:
|
||||
urlopen(req)
|
||||
except Exception as e:
|
||||
print(f" Batch {start}: {e}")
|
||||
if (start // batch_size) % 3 == 0:
|
||||
print(f" Uploaded {start + len(batch)}/{len(all_story_points)}")
|
||||
|
||||
print(f"Uploaded {len(all_story_points)} points to momentry_dev_stories")
|
||||
|
||||
print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
|
||||
# These are the per-sentence template + summary collections
|
||||
# sentence_story: 3417 points, 768D, template payloads
|
||||
# sentence_summary: 3417 points, 768D, LLM summary payloads
|
||||
|
||||
for col_name in ["sentence_story", "sentence_summary"]:
|
||||
req = Request(f"{QDRANT_URL}/collections/{col_name}", method="DELETE")
|
||||
try:
|
||||
urlopen(req)
|
||||
time.sleep(0.2)
|
||||
except:
|
||||
pass
|
||||
|
||||
req = Request(f"{QDRANT_URL}/collections/{col_name}",
|
||||
data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
urlopen(req)
|
||||
time.sleep(0.2)
|
||||
|
||||
# Build points for sentence_story and sentence_summary
|
||||
story_sentence_points = []
|
||||
summary_sentence_points = []
|
||||
for idx in sorted(sentences.keys()):
|
||||
s = sentences[idx]
|
||||
raw_text = ""
|
||||
if s["content"] and isinstance(s["content"], dict):
|
||||
raw_text = s["content"].get("data", {}).get("text", "")
|
||||
|
||||
dialog_line = f'({s["new_name"]}) {raw_text}'
|
||||
|
||||
story_sentence_points.append({
|
||||
"id": idx + 1,
|
||||
"vector": [0.0] * 768,
|
||||
"payload": {
|
||||
"chunk_id": f"{UUID}_{idx}",
|
||||
"file_uuid": UUID,
|
||||
"start_time": 0,
|
||||
"end_time": 0,
|
||||
"text": dialog_line,
|
||||
"speaker_name": s["new_name"],
|
||||
"chunk_type": "sentence",
|
||||
}
|
||||
})
|
||||
|
||||
# Upload sentence_story (dialogue template)
|
||||
batch_size = 200
|
||||
for start in range(0, len(story_sentence_points), batch_size):
|
||||
batch = story_sentence_points[start:start+batch_size]
|
||||
req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
|
||||
data=json.dumps({"points": batch}).encode(),
|
||||
headers={"Content-Type": "application/json"}, method="PUT")
|
||||
try:
|
||||
urlopen(req)
|
||||
except Exception as e:
|
||||
print(f" sentence_story batch {start}: {e}")
|
||||
if (start // batch_size) % 5 == 0:
|
||||
print(f" Uploaded {start + len(batch)}/3417 sentence_story")
|
||||
|
||||
print("Uploaded sentence_story points")
|
||||
|
||||
# sentence_summary will be populated when we generate per-sentence summaries
|
||||
# For now, mark as TODO
|
||||
print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")
|
||||
|
||||
cur.close()
|
||||
conn.close()
|
||||
print("\n=== Done ===")
|
||||
Reference in New Issue
Block a user