Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index Correction: asr-1.json format, generate/apply scripts API: 37/37 endpoints fixed and tested Docs: HANDOVER_V2.0.md for M4
321 lines
10 KiB
Python
321 lines
10 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Rebuild story chunk text_content and regenerate summaries using new ASRX speaker assignments.
Then update Qdrant momentry_dev_stories and the sentence_story/sentence_summary collections.
|
|
"""
|
|
|
|
import json, sys, time, urllib.request
|
|
from urllib.request import Request, urlopen
|
|
import psycopg2
|
|
|
|
# file_uuid of the recording processed by this script; every chunk query
# below filters on it and it is stored in each Qdrant payload.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"

# DSN handed to psycopg2.connect().
# NOTE(review): host=/tmp presumably selects a Unix-socket directory - confirm.
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"

# Base URL of the Qdrant REST API (collections are addressed below it).
QDRANT_URL = "http://localhost:6333"

# Chat-completions endpoint used by call_llm().
LLM_URL = "http://localhost:8082/v1/chat/completions"

# Embeddings endpoint used by call_embed().
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
|
|
|
def call_llm(dialogue_text):
    """Request a short summary of a dialogue from the chat endpoint at LLM_URL.

    Args:
        dialogue_text: Dialogue to summarise; it is inserted verbatim into
            the prompt.

    Returns:
        The stripped content of the first choice's message, or "" when the
        request fails or the reply does not have the expected structure.
        Errors are printed, never raised.
    """
    prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
    payload = {
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 100,
    }
    req = Request(
        LLM_URL,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        # The script calls this once per story chunk, so close each response
        # explicitly instead of leaving the socket to the garbage collector.
        with urlopen(req, timeout=120) as resp:
            reply = json.loads(resp.read())
        return reply["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Deliberate best-effort boundary: any failure becomes an empty summary.
        print(f" LLM error: {e}")
        return ""
|
|
|
|
def call_embed(text):
    """Fetch the embedding vector for *text* from the endpoint at EMBED_URL.

    Args:
        text: The string to embed; sent as the "input" field of the request.

    Returns:
        The "embedding" value of the first entry in the reply's "data" list,
        or a 768-element zero vector when the request fails or the reply does
        not have the expected structure. Errors are printed, never raised.
    """
    req = Request(
        EMBED_URL,
        data=json.dumps({"input": text}).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        # Close the response explicitly; this runs once per summary.
        with urlopen(req, timeout=30) as resp:
            reply = json.loads(resp.read())
        return reply["data"][0]["embedding"]
    except Exception as e:
        # Deliberate best-effort boundary: fall back to a zero vector of the
        # size the Qdrant collections in this script are created with.
        print(f" Embed error: {e}")
        return [0.0] * 768
|
|
|
|
print("=== Step 1: Load sentence chunks with new speaker info ===")
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()

# Pull every sentence chunk of this file in chunk order, together with the
# reassigned speaker name and the previous one (both read from metadata).
cur.execute("""
    SELECT chunk_index, text_content, metadata->>'new_speaker_name',
           metadata->>'speaker_name', content
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'sentence'
    ORDER BY chunk_index
""", (UUID,))
sentence_rows = cur.fetchall()
print(f"Loaded {len(sentence_rows)} sentence chunks")

# Lookup keyed by chunk_index. The new speaker name falls back to the old
# one and finally to "Unknown"; missing text/content become ""/{}.
sentences = {
    chunk_index: {
        "old_text": stored_text or "",
        "new_name": new_speaker or old_speaker or "Unknown",
        "old_name": old_speaker or "Unknown",
        "content": payload or {},
    }
    for chunk_index, stored_text, new_speaker, old_speaker, payload in sentence_rows
}
|
|
|
|
import re

# Rebuild sentence text_content with new speaker names
print("\n=== Step 2: Rebuild sentence text_content ===")

# Old rows look like "[Speaker] text". The pattern is anchored at the start
# (via .match) so a "]" later in the sentence is not mistaken for the end of
# the speaker prefix, and DOTALL keeps text that spans several lines.
_SPEAKER_PREFIX_RE = re.compile(r'\s*\[[^\]]*\]\s*(.*)', re.DOTALL)

updated_sentences = 0
for idx, old_text, new_name, old_name, content in sentence_rows:
    new_name = new_name or old_name or "Unknown"

    # Prefer the raw text stored under content["data"]["text"]; tolerate a
    # missing/null "data" object or a null "text" value.
    raw_text = ""
    if isinstance(content, dict):
        data = content.get("data")
        if isinstance(data, dict):
            raw_text = data.get("text") or ""

    # Otherwise recover the text from the old "[Speaker] text" layout, or use
    # the old text unchanged when it carries no speaker prefix.
    if not raw_text and old_text:
        m = _SPEAKER_PREFIX_RE.match(old_text)
        raw_text = m.group(1) if m else old_text

    new_text = f"[{new_name}] {raw_text}"

    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
    """, (new_text, UUID, idx))
    updated_sentences += 1

conn.commit()
print(f"Updated {updated_sentences} sentence chunks text_content")
|
|
|
|
import re

print("\n=== Step 3: Rebuild story chunk text_content ===")
cur.execute("""
    SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time,
           text_content, summary_text
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'story'
    ORDER BY chunk_index
""", (UUID,))
story_rows = cur.fetchall()
print(f"Loaded {len(story_rows)} story chunks")

# Old sentence rows look like "[Speaker] text". Anchoring at the start keeps
# a "]" inside the sentence from being treated as the end of the prefix, and
# DOTALL keeps multi-line text intact.
_SPEAKER_PREFIX_RE = re.compile(r'\s*\[[^\]]*\]\s*(.*)', re.DOTALL)

# Build child text per story chunk
story_dialogue_texts = []
for db_id, cid, idx, child_ids, st, et, old_text, old_summary in story_rows:
    dialogue_parts = []
    for child_cid in (child_ids or []):
        # The sentence index is the part after the last "_" of the child id.
        try:
            child_idx = int(str(child_cid).rsplit("_", 1)[-1])
        except ValueError:
            # One malformed child id should not abort the whole rebuild.
            print(f"  Skipping child id without numeric suffix: {child_cid!r}")
            continue

        s = sentences.get(child_idx)
        if s is None:
            continue

        # Prefer content["data"]["text"]; tolerate null/missing values.
        raw = ""
        if isinstance(s["content"], dict):
            data = s["content"].get("data")
            if isinstance(data, dict):
                raw = data.get("text") or ""
        if not raw:
            m = _SPEAKER_PREFIX_RE.match(s["old_text"])
            raw = m.group(1) if m else s["old_text"]
        if raw:
            dialogue_parts.append(f'({s["new_name"]}) {raw}')

    dialogue_text = " ".join(dialogue_parts)
    # Tuple layout is consumed by the summary step further down.
    story_dialogue_texts.append((db_id, cid, idx, st, et, dialogue_text, old_summary))

print(f"Built {len(story_dialogue_texts)} story dialogue texts")

# Update DB with new text_content (dialogue only, not summary yet)
for item in story_dialogue_texts:
    db_id, cid, idx, st, et, dialogue_text, old_summary = item
    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE id = %s
    """, (dialogue_text, db_id))

conn.commit()
print("Updated story chunk dialogue texts")
|
|
|
|
# The banner reports the real number of stories instead of a fixed count.
print(f"\n=== Step 4: Generate LLM summaries (all {len(story_dialogue_texts)} stories) ===")
summaries = []
for i, item in enumerate(story_dialogue_texts):
    db_id, cid, idx, st, et, dialogue_text, old_summary = item

    if len(dialogue_text) < 10:
        # Too little dialogue to summarise: placeholder text and zero vector.
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        print(f" [{i+1}/{len(story_dialogue_texts)}] {cid}: {len(dialogue_text)} chars", end="")
        try:
            # Only the first 3000 characters of the dialogue are sent.
            summary = call_llm(dialogue_text[:3000])
            print(f" -> {len(summary)} chars")
            if not summary:
                # call_llm() signals failure by returning "", so route that
                # into the error branch instead of storing an empty summary
                # and embedding an empty string.
                raise ValueError("LLM returned no summary")
            time.sleep(0.3)
            embedding = call_embed(summary)
        except Exception as e:
            print(f" ERROR: {e}")
            summary = "[error]"
            embedding = [0.0] * 768

    # Update DB. Parameters are passed separately so the driver does the
    # quoting; LLM output must never be spliced into the SQL text.
    cur.execute("""
        UPDATE dev.chunks
        SET summary_text = %s, updated_at = NOW()
        WHERE id = %s
    """, (summary, db_id))

    summaries.append({
        "db_id": db_id,
        "chunk_id": cid,
        "chunk_index": idx,
        "start_time": st,
        "end_time": et,
        "dialogue": dialogue_text,
        "summary": summary,
        "embedding": embedding,
    })

# All summary updates are committed together once the loop has finished.
conn.commit()
print(f"\nGenerated {len(summaries)} summaries")
|
|
|
|
print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
stories_url = f"{QDRANT_URL}/collections/momentry_dev_stories"

# Delete existing
req = Request(stories_url, method="DELETE")
try:
    urlopen(req).close()
    time.sleep(0.3)
except Exception as e:
    # Best effort: a failed delete is reported and the run continues; the
    # recreate call below is not guarded and will raise if Qdrant is down.
    print(f" Delete momentry_dev_stories skipped: {e}")

# Recreate
req = Request(stories_url,
              data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
              headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req).close()
time.sleep(0.3)

# Point ids: dialogue points use chunk_index + 1 and summary points use the
# same value shifted past the largest dialogue id. With chunk indexes 0..227
# the shift is 228 (ids 1..228 and 229..456), matching the previously
# hard-coded offset, but ids no longer collide when the story count differs.
summary_id_offset = max((s["chunk_index"] for s in summaries), default=-1) + 1

dialogue_points = []
summary_points = []
for s in summaries:
    idx = s["chunk_index"]
    dialogue_points.append({
        "id": idx + 1,
        # Dialogue points carry a zero vector; only the payload is populated.
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_dialogue",
            "text": s["dialogue"][:500],
        }
    })
    summary_points.append({
        "id": idx + 1 + summary_id_offset,
        "vector": s["embedding"],
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_summary",
            "summary": s["summary"],
        }
    })

all_story_points = dialogue_points + summary_points

batch_size = 100
uploaded = 0  # points in batches that Qdrant accepted
for start in range(0, len(all_story_points), batch_size):
    batch = all_story_points[start:start+batch_size]
    req = Request(f"{stories_url}/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req).close()
        uploaded += len(batch)
    except Exception as e:
        # A failed batch is reported and skipped; later batches still run.
        print(f" Batch {start}: {e}")
    if (start // batch_size) % 3 == 0:
        print(f" Uploaded {uploaded}/{len(all_story_points)}")

# Report what was actually accepted, not what was attempted.
print(f"Uploaded {uploaded}/{len(all_story_points)} points to momentry_dev_stories")
|
|
|
|
import re

print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
# These are the per-sentence template + summary collections
# sentence_story: one point per sentence chunk, 768D, template payloads
# sentence_summary: 768D, LLM summary payloads
# NOTE(review): sentence_summary is dropped and recreated here but never
# repopulated by this script, so it is left empty afterwards - confirm that
# this is intended.

for col_name in ["sentence_story", "sentence_summary"]:
    req = Request(f"{QDRANT_URL}/collections/{col_name}", method="DELETE")
    try:
        urlopen(req).close()
        time.sleep(0.2)
    except Exception as e:
        # Best effort: report the failed delete and continue; the recreate
        # call below is not guarded and will raise if Qdrant is unreachable.
        print(f" Delete {col_name} skipped: {e}")

    req = Request(f"{QDRANT_URL}/collections/{col_name}",
                  data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    urlopen(req).close()
    time.sleep(0.2)

# Same "[Speaker] text" fallback as the earlier rebuild steps, so sentences
# whose content lacks data.text are not uploaded with an empty text.
_SPEAKER_PREFIX_RE = re.compile(r'\s*\[[^\]]*\]\s*(.*)', re.DOTALL)

# Build points for sentence_story and sentence_summary
story_sentence_points = []
summary_sentence_points = []  # stays empty: per-sentence summaries are not generated yet
for idx in sorted(sentences):
    s = sentences[idx]
    raw_text = ""
    if isinstance(s["content"], dict):
        data = s["content"].get("data")
        if isinstance(data, dict):
            raw_text = data.get("text") or ""
    if not raw_text and s["old_text"]:
        m = _SPEAKER_PREFIX_RE.match(s["old_text"])
        raw_text = m.group(1) if m else s["old_text"]

    dialog_line = f'({s["new_name"]}) {raw_text}'

    story_sentence_points.append({
        "id": idx + 1,
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": f"{UUID}_{idx}",
            "file_uuid": UUID,
            # Sentence timings are not part of the loaded sentence rows, so
            # both are written as 0.
            "start_time": 0,
            "end_time": 0,
            "text": dialog_line,
            "speaker_name": s["new_name"],
            "chunk_type": "sentence",
        }
    })

# Upload sentence_story (dialogue template)
batch_size = 200
uploaded = 0  # points in batches that Qdrant accepted
for start in range(0, len(story_sentence_points), batch_size):
    batch = story_sentence_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req).close()
        uploaded += len(batch)
    except Exception as e:
        # A failed batch is reported and skipped; later batches still run.
        print(f" sentence_story batch {start}: {e}")
    if (start // batch_size) % 5 == 0:
        # The total is taken from the data rather than a fixed count.
        print(f" Uploaded {uploaded}/{len(story_sentence_points)} sentence_story")

print(f"Uploaded {uploaded}/{len(story_sentence_points)} sentence_story points")

# sentence_summary will be populated when we generate per-sentence summaries
# For now, mark as TODO
print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")

cur.close()
conn.close()
print("\n=== Done ===")
|