Files
momentry_core/scripts/story_embed.py
Accusys 39ba5ddf76 feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index
Correction: asr-1.json format, generate/apply scripts
API: 37/37 endpoints fixed and tested
Docs: HANDOVER_V2.0.md for M4
2026-05-11 07:03:22 +08:00

88 lines
3.5 KiB
Python

#!/opt/homebrew/bin/python3.11
"""
Story Embedding Pipeline:
1. Read story chunks → LLM summary (Gemma4)
2. Embed summary (EmbeddingGemma)
3. Store in chunks table + Qdrant
"""
import json, urllib.request, subprocess, sys, time, os
# File whose story chunks this run processes; used as file_uuid in the
# selection query and in every Qdrant payload.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# psql command prefix. -t (tuples only) and -A (unaligned) make psql print bare
# '|'-separated rows, which is the format the main loop splits on.
PSQL = ["/Users/accusys/pgsql/18.3/bin/psql", "-U", "accusys", "-d", "momentry", "-t", "-A"]
# Chat-completions endpoint used by call_llm() to produce summaries.
LLM_URL = "http://localhost:8082/v1/chat/completions"
# Embeddings endpoint used by call_embed().
EMBED_URL = "http://localhost:11436/v1/embeddings"
# Base URL of the Qdrant HTTP API.
QDRANT_URL = "http://localhost:6333"
# Qdrant collection that receives one point per story summary.
QDRANT_COL = "momentry_dev_stories"
def psql(sql):
    """Run one SQL statement through the psql CLI and return its output.

    Args:
        sql: SQL text handed to psql via ``-c``.

    Returns:
        psql's standard output with surrounding whitespace stripped. The
        value is returned even when psql exits with a non-zero status, so
        callers that ignore the result keep working.

    Raises:
        subprocess.TimeoutExpired: psql did not finish within 30 seconds.
    """
    result = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if result.returncode != 0:
        # Report the failure instead of dropping it: a failed SELECT would
        # otherwise look like "no rows" and a failed UPDATE like success.
        print(f"psql failed (exit {result.returncode}): {result.stderr.strip()}",
              file=sys.stderr)
    return result.stdout.strip()
def call_llm(dialogue):
    """Request a short summary of *dialogue* from the chat endpoint at LLM_URL.

    Args:
        dialogue: dialogue text inserted into the summarisation prompt.

    Returns:
        The message content of the first choice in the response, stripped.

    Raises:
        OSError: the request failed or timed out (120 s); this includes
            urllib.error.URLError and HTTPError.
        KeyError, IndexError, ValueError: the response body was not the
            expected chat-completions JSON.
    """
    prompt = f"Dialogue: {dialogue}\n\n50-word summary:"
    payload = {
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 100,
    }
    req = urllib.request.Request(
        LLM_URL,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # Context manager closes the HTTP response even if reading or decoding fails.
    with urllib.request.urlopen(req, timeout=120) as resp:
        reply = json.loads(resp.read())
    return reply["choices"][0]["message"]["content"].strip()
def call_embed(text):
    """Fetch the embedding vector for *text* from the endpoint at EMBED_URL.

    Args:
        text: the string to embed.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``
        list.

    Raises:
        OSError: the request failed or timed out (30 s); this includes
            urllib.error.URLError and HTTPError.
        KeyError, IndexError, ValueError: the response body was not the
            expected embeddings JSON.
    """
    req = urllib.request.Request(
        EMBED_URL,
        data=json.dumps({"input": text}).encode(),
        headers={"Content-Type": "application/json"},
    )
    # Context manager closes the HTTP response even if reading or decoding fails.
    with urllib.request.urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())
    return reply["data"][0]["embedding"]
# Step 0: create the Qdrant collection (768-dimensional vectors, cosine
# distance) with a PUT issued through curl. curl's output is captured and
# discarded and its exit status is not inspected.
_collection_endpoint = f"{QDRANT_URL}/collections/{QDRANT_COL}"
_collection_spec = '{"vectors":{"size":768,"distance":"Cosine"}}'
_create_collection_cmd = [
    "curl", "-s",
    "-X", "PUT", _collection_endpoint,
    "-H", "Content-Type: application/json",
    "-d", _collection_spec,
]
subprocess.run(_create_collection_cmd, capture_output=True)
# Step 1: Get all story chunks that need summaries, then summarise, embed and
# store each one (summary text in Postgres, vector + payload in Qdrant).

# Length of the placeholder zero vector; must match the vector size the
# collection was created with (768 in Step 0).
_EMBED_DIM = 768


def _parse_chunk_row(line):
    """Split one '|'-separated psql row into its typed fields.

    Returns:
        (chunk_id, chunk_index, start_time, end_time, dialogue). Only the
        first four separators are split on, so '|' inside the dialogue text
        is preserved.

    Raises:
        ValueError: the row has fewer than four fields or a non-numeric
            index/timestamp.
    """
    parts = line.split('|', 4)
    if len(parts) < 4:
        raise ValueError(f"expected at least 4 fields, got {len(parts)}")
    dialogue = parts[4] if len(parts) > 4 else ""
    return parts[0].strip(), int(parts[1]), float(parts[2]), float(parts[3]), dialogue


# NOTE(review): the commit message shown with this file mentions a
# dev.chunks -> dev.chunk rename and removal of chunk_index; verify that this
# query (and the UPDATE below) still match the live schema.
_select_sql = (
    "SELECT chunk_id, chunk_index, start_time, end_time, text_content "
    f"FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story' "
    "AND (summary_text IS NULL OR summary_text = '') ORDER BY chunk_index"
)
lines = [row for row in psql(_select_sql).split('\n') if row.strip() and '|' in row]
total = len(lines)
print(f"Chunks to process: {total}")
errors = 0
qdrant_failures = 0
for i, line in enumerate(lines):
    try:
        cid, idx, st, et, dialogue = _parse_chunk_row(line)
    except ValueError as e:
        # A row that does not parse (for example a continuation line produced
        # by text_content containing a newline) is skipped rather than
        # aborting the whole run.
        print(f"[{i+1}/{total}] Skipping malformed row {line[:80]!r} - {e}")
        errors += 1
    else:
        if len(dialogue) < 10:
            # Too little text to summarise: store a placeholder and zero vector.
            summary = "[no dialogue]"
            embedding = [0.0] * _EMBED_DIM
        else:
            try:
                summary = call_llm(dialogue)
                time.sleep(0.3)
                embedding = call_embed(summary)
            except Exception as e:
                print(f"[{i+1}/{total}] Error: {cid} - {e}")
                errors += 1
                # NOTE(review): writing "[error]" to summary_text means this
                # chunk no longer matches the selection filter
                # (summary_text IS NULL OR = '') on later runs, so it is not
                # retried automatically - confirm that is intended.
                summary = "[error]"
                embedding = [0.0] * _EMBED_DIM
        # Update DB (single quotes doubled for the SQL string literal)
        s_esc = summary.replace("'", "''")
        psql(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE chunk_id='{cid}'")
        # Store in Qdrant
        # NOTE(review): the point id is derived from chunk_index alone;
        # presumably chunks of another file_uuid with the same index would
        # map to the same point - confirm before reusing this collection
        # for more than one file.
        point = json.dumps({"points": [{
            "id": idx + 1,
            "vector": embedding,
            "payload": {"chunk_id": cid, "file_uuid": UUID, "start_time": st, "end_time": et,
                        "summary": summary, "type": "story_summary"},
        }]}).encode()
        req = urllib.request.Request(f"{QDRANT_URL}/collections/{QDRANT_COL}/points?wait=true",
                                     data=point, headers={"Content-Type": "application/json"},
                                     method="PUT")
        try:
            # 'with' closes the response; the body itself is not needed.
            with urllib.request.urlopen(req, timeout=10):
                pass
        except Exception as e:
            # Still best-effort (the run continues), but the failure is now
            # visible and Ctrl-C / SystemExit are no longer swallowed.
            print(f"[{i+1}/{total}] Qdrant upsert failed: {cid} - {e}")
            qdrant_failures += 1
    if (i+1) % 20 == 0:
        print(f"[{i+1}/{total}] {errors} errors so far")
print(f"\nDone. Processed: {total}, Errors: {errors}")
if qdrant_failures:
    print(f"Qdrant upsert failures: {qdrant_failures}")
print(f"Qdrant: {QDRANT_COL}")