Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
88 lines
3.5 KiB
Python
88 lines
3.5 KiB
Python
#!/opt/homebrew/bin/python3.11
|
|
"""
|
|
Story Embedding Pipeline:
|
|
1. Read story chunks → LLM summary (Gemma4)
|
|
2. Embed summary (EmbeddingGemma)
|
|
3. Store in chunks table + Qdrant
|
|
"""
|
|
|
|
import json, urllib.request, subprocess, sys, time, os
|
|
|
|
# file_uuid value whose story chunks this run processes.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"

# psql command prefix; -t (tuples only) and -A (unaligned) yield bare
# '|'-separated rows that the script splits by hand.
PSQL = [
    "/Users/accusys/pgsql/18.3/bin/psql",
    "-U", "accusys",
    "-d", "momentry",
    "-t",
    "-A",
]

# Chat-completions endpoint used to produce the summaries.
LLM_URL = "http://localhost:8082/v1/chat/completions"

# Embeddings endpoint used to vectorize each summary.
EMBED_URL = "http://localhost:11436/v1/embeddings"

# Qdrant base URL and the collection that receives the summary vectors.
QDRANT_URL = "http://localhost:6333"
QDRANT_COL = "momentry_dev_stories"
|
|
|
|
def psql(sql):
    """Run one SQL statement through the psql CLI and return its output.

    Args:
        sql: SQL text handed to psql via ``-c``.

    Returns:
        psql's stdout with surrounding whitespace stripped. The string is
        returned even when psql exits non-zero, so existing callers keep
        working; the failure is reported on stderr instead.

    Raises:
        subprocess.TimeoutExpired: if psql runs longer than 30 seconds.
    """
    r = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        # Without this, a failed statement is indistinguishable from an
        # empty result set because only stdout is returned.
        print(f"psql failed (exit {r.returncode}): {r.stderr.strip()}", file=sys.stderr)
    return r.stdout.strip()
|
|
|
|
def call_llm(dialogue):
    """Ask the chat-completions endpoint for a 50-word summary of a dialogue.

    Args:
        dialogue: Dialogue text to summarize; embedded verbatim in the prompt.

    Returns:
        The content of the first choice's message, stripped of surrounding
        whitespace.

    Raises:
        urllib.error.URLError: on connection or HTTP errors.
        KeyError, IndexError, ValueError: if the response body is not the
            expected JSON shape.
    """
    prompt = f"Dialogue: {dialogue}\n\n50-word summary:"
    body = json.dumps({
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 100,
    }).encode()
    req = urllib.request.Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # The context manager closes the HTTP response (and its socket) even if
    # reading or decoding fails; the original left it open.
    with urllib.request.urlopen(req, timeout=120) as resp:
        reply = json.loads(resp.read())
    return reply["choices"][0]["message"]["content"].strip()
|
|
|
|
def call_embed(text):
    """Fetch the embedding vector for ``text`` from the embeddings endpoint.

    Args:
        text: Text to embed; sent as the ``input`` field of the request.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``
        list.

    Raises:
        urllib.error.URLError: on connection or HTTP errors.
        KeyError, IndexError, ValueError: if the response body is not the
            expected JSON shape.
    """
    body = json.dumps({"input": text}).encode()
    req = urllib.request.Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())
    return reply["data"][0]["embedding"]
|
|
|
|
# Step 0: issue a PUT for the Qdrant collection (768-dim vectors, Cosine
# distance) so it exists before any points are stored.
_create_collection_cmd = [
    "curl", "-s", "-X", "PUT",
    f"{QDRANT_URL}/collections/{QDRANT_COL}",
    "-H", "Content-Type: application/json",
    "-d", '{"vectors":{"size":768,"distance":"Cosine"}}',
]
# curl's output is captured and discarded; the outcome is not inspected.
subprocess.run(_create_collection_cmd, capture_output=True)
|
|
|
|
# Step 1: collect every story chunk of this file that has no summary yet,
# ordered by chunk_index.
# NOTE(review): the change summary at the top of this file mentions
# dev.chunks -> dev.chunk and removal of chunk_index; confirm this query
# still matches the live schema.
_pending_sql = (
    "SELECT chunk_id, chunk_index, start_time, end_time, text_content "
    "FROM dev.chunks "
    f"WHERE file_uuid='{UUID}' AND chunk_type='story' "
    "AND (summary_text IS NULL OR summary_text = '') "
    "ORDER BY chunk_index"
)
_raw_rows = psql(_pending_sql).split('\n')

# Keep only non-blank rows that contain at least one field separator.
lines = [row for row in _raw_rows if row.strip() and '|' in row]

total = len(lines)
print(f"Chunks to process: {total}")
errors = 0
|
|
|
|
def _process_chunk(line, tag):
    """Summarize, persist and index a single story chunk.

    Args:
        line: One row of unaligned psql output in the form
            ``chunk_id|chunk_index|start_time|end_time|text_content``.
        tag: Progress prefix such as ``"[3/120]"`` used in error messages.

    Returns:
        True when a summary was obtained and the Qdrant upsert succeeded,
        False when the row was malformed or any remote call failed.
    """
    # maxsplit=4 keeps any '|' characters inside the dialogue text intact.
    parts = line.split('|', 4)
    try:
        cid = parts[0].strip()
        idx, st, et = int(parts[1]), float(parts[2]), float(parts[3])
    except (IndexError, ValueError) as e:
        # A malformed row (for example a fragment of multi-line text) is
        # reported and skipped instead of aborting the whole run.
        print(f"{tag} Error: malformed row {line[:60]!r} - {e}")
        return False
    dialogue = parts[4] if len(parts) > 4 else ""

    if len(dialogue) < 10:
        # Too little text to summarize: store a placeholder and a zero vector.
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        try:
            summary = call_llm(dialogue)
            time.sleep(0.3)  # short pause between the LLM and embedding calls
            embedding = call_embed(summary)
        except Exception as e:
            # Do not persist an "[error]" placeholder: the selection query
            # only picks chunks whose summary_text is empty, so writing one
            # would prevent this chunk from ever being retried.
            print(f"{tag} Error: {cid} - {e}")
            return False

    # Update DB. Single quotes are doubled for the SQL string literal.
    # NOTE(review): this relies on quote doubling being sufficient for the
    # server's string-literal settings - confirm.
    s_esc = summary.replace("'", "''")
    psql(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE chunk_id='{cid}'")

    # Store in Qdrant; the point id is derived from the chunk index.
    point = json.dumps({"points": [{
        "id": idx + 1,
        "vector": embedding,
        "payload": {"chunk_id": cid, "file_uuid": UUID, "start_time": st, "end_time": et,
                    "summary": summary, "type": "story_summary"},
    }]}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}/collections/{QDRANT_COL}/points?wait=true",
                                 data=point, headers={"Content-Type": "application/json"}, method="PUT")
    try:
        # The context manager closes the response once the upsert returns.
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        # Previously swallowed by a bare `except: pass`; report and count it
        # so a missing vector does not go unnoticed.
        print(f"{tag} Error: Qdrant upsert failed for {cid} - {e}")
        return False
    return True


# Step 2: process every pending row, counting failures and reporting
# progress every 20 rows.
for i, line in enumerate(lines):
    if not _process_chunk(line, f"[{i+1}/{total}]"):
        errors += 1

    if (i+1) % 20 == 0:
        print(f"[{i+1}/{total}] {errors} errors so far")
|
|
|
|
# Final report: row and error totals, then the Qdrant collection name.
_report = (
    f"\nDone. Processed: {total}, Errors: {errors}",
    f"Qdrant: {QDRANT_COL}",
)
for _report_line in _report:
    print(_report_line)
|