#!/opt/homebrew/bin/python3.11
"""
LLM-clean all 4188 sentence texts, re-embed, update momentry_dev_v1 + sentence_story.

Pipeline:
  1. Load every 'sentence' chunk for UUID from dev.chunks (PostgreSQL).
  2. Ask the LLM to clean each line, then embed the cleaned text.
  3. Drop and rebuild the momentry_dev_v1 Qdrant collection.
  4. Drop and rebuild the sentence_story Qdrant collection.
"""
import json
import os
import time
from urllib.request import Request, urlopen

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
CHECKPOINT = f"/tmp/sentence_clean_{UUID}.json"

LLM_MODEL = "google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
# Vector size used for both Qdrant collections and for the zero-vector fallback.
EMBED_DIM = 768
BATCH_SIZE = 100

# NOTE(review): the source this was recovered from had its line breaks
# collapsed, so the exact newline layout of the prompt below is reconstructed
# -- confirm against the prompt that was originally used.
_PROMPT_HEADER = (
    "Clean this movie dialogue line. Fix truncated words, capitalize, add punctuation.\n"
    'Return: SPEAKER: "clean text"\n'
    "\n"
    "Input: [Cary Grant] can't you do something constructive like start\n"
    'Return: Cary Grant: "Can\'t you do something constructive like start?"\n'
    "\n"
    "Input: [Audrey Hepburn] qui se présente influence d'une manière vitale la proposition l\n"
    'Return: Audrey Hepburn: "Qui se présente influence d\'une manière vitale '
    'la proposition..."\n'
    "\n"
)


def _request(url, payload=None, *, method=None, timeout=None):
    """Send one HTTP request and return the raw response body.

    When *payload* is given it is JSON-encoded and sent with a JSON
    content type. The response is always closed before returning.
    Errors raised by ``urlopen`` propagate to the caller.
    """
    data = None
    headers = {}
    if payload is not None:
        data = json.dumps(payload).encode()
        headers["Content-Type"] = "application/json"
    req = Request(url, data=data, headers=headers, method=method)
    with urlopen(req, timeout=timeout) as resp:
        return resp.read()


def call_llm(prompt):
    """Send *prompt* as a single user message to LLM_URL and return the stripped reply text."""
    raw = _request(
        LLM_URL,
        {
            "model": LLM_MODEL,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.1,
            "max_tokens": 80,
        },
        timeout=30,
    )
    return json.loads(raw)["choices"][0]["message"]["content"].strip()


def call_embed(text):
    """Return the first embedding that EMBED_URL produces for *text*."""
    raw = _request(EMBED_URL, {"input": text}, timeout=30)
    return json.loads(raw)["data"][0]["embedding"]


def build_prompt(input_text):
    """Return the few-shot cleaning prompt with *input_text* as the final input."""
    return f"{_PROMPT_HEADER}Input: {input_text}\nReturn:"


def load_sentences(file_uuid=UUID):
    """Fetch ``(id, chunk_id, text_content)`` rows for every sentence chunk of *file_uuid*.

    Rows are ordered by ``id``. The cursor and connection are closed even
    when the query fails.
    """
    # Imported here so the pure helpers in this module can be imported
    # without the database driver being installed.
    import psycopg2

    conn = psycopg2.connect(DB_URL)
    try:
        with conn.cursor() as cur:
            cur.execute(
                """
                SELECT id, chunk_id, text_content
                FROM dev.chunks
                WHERE file_uuid = %s AND chunk_type = 'sentence'
                ORDER BY id
                """,
                (file_uuid,),
            )
            return cur.fetchall()
    finally:
        conn.close()


def reset_checkpoint(path=CHECKPOINT):
    """Delete a checkpoint left by an earlier run (incompatible with old chunk_index format)."""
    try:
        os.remove(path)
    except FileNotFoundError:
        return
    print("Old checkpoint removed (format changed)")


def clean_sentences(rows, *, llm=call_llm, embed=call_embed,
                    checkpoint_path=CHECKPOINT, pause=0.1):
    """Clean and embed every row; return ``(results, error_count)``.

    *rows* is a sequence of ``(id, chunk_id, text_content)`` tuples. Each
    result is a dict with keys ``index``, ``chunk_id``, ``original``,
    ``cleaned`` and ``embedding``, in the same order as *rows*.

    If the LLM or embedding call fails for a row, the error is printed, the
    original text is kept as ``cleaned`` and a zero vector of EMBED_DIM is
    stored, so one bad row never aborts the run.
    NOTE(review): zero vectors carry no direction for cosine search --
    consider skipping or retrying such rows instead.

    *llm*, *embed*, *checkpoint_path* (``None`` disables the checkpoint file)
    and *pause* (seconds slept after each successful row) exist so the loop
    can be driven without the real services.
    """
    results = []
    errors = 0
    total = len(rows)
    for i, (cid, chunk_id, text_content) in enumerate(rows):
        try:
            cleaned = llm(build_prompt(text_content))
            embedding = embed(cleaned)
            time.sleep(pause)
        except Exception as exc:  # per-row boundary: log, fall back, keep going
            print(f" [{i+1}/{total}] id={cid} chunk={chunk_id} ERROR: {exc}")
            cleaned = text_content
            embedding = [0.0] * EMBED_DIM
            errors += 1

        results.append({
            "index": i,
            "chunk_id": chunk_id,
            "original": text_content,
            "cleaned": cleaned,
            "embedding": embedding,
        })

        # NOTE(review): the checkpoint only records the last processed index
        # and nothing in this script reads it back; it is a progress marker,
        # not a resume point.
        if checkpoint_path is not None:
            with open(checkpoint_path, "w", encoding="utf-8") as fh:
                json.dump({"last": i}, fh)

        if (i + 1) % 50 == 0:
            print(f" [{i+1}/{total}] chunk={chunk_id} errors={errors}")
    return results, errors


def build_points(results, *, include_original):
    """Turn cleaning results into Qdrant point dicts with 1-based integer ids.

    The payload always carries ``chunk_type``, ``uuid``, ``chunk_id`` and
    ``text``; ``original`` is added only when *include_original* is true.
    """
    points = []
    for point_id, entry in enumerate(results, start=1):
        payload = {
            "chunk_type": "sentence",
            "uuid": UUID,
            "chunk_id": entry["chunk_id"],
            "text": entry["cleaned"],
        }
        if include_original:
            payload["original"] = entry["original"]
        points.append({"id": point_id, "vector": entry["embedding"], "payload": payload})
    return points


def rebuild_collection(name, results, *, include_original, batch_size=BATCH_SIZE):
    """Drop, recreate and refill the Qdrant collection *name*; return the failed batch count.

    Deleting the old collection and uploading each batch are best-effort
    (failures are printed and the run continues); failing to create the
    collection raises.
    """
    collection_url = f"{QDRANT_URL}/collections/{name}"

    # Delete old
    try:
        _request(collection_url, method="DELETE")
        time.sleep(0.5)
    except Exception as exc:  # best-effort: the collection may simply not exist yet
        print(f" delete {name} skipped: {exc}")

    _request(
        collection_url,
        {"vectors": {"size": EMBED_DIM, "distance": "Cosine"}},
        method="PUT",
    )
    time.sleep(0.5)

    points = build_points(results, include_original=include_original)
    failed_batches = 0
    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        try:
            _request(f"{collection_url}/points?wait=true", {"points": batch}, method="PUT")
        except Exception as exc:  # best-effort: report and continue with the next batch
            failed_batches += 1
            print(f" batch {start}: {exc}")
        if (start // batch_size) % 5 == 0:
            print(f" {name}: {start+len(batch)}/{len(points)}")

    print(f" {name} done")
    if failed_batches:
        print(f" WARNING: {failed_batches} batch(es) failed to upload to {name}")
    return failed_batches


def verify_collection(name):
    """Print the point count and vector size that Qdrant reports for collection *name*."""
    info = json.loads(_request(f"{QDRANT_URL}/collections/{name}"))["result"]
    size = info["config"]["params"]["vectors"].get("size", "?")
    print(f"Verified {name}: {info['points_count']} pts, {size}D")


def main():
    """Run the full load -> clean/embed -> rebuild -> verify pipeline."""
    print("=== Step 1: Load all sentences ===")
    rows = load_sentences()
    print(f"Loaded {len(rows)} sentences")

    # Reset checkpoint (incompatible with old chunk_index format)
    reset_checkpoint()

    print("\n=== Step 2: LLM clean + embed ===")
    results, errors = clean_sentences(rows)
    print(f"\nDone: {len(results)} cleaned, {errors} errors")

    print("\n=== Step 3: Rebuild momentry_dev_v1 ===")
    rebuild_collection("momentry_dev_v1", results, include_original=True)

    print("\n=== Step 4: Rebuild sentence_story ===")
    rebuild_collection("sentence_story", results, include_original=False)

    # Verify
    for col in ("momentry_dev_v1", "sentence_story"):
        verify_collection(col)

    print("\n=== Done ===")


if __name__ == "__main__":
    main()