feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks -> dev.chunk; remove old_chunk_id/chunk_index. Correction: asr-1.json format; generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
This commit is contained in:
173
scripts/clean_sentence_text.py
Normal file
173
scripts/clean_sentence_text.py
Normal file
@@ -0,0 +1,173 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
LLM-clean all 4188 sentence texts, re-embed them, then drop and rebuild the Qdrant collections momentry_dev_v1 and sentence_story.
|
||||
"""
|
||||
import json, time, os
|
||||
from urllib.request import Request, urlopen
|
||||
import psycopg2
|
||||
|
||||
# Runtime configuration. Every value can be overridden through an environment
# variable so the script can target another file or host without being edited;
# the defaults are the values the script was originally written with.

# file_uuid of the source file whose sentence chunks are processed.
UUID = os.environ.get("MOMENTRY_FILE_UUID", "aeed71342a899fe4b4c57b7d41bcb692")

# PostgreSQL DSN handed to psycopg2.connect().
DB_URL = os.environ.get(
    "MOMENTRY_DB_URL", "postgresql://accusys@localhost:5432/momentry?host=/tmp"
)

# Base URL of the Qdrant REST API (no trailing slash).
QDRANT_URL = os.environ.get("MOMENTRY_QDRANT_URL", "http://localhost:6333")

# Chat-completions endpoint used to clean each sentence.
LLM_URL = os.environ.get(
    "MOMENTRY_LLM_URL", "http://localhost:8082/v1/chat/completions"
)

# Embeddings endpoint used to vectorise the cleaned sentence.
EMBED_URL = os.environ.get(
    "MOMENTRY_EMBED_URL", "http://localhost:11436/v1/embeddings"
)

# Progress marker file, one per processed file UUID.
CHECKPOINT = f"/tmp/sentence_clean_{UUID}.json"
|
||||
|
||||
def call_llm(prompt):
    """Send ``prompt`` as a single user message to the chat-completions endpoint.

    Posts a JSON request to ``LLM_URL`` with a 30 second timeout and returns
    the ``content`` of the first choice's message with surrounding whitespace
    stripped.

    Errors from the HTTP call, JSON decoding, or a response lacking the
    expected ``choices[0].message.content`` path propagate to the caller.
    """
    payload = {
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": prompt}],
        "temperature": 0.1,
        "max_tokens": 80,
    }
    req = Request(
        LLM_URL,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    # Close the response explicitly; this function is called once per sentence,
    # so relying on garbage collection would leave sockets open across the run.
    with urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())
    return reply["choices"][0]["message"]["content"].strip()
|
||||
|
||||
def call_embed(text):
    """Request an embedding for ``text`` from the embeddings endpoint.

    Posts ``{"input": text}`` to ``EMBED_URL`` with a 30 second timeout and
    returns the ``embedding`` field of the first entry in the response's
    ``data`` list.

    Errors from the HTTP call, JSON decoding, or a response lacking the
    expected ``data[0].embedding`` path propagate to the caller.
    """
    req = Request(
        EMBED_URL,
        data=json.dumps({"input": text}).encode(),
        headers={"Content-Type": "application/json"},
    )
    # Close the response explicitly so each call releases its socket.
    with urlopen(req, timeout=30) as resp:
        reply = json.loads(resp.read())
    return reply["data"][0]["embedding"]
|
||||
|
||||
print("=== Step 1: Load all sentences ===")
# Fetch every sentence chunk of the target file, ordered by primary key so the
# position of a row in `rows` is stable between runs.
# NOTE(review): the accompanying commit message mentions a rename
# dev.chunks -> dev.chunk; this query still reads dev.chunks - confirm which
# table name is current before running.
conn = psycopg2.connect(DB_URL)
try:
    # The cursor context manager closes the cursor; the connection itself is
    # closed in `finally` so it is released even when the query fails.
    with conn.cursor() as cur:
        cur.execute("""
            SELECT id, chunk_id, text_content
            FROM dev.chunks
            WHERE file_uuid = %s AND chunk_type = 'sentence'
            ORDER BY id
        """, (UUID,))
        rows = cur.fetchall()
finally:
    conn.close()
print(f"Loaded {len(rows)} sentences")
|
||||
|
||||
# Reset checkpoint (incompatible with old chunk_index format).
# Attempt the removal directly instead of checking existence first, so there
# is no window in which the file can disappear between the check and the
# delete. Any other OSError (e.g. permissions) still propagates.
try:
    os.remove(CHECKPOINT)
except FileNotFoundError:
    pass
else:
    print("Old checkpoint removed (format changed)")
|
||||
|
||||
results = []
errors = 0

# Length of the placeholder vector stored when cleaning/embedding fails; it
# matches the vector size the Qdrant collections are created with below.
FALLBACK_VECTOR_SIZE = 768

print("\n=== Step 2: LLM clean + embed ===")
for i, (cid, chunk_id, text_content) in enumerate(rows):
    input_text = text_content

    # Few-shot prompt: two worked examples followed by the line to clean.
    prompt = f"""Clean this movie dialogue line. Fix truncated words, capitalize, add punctuation.
Return: SPEAKER: "clean text"

Input: [Cary Grant] can't you do something constructive like start
Return: Cary Grant: "Can't you do something constructive like start?"

Input: [Audrey Hepburn] qui se présente influence d'une manière vitale la proposition l
Return: Audrey Hepburn: "Qui se présente influence d'une manière vitale la proposition..."

Input: {input_text}
Return:"""

    try:
        cleaned = call_llm(prompt)
        embedding = call_embed(cleaned)
        # Brief pause between successful round-trips to the local services.
        time.sleep(0.1)
    except Exception as e:
        # Best effort: keep the row, fall back to the uncleaned text and a
        # placeholder vector, and count the failure.
        # NOTE(review): an all-zero vector has no direction, so its cosine
        # similarity is undefined - confirm how Qdrant handles such points, or
        # consider skipping failed rows instead of indexing them.
        print(f" [{i+1}/{len(rows)}] id={cid} chunk={chunk_id} ERROR: {e}")
        cleaned = input_text
        embedding = [0.0] * FALLBACK_VECTOR_SIZE
        errors += 1

    entry = {
        "index": i,
        "chunk_id": chunk_id,
        "original": input_text,
        "cleaned": cleaned,
        "embedding": embedding,
    }
    results.append(entry)

    # Record the last processed index. The file is opened with `with` so the
    # handle is flushed and closed on every iteration instead of leaking one
    # descriptor per row.
    with open(CHECKPOINT, "w") as checkpoint_file:
        json.dump({"last": i}, checkpoint_file)

    if (i + 1) % 50 == 0:
        print(f" [{i+1}/{len(rows)}] chunk={chunk_id} errors={errors}")

# Rows are appended in index order already; the sort keeps that explicit.
results.sort(key=lambda x: x["index"])

print(f"\nDone: {len(results)} cleaned, {errors} errors")
|
||||
|
||||
def _rebuild_collection(name, results, *, include_original, vector_size=768, batch_size=100):
    """Drop, recreate and repopulate the Qdrant collection ``name``.

    Args:
        name: Collection name under ``QDRANT_URL``.
        results: Entries with ``chunk_id``, ``original``, ``cleaned`` and
            ``embedding`` keys; point ids are assigned as position + 1.
        include_original: When true, the uncleaned text is stored in the
            payload under ``original`` in addition to ``text``.
        vector_size: Vector size the collection is created with.
        batch_size: Number of points sent per upsert request.

    A failure to delete the old collection or to upload a batch is reported
    and the run continues; a failure to create the collection propagates.
    """
    base = f"{QDRANT_URL}/collections/{name}"
    json_headers = {"Content-Type": "application/json"}

    # Delete old. Best effort, but report the reason instead of hiding it, and
    # do not swallow KeyboardInterrupt/SystemExit the way a bare except would.
    try:
        with urlopen(Request(base, method="DELETE")):
            pass
        time.sleep(0.5)
    except Exception as e:
        print(f" delete {name} skipped: {e}")

    # Create fresh with cosine distance.
    create_req = Request(
        base,
        data=json.dumps({"vectors": {"size": vector_size, "distance": "Cosine"}}).encode(),
        headers=json_headers,
        method="PUT",
    )
    with urlopen(create_req):
        pass
    time.sleep(0.5)

    points = []
    for pi, r in enumerate(results):
        payload = {
            "chunk_type": "sentence",
            "uuid": UUID,
            "chunk_id": r["chunk_id"],
            "text": r["cleaned"],
        }
        if include_original:
            payload["original"] = r["original"]
        points.append({"id": pi + 1, "vector": r["embedding"], "payload": payload})

    failed_batches = 0
    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        upsert_req = Request(
            f"{base}/points?wait=true",
            data=json.dumps({"points": batch}).encode(),
            headers=json_headers,
            method="PUT",
        )
        try:
            with urlopen(upsert_req):
                pass
        except Exception as e:
            failed_batches += 1
            print(f" batch {start}: {e}")
        # Progress line on every fifth batch.
        if (start // batch_size) % 5 == 0:
            print(f" {name}: {start+len(batch)}/{len(points)}")

    if failed_batches:
        # Make partial uploads visible rather than ending on an unqualified "done".
        print(f" {name}: {failed_batches} batch(es) failed to upload")
    print(f" {name} done")


print("\n=== Step 3: Rebuild momentry_dev_v1 ===")
batch_size = 100
_rebuild_collection("momentry_dev_v1", results, include_original=True, batch_size=batch_size)

print("\n=== Step 4: Rebuild sentence_story ===")
_rebuild_collection("sentence_story", results, include_original=False, batch_size=batch_size)
|
||||
|
||||
# Verify: read each collection's info back from Qdrant and print its point
# count and configured vector size.
for col in ["momentry_dev_v1", "sentence_story"]:
    # Close the HTTP response once its body has been read.
    with urlopen(f"{QDRANT_URL}/collections/{col}") as http_resp:
        resp = json.loads(http_resp.read())
    info = resp["result"]
    print(f"Verified {col}: {info['points_count']} pts, {info['config']['params']['vectors'].get('size','?')}D")

print("\n=== Done ===")
|
||||
Reference in New Issue
Block a user