175 lines
5.2 KiB
Python
175 lines
5.2 KiB
Python
#!/usr/bin/env python3
|
|
"""批量更新 Qdrant collection 中的 file_uuid (舊→新)"""
|
|
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
|
|
# Base URL that every request helper in this script prefixes to its path.
QDRANT_URL = "http://localhost:6333"

# UUID mapping: each old UUID maps to the list of new UUIDs replacing it.
UUID_MAP = {
    "aeed71342a899fe4b4c57b7d41bcb692": ["bd80fec92b0b6963d177a2c55bf713e2"],
}

# Collections to process, visited in this order.
COLLECTIONS = [
    "momentry_dev_v1", "momentry_dev_stories", "momentry_dev_voice",
    "momentry_dev_rule1_v2", "momentry_dev_faces",
    "sentence_story", "sentence_summary",
]
|
|
|
|
|
|
def qdrant_get(path: str) -> dict:
    """Send a GET request to Qdrant via ``curl`` and decode the JSON reply.

    Args:
        path: Request path appended to ``QDRANT_URL``.

    Returns:
        The parsed JSON body, or an empty dict when curl printed nothing
        but whitespace on stdout.
    """
    command = ["curl", "-s", "-X", "GET", f"{QDRANT_URL}{path}"]
    completed = subprocess.run(command, capture_output=True, text=True)
    raw = completed.stdout
    # Silent mode plus an empty stdout is the only failure signal available.
    if not raw.strip():
        return {}
    return json.loads(raw)
|
|
|
|
|
|
def qdrant_post(path: str, body: dict) -> dict:
    """Send a POST request with a JSON body to Qdrant via ``curl``.

    The body is streamed to curl on stdin (``@-``) instead of being written
    to a fixed file under ``/tmp``: a shared, predictable temp path is
    clobbered by concurrent runs, can be pre-created or symlinked by another
    local user, and is never cleaned up.

    Args:
        path: Request path appended to ``QDRANT_URL``.
        body: JSON-serialisable request body.

    Returns:
        The parsed JSON reply, or an empty dict when curl printed nothing
        but whitespace on stdout.
    """
    res = subprocess.run(
        ["curl", "-s", "-X", "POST", f"{QDRANT_URL}{path}",
         "-H", "Content-Type: application/json",
         # --data-binary sends stdin verbatim; "@-" means "read from stdin".
         "--data-binary", "@-"],
        input=json.dumps(body),
        capture_output=True, text=True,
    )
    return json.loads(res.stdout) if res.stdout.strip() else {}
|
|
|
|
|
|
def qdrant_put(path: str, body: dict) -> dict:
    """Send a PUT request with a JSON body to Qdrant via ``curl``.

    The body is streamed to curl on stdin (``@-``) instead of being written
    to a fixed file under ``/tmp``: a shared, predictable temp path is
    clobbered by concurrent runs, can be pre-created or symlinked by another
    local user, and is never cleaned up.

    Args:
        path: Request path appended to ``QDRANT_URL``.
        body: JSON-serialisable request body.

    Returns:
        The parsed JSON reply, or an empty dict when curl printed nothing
        but whitespace on stdout.
    """
    res = subprocess.run(
        ["curl", "-s", "-X", "PUT", f"{QDRANT_URL}{path}",
         "-H", "Content-Type: application/json",
         # --data-binary sends stdin verbatim; "@-" means "read from stdin".
         "--data-binary", "@-"],
        input=json.dumps(body),
        capture_output=True, text=True,
    )
    return json.loads(res.stdout) if res.stdout.strip() else {}
|
|
|
|
|
|
def scroll_all(collection: str, filter_old: dict) -> list:
    """Scroll all matching points from a collection.

    Pages through ``/collections/{collection}/points/scroll`` in chunks of
    1000, requesting payloads and vectors, until the reply carries no
    ``next_page_offset``.

    Args:
        collection: Name of the collection to scroll.
        filter_old: Filter object sent as the ``filter`` field of each request.

    Returns:
        Every point returned across all pages. If a reply has no usable
        ``result`` (for example an error reply), scrolling stops and the
        points gathered so far are returned.
    """
    points = []
    offset = None
    while True:
        body = {
            "limit": 1000,
            "with_payload": True,
            "with_vector": True,
            "filter": filter_old,
        }
        # Compare against None explicitly: a falsy offset such as 0 must still
        # be sent, otherwise the scroll restarts from the first page forever.
        if offset is not None:
            body["offset"] = offset
        response = qdrant_post(f"/collections/{collection}/points/scroll", body)
        # "result" may be absent or null; treat both as an empty page.
        page = response.get("result") or {}
        points.extend(page.get("points") or [])
        offset = page.get("next_page_offset")
        if offset is None:
            break
    return points
|
|
|
|
|
|
def update_points(collection: str, points: list, old_uuid: str, new_uuid: str):
    """Update file_uuid in payload for the given points.

    For every point whose payload has ``uuid`` and/or ``file_uuid`` equal to
    ``old_uuid``, the matching field(s) are set to ``new_uuid`` and the point
    (id, vector, rewritten payload) is re-sent to Qdrant in batches of 500.

    Args:
        collection: Name of the collection the points belong to.
        points: Point dicts carrying ``id``, ``vector`` and ``payload``.
        old_uuid: Value to look for in the payload.
        new_uuid: Value to write in its place.

    Returns:
        The number of points written. When a batch is rejected, the error is
        printed and the count written before that batch is returned. Returns
        0 when nothing needs to change.
    """
    if not points:
        return 0

    updated = []
    for p in points:
        # Work on a copy so the caller's point dicts are left untouched, and
        # tolerate a missing or null payload instead of raising.
        pl = dict(p.get("payload") or {})
        changed = False
        # Check both 'uuid' and 'file_uuid' fields
        for key in ("uuid", "file_uuid"):
            if pl.get(key) == old_uuid:
                pl[key] = new_uuid
                changed = True
        if changed:
            updated.append({
                "id": p["id"],
                "vector": p["vector"],
                "payload": pl,
            })

    if not updated:
        return 0

    # Update in batches of 500
    total = len(updated)
    for i in range(0, total, 500):
        batch = updated[i:i + 500]
        result = qdrant_put(
            f"/collections/{collection}/points?wait=true",
            {"points": batch},
        )
        if result.get("status") != "ok":
            print(f" Error at {i}: {result}")
            # i is exactly the number of points written by earlier batches.
            return i
    return total
|
|
|
|
|
|
def _uuid_filter(uuid: str) -> dict:
    """Build the filter body with a ``should`` clause over ``uuid`` and ``file_uuid``."""
    return {
        "must": [
            {"should": [
                {"key": "uuid", "match": {"value": uuid}},
                {"key": "file_uuid", "match": {"value": uuid}},
            ]}
        ]
    }


def main():
    """Rewrite old UUIDs in every configured collection, then print counts.

    Phase 1 walks ``COLLECTIONS`` and, for each ``UUID_MAP`` entry, scrolls
    the points that carry the old UUID and re-sends them with the new one.
    Collections whose info request has no ``result`` are skipped.

    Phase 2 asks each collection how many points carry the old UUID and the
    first new UUID, and prints every non-zero count.
    """
    for collection in COLLECTIONS:
        # Check if collection exists
        info = qdrant_get(f"/collections/{collection}")
        if "result" not in info:
            continue

        for old_uuid, new_uuids in UUID_MAP.items():
            for new_uuid in new_uuids:
                # Scroll all points with this old UUID
                points = scroll_all(collection, _uuid_filter(old_uuid))
                if not points:
                    continue

                print(f"{collection}: {len(points)} points with UUID {old_uuid[:8]}...")
                updated = update_points(collection, points, old_uuid, new_uuid)
                print(f" → {updated} points updated to {new_uuid[:8]}...")

    # Verify
    print("\n=== Verification ===")
    for collection in COLLECTIONS:
        for old_uuid, new_uuids in UUID_MAP.items():
            # Slice rather than index so an empty replacement list skips the
            # "new" check instead of raising IndexError.
            targets = [("old", old_uuid)] + [("new", u) for u in new_uuids[:1]]
            for what, uuid in targets:
                result = qdrant_post(
                    f"/collections/{collection}/points/count",
                    {"filter": _uuid_filter(uuid)},
                )
                # "result" may be absent or null (e.g. unknown collection).
                cnt = (result.get("result") or {}).get("count", 0)
                if cnt > 0:
                    print(f" {collection}: {cnt} points with {what} UUID")
    print("✅ Done")
|
|
|
|
|
|
# Run the update only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|