feat: Phase 1 handover - schema migration, correction mechanism, API fixes
Schema changes: dev.chunks -> dev.chunk, remove old_chunk_id/chunk_index. Correction: asr-1.json format, generate/apply scripts. API: 37/37 endpoints fixed and tested. Docs: HANDOVER_V2.0.md for M4.
This commit is contained in:
163
scripts/apply_asr_corrections.py
Normal file
163
scripts/apply_asr_corrections.py
Normal file
@@ -0,0 +1,163 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Apply asr-1.json corrections to dev.chunks.
|
||||
DELETE old chunks, INSERT corrected chunks.
|
||||
PRESERVE chunk_vectors by renaming old chunk_id to new corrected IDs.
|
||||
"""
|
||||
import json, os, subprocess, sys, time
|
||||
|
||||
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
|
||||
DB_USER = "accusys"
|
||||
DB_NAME = "momentry"
|
||||
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DRY_RUN = "--dry-run" in sys.argv
|
||||
|
||||
|
||||
def psql(sql, raw=False):
    """Run one SQL statement through the psql command-line client.

    Args:
        sql: SQL text handed to ``psql -c``.
        raw: When false (the default) psql is run with ``-t -A`` so the
            output is tuples-only and unaligned; when true psql's default
            formatting is kept.

    Returns:
        ``(stdout, None)`` on success, with stdout stripped, or
        ``(None, message)`` on failure. ``message`` is never empty, so
        callers can rely on ``if err:`` to detect a failure.
    """
    args = [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME]
    if not raw:
        args += ["-t", "-A"]
    args += ["-c", sql]
    try:
        r = subprocess.run(args, capture_output=True, text=True, timeout=15)
    except subprocess.TimeoutExpired:
        # Keep the (out, err) contract instead of crashing the caller mid-run.
        return None, "psql timed out after 15s"
    except OSError as e:
        # e.g. the psql binary is missing or not executable.
        return None, f"psql could not be started: {e}"[:200]
    if r.returncode != 0:
        # stderr can be empty on failure; fall back to the exit status so the
        # error value is always truthy.
        return None, r.stderr[:200] or f"psql exited with status {r.returncode}"
    return r.stdout.strip(), None
|
||||
|
||||
|
||||
def esc(val):
    """Render *val* as a SQL literal.

    ``None`` becomes the bare keyword ``NULL``; anything else is converted
    with ``str()`` and wrapped in single quotes, with embedded single quotes
    doubled.
    """
    if val is not None:
        quoted = str(val).replace("'", "''")
        return f"'{quoted}'"
    return "NULL"
|
||||
|
||||
|
||||
def _child_chunk_id(correction, position, child):
    """Return the chunk ID of a corrected child, defaulting to '<parent>-NN'."""
    return child.get("new_chunk_id", f"{correction['parent_chunk_index']}-{position + 1:02d}")


def _chunk_insert_sql(chunk_id, item, fps):
    """Build the INSERT statement for one sentence chunk taken from asr-1.json."""
    sf = item["start_frame"]
    ef = item["end_frame"]
    # The division raises TypeError for non-numeric frames, so sf/ef are known
    # to be numbers before they are interpolated unquoted below.
    st = round(sf / fps, 3)
    et = round(ef / fps, 3)
    # NOTE(review): the commit message says old_chunk_id/chunk_index were
    # removed from the schema, yet this statement still writes them — confirm
    # against the live table definition.
    return (
        "INSERT INTO dev.chunks "
        "(file_uuid, chunk_id, old_chunk_id, chunk_index, chunk_type, "
        "start_time, end_time, start_frame, end_frame, text_content, fps, content) "
        "VALUES ("
        f"{esc(UUID)}, {esc(chunk_id)}, {esc(chunk_id)}, 0, 'sentence', "
        f"{esc(st)}, {esc(et)}, {sf}, {ef}, {esc(item['text_content'])}, {fps}, "
        "'{\"source\": \"asr-1\"}'::jsonb"
        ");"
    )


def main():
    """Apply the asr-1.json corrections for UUID to dev.chunks / dev.chunk_vectors.

    Steps:
      1. Delete the existing sentence chunks of the file.
      2. For every corrected parent chunk, clone its row in dev.chunk_vectors
         to each new child ID, then remove the parent row.
      3. Re-insert the kept chunks.
      4. Insert the corrected child chunks.

    With ``--dry-run`` (DRY_RUN) no SQL is executed; only the counts are
    printed. The script exits with an error message if step 1 fails.
    """
    t0 = time.time()
    fps = 24.0
    errors = 0

    def report(err):
        # Count every failed statement but only print the first three.
        nonlocal errors
        errors += 1
        if errors <= 3:
            print(f" ERROR: {err[:120]}")

    with open(os.path.join(OUTPUT_DIR, f"{UUID}.asr-1.json"), encoding="utf-8") as fh:
        d = json.load(fh)
    kept = d["kept"]
    corrections = d["corrections"]

    n_corrected = sum(len(c["corrected"]) for c in corrections)
    total = len(kept) + n_corrected
    print(f"Kept: {len(kept)}, Corrected chunks: {n_corrected}, Total: {total}\n")

    # Step 1: DELETE old sentence chunks
    if not DRY_RUN:
        _, err = psql(f"DELETE FROM dev.chunks WHERE file_uuid={esc(UUID)} AND chunk_type='sentence';")
        if err:
            # Inserting on top of undeleted rows would duplicate every chunk.
            sys.exit(f"Step 1/4 failed, aborting before any insert: {err}")
    print(f"Step 1/4: Deleted old chunks (dry_run={DRY_RUN})")

    # Step 2: RENAME chunk_vectors: old chunk_id -> new corrected IDs
    # For kept chunks: chunk_id unchanged -> no action needed
    # For corrections: clone the vector to each new child ID
    vec_renamed = 0
    for c in corrections:
        old_id = str(c["parent_chunk_index"])
        new_ids = [
            _child_chunk_id(c, si, child) for si, child in enumerate(c["corrected"])
        ]
        if DRY_RUN:
            vec_renamed += len(new_ids)  # assume the parent vector exists
            continue

        where = f"uuid={esc(UUID)} AND chunk_id={esc(old_id)}"
        out, err = psql(f"SELECT count(*) FROM dev.chunk_vectors WHERE {where}")
        if err:
            report(err)
            continue
        if not (out and out.isdigit() and int(out) > 0):
            continue  # parent has no vector: nothing to clone

        # Clone inside the database (INSERT ... SELECT) so the embedding is
        # never round-tripped through psql text output, and only delete the
        # parent row once every clone has succeeded.
        all_cloned = True
        keeps_parent_id = False
        for new_id in new_ids:
            if str(new_id) == old_id:
                # The vector is already stored under this ID.
                keeps_parent_id = True
                vec_renamed += 1
                continue
            _, err = psql(
                "INSERT INTO dev.chunk_vectors (chunk_id, uuid, chunk_type, embedding) "
                f"SELECT {esc(new_id)}, uuid, 'sentence', embedding "
                f"FROM dev.chunk_vectors WHERE {where} LIMIT 1"
            )
            if err:
                all_cloned = False
                report(err)
            else:
                vec_renamed += 1

        if all_cloned and not keeps_parent_id:
            _, err = psql(f"DELETE FROM dev.chunk_vectors WHERE {where}")
            if err:
                report(err)

    print(f"Step 2/4: chunk_vectors renamed: {vec_renamed} new entries (dry_run={DRY_RUN})")

    # Step 3: INSERT kept chunks
    batch = [_chunk_insert_sql(str(k["chunk_index"]), k, fps) for k in kept]

    # Step 4: INSERT corrected chunks
    for c in corrections:
        for si, child in enumerate(c["corrected"]):
            batch.append(_chunk_insert_sql(_child_chunk_id(c, si, child), child, fps))

    # Execute batch (one statement per psql call, progress every 100)
    for bs in range(0, len(batch), 100):
        be = min(bs + 100, len(batch))
        if not DRY_RUN:
            for s in batch[bs:be]:
                _, err = psql(s)
                if err:
                    report(err)
        pct = be * 100 // len(batch)
        print(f" Steps 3+4/4: [{be}/{len(batch)}] {pct}% err={errors} [{time.time()-t0:.0f}s]")

    # Verify
    if not DRY_RUN:
        verify_sql = (
            f"SELECT count(*) FROM dev.chunks WHERE file_uuid={esc(UUID)} AND chunk_type='sentence'",
            f"SELECT count(*) FROM dev.chunk_vectors WHERE uuid={esc(UUID)}",
            "SELECT count(*) FROM dev.chunk_vectors cv "
            "JOIN dev.chunks c ON c.file_uuid=cv.uuid AND c.chunk_id=cv.chunk_id "
            f"WHERE cv.uuid={esc(UUID)}",
        )
        counts = []
        for sql in verify_sql:
            out, err = psql(sql)
            # A failed verification query must not crash the final report.
            counts.append("?" if err else out)
        print(f"\n Verify: {counts[0]} chunks, {counts[1]} vectors, {counts[2]} matched")

    print(f"\n{'='*50}")
    print("DRY RUN" if DRY_RUN else "APPLIED")
    print(f" Total chunks: {len(batch)}")
    print(f" Vectors renamed: {vec_renamed}")
    print(f" Errors: {errors}")
    print(f" Time: {time.time()-t0:.1f}s")
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
83
scripts/asr_model_benchmark.py
Normal file
83
scripts/asr_model_benchmark.py
Normal file
@@ -0,0 +1,83 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Comprehensive ASR Model Selection Benchmark
|
||||
Tests 5 models × 2 VAD settings across 3 test clips.
|
||||
Output: JSON results + markdown report
|
||||
"""
|
||||
import json, time, os, gc, sys
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
CLIPS = {
|
||||
"A_rapid": {"path": "/tmp/asr_clip_A.mp4", "offset": 1540},
|
||||
"B_normal": {"path": "/tmp/asr_clip_B.mp4", "offset": 600},
|
||||
"C_complex": {"path": "/tmp/asr_clip_C.mp4", "offset": 4400},
|
||||
}
|
||||
|
||||
MODELS = ["tiny", "base", "small", "medium", "large-v3"]
|
||||
VAD_SETTINGS = [200, 500] # min_silence_duration_ms
|
||||
|
||||
RESULTS_FILE = "/tmp/asr_benchmark_results.json"
|
||||
|
||||
def run_transcribe(model, clip_path, clip_name, vad_ms):
    """Transcribe one clip and time it.

    Args:
        model: Object exposing ``transcribe(path, beam_size=..., vad_filter=...,
            vad_parameters=...)`` that returns ``(segments, info)``.
        clip_path: Path of the media file to transcribe.
        clip_name: Label of the clip (not used by this function).
        vad_ms: Value passed as ``min_silence_duration_ms`` to the VAD filter.

    Returns:
        ``(segs, info, elapsed)`` where ``segs`` is a list of
        ``{"start", "end", "text"}`` dicts (times rounded to 2 decimals, text
        stripped), ``info`` is whatever the model returned, and ``elapsed`` is
        the wall-clock time in seconds.
    """
    started = time.time()
    segments, info = model.transcribe(
        clip_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": vad_ms},
    )
    # The segments are consumed here, inside the timed region.
    collected = [
        {"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()}
        for s in segments
    ]
    return collected, info, time.time() - started
|
||||
|
||||
# Load existing results so already-completed (clip, model, VAD) runs are skipped.
all_results = {}
if os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE) as fh:
        all_results = json.load(fh)
    print(f"Loaded {sum(len(v) for v in all_results.values())} existing results")

total = len(CLIPS) * len(MODELS) * len(VAD_SETTINGS)
done = sum(len(v) for v in all_results.values())
print(f"Total: {total} tests, {done} already done, {total-done} remaining\n")

for clip_name, clip_cfg in CLIPS.items():
    if clip_name not in all_results:
        all_results[clip_name] = {}

    for model_size in MODELS:
        for vad_ms in VAD_SETTINGS:
            key = f"{model_size}_vad{vad_ms}"
            if key in all_results[clip_name]:
                continue

            print(f"[{clip_name}] {model_size} VAD={vad_ms}ms ...", end=" ", flush=True)
            # The model is loaded per run so each result carries its own load time.
            t_load = time.time()
            model = WhisperModel(model_size, device="cpu", compute_type="int8")
            load_time = time.time() - t_load

            segs, info, trans_time = run_transcribe(model, clip_cfg["path"], clip_name, vad_ms)

            # Total chars
            total_chars = sum(len(s["text"]) for s in segs)

            all_results[clip_name][key] = {
                "model": model_size,
                "vad_ms": vad_ms,
                "segments": segs,
                "segment_count": len(segs),
                "total_chars": total_chars,
                "runtime_secs": round(trans_time, 1),
                "load_time_secs": round(load_time, 1),
                "language": info.language,
            }
            print(f"{len(segs)} segs, {total_chars} chars, {trans_time:.1f}s")

            # Free memory between models
            del model
            gc.collect()

            # Save incrementally. Write to a temp file and swap it in so an
            # interrupted run never leaves a truncated results file behind.
            tmp_path = RESULTS_FILE + ".tmp"
            with open(tmp_path, "w") as fh:
                json.dump(all_results, fh)
            os.replace(tmp_path, RESULTS_FILE)

print("\n=== All tests complete ===")
# Print everything except the (large) per-segment transcripts.
summary = {
    clip: {
        run_key: {field: value for field, value in run.items() if field != "segments"}
        for run_key, run in runs.items()
    }
    for clip, runs in all_results.items()
}
print(json.dumps(summary, indent=2))
|
||||
173
scripts/clean_sentence_text.py
Normal file
173
scripts/clean_sentence_text.py
Normal file
@@ -0,0 +1,173 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
LLM-clean all 4188 sentence texts, re-embed, update momentry_dev_v1 + sentence_story.
|
||||
"""
|
||||
import json, time, os
|
||||
from urllib.request import Request, urlopen
|
||||
import psycopg2
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
||||
CHECKPOINT = f"/tmp/sentence_clean_{UUID}.json"
|
||||
|
||||
def call_llm(prompt):
    """Send *prompt* as a single user message to LLM_URL and return the reply text.

    The request uses temperature 0.1, at most 80 new tokens and a 30 s timeout.

    Returns:
        The stripped ``content`` of the first choice in the response.

    Raises:
        Whatever ``urlopen`` / ``json.loads`` raise on transport or decoding
        problems, and ``KeyError`` / ``IndexError`` when the response does not
        have the expected ``choices[0].message.content`` shape.
    """
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 80}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically; this is called once per sentence.
    with urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["choices"][0]["message"]["content"].strip()
|
||||
|
||||
def call_embed(text):
    """Request an embedding for *text* from EMBED_URL and return it.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``
        list.

    Raises:
        Whatever ``urlopen`` / ``json.loads`` raise on transport or decoding
        problems, and ``KeyError`` / ``IndexError`` when the response does not
        have the expected ``data[0].embedding`` shape.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically; this is called once per sentence.
    with urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["data"][0]["embedding"]
|
||||
|
||||
def _qdrant_call(path, payload=None, method="GET"):
    """Send one request to Qdrant and return the raw response body (bytes).

    When *payload* is given it is sent as a JSON body.
    """
    data = None
    headers = {}
    if payload is not None:
        data = json.dumps(payload).encode()
        headers["Content-Type"] = "application/json"
    req = Request(f"{QDRANT_URL}{path}", data=data, headers=headers, method=method)
    with urlopen(req) as resp:
        return resp.read()


def _rebuild_collection(name, points, batch_size=100):
    """Drop and recreate the 768-d cosine collection *name*, then upload *points*."""
    try:
        _qdrant_call(f"/collections/{name}", method="DELETE")
        time.sleep(0.5)
    except Exception as e:
        # Best effort: the delete may fail (e.g. nothing to delete). Report it
        # instead of hiding it; the create call below is not guarded.
        print(f" delete {name}: {e}")

    _qdrant_call(f"/collections/{name}",
                 {"vectors": {"size": 768, "distance": "Cosine"}}, method="PUT")
    time.sleep(0.5)

    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        try:
            _qdrant_call(f"/collections/{name}/points?wait=true",
                         {"points": batch}, method="PUT")
        except Exception as e:
            print(f" batch {start}: {e}")
        if (start // batch_size) % 5 == 0:
            print(f" {name}: {start+len(batch)}/{len(points)}")

    print(f" {name} done")


print("=== Step 1: Load all sentences ===")
conn = psycopg2.connect(DB_URL)
try:
    with conn.cursor() as cur:
        cur.execute("""
            SELECT id, chunk_id, text_content
            FROM dev.chunks
            WHERE file_uuid = %s AND chunk_type = 'sentence'
            ORDER BY id
        """, (UUID,))
        rows = cur.fetchall()
finally:
    # Always release the connection, even when the query fails.
    conn.close()
print(f"Loaded {len(rows)} sentences")

# Reset checkpoint (incompatible with old chunk_index format)
if os.path.exists(CHECKPOINT):
    os.remove(CHECKPOINT)
    print("Old checkpoint removed (format changed)")

results = []
errors = 0

print("\n=== Step 2: LLM clean + embed ===")
for i, (cid, chunk_id, text_content) in enumerate(rows):
    input_text = text_content

    prompt = f"""Clean this movie dialogue line. Fix truncated words, capitalize, add punctuation.
Return: SPEAKER: "clean text"

Input: [Cary Grant] can't you do something constructive like start
Return: Cary Grant: "Can't you do something constructive like start?"

Input: [Audrey Hepburn] qui se présente influence d'une manière vitale la proposition l
Return: Audrey Hepburn: "Qui se présente influence d'une manière vitale la proposition..."

Input: {input_text}
Return:"""

    try:
        cleaned = call_llm(prompt)
        embedding = call_embed(cleaned)
        time.sleep(0.1)
    except Exception as e:
        # Per-sentence boundary: keep going, fall back to the original text.
        print(f" [{i+1}/{len(rows)}] id={cid} chunk={chunk_id} ERROR: {e}")
        cleaned = input_text
        # NOTE(review): an all-zero placeholder vector is stored for failed
        # rows — confirm this is acceptable for a Cosine-distance collection.
        embedding = [0.0] * 768
        errors += 1

    results.append({
        "index": i,
        "chunk_id": chunk_id,
        "original": input_text,
        "cleaned": cleaned,
        "embedding": embedding,
    })
    with open(CHECKPOINT, "w") as fh:
        json.dump({"last": i}, fh)

    if (i + 1) % 50 == 0:
        print(f" [{i+1}/{len(rows)}] chunk={chunk_id} errors={errors}")

# results is already in row order (appended sequentially), so no sort is needed.
print(f"\nDone: {len(results)} cleaned, {errors} errors")

print("\n=== Step 3: Rebuild momentry_dev_v1 ===")
points = [
    {
        "id": pi + 1,
        "vector": r["embedding"],
        "payload": {
            "chunk_type": "sentence",
            "uuid": UUID,
            "chunk_id": r["chunk_id"],
            "text": r["cleaned"],
            "original": r["original"],
        },
    }
    for pi, r in enumerate(results)
]
_rebuild_collection("momentry_dev_v1", points)

print("\n=== Step 4: Rebuild sentence_story ===")
story_points = [
    {
        "id": pi + 1,
        "vector": r["embedding"],
        "payload": {
            "chunk_type": "sentence",
            "uuid": UUID,
            "chunk_id": r["chunk_id"],
            "text": r["cleaned"],
        },
    }
    for pi, r in enumerate(results)
]
_rebuild_collection("sentence_story", story_points)

# Verify
for col in ["momentry_dev_v1", "sentence_story"]:
    resp = json.loads(_qdrant_call(f"/collections/{col}"))
    info = resp["result"]
    print(f"Verified {col}: {info['points_count']} pts, {info['config']['params']['vectors'].get('size','?')}D")

print("\n=== Done ===")
|
||||
138
scripts/compare_models_gun_test.py
Normal file
138
scripts/compare_models_gun_test.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Comparison test: Grounding DINO Base vs Florence-2 Base vs Florence-2 Large
|
||||
Tests on 8 known timepoints with gun prompts.
|
||||
"""
|
||||
import json, os, sys, time, cv2, torch
|
||||
from PIL import Image
|
||||
|
||||
import re

VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/model_comparison"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# (second in the video, label used in result keys)
TIMEPOINTS = [
    (2646, "2646s"), (3188, "3188s"), (3697, "3697s"),
    (5341, "5341s"), (5461, "5461s"), (6309, "6309s"),
    (6377, "6377s"), (6479, "6479s"),
]
PROMPTS = {"gun": "gun.", "pistol": "pistol."}
device = "mps" if torch.backends.mps.is_available() else "cpu"

_LOC_TOKEN = re.compile(r"<loc_\d+>")


def _release_accelerator_memory():
    """Empty the MPS cache, but only when the models actually ran on MPS."""
    if device == "mps":
        torch.mps.empty_cache()


def _count_boxes(result):
    """Count bounding boxes in decoded Florence-2 text (4 <loc_N> tokens per box)."""
    return len(_LOC_TOKEN.findall(result)) // 4


def _scores_tagged(result, prompt_text):
    """Florence-2 Base parsing: count boxes inside <p>..</p>, else match the prompt word."""
    if "<p>" in result and "</p>" in result:
        return [1.0] * _count_boxes(result)  # Florence-2 doesn't output confidence
    if prompt_text.replace('.', '') in result:
        return [1.0]  # At least one detection found
    return []


def _scores_loc_only(result, prompt_text):
    """Florence-2 Large parsing: count every box, regardless of surrounding tags."""
    return [1.0] * _count_boxes(result)


def _run_florence(model_id, parse_scores):
    """Run one Florence-2 checkpoint over all frames/prompts; return elapsed + detections."""
    started = time.time()
    proc = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to(device)
    dets = {}
    for label, frame in frames.items():
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        for pname, prompt_text in PROMPTS.items():
            # NOTE(review): the prompt is appended to the <OD> task token;
            # confirm the Florence-2 processor accepts extra text for <OD>.
            text = f"<OD>{prompt_text}"
            inputs = proc(text=text, images=img, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=100, num_beams=3)
            result = proc.decode(outputs[0], skip_special_tokens=False)
            dets[f"{label}_{pname}"] = parse_scores(result, prompt_text)
    elapsed = round(time.time() - started, 1)
    del model
    _release_accelerator_memory()
    return {"elapsed": elapsed, "detections": dets}


def _summarize(dets, pname):
    """Return (frames with at least one detection, best score) for one prompt."""
    per_frame = [dets.get(f"{label}_{pname}", []) for _, label in TIMEPOINTS]
    hits = sum(1 for scores in per_frame if any(s > 0 for s in scores))
    best = max((s for scores in per_frame for s in scores), default=0)
    return hits, best


cap = cv2.VideoCapture(VIDEO)
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
frames = {}
for t_sec, label in TIMEPOINTS:
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    ret, frame = cap.read()
    if ret:
        frames[label] = frame
cap.release()
print(f"Loaded {len(frames)} frames")

all_results = {}

# ========== Grounding DINO Base ==========
print("\n" + "="*60)
print("Grounding DINO Base")
print("="*60)
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
t0 = time.time()
gd_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
gd_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base").to(device)
gd_dets = {}
for label, frame in frames.items():
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for pname, prompt in PROMPTS.items():
        inputs = gd_proc(images=img, text=prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = gd_model(**inputs)
        target = torch.tensor([img.size[::-1]])  # PIL size is (w, h); reversed here
        dets = gd_proc.post_process_grounded_object_detection(outputs, threshold=0.1, target_sizes=target)[0]
        scores = [round(s.item(), 3) for s in dets["scores"]] if len(dets["boxes"]) > 0 else []
        gd_dets[f"{label}_{pname}"] = scores
all_results["grounding-dino-base"] = {"elapsed": round(time.time()-t0, 1), "detections": gd_dets}
print(f" Done in {all_results['grounding-dino-base']['elapsed']}s")
del gd_model
_release_accelerator_memory()

# ========== Florence-2 Base ==========
print("\n" + "="*60)
print("Florence-2 Base")
print("="*60)
from transformers import AutoProcessor, AutoModelForCausalLM
# NOTE(review): Base and Large parse the generated text differently (as in the
# original script); unify the parsers if the two are meant to be comparable.
all_results["florence2-base"] = _run_florence("microsoft/Florence-2-base", _scores_tagged)
print(f" Done in {all_results['florence2-base']['elapsed']}s")

# ========== Florence-2 Large ==========
print("\n" + "="*60)
print("Florence-2 Large")
print("="*60)
all_results["florence2-large"] = _run_florence("microsoft/Florence-2-large", _scores_loc_only)
print(f" Done in {all_results['florence2-large']['elapsed']}s")

# ========== Summary ==========
print("\n" + "="*60)
print(f"{'Model':<25} {'Time':>8} {'Gun hits':>10} {'Gun best':>10} {'Pistol hits':>12} {'Pistol best':>10}")
print("-"*75)
n_frames = len(TIMEPOINTS)
for model_name in ["grounding-dino-base", "florence2-base", "florence2-large"]:
    d = all_results[model_name]
    dets = d["detections"]
    # TIMEPOINTS holds (second, label) pairs and detection keys are
    # "<label>_<prompt>"; hits are counted per frame so they never exceed n_frames.
    gun_hits, gun_best = _summarize(dets, "gun")
    pistol_hits, pistol_best = _summarize(dets, "pistol")
    print(f"{model_name:<25} {d['elapsed']:>7.1f}s {gun_hits:>6d}/{n_frames} {gun_best:>8.3f} {pistol_hits:>6d}/{n_frames} {pistol_best:>8.3f}")

with open(os.path.join(OUTPUT_DIR, "model_comparison.json"), "w") as fh:
    json.dump(all_results, fh, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")
|
||||
78
scripts/coreml_embed_server.py
Executable file
78
scripts/coreml_embed_server.py
Executable file
@@ -0,0 +1,78 @@
|
||||
"""
|
||||
Simple Flask-like HTTP server for CoreML ANE embedding inference.
|
||||
Replaces /api/embeddings endpoint that comic_embed.rs calls.
|
||||
"""
|
||||
import json, os, argparse
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
import numpy as np
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
# Global model
|
||||
MODEL = None
|
||||
TOKENIZER = None
|
||||
MODEL_PATH = "/Users/accusys/models/mxbai-embed-large-v1.mlpackage"
|
||||
|
||||
class EmbeddingHandler(BaseHTTPRequestHandler):
    """HTTP handler serving POST /api/embeddings from the global MODEL/TOKENIZER.

    Request body: JSON object with a ``prompt`` string.
    Response: ``{"embedding": [...]}`` with status 200, or ``{"error": "..."}``
    with status 500 when anything goes wrong. Any other path returns 404.
    """

    # Task prefixes removed from the prompt before it is tokenized.
    _PROMPT_PREFIXES = ("search_document: ", "search_query: ")

    def do_POST(self):
        """Handle one POST request."""
        if self.path != "/api/embeddings":
            self.send_response(404)
            self.end_headers()
            return

        # Everything that can fail (including parsing Content-Length) happens
        # inside the try so the client always receives a JSON error instead of
        # a dropped connection; the response itself is written exactly once.
        try:
            length = int(self.headers.get("Content-Length", 0))
            data = json.loads(self.read(length))
            status, payload = 200, {"embedding": self._embed(data.get("prompt", ""))}
        except Exception as e:
            status, payload = 500, {"error": str(e)}
        self._send_json(status, payload)

    def _embed(self, prompt):
        """Strip a known task prefix from *prompt* and return its embedding as a list."""
        for prefix in self._PROMPT_PREFIXES:
            if prompt.startswith(prefix):
                prompt = prompt[len(prefix):]
                break

        tokens = TOKENIZER(prompt, return_tensors="np", padding="max_length", truncation=True, max_length=512)
        input_ids = tokens["input_ids"].astype(np.int32)
        attention_mask = tokens["attention_mask"].astype(np.int32)
        result = MODEL.predict({"input_ids": input_ids, "attention_mask": attention_mask})
        return result["embedding"][0].tolist()

    def _send_json(self, status, payload):
        """Write *payload* as a JSON response with the given HTTP status."""
        resp = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(resp)))
        self.end_headers()
        self.wfile.write(resp)

    def read(self, length):
        """Read *length* bytes of the request body."""
        return self.rfile.read(length)
|
||||
|
||||
def main():
    """Load the CoreML model and tokenizer into the module globals, then serve forever.

    Command-line options: ``--port`` (default 11435) and ``--model`` (default
    MODEL_PATH). The server binds to 127.0.0.1 only.
    """
    global MODEL, TOKENIZER

    cli = argparse.ArgumentParser()
    cli.add_argument("--port", type=int, default=11435)
    cli.add_argument("--model", default=MODEL_PATH)
    opts = cli.parse_args()
    port = opts.port

    # Imported here so the dependency is only needed when the server starts.
    import coremltools as ct

    print(f"Loading CoreML model from {opts.model}...")
    MODEL = ct.models.MLModel(opts.model, compute_units=ct.ComputeUnit.ALL)
    print(f"Model loaded (compute: {MODEL.compute_unit})")

    print("Loading tokenizer...")
    TOKENIZER = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")
    print("Tokenizer loaded")

    httpd = HTTPServer(("127.0.0.1", port), EmbeddingHandler)
    banner = (
        f"ANE Embedding server running on port {port}",
        f"API: POST http://127.0.0.1:{port}/api/embeddings",
        ' Body: {"model": "...", "prompt": "..."}',
        ' Response: {"embedding": [...]}',
    )
    for line in banner:
        print(line)
    httpd.serve_forever()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,176 +1,281 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Momentry Dashboard — Flask web app
|
||||
Reads pipeline status + Redis + system health on demand
|
||||
Momentry Dashboard v2 — Direct DB/Qdrant/Redis queries, no subprocess blocking
|
||||
"""
|
||||
|
||||
import json, os, subprocess, sys, platform
|
||||
import json, os, platform, time
|
||||
from pathlib import Path
|
||||
from flask import Flask, jsonify, render_template_string
|
||||
import psycopg2
|
||||
import urllib.request
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
PROJECT = Path(__file__).resolve().parent.parent
|
||||
|
||||
# System role detection
|
||||
HOSTNAME = platform.node()
|
||||
IS_M5 = "MacBook" in HOSTNAME or "M5" in HOSTNAME
|
||||
IS_M5 = "MacBook" in HOSTNAME
|
||||
SYSTEM_ROLE = "M5 (MacBook Pro)" if IS_M5 else "M4 (Mac Mini)"
|
||||
SYSTEM_COLOR = "#58a6ff" if IS_M5 else "#f0883e"
|
||||
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
||||
|
||||
def run_status_json():
|
||||
"""Run pipeline_status.py and return parsed JSON"""
|
||||
r = subprocess.run(
|
||||
[sys.executable, str(PROJECT / "scripts/pipeline_status.py"), "--json"],
|
||||
capture_output=True, text=True, timeout=30,
|
||||
)
|
||||
return json.loads(r.stdout)
|
||||
COLLECTIONS = [
|
||||
"momentry_dev_v1", "momentry_dev_stories", "momentry_dev_voice",
|
||||
"momentry_dev_faces", "sentence_story", "sentence_summary",
|
||||
"momentry_dev_rule1_v2",
|
||||
]
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
|
||||
def run_redis_info():
|
||||
"""Fetch key Redis metrics"""
|
||||
result = {}
|
||||
def db_query(sql, params=None):
    """Run one query on a fresh connection to DB_URL and return all rows.

    Args:
        sql: SQL text, with ``%s`` placeholders for *params*.
        params: Sequence of parameter values, or None for no parameters.

    Returns:
        The list of rows returned by ``cursor.fetchall()``.

    Raises:
        Any psycopg2 error raised while connecting or executing; the
        connection is closed in every case.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        with conn.cursor() as cur:
            cur.execute(sql, params or ())
            return cur.fetchall()
    finally:
        # Callers wrap this in try/except, so a failing query must not leak
        # its connection.
        conn.close()
|
||||
|
||||
def qdrant_get(path):
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["redis-cli", "-a", "accusys", "INFO", "all"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
for line in r.stdout.split("\n"):
|
||||
line = line.strip()
|
||||
if ":" not in line or line.startswith("#"):
|
||||
continue
|
||||
k, v = line.split(":", 1)
|
||||
if k in ("total_system_memory_human", "used_memory_human",
|
||||
"used_memory_peak_human", "total_connections_received",
|
||||
"total_commands_processed", "keyspace_hits", "keyspace_misses",
|
||||
"connected_clients", "uptime_in_seconds"):
|
||||
result[k] = v if not v.endswith("_human") else v
|
||||
result["keyspace_hits"] = int(result.get("keyspace_hits", 0))
|
||||
result["keyspace_misses"] = int(result.get("keyspace_misses", 0))
|
||||
hit_rate = result["keyspace_hits"] / max(result["keyspace_hits"] + result["keyspace_misses"], 1) * 100
|
||||
result["hit_rate_pct"] = round(hit_rate, 1)
|
||||
except Exception as e:
|
||||
result["error"] = str(e)
|
||||
|
||||
# Get momentry keys
|
||||
try:
|
||||
r = subprocess.run(
|
||||
["redis-cli", "-a", "accusys", "KEYS", "momentry_dev:*"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
keys = [k for k in r.stdout.strip().split("\n") if k]
|
||||
result["momentry_keys"] = len(keys)
|
||||
# Sample a few interesting keys
|
||||
sample = {}
|
||||
for k in keys:
|
||||
if k.endswith(":health") or k.endswith(":job:") or ":processor:" in k:
|
||||
pass
|
||||
if len(sample) >= 5:
|
||||
break
|
||||
result["key_sample"] = keys[:10]
|
||||
resp = urllib.request.urlopen(f"{QDRANT_URL}{path}", timeout=5)
|
||||
return json.loads(resp.read())
|
||||
except:
|
||||
result["momentry_keys"] = 0
|
||||
result["key_sample"] = []
|
||||
return None
|
||||
|
||||
def qdrant_count(col):
    """Return the ``points_count`` of collection *col*.

    Returns -1 when ``qdrant_get`` yields nothing, and 0 when the response
    carries no ``points_count``.
    """
    info = qdrant_get(f"/collections/{col}")
    if not info:
        return -1
    result = info.get("result", {})
    return result.get("points_count", 0)
|
||||
|
||||
def qdrant_dim(col):
    """Return the configured vector ``size`` of collection *col*, or "?" if unknown.

    The value is read from ``result.config.params.vectors.size`` of the
    collection info returned by ``qdrant_get``.
    """
    info = qdrant_get(f"/collections/{col}")
    if not info:
        return "?"
    vectors = info.get("result", {}).get("config", {}).get("params", {}).get("vectors", {})
    return vectors.get("size", "?")
|
||||
|
||||
@app.route("/")
def index():
    """Render TEMPLATE with the detected SYSTEM_ROLE."""
    return render_template_string(TEMPLATE, SYSTEM_ROLE=SYSTEM_ROLE)


@app.route("/api/all")
def api_all():
    """Return system info plus the status, qdrant, db and processes payloads as one JSON object."""
    return jsonify({
        "system": {"hostname": HOSTNAME, "role": SYSTEM_ROLE, "is_m5": IS_M5},
        "status": get_status(),
        "qdrant": get_qdrant_info(),
        "db": get_db_info(),
        "processes": get_processes(),
    })


@app.route("/api/status")
def api_status():
    """Return the result of get_status() as JSON."""
    return jsonify(get_status())


@app.route("/api/qdrant")
def api_qdrant():
    """Return the result of get_qdrant_info() as JSON."""
    return jsonify(get_qdrant_info())


@app.route("/api/db")
def api_db():
    """Return the result of get_db_info() as JSON."""
    return jsonify(get_db_info())


@app.route("/api/processes")
def api_processes():
    """Return the result of get_processes() as JSON."""
    return jsonify(get_processes())
|
||||
|
||||
def _count_json_segments(path):
    """Return the number of entries under "segments" in the JSON file at *path*, or 0."""
    try:
        if not os.path.exists(path):
            return 0
        # Context manager so the handle is closed even when parsing fails.
        with open(path, encoding="utf-8") as fh:
            doc = json.load(fh)
        return len(doc.get("segments", []))
    except Exception:
        # Best-effort probe: an unreadable or malformed file counts as "no segments".
        return 0


def _db_counts(*statements):
    """Run each single-value query with (UUID,) and return the values; all zeros if any query fails."""
    try:
        return [db_query(sql, (UUID,))[0][0] for sql in statements]
    except Exception:
        # Best-effort probe: a failing query makes the whole group report 0.
        return [0] * len(statements)


def _stage(name, passed, detail):
    """Build one checklist entry; "elapsed" is always reported as 0.0."""
    return {"name": name, "passed": passed, "detail": detail, "elapsed": 0.0}


def get_status():
    """Pipeline checklist — direct DB queries.

    Probes eight pipeline stages for the file identified by ``UUID`` (ASR and
    ASRX output files, sentence chunks, Qdrant points, face traces, TKG
    nodes/edges, trace chunks, Phase 1 release directory) and returns::

        {"uuid", "passed", "stages", "checked_at", "total_elapsed", "health"}

    ``stages`` is a list of ``{"name", "passed", "detail", "elapsed"}`` dicts,
    ``passed`` is True only when every stage passed, and ``health`` is the
    result of ``get_health()``. File and DB probes are best-effort: a failure
    is reported as a zero count instead of raising.
    """
    t0 = time.time()
    stages = []

    # 1. ASR file
    asr_segs = _count_json_segments(f"/Users/accusys/momentry/output_dev/{UUID}.asr.json")
    stages.append(_stage("ASR", asr_segs > 0, f"{asr_segs} seg"))

    # 2. ASRX file
    asrx_segs = _count_json_segments(f"/Users/accusys/momentry/output_dev/{UUID}.asrx.json")
    stages.append(_stage("ASRX", asrx_segs > 0, f"{asrx_segs} seg"))

    # 3. Sentence chunks
    # NOTE(review): the commit description mentions a dev.chunks -> dev.chunk
    # rename; a wrong table name would be reported here as 0 chunks — confirm.
    (cnt,) = _db_counts(
        "SELECT count(*) FROM dev.chunks WHERE file_uuid=%s AND chunk_type='sentence'"
    )
    stages.append(_stage("Sentence", cnt > 0, f"{cnt} chunks"))

    # 4. Vectorization (Qdrant)
    v1 = qdrant_count("momentry_dev_v1")
    stages.append(_stage("Vectorize", v1 > 0, f"{v1} Qdrant"))

    # 5. Face traces
    traces, faces = _db_counts(
        "SELECT count(DISTINCT trace_id) FROM dev.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL",
        "SELECT count(*) FROM dev.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL",
    )
    stages.append(_stage("FaceTrace", traces > 0, f"{traces} traces, {faces} faces"))

    # 6. TKG
    nodes, edges = _db_counts(
        "SELECT count(*) FROM dev.tkg_nodes WHERE file_uuid=%s",
        "SELECT count(*) FROM dev.tkg_edges WHERE file_uuid=%s",
    )
    stages.append(_stage("TKG", nodes > 0, f"{nodes} nodes, {edges} edges"))

    # 7. Trace chunks
    (tc,) = _db_counts(
        "SELECT count(*) FROM dev.chunks WHERE file_uuid=%s AND chunk_type='trace'"
    )
    stages.append(_stage("TraceChunks", tc > 0, f"{tc} chunks"))

    # 8. Phase 1 release: passes when the directory and its RELEASE_INFO.txt exist;
    # the detail is the total size of all files below it in whole MiB.
    p1 = PROJECT / "release" / "phase1" / "latest"
    p1_ok = p1.exists() and (p1 / "RELEASE_INFO.txt").exists()
    if p1.exists():
        p1_size = sum(f.stat().st_size for f in p1.rglob("*") if f.is_file()) // (1024 * 1024)
    else:
        p1_size = 0
    stages.append(_stage("Phase1", p1_ok, f"{p1_size}MB"))

    return {
        "uuid": UUID,
        "passed": all(s["passed"] for s in stages),
        "stages": stages,
        "checked_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "total_elapsed": round(time.time() - t0, 1),
        "health": get_health(),
    }
|
||||
|
||||
def get_health():
    """Collect host metrics and service reachability for the dashboard.

    Returns a dict that may contain:

    * ``cpu_load_1m`` / ``cpu_load_5m`` -- ``os.getloadavg()`` values rounded
      to one decimal, or -1 when they cannot be read.
    * ``memory_used_mb`` -- sum of the RSS column of ``ps -A`` integer-divided
      by 1024 (presumably KiB -> MiB; confirm for the target platform).
      Omitted when the probe fails.
    * ``disk_use_pct`` / ``disk_avail`` -- fifth and fourth fields of the last
      ``df -h`` output line. Omitted when the probe fails.
    * ``gpu_available`` -- ``torch.backends.mps.is_available()``, or False
      when torch cannot be imported or queried.
    * ``services`` -- ``{"postgresql", "qdrant", "embedding", "llm"}`` mapped
      to booleans.

    Every probe is best-effort: a failing probe never raises, it only leaves
    its value at the fallback described above.
    """
    # Imported locally, as the original probes did, so this function does not
    # rely on a module-level import being present.
    import os
    import subprocess

    h = {}
    try:
        load = os.getloadavg()
        h["cpu_load_1m"] = round(load[0], 1)
        h["cpu_load_5m"] = round(load[1], 1)
    except Exception:
        h["cpu_load_1m"] = h["cpu_load_5m"] = -1

    try:
        out = subprocess.run(
            ["ps", "-A", "-o", "rss="], capture_output=True, text=True, timeout=5
        ).stdout
        rss = sum(int(line.strip()) for line in out.strip().split("\n") if line.strip())
        h["memory_used_mb"] = rss // 1024
    except Exception:
        pass  # key stays absent; the UI shows "?" for a missing value

    try:
        d = subprocess.run(
            ["df", "-h", "/Users/accusys/momentry/output_dev"],
            capture_output=True, text=True, timeout=5,
        ).stdout.strip().split("\n")[-1].split()
        h["disk_use_pct"] = d[4] if len(d) > 4 else "?"
        h["disk_avail"] = d[3] if len(d) > 3 else "?"
    except Exception:
        pass

    try:
        import torch
        h["gpu_available"] = torch.backends.mps.is_available()
    except Exception:
        h["gpu_available"] = False

    services = {"postgresql": False, "qdrant": False, "embedding": False, "llm": False}

    try:
        conn = psycopg2.connect(DB_URL)
        conn.close()
        services["postgresql"] = True
    except Exception:
        pass

    try:
        services["qdrant"] = qdrant_get("/collections") is not None
    except Exception:
        pass

    try:
        # ``with`` closes the HTTP response so each poll does not leak a socket.
        with urllib.request.urlopen("http://localhost:11436/health", timeout=3) as resp:
            services["embedding"] = resp.status == 200
    except Exception:
        pass

    try:
        # A one-token chat completion is used as the liveness check.
        body = json.dumps({
            "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
            "messages": [{"role": "user", "content": "ping"}],
            "max_tokens": 1,
        }).encode()
        req = urllib.request.Request(
            LLM_URL, data=body,
            headers={"Content-Type": "application/json"}, method="POST",
        )
        with urllib.request.urlopen(req, timeout=3) as resp:
            services["llm"] = resp.status == 200
    except Exception:
        pass

    h["services"] = services
    return h
|
||||
|
||||
def get_qdrant_info():
    """Return one ``{"name", "points", "dim"}`` dict per entry of COLLECTIONS.

    A collection for which ``qdrant_get`` yields a falsy value is reported
    with ``points`` = -1 and ``dim`` = "?".
    """
    def describe(name):
        payload = qdrant_get(f"/collections/{name}")
        if not payload:
            return {"name": name, "points": -1, "dim": "?"}
        details = payload.get("result", {})
        vectors = details.get("config", {}).get("params", {}).get("vectors", {})
        return {
            "name": name,
            "points": details.get("points_count", 0),
            "dim": vectors.get("size", "?"),
        }

    return [describe(name) for name in COLLECTIONS]
|
||||
|
||||
|
||||
def run_db_info():
|
||||
"""Fetch DB metrics + current processing file"""
|
||||
psql = "/Users/accusys/pgsql/18.3/bin/psql"
|
||||
cmd = [psql, "-U", "accusys", "-d", "momentry", "-t", "-A"]
|
||||
def get_db_info():
|
||||
result = {}
|
||||
try:
|
||||
r = subprocess.run(cmd + ["-c", """
|
||||
rows = db_query("""
|
||||
SELECT 'videos', count(*) FROM dev.videos
|
||||
UNION ALL SELECT 'chunks', count(*) FROM dev.chunks
|
||||
UNION ALL SELECT 'face_detections', count(*) FROM dev.face_detections
|
||||
UNION ALL SELECT 'identities', count(*) FROM dev.identities
|
||||
UNION ALL SELECT 'tkg_nodes', count(*) FROM dev.tkg_nodes
|
||||
UNION ALL SELECT 'tkg_edges', count(*) FROM dev.tkg_edges
|
||||
"""], capture_output=True, text=True, timeout=10)
|
||||
for line in r.stdout.strip().split("\n"):
|
||||
if not line.strip() or "|" not in line:
|
||||
continue
|
||||
parts = line.split("|")
|
||||
result[parts[0].strip()] = int(parts[1])
|
||||
""")
|
||||
for r in rows:
|
||||
result[r[0]] = r[1]
|
||||
except:
|
||||
pass
|
||||
|
||||
# 所有檔案的 pipeline 進度(依檔案名去重,取最新)
|
||||
try:
|
||||
r = subprocess.run(cmd + ["-c", """
|
||||
SELECT DISTINCT ON (v.file_name)
|
||||
v.file_uuid, v.file_name, v.status,
|
||||
COALESCE(v.processing_status::text, '{}') as pstatus,
|
||||
m.status as job_status
|
||||
FROM dev.videos v
|
||||
LEFT JOIN dev.monitor_jobs m ON m.uuid = v.file_uuid
|
||||
WHERE v.status IN ('completed', 'processing')
|
||||
OR m.status IS NOT NULL
|
||||
ORDER BY v.file_name, GREATEST(
|
||||
COALESCE(v.registration_time::timestamp, '1970-01-01'),
|
||||
COALESCE(m.updated_at, '1970-01-01')
|
||||
) DESC
|
||||
LIMIT 20
|
||||
"""], capture_output=True, text=True, timeout=10)
|
||||
seen_names = set()
|
||||
files = []
|
||||
for line in r.stdout.strip().split("\n"):
|
||||
if not line.strip() or "|" not in line:
|
||||
continue
|
||||
parts = line.split("|", 4)
|
||||
if len(parts) < 5:
|
||||
continue
|
||||
name = parts[1].strip()
|
||||
if name in seen_names:
|
||||
continue
|
||||
seen_names.add(name)
|
||||
f = {"uuid": parts[0].strip(), "name": name,
|
||||
"status": parts[2].strip(), "job_status": parts[4].strip()}
|
||||
try:
|
||||
ps = json.loads(parts[3]) if parts[3] and parts[3] != '{}' else {}
|
||||
f["progress"] = ps.get("progress", {})
|
||||
except:
|
||||
f["progress"] = {}
|
||||
files.append(f)
|
||||
result["files"] = files
|
||||
except Exception as e:
|
||||
result["files_error"] = str(e)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
@app.route("/")
def index():
    """Render the dashboard page from TEMPLATE."""
    page = render_template_string(TEMPLATE)
    return page
|
||||
|
||||
|
||||
@app.route("/api/status")
def api_status():
    """Return the result of run_status_json() as JSON."""
    payload = run_status_json()
    return jsonify(payload)
|
||||
|
||||
|
||||
@app.route("/api/redis")
def api_redis():
    """Return the result of run_redis_info() as JSON."""
    payload = run_redis_info()
    return jsonify(payload)
|
||||
|
||||
|
||||
@app.route("/api/db")
def api_db():
    """Return the result of run_db_info() as JSON."""
    payload = run_db_info()
    return jsonify(payload)
|
||||
|
||||
|
||||
@app.route("/api/all")
def api_all():
    """Return system identity plus status, Redis and DB data as one JSON document."""
    system = {"hostname": HOSTNAME, "role": SYSTEM_ROLE, "is_m5": IS_M5}
    payload = {
        "system": system,
        "status": run_status_json(),
        "redis": run_redis_info(),
        "db": run_db_info(),
    }
    return jsonify(payload)
|
||||
|
||||
def get_processes():
    """Report whether the known background scripts are currently running.

    For each script name, ``pgrep -f`` is used to look for a matching process.
    Returns a dict mapping the script name to ``{"pid": int, "elapsed": str}``
    for the first PID found (``elapsed`` is the ``ps -o etime=`` output), or
    to None when no process matches or the lookup fails.
    """
    # Imported locally, as in the original, so this function does not rely on
    # a module-level import being present.
    import subprocess

    scripts = ["clean_sentence_text.py", "generate_sentence_summaries.py"]
    result = {}
    for script in scripts:
        try:
            found = subprocess.run(
                ["pgrep", "-f", script], capture_output=True, text=True, timeout=3
            )
            pids = [ln.strip() for ln in found.stdout.strip().split("\n") if ln.strip()]
            if not pids:
                result[script] = None
                continue
            etime = subprocess.run(
                ["ps", "-o", "etime=", "-p", pids[0]],
                capture_output=True, text=True, timeout=3,
            )
            result[script] = {"pid": int(pids[0]), "elapsed": etime.stdout.strip()}
        except (OSError, subprocess.SubprocessError, ValueError):
            # Missing pgrep/ps binary, a timeout, or a non-numeric PID:
            # treat the script as not running rather than failing the request.
            result[script] = None
    return result
|
||||
|
||||
TEMPLATE = """<!DOCTYPE html>
|
||||
<html lang="zh-TW">
|
||||
@@ -193,10 +298,6 @@ th, td { padding: 8px 12px; text-align: left; border-bottom: 1px solid #21262d;
|
||||
th { color: #8b949e; font-weight: 600; }
|
||||
.pass { color: #3fb950; font-weight: bold; }
|
||||
.fail { color: #f85149; font-weight: bold; }
|
||||
.badge { display: inline-block; padding: 2px 8px; border-radius: 12px; font-size: 12px; font-weight: 600; }
|
||||
.badge-ok { background: #1b3a1b; color: #3fb950; }
|
||||
.badge-err { background: #3a1b1b; color: #f85149; }
|
||||
.badge-warn { background: #3a321b; color: #d29922; }
|
||||
.stat-value { font-size: 28px; font-weight: 700; }
|
||||
.stat-label { font-size: 12px; color: #8b949e; margin-top: 4px; }
|
||||
.stat-card { background: #0d1117; border: 1px solid #30363d; border-radius: 6px; padding: 16px; text-align: center; }
|
||||
@@ -204,275 +305,167 @@ th { color: #8b949e; font-weight: 600; }
|
||||
.last-updated { color: #8b949e; font-size: 13px; }
|
||||
button { background: #238636; color: white; border: none; padding: 8px 20px; border-radius: 6px; cursor: pointer; font-size: 14px; }
|
||||
button:hover { background: #2ea043; }
|
||||
.progress-bar { height: 6px; background: #21262d; border-radius: 3px; margin-top: 8px; }
|
||||
.progress-fill { height: 100%; border-radius: 3px; background: #238636; transition: width 0.5s; }
|
||||
#error { display: none; background: #3a1b1b; border: 1px solid #f85149; border-radius: 6px; padding: 12px; margin-bottom: 16px; color: #f85149; font-size: 13px; }
|
||||
@media (max-width: 768px) { .col { min-width: 100%; } }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="refresh-bar">
|
||||
<h1>Momentry Dashboard <span style="font-size:14px;background:#1f2937;color:#{{'58a6ff' if IS_M5 else 'f0883e'}};padding:4px 12px;border-radius:12px;margin-left:8px;vertical-align:middle">🤖 {{ SYSTEM_ROLE }}</span></h1>
|
||||
<div class="refresh-bar">
|
||||
<h1>Momentry Dashboard <span id="roleBadge" style="font-size:14px;background:#1f2937;padding:4px 12px;border-radius:12px;margin-left:8px">\U0001F4BB {{ SYSTEM_ROLE }}</span></h1>
|
||||
<div style="display:flex;align-items:center;gap:8px">
|
||||
<span class="last-updated" id="lastUpdated">—</span>
|
||||
<button onclick="copyStatus()" style="background:#1f6feb;padding:6px 14px;font-size:13px">📋 Copy</button>
|
||||
<button onclick="fetchAll()" style="background:#238636;padding:6px 14px;font-size:13px">⟳ Refresh</button>
|
||||
<span class="last-updated" id="lastUpdated">\u2014</span>
|
||||
<button onclick="load()" style="background:#238636;padding:6px 14px;font-size:13px">\u27F3 Refresh</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="error"></div>
|
||||
|
||||
<div class="row">
|
||||
<div class="col">
|
||||
<div class="section">
|
||||
<h2>✅ Pipeline Checklist</h2>
|
||||
<table id="checklist"><tr><td colspan="3">Loading...</td></tr></table>
|
||||
<h2>\u2705 Pipeline Checklist</h2>
|
||||
<table id="checklist"><tr><td>Loading...</td></tr></table>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col">
|
||||
<div class="section">
|
||||
<h2>💻 System Health</h2>
|
||||
<h2>\U0001F4BB System Health</h2>
|
||||
<div id="health" style="font-size:14px">Loading...</div>
|
||||
</div>
|
||||
<div class="section">
|
||||
<h2>🛠 Services</h2>
|
||||
<h2>\U0001F6E0 Services</h2>
|
||||
<div id="services" style="font-size:14px">Loading...</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="section" id="fileProgressSection">
|
||||
<h2>📁 Pipeline Progress</h2>
|
||||
<div id="fileProgress" style="font-size:14px">Loading...</div>
|
||||
<div class="row">
|
||||
<div class="col">
|
||||
<div class="section">
|
||||
<h2>\U0001F4CA Qdrant Collections</h2>
|
||||
<div id="qdrant" style="font-size:14px">Loading...</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col">
|
||||
<div class="section">
|
||||
<h2>\u2699\uFE0F Background Processes</h2>
|
||||
<div id="processes" style="font-size:14px">Loading...</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="row">
|
||||
<div class="col">
|
||||
<div class="section">
|
||||
<h2>⚡ Redis</h2>
|
||||
<div id="redis" style="font-size:14px">Loading...</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col">
|
||||
<div class="section">
|
||||
<h2>🗄 Database</h2>
|
||||
<h2>\U0001F4DB Database</h2>
|
||||
<div id="db" style="font-size:14px">Loading...</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="section">
|
||||
<h2>⏱ Processor Timing</h2>
|
||||
<table id="timing"><tr><td>Loading...</td></tr></table>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
async function fetchAll() {
|
||||
async function load() {
|
||||
const ts = new Date().toISOString().slice(11,19);
|
||||
document.getElementById('lastUpdated').textContent = '🔄 ' + ts;
|
||||
document.getElementById("lastUpdated").textContent = "\U0001F504 " + ts;
|
||||
document.getElementById("error").style.display = "none";
|
||||
|
||||
try {
|
||||
const all = await (await fetch('/api/all')).json();
|
||||
_lastData = all;
|
||||
const status = all.status;
|
||||
renderChecklist(status.job);
|
||||
renderHealth(status.health);
|
||||
renderTiming(status.health?.processors);
|
||||
if (all.redis) renderRedis(all.redis);
|
||||
if (all.db) { renderDb(all.db); renderFileProgress(all.db); }
|
||||
document.getElementById('lastUpdated').textContent = '✅ ' + ts;
|
||||
const resp = await fetch("/api/all");
|
||||
if (!resp.ok) throw new Error("HTTP " + resp.status);
|
||||
const d = await resp.json();
|
||||
renderChecklist(d.status);
|
||||
renderHealth(d.status.health);
|
||||
renderQdrant(d.qdrant);
|
||||
renderProcesses(d.processes);
|
||||
renderDb(d.db);
|
||||
document.getElementById("lastUpdated").textContent = "\u2705 " + ts;
|
||||
} catch(e) {
|
||||
document.getElementById('checklist').innerHTML = '<tr><td class="fail">Error: ' + e.message + '</td></tr>';
|
||||
// Fallback: try separate endpoints
|
||||
try {
|
||||
const s = await (await fetch('/api/status')).json(); renderChecklist(s.job); renderHealth(s.health); renderTiming(s.health?.processors);
|
||||
} catch(e2) {}
|
||||
try {
|
||||
const r = await (await fetch('/api/redis')).json(); renderRedis(r);
|
||||
} catch(e2) {}
|
||||
try {
|
||||
const d = await (await fetch('/api/db')).json(); renderDb(d); renderFileProgress(d);
|
||||
} catch(e2) {}
|
||||
showError(e.message);
|
||||
document.getElementById("lastUpdated").textContent = "\u274C " + ts;
|
||||
}
|
||||
}
|
||||
|
||||
function renderChecklist(job) {
|
||||
if (!job || !job.stages) return;
|
||||
let h = '<tr><th>Stage</th><th>Status</th><th>Detail</th><th>Time</th></tr>';
|
||||
for (const s of job.stages) {
|
||||
const cls = s.passed ? 'pass' : 'fail';
|
||||
const icon = s.passed ? '✅' : '❌';
|
||||
h += '<tr><td>' + s.name + '</td><td class="' + cls + '">' + icon + '</td><td>' + s.detail + '</td><td>' + s.elapsed + 's</td></tr>';
|
||||
function showError(msg) {
|
||||
document.getElementById("error").innerHTML = "\u26A0\uFE0F " + msg;
|
||||
document.getElementById("error").style.display = "block";
|
||||
}
|
||||
|
||||
function renderChecklist(status) {
|
||||
const job = status || {};
|
||||
const stages = job.stages || [];
|
||||
let h = "<tr><th>Stage</th><th>Status</th><th>Detail</th></tr>";
|
||||
for (const s of stages) {
|
||||
h += "<tr><td>" + s.name + '</td><td class="' + (s.passed ? "pass" : "fail") + '">' + (s.passed ? "\u2705" : "\u274C") + "</td><td>" + s.detail + "</td></tr>";
|
||||
}
|
||||
const totalCls = job.passed ? 'pass' : 'fail';
|
||||
h += '<tr style="font-weight:bold;border-top:2px solid #30363d"><td>TOTAL</td><td class="' + totalCls + '">' + (job.passed ? '✅' : '❌') + '</td><td></td><td>' + job.total_elapsed + 's</td></tr>';
|
||||
document.getElementById('checklist').innerHTML = h;
|
||||
h += '<tr style="font-weight:bold;border-top:2px solid #30363d"><td>TOTAL</td><td class="' + (job.passed ? "pass" : "fail") + '">' + (job.passed ? "\u2705" : "\u274C") + "</td><td></td></tr>";
|
||||
document.getElementById("checklist").innerHTML = h;
|
||||
}
|
||||
|
||||
function renderHealth(h) {
|
||||
if (!h) return;
|
||||
const memPct = h.memory_used_mb ? (h.memory_used_mb / 49152 * 100).toFixed(1) : '?';
|
||||
const memBar = Math.min(parseFloat(memPct), 100);
|
||||
const barColor = memBar > 85 ? '#f85149' : memBar > 70 ? '#d29922' : '#3fb950';
|
||||
document.getElementById('health').innerHTML = `
|
||||
<div class="row">
|
||||
<div class="col"><div class="stat-card"><div class="stat-value">${h.cpu_load_1m ?? '?'}</div><div class="stat-label">CPU Load (1m)</div></div></div>
|
||||
<div class="col"><div class="stat-card"><div class="stat-value">${memPct}%</div><div class="stat-label">Memory</div><div class="progress-bar"><div class="progress-fill" style="width:${memBar}%;background:${barColor}"></div></div></div></div>
|
||||
<div class="col"><div class="stat-card"><div class="stat-value">${h.disk_use_pct ?? '?'}</div><div class="stat-label">Disk Used</div></div></div>
|
||||
</div>
|
||||
`;
|
||||
let cards = '<div class="row">';
|
||||
cards += '<div class="col"><div class="stat-card"><div class="stat-value">' + (h.cpu_load_1m ?? "?") + '</div><div class="stat-label">CPU Load (1m)</div></div></div>';
|
||||
const memPct = h.memory_used_mb ? (h.memory_used_mb / 49152 * 100).toFixed(1) : "?";
|
||||
cards += '<div class="col"><div class="stat-card"><div class="stat-value">' + memPct + '%</div><div class="stat-label">Memory</div></div></div>';
|
||||
cards += '<div class="col"><div class="stat-card"><div class="stat-value">' + (h.disk_use_pct ?? "?") + '</div><div class="stat-label">Disk</div></div></div>';
|
||||
cards += "</div>";
|
||||
document.getElementById("health").innerHTML = cards;
|
||||
|
||||
const svc = h.services || {};
|
||||
document.getElementById('services').innerHTML = Object.entries(svc).map(([k,v]) =>
|
||||
'<span style="margin-right:16px">' + (v ? '✅' : '❌') + ' ' + k + '</span>'
|
||||
).join('');
|
||||
let svcHtml = "";
|
||||
for (const [k, v] of Object.entries(svc)) {
|
||||
svcHtml += '<span style="margin-right:16px">' + (v ? "\u2705" : "\u274C") + " " + k + "</span>";
|
||||
}
|
||||
document.getElementById("services").innerHTML = svcHtml;
|
||||
}
|
||||
|
||||
function renderTiming(procs) {
|
||||
function renderQdrant(cols) {
|
||||
if (!cols) return;
|
||||
let h = "<table><tr><th>Collection</th><th>Points</th><th>Dim</th></tr>";
|
||||
for (let i = 0; i < cols.length; i++) {
|
||||
const c = cols[i];
|
||||
h += "<tr><td>" + c.name + "</td><td>" + (c.points >= 0 ? Number(c.points).toLocaleString() : "err") + "</td><td>" + c.dim + "</td></tr>";
|
||||
}
|
||||
h += "</table>";
|
||||
document.getElementById("qdrant").innerHTML = h;
|
||||
}
|
||||
|
||||
function renderProcesses(procs) {
|
||||
if (!procs) return;
|
||||
let h = '<tr><th>Processor</th><th>Duration</th></tr>';
|
||||
for (const p of procs) {
|
||||
const d = p.duration_secs;
|
||||
const dur = d ? (d < 60 ? d + 's' : d < 3600 ? Math.floor(d/60) + 'm ' + (d%60) + 's' : Math.floor(d/3600) + 'h ' + Math.floor((d%3600)/60) + 'm') : 'running';
|
||||
h += '<tr><td>' + p.name + '</td><td>' + dur + '</td></tr>';
|
||||
}
|
||||
document.getElementById('timing').innerHTML = h;
|
||||
}
|
||||
|
||||
function renderRedis(r) {
|
||||
if (!r) return;
|
||||
let h = '<div class="row">';
|
||||
const cards = [
|
||||
{k:'used_memory_human', l:'Memory Used'},
|
||||
{k:'total_system_memory_human', l:'System Memory'},
|
||||
{k:'connected_clients', l:'Clients'},
|
||||
{k:'hit_rate_pct', l:'Hit Rate'},
|
||||
{k:'momentry_keys', l:'Momentry Keys'},
|
||||
{k:'uptime_in_seconds', l:'Uptime'},
|
||||
];
|
||||
for (const c of cards) {
|
||||
let v = r[c.k] ?? '—';
|
||||
if (c.k === 'uptime_in_seconds' && typeof v === 'number') {
|
||||
v = v > 86400 ? Math.round(v/86400) + 'd' : Math.round(v/3600) + 'h';
|
||||
let h = "<table><tr><th>Script</th><th>Status</th></tr>";
|
||||
for (const name in procs) {
|
||||
const info = procs[name];
|
||||
if (info) {
|
||||
h += "<tr><td>" + name + "</td><td>\u25B6 running " + info.elapsed + "</td></tr>";
|
||||
} else {
|
||||
h += '<tr style="color:#8b949e"><td>' + name + "</td><td>\u23F3 idle</td></tr>";
|
||||
}
|
||||
if (c.k === 'hit_rate_pct' && typeof v === 'number') v = v.toFixed(1) + '%';
|
||||
h += '<div class="col"><div class="stat-card"><div class="stat-value">' + v + '</div><div class="stat-label">' + c.l + '</div></div></div>';
|
||||
}
|
||||
h += '</div>';
|
||||
if (r.key_sample && r.key_sample.length) {
|
||||
h += '<div style="margin-top:12px;font-size:12px;color:#8b949e">Recent keys: ' + r.key_sample.slice(0,6).join(', ') + '</div>';
|
||||
}
|
||||
document.getElementById('redis').innerHTML = h;
|
||||
}
|
||||
|
||||
const PIPELINE_STAGES = ['cut','scene','asr','asrx','yolo','ocr','face','pose','visual_chunk','story'];
|
||||
|
||||
function renderFileProgress(d) {
|
||||
const el = document.getElementById('fileProgress');
|
||||
if (!d || !d.files || d.files.length === 0) {
|
||||
el.innerHTML = '<div style="color:#8b949e">No files found</div>';
|
||||
return;
|
||||
}
|
||||
let h = '<table><tr><th>File</th><th>Status</th>';
|
||||
for (const s of PIPELINE_STAGES) h += '<th style="font-size:11px">' + s.slice(0,4) + '</th>';
|
||||
h += '</tr>';
|
||||
for (const f of d.files) {
|
||||
const name = f.name.length > 50 ? f.name.slice(0,50) + '...' : f.name;
|
||||
const statusIcon = f.job_status === 'running' ? '▶️' : f.job_status === 'pending' ? '⏳' : f.status === 'completed' ? '✅' : '❌';
|
||||
const progress = f.progress || {};
|
||||
h += '<tr><td style="max-width:300px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap" title="' + f.name + '">' + name + '</td>'
|
||||
+ '<td>' + statusIcon + ' ' + (f.job_status || f.status) + '</td>';
|
||||
for (const s of PIPELINE_STAGES) {
|
||||
const ps = progress[s.toUpperCase()] || {};
|
||||
const st = ps.status || '';
|
||||
let icon = '⬜';
|
||||
if (st === 'completed') icon = '✅';
|
||||
else if (st === 'running') icon = '⏳';
|
||||
else if (st === 'failed') icon = '❌';
|
||||
h += '<td style="text-align:center;font-size:13px">' + icon + '</td>';
|
||||
}
|
||||
h += '</tr>';
|
||||
}
|
||||
h += '</table>';
|
||||
el.innerHTML = h;
|
||||
h += "</table>";
|
||||
document.getElementById("processes").innerHTML = h;
|
||||
}
|
||||
|
||||
function renderDb(d) {
|
||||
if (!d) return;
|
||||
const rows = ['videos','chunks','face_detections','identities','tkg_nodes','tkg_edges'];
|
||||
const keys = ["videos","chunks","face_detections","identities","tkg_nodes","tkg_edges"];
|
||||
let h = '<div class="row">';
|
||||
for (const key of rows) {
|
||||
const v = d[key] ?? 0;
|
||||
h += '<div class="col"><div class="stat-card"><div class="stat-value">' + v.toLocaleString() + '</div><div class="stat-label">' + key.replace(/_/g,' ') + '</div></div></div>';
|
||||
for (let i = 0; i < keys.length; i++) {
|
||||
const v = d[keys[i]] ?? 0;
|
||||
h += '<div class="col"><div class="stat-card"><div class="stat-value">' + Number(v).toLocaleString() + '</div><div class="stat-label">' + keys[i].replace(/_/g," ") + '</div></div></div>';
|
||||
}
|
||||
h += '</div>';
|
||||
document.getElementById('db').innerHTML = h;
|
||||
h += "</div>";
|
||||
document.getElementById("db").innerHTML = h;
|
||||
}
|
||||
|
||||
let _lastData = null;
|
||||
function copyStatus() {
|
||||
if (!_lastData) { alert('No data loaded yet'); return; }
|
||||
const d = _lastData;
|
||||
const job = d.status?.job;
|
||||
const h = d.status?.health;
|
||||
const db = d.db;
|
||||
const r = d.redis;
|
||||
let lines = [];
|
||||
lines.push('Momentry Pipeline Status');
|
||||
lines.push('='.repeat(50));
|
||||
lines.push('System: ' + (d.system?.role || '?') + ' | ' + new Date().toISOString().slice(0,19).replace('T',' '));
|
||||
lines.push('');
|
||||
if (job?.stages) {
|
||||
lines.push('── Checklist ──');
|
||||
for (const s of job.stages) {
|
||||
lines.push(' ' + (s.passed ? '✅' : '❌') + ' ' + s.name.padEnd(14) + s.detail);
|
||||
}
|
||||
lines.push(' ' + (job.passed ? '✅' : '❌') + ' TOTAL'.padEnd(14) + job.total_elapsed + 's');
|
||||
lines.push('');
|
||||
}
|
||||
if (h) {
|
||||
lines.push('── Health ──');
|
||||
lines.push(' CPU: ' + (h.cpu_load_1m ?? '?') + ' Memory: ' + (h.memory_used_mb ?? '?') + 'MB GPU: ' + (h.gpu_available ? '✅' : '❌'));
|
||||
if (h.services) {
|
||||
lines.push(' Services: ' + Object.entries(h.services).map(([k,v]) => k + '=' + (v ? '✓' : '✗')).join(' '));
|
||||
}
|
||||
lines.push('');
|
||||
}
|
||||
if (r) {
|
||||
lines.push('── Redis ──');
|
||||
lines.push(' Keys: ' + (r.momentry_keys ?? '?') + ' Hit Rate: ' + (r.hit_rate_pct ?? '?') + '% Uptime: ' + (r.uptime_in_seconds ? Math.round(r.uptime_in_seconds/3600)+'h' : '?'));
|
||||
lines.push('');
|
||||
}
|
||||
if (db) {
|
||||
lines.push('── Database ──');
|
||||
const tbls = ['videos','chunks','face_detections','identities','tkg_nodes','tkg_edges'];
|
||||
for (const t of tbls) {
|
||||
if (db[t] !== undefined) lines.push(' ' + t + ': ' + db[t].toLocaleString());
|
||||
}
|
||||
if (db.files) {
|
||||
lines.push('');
|
||||
lines.push('── Files ──');
|
||||
for (const f of db.files) {
|
||||
lines.push(' ' + (f.job_status === 'running' ? '▶️' : f.job_status === 'pending' ? '⏳' : f.status === 'completed' ? '✅' : '❌') + ' ' + f.name.slice(0,60));
|
||||
}
|
||||
}
|
||||
lines.push('');
|
||||
}
|
||||
const text = lines.join('\n');
|
||||
navigator.clipboard.writeText(text).then(() => {
|
||||
const btn = event.target;
|
||||
const orig = btn.textContent;
|
||||
btn.textContent = '✅ Copied!';
|
||||
setTimeout(() => btn.textContent = orig, 2000);
|
||||
}).catch(() => alert('Copy failed'));
|
||||
}
|
||||
|
||||
fetchAll();
|
||||
setInterval(fetchAll, 15000);
|
||||
load();
|
||||
setInterval(load, 30000);
|
||||
</script>
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The listening port comes from DASHBOARD_PORT, defaulting to 5050.
    port = int(os.environ.get("DASHBOARD_PORT", 5050))
    # A single server start: app.run() blocks until the server stops, so a
    # second print/app.run pair after it would only start another server once
    # the first one has exited.
    print(f"Momentry Dashboard v2: http://0.0.0.0:{port}")
    app.run(host="0.0.0.0", port=port, threaded=True)
|
||||
|
||||
324
scripts/dense_scan_traces.py
Normal file
324
scripts/dense_scan_traces.py
Normal file
@@ -0,0 +1,324 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Dense Scan Traces - Re-scan frame-by-frame for traces with < 4 detections.
|
||||
|
||||
Flow:
|
||||
1. Query face_detections for traces with < 4 rows for a file_uuid
|
||||
2. For each short trace:
|
||||
a. Extract video segment (ffmpeg)
|
||||
b. Run face_processor.py with --sample-interval 1
|
||||
c. Match new detections to trace by embedding similarity
|
||||
d. Insert new rows into face_detections
|
||||
|
||||
Usage:
|
||||
python dense_scan_traces.py --file-uuid <uuid> [--video-path <path>]
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import argparse
|
||||
import subprocess
|
||||
import time
|
||||
import tempfile
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
# Database connection string, schema name and output directory can be
# overridden through the environment; the literals are the fallbacks.
DB_URL = os.getenv("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
SCHEMA = os.getenv("MOMENTRY_DB_SCHEMA", "dev")
OUTPUT_DIR = os.getenv("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")

# face_processor.py is looked up next to this script.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
FACE_PROCESSOR = os.path.join(SCRIPT_DIR, "face_processor.py")
PYTHON_BIN = "/opt/homebrew/bin/python3.11"

# Default threshold for get_short_traces: traces with fewer rows than this are selected.
MIN_DETECTIONS = 4
|
||||
|
||||
|
||||
def get_conn():
    """Open and return a new psycopg2 connection to ``DB_URL``."""
    connection = psycopg2.connect(DB_URL)
    return connection
|
||||
|
||||
|
||||
def get_video_path(file_uuid: str) -> Optional[str]:
    """Look up ``file_path`` for *file_uuid* in the ``videos`` table.

    Returns the ``file_path`` column of the first matching row, or None when
    no row matches. The cursor and connection are closed before returning.
    """
    query = f"SELECT file_path FROM {SCHEMA}.videos WHERE file_uuid = %s"
    connection = get_conn()
    cursor = connection.cursor()
    try:
        cursor.execute(query, (file_uuid,))
        record = cursor.fetchone()
        if record:
            return record[0]
        return None
    finally:
        cursor.close()
        connection.close()
|
||||
|
||||
|
||||
def get_short_traces(file_uuid: str, min_det: int = MIN_DETECTIONS) -> List[Dict]:
    """List the traces of *file_uuid* that have fewer than *min_det* detections.

    Each returned dict carries ``trace_id``, ``cnt`` (row count),
    ``start_frame`` and ``end_frame`` (lowest and highest ``frame_number``),
    ordered by ``trace_id``. Rows without a ``trace_id`` are ignored.
    """
    query = f"""
        SELECT trace_id, COUNT(*) as cnt,
               MIN(frame_number) as start_frame,
               MAX(frame_number) as end_frame
        FROM {SCHEMA}.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL
        GROUP BY trace_id
        HAVING COUNT(*) < %s
        ORDER BY trace_id
        """
    connection = get_conn()
    cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        cursor.execute(query, (file_uuid, min_det))
        return list(map(dict, cursor.fetchall()))
    finally:
        cursor.close()
        connection.close()
|
||||
|
||||
|
||||
def get_trace_embeddings(file_uuid: str, trace_id: int) -> List[Dict]:
    """Fetch the stored detections of one trace that have an embedding.

    Each returned dict carries ``frame_number``, ``x``, ``y``, ``width``,
    ``height`` and ``embedding``, ordered by ``frame_number``.
    """
    query = f"""
        SELECT frame_number, x, y, width, height, embedding
        FROM {SCHEMA}.face_detections
        WHERE file_uuid = %s AND trace_id = %s AND embedding IS NOT NULL
        ORDER BY frame_number
        """
    connection = get_conn()
    cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        cursor.execute(query, (file_uuid, trace_id))
        return list(map(dict, cursor.fetchall()))
    finally:
        cursor.close()
        connection.close()
|
||||
|
||||
|
||||
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of vectors *a* and *b*.

    Accepts lists, tuples or numpy arrays. Returns 0.0 when either input is
    None or empty, or when either vector has zero norm.
    """
    if a is None or b is None:
        return 0.0
    # Check emptiness through .size instead of truthiness: ``not array`` raises
    # ValueError for numpy arrays with more than one element.
    v1 = np.asarray(a, dtype=float)
    v2 = np.asarray(b, dtype=float)
    if v1.size == 0 or v2.size == 0:
        return 0.0
    n1, n2 = np.linalg.norm(v1), np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        return 0.0
    return float(np.dot(v1, v2) / (n1 * n2))
|
||||
|
||||
|
||||
def extract_video_segment(video_path: str, start_frame: int, end_frame: int, output_path: str, fps: float = 59.94):
    """Extract a frame range from video using ffmpeg (fast seek via -ss).

    Writes frames ``start_frame``..``end_frame`` (inclusive) of *video_path*
    to *output_path* without audio.

    Because ``-ss`` is given before ``-i``, frames ahead of the seek point
    never reach the filter graph and the ``select`` filter's ``n`` counter
    starts at 0 at the seek point. The requested absolute frame numbers are
    therefore translated into offsets from the seek frame before being passed
    to ``select``.

    Assumes constant frame rate *fps* and timestamps starting at 0, as the
    frame/fps conversion already did — TODO confirm for variable-frame-rate
    sources.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        subprocess.TimeoutExpired: ffmpeg ran longer than 120 seconds.
    """
    # Start decoding roughly one second ahead of the wanted range.
    lead_in = int(round(fps))
    seek_frame = max(0, start_frame - lead_in)
    # Aim half a frame before seek_frame so timestamp rounding cannot drop it:
    # the first frame delivered to the filter is then seek_frame itself.
    start_time = max(0.0, (seek_frame - 0.5) / fps)
    first = start_frame - seek_frame
    last = end_frame - seek_frame
    cmd = [
        "ffmpeg", "-y",
        "-ss", f"{start_time:.6f}",
        "-i", video_path,
        # Commas inside the expression are escaped for ffmpeg's filter parser.
        "-vf", f"select=between(n\\,{first}\\,{last}),setpts=PTS-STARTPTS",
        "-vsync", "0",
        "-an", output_path,
    ]
    subprocess.run(cmd, check=True, timeout=120, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
|
||||
|
||||
def match_new_detections(new_face_json: str, ref_embeddings: List[Dict],
                         similarity_threshold: float = 0.7) -> List[Dict]:
    """Match dense-scan detections to a trace by embedding similarity.

    Args:
        new_face_json: Path to the face-processor JSON output; expected to hold
            a "frames" list whose items have "frame" and "faces" entries.
        ref_embeddings: Reference rows of the trace; each must have an
            "embedding" entry.
        similarity_threshold: Minimum best cosine similarity for a face to be
            accepted.

    Returns:
        One dict per accepted face with frame_number, x, y, width, height,
        confidence, embedding and similarity. Empty when there are no
        references or the file has no usable "frames" list.
    """
    # Nothing to compare against: skip reading the file entirely.
    if not ref_embeddings:
        return []

    with open(new_face_json, encoding="utf-8") as f:
        data = json.load(f)

    frames = data.get("frames") if isinstance(data, dict) else None
    if not isinstance(frames, list):
        return []

    matches = []
    for frame_data in frames:
        frame_num = frame_data.get("frame", 0)
        for face in frame_data.get("faces", []):
            emb = face.get("embedding")
            if not emb:
                continue

            # Best similarity against any reference, floored at 0.0.
            best_sim = max(
                0.0,
                max(cosine_similarity(emb, ref["embedding"]) for ref in ref_embeddings),
            )
            if best_sim < similarity_threshold:
                continue

            matches.append({
                "frame_number": frame_num,
                "x": face["x"],
                "y": face["y"],
                "width": face["width"],
                "height": face["height"],
                "confidence": face.get("confidence", 0.5),
                "embedding": emb,
                "similarity": best_sim,
            })

    return matches
|
||||
|
||||
|
||||
def insert_detections(file_uuid: str, trace_id: int, detections: List[Dict]):
    """Store new detections for a trace, skipping frames the trace already has.

    Returns the number of rows inserted. Any database error rolls the whole
    batch back, is printed, and yields 0.
    """
    if not detections:
        return 0

    exists_sql = (
        f"SELECT 1 FROM {SCHEMA}.face_detections "
        f"WHERE file_uuid=%s AND frame_number=%s AND trace_id=%s"
    )
    insert_sql = f"""
                    INSERT INTO {SCHEMA}.face_detections
                    (file_uuid, frame_number, face_id, trace_id,
                     x, y, width, height, confidence, embedding)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                    """

    connection = get_conn()
    cursor = connection.cursor()
    try:
        added = 0
        for det in detections:
            # One row per (file, frame, trace): skip frames already stored.
            cursor.execute(exists_sql, (file_uuid, det["frame_number"], trace_id))
            if cursor.fetchone():
                continue

            # Empty/missing embeddings are stored as NULL.
            vector = det.get("embedding") or None
            row = (
                file_uuid, det["frame_number"], None, trace_id,
                det["x"], det["y"], det["width"], det["height"],
                det.get("confidence", 0.5), vector,
            )
            cursor.execute(insert_sql, row)
            added += 1
        connection.commit()
        return added
    except Exception as e:
        connection.rollback()
        print(f" [DENSE] DB error: {e}")
        return 0
    finally:
        cursor.close()
        connection.close()
|
||||
|
||||
|
||||
def dense_scan_trace(file_uuid: str, trace_id: int, video_path: str,
                     start_frame: int, end_frame: int, fps: float = 59.94):
    """Re-scan a trace's frame range frame-by-frame and store new matches.

    The range is padded by 15 frames on each side, extracted to a temporary
    clip, processed by the face processor with sample interval 1, and the
    resulting faces are matched against the trace's existing embeddings.

    Args:
        file_uuid: Video file UUID.
        trace_id: Trace to supplement.
        video_path: Path of the source video.
        start_frame: First frame of the trace.
        end_frame: Last frame of the trace.
        fps: Frame rate forwarded to extract_video_segment (defaults to that
            function's own default).

    Returns:
        Number of detections inserted (0 on any failure or when nothing new
        was found).
    """
    pad = 15
    seg_start = max(0, start_frame - pad)
    seg_end = end_frame + pad

    # Without reference embeddings there is nothing to match against.
    refs = get_trace_embeddings(file_uuid, trace_id)
    if not refs:
        return 0

    with tempfile.TemporaryDirectory() as tmpdir:
        # Extract segment
        segment_path = os.path.join(tmpdir, f"seg_{trace_id}.mp4")
        try:
            extract_video_segment(video_path, seg_start, seg_end, segment_path, fps)
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            stderr = getattr(e, "stderr", None)
            if isinstance(stderr, bytes):
                err = stderr.decode(errors="replace")
            else:
                err = stderr or str(e)
            print(f" [DENSE] ffmpeg failed: {err[:200]}")
            return 0

        # Run face_processor with sample_interval=1
        face_out = os.path.join(tmpdir, f"face_{trace_id}.json")
        try:
            subprocess.run(
                [PYTHON_BIN, FACE_PROCESSOR, segment_path, face_out,
                 "--sample-interval", "1", "--uuid", file_uuid],
                check=True, timeout=120,
                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
            )
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
            print(f" [DENSE] face_processor failed for trace {trace_id}: {e}")
            return 0

        if not os.path.exists(face_out):
            return 0

        # Must run while the temp directory (and face_out) still exists.
        new_detections = match_new_detections(face_out, refs)

    if not new_detections:
        return 0

    # Map segment-relative frame numbers back to source frame numbers and
    # drop frames the trace already covers.
    # NOTE(review): the "- 1" assumes the face processor numbers frames from 1 — confirm.
    known_frames = {r["frame_number"] for r in refs}
    adjusted = []
    for d in new_detections:
        d["frame_number"] = seg_start + d["frame_number"] - 1
        if d["frame_number"] not in known_frames:
            adjusted.append(d)

    if not adjusted:
        return 0

    count = insert_detections(file_uuid, trace_id, adjusted)
    print(f" [DENSE] Trace {trace_id}: added {count} new detections (range {seg_start}-{seg_end})")
    return count
|
||||
|
||||
|
||||
def main():
    """CLI entry point: find short face traces and densely re-scan each one."""
    parser = argparse.ArgumentParser(description="Dense re-scan for short face traces")
    parser.add_argument("--file-uuid", required=True, help="Video file UUID")
    parser.add_argument("--video-path", help="Video file path (auto-detect if omitted)")
    parser.add_argument("--min-detections", type=int, default=MIN_DETECTIONS,
                        help=f"Minimum detections per trace (default: {MIN_DETECTIONS})")
    parser.add_argument("--dry-run", action="store_true", help="Only list short traces")
    args = parser.parse_args()

    # argparse always populates this attribute (it has a default).
    min_det = args.min_detections

    # Resolve and validate the video location.
    video_path = args.video_path or get_video_path(args.file_uuid)
    if not video_path or not os.path.exists(video_path):
        print(f"[DENSE] Video not found: {video_path}", file=sys.stderr)
        sys.exit(1)
    print(f"[DENSE] Video: {video_path}")

    # Traces with fewer than min_det detections are candidates.
    short_traces = get_short_traces(args.file_uuid, min_det)
    print(f"[DENSE] Traces with < {min_det} detections: {len(short_traces)}")

    if args.dry_run:
        for trace in short_traces:
            print(f" Trace {trace['trace_id']}: {trace['cnt']} detections "
                  f"(frames {trace['start_frame']}-{trace['end_frame']})")
        return

    # Dense scan each short trace, keeping only the productive ones.
    started = time.time()
    gains = []
    for trace in short_traces:
        added = dense_scan_trace(
            args.file_uuid, trace["trace_id"], video_path,
            trace["start_frame"], trace["end_frame"],
        )
        if added > 0:
            gains.append(added)
    total_added = sum(gains)
    total_traces = len(gains)

    elapsed = time.time() - started
    print(f"\n[DENSE] Done: {total_traces} traces supplemented, "
          f"{total_added} new detections added, {elapsed:.1f}s")


if __name__ == "__main__":
    main()
|
||||
327
scripts/export_file.py
Executable file
327
scripts/export_file.py
Executable file
@@ -0,0 +1,327 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
momentry-export — 打包檔案歷程
|
||||
將單一 file_uuid 的所有產出打包成可攜帶的 tar.gz
|
||||
|
||||
Usage:
|
||||
python3 scripts/export_file.py <uuid> [--output <path>] [--include-video]
|
||||
|
||||
Example:
|
||||
python3 scripts/export_file.py fa182e9c26145b2c1a932f73d1d484e5 --output /tmp/test_export.tar.gz
|
||||
"""
|
||||
|
||||
import sys, os, json, argparse, tarfile, io, time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
|
||||
SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
|
||||
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
|
||||
|
||||
TABLES = [
|
||||
"pre_chunks", "chunks", "face_detections",
|
||||
"processor_results", "processor_versions",
|
||||
"videos", "api_keys",
|
||||
]
|
||||
|
||||
|
||||
def get_conn():
    """Open a new PostgreSQL connection using the configured DB_URL."""
    connection = psycopg2.connect(DB_URL)
    return connection
|
||||
|
||||
|
||||
def fetch_table(conn, table: str, uuid: str) -> list[dict]:
    """Fetch the rows of one table that reference the given UUID.

    The table is filtered on its `file_uuid` column when present, otherwise on
    `uuid`. Tables with neither column yield an empty list.

    Args:
        conn: Open database connection.
        table: Table name inside SCHEMA (must be a trusted identifier; it is
            interpolated into the SQL text).
        uuid: File UUID to filter on.

    Returns:
        One dict per matching row.
    """
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        cur.execute(
            "SELECT column_name, data_type FROM information_schema.columns "
            "WHERE table_schema = %s AND table_name = %s",
            (SCHEMA, table),
        )
        present = {c["column_name"] for c in cur.fetchall()}

        # Fixed priority so the choice never depends on catalog row order.
        uuid_col = next((name for name in ("file_uuid", "uuid") if name in present), None)
        if uuid_col is None:
            return []

        cur.execute(
            f"SELECT * FROM {SCHEMA}.{table} WHERE {uuid_col} = %s",
            (uuid,),
        )
        return [dict(r) for r in cur.fetchall()]
    finally:
        # Close the cursor on every path, including query errors.
        cur.close()
|
||||
|
||||
|
||||
def fetch_video_row(conn, uuid: str) -> dict | None:
    """Return the `videos` row for this UUID as a dict, or None if absent."""
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    cursor.execute(f"SELECT * FROM {SCHEMA}.videos WHERE file_uuid = %s", (uuid,))
    record = cursor.fetchone()
    cursor.close()
    if not record:
        return None
    return dict(record)
|
||||
|
||||
|
||||
def serialize_value(v):
    """Convert a database value into something json.dumps can encode.

    - datetime  -> ISO-8601 string
    - bytes / bytearray / memoryview (bytea) -> list of ints
    - anything else is returned unchanged
    """
    if isinstance(v, datetime):
        return v.isoformat()
    # psycopg2 hands bytea back as memoryview, so accept every bytes-like type.
    if isinstance(v, (bytes, bytearray, memoryview)):
        return list(bytes(v))
    return v
|
||||
|
||||
|
||||
def export_file(uuid: str, output_path: str, include_video: bool = False):
    """Export all data recorded for one file UUID into a tar.gz archive.

    The archive contains manifest.json, data/*.json (database rows),
    output/*.json (processor output files) and, optionally, the original
    video under original/.

    The archive is streamed to "<output_path>.part" and renamed on success,
    so a large video is never held in memory and a failed export leaves no
    partial file at output_path.

    Args:
        uuid: File UUID to export.
        output_path: Destination path of the tar.gz.
        include_video: Also pack the original video file when it exists.

    Returns:
        True when the archive was written, False when the UUID is unknown or
        there is nothing worth exporting.
    """
    t0 = time.time()
    print(f"[EXPORT] Exporting {uuid}...")

    conn = get_conn()
    tmp_path = None
    try:
        # Check the status of the most recent job first.
        cur = conn.cursor()
        cur.execute(
            f"SELECT status FROM {SCHEMA}.monitor_jobs WHERE uuid = %s ORDER BY id DESC LIMIT 1",
            (uuid,),
        )
        row = cur.fetchone()
        job_status = row[0] if row else "unknown"
        cur.close()

        if job_status == "completed":
            print(f" [EXPORT] Job status: ✅ {job_status}")
        elif job_status == "failed":
            print(f" [EXPORT] ⚠️ Job status: ❌ {job_status} (仍可匯出部分資料)")
        elif job_status == "running":
            print(f" [EXPORT] ⚠️ Job status: ⏳ {job_status} (處理中,產出不完全)")
        else:
            print(f" [EXPORT] ⚠️ Job status: {job_status}")

        video = fetch_video_row(conn, uuid)
        if not video:
            print(f"[EXPORT] UUID {uuid} not found in videos table")
            return False

        # History completeness check
        print(f"\n ── 歷程完整性檢查 ──")

        # Job status
        completeness = {"job": job_status == "completed"}

        # Processors: all 7 expected processors should be completed
        cur = conn.cursor()
        cur.execute(
            f"SELECT processor, status FROM {SCHEMA}.processor_results "
            f"WHERE file_uuid = %s ORDER BY processor",
            (uuid,),
        )
        procs = {r[0]: r[1] for r in cur.fetchall()}
        cur.close()
        expected = ["asr", "asrx", "cut", "face", "ocr", "pose", "yolo"]
        for p in expected:
            st = procs.get(p, "missing")
            completeness[f"proc_{p}"] = st == "completed"
        completeness["processors"] = f"{sum(1 for p in expected if procs.get(p)=='completed')}/{len(expected)}"

        # Output JSON files (collected once; reused when filling the archive)
        output_dir = Path(OUTPUT_DIR)
        json_files = sorted(output_dir.glob(f"{uuid}.*.json"))
        completeness["output_jsons"] = len(json_files)

        # Face detections
        cur = conn.cursor()
        cur.execute(
            f"SELECT count(*) FROM {SCHEMA}.face_detections WHERE file_uuid = %s",
            (uuid,),
        )
        completeness["face_detections"] = cur.fetchone()[0]
        cur.close()

        # Chunks (Rule 1)
        cur = conn.cursor()
        cur.execute(
            f"SELECT count(*) FROM {SCHEMA}.chunks WHERE file_uuid = %s",
            (uuid,),
        )
        completeness["chunks"] = cur.fetchone()[0]
        cur.close()

        # Print completeness report
        for k, v in completeness.items():
            icon = "✅" if v is True else ("❌" if v is False else "ℹ️")
            print(f" {icon} {k}: {v}")

        # Decide if export is viable
        has_core_data = completeness["output_jsons"] > 0 or completeness["face_detections"] > 0 or completeness["chunks"] > 0
        if not has_core_data and job_status != "completed":
            print(f"\n ⛔ 歷程不完整,無核心產出,中止匯出")
            return False

        print(f" ─────────────────\n")

        # Stream straight to disk; renamed to output_path only on success.
        tmp_path = f"{output_path}.part"
        with tarfile.open(tmp_path, mode="w:gz") as tar:
            # 1. Manifest
            manifest = {
                "exported_at": datetime.now().isoformat(),
                "version": "1.0",
                "file_uuid": uuid,
                "file_name": video.get("file_name"),
                "duration": video.get("duration"),
                "fps": float(video.get("fps") or 0),
                "width": video.get("width"),
                "height": video.get("height"),
                "total_frames": video.get("total_frames"),
                "include_video": include_video,
                "completeness": {k: str(v) if not isinstance(v, (bool, int, str)) else v
                                 for k, v in completeness.items()},
                "merge_policy": {
                    "identities": "merge_by_name",
                    "description": "匯入時 identity 依名稱比對,已存在則合併(保留 target 的 identity_id),不存在則新增",
                },
            }
            _add_json(tar, "manifest.json", manifest)

            # 2. Video metadata (videos table row)
            _add_json(tar, "data/video.json", video)

            # 3. DB tables
            for table in TABLES:
                rows = fetch_table(conn, table, uuid)
                if rows:
                    _add_json(tar, f"data/{table}.json", rows)
                    print(f" [EXPORT] {table}: {len(rows)} rows")
                else:
                    print(f" [EXPORT] {table}: (empty)")

            # 4. Face detection metadata (explicit columns, no embedding vector)
            cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
            cur.execute(
                f"SELECT id, file_uuid, frame_number, trace_id, x, y, width, height, "
                f"confidence, identity_id FROM {SCHEMA}.face_detections WHERE file_uuid = %s",
                (uuid,),
            )
            fd_rows = [dict(r) for r in cur.fetchall()]
            cur.close()
            if fd_rows:
                _add_json(tar, "data/face_detections_meta.json", fd_rows)
                print(f" [EXPORT] face_detections (meta): {len(fd_rows)} rows")
            else:
                print(f" [EXPORT] face_detections: (empty)")

            # 5. Identity-related data
            cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
            # Collect every identity_id referenced by this file's detections.
            cur.execute(
                f"SELECT DISTINCT identity_id FROM {SCHEMA}.face_detections "
                f"WHERE file_uuid = %s AND identity_id IS NOT NULL",
                (uuid,),
            )
            identity_ids = [r["identity_id"] for r in cur.fetchall()]

            if identity_ids:
                # identities table
                placeholders = ",".join(["%s"] * len(identity_ids))
                cur.execute(
                    f"SELECT * FROM {SCHEMA}.identities WHERE id IN ({placeholders})",
                    identity_ids,
                )
                ident_rows = [dict(r) for r in cur.fetchall()]
                _add_json(tar, "data/identities.json", ident_rows)
                print(f" [EXPORT] identities: {len(ident_rows)} rows")

                # identity_bindings table
                cur.execute(
                    f"SELECT * FROM {SCHEMA}.identity_bindings "
                    f"WHERE identity_id IN ({placeholders})",
                    identity_ids,
                )
                bind_rows = [dict(r) for r in cur.fetchall()]
                if bind_rows:
                    _add_json(tar, "data/identity_bindings.json", bind_rows)
                    print(f" [EXPORT] identity_bindings: {len(bind_rows)} rows")

                # file_identities table (optional: it may not exist)
                try:
                    cur.execute(
                        f"SELECT * FROM {SCHEMA}.file_identities WHERE file_uuid = %s",
                        (uuid,),
                    )
                    fi_rows = [dict(r) for r in cur.fetchall()]
                    if fi_rows:
                        _add_json(tar, "data/file_identities.json", fi_rows)
                        print(f" [EXPORT] file_identities: {len(fi_rows)} rows")
                except psycopg2.Error:
                    # A failed statement aborts the transaction; reset it so
                    # the connection stays usable.
                    conn.rollback()
            else:
                print(f" [EXPORT] identities: (none bound to this file)")
            cur.close()

            # 6. Output JSON files
            for jf in json_files:
                arcname = f"output/{jf.name}"
                tar.add(str(jf), arcname=arcname)
                print(f" [EXPORT] output/{jf.name} ({jf.stat().st_size / 1024:.0f}KB)")
            print(f" [EXPORT] output JSONs: {len(json_files)} files")

            # 7. Original video file (optional)
            if include_video and video.get("file_path"):
                src = video["file_path"]
                if os.path.exists(src):
                    tar.add(src, arcname="original/" + os.path.basename(src))
                    print(f" [EXPORT] original video: {src}")
                else:
                    print(f" [WARN] Video file not found: {src}")

        # Publish the finished archive in one step.
        os.replace(tmp_path, output_path)
    finally:
        conn.close()
        # Only present when the export failed part-way.
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)

    size_mb = os.path.getsize(output_path) / 1e6
    elapsed = time.time() - t0
    print(f"\n[EXPORT] Done: {output_path} ({size_mb:.1f}MB, {elapsed:.1f}s)")
    return True
|
||||
|
||||
|
||||
def _add_json(tar: tarfile.TarFile, arcname: str, data):
    """Serialize `data` as pretty-printed UTF-8 JSON and store it in the archive.

    Values json cannot encode natively are converted with str(). The member's
    mtime is the current time.
    """
    payload = json.dumps(data, ensure_ascii=False, default=str, indent=2).encode()
    member = tarfile.TarInfo(name=arcname)
    member.size = len(payload)
    member.mtime = int(time.time())
    tar.addfile(member, io.BytesIO(payload))
|
||||
|
||||
|
||||
def main():
    """CLI entry point: export one file UUID and exit 0 on success, 1 otherwise."""
    parser = argparse.ArgumentParser(description="Export file processing history")
    parser.add_argument("uuid", help="File UUID to export")
    parser.add_argument("--output", "-o", default=None,
                        help="Output tar.gz path (default: {uuid}.tar.gz)")
    parser.add_argument("--include-video", action="store_true",
                        help="Include original video file in export")
    args = parser.parse_args()

    # Fall back to "<uuid>.tar.gz" in the current directory.
    destination = args.output if args.output else f"{args.uuid}.tar.gz"
    ok = export_file(args.uuid, destination, args.include_video)
    exit_code = 0 if ok else 1
    sys.exit(exit_code)


if __name__ == "__main__":
    main()
|
||||
114
scripts/fix_asr_text.py
Normal file
114
scripts/fix_asr_text.py
Normal file
@@ -0,0 +1,114 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Redo ASR word-timestamp mapping correctly.
|
||||
Save words first, then map to fine segments with independent scanning.
|
||||
"""
|
||||
import json, sys, os, time, subprocess, tempfile, shutil
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"


def _texts_within(entries, fstart, fend):
    """Collect the "word" of every entry lying inside [fstart, fend + 0.05].

    Entries are expected in chronological order, so scanning stops at the
    first entry that starts after `fend`.
    """
    picked = []
    for w in entries:
        if w["start"] >= fstart and w["end"] <= fend + 0.05:
            picked.append(w["word"])
        elif w["start"] > fend:
            break  # words are sorted by time
    return picked


print("Load fine segments...")
with open(f"{BASE}/{UUID}.asrx_fine.json") as f:
    fine = json.load(f)
fine_segs = fine["segments"]
print(f"{len(fine_segs)} segments")

# Reuse saved words when available; only extract audio and load the model
# when a fresh transcription is actually needed.
words_file = f"{BASE}/{UUID}.words.json"
if os.path.exists(words_file):
    print("Loading saved words...")
    with open(words_file) as f:
        words = json.load(f)
else:
    tmp_dir = tempfile.mkdtemp(prefix="asr_fix_")
    try:
        # Extract full audio as 16 kHz mono 16-bit WAV.
        wav_path = os.path.join(tmp_dir, "audio.wav")
        subprocess.run(["ffmpeg", "-y", "-v", "quiet", "-i", VIDEO,
                        "-ar", "16000", "-ac", "1", "-sample_fmt", "s16", wav_path],
                       check=True, capture_output=True, timeout=300)

        print("Loading model...")
        model = WhisperModel("small", device="cpu", compute_type="int8")

        print("Transcribing with word_timestamps...")
        t0 = time.time()
        segments, info = model.transcribe(
            wav_path, beam_size=5, vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 500},
            word_timestamps=True
        )
        words = []
        for seg in segments:
            if seg.words:
                for w in seg.words:
                    wt = w.word.strip()
                    if wt:
                        words.append({"word": wt, "start": w.start, "end": w.end})
            # Also save segment-level as fallback
            words.append({"word": seg.text.strip(), "start": seg.start, "end": seg.end, "_seg": True})

        elapsed = time.time() - t0
        print(f" {len(words)} entries in {elapsed:.1f}s")
    finally:
        # Remove the temporary audio even when ffmpeg or transcription fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)
    with open(words_file, "w") as f:
        json.dump(words, f)

# Separate word-level and segment-level
word_entries = [w for w in words if not w.get("_seg")]
seg_entries = [w for w in words if w.get("_seg")]
print(f"Word-level: {len(word_entries)}, Segment-level: {len(seg_entries)}")

# Map: for each fine segment, find ALL word entries within its time range
print("Mapping words to segments...")
assigned = 0
for si, fs in enumerate(fine_segs):
    fstart = fs["start_time"]
    fend = fs["end_time"]

    # Word-level entries first (more precise), segment-level as fallback.
    seg_words = _texts_within(word_entries, fstart, fend)
    if not seg_words:
        seg_words = _texts_within(seg_entries, fstart, fend)

    text = " ".join(seg_words) if seg_words else ""
    fs["text"] = text
    if text:
        assigned += 1

    if (si + 1) % 500 == 0:
        print(f" {si+1}/{len(fine_segs)}")

print(f"Segments with text: {assigned}/{len(fine_segs)}")

# Fix empty segments: use original ASR text when the bounds match exactly
with open(f"{BASE}/{UUID}.asr.json") as f:
    asr = json.load(f)
asr_segs = asr["segments"]
asr_bounds = {(s['start'], s['end']): s['text'] for s in asr_segs}

for fs in fine_segs:
    if not fs.get('text', '').strip():
        fs['text'] = asr_bounds.get((fs['start_time'], fs['end_time']), "")

with_text = sum(1 for fs in fine_segs if fs.get('text', '').strip())
print(f"After fallback: {with_text}/{len(fine_segs)} with text")

# Save (create the metadata section if the input file lacks it)
fine.setdefault("_asr_meta", {})["word_file"] = words_file
with open(f"{BASE}/{UUID}.asrx_fine.json", "w") as f:
    json.dump(fine, f, indent=2)
print("Saved")
|
||||
142
scripts/gdino_comparison_test.py
Normal file
142
scripts/gdino_comparison_test.py
Normal file
@@ -0,0 +1,142 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Grounding DINO Base vs Large comparison test.
|
||||
Both use Swin-B backbone; Large trained on 7 datasets vs Base's 3.
|
||||
"""
|
||||
import json, os, sys, time, cv2, torch
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
||||
|
||||
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/gdino_comparison"
LARGE_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# (seconds into the video, label used in result keys)
TIMEPOINTS = [
    (2646, "2646s"), (3188, "3188s"), (3697, "3697s"), (5341, "5341s"),
    (5461, "5461s"), (6309, "6309s"), (6377, "6377s"), (6479, "6479s"),
]
PROMPTS = ["gun", "pistol", "rifle", "weapon"]

cap = cv2.VideoCapture(VIDEO)
if not cap.isOpened():
    # Fail before loading any model when the video cannot be read.
    sys.exit(f"Cannot open video: {VIDEO}")
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0


def get_frame(t_sec):
    """Return the BGR frame at `t_sec` seconds, or None if it cannot be read."""
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    ret, frame = cap.read()
    return frame if ret else None


def best_detection(model_name, label):
    """Return (prompt, score) of the top-scoring detection for one timepoint.

    Ties keep the earliest prompt in PROMPTS order. Returns None when the
    model produced no detection at that timepoint.
    """
    dets = all_results[model_name]["detections"]
    best = None
    for p in PROMPTS:
        for dd in dets.get(f"{label}_prompt-{p}", []):
            if best is None or dd["score"] > best[1]:
                best = (p, dd["score"])
    return best


models = {
    "base": {"path": "IDEA-Research/grounding-dino-base", "label": "Base (3 datasets)"},
    "large": {"path": LARGE_PATH, "label": "Large (7 datasets)"},
}

all_results = {}
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Device: {device}")

for model_name, model_info in models.items():
    print(f"\n{'='*60}")
    print(f"Loading {model_info['label']} ({model_name})...")
    print(f"{'='*60}")

    t_load = time.time()
    processor = AutoProcessor.from_pretrained(model_info["path"])
    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_info["path"]).to(device)
    load_time = time.time() - t_load
    print(f" Loaded in {load_time:.1f}s")

    model_dets = {}
    t0 = time.time()

    for t_sec, label in TIMEPOINTS:
        frame = get_frame(t_sec)
        if frame is None:
            continue
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        # PIL size is (width, height); the post-processor wants (height, width).
        target = torch.tensor([img.size[::-1]])

        for prompt in PROMPTS:
            inputs = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            dets = processor.post_process_grounded_object_detection(
                outputs, threshold=0.05, target_sizes=target
            )[0]

            model_dets[f"{label}_prompt-{prompt}"] = [
                {
                    "bbox": [round(v, 1) for v in box.tolist()],
                    "score": round(score.item(), 3),
                    "label": prompt,
                }
                for box, score in zip(dets["boxes"], dets["scores"])
            ]

    elapsed = time.time() - t0
    all_results[model_name] = {"elapsed": round(elapsed, 1), "detections": model_dets}
    print(f" Inference: {elapsed:.1f}s")

    del model
    # The MPS cache only exists when the MPS backend is in use.
    if device == "mps":
        torch.mps.empty_cache()

cap.release()

# ========== Summary ==========
print(f"\n{'='*60}")
print("COMPARISON SUMMARY")
print(f"{'='*60}")

# Derived from the configuration instead of a hard-coded 32.
total_slots = len(TIMEPOINTS) * len(PROMPTS)

for model_name in ["base", "large"]:
    d = all_results[model_name]
    dets = d["detections"]
    hits = sum(1 for v in dets.values() if v)
    total = sum(len(v) for v in dets.values())
    print(f"\n{model_name.upper()} ({d['elapsed']}s): {hits}/{total_slots} prompt-timepoint hits, {total} total detections")

    for t_sec, label in TIMEPOINTS:
        best = best_detection(model_name, label)
        if best:
            print(f" {t_sec}s ({(t_sec//60)}:{t_sec%60:02d}): best={best[1]:.3f} (prompt='{best[0]}')")
        else:
            print(f" {t_sec}s: no detections")

# Per-timepoint comparison
print(f"\n{'='*60}")
print("PER-TIMEPOINT COMPARISON")
print(f"{'='*60}")
for t_sec, label in TIMEPOINTS:
    base_best = best_detection("base", label)
    large_best = best_detection("large", label)

    b_str = f"base={base_best[1]:.3f} ({base_best[0]})" if base_best else "base=no det"
    l_str = f"large={large_best[1]:.3f} ({large_best[0]})" if large_best else "large=no det"

    delta = ""
    if base_best and large_best:
        diff = large_best[1] - base_best[1]
        delta = f" ({'+'if diff>0 else ''}{diff:.3f})"

    print(f" {t_sec}s: {b_str:30s} | {l_str:30s}{delta}")

# Save
with open(os.path.join(OUTPUT_DIR, "comparison_results.json"), "w") as f:
    json.dump(all_results, f, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")
|
||||
343
scripts/gdino_frame_api.py
Normal file
343
scripts/gdino_frame_api.py
Normal file
@@ -0,0 +1,343 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Grounding DINO Frame API v2 — Zero-shot detection + natural language range search.
|
||||
Usage:
|
||||
python3 scripts/gdino_frame_api.py # Start server (port 5051)
|
||||
curl http://localhost:5051/detect -d '{"time":5461,"prompt":"gun"}'
|
||||
curl http://localhost:5051/search -d '{"query":"find the gun","range":"0-6780"}'
|
||||
"""
|
||||
import json, os, sys, time, cv2, torch, re, psycopg2, threading
|
||||
from PIL import Image, ImageDraw
|
||||
from flask import Flask, request, jsonify, send_file
|
||||
from datetime import datetime, timezone
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
RESOURCE_ID = "grounding-dino-v1"
|
||||
RESOURCE_TYPE = "vision_detector"
|
||||
CATEGORY = "zero_shot_detection"
|
||||
MODEL_NAME = "IDEA-Research/grounding-dino-base"
|
||||
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
|
||||
BASE_DIR = "/Users/accusys/momentry/output_dev"
|
||||
SHOTS_DIR = os.path.join(BASE_DIR, "api_shots")
|
||||
os.makedirs(SHOTS_DIR, exist_ok=True)
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
PORT = int(os.environ.get("GDINO_API_PORT", 5051))
|
||||
|
||||
VIDEO_PATHS = {
|
||||
"aeed71342a899fe4b4c57b7d41bcb692":
|
||||
"/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4",
|
||||
}
|
||||
|
||||
_model = None
|
||||
_processor = None
|
||||
|
||||
def register_resource():
    """Register this service as a resource in dev.resources.

    Inserts the resource row, or on a resource_id conflict refreshes its
    status, heartbeat and config. Failures are printed, never raised, so the
    server can start without the registry.
    """
    capabilities = json.dumps({
        "detect": "Single-frame object detection",
        "search": "Time-range search with natural language query",
        "target_formats": ["file_uuid:chunk_id", "file_uuid:trace_id", "file_uuid:chunk_index", "range"],
    })
    # One config document for both the insert and the conflict update, so a
    # re-registration does not drop the "host" entry.
    config = json.dumps({"port": PORT, "device": DEVICE, "model": MODEL_NAME, "host": "localhost"})
    metadata = json.dumps({"version": "2.0", "docs": "/health"})

    conn = None
    try:
        conn = psycopg2.connect(DB_URL)
        cur = conn.cursor()
        cur.execute("""
            INSERT INTO dev.resources (resource_id, resource_type, category, capabilities, config, metadata, status, last_heartbeat)
            VALUES (%s, %s, %s, %s::jsonb, %s::jsonb, %s::jsonb, %s, NOW())
            ON CONFLICT (resource_id)
            DO UPDATE SET status = %s, last_heartbeat = NOW(), config = %s::jsonb
        """, (
            RESOURCE_ID, RESOURCE_TYPE, CATEGORY,
            capabilities, config, metadata,
            "online", "online", config,
        ))
        conn.commit()
        cur.close()
        print(f"[Resource] Registered as '{RESOURCE_ID}' (type={RESOURCE_TYPE})")
    except Exception as e:
        print(f"[Resource] Registration failed: {e}")
    finally:
        # Release the connection on failure as well as success.
        if conn is not None:
            conn.close()
|
||||
|
||||
def heartbeat_loop():
    """Update this resource's last_heartbeat every 60 seconds, forever.

    A failed update is reported and retried on the next cycle; it never stops
    the loop.
    """
    while True:
        conn = None
        try:
            conn = psycopg2.connect(DB_URL)
            cur = conn.cursor()
            cur.execute("UPDATE dev.resources SET last_heartbeat = NOW() WHERE resource_id = %s", (RESOURCE_ID,))
            conn.commit()
            cur.close()
        except Exception as e:
            # Catch Exception (not a bare except) so interpreter-exit signals
            # still propagate, and report the problem instead of hiding it.
            print(f"[Resource] Heartbeat failed: {e}")
        finally:
            if conn is not None:
                conn.close()
        time.sleep(60)
|
||||
|
||||
def get_model():
    """Return the (model, processor) pair, loading both on first use.

    The transformers import is deferred until the first call.
    """
    global _model, _processor
    if _model is not None:
        return _model, _processor

    from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

    print(f"[GDINO] Loading model on {DEVICE}...")
    started = time.time()
    _processor = AutoProcessor.from_pretrained(MODEL_NAME)
    _model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_NAME).to(DEVICE)
    print(f"[GDINO] Loaded in {time.time()-started:.1f}s")
    return _model, _processor
|
||||
|
||||
def find_video(uuid):
    """Return the path of the video file for *uuid*, or None if none is found.

    Results are memoised in ``VIDEO_PATHS``. On a cache miss the demo data
    directory is searched recursively, first for a file whose name starts
    with ``Charade``, then for one whose name contains the first 8 characters
    of *uuid*. Only ``.mp4`` / ``.mov`` / ``.avi`` files are accepted.
    """
    if uuid in VIDEO_PATHS:
        return VIDEO_PATHS[uuid]

    import glob

    base = "/Users/accusys/momentry/var/sftpgo/data/demo"
    video_exts = (".mp4", ".mov", ".avi")

    def first_video(pattern):
        # First glob hit that carries a video extension, else None.
        for candidate in glob.glob(pattern, recursive=True):
            if candidate.endswith(video_exts):
                return candidate
        return None

    # NOTE(review): the Charade pattern is tried first for every uuid, so the
    # uuid-specific lookup only runs when no Charade file exists — confirm
    # this is intended outside the demo.
    # `or` keeps the second pattern (and its uuid[:8] slice) lazy.
    found = first_video(f"{base}/**/Charade*") or first_video(f"{base}/**/*{uuid[:8]}*")
    if found is None:
        return None
    VIDEO_PATHS[uuid] = found
    return found
|
||||
|
||||
def resolve_target(target_str):
    """Resolve ``'file_uuid:<identifier>'`` to a time range in ``dev.chunks``.

    The identifier is tried, in order, as:
      1. an exact ``chunk_id``;
      2. a numeric chunk index, looked up as chunk_id ``"<uuid>_<index>"``;
      3. a trace id (``"trace_<n>"`` or a bare number), matched against
         ``chunk_type='trace'`` rows whose chunk_id ends in ``_trace_<n>``.

    Returns ``(uuid, start_sec, end_sec, label)``, or None when the target is
    malformed (not a string, no colon, empty uuid or identifier) or matches
    no row with usable start/end times.
    """
    if not isinstance(target_str, str) or ":" not in target_str:
        return None
    uuid, identifier = target_str.split(":", 1)
    if not uuid or not identifier:
        return None

    def as_range(row, label):
        # A missing row or NULL times means "not resolved", not a crash.
        if not row or row[0] is None or row[1] is None:
            return None
        return (uuid, float(row[0]), float(row[1]), label)

    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()

        def chunk_row(chunk_id):
            cur.execute("""
                SELECT start_time, end_time, chunk_id FROM dev.chunks
                WHERE file_uuid=%s AND chunk_id=%s LIMIT 1
            """, (uuid, chunk_id))
            return cur.fetchone()

        # Try chunk_id first
        resolved = as_range(chunk_row(identifier), identifier)
        if resolved:
            return resolved

        # Try chunk_index
        if identifier.isdigit():
            cid = f"{uuid}_{identifier}"
            resolved = as_range(chunk_row(cid), cid)
            if resolved:
                return resolved

        # Try trace_id
        if identifier.startswith("trace_") or identifier.isdigit():
            trace_id = identifier.replace("trace_", "")
            cur.execute("""
                SELECT MIN(start_time), MAX(end_time), chunk_id FROM dev.chunks
                WHERE file_uuid=%s AND chunk_type='trace' AND chunk_id LIKE %s
                GROUP BY chunk_id LIMIT 1
            """, (uuid, f"%_trace_{trace_id}"))
            resolved = as_range(cur.fetchone(), f"trace_{trace_id}")
            if resolved:
                return resolved

        return None
    finally:
        # Closing the connection also releases its cursor; runs on every path,
        # including a failed query.
        conn.close()
|
||||
|
||||
def parse_query(query):
    """Extract the object to search for from a natural-language query.

    The query is lower-cased, then one leading command phrase ("find ",
    "where is ", ...) and one leading article are removed. Trailing
    punctuation and filler phrases (" please", " in the image", ...) are
    stripped repeatedly until none is left, so stacked suffixes such as
    "gun in the image, please." reduce to "gun" regardless of their order.

    Returns the remaining text, which may be empty. None is treated as "".
    """
    query = (query or "").lower().strip()
    # Direct object name
    articles = ("a ", "an ", "the ", "some ", "any ")
    prefixes = ("find ", "show ", "search ", "where is ", "where are ",
                "looking for ", "detect ", "locate ", "spot ", "scan for ")
    suffixes = (" in the image", " in this scene", " in the picture",
                " being held", " in hand", " in frame", " please")
    for p in prefixes:
        if query.startswith(p):
            query = query[len(p):]
    for a in articles:
        if query.startswith(a):
            query = query[len(a):]
    # Remove trailing punctuation and filler words. Every pass that strips a
    # suffix shortens the query, so the loop terminates.
    stripped = True
    while stripped:
        stripped = False
        query = query.rstrip(".?!, ")
        for suffix in suffixes:
            if query.endswith(suffix):
                query = query[: -len(suffix)]
                stripped = True
    return query.strip()
|
||||
|
||||
def infer_frame(img, prompt, threshold=0.1):
    """Run the Grounding DINO model on one PIL image.

    *prompt* is passed to the processor with a trailing period. Returns a
    list of ``{"bbox", "score", "label"}`` dicts: box coordinates rounded to
    one decimal, score to three, and *prompt* itself as every label.
    """
    model, processor = get_model()
    inputs = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = model(**inputs)
    # PIL reports (width, height); target_sizes gets the reversed pair.
    target_size = img.size[::-1]
    dets = processor.post_process_grounded_object_detection(
        outputs, threshold=threshold, target_sizes=[target_size])[0]
    return [
        {
            "bbox": [round(v, 1) for v in box.tolist()],
            "score": round(score.item(), 3),
            "label": prompt,
        }
        for box, score in zip(dets["boxes"], dets["scores"])
    ]
|
||||
|
||||
@app.route("/detect", methods=["POST"])
def detect():
    """Detect objects in a single frame.

    Input: {"uuid","time","prompt","threshold"}

    Reads the frame at ``time`` seconds, runs ``infer_frame`` on it, draws
    the boxes, saves the annotated image under ``SHOTS_DIR`` and returns the
    detections, the inference time in ms and the ``/shots/...`` URL.

    Responds 400 for a non-numeric or negative ``time``, a non-numeric
    ``threshold``, an empty or non-string ``prompt``, or an unreadable frame;
    404 when no video is found.
    """
    import math
    import re

    def is_number(value):
        # bool is an int subclass; NaN/Infinity are accepted by the JSON parser.
        return (isinstance(value, (int, float)) and not isinstance(value, bool)
                and math.isfinite(value))

    data = request.json or {}
    uuid = data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692")
    t_sec = data.get("time", 0)
    prompt = data.get("prompt", "gun")
    threshold = data.get("threshold", 0.1)

    if not is_number(t_sec) or t_sec < 0:
        return jsonify({"error": "time must be a non-negative number of seconds"}), 400
    if not is_number(threshold):
        return jsonify({"error": "threshold must be a number"}), 400
    if not isinstance(prompt, str) or not prompt.strip():
        return jsonify({"error": "prompt must be a non-empty string"}), 400

    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404

    cap = cv2.VideoCapture(video)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
        ret, frame = cap.read()
    finally:
        cap.release()
    if not ret:
        return jsonify({"error": f"Cannot read frame at {t_sec}s"}), 400

    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    t0 = time.time()
    detections = infer_frame(img, prompt, threshold)
    infer_ms = (time.time() - t0) * 1000

    draw = ImageDraw.Draw(img)
    for d in detections:
        b = d["bbox"]
        draw.rectangle(b, outline="lime", width=3)
        draw.text((b[0], b[1]-18), f"{d['label']} {d['score']:.2f}", fill="lime")

    # uuid and prompt come from the request body: strip anything that could
    # act as a path separator so the file always lands inside SHOTS_DIR.
    safe_uuid = re.sub(r"\W+", "", str(uuid))[:8]
    safe_prompt = re.sub(r"[^\w.-]+", "_", prompt).strip("._")[:64] or "object"
    shot_name = f"{safe_uuid}_{int(t_sec)}s_{safe_prompt}.jpg"
    img.save(os.path.join(SHOTS_DIR, shot_name))

    return jsonify({
        "detections": detections,
        "time_ms": round(infer_ms, 1),
        "n_detections": len(detections),
        "shot_url": f"/shots/{shot_name}",
    })
|
||||
|
||||
@app.route("/search", methods=["POST"])
def search():
    """Search across a time range with natural language query.

    Input: {"uuid","target":"file_uuid:chunk_id","query":"find the gun","range":"0-6780","interval":30,"threshold":0.15}
    target: 'file_uuid:chunk_id' or 'file_uuid:trace_id' — resolves to time range automatically
    range: manual time range "start-end" in seconds (used if target not provided)

    Samples one frame every ``interval`` seconds, runs ``infer_frame`` on
    each, and returns the frames with at least one detection (at most 100).
    ``scanned_frames`` is the number of frames actually run through the
    detector.

    Responds 400 for an unparsable query or range, or a non-positive or
    non-numeric ``interval`` / non-numeric ``threshold``; 404 when the target
    or the video cannot be resolved.
    """
    import math

    def is_number(value):
        # bool is an int subclass; NaN/Infinity are accepted by the JSON parser.
        return (isinstance(value, (int, float)) and not isinstance(value, bool)
                and math.isfinite(value))

    data = request.json or {}
    uuid = data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692")
    target_str = data.get("target", "")
    query = data.get("query", "find the gun")
    range_str = data.get("range", "0-6780")
    interval = data.get("interval", 30)
    threshold = data.get("threshold", 0.15)

    if not is_number(interval) or interval <= 0:
        return jsonify({"error": "interval must be a positive number of seconds"}), 400
    if not is_number(threshold):
        return jsonify({"error": "threshold must be a number"}), 400
    if not isinstance(query, str):
        return jsonify({"error": f"Cannot parse query: {query}"}), 400

    prompt = parse_query(query)
    if not prompt:
        return jsonify({"error": f"Cannot parse query: {query}"}), 400

    # Resolve target → time range
    resolved_label = ""
    if target_str:
        resolved = resolve_target(target_str)
        if not resolved:
            return jsonify({"error": f"Cannot resolve target: {target_str}"}), 404
        uuid, range_start, range_end, resolved_label = resolved
    else:
        # Parse manual range; without a "-" the full default range is used.
        range_start = 0
        range_end = 6780
        range_text = str(range_str)
        if "-" in range_text:
            parts = range_text.split("-")
            try:
                range_start = float(parts[0])
                # "start-" (open end) falls back to the default end.
                range_end = float(parts[1]) if parts[1].strip() else 6780
            except ValueError:
                return jsonify({"error": f"Invalid range: {range_str}"}), 400
            if not (math.isfinite(range_start) and math.isfinite(range_end)):
                return jsonify({"error": f"Invalid range: {range_str}"}), 400

    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404

    cap = cv2.VideoCapture(video)
    hits = []
    scanned = 0
    t_start = time.time()
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A sub-frame interval would otherwise give range() a step of 0.
        frame_step = max(1, int(interval * fps))
        first_frame = max(0, int(range_start * fps))
        last_frame = min(int(range_end * fps), total_frames)

        for frame_num in range(first_frame, last_frame, frame_step):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                continue

            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            detections = infer_frame(img, prompt, threshold)
            scanned += 1

            if detections:
                ts = frame_num / fps
                best = max(d["score"] for d in detections)
                hits.append({
                    "time": round(ts, 1),
                    "time_str": f"{int(ts//60)}:{int(ts%60):02d}.{int((ts%1)*fps):02d}",
                    "frame": frame_num,
                    "detections": detections,
                    "best_score": best,
                })

            if len(hits) >= 100:  # safety limit
                break
    finally:
        # Release the capture even if decoding or inference raises.
        cap.release()
    elapsed = time.time() - t_start

    return jsonify({
        "query": query,
        "object": prompt,
        "target": target_str or None,
        "resolved_target": resolved_label or None,
        "range": f"{range_start:.0f}-{range_end:.0f}",
        "interval_secs": interval,
        "scanned_frames": scanned,
        "hits": hits,
        "n_hits": len(hits),
        "elapsed_secs": round(elapsed, 1),
    })
|
||||
|
||||
@app.route("/shots/<filename>")
def serve_shot(filename):
    """Serve a saved screenshot from ``SHOTS_DIR`` as ``image/jpeg``.

    Responds 404 with a JSON error when no regular file of that name exists.
    """
    path = os.path.join(SHOTS_DIR, filename)
    # isfile rather than exists: a name such as ".." resolves to a directory,
    # which passes exists() but cannot be sent as a file.
    if not os.path.isfile(path):
        return jsonify({"error": "Not found"}), 404
    return send_file(path, mimetype="image/jpeg")
|
||||
|
||||
@app.route("/health")
def health():
    """Report a fixed "ok" status plus this service's identity and settings."""
    payload = {
        "status": "ok",
        "resource_id": RESOURCE_ID,
        "resource_type": RESOURCE_TYPE,
        "model": MODEL_NAME,
        "device": DEVICE,
        "port": PORT,
    }
    return jsonify(payload)
|
||||
|
||||
if __name__ == "__main__":
    # Announce this service in dev.resources before it starts serving.
    register_resource()

    # Keep last_heartbeat fresh from a background daemon thread.
    t = threading.Thread(target=heartbeat_loop, daemon=True)
    t.start()

    # Load the model up front so the first request does not pay for it.
    get_model()
    for banner in (
        f"[GDINO] Frame API v2: http://0.0.0.0:{PORT}",
        f"[GDINO] Resource: {RESOURCE_ID} (type={RESOURCE_TYPE})",
    ):
        print(banner)
    app.run(host="0.0.0.0", port=PORT, threaded=True)
|
||||
155
scripts/generate_asr1.py
Normal file
155
scripts/generate_asr1.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Generate {uuid}.asr-1.json by comparing asr.json (3417) with DB chunks (4188).
|
||||
Identifies which ASR segments were split and records corrections.
|
||||
"""
|
||||
import json, os, subprocess, sys, time
|
||||
|
||||
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
|
||||
DB_USER = "accusys"
|
||||
DB_NAME = "momentry"
|
||||
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
|
||||
|
||||
def psql(sql):
    """Run *sql* through the ``psql`` CLI and return its stripped stdout.

    Output is tuples-only and unaligned, with the ASCII unit separator
    (``chr(31)``) between fields and one row per line.

    Raises:
        RuntimeError: if psql exits non-zero. A failed query must not be
            mistaken for an empty result set.
        subprocess.TimeoutExpired: if psql runs longer than 30 seconds.
    """
    cmd = [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME, "-t", "-A", "-F", chr(31), "-c", sql]
    r = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        raise RuntimeError(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}")
    return r.stdout.strip()
|
||||
|
||||
|
||||
def main():
    """Build ``{UUID}.asr-1.json`` from the ASR segments and the DB sentence chunks.

    Each DB sentence chunk is assigned to the ASR segment whose time range
    contains it (0.1 s tolerance; on ties the segment with the nearest
    midpoint wins). An ASR segment with one chunk goes to ``kept``; one with
    several chunks becomes a ``corrections`` entry with reason "split", whose
    children get ids ``"<asr_idx>-01"``, ``"<asr_idx>-02"``, ... in
    start-frame order. Chunks that match no ASR segment are reported and
    left out of the output.
    """
    from collections import defaultdict

    fps = 24  # frame rate used to convert ASR second offsets to frame numbers
    sep = chr(31)  # field separator configured in psql()

    t0 = time.time()
    print(f"Loading ASR segments from {UUID}.asr.json...")
    asr_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr.json")
    with open(asr_path, encoding="utf-8") as f:
        asr_data = json.load(f)
    asr_segs = asr_data["segments"]
    print(f"  {len(asr_segs)} ASR segments")

    print("Loading DB sentence chunks...")
    raw = psql(
        f"SELECT chunk_index, start_frame, end_frame, start_time, end_time, chunk_id, text_content "
        f"FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='sentence' "
        f"ORDER BY chunk_index"
    )
    rows = []
    for line in raw.split("\n"):
        parts = line.split(sep)
        if len(parts) >= 6:
            # A real row. The last row can have 6 fields: psql()'s strip()
            # removes a trailing separator when text_content is empty.
            rows.append(parts)
        elif rows:
            # Unaligned psql output prints newlines inside values verbatim,
            # so a short line continues the previous row's text_content.
            rows[-1][-1] += "\n" + line

    db_chunks = [
        {
            "chunk_index": int(r[0]),
            "start_frame": int(r[1]),
            "end_frame": int(r[2]),
            "start_time": float(r[3]),
            "end_time": float(r[4]),
            "chunk_id": r[5],
            "text_content": r[6] if len(r) > 6 and r[6] else "",
        }
        for r in rows
    ]
    # Look chunks up by chunk_index value, not list position: the two only
    # coincide when the indices happen to be exactly 0..n-1.
    chunk_by_index = {dc["chunk_index"]: dc for dc in db_chunks}
    print(f"  {len(db_chunks)} DB chunks")

    # For each DB chunk, find the best-matching ASR segment.
    # A DB chunk belongs to ASR segment i if chunk's time range
    # falls WITHIN ASR segment i's time range.
    asr_of_chunk = {}  # chunk_index -> asr_idx
    for dc in db_chunks:
        ct_mid = (dc["start_time"] + dc["end_time"]) / 2
        best_asr = None
        for ai, a in enumerate(asr_segs):
            if a["start"] - 0.1 <= dc["start_time"] and dc["end_time"] <= a["end"] + 0.1:
                if best_asr is None:
                    best_asr = ai
                else:
                    prev_a = asr_segs[best_asr]
                    prev_mid = (prev_a["start"] + prev_a["end"]) / 2
                    if abs(ct_mid - prev_mid) > abs(ct_mid - (a["start"] + a["end"]) / 2):
                        best_asr = ai
        if best_asr is not None:
            asr_of_chunk[dc["chunk_index"]] = best_asr

    print(f"  Mapped: {len(asr_of_chunk)} / {len(db_chunks)} chunks to ASR segments")
    unmapped = len(db_chunks) - len(asr_of_chunk)
    if unmapped:
        print(f"  WARNING: {unmapped} chunks matched no ASR segment and are omitted from the output")

    # Group DB chunks by ASR index
    chunks_by_asr = defaultdict(list)
    for ci, ai in asr_of_chunk.items():
        chunks_by_asr[ai].append(ci)

    # Build kept + corrections
    corrections = []
    kept = []
    for ai, child_indices in sorted(chunks_by_asr.items()):
        if len(child_indices) < 2:
            dc = chunk_by_index[child_indices[0]]
            kept.append({
                "chunk_index": ai,
                "start_frame": dc["start_frame"],
                "end_frame": dc["end_frame"],
                "text_content": dc["text_content"],
            })
            continue
        a = asr_segs[ai]
        children = []
        for ci in child_indices:
            dc = chunk_by_index[ci]
            children.append({
                "chunk_id": dc["chunk_id"],
                "start_frame": dc["start_frame"],
                "end_frame": dc["end_frame"],
                "text_content": dc["text_content"],
            })
        children_sorted = sorted(children, key=lambda x: x["start_frame"])

        # Assign new chunk_id format based on chunk_index
        # The first child of parent ASR idx N gets "N-01", second "N-02", etc.
        for si, child in enumerate(children_sorted):
            child["new_chunk_id"] = f"{ai}-{si+1:02d}"

        corrections.append({
            "parent_chunk_index": ai,
            "reason": "split",
            "original": {
                "start_frame": int(a["start"] * fps),
                "end_frame": int(a["end"] * fps),
                "text_content": a["text"],
            },
            "corrected": children_sorted
        })

    total_corrected = sum(len(c["corrected"]) for c in corrections)
    print(f"  Kept chunks: {len(kept)}")
    print(f"  Corrected chunks: {total_corrected}")
    print(f"  Total: {len(kept) + total_corrected} (should be {len(db_chunks)})\n")

    # Write output
    output = {
        "file_uuid": UUID,
        "asr_version": 1,
        "kept": kept,
        "corrections": corrections
    }
    output_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr-1.json")
    # ensure_ascii=False writes raw non-ASCII text, so pin the file encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nSaved: {output_path} ({os.path.getsize(output_path) / 1024:.0f} KB)")

    # Stats
    split_sizes = {}
    for c in corrections:
        n = len(c["corrected"])
        split_sizes[n] = split_sizes.get(n, 0) + 1
    print(f"\nSplit distribution:")
    for n in sorted(split_sizes):
        print(f"  {n} children: {split_sizes[n]} ASR segments → {n * split_sizes[n]} chunks")

    elapsed = time.time() - t0
    print(f"\nElapsed: {elapsed:.1f}s")


if __name__ == "__main__":
    main()
|
||||
198
scripts/generate_sentence_summaries.py
Normal file
198
scripts/generate_sentence_summaries.py
Normal file
@@ -0,0 +1,198 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Generate sentence-level summaries using parent story context.
|
||||
Each sentence gets an LLM summary informed by the parent chunk scene overview.
|
||||
"""
|
||||
|
||||
import json, time, sys, os
|
||||
from urllib.request import Request, urlopen
|
||||
import psycopg2
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
||||
|
||||
CHECKPOINT = f"/tmp/sentence_summaries_{UUID}.json"
|
||||
|
||||
def call_llm(prompt):
    """Send *prompt* to the chat-completions endpoint and return the reply text.

    Returns the stripped content of the first choice, or "" if the request
    or the response parsing fails. The failure is logged to stderr so an
    outage is visible instead of looking like an empty answer.
    """
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 80}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response on every path.
        with urlopen(req, timeout=30) as resp:
            data = json.loads(resp.read())
        return data["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"  [LLM] request failed: {e}", file=sys.stderr)
        return ""
|
||||
|
||||
def call_embed(text):
    """Request an embedding for *text* from the embeddings endpoint.

    Returns the ``embedding`` of the first data item, or None if the request
    or the response parsing fails. The failure is logged to stderr.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response on every path.
        with urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())["data"][0]["embedding"]
    except Exception as e:
        print(f"  [EMBED] request failed: {e}", file=sys.stderr)
        return None
|
||||
|
||||
print("=== Step 1: Build sentence→parent mapping ===")
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()

    # Get all story chunks with their child_chunk_ids
    cur.execute("""
        SELECT chunk_index, summary_text, child_chunk_ids
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'story'
        ORDER BY chunk_index
    """, (UUID,))
    stories = cur.fetchall()
    print(f"Loaded {len(stories)} story chunks")

    # Get all sentence chunks
    cur.execute("""
        SELECT chunk_index, text_content, metadata->>'new_speaker_name' as speaker
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'sentence'
        ORDER BY chunk_index
    """, (UUID,))
    all_sentences = {r[0]: {"text": r[1], "speaker": r[2]} for r in cur.fetchall()}
    print(f"Loaded {len(all_sentences)} sentence chunks")
finally:
    conn.close()

# Build: sentence_index → (parent_summary, sentence_text, speaker)
sentence_map = {}
skipped_child_ids = 0
for story_idx, summary_text, child_ids in stories:
    if not child_ids:
        continue
    for cid in child_ids:
        # The sentence index is the part after the last "_" of the child id.
        try:
            child_idx = int(cid.split("_")[-1])
        except ValueError:
            # An id without a plain integer suffix cannot be mapped.
            skipped_child_ids += 1
            continue
        if child_idx in all_sentences:
            sentence_map[child_idx] = {
                "parent_summary": summary_text or "",
                "sentence_text": all_sentences[child_idx]["text"] or "",
                "speaker": all_sentences[child_idx]["speaker"] or "Unknown",
            }
if skipped_child_ids:
    print(f"Skipped {skipped_child_ids} child ids without a numeric index")

# Load checkpoint if exists
prior_results = []
completed = set()
if os.path.exists(CHECKPOINT):
    with open(CHECKPOINT) as f:
        old = json.load(f)
    prior_results = old.get("results", [])
    # Only indices whose result is actually stored count as done. An index
    # marked completed without a stored result is regenerated, not lost.
    completed = {r["index"] for r in prior_results}
    print(f"Loaded checkpoint: {len(completed)} already completed")


def _save_checkpoint(new_results):
    """Write the earlier checkpointed results plus *new_results* to CHECKPOINT."""
    merged = prior_results + new_results
    with open(CHECKPOINT, "w") as f:
        json.dump({"completed": [r["index"] for r in merged], "results": merged}, f)


print("\n=== Step 2: Generate summaries ===")
results = []
errors = 0
sorted_indices = sorted(sentence_map.keys())

for i, idx in enumerate(sorted_indices):
    if idx in completed:
        continue

    info = sentence_map[idx]
    parent_summary = info["parent_summary"]
    sent_text = info["sentence_text"]
    speaker = info["speaker"]

    if not parent_summary or not sent_text:
        # Nothing to summarise against: keep the raw text, zero vector.
        summary = sent_text or ""
        embedding = [0.0] * 768
    else:
        prompt = f"Context: {parent_summary}\nUtterance: {sent_text}\n\nIn one short sentence, explain what the speaker communicates with this line within the context above."
        summary = call_llm(prompt)
        if not summary:
            # LLM failure: fall back to the utterance itself.
            errors += 1
            summary = sent_text
            embedding = [0.0] * 768
        else:
            embedding = call_embed(summary)
            if embedding is None:
                errors += 1
                embedding = [0.0] * 768
        time.sleep(0.15)

    results.append({
        "index": idx,
        "chunk_id": f"{UUID}_{idx}",
        "speaker_name": speaker,
        "utterance": sent_text,
        "summary": summary,
        "embedding": embedding,
    })

    if (i + 1) % 50 == 0:
        print(f"  [{i+1}/{len(sorted_indices)}] idx={idx} summary_len={len(summary)} errs={errors}")
        _save_checkpoint(results)

# Persist the tail of the run as well, not only every 50th item.
if results:
    _save_checkpoint(results)

print(f"Generated {len(results)} summaries, {errors} errors")

# All results: earlier checkpointed ones plus this run's (indices are
# disjoint because checkpointed indices were skipped above).
all_results = sorted(prior_results + results, key=lambda x: x["index"])

print(f"\nTotal summaries: {len(all_results)}")
|
||||
|
||||
print("\n=== Step 3: Update Qdrant sentence_summary ===")
# Delete old collection (best-effort: a failed delete is reported, not fatal).
# NOTE(review): the collection is dropped before the upload, so a failed
# upload leaves it empty or partial — confirm this is acceptable.
req = Request(f"{QDRANT_URL}/collections/sentence_summary", method="DELETE")
try:
    urlopen(req).close()
    time.sleep(0.5)
except Exception as e:
    print(f"  Delete skipped: {e}")

# Recreate
req = Request(f"{QDRANT_URL}/collections/sentence_summary",
              data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
              headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req).close()
time.sleep(0.5)

# Upload
batch_size = 100
points = [
    {
        # Point ids are the sentence index shifted by one.
        "id": r["index"] + 1,
        "vector": r["embedding"],
        "payload": {
            "chunk_type": "sentence",
            "uuid": UUID,
            "chunk_id": r["chunk_id"],
            "speaker_name": r["speaker_name"],
            "utterance": r["utterance"],
            "summary": r["summary"],
        }
    }
    for r in all_results
]

failed_batches = 0
for start in range(0, len(points), batch_size):
    batch = points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/sentence_summary/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req).close()
    except Exception as e:
        failed_batches += 1
        print(f"  Batch {start}: {e}")
    if (start // batch_size) % 5 == 0:
        print(f"  Uploaded {start + len(batch)}/{len(points)}")

print(f"Done: {len(points)} points in sentence_summary")
if failed_batches:
    print(f"  WARNING: {failed_batches} batches failed to upload")

# Verify
with urlopen(f"{QDRANT_URL}/collections/sentence_summary") as verify_resp:
    resp = json.loads(verify_resp.read())
info = resp["result"]
print(f"Verified: points={info['points_count']}, dim={info['config']['params']['vectors'].get('size','?')}")
|
||||
161
scripts/gun_detector_scan.py
Normal file
161
scripts/gun_detector_scan.py
Normal file
@@ -0,0 +1,161 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Gun Detector Scan — YOLOv8n fine-tuned gun detector on Charade (1963).
|
||||
Scans at ASR "gun" trigger points + fixed intervals, saves annotated screenshots.
|
||||
"""
|
||||
import json, os, sys, time, cv2, re
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from ultralytics import YOLO
|
||||
|
||||
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
|
||||
MODEL = "/Users/accusys/momentry_core_0.1/models/gun/gun_detector/weights/best.pt"
|
||||
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/gun_detections"
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
CLASS_NAMES = {0: "grenade", 1: "knife", 2: "pistol", 3: "rifle"}
|
||||
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load model
print(f"Loading model: {MODEL}")
model = YOLO(MODEL)
print(f"Classes: {model.names}")

# Open video
cap = cv2.VideoCapture(VIDEO)
if not cap.isOpened():
    # Without this check the 25 fps fallback and a frame count of 0 would let
    # the scan "finish" with zero detections.
    sys.exit(f"Cannot open video: {VIDEO}")
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0  # fall back when the container reports 0
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"Video: {fps:.1f} fps, {total_frames} frames ({total_frames/fps/60:.1f} min)")
|
||||
|
||||
# === Collect scan timepoints ===
|
||||
print("\n=== Collecting scan timepoints ===")
|
||||
|
||||
# 1. ASR mentions of "gun"
|
||||
import psycopg2
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
conn = psycopg2.connect(DB_URL)
|
||||
cur = conn.cursor()
|
||||
cur.execute("""
|
||||
SELECT DISTINCT start_time FROM dev.chunks
|
||||
WHERE file_uuid=%s AND chunk_type='sentence'
|
||||
AND text_content ILIKE CONCAT('%%', %s, '%%')
|
||||
ORDER BY start_time
|
||||
""", (UUID, 'gun'))
|
||||
asr_times = [r[0] for r in cur.fetchall()]
|
||||
conn.close()
|
||||
print(f"ASR 'gun' mentions: {len(asr_times)} timepoints")
|
||||
|
||||
# 2. Fixed interval scan (every 60 seconds)
|
||||
fixed_times = list(range(0, int(total_frames / fps), 60))
|
||||
print(f"Fixed interval (60s): {len(fixed_times)} timepoints")
|
||||
|
||||
# 3. The original 5 pistol timestamps (3188, 5461, 6309, 6377, 6479)
|
||||
original_hits = [3188, 5461, 6309, 6377, 6479]
|
||||
|
||||
# Merge all timepoints, rounded to nearest second
|
||||
all_times = set()
|
||||
for t in asr_times + fixed_times + original_hits:
|
||||
all_times.add(int(round(t)))
|
||||
all_times = sorted(all_times)
|
||||
print(f"Total unique scan points: {len(all_times)}")
|
||||
print(f"Range: {all_times[0]}s - {all_times[-1]}s")
|
||||
|
||||
# === Scan ===
print("\n=== Scanning ===")
results = []
frame_step = 30  # half-width, in frames, of the window scanned around each timepoint

t0 = time.time()
for scan_idx, t_sec in enumerate(all_times):
    # Window of frames around this timepoint, clamped to the video
    center_frame = int(t_sec * fps)
    start_frame = max(0, center_frame - frame_step)
    end_frame = min(total_frames, center_frame + frame_step)

    for frame_num in range(start_frame, end_frame + 1, 3):  # every 3rd frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret:
            break

        dets = model(frame, conf=0.25, verbose=False)[0]

        for det in dets.boxes.data:
            # Row layout used here: x1, y1, x2, y2, confidence, class id
            x1, y1, x2, y2 = (int(v) for v in det[:4])
            conf = float(det[4])
            cls_id = int(det[5])
            class_name = CLASS_NAMES.get(cls_id, f"class_{cls_id}")

            # Draw annotation. Boxes accumulate on the frame, so a later
            # detection's image also shows the earlier ones.
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
            label = f"{class_name} {conf:.2f}"
            cv2.putText(frame, label, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

            ts = frame_num / fps
            filename = f"{int(ts)}s_{class_name}_{conf:.3f}.jpg"
            filepath = os.path.join(OUTPUT_DIR, filename)
            cv2.imwrite(filepath, frame, [cv2.IMWRITE_JPEG_QUALITY, 85])

            detection = {
                "timestamp": round(ts, 1),
                "time_str": f"{int(ts//60)}:{int(ts%60):02d}.{int((ts%1)*fps):02.0f}",
                "frame": frame_num,
                "class": class_name,
                "confidence": round(conf, 3),
                "image": filename,
            }
            results.append(detection)

    if (scan_idx + 1) % 20 == 0:
        elapsed = time.time() - t0
        print(f"  [{scan_idx+1}/{len(all_times)}] {len(results)} detections so far [{elapsed:.0f}s]")

cap.release()

print(f"\n=== Scan Complete ===")
print(f"Scan points: {len(all_times)}")
print(f"Total detections: {len(results)}")
|
||||
|
||||
# Deduplicate nearby detections (same class within 2 seconds).
# The most recent kept entry is tracked per class: comparing only with the
# last kept entry overall misses duplicates whenever another class is
# detected in between.
results.sort(key=lambda r: (r["timestamp"], r["class"]))
deduped = []
last_kept = {}  # class name -> position in deduped of its latest kept detection
for r in results:
    pos = last_kept.get(r["class"])
    if pos is not None and r["timestamp"] - deduped[pos]["timestamp"] < 2:
        # Same class within the window: keep whichever is more confident.
        if r["confidence"] > deduped[pos]["confidence"]:
            deduped[pos] = r
    else:
        last_kept[r["class"]] = len(deduped)
        deduped.append(r)
print(f"After dedup: {len(deduped)} detections")
|
||||
|
||||
# Group by class
by_class = defaultdict(list)
for r in deduped:
    by_class[r["class"]].append(r)
print(f"\nDetections by class:")
for cls in sorted(by_class):
    items = by_class[cls]
    print(f"  {cls}: {len(items)}")
    # Five most confident detections of this class
    top_five = sorted(items, key=lambda x: x["confidence"], reverse=True)[:5]
    for r in top_five:
        print(f"    {r['time_str']} conf={r['confidence']:.3f} frame={r['frame']} {r['image']}")

# Check if original 5 were found (a pistol within 3 seconds of each timestamp)
print(f"\nOriginal 5 pistol timestamps:")
for t in original_hits:
    found = [r for r in deduped if r["class"] == "pistol" and abs(r["timestamp"] - t) < 3]
    if not found:
        print(f"  {t}s: ❌ NOT FOUND")
        continue
    best = max(found, key=lambda x: x["confidence"])
    print(f"  {t}s: ✅ FOUND conf={best['confidence']:.3f} {best['image']}")
|
||||
|
||||
# Save JSON
output = {
    "uuid": UUID,
    "model": str(MODEL),
    "scan_points": len(all_times),
    "total_detections": len(results),
    "after_dedup": len(deduped),
    "detections": sorted(deduped, key=lambda x: x["timestamp"]),
}
json_path = os.path.join(OUTPUT_DIR, "gun_detections.json")
# The context manager flushes and closes the file before "Saved" is printed.
with open(json_path, "w") as f:
    json.dump(output, f, indent=2)
print(f"\nSaved: {json_path}")
print(f"Images: {OUTPUT_DIR}/")
|
||||
259
scripts/import_file.py
Normal file
259
scripts/import_file.py
Normal file
@@ -0,0 +1,259 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
momentry-import — 匯入檔案歷程封包
|
||||
將 export_file.py 產出的 tar.gz 匯入到目標 Momentry 系統
|
||||
|
||||
Usage:
|
||||
python3 scripts/import_file.py <package.tar.gz> [--schema <schema>]
|
||||
|
||||
Example:
|
||||
python3 scripts/import_file.py /tmp/charade_export.tar.gz --schema dev
|
||||
"""
|
||||
|
||||
import sys, os, json, argparse, tarfile, io, tempfile, shutil
|
||||
from pathlib import Path
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
|
||||
SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
|
||||
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
|
||||
|
||||
|
||||
def get_conn():
    """Open and return a new psycopg2 connection to ``DB_URL``."""
    connection = psycopg2.connect(DB_URL)
    return connection
|
||||
|
||||
|
||||
def json_loads(data: bytes):
    """Decode *data* with the default codec (UTF-8) and parse it as JSON."""
    text = data.decode()
    return json.loads(text)
|
||||
|
||||
|
||||
def _read_json_member(tar, name):
    """Parse and return the JSON document stored as member *name* of *tar*.

    Raises KeyError (from ``tarfile``) when the member is absent and
    ValueError when it exists but is not a regular file.
    """
    member_file = tar.extractfile(name)
    if member_file is None:
        raise ValueError(f"{name} is not a regular file in the package")
    with member_file:
        return json_loads(member_file.read())


def _safe_output_path(output_dir, member_name):
    """Return the destination for an ``output/...`` member, or None if it would leave *output_dir*."""
    root = output_dir.resolve()
    # Strip only the leading "output/" prefix, then make sure "..", absolute
    # paths or symlinked parents cannot place the file outside the output root.
    candidate = (root / member_name[len("output/"):]).resolve()
    return candidate if root in candidate.parents else None


def _failed_suffix(failed):
    """Return a short " (N failed)" note for summary lines, or "" when nothing failed."""
    return f" ({failed} failed)" if failed else ""


def _insert_rows(cur, sql, items, to_params, label, progress=False, max_reported=5):
    """Run *sql* once per item, isolating every row in its own savepoint.

    Without the savepoint a single failing INSERT leaves the PostgreSQL
    transaction aborted, so every later statement of the batch fails as well
    and the final commit stores nothing.  Returns ``(inserted, failed)`` where
    *inserted* counts statements that executed without error.
    """
    inserted = failed = 0
    total = len(items)
    for item in items:
        cur.execute("SAVEPOINT import_row")
        try:
            cur.execute(sql, to_params(item))
        except (psycopg2.Error, KeyError, TypeError, ValueError, AttributeError) as e:
            # Best-effort import: undo only this row and keep going, but say so.
            cur.execute("ROLLBACK TO SAVEPOINT import_row")
            failed += 1
            if failed <= max_reported:
                print(f" ⚠️ {label} 匯入失敗: {e}")
        else:
            inserted += 1
            if progress and inserted % 1000 == 0:
                print(f" ... {inserted}/{total}", end="\r")
        cur.execute("RELEASE SAVEPOINT import_row")
    return inserted, failed


def import_package(package_path: str, schema: str):
    """Import a package produced by export_file.py into the target database.

    The ``.tar.gz`` package is expected to contain ``manifest.json`` and
    ``data/video.json``; ``data/identities.json``,
    ``data/identity_bindings.json``, ``data/pre_chunks.json``,
    ``data/processor_results.json``, ``data/face_detections.json`` (or
    ``data/face_detections_meta.json``) and any ``output/*`` files are imported
    when present.

    Args:
        package_path: Path to the ``.tar.gz`` package.
        schema: Target database schema; must be a plain SQL identifier because
            it is interpolated into the statements.

    Raises:
        ValueError: If *schema* is not a plain identifier, or a required
            package member is not a regular file.
        KeyError: If a required package member or manifest key is missing.
    """
    import re  # local import: the file's import block does not provide it

    # Identifiers cannot be passed as bound parameters, so reject anything that
    # is not a simple name before it reaches the SQL text.
    if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", schema):
        raise ValueError(f"Invalid schema name: {schema!r}")

    print(f"[IMPORT] Opening {package_path}...")

    with tarfile.open(package_path, "r:gz") as tar:
        # Scan the archive once; membership tests below reuse this set.
        members = tar.getmembers()
        member_names = {m.name for m in members}

        # Read the manifest
        manifest = _read_json_member(tar, "manifest.json")
        uuid = manifest["file_uuid"]
        print(f"[IMPORT] File: {manifest.get('file_name','?')} ({uuid})")
        print(f"[IMPORT] Exported at: {manifest.get('exported_at','?')}")
        print(f"[IMPORT] Completeness: {manifest.get('completeness',{})}")
        print(f"[IMPORT] Merge policy: {manifest.get('merge_policy',{})}")

        conn = get_conn()
        cur = conn.cursor()
        try:
            # Step 1: check whether the target system already has this file_uuid
            cur.execute(
                f"SELECT file_uuid FROM {schema}.videos WHERE file_uuid = %s",
                (uuid,),
            )
            if cur.fetchone():
                print(f" ⚠️ UUID {uuid} 已存在於目標系統")
                # TODO: support overwrite or skip

            # Step 2: import identities (identity merge has to happen first)
            identity_map = {}  # old_id → new_id
            if "data/identities.json" in member_names:
                identities = _read_json_member(tar, "data/identities.json")
                print("\n ── Identity Merge ──")
                for ident in identities:
                    old_id = ident["id"]
                    name = ident.get("name", "")
                    # Match by name
                    cur.execute(
                        f"SELECT id FROM {schema}.identities WHERE name = %s",
                        (name,),
                    )
                    row = cur.fetchone()
                    if row:
                        # Already present → merge into the existing identity
                        identity_map[old_id] = row[0]
                        print(f" 🔗 '{name}' → 已存在 (id={row[0]}), 合併")
                    else:
                        # Not present → create it
                        cur.execute(
                            f"INSERT INTO {schema}.identities (name) VALUES (%s) RETURNING id",
                            (name,),
                        )
                        new_id = cur.fetchone()[0]
                        identity_map[old_id] = new_id
                        print(f" ✅ '{name}' → 新增 (id={new_id})")
                conn.commit()
                print(" ────────────────")
            else:
                print(" [IMPORT] identities: (package 無 identity 資料)")

            # Step 3: import identity_bindings (if any)
            if "data/identity_bindings.json" in member_names:
                bindings = _read_json_member(tar, "data/identity_bindings.json")

                def binding_params(b):
                    return (
                        identity_map.get(b["identity_id"], b["identity_id"]),
                        b["identity_type"],
                        b["identity_value"],
                        json.dumps(b.get("metadata", {})),
                        b.get("confidence", 1.0),
                    )

                _, failed = _insert_rows(
                    cur,
                    f"INSERT INTO {schema}.identity_bindings "
                    f"(identity_id, identity_type, identity_value, metadata, confidence) "
                    f"VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING",
                    bindings,
                    binding_params,
                    "binding",
                )
                conn.commit()
                print(f" [IMPORT] identity_bindings: {len(bindings)} rows{_failed_suffix(failed)}")

            # Step 4: import the videos row
            video_data = _read_json_member(tar, "data/video.json")
            cur.execute(
                f"""
                INSERT INTO {schema}.videos
                (file_uuid, file_path, file_name, file_type, duration, width, height,
                fps, total_frames, probe_json, status)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'completed')
                ON CONFLICT (file_uuid) DO UPDATE SET
                file_path = EXCLUDED.file_path,
                file_name = EXCLUDED.file_name,
                status = 'completed'
                """,
                (
                    uuid,
                    video_data.get("file_path", ""),
                    video_data.get("file_name", ""),
                    video_data.get("file_type", "video"),
                    video_data.get("duration"),
                    video_data.get("width"),
                    video_data.get("height"),
                    float(video_data.get("fps") or 0),
                    video_data.get("total_frames"),
                    json.dumps(video_data.get("probe_json", {})),
                ),
            )
            conn.commit()
            print(" [IMPORT] videos: ✅")

            # Step 5: copy the output JSON files
            output_dir = Path(OUTPUT_DIR)
            for member in members:
                if not (member.name.startswith("output/") and member.isfile()):
                    continue
                dst = _safe_output_path(output_dir, member.name)
                if dst is None:
                    print(f" ⚠️ skipped unsafe path: {member.name}")
                    continue
                dst.parent.mkdir(parents=True, exist_ok=True)
                with tar.extractfile(member) as src_f, open(dst, "wb") as dst_f:
                    shutil.copyfileobj(src_f, dst_f)
                fname = member.name[len("output/"):]
                print(f" [IMPORT] output/{fname} ({member.size // 1024}KB)")
            print(" [IMPORT] output files: 完成")

            # Step 6: import pre_chunks (row by row, single commit)
            if "data/pre_chunks.json" in member_names:
                pre_chunks = _read_json_member(tar, "data/pre_chunks.json")
                # Look up file_id (the id column of the videos table) first
                cur.execute(f"SELECT id FROM {schema}.videos WHERE file_uuid = %s", (uuid,))
                file_row = cur.fetchone()
                if file_row:
                    file_id = file_row[0]

                    def pre_chunk_params(pc):
                        return (
                            file_id, uuid,
                            pc.get("processor_type"), pc.get("coordinate_type"),
                            pc.get("coordinate_index"),
                            pc.get("start_frame"), pc.get("end_frame"),
                            pc.get("start_time"), pc.get("end_time"),
                            pc.get("fps"), json.dumps(pc.get("data", {})),
                        )

                    inserted, failed = _insert_rows(
                        cur,
                        f"INSERT INTO {schema}.pre_chunks "
                        f"(file_id, file_uuid, processor_type, coordinate_type, "
                        f"coordinate_index, start_frame, end_frame, start_time, end_time, "
                        f"fps, data) "
                        f"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
                        f"ON CONFLICT DO NOTHING",
                        pre_chunks,
                        pre_chunk_params,
                        "pre_chunk",
                        progress=True,
                    )
                    conn.commit()
                    print(f" [IMPORT] pre_chunks: {inserted} rows{_failed_suffix(failed)} \n")
                else:
                    print(" [IMPORT] pre_chunks: 無法取得 file_id")

            # Step 7: import processor_results
            if "data/processor_results.json" in member_names:
                results = _read_json_member(tar, "data/processor_results.json")
                _, failed = _insert_rows(
                    cur,
                    f"INSERT INTO {schema}.processor_results "
                    f"(job_id, file_uuid, processor, status, chunks_produced, frames_processed) "
                    f"VALUES (0, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING",
                    results,
                    lambda r: (
                        uuid, r.get("processor"), r.get("status"),
                        r.get("chunks_produced", 0), r.get("frames_processed", 0),
                    ),
                    "processor_result",
                )
                conn.commit()
                print(f" [IMPORT] processor_results: {len(results)} rows{_failed_suffix(failed)}")

            # Step 8: import face_detections (the embedding column is not imported)
            face_detections_src = next(
                (
                    candidate
                    for candidate in ("data/face_detections.json", "data/face_detections_meta.json")
                    if candidate in member_names
                ),
                None,
            )
            if face_detections_src:
                fds = _read_json_member(tar, face_detections_src)
                inserted, failed = _insert_rows(
                    cur,
                    f"INSERT INTO {schema}.face_detections "
                    f"(file_uuid, face_id, frame_number, x, y, width, height, "
                    f"confidence, identity_id, trace_id) "
                    f"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
                    f"ON CONFLICT DO NOTHING",
                    fds,
                    lambda fd: (
                        uuid,
                        fd.get("face_id"),
                        fd.get("frame_number"),
                        fd.get("x"), fd.get("y"),
                        fd.get("width"), fd.get("height"),
                        fd.get("confidence"),
                        # Remap to the identity ids of the target system.
                        identity_map.get(fd.get("identity_id"), fd.get("identity_id")),
                        fd.get("trace_id"),
                    ),
                    "face_detection",
                    progress=True,
                )
                conn.commit()
                print(f" [IMPORT] face_detections: {inserted} rows{_failed_suffix(failed)} \n")
        finally:
            # Always release the DB resources; closing discards any uncommitted work.
            cur.close()
            conn.close()

    print(f"\n[IMPORT] ✅ 完成: {manifest.get('file_name','?')} 已匯入 (file_uuid={uuid})")
|
||||
|
||||
|
||||
def main():
    """Parse the command line and import the given package into the chosen schema."""
    cli = argparse.ArgumentParser(description="Import file processing history package")
    cli.add_argument("package", help="Path to .tar.gz package")
    cli.add_argument("--schema", default=SCHEMA, help="Target DB schema")
    options = cli.parse_args()

    package_path = options.package
    if os.path.exists(package_path):
        import_package(package_path, options.schema)
        return

    # Missing package: report and exit with a non-zero status.
    print(f"[IMPORT] ❌ Package not found: {package_path}")
    sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
138
scripts/lip_analyzer.py
Normal file
138
scripts/lip_analyzer.py
Normal file
@@ -0,0 +1,138 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Lip Analyzer — from face_test.json (Apple Vision outer_lips 14pts) + ASRX
|
||||
Computes lip_openness per frame, compares with speaker segments.
|
||||
"""
|
||||
|
||||
import json, sys, os
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
def calc_lip_height(face):
    """Return the vertical extent (max y - min y) of a face's lip outline.

    ``face["lips"]`` may be a dict holding the points under ``"outer_lips"`` or
    the list of points itself.  Each point is either a sequence (y at index 1)
    or a mapping with a ``"y"`` key (0 when absent).  Returns None when the
    lips entry has another type or fewer than three points are available.
    """
    lips = face.get("lips", {})
    if isinstance(lips, dict):
        points = lips.get("outer_lips", [])
    elif isinstance(lips, list):
        points = lips
    else:
        return None

    if not points or len(points) < 3:
        return None

    def y_of(point):
        if isinstance(point, (list, tuple)):
            return point[1]
        return point.get("y", 0)

    y_values = list(map(y_of, points))
    return max(y_values) - min(y_values)
|
||||
|
||||
def main():
    """Compare per-frame lip openness with ASRX speaker segments and save a report.

    Command line:
        --face       JSON with per-frame face data: either a dict with a
                     "frames" list or the frame list itself.
        --asrx       JSON with a "segments" list (start_time, end_time,
                     speaker_id, text).
        --output     Path of the JSON report to write.
        --threshold  Minimum relative lip-height change that counts as
                     speaking (default 0.05).
        --fps        Frame rate used to convert segment times to frame numbers
                     (default 25.0).
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--face", required=True)
    parser.add_argument("--asrx", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--threshold", type=float, default=0.05)
    parser.add_argument("--fps", type=float, default=25.0,
                        help="Video frame rate used to map times to frames")
    args = parser.parse_args()

    # Load face data
    with open(args.face) as f:
        face_data = json.load(f)

    # Accept both layouts: {"frames": [...], "frame_count": N} or a bare list.
    # (Calling .get() on a list, as before, raised AttributeError.)
    if isinstance(face_data, dict):
        frames_data = face_data.get("frames", [])
        frame_count = face_data.get("frame_count", "?")
    else:
        frames_data = face_data
        frame_count = "?"
    if not isinstance(frames_data, list):
        frames_data = []

    print(f"\nFace data: {len(frames_data)} frames, {frame_count} total")

    # Extract lip openness per frame, per face
    lip_by_frame = {}
    for fdata in frames_data:
        if not isinstance(fdata, dict):
            continue  # malformed frame entry; nothing to measure
        fn = fdata.get("frame", 0)
        faces = fdata.get("faces", fdata.get("detections", [])) or []
        heights = [h for h in map(calc_lip_height, faces) if h is not None]
        if heights:
            lip_by_frame[fn] = {"heights": heights, "avg": sum(heights) / len(heights), "count": len(heights)}

    print(f"Frames with lip data: {len(lip_by_frame)}")

    # Load ASRX speaker segments
    with open(args.asrx) as f:
        asrx = json.load(f)
    segs = asrx.get("segments", [])
    fps = args.fps
    print(f"ASRX segments: {len(segs)}")

    # Analyze each ASR segment (no time limit)
    results = []
    speakable = 0
    for seg in segs:
        st = seg.get("start_time", 0)
        et = seg.get("end_time", 0)
        speaker = seg.get("speaker_id", "?")
        text = seg.get("text", "") or ""

        # Frame window covered by this segment
        start_frame = int(st * fps)
        end_frame = int(et * fps) + 10  # allow some after

        # Baseline sample around the segment start.
        # NOTE(review): this window is symmetric (±10 frames), so it overlaps
        # the first frames of the speaking window — confirm that is intended.
        baseline_frames = [fn for fn in lip_by_frame if abs(fn - start_frame) <= 10]

        # Sample during the segment (while speaking)
        during_frames = [fn for fn in lip_by_frame if start_frame <= fn <= end_frame]

        baseline_avg = sum(lip_by_frame[fn]["avg"] for fn in baseline_frames) / max(len(baseline_frames), 1)
        during_avg = sum(lip_by_frame[fn]["avg"] for fn in during_frames) / max(len(during_frames), 1)

        # Number of frames in the segment that have any measurable face
        any_face = len(during_frames)

        # NOTE(review): the denominator is clamped to at least 1, so for
        # baselines below 1 this is an absolute difference — confirm units.
        motion = (during_avg - baseline_avg) / max(baseline_avg, 1)
        is_speaking = motion > args.threshold

        results.append({
            "start_time": st, "end_time": et, "speaker": speaker,
            "text": text[:40],
            "baseline_avg": round(baseline_avg, 2),
            "during_avg": round(during_avg, 2),
            "motion_ratio": round(motion, 4),
            "is_speaking": is_speaking,
            "baseline_frames": len(baseline_frames),
            "during_frames": any_face,
        })
        if any_face > 0:
            speakable += 1

    # Summary
    print("\n=== Results ===")
    print(f"ASRX segments analyzed: {len(results)}")
    print(f"With face data: {speakable} ({speakable*100//max(len(results),1)}%)")
    speech_detected = sum(1 for r in results if r["is_speaking"] and r["during_frames"] > 0)
    print(f"Lip motion detected: {speech_detected} ({speech_detected*100//max(speakable,1)}% of face-present)")

    print("\n=== Sample: first 5 segments ===")
    for r in results[:5]:
        icon = "🗣" if r["is_speaking"] else "🤐"
        # str() keeps the column format working for non-string speaker ids.
        print(f" {icon} {r['start_time']:.0f}s {str(r['speaker']):12s} motion={r['motion_ratio']:.3f} baseline={r['baseline_avg']:.1f} during={r['during_avg']:.1f} faces={r['during_frames']}")

    # Save
    output = {
        "fps": fps,
        "total_asrx_segments": len(results),
        "segments_with_faces": speakable,
        "segments_with_lip_motion": speech_detected,
        "lip_by_frame_count": len(lip_by_frame),
        "results": results,
    }
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding.
    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nSaved: {args.output}")


if __name__ == "__main__":
    main()
|
||||
137
scripts/map_speakers_v2.py
Normal file
137
scripts/map_speakers_v2.py
Normal file
@@ -0,0 +1,137 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Build new ASRX speaker_id → character name mapping using:
|
||||
1. Old DB sentence chunk metadata (speaker_name from face-to-TMDb match)
|
||||
2. New ASRX segments (1:1 aligned with ASR, each with speaker_id + voice embedding)
|
||||
"""
|
||||
|
||||
import json, sys, psycopg2
|
||||
from collections import Counter, defaultdict
|
||||
import numpy as np
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
ASRX_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.asrx.json"
QDRANT_URL = "http://localhost:6333"

DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"

# Character name normalization: generic diarization labels carry no identity.
NAME_MAP = {
    "Speaker_0": "Unknown",
    "SPEAKER_0": "Unknown",
    "SPEAKER_1": "Unknown",
    "SPEAKER_2": "Unknown",
    "SPEAKER_3": "Unknown",
    "SPEAKER_4": "Unknown",
    "SPEAKER_5": "Unknown",
    "SPEAKER_6": "Unknown",
    "SPEAKER_7": "Unknown",
    "SPEAKER_8": "Unknown",
    "SPEAKER_9": "Unknown",
}


def _normalize_speaker_name(name):
    """Map missing names and generic ``Speaker_N`` labels to "Unknown"."""
    if not name:
        # NULL / empty speaker_name in the DB must not become a vote for None.
        return "Unknown"
    if name in NAME_MAP:
        return NAME_MAP[name]
    if name.startswith(("Speaker_", "SPEAKER_")):
        return "Unknown"
    return name


print("=== Step 1: Load DB sentence chunks ===")
# NOTE(review): the commit that added this script describes renaming
# dev.chunks to dev.chunk and removing chunk_index — confirm this query still
# matches the live schema.
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()
    cur.execute("""
        SELECT chunk_index, metadata->>'speaker_id' as old_sid,
        metadata->>'speaker_name' as old_name
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'sentence'
        ORDER BY chunk_index
    """, (UUID,))
    rows = cur.fetchall()
    cur.close()
finally:
    conn.close()
print(f"Loaded {len(rows)} sentence chunks from DB")

# Index the DB rows by chunk_index
db_by_idx = {r[0]: {"old_sid": r[1], "old_name": r[2]} for r in rows}

print("=== Step 2: Load new ASRX ===")
with open(ASRX_PATH) as f:
    asrx = json.load(f)
segs = asrx["segments"]
embeddings = asrx.get("embeddings", [])
print(f"Loaded {len(segs)} ASRX segments, {len(embeddings)} embeddings")

# Build mapping: new_speaker_id --> old_name distribution.
# Segment i is matched to the DB chunk with chunk_index i.
new_to_old = defaultdict(list)
old_name_counter = defaultdict(Counter)
unmapped = 0
total = 0

for i, seg in enumerate(segs):
    new_sid = seg["speaker_id"]
    total += 1

    if i in db_by_idx:
        old_name = _normalize_speaker_name(db_by_idx[i].get("old_name"))
        new_to_old[new_sid].append(old_name)
        old_name_counter[new_sid][old_name] += 1
    else:
        # No DB chunk at this index: remember the speaker but cast no vote.
        unmapped += 1
        new_to_old[new_sid].append("Unknown")

print(f"\nMapped {total - unmapped} segments, {unmapped} unmapped")
print(f"\nMapping {len(new_to_old)} new speaker IDs:")

# Determine best character name for each new speaker
speaker_identity = {}
for sid in sorted(new_to_old.keys()):
    counter = old_name_counter[sid]
    total_for_speaker = sum(counter.values())
    if counter:
        best_name, best_count = counter.most_common(1)[0]
        pct = best_count / total_for_speaker * 100
    else:
        # Every segment of this speaker was unmapped: there are no votes, so
        # fall back instead of indexing an empty most_common() result.
        best_name, pct = "Unknown", 0.0

    speaker_identity[sid] = {
        "name": best_name,
        "confidence": round(pct, 1),
        "count": total_for_speaker,
        "distribution": dict(counter.most_common(5))
    }
    print(f" {sid}: {best_name} ({pct:.0f}%, {total_for_speaker} segs)")
    for nm, cnt in counter.most_common(5):
        if nm != best_name:
            print(f" {nm}: {cnt}")

print("\n=== Step 3: Assign names to all new ASRX segments ===")
assignments = [
    {
        "index": i,
        "speaker_id": seg["speaker_id"],
        "speaker_name": speaker_identity[seg["speaker_id"]]["name"],
        "start_time": seg["start_time"],
        "end_time": seg["end_time"],
    }
    for i, seg in enumerate(segs)
]

# Save mapping
output = {
    "uuid": UUID,
    "total_segments": len(segs),
    "speaker_identity": speaker_identity,
    "assignments": assignments,
}
with open(f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map_v2.json", "w") as f:
    json.dump(output, f, indent=2)
print(f"\nSaved speaker mapping to output_dev/{UUID}.speaker_map_v2.json")

print("\n=== Summary ===")
for sid, info in sorted(speaker_identity.items()):
    print(f" {sid} ({info['count']} segs, {info['confidence']}% confidence): {info['name']}")
|
||||
185
scripts/migrate_to_4188.py
Normal file
185
scripts/migrate_to_4188.py
Normal file
@@ -0,0 +1,185 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Full pipeline migration: delete old chunks, create 4188 fine-grained chunks
|
||||
with yolo_objects, face_ids, metadata per (recalculated) frame range.
|
||||
"""
|
||||
import json, sys, time, psycopg2
|
||||
from collections import defaultdict
|
||||
|
||||
import bisect  # stdlib; imported once here instead of inside the per-segment loop

UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
FPS = 25.0
FILE_ID = 242

print("=== Load asrx_fine ===")
with open(f"{BASE}/{UUID}.asrx.json") as fh:
    fine = json.load(fh)
segs = fine["segments"]
print(f"Segments: {len(segs)}")

conn = psycopg2.connect(DB_URL)
cur = conn.cursor()
try:
    # Step 2: Delete old chunks.
    # The delete and all inserts share ONE transaction (single commit below),
    # so a failure part-way through leaves the old chunks untouched.
    print("\n=== Step 2: Delete old chunks ===")
    for ctype in ['sentence', 'story', 'trace']:
        cur.execute(
            "DELETE FROM dev.chunks WHERE file_uuid=%s AND chunk_type=%s",
            (UUID, ctype))
        print(f" Deleted {cur.rowcount} {ctype} chunks")

    # Step 3: Build frame → data lookup for YOLO and faces
    print("\n=== Step 3: Load yolo + face data ===")
    # YOLO: frame → set of object class names (dedup, confidence > 0.5)
    print(" Loading YOLO data...")
    t0 = time.time()
    cur.execute(
        "SELECT start_frame, data FROM dev.pre_chunks "
        "WHERE file_uuid=%s AND processor_type='yolo' "
        "ORDER BY start_frame", (UUID,))
    yolo_by_frame = {}  # frame → set of class names
    row_count = 0
    for fn, data in cur:
        if data and "objects" in data:
            names = {
                obj.get("class_name")
                for obj in data["objects"]
                # Skip low-confidence detections and unnamed classes.
                if obj.get("confidence", 0) > 0.5 and obj.get("class_name")
            }
            if names:
                yolo_by_frame[fn] = names
        row_count += 1
    print(f" YOLO: {row_count} entries, {len(yolo_by_frame)} frames with objects ({time.time()-t0:.1f}s)")

    # Face: frame → set of face_ids
    print(" Loading face data...")
    t0 = time.time()
    cur.execute(
        "SELECT frame_number, face_id FROM dev.face_detections "
        "WHERE file_uuid=%s AND trace_id IS NOT NULL "
        "ORDER BY frame_number", (UUID,))
    face_by_frame = defaultdict(set)  # frame → set of face_ids
    row_count = 0
    for fn, fid in cur:
        if fid:
            face_by_frame[fn].add(fid)
        row_count += 1
    print(f" Faces: {row_count} entries, {len(face_by_frame)} frames ({time.time()-t0:.1f}s)")

    # Step 4: Create new chunks
    # NOTE(review): the commit that added this script describes renaming
    # dev.chunks to dev.chunk and removing old_chunk_id / chunk_index — confirm
    # the INSERT below still matches the live schema.
    print("\n=== Step 4: Create 4188 sentence chunks ===")
    t0 = time.time()
    batch_size = 100
    inserted = 0
    yolo_hit = 0
    face_hit = 0

    yolo_frames_sorted = sorted(yolo_by_frame.keys())
    face_frames_sorted = sorted(face_by_frame.keys())

    for batch_start in range(0, len(segs), batch_size):
        batch = segs[batch_start:batch_start + batch_size]
        values = []
        for si, s in enumerate(batch):
            idx = batch_start + si
            st = s["start_time"]
            et = s["end_time"]
            sf = int(st * FPS)
            ef = int(et * FPS)
            # "or" also covers keys that are present but null in the JSON.
            spk_name = s.get("speaker_name") or "Unknown"
            spk_id = s.get("speaker_id") or "SPEAKER_?"
            raw_text = s.get("text") or ""

            # YOLO objects in [sf, ef] (binary search on the sorted frame list);
            # sorted() keeps the stored metadata deterministic between runs.
            left = bisect.bisect_left(yolo_frames_sorted, sf)
            right = bisect.bisect_right(yolo_frames_sorted, ef)
            yolo_set = set()
            for fn in yolo_frames_sorted[left:right]:
                yolo_set.update(yolo_by_frame[fn])
            yolo_objs = sorted(yolo_set)
            if yolo_objs:
                yolo_hit += 1

            # Face IDs in [sf, ef]
            left = bisect.bisect_left(face_frames_sorted, sf)
            right = bisect.bisect_right(face_frames_sorted, ef)
            face_set = set()
            for fn in face_frames_sorted[left:right]:
                face_set.update(face_by_frame[fn])
            face_ids = sorted(face_set)
            if face_ids:
                face_hit += 1

            chunk_id = f"{UUID}_{idx}"

            values.append((
                UUID,        # file_uuid
                chunk_id,    # old_chunk_id
                idx,         # chunk_index
                "sentence",  # chunk_type
                st,          # start_time
                et,          # end_time
                json.dumps({"data": {"text": raw_text, "text_normalized": raw_text.lower()}, "rule": "rule_1"}),  # content
                json.dumps({  # metadata
                    "speaker_id": spk_id,
                    "speaker_name": spk_name,
                    "yolo_objects": yolo_objs,
                    "face_ids": face_ids,
                    "language": "en",
                }),
                f"[{spk_name}] {raw_text}",  # text_content
                FPS,         # fps
                sf,          # start_frame
                ef,          # end_frame
                ef - sf,     # frame_count
                FILE_ID,     # file_id
                chunk_id,    # chunk_id
                [],          # pre_chunk_ids
                [],          # child_chunk_ids
            ))

        cur.executemany("""
            INSERT INTO dev.chunks
            (file_uuid, old_chunk_id, chunk_index, chunk_type,
            start_time, end_time, content, metadata,
            text_content, fps, start_frame, end_frame, frame_count,
            file_id, chunk_id, pre_chunk_ids, child_chunk_ids)
            VALUES (%s,%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """, values)
        inserted += len(batch)

        if (batch_start // batch_size) % 5 == 0:
            pct = inserted * 100 // len(segs)
            print(f" {inserted}/{len(segs)} ({pct}%) yolo_hit={yolo_hit} face_hit={face_hit} [{time.time()-t0:.0f}s]")

    # Publish the delete + inserts atomically.
    conn.commit()

    print(f"\n Inserted: {inserted} chunks")
    print(f" Chunks with YOLO objects: {yolo_hit}/{inserted}")
    print(f" Chunks with face IDs: {face_hit}/{inserted}")
    print(f" Time: {time.time()-t0:.1f}s")

    # Verify
    cur.execute(
        "SELECT COUNT(*) FROM dev.chunks WHERE file_uuid=%s AND chunk_type='sentence'",
        (UUID,))
    cnt = cur.fetchone()[0]
    print(f"\n DB sentence chunks: {cnt}")

    cur.execute(
        "SELECT metadata->>'speaker_name', COUNT(*) FROM dev.chunks "
        "WHERE file_uuid=%s AND chunk_type='sentence' "
        "GROUP BY 1 ORDER BY 2 DESC", (UUID,))
    print(" Speaker distribution:")
    for r in cur.fetchall():
        print(f" {r[0]}: {r[1]}")
finally:
    # Closing without a commit discards pending work, i.e. rolls back on error.
    cur.close()
    conn.close()
print("\n=== Done ===")
|
||||
324
scripts/object_search_agent.py
Normal file
324
scripts/object_search_agent.py
Normal file
@@ -0,0 +1,324 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Object Search Agent — searches across YOLO, OCR, ASR, and TKG.
|
||||
Usage: python3 scripts/object_search_agent.py --keyword stamp [--uuid <UUID>]
|
||||
"""
|
||||
import json, sys, argparse
|
||||
from collections import defaultdict
|
||||
import psycopg2
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
FPS = 25.0
|
||||
|
||||
# YOLO class aliases for common search terms
|
||||
ALIASES = {
|
||||
"stamp": ["stamp"],
|
||||
"gun": ["knife", "pistol", "rifle", "grenade"],
|
||||
"weapon": ["knife", "pistol", "rifle", "grenade"],
|
||||
"knife": ["knife"],
|
||||
"person": ["person"],
|
||||
"letter": ["book"],
|
||||
"envelope": ["book"],
|
||||
"car": ["car"],
|
||||
"tie": ["tie"],
|
||||
"phone": ["cell phone"],
|
||||
"bottle": ["bottle", "wine glass", "cup"],
|
||||
"chair": ["chair"],
|
||||
"umbrella": ["umbrella"],
|
||||
}
|
||||
|
||||
def search_yolo(cur, keyword, uuid):
    """Search YOLO detections for object classes matching *keyword*.

    *keyword* is expanded through ``ALIASES`` (falling back to the keyword
    itself).  For every class, at most 100 ``pre_chunks`` rows containing that
    class are fetched and a hit is reported when the best confidence for the
    class in the row exceeds 0.3.

    Args:
        cur: Open DB cursor.
        keyword: Search term.
        uuid: ``file_uuid`` of the video to search.

    Returns:
        List of dicts with ``frame``, ``timestamp`` (seconds), ``time_str``
        (``M:SS.FF``), ``class``, ``confidence`` and ``source`` ("yolo").
    """
    classes = ALIASES.get(keyword, [keyword])
    results = []
    for cls in classes:
        cur.execute("""
            SELECT start_frame, end_frame, data
            FROM dev.pre_chunks
            WHERE file_uuid=%s AND processor_type='yolo'
            AND data->'objects' IS NOT NULL
            AND data->'objects' @> jsonb_build_array(
            jsonb_build_object('class_name', %s)
            )
            ORDER BY start_frame
            LIMIT 100
        """, (uuid, cls))
        for sf, _ef, data in cur.fetchall():
            objects = [o for o in data.get("objects", []) if o.get("class_name") == cls]
            top_conf = max((o.get("confidence", 0) for o in objects), default=0)
            if top_conf <= 0.3:
                continue
            frame = int(sf)
            ts = sf / FPS
            # Derive the frame-within-second from the frame number itself:
            # int((ts % 1) * fps) truncates float error (frame 29 at 25 fps
            # came out as .03) and the old code hard-coded 25 instead of FPS.
            whole_seconds = int(frame // FPS)
            frame_in_second = int(frame - whole_seconds * FPS)
            results.append({
                "frame": frame,
                "timestamp": ts,
                "time_str": f"{whole_seconds // 60}:{whole_seconds % 60:02d}.{frame_in_second:02d}",
                "class": cls,
                "confidence": round(top_conf, 3),
                "source": "yolo",
            })
    return results
|
||||
|
||||
def search_ocr(cur, keyword, uuid):
    """Search OCR text for *keyword* (case-insensitive substring match).

    Args:
        cur: Open DB cursor.
        keyword: Text to look for.
        uuid: ``file_uuid`` of the video to search.

    Returns:
        Up to 50 dicts with ``frame``, ``timestamp`` (seconds), ``time_str``
        (``M:SS.FF``), ``text`` (first 100 characters) and ``source`` ("ocr").
    """
    cur.execute("""
        SELECT start_frame, end_frame, data
        FROM dev.pre_chunks
        WHERE file_uuid=%s AND processor_type='ocr'
        AND data->>'text' ILIKE %s
        ORDER BY start_frame
        LIMIT 50
    """, (uuid, f"%{keyword}%"))
    results = []
    for sf, _ef, data in cur.fetchall():
        # FPS is a float, so sf // FPS is a float too; the previous
        # "{...:02d}" format raised ValueError on the first hit.  Work with
        # integers instead.
        whole_seconds = int(sf // FPS)
        frame_in_second = int(sf - whole_seconds * FPS)
        results.append({
            "frame": sf,
            "timestamp": sf / FPS,
            "time_str": f"{whole_seconds // 60}:{whole_seconds % 60:02d}.{frame_in_second:02d}",
            "text": (data.get("text") or "")[:100],
            "source": "ocr",
        })
    return results
|
||||
|
||||
def search_asr(cur, keyword, uuid):
    """Search sentence chunks whose text contains *keyword* (case-insensitive).

    Returns up to 100 dicts ordered by start time, each with ``chunk_index``,
    ``timestamp`` (start time), ``time_str`` (``M:SS.ss``), ``text`` (first 120
    characters, empty when the row has no text) and ``source`` ("asr").
    """
    pattern = f"%{keyword}%"
    cur.execute("""
        SELECT chunk_index, start_time, end_time, text_content
        FROM dev.chunks
        WHERE file_uuid=%s AND chunk_type='sentence'
        AND text_content ILIKE %s
        ORDER BY start_time
        LIMIT 100
    """, (uuid, pattern))
    return [
        {
            "chunk_index": chunk_index,
            "timestamp": start,
            "time_str": f"{int(start//60)}:{start%60:05.2f}",
            "text": (content or "")[:120],
            "source": "asr",
        }
        for chunk_index, start, _end, content in cur.fetchall()
    ]
|
||||
|
||||
GUN_MODEL_PATH = "/Users/accusys/momentry_core_0.1/models/gun/gun_detector/weights/best.pt"
|
||||
GUN_CLASSES = {0: "grenade", 1: "knife", 2: "pistol", 3: "rifle"}
|
||||
|
||||
# Grounding DINO — Zero-shot gun detector (Large: 7 datasets, confirmed best on Charade)
|
||||
GDINO_MODEL_NAME = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
|
||||
GDINO_PROMPTS = ["gun", "pistol", "rifle", "weapon", "firearm"]
|
||||
|
||||
_gdino_processor = None
|
||||
_gdino_model = None
|
||||
_gdino_device = None
|
||||
|
||||
def init_gdino():
    """Load the Grounding DINO processor and model once and cache them in module globals.

    Later calls are no-ops.  The model is moved to "mps" when that torch
    backend is available, otherwise to "cpu".
    """
    global _gdino_processor, _gdino_model, _gdino_device
    if _gdino_model is None:
        # Heavy dependencies are imported lazily, only on first use.
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
        import torch

        _gdino_processor = AutoProcessor.from_pretrained(GDINO_MODEL_NAME)
        _gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_MODEL_NAME)
        if torch.backends.mps.is_available():
            _gdino_device = "mps"
        else:
            _gdino_device = "cpu"
        _gdino_model.to(_gdino_device)
|
||||
|
||||
def search_zero_shot(video_path, keyword, threshold=0.05, frame_step=1500):
    """Search a video for objects using Grounding DINO zero-shot detection.

    Frames are sampled every *frame_step* frames and each prompt is run on
    every sampled frame.  Scanning stops after the first sampled frame that
    brings the total to 50 or more detections.

    Args:
        video_path: Path of the video file to scan.
        keyword: Search term; gun-related keywords expand to ``GDINO_PROMPTS``,
            anything else is used as the single prompt.
        threshold: Minimum detection score passed to post-processing.
        frame_step: Distance between sampled frames (default 1500, about 60 s
            at 25 fps).

    Returns:
        List of dicts with ``frame``, ``timestamp`` (seconds), ``time_str``
        (``M:SS``), ``class`` (the prompt), ``confidence`` and ``source``
        ("grounding-dino").

    Raises:
        ValueError: If *frame_step* is smaller than 1.
    """
    import cv2
    from PIL import Image
    import torch

    if frame_step < 1:
        raise ValueError("frame_step must be >= 1")

    # Determine prompts based on keyword
    # NOTE(review): Grounding DINO prompts are commonly written lower-case with
    # a trailing period; the prompts are passed through unchanged here —
    # confirm against the model documentation.
    if keyword in ("gun", "weapon", "pistol", "rifle", "firearm"):
        prompts = GDINO_PROMPTS
    else:
        prompts = [keyword]

    init_gdino()

    results = []
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        for frame_num in range(0, total_frames, frame_step):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                break

            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # Per-frame values, shared by every prompt.
            target = torch.tensor([img.size[::-1]])  # (height, width)
            ts = frame_num / fps
            time_str = f"{int(ts//60)}:{int(ts%60):02d}"

            for prompt in prompts:
                inputs = _gdino_processor(images=img, text=prompt, return_tensors="pt").to(_gdino_device)
                with torch.no_grad():
                    outputs = _gdino_model(**inputs)
                dets = _gdino_processor.post_process_grounded_object_detection(
                    outputs, threshold=threshold, target_sizes=target)[0]

                for score in dets["scores"]:
                    results.append({
                        "frame": frame_num,
                        "timestamp": ts,
                        "time_str": time_str,
                        "class": prompt,
                        "confidence": round(score.item(), 3),
                        "source": "grounding-dino",
                    })

            if len(results) >= 50:
                break
    finally:
        # Release the capture even when decoding or inference raises.
        cap.release()
    return results
|
||||
|
||||
def search_gun_detector(video_path, keyword, frame_step=150, confidence=0.25):
    """Run the custom gun detector model on sampled frames of a video.

    Args:
        video_path: Path of the video file to scan.
        keyword: Search keyword; mapped to class names through ``ALIASES``.
        frame_step: Distance in frames between sampled frames.
        confidence: Minimum confidence passed to the YOLO model.

    Returns:
        ``[]`` when the keyword maps to no detector class, a single
        ``{"error": ...}`` entry when ``ultralytics`` or ``cv2`` cannot be
        imported, otherwise a list of detection dicts. Sampling stops after
        the first frame that brings the total to 50 or more results.
    """
    classes = ALIASES.get(keyword, [])
    # A set keeps the per-detection membership test constant-time.
    target_ids = {cid for cid, cname in GUN_CLASSES.items() if cname in classes}
    if not target_ids:
        return []

    try:
        from ultralytics import YOLO
        import cv2
    except ImportError:
        return [{"error": "ultralytics or cv2 not available"}]

    model = YOLO(GUN_MODEL_PATH)
    cap = cv2.VideoCapture(video_path)
    results = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        for frame_num in range(0, total_frames, frame_step):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                break

            dets = model(frame, conf=confidence, verbose=False)[0]
            ts = frame_num / fps
            for det in dets.boxes.data:
                # The code reads index 4 as the confidence and index 5 as the class id.
                cls_id = int(det[5])
                if cls_id not in target_ids:
                    continue
                results.append({
                    "frame": frame_num,
                    "timestamp": ts,
                    # minutes:seconds.frame-within-second
                    "time_str": f"{int(ts//60)}:{int(ts%60):02d}.{int((ts%1)*fps):02d}",
                    "class": GUN_CLASSES[cls_id],
                    "confidence": round(float(det[4]), 3),
                    "source": "gun_detector",
                })

            if len(results) >= 50:
                break
    finally:
        # Release the capture even when inference raises.
        cap.release()

    return results
|
||||
|
||||
def search_tkg(cur, keyword, uuid):
    """Look up TKG nodes of one file whose label or external id contains *keyword*.

    Args:
        cur: Open DB-API cursor.
        keyword: Substring matched case-insensitively (``ILIKE``).
        uuid: File UUID the nodes must belong to.

    Returns:
        Up to 20 dicts with the keys ``type``, ``id``, ``label``,
        ``properties`` and ``source`` (always ``"tkg"``).
    """
    pattern = f"%{keyword}%"
    cur.execute("""
        SELECT node_type, external_id, label, properties
        FROM dev.tkg_nodes
        WHERE file_uuid=%s
        AND (label ILIKE %s OR external_id ILIKE %s)
        LIMIT 20
    """, (uuid, pattern, pattern))
    return [
        {
            "type": node_type,
            "id": ext_id,
            "label": label,
            "properties": props,
            "source": "tkg",
        }
        for node_type, ext_id, label, props in cur.fetchall()
    ]
|
||||
|
||||
def find_video(uuid):
    """Locate the Charade video under the demo data directory.

    Files whose name starts with ``Charade`` are tried first; if none is a
    video, files containing the first eight characters of *uuid* are tried.
    Returns the first path ending in ``.mp4``, ``.mov`` or ``.avi``, or
    ``None`` when nothing matches.
    """
    import glob

    base = "/Users/accusys/momentry/var/sftpgo/data/demo"
    video_exts = (".mp4", ".mov", ".avi")

    def first_video(pattern):
        return next(
            (path for path in glob.glob(pattern, recursive=True)
             if path.endswith(video_exts)),
            None,
        )

    # Find Charade by name, then fall back to the uuid pattern. ``or`` keeps
    # the fallback lazy, so ``uuid`` is only sliced when the name search fails.
    return first_video(f"{base}/**/Charade*") or first_video(f"{base}/**/*{uuid[:8]}*")
|
||||
|
||||
def main():
    """Command-line entry point: search one file for an object keyword.

    Queries the selected sources (YOLO, OCR, ASR and TKG by default), runs
    the Grounding DINO zero-shot search when requested or when the keyword
    is firearm-related, prints a human-readable summary and finally dumps
    all results as JSON.
    """
    parser = argparse.ArgumentParser(description="Movie Object Search Agent")
    parser.add_argument("--keyword", required=True, help="Object to search for")
    parser.add_argument("--uuid", default=UUID)
    # NOTE(review): the help text lists "gun_custom", which this function never
    # handles, and omits "zero_shot", which it does - confirm the intended names.
    parser.add_argument("--sources", default="all", help="yolo,ocr,asr,tkg,gun_custom,all")
    parser.add_argument("--video", help="Path to video file (for gun detector)")
    args = parser.parse_args()

    kw = args.keyword.lower()
    # Tolerate "yolo, ocr" and mixed case instead of silently matching nothing.
    requested = [s.strip().lower() for s in args.sources.split(",") if s.strip()]
    src = ["yolo", "ocr", "asr", "tkg"] if requested == ["all"] else requested

    results = {}
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()

        if "yolo" in src:
            r = search_yolo(cur, kw, args.uuid)
            results["yolo"] = {"count": len(r), "results": r[:30]}

        if "ocr" in src:
            r = search_ocr(cur, kw, args.uuid)
            results["ocr"] = {"count": len(r), "results": r[:20]}

        if "asr" in src:
            r = search_asr(cur, kw, args.uuid)
            results["asr"] = {"count": len(r), "results": r[:20]}

        if "tkg" in src:
            r = search_tkg(cur, kw, args.uuid)
            results["tkg"] = {"count": len(r), "results": r[:10]}

        if "zero_shot" in src or kw in ("gun", "weapon", "pistol", "rifle", "firearm"):
            video_path = args.video or find_video(args.uuid)
            if video_path:
                print("  Running Grounding DINO zero-shot search...")
                r = search_zero_shot(video_path, kw)
                results["zero_shot"] = {"count": len(r), "results": r[:20]}
            else:
                results["zero_shot"] = {"count": 0, "results": [], "error": "Video not found"}
    finally:
        # Close the connection even when one of the searches raises.
        conn.close()

    # Print summary
    print(f"\n=== Object Search: \"{args.keyword}\" ===\n")
    for src_name, data in results.items():
        print(f"[{src_name.upper()}] {data['count']} matches" + (" — top results:" if data['results'] else ""))
        for i, r in enumerate(data['results'][:5]):
            if src_name in ("yolo", "zero_shot"):
                print(f"  {i+1}. {r['time_str']} frame={r['frame']} \"{r['class']}\" conf={r['confidence']}")
            elif src_name == "ocr":
                print(f"  {i+1}. {r['time_str']} frame={r['frame']} \"{r['text'][:60]}\"")
            elif src_name == "asr":
                print(f"  {i+1}. {r['time_str']} \"{r['text'][:60]}\"")
            elif src_name == "tkg":
                # "properties" is present but may be None (NULL column), so a
                # .get() default is not enough to avoid an AttributeError.
                props = r.get('properties')
                total = props.get('total_detections', '?') if isinstance(props, dict) else '?'
                print(f"  {i+1}. {r['type']}: {r['label']} ({total} detections)")
        print()

    # Output as JSON for machine parsing
    print(json.dumps({"keyword": args.keyword, "sources": results}, indent=2))


if __name__ == "__main__":
    main()
|
||||
121
scripts/paligemma_vs_gdino.py
Normal file
121
scripts/paligemma_vs_gdino.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Full comparison: Grounding DINO Base vs PaliGemma 3B mix-224
|
||||
Tests on 8 known timepoints with gun/stamp prompts.
|
||||
"""
|
||||
import json, os, sys, time, cv2, torch, re
|
||||
from PIL import Image
|
||||
|
||||
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
|
||||
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/paligemma_vs_gdino"
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
# Sample positions in seconds, each paired with the label "<seconds>s".
TIMEPOINTS = [
    (sec, f"{sec}s")
    for sec in (2646, 3188, 3697, 5341, 5461, 6309, 6377, 6479)
]
PROMPTS = ["gun", "pistol", "stamp", "envelope", "passport"]

if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Device: {device}")

# Load all frames
cap = cv2.VideoCapture(VIDEO)
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
frames = {}
for t_sec, label in TIMEPOINTS:
    # Seek by frame index derived from the timestamp.
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    ok, image = cap.read()
    if ok:
        frames[label] = image
cap.release()
print(f"Loaded {len(frames)} frames")

# Per-model results, filled in by the sections below.
all_results = {}
|
||||
|
||||
# ===== Grounding DINO Base =====
print("\n" + "="*60)
print("Grounding DINO Base")
print("="*60)
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

t0 = time.time()
gd_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
gd_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base").to(device)

# Maps "<label>_<prompt>" to the list of rounded detection scores.
gd_dets = {}
for label, frame in frames.items():
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for pname in PROMPTS:
        inputs = gd_proc(images=img, text=f"{pname}.", return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = gd_model(**inputs)
        # PIL gives (width, height); target sizes are (height, width).
        target = torch.tensor([img.size[::-1]])
        dets = gd_proc.post_process_grounded_object_detection(outputs, threshold=0.1, target_sizes=target)[0]
        if len(dets["boxes"]) > 0:
            scores = [round(s.item(), 3) for s in dets["scores"]]
        else:
            scores = []
        gd_dets[f"{label}_{pname}"] = scores

gd_elapsed = round(time.time()-t0, 1)
all_results["grounding-dino-base"] = {"elapsed": gd_elapsed, "detections": gd_dets}
print(f"  Done: {all_results['grounding-dino-base']['elapsed']}s")

# Free the model before the next one is loaded.
del gd_model
torch.mps.empty_cache()
|
||||
|
||||
# ===== PaliGemma 3B mix-224 =====
print("\n" + "="*60)
print("PaliGemma 3B mix-224")
print("="*60)
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

t0 = time.time()
pg_proc = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
pg_model = PaliGemmaForConditionalGeneration.from_pretrained(
    "google/paligemma-3b-mix-224", dtype=torch.bfloat16
).to(device)
print(f"  Model loaded: {sum(p.numel() for p in pg_model.parameters())/1e6:.0f}M params")

# Maps "<label>_<prompt>" to a list with one 1.0 per detection.
pg_dets = {}
for label, frame in frames.items():
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for pname in PROMPTS:
        t_infer = time.time()
        prompt = f"detect {pname}"
        inputs = pg_proc(text=prompt, images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = pg_model.generate(**inputs, max_new_tokens=100)
        result = pg_proc.decode(outputs[0], skip_special_tokens=True)
        infer_time = time.time() - t_infer

        # Parse bboxes from output: four <loc...> tokens are counted as one box.
        locs = re.findall(r'<loc(\d+)>', result)
        n_dets = len(locs) // 4
        lowered = result.lower()
        has_detection = n_dets > 0 or (pname in lowered and 'detect' not in lowered)

        # No score is parsed from the output, so each detection is recorded as 1.0.
        scores = [1.0] * max(n_dets, 1) if has_detection else []
        pg_dets[f"{label}_{pname}"] = scores

        if has_detection:
            print(f"  {label} prompt={pname:10s}: {n_dets} det ({infer_time:.1f}s) result={result[:80]}")

pg_elapsed = round(time.time()-t0, 1)
all_results["paligemma-3b-mix-224"] = {"elapsed": pg_elapsed, "detections": pg_dets}

del pg_model
torch.mps.empty_cache()
|
||||
|
||||
# ===== Summary =====
# Prints one row per model with the number of timepoints that produced at
# least one detection for the gun, pistol and stamp prompts, then saves the
# raw results as JSON.
print("\n" + "="*70)
print(f"{'Model':<28} {'Time':>8} {'Params':>8} {'Gun hits':>12} {'Pistol hits':>14} {'Stamp h':>10}")
print("-"*80)
n_points = len(TIMEPOINTS)
for model_name in ["grounding-dino-base", "paligemma-3b-mix-224"]:
    d = all_results[model_name]
    dets = d["detections"]
    # TIMEPOINTS holds (seconds, label) pairs; only the label is part of the key.
    summary = {
        pname: sum(1 for _, label in TIMEPOINTS if dets.get(f"{label}_{pname}"))
        for pname in PROMPTS
    }

    params = "232M" if "grounding" in model_name else "2923M"
    gun_h = summary.get("gun", 0)
    pistol_h = summary.get("pistol", 0)
    stamp_h = summary.get("stamp", 0)
    print(f"{model_name:<28} {d['elapsed']:>7.1f}s {params:>8} {gun_h:>6d}/{n_points} {pistol_h:>6d}/{n_points} {stamp_h:>6d}/{n_points}")

# Use a context manager so the file is flushed and closed deterministically.
with open(os.path.join(OUTPUT_DIR, "comparison.json"), "w") as fh:
    json.dump(all_results, fh, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")
|
||||
@@ -108,7 +108,7 @@ def check_job(uuid: str) -> dict:
|
||||
stages = []
|
||||
t0 = time.time()
|
||||
|
||||
# 1. ASR
|
||||
# 1. ASR (pass 1: faster-whisper small)
|
||||
t = time.time()
|
||||
f = OUTPUT_DIR / f"{uuid}.asr.json"
|
||||
ok = f.exists() and f.stat().st_size > 0
|
||||
@@ -118,10 +118,10 @@ def check_job(uuid: str) -> dict:
|
||||
with open(f) as fh: d = json.load(fh)
|
||||
segs = len(d.get("segments", []))
|
||||
except: ok = False
|
||||
stages.append({"name": "ASR", "passed": ok and segs > 0, "detail": f"{segs} seg" if ok else file_size(str(f)),
|
||||
stages.append({"name": "ASR", "passed": ok and segs > 0, "detail": f"faster-whisper ({segs})" if ok else file_size(str(f)),
|
||||
"elapsed": round(time.time() - t, 1)})
|
||||
|
||||
# 2. ASRX
|
||||
# 2. ASRX (ECAPA-TDNN speaker diarization)
|
||||
t = time.time()
|
||||
f = OUTPUT_DIR / f"{uuid}.asrx.json"
|
||||
ok = f.exists() and f.stat().st_size > 0
|
||||
@@ -131,15 +131,28 @@ def check_job(uuid: str) -> dict:
|
||||
with open(f) as fh: d = json.load(fh)
|
||||
segs = len(d.get("segments", []))
|
||||
except: ok = False
|
||||
stages.append({"name": "ASRX", "passed": ok and segs > 0, "detail": f"{segs} seg" if ok else file_size(str(f)),
|
||||
stages.append({"name": "ASRX", "passed": ok and segs > 0, "detail": f"ECAPA-TDNN ({segs})" if ok else file_size(str(f)),
|
||||
"elapsed": round(time.time() - t, 1)})
|
||||
|
||||
# 3. Sentence Chunks
|
||||
# 3. ASR2 (pass 2: correct split segments)
|
||||
t = time.time()
|
||||
cnt = int(psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{uuid}' AND chunk_type='sentence'"))
|
||||
stages.append({"name": "Sentence", "passed": cnt > 0, "detail": f"{cnt} chunks", "elapsed": round(time.time() - t, 1)})
|
||||
f2 = OUTPUT_DIR / f"{uuid}.asr-1.json"
|
||||
ok2 = f2.exists() and f2.stat().st_size > 0
|
||||
cnt2 = 0
|
||||
if ok2:
|
||||
try:
|
||||
with open(f2) as fh: d2 = json.load(fh)
|
||||
cnt2 = len(d2.get("kept", [])) + sum(len(c["corrected"]) for c in d2.get("corrections", []))
|
||||
except: ok2 = False
|
||||
stages.append({"name": "ASR2", "passed": ok2 and cnt2 > 0, "detail": f"{cnt2} chunks (asr-1.json)" if ok2 else file_size(str(f2)),
|
||||
"elapsed": round(time.time() - t, 1)})
|
||||
|
||||
# 4. Vectorization
|
||||
# 4. Sentence Chunks (DB)
|
||||
t = time.time()
|
||||
cnt = int(psql(f"SELECT count(*) FROM dev.chunk WHERE file_uuid='{uuid}' AND chunk_type='sentence'"))
|
||||
stages.append({"name": "Sentence", "passed": cnt > 0, "detail": f"{cnt} DB", "elapsed": round(time.time() - t, 1)})
|
||||
|
||||
# 5. Vectorization
|
||||
t = time.time()
|
||||
vec = int(psql(f"SELECT count(*) FROM dev.chunk_vectors WHERE uuid='{uuid}'"))
|
||||
qdrant_ok = False
|
||||
@@ -161,7 +174,7 @@ def check_job(uuid: str) -> dict:
|
||||
"detail": f"{vec} PG, Qdrant={'ok' if qdrant_ok else '?'}",
|
||||
"elapsed": round(time.time() - t, 1)})
|
||||
|
||||
# 5. Face Trace
|
||||
# 6. Face Trace
|
||||
t = time.time()
|
||||
traces = int(psql(f"SELECT count(DISTINCT trace_id) FROM dev.face_detections WHERE file_uuid='{uuid}' AND trace_id IS NOT NULL"))
|
||||
faces = int(psql(f"SELECT count(*) FROM dev.face_detections WHERE file_uuid='{uuid}' AND trace_id IS NOT NULL"))
|
||||
@@ -169,7 +182,7 @@ def check_job(uuid: str) -> dict:
|
||||
"detail": f"{traces} traces, {faces} faces",
|
||||
"elapsed": round(time.time() - t, 1)})
|
||||
|
||||
# 6. TKG
|
||||
# 7. TKG
|
||||
t = time.time()
|
||||
nodes = int(psql(f"SELECT count(*) FROM dev.tkg_nodes WHERE file_uuid='{uuid}'"))
|
||||
edges = int(psql(f"SELECT count(*) FROM dev.tkg_edges WHERE file_uuid='{uuid}'"))
|
||||
@@ -177,16 +190,16 @@ def check_job(uuid: str) -> dict:
|
||||
"detail": f"{nodes} nodes, {edges} edges",
|
||||
"elapsed": round(time.time() - t, 1)})
|
||||
|
||||
# 7. Trace Chunks
|
||||
# 8. Trace Chunks
|
||||
t = time.time()
|
||||
tc = int(psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{uuid}' AND chunk_type='trace'"))
|
||||
tc = int(psql(f"SELECT count(*) FROM dev.chunk WHERE file_uuid='{uuid}' AND chunk_type='trace'"))
|
||||
stages.append({"name": "TraceChunks", "passed": tc > 0, "detail": f"{tc} chunks",
|
||||
"elapsed": round(time.time() - t, 1)})
|
||||
|
||||
# 8. Phase 1 Release
|
||||
# 9. Phase 1 Release
|
||||
t = time.time()
|
||||
p1 = PROJECT / "release" / "phase1" / "latest"
|
||||
p1_files = [p1 / "RELEASE_INFO.txt", p1 / "chunks.csv", p1 / "vectors.csv"]
|
||||
p1_files = [p1 / "RELEASE_INFO.txt", p1 / "schema.sql", p1 / "snapshots"]
|
||||
p1_ok = all(f.exists() for f in p1_files)
|
||||
p1_size = sum(f.stat().st_size for f in p1.rglob("*") if f.is_file()) // 1024 if p1.exists() else 0
|
||||
stages.append({"name": "Phase1", "passed": p1_ok,
|
||||
|
||||
204
scripts/rebuild_parents.py
Normal file
204
scripts/rebuild_parents.py
Normal file
@@ -0,0 +1,204 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Rebuild parent/story chunks (280 × 15 children) + LLM summaries + Qdrant momentry_dev_stories.
|
||||
"""
|
||||
import json, sys, time, psycopg2
|
||||
from collections import Counter
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
||||
FPS = 25.0
|
||||
FILE_ID = 242
|
||||
CHILDREN_PER_PARENT = 15
|
||||
|
||||
print("=== Step 1: Load sentence chunks sorted by time ===")
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()
cur.execute("""
    SELECT chunk_index, chunk_id, start_time, end_time, text_content,
           metadata->>'speaker_name', file_uuid
    FROM dev.chunks
    WHERE file_uuid=%s AND chunk_type='sentence'
    ORDER BY start_time, chunk_index
""", (UUID,))
children = cur.fetchall()
print(f"Loaded {len(children)} sentence chunks")

# Group into parents of CHILDREN_PER_PARENT consecutive sentences.
# Row layout: (chunk_index, chunk_id, start_time, end_time, text, speaker, file_uuid)
parents = []
for i in range(0, len(children), CHILDREN_PER_PARENT):
    group = children[i:i+CHILDREN_PER_PARENT]

    # Speaker breakdown. A missing speaker_name comes back as None, which
    # json.dumps would serialise as the key "null"; count it as "Unknown".
    spk_names = Counter(c[5] or "Unknown" for c in group)
    primary = spk_names.most_common(1)[0][0] if spk_names else "Unknown"

    parents.append({
        "start": group[0][2], "end": group[-1][3],
        "child_ids": [c[1] for c in group],
        "child_indices": [c[0] for c in group],
        "speakers": dict(spk_names.most_common()),
        "primary": primary,
        "texts": [c[4] for c in group],
    })

print(f"Parent chunks: {len(parents)}")
print(f"Speakers per parent: {[len(p['speakers']) for p in parents[:5]]}")
|
||||
|
||||
# Delete old story chunks. The delete is committed together with the insert
# below, so a failure in between leaves the old rows in place.
cur.execute("DELETE FROM dev.chunks WHERE file_uuid=%s AND chunk_type='story'", (UUID,))
print(f"Deleted old story chunks: {cur.rowcount}")

# Insert parent chunks
print("\n=== Step 2: Insert parent chunks ===")
parent_records = []
for pi, p in enumerate(parents):
    pid = f"{UUID}_story_{pi}"
    dialogue = " ".join([t or "" for t in p["texts"]])
    sf = int(p["start"] * FPS)
    ef = int(p["end"] * FPS)
    fc = ef - sf

    metadata = json.dumps({
        "method": "fixed_15",
        "seg_count": len(p["child_ids"]),
        "speakers": p["speakers"],
        "speaker_count": len(p["speakers"]),
        "primary_speaker": p["primary"],
        "words": len(dialogue.split()),
    })

    # Tuple order must match the column list of the INSERT below.
    parent_records.append((
        UUID, pid, pi, "story", p["start"], p["end"],
        json.dumps({"type": "story_parent"}),
        dialogue, FPS, sf, ef, fc, FILE_ID, pid,
        metadata, p["child_ids"], [], None,
    ))

cur.executemany("""
    INSERT INTO dev.chunks
        (file_uuid, chunk_id, chunk_index, chunk_type,
         start_time, end_time, content, text_content,
         fps, start_frame, end_frame, frame_count,
         file_id, old_chunk_id, metadata, child_chunk_ids, pre_chunk_ids, summary_text)
    VALUES (%s,%s,%s,%s,%s,%s,%s::jsonb,%s,%s,%s,%s,%s,%s,%s,%s::jsonb,%s,%s,%s)
""", parent_records)
conn.commit()
print(f"Inserted {len(parent_records)} parent chunks")

# Update sentence chunks with parent_chunk_id: one statement per parent
# (psycopg2 adapts the Python list to a PostgreSQL array) instead of one
# round-trip per child.
for pi, p in enumerate(parents):
    pid = f"{UUID}_story_{pi}"
    cur.execute("UPDATE dev.chunks SET parent_chunk_id=%s WHERE chunk_id = ANY(%s)",
                (pid, p["child_ids"]))
conn.commit()
print("Updated child parent references")
|
||||
|
||||
print("\n=== Step 3: Generate LLM summaries ===")
|
||||
def call_llm(prompt):
    """Send *prompt* to the chat-completions endpoint and return the reply text.

    Posts a single user message to ``LLM_URL`` and returns the stripped
    content of the first choice. Network and parsing errors propagate to
    the caller.
    """
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # The context manager closes the HTTP response once it has been read.
    with urlopen(req, timeout=120) as resp:
        payload = json.loads(resp.read())
    return payload["choices"][0]["message"]["content"].strip()
|
||||
|
||||
def call_embed(text):
    """Return the embedding vector the service at ``EMBED_URL`` yields for *text*.

    Network and parsing errors propagate to the caller.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # The context manager closes the HTTP response once it has been read.
    with urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["data"][0]["embedding"]
|
||||
|
||||
# Summarise every parent chunk with the LLM, embed the summary, and store
# the summary text on the story row. Errors are reported per parent and
# replaced by a placeholder summary with an all-zero embedding.
t0 = time.time()
summaries = []
for pi, p in enumerate(parents):
    story_id = f"{UUID}_story_{pi}"
    dialogue = " ".join([t or "" for t in p["texts"]])

    if len(dialogue) >= 10:
        try:
            prompt = f"Act as a film scene analyst. Summarize this dialogue in 50 words:\n{dialogue[:3000]}"
            summary = call_llm(prompt)
            time.sleep(0.2)
            embedding = call_embed(summary)
        except Exception as e:
            print(f"  P{pi} ERROR: {e}")
            summary = "[error]"
            embedding = [0.0] * 768
    else:
        # Too little text to be worth a model call.
        summary = "[no dialogue]"
        embedding = [0.0] * 768

    cur.execute("UPDATE dev.chunks SET summary_text=%s, updated_at=NOW() WHERE chunk_id=%s",
                (summary, story_id))

    summaries.append({
        "index": pi,
        "chunk_id": story_id,
        "summary": summary,
        "start": p["start"],
        "end": p["end"],
        "embedding": embedding,
    })

    done = pi + 1
    if done % 20 == 0:
        print(f"  [{done}/{len(parents)}] [{time.time()-t0:.0f}s]")

conn.commit()
print(f"Summaries: {len(summaries)}")
|
||||
|
||||
print("\n=== Step 4: Update Qdrant momentry_dev_stories ===")
# Delete old collection. Best effort: HTTP and connection errors (URLError
# and HTTPError are OSError subclasses) are ignored, e.g. when the collection
# does not exist yet.
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories", method="DELETE")
try:
    urlopen(req)
    time.sleep(0.5)
except OSError:
    pass

# Create
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories",
              data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
              headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req)
time.sleep(0.5)

# Build dialogue + summary points. Dialogue ids are 1..N, summary ids are
# N+1..2N, where N is the number of parents.
points = []
for pi, p in enumerate(parents):
    # Dialogue point (zero vector). Texts may be None, so substitute "" as
    # the earlier steps do; joining None would raise TypeError.
    points.append({
        "id": pi + 1,
        "vector": [0.0] * 768,
        "payload": {"chunk_id": f"{UUID}_story_{pi}", "file_uuid": UUID,
                    "start_time": p["start"], "end_time": p["end"],
                    "type": "story_dialogue",
                    "text": " ".join(t or "" for t in p["texts"])[:500]},
    })
    # Summary point
    s = summaries[pi]
    points.append({
        "id": pi + 1 + len(parents),
        "vector": s["embedding"],
        "payload": {"chunk_id": s["chunk_id"], "file_uuid": UUID,
                    "start_time": s["start"], "end_time": s["end"],
                    "type": "story_summary", "summary": s["summary"]},
    })

batch_size = 100
uploaded = 0
for start in range(0, len(points), batch_size):
    batch = points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req)
        uploaded += len(batch)
    except Exception as e:
        print(f"  batch {start}: {e}")

# Report what was actually sent, not what was planned.
print(f"Uploaded {uploaded} points")

# Verify
resp = json.loads(urlopen(f"{QDRANT_URL}/collections/momentry_dev_stories").read())
info = resp["result"]
print(f"Verifed: {info['points_count']} pts, {info['config']['params']['vectors'].get('size','?')}D")

conn.close()
print(f"\n=== Done [{time.time()-t0:.0f}s] ===")
|
||||
320
scripts/rebuild_story_content.py
Normal file
320
scripts/rebuild_story_content.py
Normal file
@@ -0,0 +1,320 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Rebuild story chunk text_content and regenerates summaries using new ASRX speaker assignments.
|
||||
Then updates Qdrant momentry_dev_stories and sentence_story/sentence_summary collections.
|
||||
"""
|
||||
|
||||
import json, sys, time, urllib.request
|
||||
from urllib.request import Request, urlopen
|
||||
import psycopg2
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
LLM_URL = "http://localhost:8082/v1/chat/completions"
|
||||
EMBED_URL = "http://localhost:11436/v1/embeddings"
|
||||
|
||||
def call_llm(dialogue_text):
    """Ask the LLM at ``LLM_URL`` for a 50-word summary of *dialogue_text*.

    Returns the stripped reply text, or ``""`` after printing the error
    when the request or the response parsing fails.
    """
    prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response once it has been read.
        with urlopen(req, timeout=120) as resp:
            payload = json.loads(resp.read())
        return payload["choices"][0]["message"]["content"].strip()
    except Exception as e:
        print(f"  LLM error: {e}")
        return ""
|
||||
|
||||
def call_embed(text):
    """Return the embedding the service at ``EMBED_URL`` yields for *text*.

    Returns a 768-element zero vector after printing the error when the
    request or the response parsing fails.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response once it has been read.
        with urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
        return payload["data"][0]["embedding"]
    except Exception as e:
        print(f"  Embed error: {e}")
        return [0.0] * 768
|
||||
|
||||
print("=== Step 1: Load sentence chunks with new speaker info ===")
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()

cur.execute("""
    SELECT chunk_index, text_content, metadata->>'new_speaker_name',
           metadata->>'speaker_name', content
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'sentence'
    ORDER BY chunk_index
""", (UUID,))
sentence_rows = cur.fetchall()
print(f"Loaded {len(sentence_rows)} sentence chunks")

# Build lookup keyed by chunk_index. The new speaker name wins over the old
# one; both fall back to "Unknown".
sentences = {
    idx: {
        "old_text": old_text or "",
        "new_name": new_name or old_name or "Unknown",
        "old_name": old_name or "Unknown",
        "content": content or {},
    }
    for idx, old_text, new_name, old_name, content in sentence_rows
}
|
||||
|
||||
# Rebuild sentence text_content with new speaker names
print("\n=== Step 2: Rebuild sentence text_content ===")


def _spoken_text(content, labelled_text):
    """Return the utterance, preferring content["data"]["text"].

    Falls back to *labelled_text* with everything up to the first "]"
    (the old "[Speaker]" prefix) removed.
    """
    spoken = ""
    if content and isinstance(content, dict):
        spoken = content.get("data", {}).get("text", "")
    if spoken or not labelled_text:
        return spoken
    # Parse old format: [Speaker] text
    import re
    found = re.search(r'\]\s*(.*)', labelled_text)
    return found.group(1) if found else labelled_text


updated_sentences = 0
for idx, old_text, new_name, old_name, content in sentence_rows:
    speaker = new_name or old_name or "Unknown"
    new_text = f"[{speaker}] {_spoken_text(content, old_text)}"

    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
    """, (new_text, UUID, idx))
    updated_sentences += 1

conn.commit()
print(f"Updated {updated_sentences} sentence chunks text_content")
|
||||
|
||||
print("\n=== Step 3: Rebuild story chunk text_content ===")
cur.execute("""
    SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time,
           text_content, summary_text
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'story'
    ORDER BY chunk_index
""", (UUID,))
story_rows = cur.fetchall()
print(f"Loaded {len(story_rows)} story chunks")

# Build child text per story chunk: "(Speaker) text" for every child
# sentence that is known and has text, joined by single spaces.
story_dialogue_texts = []
for db_id, cid, idx, child_ids, st, et, old_text, old_summary in story_rows:
    dialogue_parts = []
    for child_cid in (child_ids or []):
        # The sentence index is the part of the child id after the last "_".
        child_idx = int(child_cid.rsplit("_", 1)[-1])
        if child_idx not in sentences:
            continue
        s = sentences[child_idx]

        raw = ""
        if s["content"] and isinstance(s["content"], dict):
            raw = s["content"].get("data", {}).get("text", "")
        if not raw:
            # Fall back to the stored text without its "[Speaker]" prefix.
            import re
            m = re.search(r'\]\s*(.*)', s["old_text"])
            raw = m.group(1) if m else s["old_text"]
        if raw:
            dialogue_parts.append(f'({s["new_name"]}) {raw}')

    story_dialogue_texts.append(
        (db_id, cid, idx, st, et, " ".join(dialogue_parts), old_summary)
    )

print(f"Built {len(story_dialogue_texts)} story dialogue texts")

# Update DB with new text_content (dialogue only, not summary yet)
for db_id, _cid, _idx, _st, _et, dialogue_text, _old_summary in story_dialogue_texts:
    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE id = %s
    """, (dialogue_text, db_id))

conn.commit()
print("Updated story chunk dialogue texts")
|
||||
|
||||
# Summarise each story's dialogue, embed the summary, and store the summary
# text on the story row.
print(f"\n=== Step 4: Generate LLM summaries (all {len(story_dialogue_texts)} stories) ===")
summaries = []
for i, item in enumerate(story_dialogue_texts):
    db_id, cid, idx, st, et, dialogue_text, old_summary = item

    if len(dialogue_text) < 10:
        # Too little text to be worth a model call.
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        print(f"  [{i+1}/{len(story_dialogue_texts)}] {cid}: {len(dialogue_text)} chars", end="")
        try:
            # NOTE(review): call_llm in this file reports errors by returning
            # "", so a failed call stores an empty summary rather than "[error]".
            summary = call_llm(dialogue_text[:3000])
            print(f" -> {len(summary)} chars")
            time.sleep(0.3)
            embedding = call_embed(summary)
        except Exception as e:
            print(f" ERROR: {e}")
            summary = "[error]"
            embedding = [0.0] * 768

    # Update DB. Bound parameters replace the hand-escaped f-string so that
    # LLM output can never alter the statement.
    cur.execute("""
        UPDATE dev.chunks
        SET summary_text = %s, updated_at = NOW()
        WHERE id = %s
    """, (summary, db_id))

    summaries.append({
        "db_id": db_id,
        "chunk_id": cid,
        "chunk_index": idx,
        "start_time": st,
        "end_time": et,
        "dialogue": dialogue_text,
        "summary": summary,
        "embedding": embedding,
    })

conn.commit()
print(f"\nGenerated {len(summaries)} summaries")
|
||||
|
||||
print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
# Delete existing collection. Best effort: HTTP and connection errors
# (URLError and HTTPError are OSError subclasses) are ignored.
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories", method="DELETE")
try:
    urlopen(req)
    time.sleep(0.3)
except OSError:
    pass

# Recreate
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories",
              data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
              headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req)
time.sleep(0.3)

# Dialogue points use ids chunk_index+1; summary points are shifted past the
# largest dialogue id. Deriving the shift from the data (instead of a fixed
# 228) keeps the two id ranges disjoint for any number of stories.
summary_id_offset = max((s["chunk_index"] for s in summaries), default=-1) + 1

dialogue_points = []
summary_points = []
for s in summaries:
    idx = s["chunk_index"]
    dialogue_points.append({
        "id": idx + 1,
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_dialogue",
            "text": s["dialogue"][:500],
        }
    })
    summary_points.append({
        "id": idx + 1 + summary_id_offset,
        "vector": s["embedding"],
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_summary",
            "summary": s["summary"],
        }
    })

all_story_points = dialogue_points + summary_points

batch_size = 100
for start in range(0, len(all_story_points), batch_size):
    batch = all_story_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req)
    except Exception as e:
        print(f"  Batch {start}: {e}")
    # Progress line on every third batch.
    if (start // batch_size) % 3 == 0:
        print(f"  Uploaded {start + len(batch)}/{len(all_story_points)}")

print(f"Uploaded {len(all_story_points)} points to momentry_dev_stories")
|
||||
|
||||
print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
# Per-sentence collections, both 768D:
#   sentence_story:   one template ("(Speaker) text") point per sentence
#   sentence_summary: LLM summary payloads (not generated by this script)
import re

# NOTE(review): sentence_summary is dropped and recreated here but never
# repopulated below, so any existing points in it are lost - confirm intended.
for col_name in ["sentence_story", "sentence_summary"]:
    req = Request(f"{QDRANT_URL}/collections/{col_name}", method="DELETE")
    try:
        urlopen(req)
        time.sleep(0.2)
    except OSError:
        # Best effort: the collection may not exist yet.
        pass

    req = Request(f"{QDRANT_URL}/collections/{col_name}",
                  data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    urlopen(req)
    time.sleep(0.2)

# Build points for sentence_story
speaker_prefix = re.compile(r'\]\s*(.*)')
story_sentence_points = []
for idx in sorted(sentences.keys()):
    s = sentences[idx]
    raw_text = ""
    if s["content"] and isinstance(s["content"], dict):
        raw_text = s["content"].get("data", {}).get("text", "")
    if not raw_text and s["old_text"]:
        # Same fallback as steps 2 and 3: strip the old "[Speaker]" prefix
        # from the stored text instead of emitting an empty line.
        m = speaker_prefix.search(s["old_text"])
        raw_text = m.group(1) if m else s["old_text"]

    dialog_line = f'({s["new_name"]}) {raw_text}'

    story_sentence_points.append({
        "id": idx + 1,
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": f"{UUID}_{idx}",
            "file_uuid": UUID,
            # NOTE(review): times are written as 0 because the step 1 query
            # does not select start_time/end_time - confirm this is acceptable.
            "start_time": 0,
            "end_time": 0,
            "text": dialog_line,
            "speaker_name": s["new_name"],
            "chunk_type": "sentence",
        }
    })

# Upload sentence_story (dialogue template)
batch_size = 200
total_points = len(story_sentence_points)
for start in range(0, total_points, batch_size):
    batch = story_sentence_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req)
    except Exception as e:
        print(f"  sentence_story batch {start}: {e}")
    # Progress line on every fifth batch, against the real total.
    if (start // batch_size) % 5 == 0:
        print(f"  Uploaded {start + len(batch)}/{total_points} sentence_story")

print("Uploaded sentence_story points")

# sentence_summary will be populated when we generate per-sentence summaries
# For now, mark as TODO
print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")

cur.close()
conn.close()
print("\n=== Done ===")
|
||||
180
scripts/rescan_single_frame_traces.py
Normal file
180
scripts/rescan_single_frame_traces.py
Normal file
@@ -0,0 +1,180 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Rescan cut scenes at 1-frame interval to find more face detections
|
||||
for single-frame traces.
|
||||
|
||||
Usage:
|
||||
python3 scripts/rescan_single_frame_traces.py --file-uuid <uuid> [--video-path <path>]
|
||||
"""
|
||||
import os, sys, json, subprocess, tempfile, argparse, time, psycopg2
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
|
||||
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
|
||||
SCRIPTS_DIR = os.environ.get("MOMENTRY_SCRIPTS_DIR", "/Users/accusys/momentry_core_0.1/scripts")
|
||||
VENV_PYTHON = "/Users/accusys/momentry_core_0.1/venv/bin/python"
|
||||
|
||||
def get_cut_scenes_with_single_traces(file_uuid):
    """Return the 'cut' chunks of a file that contain single-detection face traces.

    A trace counts as "single" when its trace_id has exactly one row in
    dev.face_detections for this file.  Each cut chunk is joined to the
    detections whose frame_number falls inside its frame range.

    Args:
        file_uuid: value matched against the file_uuid columns.

    Returns:
        A list of tuples
        (chunk_id, start_frame, end_frame, start_time, end_time, single_traces),
        ordered by the number of distinct single traces, highest first.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        try:
            cur.execute("SET search_path TO dev")
            cur.execute("""
                SELECT c.chunk_id, c.start_frame, c.end_frame, c.start_time, c.end_time,
                       COUNT(DISTINCT s.trace_id) as single_traces
                FROM dev.chunks c
                JOIN dev.face_detections fd ON fd.file_uuid=c.file_uuid
                    AND fd.frame_number >= c.start_frame AND fd.frame_number <= c.end_frame
                JOIN (
                    SELECT trace_id FROM dev.face_detections
                    WHERE file_uuid=%s AND trace_id IS NOT NULL
                    GROUP BY trace_id HAVING COUNT(*) = 1
                ) s ON s.trace_id = fd.trace_id
                WHERE c.file_uuid=%s AND c.chunk_type='cut'
                GROUP BY c.id, c.chunk_id, c.start_frame, c.end_frame, c.start_time, c.end_time
                ORDER BY single_traces DESC
            """, (file_uuid, file_uuid))
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        # Always release the connection, even when a query raises.
        conn.close()
|
||||
|
||||
def process_scene(file_uuid, video_path, chunk_id, start_frame, end_frame, start_time, end_time):
    """Re-run face detection on one cut scene at a 1-frame sample interval.

    The scene (padded by one second on each side) is re-encoded with ffmpeg
    into a temporary segment, face_processor.py is run on that segment, and
    the reported frame numbers are mapped back and filtered to the
    [start_frame, end_frame] range of the scene.

    Args:
        file_uuid: file identifier; passed to the face processor and used to
            name the temporary directory.
        video_path: path of the source video.
        chunk_id: identifier of the scene, used for temporary file names.
        start_frame, end_frame: inclusive frame range of the scene.
        start_time, end_time: scene boundaries in seconds.

    Returns:
        (detections, None) on success, where detections is a list of
        {"frame": int, "faces": list} dicts for frames with at least one
        face, or (None, error_message) on failure.
    """
    temp_dir = Path(OUTPUT_DIR) / f"rescan_{file_uuid[:8]}"
    temp_dir.mkdir(exist_ok=True)

    seg_path = temp_dir / f"{chunk_id}.mp4"
    out_path = temp_dir / f"{chunk_id}.face.json"

    # Drop leftovers of an interrupted earlier run so the exists() checks
    # below can only succeed on files produced by this call.
    seg_path.unlink(missing_ok=True)
    out_path.unlink(missing_ok=True)

    try:
        # Extract segment
        duration = end_time - start_time + 2  # pad 2 seconds
        result = subprocess.run([
            "ffmpeg", "-y", "-i", video_path,
            "-ss", str(max(0, start_time - 1)),
            "-t", str(duration),
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-an",  # no audio
            str(seg_path)
        ], capture_output=True, text=True)

        if not seg_path.exists():
            return None, f"ffmpeg failed: {result.stderr[:200]}"

        # NOTE(review): the segment starts one *second* before start_time,
        # but this offset subtracts one *frame*.  Looks inconsistent unless
        # the processor's numbering compensates -- confirm against
        # face_processor.py before changing.
        frame_offset = start_frame - 1

        # Run face processor
        try:
            result = subprocess.run([
                VENV_PYTHON, str(Path(SCRIPTS_DIR) / "face_processor.py"),
                str(seg_path), str(out_path),
                "--sample-interval", "1",
                "--uuid", file_uuid,
            ], capture_output=True, text=True, timeout=180)
        except subprocess.TimeoutExpired:
            # Report as a per-scene error instead of aborting the whole run.
            return None, "face processor timed out after 180s"

        if not out_path.exists():
            detail = (result.stderr or "").strip()[:200]
            if detail:
                return None, f"face processor failed: {detail}"
            return None, "face processor failed"

        # Read results and re-map frame numbers
        try:
            with open(out_path) as f:
                data = json.load(f)
        except (OSError, json.JSONDecodeError) as exc:
            return None, f"face processor output unreadable: {exc}"

        new_detections = []
        for entry in data.get("frames", []):
            orig_frame = int(entry.get("frame", 0)) + frame_offset
            if orig_frame < start_frame or orig_frame > end_frame:
                continue
            faces = entry.get("faces", [])
            if faces:
                new_detections.append({"frame": orig_frame, "faces": faces})

        return new_detections, None
    finally:
        # Cleanup temp files on every exit path, including exceptions.
        seg_path.unlink(missing_ok=True)
        out_path.unlink(missing_ok=True)
|
||||
|
||||
def merge_into_face_json(file_uuid, scene_detections):
    """Merge rescanned detections into ``<OUTPUT_DIR>/<file_uuid>.face.json``.

    For a frame that already exists in the file, only faces whose "face_id"
    is not yet present on that frame are appended; unknown frames are added
    as new entries.  The frame list is re-sorted by frame number and the file
    is rewritten.

    Args:
        file_uuid: identifier used to locate the face.json file.
        scene_detections: iterable of {"frame": int, "faces": list} dicts.

    Returns:
        Number of face records added.
    """
    face_path = Path(OUTPUT_DIR) / f"{file_uuid}.face.json"

    with open(face_path) as f:
        face_data = json.load(f)

    # setdefault so a file without a "frames" key is extended instead of
    # raising KeyError on the first append.
    frames = face_data.setdefault("frames", [])

    # Index existing frames
    frame_index = {entry["frame"]: i for i, entry in enumerate(frames)}

    new_faces = 0
    for entry in scene_detections:
        fn = entry["frame"]
        if fn in frame_index:
            # Add new faces not already present
            target = frames[frame_index[fn]]["faces"]
            known_ids = {face.get("face_id") for face in target}
            for face in entry["faces"]:
                face_id = face.get("face_id")
                if face_id in known_ids:
                    continue
                target.append(face)
                new_faces += 1
                if face_id is not None:
                    # Prevent the same id being appended twice from one batch.
                    known_ids.add(face_id)
        else:
            frames.append({"frame": fn, "faces": list(entry["faces"])})
            # Keep the index current so a repeated frame merges into this entry.
            frame_index[fn] = len(frames) - 1
            new_faces += len(entry["faces"])

    # Re-sort by frame (frame_index is not used past this point)
    frames.sort(key=lambda x: x["frame"])

    # Write to a sibling temp file and swap it in, so an interrupted write
    # cannot leave a truncated face.json behind.
    tmp_path = face_path.with_name(face_path.name + ".tmp")
    with open(tmp_path, "w") as f:
        json.dump(face_data, f)
    os.replace(tmp_path, face_path)

    return new_faces
|
||||
|
||||
def main():
    """Rescan every cut scene that contains single-frame face traces.

    Command line:
        --file-uuid   required; file to process.
        --video-path  optional; when omitted the path is read from
                      dev.videos.file_path for the given UUID.

    Each scene is processed sequentially with process_scene() and its
    detections are merged into the file's face.json right away, so completed
    scenes are persisted even if a later one fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--file-uuid", required=True)
    parser.add_argument("--video-path", default=None)
    args = parser.parse_args()

    file_uuid = args.file_uuid

    if args.video_path:
        video_path = args.video_path
    else:
        # Try to find video path from DB
        conn = psycopg2.connect(DB_URL)
        try:
            cur = conn.cursor()
            cur.execute("SET search_path TO dev")
            cur.execute("SELECT file_path FROM dev.videos WHERE file_uuid=%s", (file_uuid,))
            row = cur.fetchone()
            cur.close()
        finally:
            conn.close()
        if not row:
            print(f"Video not found for UUID {file_uuid}")
            return
        video_path = row[0]

    print(f"Scanning for single-frame traces in {file_uuid}")
    scenes = get_cut_scenes_with_single_traces(file_uuid)
    print(f"Found {len(scenes)} cut scenes with single-frame traces")

    total_new = 0
    start_time = time.time()

    for i, (chunk_id, sf, ef, st, et, _n_traces) in enumerate(scenes):
        t0 = time.time()
        detections, error = process_scene(file_uuid, video_path, chunk_id, sf, ef, st, et)

        if error:
            print(f"[{i+1}/{len(scenes)}] {chunk_id}: ERROR - {error}")
            continue

        if not detections:
            print(f"[{i+1}/{len(scenes)}] {chunk_id}: no new detections")
            continue

        added = merge_into_face_json(file_uuid, detections)
        total_new += added
        now = time.time()
        elapsed = now - t0
        # Average over all scenes handled so far; extrapolating from the last
        # scene alone makes the ETA jump with every short or long scene.
        avg_per_scene = (now - start_time) / (i + 1)
        eta = (len(scenes) - i - 1) * avg_per_scene

        print(f"[{i+1}/{len(scenes)}] {chunk_id}: +{added} faces ({len(detections)} frames, {elapsed:.0f}s, ETA {eta/60:.0f}min)")

    print(f"\nDone! Added {total_new} new face detections across {len(scenes)} scenes")
    print(f"Total time: {(time.time()-start_time)/60:.1f} min")


if __name__ == "__main__":
    main()
|
||||
164
scripts/scan_handheld_objects.py
Normal file
164
scripts/scan_handheld_objects.py
Normal file
@@ -0,0 +1,164 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Scan Charade for hand-held objects using YOLO spatial overlap + pose wrist verification.
|
||||
Strategy:
|
||||
1. Sample frames at regular intervals
|
||||
2. For each person, check if non-person objects overlap with hand area
|
||||
3. Use pose wrist keypoints to verify hand position
|
||||
4. Classify with Grounding DINO
|
||||
"""
|
||||
import json, sys, time, psycopg2
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
FPS = 25.0
|
||||
SAMPLE_INTERVAL = 300 # every 300 frames = every 12s
|
||||
HAND_RADIUS = 100 # pixels around wrist to check for objects
|
||||
|
||||
def iou(box1, box2):
    """Return the intersection-over-union of two boxes given as [x, y, w, h].

    Returns 0 when the boxes do not overlap (touching edges count as no
    overlap) or when the union area is not positive.
    """
    ax, ay, aw, ah = box1
    bx, by, bw, bh = box2

    # Extent of the overlapping rectangle, clamped at zero per axis.
    overlap_w = max(0, min(ax + aw, bx + bw) - max(ax, bx))
    overlap_h = max(0, min(ay + ah, by + bh) - max(ay, by))
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0

    union = aw * ah + bw * bh - inter
    if union > 0:
        return inter / union
    return 0
|
||||
|
||||
# Script body: load wrist keypoints, sample YOLO frames, report every
# non-person object whose box overlaps a person box, and flag those whose
# centre lies within HAND_RADIUS pixels of a detected wrist.
print("=== Hand-held Object Scanner ===")
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()

    # Load pose wrist data (frame → wrist positions)
    print("Loading pose wrist data...")
    t0 = time.time()
    cur.execute("""
        SELECT start_frame, data
        FROM dev.pre_chunks
        WHERE file_uuid=%s AND processor_type='pose'
          AND data->'persons' IS NOT NULL
        ORDER BY start_frame
    """, (UUID,))
    pose_wrists = {}  # frame → list of (x, y) wrist positions
    for frame, pose_data in cur.fetchall():
        wrists = [
            (kp["x"], kp["y"])
            for p in pose_data.get("persons", [])
            for kp in p.get("keypoints", [])
            if kp.get("name", "") in ("left_wrist", "right_wrist") and kp.get("confidence", 0) > 0.1
        ]
        if wrists:
            pose_wrists[frame] = wrists
    print(f"  {len(pose_wrists)} frames with wrists ({time.time()-t0:.1f}s)")

    # Scan YOLO frames
    print(f"Scanning YOLO data (interval={SAMPLE_INTERVAL})...")
    t0 = time.time()

    # Get total frames
    cur.execute("SELECT MAX(start_frame) FROM dev.pre_chunks WHERE file_uuid=%s AND processor_type='yolo'", (UUID,))
    max_frame = cur.fetchone()[0] or 0

    results = []
    for frame_num in range(0, max_frame + 1, SAMPLE_INTERVAL):
        # Get YOLO detections for this frame
        cur.execute("""
            SELECT data->'objects'
            FROM dev.pre_chunks
            WHERE file_uuid=%s AND processor_type='yolo' AND start_frame=%s
        """, (UUID, frame_num))
        yolo_row = cur.fetchone()
        if not yolo_row or not yolo_row[0]:
            continue

        objects = yolo_row[0]
        # Find persons
        persons = [o for o in objects if o.get("class_name") == "person" and o.get("confidence", 0) > 0.5]
        if not persons:
            continue

        # Find non-person objects
        items = [o for o in objects if o.get("class_name") != "person" and o.get("confidence", 0) > 0.3]
        if not items:
            continue

        # Get wrist positions for this frame
        wrists = pose_wrists.get(frame_num, [])

        ts = frame_num / FPS

        for item in items:
            item_box = (item["x"], item["y"], item["width"], item["height"])
            item_center_x = item["x"] + item["width"] / 2
            item_center_y = item["y"] + item["height"] / 2

            # Check if item is near any person
            for person in persons:
                person_box = (person["x"], person["y"], person["width"], person["height"])
                overlap = iou(item_box, person_box)
                if overlap <= 0.01:
                    continue

                # Check if near a wrist (if pose data available)
                near_hand = any(
                    ((item_center_x - wx) ** 2 + (item_center_y - wy) ** 2) ** 0.5 < HAND_RADIUS
                    for wx, wy in wrists
                )

                results.append({
                    "frame": frame_num,
                    "timestamp": round(ts, 1),
                    "time_str": f"{int(ts//60)}:{int(ts%60):02d}",
                    "object": item["class_name"],
                    "confidence": round(item.get("confidence", 0), 3),
                    "near_hand": near_hand,
                    "overlap": round(overlap, 3),
                })

    elapsed = time.time() - t0
    print(f"  Scanned in {elapsed:.1f}s")
finally:
    # Release the connection even if a query or a malformed row raises.
    conn.close()

# Deduplicate: keep the first record per (frame, object class)
seen = set()
deduped = []
for r in results:
    key = (r["frame"], r["object"])
    if key not in seen:
        seen.add(key)
        deduped.append(r)

# Group by object type
by_object = defaultdict(list)
for r in deduped:
    by_object[r["object"]].append(r)

print(f"\n=== Results: {len(deduped)} hand-held object detections ===")
print(f"{'Object':<20} {'Count':>6} {'Near hand':>12} {'Timestamps':<40}")
print("-"*80)
for obj, items in sorted(by_object.items(), key=lambda x: -len(x[1])):
    near_hand = sum(1 for i in items if i["near_hand"])
    ts_list = ", ".join(i["time_str"] for i in items[:5])
    if len(items) > 5:
        ts_list += f" ... (+{len(items)-5})"
    # Width 12 matches the 'Near hand' header column.
    print(f"{obj:<20} {len(items):>6} {near_hand:>12d} {ts_list:<40}")

# Save
with open("/Users/accusys/momentry/output_dev/handheld_objects.json", "w") as out_file:
    json.dump(deduped, out_file, indent=2)
print(f"\nSaved to output_dev/handheld_objects.json")
|
||||
169
scripts/speaker_bind_lip.py
Normal file
169
scripts/speaker_bind_lip.py
Normal file
@@ -0,0 +1,169 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Binding with Lip Verification
|
||||
Reads face.json (8Hz outer_lips) + asrx.json + identity_bindings
|
||||
For each ASR segment with face data + lip motion, create speaker→identity binding.
|
||||
"""
|
||||
|
||||
import json, subprocess, sys
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
|
||||
OUTPUT_DIR = Path("/Users/accusys/momentry/output_dev")
|
||||
PSQL = ["/Users/accusys/pgsql/18.3/bin/psql", "-U", "accusys", "-d", "momentry", "-t", "-A"]
|
||||
|
||||
|
||||
def psql(sql: str) -> str:
    """Run one SQL statement through the psql CLI and return its stripped stdout.

    The command line comes from the module-level PSQL list, with "-c <sql>"
    appended.  A non-zero exit status does not raise; the error is reported on
    stderr so a failed statement is not mistaken for an empty result.

    Args:
        sql: statement passed to psql via "-c".

    Returns:
        psql's standard output with surrounding whitespace removed.

    Raises:
        subprocess.TimeoutExpired: if psql does not finish within 30 seconds.
    """
    r = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        print(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}", file=sys.stderr)
    return r.stdout.strip()
|
||||
|
||||
|
||||
def calc_lip_height(face_data):
    """Return the vertical extent of a face's outer-lip outline, or None.

    ``face_data["lips"]`` may be a dict holding the points under
    "outer_lips" or the list of [x, y] points itself.  The result is
    max(y) - min(y) over the points; None is returned when fewer than three
    points are available.
    """
    lips = face_data.get("lips", {})
    if isinstance(lips, dict):
        outline = lips.get("outer_lips", [])
    else:
        outline = lips

    if not outline or len(outline) < 3:
        return None

    y_values = [point[1] for point in outline]
    return max(y_values) - min(y_values)
|
||||
|
||||
|
||||
# Script body: score (speaker, face trace) pairs by how much the lip height
# during a speaker's ASR segments exceeds the height before them, then store
# speaker → identity bindings for pairs with enough lip motion.
print("=== Speaker Binding with Lip Verification ===")

# Step 1: Load face traces with identity_id
traces = psql(f"""
    SELECT trace_id, identity_id FROM dev.face_detections
    WHERE file_uuid='{UUID}' AND trace_id IS NOT NULL AND identity_id IS NOT NULL
    GROUP BY trace_id, identity_id
""")
trace_identity = {}
for line in traces.strip().split('\n'):
    if not line.strip() or '|' not in line:
        continue
    p = line.split('|')
    trace_identity[int(p[0])] = int(p[1])
print(f"Traces with identity: {len(trace_identity)}")

# Step 2: Load trace frame ranges
tf = psql(f"""
    SELECT trace_id, MIN(frame_number), MAX(frame_number), MIN(timestamp_secs), MAX(timestamp_secs)
    FROM dev.face_detections WHERE file_uuid='{UUID}' AND trace_id IS NOT NULL
    GROUP BY trace_id
""")
trace_ranges = {}
for line in tf.strip().split('\n'):
    if not line.strip() or '|' not in line:
        continue
    p = line.split('|')
    trace_ranges[int(p[0])] = {
        'min_frame': int(p[1]), 'max_frame': int(p[2]),
        'min_ts': float(p[3]), 'max_ts': float(p[4])
    }

# Step 3: Load lip analysis per frame from face.json
print("Loading face.json lips data...")
with open(OUTPUT_DIR / f"{UUID}.face.json") as face_file:
    face = json.load(face_file)
frame_faces = {}  # frame number → lip heights of the faces in that frame
for fr in face.get("frames", []):
    heights = [h for h in (calc_lip_height(fd) for fd in fr.get("faces", [])) if h is not None]
    if heights:
        frame_faces[fr["frame"]] = heights

print(f"Frames with lip data: {len(frame_faces)}")

# Step 4: Load ASRX segments
with open(OUTPUT_DIR / f"{UUID}.asrx.json") as asrx_file:
    asrx = json.load(asrx_file)
segments = asrx.get("segments", [])

# Step 5: For each ASR segment with face overlap, compute lip motion
fps = 25.0
# Frame timestamps do not depend on the segment; compute them once.
frame_samples = [(fn / fps, heights) for fn, heights in frame_faces.items()]
speaker_trace_scores = defaultdict(list)

for seg in segments:
    st = seg.get("start_time", 0)
    et = seg.get("end_time", 0)
    speaker = seg.get("speaker_id", "")
    if not speaker:
        continue

    # Find overlapping traces
    overlapping_traces = [
        tid for tid, tr in trace_ranges.items()
        if tr['min_ts'] <= et and tr['max_ts'] >= st
    ]
    if not overlapping_traces:
        continue

    # Compute lip motion for each overlapping trace
    for tid in overlapping_traces:
        tr = trace_ranges[tid]
        baseline = []  # lip heights more than 1s before the segment starts
        during = []    # lip heights while the segment is spoken
        for fn_ts, heights in frame_samples:
            if fn_ts < tr['min_ts'] or fn_ts > tr['max_ts']:
                continue
            if fn_ts < st - 1.0:
                baseline.extend(heights)
            elif st <= fn_ts <= et:
                during.extend(heights)

        if not baseline or not during:
            continue

        baseline_avg = sum(baseline) / len(baseline)
        during_avg = sum(during) / len(during)
        motion = (during_avg - baseline_avg) / max(baseline_avg, 0.1)
        score = max(0, min(1.0, motion * 5))  # Normalize: 20% motion → 1.0

        speaker_trace_scores[(speaker, tid)].append(score)

# Step 6: Create speaker bindings
existing = psql(f"SELECT identity_value FROM dev.identity_bindings WHERE identity_type='speaker' AND identity_id IN (SELECT identity_id FROM dev.face_detections WHERE file_uuid='{UUID}' AND identity_id IS NOT NULL GROUP BY identity_id)")
existing_speakers = set(existing.strip().split('\n')) if existing.strip() else set()

new_bindings = 0
for (speaker, tid), scores in speaker_trace_scores.items():
    if tid not in trace_identity:
        continue
    identity_id = trace_identity[tid]
    avg_score = sum(scores) / len(scores) if scores else 0

    if speaker in existing_speakers:
        continue
    if avg_score < 0.3:  # Threshold: need meaningful lip motion
        continue

    # The speaker id comes from a JSON file; double any single quote so it
    # cannot terminate the SQL string literal.
    speaker_sql = str(speaker).replace("'", "''")
    psql(f"""
        INSERT INTO dev.identity_bindings (identity_id, identity_type, identity_value, confidence, metadata)
        VALUES ({identity_id}, 'speaker', '{speaker_sql}', {avg_score:.3f}, '{{"source":"lip_analysis","trace_id":{tid},"segments":{len(scores)},"avg_score":{avg_score:.3f}}}'::jsonb)
        ON CONFLICT (identity_id, identity_type, identity_value) DO UPDATE SET confidence=EXCLUDED.confidence
    """)
    new_bindings += 1

print(f"\n=== Done ===")
print(f"ASR segments analyzed: {len(segments)}")
print(f"Segments with face+lip data: {len(speaker_trace_scores)}")
print(f"New speaker bindings: {new_bindings}")

# Verify
binds = psql(f"SELECT ib.identity_value, i.name FROM dev.identity_bindings ib JOIN dev.identities i ON i.id=ib.identity_id WHERE ib.identity_type='speaker' AND i.id IN (SELECT identity_id FROM dev.face_detections WHERE file_uuid='{UUID}') ORDER BY ib.identity_value")
print(f"\nSpeaker bindings:")
for line in binds.strip().split('\n'):
    if line.strip() and '|' in line:
        p = line.split('|')
        print(f"  {p[0]:15s} → {p[1]}")
|
||||
204
scripts/split_asr_segments.py
Normal file
204
scripts/split_asr_segments.py
Normal file
@@ -0,0 +1,204 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Split ASR segments at detected speaker change points.
|
||||
Uses ECAPA-TDNN sub-window classification against reference centroids.
|
||||
|
||||
Output: new asrx_fine.json with fine-grained segments + parent_asr_idx reference.
|
||||
"""
|
||||
import json, sys, os, time, argparse, subprocess, tempfile, shutil
|
||||
import numpy as np
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "asrx_self"))
|
||||
from main_fixed import SelfASRXFixed
|
||||
from speaker_encoder import extract_speaker_embedding, normalize_embeddings
|
||||
import torchaudio, psycopg2
|
||||
|
||||
SUB_WIN = 0.5
|
||||
SUB_STRIDE = 0.25
|
||||
CHANGE_CONFIRM = 2
|
||||
MIN_DUR = 0.7
|
||||
BATCH_SIZE = 500
|
||||
|
||||
def load_reference(uuid, db_url):
    """Build per-name reference centroids from the file's existing ASRX output.

    Sentence chunks in dev.chunks supply a name per chunk_index
    (metadata->>'new_speaker_name'); segment i of the ASRX JSON is assumed to
    correspond to chunk_index i.  Embeddings of segments named "Cary Grant",
    "Audrey Hepburn" or "Unknown" are averaged and L2-normalised per name.

    Args:
        uuid: file UUID used for the DB query and the ASRX file name.
        db_url: connection string passed to psycopg2.connect.

    Returns:
        (centroids, name_to_speaker): centroids maps each name that has at
        least one embedding to its unit-length mean vector; name_to_speaker
        maps every name to the speaker_id of the first segment carrying it.
    """
    conn = psycopg2.connect(db_url)
    try:
        cur = conn.cursor()
        cur.execute("SELECT chunk_index, metadata->>'new_speaker_name' FROM dev.chunks WHERE file_uuid=%s AND chunk_type='sentence' ORDER BY chunk_index", (uuid,))
        name_by_idx = dict(cur.fetchall())
    finally:
        conn.close()

    asrx_path = f"/Users/accusys/momentry/output_dev/{uuid}.asrx.json"
    with open(asrx_path) as f:
        asrx_full = json.load(f)

    embeddings = asrx_full.get("embeddings", [])
    ref = {"Cary Grant": [], "Audrey Hepburn": [], "Unknown": []}
    name_to_speaker = {}
    for i, seg in enumerate(asrx_full["segments"]):
        name = name_by_idx.get(i, "Unknown")
        if name in ref and i < len(embeddings):
            ref[name].append(np.array(embeddings[i]))
        # First segment seen for a name decides its speaker id.
        name_to_speaker.setdefault(name, seg["speaker_id"])

    centroids = {}
    for name, el in ref.items():
        if el:
            c = np.mean(el, axis=0)
            # Epsilon guards against division by zero for an all-zero mean.
            centroids[name] = c / (np.linalg.norm(c) + 1e-10)

    return centroids, name_to_speaker
|
||||
|
||||
def extract_audio(video_path, sr=16000):
    """Extract the audio track of a video as a mono waveform.

    ffmpeg writes a 16-bit mono WAV at the requested rate into a fresh
    temporary directory, which is then loaded with torchaudio.

    Args:
        video_path: path of the input video.
        sr: target sample rate passed to ffmpeg.

    Returns:
        (wav_data, sr_actual, tmp): the loaded waveform (averaged over
        channels if more than one is present), the sample rate reported by
        torchaudio, and the temporary directory, which the caller must remove.

    Raises:
        subprocess.CalledProcessError / subprocess.TimeoutExpired: if ffmpeg
            fails or exceeds 300 seconds.  The temporary directory is removed
            before the exception propagates.
    """
    tmp = tempfile.mkdtemp(prefix="asr_split_")
    try:
        wav = os.path.join(tmp, "audio.wav")
        subprocess.run(["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
                        "-ar", str(sr), "-ac", "1", "-sample_fmt", "s16", wav],
                       check=True, capture_output=True, timeout=300)
        wav_data, sr_actual = torchaudio.load(wav)
    except BaseException:
        # The caller never receives `tmp` on failure, so clean it up here.
        shutil.rmtree(tmp, ignore_errors=True)
        raise
    if wav_data.shape[0] > 1:
        wav_data = wav_data.mean(dim=0, keepdim=True)
    return wav_data, sr_actual, tmp
|
||||
|
||||
def classify(emb, centroids):
    """Return the key of the centroid with the largest dot product with emb.

    Ties go to the first key in the dict's iteration order; an empty
    ``centroids`` raises ValueError.
    """
    scores = {name: float(np.dot(emb, centre)) for name, centre in centroids.items()}
    return max(scores, key=scores.get)
|
||||
|
||||
def process_batch(asr_segs, wav, sr, centroids, encoder, offset_start=0):
    """Label ASR segments by speaker and split them at speaker changes.

    Segments shorter than 1s, or yielding fewer than three sub-windows, are
    classified as a whole.  Longer segments are cut into SUB_WIN-second
    windows every SUB_STRIDE seconds; each window is classified against the
    centroids, the labels are smoothed with a 3-window majority vote, and a
    confirmed label change becomes a split point.

    Pieces shorter than MIN_DUR are merged into a neighbouring piece rather
    than discarded, so the returned pieces always cover the whole segment.

    Args:
        asr_segs: sequence of dicts with "start" and "end" in seconds.
        wav: waveform indexed as wav[:, samples] and providing .numpy().
        sr: sample rate of wav.
        centroids: mapping name → reference vector, as used by classify().
        encoder: speaker encoder passed to extract_speaker_embedding.
        offset_start: time in seconds of wav's first sample relative to the
            segment timestamps.

    Returns:
        List of (start, end, name, index_in_asr_segs) tuples.
    """
    ws = int(SUB_WIN * sr)
    sw = int(SUB_STRIDE * sr)
    results = []

    def label_span(first, last):
        """Classify the audio between two sample positions as a single unit."""
        emb = extract_speaker_embedding(encoder, wav[:, first:last].numpy(), sr)
        emb /= np.linalg.norm(emb) + 1e-10
        return classify(emb, centroids)

    for si, s in enumerate(asr_segs):
        st = s["start"] - offset_start
        et = s["end"] - offset_start
        ss = int(st * sr)
        se = int(et * sr)

        if et - st < 1.0:
            results.append((s["start"], s["end"], label_span(ss, se), si))
            continue

        sub_e, sub_t = [], []
        for wpos in range(ss, se - ws + 1, sw):
            chunk = wav[:, wpos:wpos + ws]
            sub_e.append(extract_speaker_embedding(encoder, chunk.numpy(), sr))
            sub_t.append(wpos / sr + offset_start)

        if len(sub_e) < 3:
            results.append((s["start"], s["end"], label_span(ss, se), si))
            continue

        sub_e = normalize_embeddings(np.array(sub_e))
        names = [classify(e, centroids) for e in sub_e]
        majority = Counter(names).most_common(1)[0][0]

        # Smooth: majority vote over each window and its two neighbours
        sm = list(names)
        for i in range(1, len(names) - 1):
            sm[i] = Counter(names[i - 1:i + 2]).most_common(1)[0][0]

        # Find splits: a change counts when the new label persists for
        # CHANGE_CONFIRM further windows, or when too few windows remain to
        # confirm it.
        splits = []
        prev = sm[0]
        for i in range(1, len(sm)):
            if sm[i] == prev:
                continue
            if i + CHANGE_CONFIRM < len(sm):
                confirmed = all(sm[i] == sm[j] for j in range(i, i + CHANGE_CONFIRM + 1))
            else:
                confirmed = True
            if confirmed:
                splits.append(sub_t[i])
                prev = sm[i]

        # Keep only cuts that leave at least MIN_DUR on both sides; a rejected
        # cut merges its short piece into the neighbour instead of dropping
        # that audio from the output.
        cuts = []
        last = s["start"]
        for t in splits:
            if t - last >= MIN_DUR:
                cuts.append(t)
                last = t
        if cuts and s["end"] - cuts[-1] < MIN_DUR:
            cuts.pop()

        if not cuts:
            results.append((s["start"], s["end"], majority, si))
            continue

        boundaries = [s["start"]] + cuts + [s["end"]]
        for ps, pe in zip(boundaries, boundaries[1:]):
            sub_i = [i for i, t in enumerate(sub_t) if ps <= t < pe]
            lbl = Counter(names[i] for i in sub_i).most_common(1)[0][0] if sub_i else majority
            results.append((round(ps, 2), round(pe, 2), lbl, si))

    return results
|
||||
|
||||
def main():
    """Split a file's ASR segments at speaker changes and write asrx_fine.json.

    Command line:
        --uuid    file UUID (defaults to the Charade demo file).
        --output  output path; defaults to <BASE>/<uuid>.asrx_fine.json.
        --video   source video; defaults to the Charade demo video.
    """
    default_video = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"

    parser = argparse.ArgumentParser()
    parser.add_argument("--uuid", default="aeed71342a899fe4b4c57b7d41bcb692")
    parser.add_argument("--output", help="Output path for fine ASRX JSON")
    parser.add_argument("--video", default=default_video, help="Source video to extract audio from")
    args = parser.parse_args()

    UUID = args.uuid
    BASE = "/Users/accusys/momentry/output_dev"
    DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
    VIDEO = args.video

    print(f"Processing {UUID}")

    centroids, _name_to_speaker = load_reference(UUID, DB_URL)
    print(f"Centroids: {list(centroids.keys())}")

    with open(f"{BASE}/{UUID}.asr.json") as f:
        asr = json.load(f)
    asr_segs = asr["segments"]
    print(f"ASR segments: {len(asr_segs)}")

    print("Extracting audio...")
    wav, sr, tmp_dir = extract_audio(VIDEO)
    try:
        print(f"Audio: {wav.shape[1]/sr:.0f}s")

        inst = SelfASRXFixed()
        encoder = inst.speaker_encoder

        all_results = []
        t0 = time.time()
        for batch_start in range(0, len(asr_segs), BATCH_SIZE):
            batch = asr_segs[batch_start:batch_start + BATCH_SIZE]
            segs = process_batch(batch, wav, sr, centroids, encoder)
            all_results.extend(segs)
            pct = (batch_start + len(batch)) * 100 // len(asr_segs)
            print(f"  {batch_start+len(batch)}/{len(asr_segs)} ({pct}%) -> {len(all_results)} segments [{time.time()-t0:.0f}s]")
    finally:
        # Remove the extracted WAV even when encoding or classification fails.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # Build output
    spk_stats = {}
    out_segs = []
    # Assign sequential SPEAKER_X IDs based on name order
    name_order = {name: i for i, name in enumerate(sorted(set(s[2] for s in all_results)))}

    for start, end, name, asr_idx in all_results:
        sid = f"SPEAKER_{name_order[name]}"
        stats = spk_stats.setdefault(sid, {"count": 0, "duration": 0})
        stats["count"] += 1
        stats["duration"] += end - start
        out_segs.append({
            "start_time": start,
            "end_time": end,
            "speaker_id": sid,
            "speaker_name": name,
            "parent_asr_idx": asr_idx,
        })

    output = {
        "uuid": UUID,
        "language": "en",
        "segments": out_segs,
        "speaker_stats": spk_stats,
        "total_asr_segments": len(asr_segs),
        "total_fine_segments": len(out_segs),
    }

    output_path = args.output or f"{BASE}/{UUID}.asrx_fine.json"
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)
    print(f"\nSaved: {output_path}")
    print(f"Segments: {len(out_segs)} (was {len(asr_segs)}, +{len(out_segs)-len(asr_segs)})")
    print(f"Speakers: {len(spk_stats)}")
    for sid, st in sorted(spk_stats.items()):
        print(f"  {sid}: {st['count']} segs, {st['duration']:.0f}s")


if __name__ == "__main__":
    main()
|
||||
98
scripts/step3_asr_fine.py
Normal file
98
scripts/step3_asr_fine.py
Normal file
@@ -0,0 +1,98 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Step 3: Re-run ASR with word_timestamps on full audio.
|
||||
Map words to 4188 fine segments for accurate text.
|
||||
"""
|
||||
import json, sys, os, time, subprocess, tempfile, shutil
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"

print("=== Load fine ASRX ===")
with open(f"{BASE}/{UUID}.asrx_fine.json") as fine_file:
    fine = json.load(fine_file)
fine_segs = fine["segments"]
print(f"Fine segments: {len(fine_segs)}")

print("\n=== Extract audio WAV ===")
tmp_dir = tempfile.mkdtemp(prefix="asr_step3_")
try:
    wav_path = os.path.join(tmp_dir, "audio.wav")
    subprocess.run(["ffmpeg", "-y", "-v", "quiet", "-i", VIDEO,
                    "-ar", "16000", "-ac", "1", "-sample_fmt", "s16", wav_path],
                   check=True, capture_output=True, timeout=300)

    print("Loading model with word_timestamps...")
    t0 = time.time()
    model = WhisperModel("small", device="cpu", compute_type="int8")
    print(f"  Model loaded in {time.time()-t0:.1f}s")

    print("Transcribing with word_timestamps=True...")
    t0 = time.time()
    segments, _info = model.transcribe(
        wav_path, beam_size=5, vad_filter=True,
        vad_parameters={"min_silence_duration_ms": 500},
        word_timestamps=True
    )

    # Collect all word-level data.  `segments` is consumed here, while the
    # WAV file still exists.
    words = []
    for seg in segments:
        if seg.words:
            for w in seg.words:
                wt = w.word.strip()
                if wt:
                    words.append({"word": wt, "start": w.start, "end": w.end})
        else:
            # No word timing for this segment: keep its text as one "word".
            words.append({"word": seg.text.strip(), "start": seg.start, "end": seg.end})

    elapsed = time.time() - t0
    print(f"  Done in {elapsed:.1f}s, {len(words)} words")
finally:
    # Remove the temporary WAV even when ffmpeg or transcription fails.
    shutil.rmtree(tmp_dir, ignore_errors=True)

# Map words to fine segments
print("\n=== Map words to fine segments ===")
wi = 0  # index of the next unassigned word; never moves backwards
assigned = 0
for fs in fine_segs:
    fstart = fs["start_time"]
    fend = fs["end_time"]
    seg_words = []

    while wi < len(words):
        w = words[wi]
        if w["end"] <= fstart:
            # Word ended before this segment began: skip it.
            wi += 1
            continue
        if w["start"] >= fend:
            # Word belongs to a later segment.
            break
        seg_words.append(w["word"])
        wi += 1

    text = " ".join(seg_words)
    fs["text"] = text
    if text:
        assigned += 1

print(f"  Segments with text: {assigned}/{len(fine_segs)}")

# Show examples: the first segment with more than 10 characters of text
print("\nSplit segment examples:")
for fs in fine_segs:
    if len(fs.get('text', '')) > 10:
        print(f"  [{fs['start_time']:.1f}-{fs['end_time']:.1f}] {fs['speaker_name']:15s} \"{fs['text'][:60]}\"")
        break  # just one for now

# Count text lengths
text_lens = [len(fs.get('text', '')) for fs in fine_segs]
avg_len = sum(text_lens) / len(text_lens) if text_lens else 0  # no segments → 0
print(f"\n  Avg text length: {avg_len:.0f} chars")
print(f"  Empty texts: {sum(1 for l in text_lens if l == 0)}")

# Save
fine["_asr_meta"] = {"word_timestamps": True, "asr_runtime_secs": round(elapsed, 1)}
with open(f"{BASE}/{UUID}.asrx_fine.json", "w") as fine_file:
    json.dump(fine, fine_file, indent=2)
print(f"\nSaved")
|
||||
87
scripts/story_embed.py
Normal file
87
scripts/story_embed.py
Normal file
@@ -0,0 +1,87 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Story Embedding Pipeline:
|
||||
1. Read story chunks → LLM summary (Gemma4)
|
||||
2. Embed summary (EmbeddingGemma)
|
||||
3. Store in chunks table + Qdrant
|
||||
"""
|
||||
|
||||
import json, urllib.request, subprocess, sys, time, os
|
||||
|
||||
# File whose story chunks are summarised and embedded by this script.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"

# Base psql command line; psql() appends "-c <sql>" to it.
PSQL = [
    "/Users/accusys/pgsql/18.3/bin/psql",
    "-U", "accusys",
    "-d", "momentry",
    "-t",
    "-A",
]

# HTTP endpoints used below: chat completion, embedding, and Qdrant.
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
QDRANT_URL = "http://localhost:6333"

# Qdrant collection that receives one point per story chunk.
QDRANT_COL = "momentry_dev_stories"
|
||||
|
||||
def psql(sql):
    """Run one SQL statement through the psql CLI and return its stdout.

    Args:
        sql: SQL text handed to psql with ``-c``.

    Returns:
        psql's standard output with surrounding whitespace stripped. The
        output is returned even when psql exits non-zero so existing callers
        keep working; the failure is reported on stderr instead of being
        dropped silently.

    Raises:
        subprocess.TimeoutExpired: if psql runs longer than 30 seconds.
    """
    r = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        # Previously a failed statement was indistinguishable from an empty result.
        print(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}", file=sys.stderr)
    return r.stdout.strip()
|
||||
|
||||
def call_llm(dialogue):
    """Ask the chat-completion endpoint at LLM_URL for a 50-word summary.

    Args:
        dialogue: Text to summarise; inserted verbatim into the prompt.

    Returns:
        The content of the first choice's message, stripped of whitespace.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        KeyError, IndexError: if the response lacks the expected
            ``choices[0].message.content`` structure.
    """
    prompt = f"Dialogue: {dialogue}\n\n50-word summary:"
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = urllib.request.Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the response deterministically; this runs once per chunk.
    with urllib.request.urlopen(req, timeout=120) as resp:
        payload = json.loads(resp.read())
    return payload["choices"][0]["message"]["content"].strip()
|
||||
|
||||
def call_embed(text):
    """Return the embedding vector for *text* from the endpoint at EMBED_URL.

    Args:
        text: String sent as the ``input`` field of the request.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        KeyError, IndexError: if the response has no ``data[0].embedding``.
    """
    body = json.dumps({"input": text}).encode()
    req = urllib.request.Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["data"][0]["embedding"]
|
||||
|
||||
# Step 0: Ensure Qdrant collection exists (768 dims).
# The curl output is captured and discarded, so whatever the server answers
# (including an error for an existing collection) does not stop the run.
subprocess.run(["curl", "-s", "-X", "PUT", f"{QDRANT_URL}/collections/{QDRANT_COL}",
                "-H", "Content-Type: application/json",
                "-d", '{"vectors":{"size":768,"distance":"Cosine"}}'], capture_output=True)

# Step 1: Get all story chunks that need summaries.
# Rows arrive as '|'-separated lines; lines without a separator are dropped.
lines = [row for row in psql(f"SELECT chunk_id, chunk_index, start_time, end_time, text_content FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story' AND (summary_text IS NULL OR summary_text = '') ORDER BY chunk_index").split('\n') if row.strip() and '|' in row]

print(f"Chunks to process: {len(lines)}")
total = len(lines)
errors = 0

for i, line in enumerate(lines):
    # maxsplit=4 keeps any '|' inside text_content in the last field.
    parts = line.split('|', 4)
    try:
        cid, idx, st, et = parts[0].strip(), int(parts[1]), float(parts[2]), float(parts[3])
    except (IndexError, ValueError):
        # e.g. the continuation line of a multi-line text_content value.
        print(f"[{i+1}/{total}] Skipping unparseable row: {line[:80]!r}")
        errors += 1
        continue
    dialogue = parts[4] if len(parts) > 4 else ""

    if len(dialogue) < 10:
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        try:
            summary = call_llm(dialogue)
            time.sleep(0.3)
            embedding = call_embed(summary)
        except Exception as e:
            # Do not persist a placeholder: the SELECT above only returns
            # chunks whose summary_text is NULL/empty, so leaving the row
            # untouched lets the next run retry it.
            print(f"[{i+1}/{total}] Error: {cid} - {e}")
            errors += 1
            continue

    # Update DB
    s_esc = summary.replace("'", "''")
    psql(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE chunk_id='{cid}'")

    # Store in Qdrant.
    # NOTE(review): story_pipeline_full.py stores dialogue vectors under
    # id idx+1 and summary vectors under 100000+idx+1 in a collection of the
    # same name — confirm that idx+1 is the intended id for summaries here.
    point = json.dumps({"points": [{"id": idx + 1, "vector": embedding,
                                    "payload": {"chunk_id": cid, "file_uuid": UUID, "start_time": st, "end_time": et,
                                                "summary": summary, "type": "story_summary"}
                                    }]}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}/collections/{QDRANT_COL}/points?wait=true",
                                 data=point, headers={"Content-Type": "application/json"}, method="PUT")
    try:
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        # Still best-effort, but no longer invisible.
        print(f"[{i+1}/{total}] Qdrant upsert failed: {cid} - {e}")
        errors += 1

    if (i+1) % 20 == 0:
        print(f"[{i+1}/{total}] {errors} errors so far")

print(f"\nDone. Processed: {total}, Errors: {errors}")
print(f"Qdrant: {QDRANT_COL}")
|
||||
230
scripts/story_pipeline_full.py
Normal file
230
scripts/story_pipeline_full.py
Normal file
@@ -0,0 +1,230 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Story Pipeline Full — Speaker + Story + Summary
|
||||
Step 1: Update sentence chunks with speaker name
|
||||
Step 2: Rebuild story chunks + re-embed
|
||||
Step 3: LLM summary × 228 + embed
|
||||
"""
|
||||
|
||||
import json, urllib.request, subprocess, sys, time, os
|
||||
|
||||
# File processed by this pipeline and the directory holding its ASR outputs.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DIR = "/Users/accusys/momentry/output_dev"

# Base psql command line; psql()/psql_file() append "-c <sql>" / "-f <path>".
PSQL = [
    "/Users/accusys/pgsql/18.3/bin/psql",
    "-U", "accusys",
    "-d", "momentry",
    "-t",
    "-A",
]

# HTTP endpoints: chat completion, embedding, and the Qdrant points URL
# (note this one already includes the collection path).
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
QDRANT_URL = "http://localhost:6333/collections/momentry_dev_stories/points"
|
||||
|
||||
def psql(sql):
    """Run one SQL statement through the psql CLI and return its stdout.

    Args:
        sql: SQL text handed to psql with ``-c``.

    Returns:
        psql's standard output with surrounding whitespace stripped. The
        output is returned even when psql exits non-zero so existing callers
        keep working; the failure is reported on stderr instead of being
        dropped silently.

    Raises:
        subprocess.TimeoutExpired: if psql runs longer than 30 seconds.
    """
    r = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        # Previously a failed statement was indistinguishable from an empty result.
        print(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}", file=sys.stderr)
    return r.stdout.strip()
|
||||
|
||||
def psql_file(path):
    """Execute a SQL script with psql and return psql's exit code.

    ON_ERROR_STOP is set so that a failing statement makes psql stop and exit
    non-zero. Without it psql keeps going after SQL errors and still exits 0,
    which made the returned code useless for detecting a failed script.

    Args:
        path: Path of the SQL file passed to psql with ``-f``.

    Returns:
        psql's exit code (0 on success).

    Raises:
        subprocess.TimeoutExpired: if psql runs longer than 60 seconds.
    """
    r = subprocess.run(PSQL + ["-v", "ON_ERROR_STOP=1", "-f", path],
                       capture_output=True, text=True, timeout=60)
    if r.returncode != 0 or (r.stderr and "ERROR" in r.stderr):
        print(f"SQL Error: {r.stderr[:200]}")
    return r.returncode
|
||||
|
||||
def embed_text(text):
    """Return the embedding vector for *text* from the endpoint at EMBED_URL.

    Only the first 1024 characters of *text* are sent.

    Args:
        text: String to embed.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        KeyError, IndexError: if the response has no ``data[0].embedding``.
    """
    body = json.dumps({"input": text[:1024]}).encode()
    req = urllib.request.Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the response deterministically; this is called once per chunk.
    with urllib.request.urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["data"][0]["embedding"]
|
||||
|
||||
def llm_summary(dialogue):
    """Ask the chat-completion endpoint at LLM_URL for a 50-word summary.

    Args:
        dialogue: Text to summarise; inserted verbatim into the prompt.

    Returns:
        The content of the first choice's message, stripped of whitespace.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        KeyError, IndexError: if the response lacks the expected
            ``choices[0].message.content`` structure.
    """
    body = json.dumps({
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": f"Summarize concisely:\n{dialogue}\n\n50-word summary:"}],
        "temperature": 0.1, "max_tokens": 100,
    }).encode()
    req = urllib.request.Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the response deterministically; this is called once per story chunk.
    with urllib.request.urlopen(req, timeout=120) as resp:
        payload = json.loads(resp.read())
    return payload["choices"][0]["message"]["content"].strip()
|
||||
|
||||
# Frame rate used to turn start/end seconds into frame numbers in Step 2.
# NOTE(review): hard-coded; confirm it matches the source video.
fps = 25.0
# Written to the file_id column of every story chunk inserted in Step 2.
FILE_ID = 242

# ═══════════════════════════════════════════════════
# Step 0: Load ASR + ASRX + speaker map
# ═══════════════════════════════════════════════════
print("=" * 60)
print("Step 0: Loading data...")
# Context managers so the JSON files are closed as soon as they are parsed.
with open(f"{DIR}/{UUID}.asr.json", encoding="utf-8") as f:
    asr = json.load(f)
segs = asr["segments"]
with open(f"{DIR}/{UUID}.asrx.json", encoding="utf-8") as f:
    asrx = json.load(f)
asrx_segs = asrx["segments"]

# Speaker map from identity_bindings: identity_value -> identity name.
r = psql("SELECT ib.identity_value, i.name FROM dev.identity_bindings ib JOIN dev.identities i ON i.id=ib.identity_id WHERE ib.identity_type='speaker'")
speaker_map = {}
for line in r.strip().split('\n'):
    if line.strip() and '|' in line:
        # Split once so a '|' inside the name column is kept in the name.
        p = line.split('|', 1)
        speaker_map[p[0].strip()] = p[1].strip()
# NOTE(review): this assignment is unconditional, so it also replaces any
# binding loaded above for SPEAKER_0 — confirm that is intended for a "fallback".
speaker_map["SPEAKER_0"] = "Speaker_0"  # Fallback for unbounded
|
||||
|
||||
# ═══════════════════════════════════════════════════
# Step 1: Update sentence chunks with speaker
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Step 1: Updating sentence chunks with speaker...")

sql = ["BEGIN;"]
chunk_meta = {}  # idx → {speaker_id, speaker_name}; reused by Step 2

for idx, seg in enumerate(segs):
    st, et = seg["start"], seg["end"]
    text = seg["text"].strip()
    if not text:
        continue

    # Speaker of the first ASRX segment that fully contains [st, et];
    # "SPEAKER_0" when no segment contains it.
    spk_id = "SPEAKER_0"
    for ax in asrx_segs:
        if ax.get("start_time", 0) <= st and ax.get("end_time", 0) >= et:
            spk_id = ax.get("speaker_id", "SPEAKER_0")
            break

    spk_name = speaker_map.get(spk_id, spk_id)
    new_text = f"[{spk_name}] {text}"
    meta = json.dumps({"speaker_id": spk_id, "speaker_name": spk_name})
    esc = new_text.replace("'", "''")
    # The JSON goes inside a single-quoted SQL literal too, so a name such as
    # O'Brien must have its quote doubled or the whole script fails to parse.
    meta_esc = meta.replace("'", "''")

    sql.append(f"UPDATE dev.chunks SET text_content='{esc}', metadata='{meta_esc}'::jsonb WHERE file_uuid='{UUID}' AND chunk_id='{UUID}_{idx}';")
    chunk_meta[idx] = {"speaker_id": spk_id, "speaker_name": spk_name}

sql.append("COMMIT;")
with open("/tmp/s1_speaker.sql", "w", encoding="utf-8") as f:
    f.write("\n".join(sql))

rc = psql_file("/tmp/s1_speaker.sql")
if rc != 0:
    print(f" WARNING: psql exited with code {rc}; sentence chunks may not have been updated")
else:
    print(f" Updated {len(chunk_meta)} sentence chunks with speaker")
|
||||
|
||||
# ═══════════════════════════════════════════════════
# Step 2: Rebuild story chunks + re-embed
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Step 2: Rebuilding story chunks...")

# Each story chunk groups this many consecutive ASR segments.
CHUNK_SIZE = 15

# Delete old story chunks inside the same transaction as the inserts, so a
# failed INSERT rolls the DELETE back instead of leaving the file with no
# story chunks at all.
sql2 = [
    "BEGIN;",
    f"DELETE FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story';",
]
story_meta = []  # one dict per story chunk; reused by Step 3

for i in range(0, len(segs), CHUNK_SIZE):
    group = segs[i:i+CHUNK_SIZE]
    st, et = group[0]["start"], group[-1]["end"]
    idx = i // CHUNK_SIZE
    chunk_id = f"{UUID}_story_{idx}"

    # Build speaker text from individual sentences. Only segments that got a
    # speaker in Step 1 (i.e. had non-empty text) contribute.
    texts = []
    speakers_used = {}
    for j, seg in enumerate(group):
        cm = chunk_meta.get(i + j)
        if cm is None:
            continue
        text = seg["text"].strip()
        if text:
            texts.append(f"[{cm['speaker_name']}] {text}")
            speakers_used[cm['speaker_name']] = speakers_used.get(cm['speaker_name'], 0) + 1

    dialogue = " ".join(texts)
    child_ids = ", ".join([f"'{UUID}_{j}'" for j in range(i, min(i+CHUNK_SIZE, len(segs)))])
    words = sum(len(t.split()) for t in texts)

    meta = json.dumps({"method": "fixed_15", "seg_count": len(group), "words": words, "speakers": speakers_used})
    esc = dialogue.replace("'", "''")
    # Speaker names end up inside a single-quoted SQL literal; double any quotes.
    meta_esc = meta.replace("'", "''")

    sql2.append(f"""INSERT INTO dev.chunks (file_id,file_uuid,chunk_id,old_chunk_id,chunk_index,chunk_type,start_time,end_time,fps,start_frame,end_frame,text_content,content,metadata,frame_count,child_chunk_ids)
VALUES ({FILE_ID},'{UUID}','{chunk_id}','{chunk_id}',{idx},'story',{st},{et},{fps},{int(st*fps)},{int(et*fps)},'{esc}','{{"type":"story_parent"}}'::jsonb,'{meta_esc}'::jsonb,{int((et-st)*fps)},ARRAY[{child_ids}]);""")

    story_meta.append({"idx": idx, "st": st, "et": et, "dialogue": dialogue, "words": words, "speakers": speakers_used})

sql2.append("COMMIT;")
with open("/tmp/s2_story.sql", "w", encoding="utf-8") as f:
    f.write("\n".join(sql2))
rc = psql_file("/tmp/s2_story.sql")
if rc != 0:
    print(f" WARNING: psql exited with code {rc}; story chunks may not have been rebuilt")
else:
    print(f" Created {len(story_meta)} story chunks")

# Embed + upsert to Qdrant
print("\n Embedding story chunks...")
points_dialogue = []
for sm in story_meta:
    # Chunks with (almost) no dialogue get no vector.
    if len(sm["dialogue"]) < 10:
        continue
    vec = embed_text(sm["dialogue"])
    points_dialogue.append({"id": sm["idx"] + 1, "vector": vec, "payload": {
        "chunk_id": f"{UUID}_story_{sm['idx']}", "file_uuid": UUID,
        "start_time": sm["st"], "end_time": sm["et"], "type": "story_dialogue"
    }})

for i in range(0, len(points_dialogue), 100):
    batch = points_dialogue[i:i+100]
    data = json.dumps({"points": batch, "wait": True}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}?wait=true", data=data, headers={"Content-Type": "application/json"}, method="PUT")
    # Close each response instead of leaking one connection per batch.
    with urllib.request.urlopen(req, timeout=30):
        pass
print(f" Qdrant: {len(points_dialogue)} dialogue vectors")
|
||||
|
||||
# ═══════════════════════════════════════════════════
# Step 3: LLM summaries + embed
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Step 3: LLM summaries...")

points_summary = []
summary_sql = ["BEGIN;"]
failed = 0

for i, sm in enumerate(story_meta):
    if len(sm["dialogue"]) < 10:
        continue

    try:
        summary = llm_summary(sm["dialogue"])
        time.sleep(0.3)
        vec = embed_text(summary)
        time.sleep(0.1)
    except Exception as e:
        # Do not persist an "[error]" summary or a zero vector: they would be
        # stored like real results and counted as summaries in the final
        # verification. The chunk keeps an empty summary_text instead.
        print(f" Error on story {sm['idx']}: {e}")
        failed += 1
    else:
        s_esc = summary.replace("'", "''")
        summary_sql.append(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE file_uuid='{UUID}' AND chunk_id='{UUID}_story_{sm['idx']}';")

        # Summary points use ids offset by 100000 so they do not overwrite
        # the dialogue points (id idx+1) written to the same URL in Step 2.
        points_summary.append({"id": 100000 + sm["idx"] + 1, "vector": vec, "payload": {
            "chunk_id": f"{UUID}_story_{sm['idx']}", "file_uuid": UUID,
            "start_time": sm["st"], "end_time": sm["et"],
            "summary": summary, "type": "story_summary"
        }})

    if (i + 1) % 50 == 0:
        print(f" {i+1}/{len(story_meta)}")

if failed:
    print(f" {failed} story chunks failed and were left without a summary")

# Update DB with summaries
summary_sql.append("COMMIT;")
with open("/tmp/s3_summary.sql", "w", encoding="utf-8") as f:
    f.write("\n".join(summary_sql))
rc = psql_file("/tmp/s3_summary.sql")
if rc != 0:
    print(f" WARNING: psql exited with code {rc}; summaries may not have been saved")

# Upsert summary vectors to Qdrant
for i in range(0, len(points_summary), 100):
    batch = points_summary[i:i+100]
    data = json.dumps({"points": batch, "wait": True}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}?wait=true", data=data, headers={"Content-Type": "application/json"}, method="PUT")
    # Close each response instead of leaking one connection per batch.
    with urllib.request.urlopen(req, timeout=30):
        pass

print(f" Qdrant: {len(points_summary)} summary vectors")
|
||||
|
||||
# ═══════════════════════════════════════════════════
# Step 4: Verify
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Done.")

# (report label, chunk_type, extra WHERE condition) for each count printed.
_checks = [
    ("Sentence chunks with speaker", "sentence", " AND text_content LIKE '[%'"),
    ("Story chunks", "story", ""),
    ("Story chunks with summary", "story", " AND summary_text IS NOT NULL"),
]

# Run all three counts first, then print them, in the order listed above.
_counts = [
    psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='{ctype}'{extra}")
    for _label, ctype, extra in _checks
]
for (label, _ctype, _extra), count in zip(_checks, _counts):
    print(f"{label}: {count}")
|
||||
74
scripts/test_asr_large_model.py
Normal file
74
scripts/test_asr_large_model.py
Normal file
@@ -0,0 +1,74 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Compare ASR small vs large-v3 on a short test clip.
|
||||
"""
|
||||
import json, time, sys, os
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
CLIP = "/tmp/charade_test_clip.mp4"
# Added to every segment timestamp below.
# NOTE(review): presumably the clip's start offset (seconds) in the full movie — confirm.
CLIP_OFFSET_SECS = 1540

models = {
    "small": {"size": "small", "device": "cpu", "compute": "int8"},
    "large-v3": {"size": "large-v3", "device": "cpu", "compute": "int8"},
}


def result_path(name):
    """Return the result/cache file path for the model called *name*.

    Single source of truth for the file name: results used to be written to
    ``asr_large-v3_result.json`` but read back from ``asr_large_v3_result.json``,
    so the comparison always failed with FileNotFoundError.
    """
    return f"/tmp/asr_{name}_result.json"


def load_result(name):
    """Load and return the saved result dict for the model called *name*."""
    with open(result_path(name), encoding="utf-8") as fh:
        return json.load(fh)


for name, cfg in models.items():
    outfile = result_path(name)
    # An existing result file acts as a cache; delete it to force a re-run.
    if os.path.exists(outfile):
        print(f"{name}: already done, skip")
        continue

    print(f"\n=== Loading {name} model ===")
    t0 = time.time()
    model = WhisperModel(cfg["size"], device=cfg["device"], compute_type=cfg["compute"])
    print(f" Loaded in {time.time()-t0:.1f}s")

    print(f" Transcribing...")
    t0 = time.time()
    segments, info = model.transcribe(CLIP, beam_size=5, vad_filter=True,
                                      vad_parameters={"min_silence_duration_ms": 500})
    # Iterating `segments` happens inside the timed section, so `elapsed`
    # covers everything up to the last collected segment.
    segs = []
    for seg in segments:
        segs.append({"start": round(seg.start + CLIP_OFFSET_SECS, 2),
                     "end": round(seg.end + CLIP_OFFSET_SECS, 2),
                     "text": seg.text.strip()})
    elapsed = time.time() - t0

    result = {
        "model": name,
        "language": info.language,
        "segments": segs,
        "segment_count": len(segs),
        "duration_secs": round(elapsed, 1),
    }
    with open(outfile, "w", encoding="utf-8") as fh:
        json.dump(result, fh, indent=2, ensure_ascii=False)
    print(f" Done: {len(segs)} segs in {elapsed:.1f}s")
    del model  # free memory

print("\n=== Comparison ===")
for name in models:
    r = load_result(name)
    print(f"{name}: {r['segment_count']} segs, {r['duration_secs']}s runtime")

# Show differences
small = load_result("small")["segments"]
large = load_result("large-v3")["segments"]

small_texts = set(s["text"] for s in small)
large_texts = set(s["text"] for s in large)

only_small = small_texts - large_texts
only_large = large_texts - small_texts

print(f"\nTexts only in small: {len(only_small)}")
for t in sorted(only_small)[:10]:
    print(f" SMALL: \"{t}\"")

print(f"\nTexts only in large: {len(only_large)}")
for t in sorted(only_large)[:10]:
    print(f" LARGE: \"{t}\"")

# Compare segment boundaries
print(f"\nSegment time differences (large has more/fewer):")
print(f" Small: {len(small)} segments")
print(f" Large: {len(large)} segments")
print(f" Diff: {len(large) - len(small)}")
|
||||
81
scripts/update_fine_speakers.py
Normal file
81
scripts/update_fine_speakers.py
Normal file
@@ -0,0 +1,81 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Update DB sentence chunks with fine-grained ASRX speaker assignments.
|
||||
Each ASR segment gets the majority speaker_name from overlapping fine segments.
|
||||
"""
|
||||
import json, psycopg2
|
||||
from collections import Counter
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"

print("=== Step 1: Load fine ASRX ===")
with open(f"{BASE}/{UUID}.asrx_fine.json", encoding="utf-8") as f:
    fine = json.load(f)
fine_segs = fine["segments"]
print(f"Fine segments: {len(fine_segs)}")

print("\n=== Step 2: Load existing sentence chunks ===")
conn = psycopg2.connect(DB_URL)
# try/finally so the connection is closed (and uncommitted updates are
# discarded) if anything below raises.
try:
    cur = conn.cursor()
    cur.execute("""
SELECT id, chunk_index, start_time, end_time, metadata
FROM dev.chunks
WHERE file_uuid=%s AND chunk_type='sentence'
ORDER BY chunk_index
""", (UUID,))
    chunks = cur.fetchall()
    print(f"DB sentence chunks: {len(chunks)}")

    # For each chunk, find overlapping fine segments
    print("\n=== Step 3: Update speaker assignments ===")
    updated = 0
    for db_id, idx, st, et, meta in chunks:
        # metadata may arrive as a dict, a JSON string, or NULL.
        if isinstance(meta, str):
            try:
                meta = json.loads(meta)
            except ValueError:
                meta = {}
        if not isinstance(meta, dict):
            # NULL or a non-object JSON value: start from empty metadata.
            meta = {}

        # Fine segments whose time range intersects [st, et).
        overlapping = [s for s in fine_segs if s["start_time"] < et and s["end_time"] > st]

        if overlapping:
            # Majority vote by number of overlapping segments (not duration).
            names = Counter(s["speaker_name"] for s in overlapping)
            ids = Counter(s["speaker_id"] for s in overlapping)
            best_name = names.most_common(1)[0][0]
            best_id = ids.most_common(1)[0][0]

            meta["speaker_name"] = best_name
            meta["speaker_id"] = best_id
            meta["fine_speaker_name"] = best_name
            meta["fine_speaker_id"] = best_id
            meta["fine_details"] = dict(names)
        else:
            # No fine segment overlaps: carry the existing speaker over.
            meta["fine_speaker_name"] = meta.get("speaker_name", "Unknown")
            meta["fine_speaker_id"] = meta.get("speaker_id", "Unknown")

        cur.execute("""
UPDATE dev.chunks SET metadata=%s::jsonb, updated_at=NOW()
WHERE id=%s
""", (json.dumps(meta), db_id))
        updated += 1

    conn.commit()
    print(f"Updated {updated} chunks")

    # Verify distribution
    cur.execute("""
SELECT metadata->>'fine_speaker_name', COUNT(*)
FROM dev.chunks
WHERE file_uuid=%s AND chunk_type='sentence'
GROUP BY 1 ORDER BY 2 DESC
""", (UUID,))
    print("\nNew speaker distribution:")
    for name, cnt in cur.fetchall():
        print(f" {name}: {cnt}")
finally:
    conn.close()

print("\n=== Done ===")
|
||||
192
scripts/update_speaker_assignments.py
Normal file
192
scripts/update_speaker_assignments.py
Normal file
@@ -0,0 +1,192 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Update sentence chunk metadata with new ASRX speaker_id and speaker_name.
|
||||
Also update Qdrant momentry_dev_v1 and momentry_dev_voice collections.
|
||||
"""
|
||||
|
||||
import json, sys, time
|
||||
import psycopg2
|
||||
import numpy as np
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
ASRX_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.asrx.json"
SPEAKER_MAP_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map_v2.json"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"

print("=== Loading data ===")
# Context managers so both JSON files are closed once parsed.
with open(ASRX_PATH, encoding="utf-8") as f:
    asrx = json.load(f)
segs = asrx["segments"]
# Missing "embeddings" key yields an empty list rather than an error.
embeddings = asrx.get("embeddings", [])
with open(SPEAKER_MAP_PATH, encoding="utf-8") as f:
    speaker_map = json.load(f)

# Both keys are required; a missing one raises KeyError here.
assignments = speaker_map["assignments"]
speaker_identity = speaker_map["speaker_identity"]

print(f"Loaded {len(segs)} segments, {len(embeddings)} embeddings")
|
||||
|
||||
print("\n=== Step 1: Update DB chunks with new speaker info ===")
# conn and cur stay open; the later steps of this script reuse them.
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()

# Get existing chunks
cur.execute("""
SELECT id, chunk_index, metadata
FROM dev.chunks
WHERE file_uuid = %s AND chunk_type = 'sentence'
ORDER BY chunk_index
""", (UUID,))
sentence_rows = cur.fetchall()
print(f"Found {len(sentence_rows)} DB sentence chunks")

updated = 0
for db_id, chunk_idx, stored_meta in sentence_rows:
    # chunk_index is used as a position into the assignments list.
    if chunk_idx >= len(assignments):
        print(f"WARNING: chunk_idx {chunk_idx} out of range for assignments ({len(assignments)})")
        continue

    assignment = assignments[chunk_idx]
    new_sid, new_name = assignment["speaker_id"], assignment["speaker_name"]

    # Preserve old metadata but update speaker fields
    if stored_meta is None:
        metadata = {}
    elif isinstance(stored_meta, str):
        metadata = json.loads(stored_meta)
    else:
        metadata = stored_meta

    # speaker_id / speaker_name themselves are left untouched; the new values
    # go under new_* and the current ones are copied to old_*.
    metadata.update({
        "new_speaker_id": new_sid,
        "new_speaker_name": new_name,
        "old_speaker_id": metadata.get("speaker_id", ""),
        "old_speaker_name": metadata.get("speaker_name", ""),
    })

    # Update
    cur.execute("""
UPDATE dev.chunks
SET metadata = %s::jsonb, updated_at = NOW()
WHERE id = %s
""", (json.dumps(metadata), db_id))
    updated += 1

conn.commit()
print(f"Updated {updated} DB chunks")
|
||||
|
||||
# Also update story chunks with new aggregated speaker info
print("\n=== Step 2: Update story chunk aggregates ===")
cur.execute("""
SELECT id, chunk_index, metadata, child_chunk_ids
FROM dev.chunks
WHERE file_uuid = %s AND chunk_type = 'story'
ORDER BY chunk_index
""", (UUID,))
stories = cur.fetchall()
print(f"Found {len(stories)} story chunks")

# Get all sentence chunks with their new speaker info
cur.execute("""
SELECT chunk_index, metadata->>'new_speaker_name' as speaker_name
FROM dev.chunks
WHERE file_uuid = %s AND chunk_type = 'sentence'
ORDER BY chunk_index
""", (UUID,))
sentences = cur.fetchall()
# chunk_index -> new speaker name (None when the key is absent).
sent_names = {s[0]: s[1] for s in sentences}

for row in stories:
    db_id, idx, meta, child_ids = row
    if meta is None:
        meta = {}
    elif isinstance(meta, str):
        meta = json.loads(meta)

    if child_ids:
        # Aggregate speaker info from child chunks
        speaker_counts = {}
        for cid in child_ids:
            # Parse chunk_index from child chunk_id (text after the last "_").
            try:
                child_idx = int(cid.split("_")[-1])
            except ValueError:
                # Child id without a numeric suffix: cannot be matched to a
                # sentence chunk, so skip it instead of aborting the run.
                continue
            if child_idx in sent_names:
                # Sentences without new_speaker_name come back as None; count
                # them as "Unknown" (the same fallback used for
                # primary_speaker) rather than under a null JSON key.
                name = sent_names[child_idx] or "Unknown"
                speaker_counts[name] = speaker_counts.get(name, 0) + 1

        meta["speaker_breakdown"] = speaker_counts
        primary = max(speaker_counts, key=speaker_counts.get) if speaker_counts else "Unknown"
        meta["primary_speaker"] = primary
        meta["speaker_count"] = len(speaker_counts)

    meta_json = json.dumps(meta)
    cur.execute("""
UPDATE dev.chunks
SET metadata = %s::jsonb, updated_at = NOW()
WHERE id = %s
""", (meta_json, db_id))

conn.commit()
print(f"Updated {len(stories)} story chunks")
|
||||
|
||||
print("\n=== Step 3: Update Qdrant momentry_dev_voice ===")
import urllib.request

# Pair each assignment with the embedding at the same position. Pairing up
# front means a short (or missing) embeddings list is detected BEFORE the
# existing collection is deleted, instead of raising IndexError afterwards.
voice_pairs = list(zip(assignments, embeddings))
if len(embeddings) != len(assignments):
    print(f"WARNING: {len(assignments)} assignments but {len(embeddings)} embeddings; "
          f"uploading the first {len(voice_pairs)} pairs")

if not voice_pairs:
    print("No voice embeddings to upload; leaving momentry_dev_voice untouched")
else:
    # Delete old voice collection and recreate
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_voice", method="DELETE")
    try:
        with urlopen(req):
            pass
        print("Deleted old momentry_dev_voice collection")
    except Exception:
        # Best-effort, as before, but no longer swallowing KeyboardInterrupt.
        print("Could not delete or doesn't exist")

    time.sleep(0.5)

    # Create collection
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_voice",
                  data=json.dumps({"vectors": {"size": 192, "distance": "Cosine"}}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        with urlopen(req):
            pass
        print("Created momentry_dev_voice collection (192D)")
    except Exception as e:
        print(f"Create collection error: {e}")

    # Upload in batches
    batch_size = 100
    total_uploaded = 0
    for start in range(0, len(voice_pairs), batch_size):
        points = []
        for idx, (a, emb) in enumerate(voice_pairs[start:start+batch_size], start=start):
            points.append({
                "id": idx + 1,  # point id is the segment position plus one
                "vector": emb,
                "payload": {
                    "file_uuid": UUID,
                    "speaker_id": a["speaker_id"],
                    "speaker_name": a["speaker_name"],
                    "start_time": a["start_time"],
                    "end_time": a["end_time"],
                    "segment_index": idx,
                }
            })

        req = Request(f"{QDRANT_URL}/collections/momentry_dev_voice/points?wait=true",
                      data=json.dumps({"points": points}).encode(),
                      headers={"Content-Type": "application/json"}, method="PUT")
        try:
            with urlopen(req):
                pass
            total_uploaded += len(points)
        except Exception as e:
            print(f" Batch {start} error: {e}")

        if (start // batch_size) % 5 == 0:
            print(f" Uploaded {total_uploaded}/{len(voice_pairs)} voice embeddings")

    print(f"\nUploaded {total_uploaded} voice embeddings to momentry_dev_voice")

cur.close()
conn.close()
print("\n=== Done ===")
|
||||
139
scripts/vectorize_4188.py
Normal file
139
scripts/vectorize_4188.py
Normal file
@@ -0,0 +1,139 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Vectorize 4188 sentence chunks via EmbeddingGemma (768D) + rebuild Qdrant collections.
|
||||
"""
|
||||
import json, sys, time
|
||||
from urllib.request import Request, urlopen
|
||||
import psycopg2
|
||||
import urllib.request
|
||||
|
||||
# File whose sentence chunks are vectorised.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"

# Service endpoints: PostgreSQL, Qdrant, and the embedding server.
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
EMBED_URL = "http://localhost:11436/v1/embeddings"

# Every collection listed here is deleted, recreated and refilled below.
COLLECTIONS = [
    "momentry_dev_v1",
    "sentence_story",
    "sentence_summary",
]
|
||||
|
||||
def call_embed(text):
    """Return the embedding vector for *text* from the endpoint at EMBED_URL.

    Args:
        text: String sent as the ``input`` field of the request.

    Returns:
        The ``embedding`` value of the first item in the response's ``data``.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        KeyError, IndexError: if the response has no ``data[0].embedding``.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the response deterministically; the fallback path calls this per text.
    with urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["data"][0]["embedding"]
|
||||
|
||||
print("=== Step 1: Load chunks ===")
conn = psycopg2.connect(DB_URL)
# try/finally so the connection is released even if the query fails.
try:
    cur = conn.cursor()
    # Row layout relied on below: 0 chunk_index, 1 chunk_id, 2 text_content,
    # 3 speaker_name, 4 start_time, 5 end_time, 6 speaker_id.
    cur.execute("""
SELECT chunk_index, chunk_id, text_content, metadata->>'speaker_name',
start_time, end_time, metadata->>'speaker_id'
FROM dev.chunks
WHERE file_uuid=%s AND chunk_type='sentence'
ORDER BY chunk_index
""", (UUID,))
    chunks = cur.fetchall()
finally:
    conn.close()
print(f"Loaded {len(chunks)} chunks")
|
||||
|
||||
print("\n=== Step 2: Vectorize (EmbeddingGemma 768D) ===")
# Generate cleaned text for embedding: "Speaker: text" format
texts_for_embed = []
for r in chunks:
    spk = r[3] or "Unknown"
    txt = r[2] or ""
    # Remove [Speaker] prefix if present
    if txt.startswith("["):
        txt = txt.split("]", 1)[-1].strip()
    texts_for_embed.append(f"{spk}: \"{txt}\"")

t0 = time.time()
embeddings = []
batch_size = 50
for start in range(0, len(texts_for_embed), batch_size):
    batch = texts_for_embed[start:start+batch_size]
    # Try batch embed
    body = json.dumps({"input": batch}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        with urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read())["data"]
        batch_embs = [d["embedding"] for d in data]
        # A short response would silently shift every later vector onto the
        # wrong chunk, so treat it as a failed batch.
        if len(batch_embs) != len(batch):
            raise ValueError(f"expected {len(batch)} embeddings, got {len(batch_embs)}")
    except Exception as e:
        # Fallback to single (Exception, not bare except, so Ctrl-C still works)
        print(f" Batch at {start} failed ({e}); embedding one by one")
        batch_embs = [call_embed(t) for t in batch]
    embeddings.extend(batch_embs)

    if (start // batch_size) % 10 == 0:
        pct = (start + len(batch)) * 100 // len(texts_for_embed)
        print(f" {start+len(batch)}/{len(texts_for_embed)} ({pct}%) [{time.time()-t0:.0f}s]")

elapsed = time.time() - t0
# Guard the average: with no chunks the division used to raise ZeroDivisionError.
per_item = elapsed / len(embeddings) if embeddings else 0.0
print(f" Done: {len(embeddings)} embeddings in {elapsed:.1f}s ({per_item:.2f}s each)")
|
||||
|
||||
print("\n=== Step 3: Rebuild Qdrant collections ===")

for col in COLLECTIONS:
    # Delete. Best-effort: a failure here is reported and the create below is
    # still attempted.
    req = Request(f"{QDRANT_URL}/collections/{col}", method="DELETE")
    try:
        with urlopen(req):
            pass
        time.sleep(0.3)
    except Exception as e:
        print(f" {col}: delete skipped ({e})")

    # Create. Not wrapped: a failure to create the collection aborts the run.
    req = Request(f"{QDRANT_URL}/collections/{col}",
                  data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    with urlopen(req):
        pass
    time.sleep(0.3)
    print(f" Created {col}")
|
||||
|
||||
# Upload
print("\n=== Step 4: Upload points ===")
batch_size = 100

# Payload key that carries the chunk text in each collection.
_TEXT_KEY = {
    "momentry_dev_v1": "text",
    "sentence_story": "text",
    "sentence_summary": "summary",
}

for col in COLLECTIONS:
    text_key = _TEXT_KEY.get(col)
    points = []
    for pos, row in enumerate(chunks):
        chunk_index, chunk_id, text_content, speaker_name, start_time, end_time, speaker_id = row

        payload = {
            "chunk_type": "sentence", "uuid": UUID,
            "chunk_id": chunk_id, "start_time": start_time, "end_time": end_time,
            "speaker_name": speaker_name or "Unknown", "speaker_id": speaker_id or "Unknown",
        }
        if text_key is not None:
            payload[text_key] = text_content or ""

        # Vector is looked up by row position; the point id comes from chunk_index.
        points.append({
            "id": chunk_index + 1,
            "vector": embeddings[pos],
            "payload": payload,
        })

    for start in range(0, len(points), batch_size):
        batch = points[start:start+batch_size]
        req = Request(f"{QDRANT_URL}/collections/{col}/points?wait=true",
                      data=json.dumps({"points": batch}).encode(),
                      headers={"Content-Type": "application/json"}, method="PUT")
        try:
            urlopen(req)
        except Exception as e:
            # A failed batch is reported and the upload carries on.
            print(f" {col} batch {start}: {e}")
        if (start // batch_size) % 5 == 0:
            print(f" {col}: {start+len(batch)}/{len(points)}")
    print(f" {col}: done")
|
||||
|
||||
# Verify
print("\n=== Verify ===")
for col in COLLECTIONS:
    # Read back the collection description and report point count + vector size.
    raw = urlopen(f"{QDRANT_URL}/collections/{col}").read()
    info = json.loads(raw)["result"]
    vectors_cfg = info['config']['params']['vectors']
    print(f" {col}: {info['points_count']} pts, {vectors_cfg.get('size','?')}D")

print("\n=== Done ===")
|
||||
573
scripts/vision_agent.py
Normal file
573
scripts/vision_agent.py
Normal file
@@ -0,0 +1,573 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Momentry Eye — Multi-model vision detection agent
|
||||
Models: grounding-dino (default), paligemma
|
||||
Usage:
|
||||
python3 scripts/vision_agent.py
|
||||
curl localhost:5052/health
|
||||
curl localhost:5052/detect -d '{"time":5461,"prompt":"gun","model":"grounding-dino"}'
|
||||
curl localhost:5052/search -d '{"query":"find the gun","model":"paligemma"}'
|
||||
"""
|
||||
import json, os, sys, time, cv2, torch, re, psycopg2, threading
|
||||
from PIL import Image, ImageDraw
|
||||
from flask import Flask, request, jsonify, send_file
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
|
||||
BASE_DIR = "/Users/accusys/momentry/output_dev"
|
||||
SHOTS_DIR = os.path.join(BASE_DIR, "vision_shots")
|
||||
os.makedirs(SHOTS_DIR, exist_ok=True)
|
||||
PORT = int(os.environ.get("VISION_AGENT_PORT", 5052))
|
||||
|
||||
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
|
||||
|
||||
VIDEO_PATHS = {
|
||||
"aeed71342a899fe4b4c57b7d41bcb692":
|
||||
"/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4",
|
||||
}
|
||||
|
||||
# ======================== Model Registry ========================
|
||||
MODELS = {} # name -> {"model": obj, "processor": obj, "info": dict}
|
||||
|
||||
def load_gdino():
    """Load the Grounding DINO base checkpoint and build its registry entry.

    Returns:
        dict with "model" (moved to DEVICE), "processor" and a static
        "info" description of the model.
    """
    from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

    checkpoint = "IDEA-Research/grounding-dino-base"
    print("[GDINO] Loading...")
    started = time.time()
    processor = AutoProcessor.from_pretrained(checkpoint)
    detector = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint).to(DEVICE)
    print(f"[GDINO] Loaded in {time.time()-started:.1f}s")

    info = {
        "name": "grounding-dino", "params_m": 232, "size_mb": 891,
        "resolution": 384, "has_confidence": True,
        "license": "Apache 2.0",
    }
    return {"model": detector, "processor": processor, "info": info}
|
||||
|
||||
def load_paligemma():
    """Load the PaliGemma 3B mix-224 checkpoint and build its registry entry.

    The weights are requested in bfloat16 and the model is moved to DEVICE.

    Returns:
        dict with "model", "processor" and a static "info" description.
    """
    from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

    checkpoint = "google/paligemma-3b-mix-224"
    print("[PaliGemma] Loading...")
    started = time.time()
    processor = AutoProcessor.from_pretrained(checkpoint)
    generator = PaliGemmaForConditionalGeneration.from_pretrained(
        checkpoint, dtype=torch.bfloat16
    )
    generator = generator.to(DEVICE)
    print(f"[PaliGemma] Loaded in {time.time()-started:.1f}s")

    info = {
        "name": "paligemma", "params_m": 2923, "size_mb": 3000,
        "resolution": 224, "has_confidence": False,
        "license": "Gemma license",
    }
    return {"model": generator, "processor": processor, "info": info}
|
||||
|
||||
MODEL_REGISTRY = {
|
||||
"grounding-dino": load_gdino,
|
||||
"paligemma": load_paligemma,
|
||||
}
|
||||
|
||||
def get_model(name):
    """Return the registry entry for *name*, loading and caching it on first use.

    Returns None when *name* has no loader in MODEL_REGISTRY.
    """
    if name in MODELS:
        return MODELS[name]
    loader = MODEL_REGISTRY.get(name)
    if loader is None:
        return None
    MODELS[name] = loader()
    return MODELS[name]
|
||||
|
||||
# ======================== Inference ========================
|
||||
def infer_gdino(img, prompt, threshold=0.1):
    """Run Grounding DINO on one PIL image.

    The text query sent to the model is the prompt followed by a period.

    Returns:
        list of {"bbox": [..4 floats..], "score": float, "label": prompt},
        one per detection at or above *threshold*.
    """
    entry = get_model("grounding-dino")
    processor, detector = entry["processor"], entry["model"]

    inputs = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = detector(**inputs)

    # PIL reports (width, height); the post-processor is given the reverse.
    dets = processor.post_process_grounded_object_detection(
        outputs, threshold=threshold, target_sizes=[img.size[::-1]])[0]

    return [
        {
            "bbox": [round(v, 1) for v in box.tolist()],
            "score": round(score.item(), 3),
            "label": prompt,
        }
        for box, score in zip(dets["boxes"], dets["scores"])
    ]
|
||||
|
||||
def infer_paligemma(img, prompt, threshold=0.1):
    """Run PaliGemma's "detect" task on one PIL image.

    Args:
        img: PIL image; its size is used to scale boxes to pixels.
        prompt: object name, sent to the model as "detect {prompt}".
        threshold: accepted for signature parity with infer_gdino; unused
            here because the decoded output carries no confidence value.

    Returns:
        list of {"bbox": [x1, y1, x2, y2], "score": 1.0, "label": prompt}
        in pixel coordinates; empty when fewer than four <loc> tokens are
        decoded.
    """
    m = get_model("paligemma")
    inputs = m["processor"](text=f"detect {prompt}", images=img, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = m["model"].generate(**inputs, max_new_tokens=100)
    decoded = m["processor"].decode(outputs[0], skip_special_tokens=True)

    # Output format: <locYYYY><locXXXX><locYYYY><locXXXX> label [; ...]
    locs = [int(v) for v in re.findall(r'<loc(\d+)>', decoded)]
    img_w, img_h = img.size

    results = []
    # Consume complete groups of four tokens; a trailing partial group is ignored.
    for i in range(0, len(locs) - 3, 4):
        # PaliGemma emits each box as y_min, x_min, y_max, x_max on a
        # 1024-bin grid, so y comes first — not x.
        y_min, x_min, y_max, x_max = locs[i:i + 4]
        x1 = x_min / 1024 * img_w
        y1 = y_min / 1024 * img_h
        x2 = x_max / 1024 * img_w
        y2 = y_max / 1024 * img_h
        results.append({
            "bbox": [round(x1, 1), round(y1, 1), round(x2, 1), round(y2, 1)],
            "score": 1.0,  # placeholder: the model gives no confidence
            "label": prompt,
        })
    return results
|
||||
|
||||
INFERENCE = {
|
||||
"grounding-dino": infer_gdino,
|
||||
"paligemma": infer_paligemma,
|
||||
}
|
||||
|
||||
# ======================== Utilities ========================
|
||||
def find_video(uuid):
    """Return the video path for *uuid*, or None if nothing is found.

    Lookup order: the VIDEO_PATHS cache, then any "Charade*" video under
    the demo directory, then any video whose name contains the first eight
    characters of *uuid*. A hit from either glob is cached in VIDEO_PATHS.

    NOTE(review): the "Charade*" fallback runs for every uuid, so an unknown
    uuid resolves to (and is cached as) the Charade file — confirm this is
    intended outside the demo.
    """
    import glob

    if uuid in VIDEO_PATHS:
        return VIDEO_PATHS[uuid]

    base = "/Users/accusys/momentry/var/sftpgo/data/demo"
    video_exts = (".mp4", ".mov", ".avi")

    def first_video(pattern):
        # First glob match that carries a known video extension, else None.
        for candidate in glob.glob(pattern, recursive=True):
            if candidate.endswith(video_exts):
                return candidate
        return None

    found = first_video(f"{base}/**/Charade*")
    if found is None:
        found = first_video(f"{base}/**/*{uuid[:8]}*")
    if found is None:
        return None
    VIDEO_PATHS[uuid] = found
    return found
|
||||
|
||||
def parse_query(query):
    """Reduce a natural-language request to the bare object name.

    Lower-cases the query, collapses whitespace, strips leading command
    words ("find ", "where is ", ...) and articles, then strips trailing
    punctuation and filler phrases (" in the image", " please", ...).
    Trailing fillers are removed repeatedly so stacked ones such as
    "gun in hand please" reduce fully, whatever their order.

    Args:
        query: free-text request string.

    Returns:
        The remaining object phrase; may be empty.
    """
    prefixes = ("find ", "show ", "search ", "where is ", "where are ",
                "looking for ", "detect ", "locate ", "spot ", "scan for ")
    articles = ("a ", "an ", "the ", "some ", "any ")
    fillers = (" in the image", " in this scene", " in the picture",
               " being held", " in hand", " in frame", " please")
    trailing = ".?!, "

    # Collapse whitespace so doubled spaces cannot defeat the prefix/article
    # matching below.
    query = " ".join(query.lower().split())

    # One pass each, in list order: repeating these could eat a real object
    # name such as "spot light".
    for p in prefixes:
        if query.startswith(p):
            query = query[len(p):]
    for a in articles:
        if query.startswith(a):
            query = query[len(a):]

    query = query.rstrip(trailing)
    stripped = True
    while stripped:
        stripped = False
        for s in fillers:
            if query.endswith(s):
                # Also drop punctuation the filler was hiding ("gun, please").
                query = query[:-len(s)].rstrip(trailing)
                stripped = True
    return query.strip()
|
||||
|
||||
def resolve_target(target_str):
    """Resolve a "file_uuid:identifier" target to (uuid, start_time, end_time).

    Lookups against dev.chunks, tried in order:
      1. chunk_id equal to the identifier;
      2. if the identifier is all digits, chunk_id equal to "{uuid}_{identifier}";
      3. trace chunks whose chunk_id ends in "_trace_<id>" (a leading
         "trace_" on the identifier is dropped), using the MIN start and
         MAX end over the matches.

    Returns:
        (uuid, float start, float end), or None when the target is empty,
        has no ":", matches nothing, or the matched times are NULL.
    """
    if not target_str or ":" not in target_str:
        return None
    uuid, identifier = target_str.split(":", 1)

    by_chunk_id = ("SELECT start_time, end_time FROM dev.chunks "
                   "WHERE file_uuid=%s AND chunk_id=%s LIMIT 1")

    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        cur.execute(by_chunk_id, (uuid, identifier))
        row = cur.fetchone()

        if not row and identifier.isdigit():
            cur.execute(by_chunk_id, (uuid, f"{uuid}_{identifier}"))
            row = cur.fetchone()

        if not row:
            tid = identifier.replace("trace_", "")
            cur.execute(
                "SELECT MIN(start_time), MAX(end_time) FROM dev.chunks "
                "WHERE file_uuid=%s AND chunk_type='trace' AND chunk_id LIKE %s",
                (uuid, f"%_trace_{tid}"))
            row = cur.fetchone()
        cur.close()
    finally:
        # Always release the connection, including when a query raises.
        conn.close()

    # The aggregate query yields a (None, None) row when nothing matched.
    if row and row[0] is not None and row[1] is not None:
        return (uuid, float(row[0]), float(row[1]))
    return None
|
||||
|
||||
def register_resource(resource_id, name, info):
    """Upsert this agent's row in dev.resources and mark it online.

    Best-effort: any failure is printed and swallowed so start-up continues.

    Args:
        resource_id: primary key of the resource row.
        name: model name stored in the row's config.
        info: dict read for "has_confidence", "params_m", "resolution"
            and "license".
    """
    capabilities = {"detect": "Single-frame detection", "search": "Range search with NL query",
                    "has_confidence": info.get("has_confidence", True)}
    config = {"name": name, "port": PORT, "device": DEVICE, "params_m": info.get("params_m"),
              "resolution": info.get("resolution"), "license": info.get("license")}
    metadata = {"version": "2.0", "docs": "/health"}

    conn = None
    try:
        conn = psycopg2.connect(DB_URL)
        cur = conn.cursor()
        cur.execute("""
            INSERT INTO dev.resources (resource_id, resource_type, category, capabilities, config, metadata, status, last_heartbeat)
            VALUES (%s, %s, %s, %s::jsonb, %s::jsonb, %s::jsonb, %s, NOW())
            ON CONFLICT (resource_id) DO UPDATE SET status=%s, last_heartbeat=NOW(), config=EXCLUDED.config
        """, (
            resource_id, "vision_model", "object_detection",
            json.dumps(capabilities),
            json.dumps(config),
            json.dumps(metadata),
            "online", "online"))
        conn.commit()
        cur.close()
        print(f"[Resource] Registered '{resource_id}'")
    except Exception as e:
        print(f"[Resource] Register '{resource_id}' failed: {e}")
    finally:
        # Release the connection even when execute/commit failed.
        if conn is not None:
            conn.close()
|
||||
|
||||
def heartbeat_loop(resource_ids):
    """Refresh last_heartbeat for each resource id every 60 seconds, forever.

    Intended as a thread target. A failed round is logged and the loop
    carries on, so a temporary database outage does not end the heartbeat.
    """
    while True:
        conn = None
        try:
            conn = psycopg2.connect(DB_URL)
            cur = conn.cursor()
            for rid in resource_ids:
                cur.execute("UPDATE dev.resources SET last_heartbeat = NOW() WHERE resource_id = %s", (rid,))
            conn.commit()
            cur.close()
        except Exception as e:
            # Exception (not a bare except) so SystemExit/KeyboardInterrupt
            # still propagate; report instead of failing silently.
            print(f"[Resource] Heartbeat failed: {e}")
        finally:
            if conn is not None:
                conn.close()
        time.sleep(60)
|
||||
|
||||
# ======================== Annotate ========================
|
||||
def annotate_image(img, detections, prompt):
    """Draw each detection's box and a "<prompt> <score>" caption onto *img*.

    Args:
        img: PIL image; modified in place.
        detections: iterable of dicts with "bbox" ([x1, y1, x2, y2]) and an
            optional "score" (defaults to 1.0).
        prompt: text used for every caption.

    Returns:
        The same image object, annotated.
    """
    draw = ImageDraw.Draw(img)
    for d in detections:
        b = d["bbox"]
        score = d.get("score", 1.0)
        draw.rectangle(b, outline="lime", width=3)
        # Caption sits 18px above the box; clamp at 0 so boxes touching the
        # top edge still get a visible caption.
        draw.text((b[0], max(0, b[1] - 18)), f"{prompt} {score:.2f}", fill="lime")
    return img
|
||||
|
||||
# ======================== API Routes ========================
|
||||
@app.route("/models", methods=["GET"])
def list_models():
    """Report every registered model with its load status.

    Loaded models return a copy of their "info" dict plus "loaded": True;
    models not yet loaded return only their name and "loaded": False.
    """
    result = []
    for name in MODEL_REGISTRY:
        if name in MODELS:
            info = dict(MODELS[name]["info"])
            info["loaded"] = True
        else:
            info = {"name": name, "loaded": False}
        result.append(info)
    return jsonify({"models": result})
|
||||
|
||||
# Default fusion weights: GDINO 0.6, PaliGemma 0.4
|
||||
FUSION_WEIGHTS = {"grounding-dino": 0.6, "paligemma": 0.4}
|
||||
|
||||
@app.route("/detect", methods=["POST"])
def detect():
    """Detect *prompt* in one video frame and save an annotated screenshot.

    JSON body (all keys optional): uuid, time (seconds), prompt, model
    ("grounding-dino", "paligemma" or "fusion"), threshold, and weights
    (per-model fusion weights, e.g. {"grounding-dino":0.7,"paligemma":0.3}).

    Responds with per-model detections, the deduplicated fused list in
    fusion mode, inference time and the URL of the annotated JPEG.
    Returns 400 for an unknown model, malformed time/weights or an
    unreadable frame, and 404 when the video cannot be found.
    """
    data = request.json or {}
    uuid = data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692")
    t_sec = data.get("time", 0)
    prompt = data.get("prompt", "gun")
    model_name = data.get("model", "grounding-dino")
    threshold = data.get("threshold", 0.1)
    weights = data.get("weights", None)  # e.g. {"grounding-dino":0.7,"paligemma":0.3}

    # Determine which models to run
    if model_name == "fusion":
        models_to_run = list(INFERENCE.keys())
    elif model_name in INFERENCE:
        models_to_run = [model_name]
    else:
        return jsonify({"error": f"Unknown model: {model_name}"}), 400

    # Reject malformed input up front rather than crashing mid-request.
    if not isinstance(t_sec, (int, float)):
        return jsonify({"error": f"Invalid time: {t_sec}"}), 400
    if weights is not None and not isinstance(weights, dict):
        return jsonify({"error": "weights must be an object of model -> weight"}), 400

    if weights:
        fusion_weights = weights
    elif model_name == "fusion":
        fusion_weights = FUSION_WEIGHTS
    else:
        fusion_weights = {model_name: 1.0}

    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404

    cap = cv2.VideoCapture(video)
    try:
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * (cap.get(cv2.CAP_PROP_FPS) or 25.0)))
        ret, frame = cap.read()
    finally:
        cap.release()
    if not ret:
        return jsonify({"error": f"Cannot read frame at {t_sec}s"}), 400

    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    all_detections = {}
    fusion_results = []
    t0 = time.time()

    for mn in models_to_run:
        detections = INFERENCE[mn](img, prompt, threshold)
        all_detections[mn] = detections
        w = fusion_weights.get(mn, 0.5)

        for d in detections:
            # PaliGemma has no score, treat detected=1.0
            model_score = d.get("score", 1.0) if mn == "grounding-dino" else 1.0
            fusion_results.append({
                "bbox": d["bbox"],
                "label": d["label"],
                "score": model_score,
                "fused_score": round(model_score * w, 3),
                "source_model": mn,
            })

    infer_ms = (time.time() - t0) * 1000

    # Fusion mode: keep the highest fused score among boxes overlapping by
    # more than 0.5 IoU.
    if model_name == "fusion" and len(fusion_results) > 1:
        deduped = []
        fusion_results.sort(key=lambda x: -x["fused_score"])
        for r in fusion_results:
            if not any(calc_iou(r["bbox"], kept["bbox"]) > 0.5 for kept in deduped):
                deduped.append(r)
        fusion_results = deduped

    # Annotate with best result
    if model_name == "fusion":
        display_dets = [{"bbox": r["bbox"], "score": r["fused_score"], "label": prompt}
                        for r in fusion_results]
    else:
        display_dets = all_detections.get(model_name, [])

    img_ann = annotate_image(img.copy(), display_dets, prompt)
    # uuid and prompt come from the request: replace anything that is not a
    # word character, dot or dash so the name cannot contain path separators.
    raw_name = f"{str(uuid)[:8]}_{int(t_sec)}s_{prompt}_{model_name}.jpg"
    shot_name = re.sub(r"[^\w.-]+", "_", raw_name)
    img_ann.save(os.path.join(SHOTS_DIR, shot_name))

    return jsonify({
        "model": model_name,
        "fusion_weights": fusion_weights,
        "models_used": models_to_run,
        "per_model": {mn: {"detections": all_detections.get(mn, []),
                           "n_detections": len(all_detections.get(mn, []))}
                      for mn in models_to_run},
        "fusion": fusion_results if model_name == "fusion" else None,
        "detections": display_dets,
        "time_ms": round(infer_ms, 1),
        "n_detections": len(display_dets),
        "shot_url": f"/shots/{shot_name}",
    })
|
||||
|
||||
def calc_iou(b1, b2):
    """Return the intersection-over-union of two [x1, y1, x2, y2] boxes.

    Corners may be given in either order. The result lies in [0, 1]:
    identical non-empty boxes give exactly 1.0, and two zero-area boxes
    give 0.0 (no division by zero).
    """
    # Normalise corner order so a box with swapped corners cannot produce
    # a negative area.
    ax1, ax2 = sorted((b1[0], b1[2]))
    ay1, ay2 = sorted((b1[1], b1[3]))
    bx1, bx2 = sorted((b2[0], b2[2]))
    by1, by2 = sorted((b2[1], b2[3]))

    inter_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    inter_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter = inter_w * inter_h

    union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
    return inter / union if union > 0 else 0.0
|
||||
|
||||
@app.route("/search", methods=["POST"])
def search():
    """Sample a time range of the video and report frames where the object appears.

    JSON body (all keys optional): uuid, target ("file_uuid:identifier",
    overrides uuid and range), query (natural language), range
    ("start-end" seconds), interval (seconds between samples), threshold,
    model.

    Stops after 100 hits. Returns 400 for an unknown model, an unparsable
    query, or a malformed range/interval; 404 when the target or video
    cannot be resolved.
    """
    data = request.json or {}
    uuid = data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692")
    target_str = data.get("target", "")
    query = data.get("query", "find the gun")
    range_str = data.get("range", "0-6780")
    interval = data.get("interval", 30)
    threshold = data.get("threshold", 0.15)
    model_name = data.get("model", "grounding-dino")

    if model_name not in INFERENCE:
        return jsonify({"error": f"Unknown model: {model_name}. Available: {list(INFERENCE.keys())}"}), 400

    # Parse query → object name
    prompt = parse_query(query)
    if not prompt:
        return jsonify({"error": f"Cannot parse query: {query}"}), 400

    # A non-positive interval would give a zero or negative frame step.
    if not isinstance(interval, (int, float)) or interval <= 0:
        return jsonify({"error": f"Invalid interval: {interval}"}), 400

    # Resolve target → time range
    if target_str:
        resolved = resolve_target(target_str)
        if not resolved:
            return jsonify({"error": f"Cannot resolve target: {target_str}"}), 404
        uuid, range_start, range_end = resolved
    else:
        range_text = str(range_str)
        parts = range_text.split("-") if "-" in range_text else ["0", "6780"]
        try:
            range_start = float(parts[0])
            range_end = float(parts[1]) if len(parts) > 1 else 6780
        except ValueError:
            return jsonify({"error": f"Invalid range: {range_str}"}), 400

    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404

    cap = cv2.VideoCapture(video)
    hits = []
    t_start = time.time()
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        infer_fn = INFERENCE[model_name]
        # At least one frame per step: range() rejects a step of 0.
        frame_step = max(1, int(interval * fps))
        last_frame = min(int(range_end * fps), total_frames)

        for frame_num in range(int(range_start * fps), last_frame, frame_step):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                continue
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            detections = infer_fn(img, prompt, threshold)
            if not detections:
                continue
            ts = frame_num / fps
            hits.append({
                "time": round(ts, 1),
                # minutes:seconds.frame-within-second
                "time_str": f"{int(ts//60)}:{int(ts%60):02d}.{int((ts%1)*fps):02d}",
                "frame": frame_num,
                "n_detections": len(detections),
                "best_score": max(d.get("score", 1.0) for d in detections),
                "detections": detections[:3],
            })
            if len(hits) >= 100:
                break
    finally:
        cap.release()
    elapsed = time.time() - t_start

    return jsonify({
        "model": model_name,
        "query": query, "object": prompt,
        "target": target_str or None,
        "range": f"{range_start:.0f}-{range_end:.0f}",
        "interval_secs": interval,
        "hits": hits,
        "n_hits": len(hits),
        "elapsed_secs": round(elapsed, 1),
    })
|
||||
|
||||
@app.route("/multimodal", methods=["POST"])
def multimodal_search():
    """Multi-modal search across all chunk types.

    For sentence chunks: ASR text + visual confirmation.
    For trace/story/cut chunks: visual detection only (no ASR text).

    Input:
        {"keyword":"gun"} — find chunks mentioning "gun" in ASR + visually confirm
        {"keyword":"gun","chunk_type":"trace"} — search trace chunks visually (no ASR)
        {"target":"file_uuid:chunk_id"} — search a specific chunk visually

    Each selected chunk is checked on the frame at its temporal midpoint
    with Grounding DINO. Returns 404 when the target, chunks or video
    cannot be found and 400 for a malformed range.

    NOTE(review): the commit message for this change says dev.chunks was
    renamed to dev.chunk and chunk_index removed, yet these queries still
    use both — confirm against the live schema.
    """
    data = request.json or {}
    uuid = data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692")
    keyword = data.get("keyword", "")
    prompt = data.get("prompt", keyword or "")
    target_str = data.get("target", "")
    chunk_type = data.get("chunk_type", "sentence")  # sentence, trace, story, cut
    threshold = data.get("threshold", 0.15)
    model_name = "grounding-dino"

    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()

        # Resolve target first if provided
        if target_str:
            resolved = resolve_target(target_str)
            if not resolved:
                return jsonify({"error": f"Cannot resolve target: {target_str}"}), 404
            uuid, st, et = resolved
            cur.execute("SELECT chunk_id, chunk_index, chunk_type, text_content FROM dev.chunks WHERE file_uuid=%s AND start_time=%s AND end_time=%s LIMIT 1",
                        (uuid, st, et))
            chunks = [(r[0], r[1], r[2], st, et, r[3] or "") for r in cur.fetchall()]
        elif keyword and chunk_type == "sentence":
            # Search sentence chunks by ASR text keyword
            cur.execute("""
                SELECT chunk_id, chunk_index, chunk_type, start_time, end_time, text_content
                FROM dev.chunks
                WHERE file_uuid=%s AND chunk_type='sentence'
                AND text_content ILIKE CONCAT('%%', %s, '%%')
                ORDER BY start_time
            """, (uuid, keyword))
            chunks = cur.fetchall()
        else:
            # Search any chunk type by time range (visual only, no ASR)
            range_str = data.get("range", "0-6780")
            range_text = str(range_str)
            parts = range_text.split("-") if "-" in range_text else ["0", "6780"]
            try:
                # Named range_* so the imported `re` module is not shadowed.
                range_start = float(parts[0])
                range_end = float(parts[1]) if len(parts) > 1 else 6780
            except ValueError:
                return jsonify({"error": f"Invalid range: {range_str}"}), 400
            cur.execute("""
                SELECT chunk_id, chunk_index, chunk_type, start_time, end_time, COALESCE(text_content, '')
                FROM dev.chunks
                WHERE file_uuid=%s AND chunk_type=%s
                AND start_time BETWEEN %s AND %s
                ORDER BY start_time
            """, (uuid, chunk_type, range_start, range_end))
            chunks = cur.fetchall()
    finally:
        # Runs on the early returns above too, so the connection never leaks.
        conn.close()

    if not chunks:
        return jsonify({"error": "No matching chunks found"}), 404

    # Visual confirmation
    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404

    cap = cv2.VideoCapture(video)
    results = []
    t_start = time.time()
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        infer_fn = INFERENCE.get(model_name)

        for chunk_id, chunk_idx, ctype, st, et, text in chunks:
            # Sample the frame at the chunk's temporal midpoint.
            center = (st + et) / 2
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(center * fps))
            ret, frame = cap.read()
            if not ret:
                continue

            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            detections = infer_fn(img, prompt or keyword, threshold)

            entry = {
                "chunk_id": chunk_id,
                "chunk_index": chunk_idx,
                "chunk_type": ctype,
                "time_range": f"{st:.1f}-{et:.1f}",
                "time_str": f"{int(st//60)}:{int(st%60):02d}-{int(et//60)}:{int(et%60):02d}",
                "visual_confirmed": len(detections) > 0,
                "best_score": round(max(d.get("score", 1.0) for d in detections), 3) if detections else 0,
                "n_visual_dets": len(detections),
            }
            if keyword and ctype == "sentence":
                entry["asr_text"] = text[:150]
                entry["asr_matched"] = keyword.lower() in text.lower()

            results.append(entry)
    finally:
        cap.release()
    elapsed = time.time() - t_start

    return jsonify({
        "keyword": keyword or prompt,
        "chunk_type": chunk_type,
        "target": target_str or None,
        "total_chunks": len(chunks),
        "visual_confirmed": sum(1 for r in results if r["visual_confirmed"]),
        "asr_matched": sum(1 for r in results if r.get("asr_matched")),
        "elapsed_secs": round(elapsed, 1),
        "results": results,
    })
|
||||
|
||||
@app.route("/shots/<filename>")
def serve_shot(filename):
    """Serve a saved screenshot from SHOTS_DIR as a JPEG.

    Returns 404 unless *filename* is a plain file name (no directory part)
    that names an existing regular file.
    """
    path = os.path.join(SHOTS_DIR, filename)
    # isfile (not exists) so directory names such as ".." give a 404
    # instead of failing inside send_file.
    if filename != os.path.basename(filename) or not os.path.isfile(path):
        return jsonify({"error": "Not found"}), 404
    return send_file(path, mimetype="image/jpeg")
|
||||
|
||||
@app.route("/health")
def health():
    """Report service status: loaded and available models, device and port."""
    payload = {
        "status": "ok",
        "models_loaded": list(MODELS.keys()),
        "models_available": list(MODEL_REGISTRY.keys()),
        "device": DEVICE,
        "port": PORT,
    }
    return jsonify(payload)
|
||||
|
||||
if __name__ == "__main__":
    # Register both models as resources, then keep their heartbeats fresh
    # from a daemon thread.
    resources = (
        ("eye-gdino", "grounding-dino",
         {"params_m": 232, "resolution": 384, "has_confidence": True, "license": "Apache 2.0"}),
        ("eye-paligemma", "paligemma",
         {"params_m": 2923, "resolution": 224, "has_confidence": False, "license": "Gemma license"}),
    )
    for rid, model, info in resources:
        register_resource(rid, model, info)

    resource_ids = [rid for rid, _, _ in resources]
    threading.Thread(target=heartbeat_loop, args=(resource_ids,), daemon=True).start()

    # Banner, then pre-load grounding-dino so the first request is not slow.
    rule = "=" * 60
    print(f"\n{rule}")
    print(f" 👁️ Momentry Eye — port {PORT}")
    print(rule)
    print(f" Models: {', '.join(MODEL_REGISTRY.keys())}")
    print(f" Device: {DEVICE}")
    print(" Resources: eye-gdino, eye-paligemma")
    print(" Loading default model...")
    get_model("grounding-dino")
    print(f" 👁️ Ready: http://localhost:{PORT}")
    app.run(host="0.0.0.0", port=PORT, threaded=True)
|
||||
84
scripts/zero_shot_combined_test.py
Normal file
84
scripts/zero_shot_combined_test.py
Normal file
@@ -0,0 +1,84 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Test Grounding DINO Large with COMBINED prompts — one inference per frame.
|
||||
"""
|
||||
import json, os, time, cv2, torch
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
||||
|
||||
MODEL_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/zero_shot_objects"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# (second in the movie, expected content) pairs to probe.
TIMEPOINTS = [
    (429, "stamp"), (691, "stamp_letter"), (762, "passport"),
    (3491, "passport"), (5054, "passport"),
    (5434, "letter"), (5443, "stamp_envelope"),
    (5467, "envelope"), (5500, "stamp"), (5506, "stamp"),
    (5783, "letter"), (5786, "envelope"),
]

# All object classes in one text query, so each frame needs one inference.
COMBINED_PROMPT = "stamp. postage stamp. envelope. passport. identification. letter."

print("Loading Large model...")
t0 = time.time()
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_PATH)
device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)
print(f"Loaded in {time.time()-t0:.1f}s")

cap = cv2.VideoCapture(VIDEO)
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0

print(f"\nTesting {len(TIMEPOINTS)} timepoints with combined prompt...")
t_infer = time.time()

for t_sec, label in TIMEPOINTS:
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    ret, frame = cap.read()
    if frame is None:
        continue

    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # ONE inference with ALL prompts
    inputs = processor(images=img, text=COMBINED_PROMPT, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    target = torch.tensor([img.size[::-1]])
    dets = processor.post_process_grounded_object_detection(
        outputs, threshold=0.1, target_sizes=target
    )[0]

    has_labels = "labels" in dets
    det_list = [
        {
            "bbox": [round(v, 1) for v in box.tolist()],
            "score": round(score.item(), 3),
            "label": str(dets["labels"][i]) if has_labels else "object",
        }
        for i, (box, score) in enumerate(zip(dets["boxes"], dets["scores"]))
    ]

    # Classify which expected objects were found
    found = {
        obj
        for d in det_list
        for obj in ("stamp", "envelope", "passport", "letter")
        if obj in d["label"].lower()
    }
    found_str = ", ".join(sorted(found)) if found else "none"
    minutes, seconds = divmod(t_sec, 60)
    print(f" {minutes}:{seconds:02d} {label:20s} | {len(det_list)} dets | found: [{found_str}]")

    # Save annotated frame
    green = (0, 255, 0)
    for d in det_list:
        x1, y1, x2, y2 = (int(v) for v in d["bbox"])
        cv2.rectangle(frame, (x1, y1), (x2, y2), green, 2)
        cv2.putText(frame, f"{d['label']} {d['score']:.2f}", (x1, y1-5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)

    shot_path = os.path.join(OUTPUT_DIR, f"combined_{t_sec}s.jpg")
    cv2.imwrite(shot_path, frame, [cv2.IMWRITE_JPEG_QUALITY, 85])

cap.release()
print(f"\nDone in {time.time()-t_infer:.0f}s")
print(f"Screenshots: {OUTPUT_DIR}/")
|
||||
156
scripts/zero_shot_gun_test.py
Normal file
156
scripts/zero_shot_gun_test.py
Normal file
@@ -0,0 +1,156 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Zero-shot Gun Detection Test — OWL-ViT vs Grounding DINO
|
||||
Tests on 8 known timepoints: 5 original pistol frames + 3 ASR gun mentions.
|
||||
"""
|
||||
import json, os, sys, time, cv2
|
||||
import torch
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
|
||||
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
|
||||
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/zero_shot_test"
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
TIMEPOINTS = [
|
||||
(2646, "2646s", "ASR: He has a gun"),
|
||||
(3188, "3188s", "Original pistol"),
|
||||
(3697, "3697s", "ASR: Where's your gun"),
|
||||
(5341, "5341s", "ASR: He already killed 3 men"),
|
||||
(5461, "5461s", "Original pistol"),
|
||||
(6309, "6309s", "Original pistol"),
|
||||
(6377, "6377s", "Original gun"),
|
||||
(6479, "6479s", "Original pistol"),
|
||||
]
|
||||
PROMPTS = ["gun", "pistol", "rifle", "weapon"]
|
||||
|
||||
cap = cv2.VideoCapture(VIDEO)
|
||||
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
||||
|
||||
def get_frame(t_sec):
    """Seek the shared capture to *t_sec* seconds and return that frame.

    Returns None when the frame cannot be read.
    """
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    ok, image = cap.read()
    if not ok:
        return None
    return image
|
||||
|
||||
def save_annotated(frame, detections, prompt, model_name, label):
    """Draw the detections on a copy of *frame* and write it to OUTPUT_DIR.

    Each detection dict supplies "bbox", "score" and "label". The file is
    named "{label}_{model_name}_prompt-{prompt}.jpg" and saved at JPEG
    quality 85.

    Returns:
        The file name (not the full path) that was written.
    """
    canvas = frame.copy()
    green = (0, 255, 0)
    for det in detections:
        x1, y1, x2, y2 = (int(v) for v in det["bbox"])
        conf, cls = det["score"], det["label"]
        cv2.rectangle(canvas, (x1, y1), (x2, y2), green, 2)
        cv2.putText(canvas, f"{cls} {conf:.2f}", (x1, y1-5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)

    filename = f"{label}_{model_name}_prompt-{prompt}.jpg"
    out_path = os.path.join(OUTPUT_DIR, filename)
    cv2.imwrite(out_path, canvas, [cv2.IMWRITE_JPEG_QUALITY, 85])
    return filename
|
||||
|
||||
all_results = {}
|
||||
|
||||
# ========== OWL-ViT ==========
|
||||
print("=" * 60)
|
||||
print("OWL-ViT (google/owlvit-base-patch32)")
|
||||
print("=" * 60)
|
||||
|
||||
from transformers import OwlViTProcessor, OwlViTForObjectDetection
|
||||
|
||||
owl_proc = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
|
||||
owl_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
|
||||
device = "mps" if torch.backends.mps.is_available() else "cpu"
|
||||
owl_model.to(device)
|
||||
print(f"Device: {device}")
|
||||
|
||||
owl_dets = {}
|
||||
t0 = time.time()
|
||||
for t_sec, label, desc in TIMEPOINTS:
|
||||
frame = get_frame(t_sec)
|
||||
if frame is None: continue
|
||||
img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
|
||||
for prompt in PROMPTS:
|
||||
inputs = owl_proc(text=[[prompt]], images=img, return_tensors="pt").to(device)
|
||||
with torch.no_grad():
|
||||
outputs = owl_model(**inputs)
|
||||
target = torch.tensor([img.size[::-1]])
|
||||
dets = owl_proc.post_process_grounded_object_detection(outputs, threshold=0.05, target_sizes=target)[0]
|
||||
det_list = []
|
||||
for i in range(len(dets["boxes"])):
|
||||
det_list.append({
|
||||
"bbox": [round(v, 1) for v in dets["boxes"][i].tolist()],
|
||||
"score": round(dets["scores"][i].item(), 3),
|
||||
"label": prompt,
|
||||
})
|
||||
save_annotated(frame, det_list, prompt, "owlvit", label)
|
||||
key = f"{label}_prompt-{prompt}"
|
||||
owl_dets[key] = det_list
|
||||
if det_list:
|
||||
best = max(d["score"] for d in det_list)
|
||||
print(f" [{desc}] prompt='{prompt}': {len(det_list)} det best={best:.3f}")
|
||||
|
||||
all_results["owlvit"] = {"elapsed": round(time.time()-t0, 1), "detections": owl_dets}
|
||||
|
||||
# ========== Grounding DINO ==========
print("\n" + "=" * 60)
print("Grounding DINO (IDEA-Research/grounding-dino-base)")
print("=" * 60)

from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

gd_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
gd_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base")
gd_model.to(device)

# Detections keyed by "<label>_prompt-<prompt>" (same key scheme as OWL-ViT).
gd_dets = {}
t0 = time.time()
for t_sec, label, desc in TIMEPOINTS:
    frame = get_frame(t_sec)
    if frame is None:
        continue
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # PIL reports (width, height); reversed here to (height, width).
    target = torch.tensor([img.size[::-1]])

    for prompt in PROMPTS:
        # Grounding DINO's documentation requires text queries to be
        # lowercased and terminated with a period. Only the text sent to the
        # model is normalised; the raw prompt is still used for result keys,
        # labels and annotated file names.
        query = prompt.strip().lower()
        if not query.endswith("."):
            query += "."

        inputs = gd_proc(images=img, text=query, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = gd_model(**inputs)
        dets = gd_proc.post_process_grounded_object_detection(
            outputs, threshold=0.05, target_sizes=target
        )[0]

        # One plain-Python record per detected box so it can be JSON-serialised.
        det_list = [
            {
                "bbox": [round(coord, 1) for coord in box.tolist()],
                "score": round(score.item(), 3),
                "label": prompt,
            }
            for box, score in zip(dets["boxes"], dets["scores"])
        ]

        save_annotated(frame, det_list, prompt, "grounding-dino", label)
        gd_dets[f"{label}_prompt-{prompt}"] = det_list

        if det_list:
            best = max(det["score"] for det in det_list)
            print(f" [{desc}] prompt='{prompt}': {len(det_list)} det best={best:.3f}")

all_results["grounding-dino"] = {"elapsed": round(time.time()-t0, 1), "detections": gd_dets}

# Both models are done with the video; release the capture handle.
cap.release()
|
||||
|
||||
# ========== Summary ==========
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
for model in ["owlvit", "grounding-dino"]:
    model_result = all_results[model]
    dets = model_result["detections"]
    total = sum(len(v) for v in dets.values())

    # Best (prompt, score) per timepoint across all prompts, or None when no
    # prompt produced a detection at that timepoint.
    best_by_timepoint = []
    for _t_sec, label, desc in TIMEPOINTS:
        candidates = [
            (p, det["score"])
            for p in PROMPTS
            for det in dets.get(f"{label}_prompt-{p}", [])
        ]
        best = max(candidates, key=lambda c: c[1]) if candidates else None
        best_by_timepoint.append((desc, best))

    # Count timepoints (not timepoint/prompt pairs) with at least one
    # detection, and report them against the actual number of timepoints.
    hits = sum(1 for _desc, best in best_by_timepoint if best is not None)
    print(
        f"\n{model} ({model_result['elapsed']}s): "
        f"{hits}/{len(TIMEPOINTS)} timepoints, {total} total detections"
    )
    for desc, best in best_by_timepoint:
        if best is not None:
            print(f" {desc}: best={best[1]:.3f} (prompt='{best[0]}')")
        else:
            print(f" {desc}: no detections")

# Write the results through a context manager so the file is flushed and closed.
with open(os.path.join(OUTPUT_DIR, "zero_shot_results.json"), "w", encoding="utf-8") as fh:
    json.dump(all_results, fh, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")
|
||||
103
scripts/zero_shot_objects_test.py
Normal file
103
scripts/zero_shot_objects_test.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Test Grounding DINO Large on stamps, envelopes, passports, letters.
|
||||
"""
|
||||
import json, os, time, cv2, torch
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
||||
|
||||
# Local checkpoint directory, input video and output directory for this run.
MODEL_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/zero_shot_objects"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Timepoints per object type
TESTS = [
    # (label, time_sec, prompts)
    ("stamp_001", 429, ["stamp", "postage stamp"]),
    ("stamp_002", 691, ["stamp", "envelope", "letter"]),
    ("stamp_003", 5443, ["stamp", "envelope"]),
    ("stamp_004", 5500, ["stamp"]),
    ("stamp_005", 5506, ["stamp"]),
    ("envelope_001", 5443, ["envelope"]),
    ("envelope_002", 5467, ["envelope"]),
    ("envelope_003", 5786, ["envelope"]),
    ("passport_001", 762, ["passport", "identification"]),
    ("passport_002", 3491, ["passport", "identification"]),
    ("passport_003", 5054, ["passport"]),
    ("letter_001", 691, ["letter", "envelope"]),
    ("letter_002", 5434, ["letter", "envelope"]),
    ("letter_003", 5783, ["letter", "stamp"]),
]

print("Loading Large model...")
t0 = time.time()
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_PATH)
# Use Apple's MPS backend when present, otherwise stay on the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)
print(f"Loaded in {time.time()-t0:.1f}s, device={device}")

cap = cv2.VideoCapture(VIDEO)
# Fail loudly when the video cannot be opened; otherwise every frame read
# returns None, every test is skipped, and an empty result file is written.
if not cap.isOpened():
    raise SystemExit(f"Cannot open video: {VIDEO}")
# Fall back to 25 fps when the container reports 0.
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
||||
|
||||
# Results keyed by "<label>_<t_sec>s"; each entry maps prompt -> detections.
results = {}
t_infer = time.time()

for label, t_sec, prompts in TESTS:
    # Seek to the frame index for this timestamp and read a single frame.
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    _ok, frame = cap.read()
    if frame is None:
        continue

    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # PIL reports (width, height); reversed here to (height, width).
    target = torch.tensor([img.size[::-1]])
    key = f"{label}_{t_sec}s"
    minutes, seconds = divmod(t_sec, 60)
    per_prompt = {}
    results[key] = {"time": t_sec, "time_str": f"{minutes}:{seconds:02d}", "prompts": per_prompt}

    for prompt in prompts:
        inputs = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        dets = processor.post_process_grounded_object_detection(
            outputs, threshold=0.1, target_sizes=target
        )[0]

        # One plain-Python record per detected box so it can be JSON-serialised.
        det_list = [
            {
                "bbox": [round(coord, 1) for coord in box.tolist()],
                "score": round(score.item(), 3),
            }
            for box, score in zip(dets["boxes"], dets["scores"])
        ]
        per_prompt[prompt] = det_list

        # Save annotated frame (only when something was detected)
        if not det_list:
            continue
        annotated = frame.copy()
        for det in det_list:
            x1, y1, x2, y2 = (int(coord) for coord in det["bbox"])
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
            cv2.putText(annotated, f"{prompt} {det['score']:.2f}", (x1, y1-5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        out_path = os.path.join(OUTPUT_DIR, f"{label}_{t_sec}s_{prompt}.jpg")
        cv2.imwrite(out_path, annotated, [cv2.IMWRITE_JPEG_QUALITY, 85])

cap.release()
elapsed = time.time() - t_infer
|
||||
|
||||
# Summary: one line per tested frame with its best-scoring detection, if any.
print(f"\n{'='*60}")
print(f"Results ({elapsed:.0f}s)")
print(f"{'='*60}")
for key, data in sorted(results.items()):
    # Flatten every detection of every prompt into (prompt, score) pairs.
    scored = [
        (p, det["score"])
        for p, dets in data["prompts"].items()
        for det in dets
    ]
    if scored:
        best = max(scored, key=lambda x: x[1])
        print(f" {data['time_str']} {key:20s} ✅ {best[1]:.3f} ({best[0]})")
    else:
        print(f" {data['time_str']} {key:20s} ❌ none")

# Write the results through a context manager so the file is flushed and closed.
with open(os.path.join(OUTPUT_DIR, "results.json"), "w", encoding="utf-8") as fh:
    json.dump(results, fh, indent=2)
print(f"\nScreenshots saved to {OUTPUT_DIR}/")
|
||||
Reference in New Issue
Block a user