feat: Phase 1 handover - schema migration, correction mechanism, API fixes

Schema changes: dev.chunks->dev.chunk, remove old_chunk_id/chunk_index
Correction: asr-1.json format, generate/apply scripts
API: 37/37 endpoints fixed and tested
Docs: HANDOVER_V2.0.md for M4
This commit is contained in:
Accusys
2026-05-11 07:03:22 +08:00
parent ef894a44ad
commit 39ba5ddf76
147 changed files with 19843 additions and 3053 deletions

View File

@@ -0,0 +1,163 @@
#!/opt/homebrew/bin/python3.11
"""
Apply asr-1.json corrections to dev.chunks.
DELETE old chunks, INSERT corrected chunks.
PRESERVE chunk_vectors by cloning each corrected parent's embedding onto its new child chunk IDs.
"""
import json, os, subprocess, sys, time
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
DB_USER = "accusys"
DB_NAME = "momentry"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DRY_RUN = "--dry-run" in sys.argv
def psql(sql, raw=False):
    """Run one SQL statement through the psql command-line client.

    Args:
        sql: SQL text handed to ``psql -c``.
        raw: When false (default) ``-t -A`` is added so the output is
            tuples-only and unaligned; when true psql's default table
            formatting is kept.

    Returns:
        ``(stdout, None)`` with stdout stripped on success, or
        ``(None, message)`` on failure. The message is never empty: it is
        the first 200 characters of stderr, or a description of the
        timeout / launch failure / exit status.
    """
    args = [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME]
    if not raw:
        args += ["-t", "-A"]
    args += ["-c", sql]
    try:
        r = subprocess.run(args, capture_output=True, text=True, timeout=15)
    except subprocess.TimeoutExpired:
        # Keep the (out, err) contract instead of letting the exception escape.
        return None, "psql timed out after 15s"
    except OSError as exc:
        return None, f"psql could not be started: {exc}"[:200]
    if r.returncode != 0:
        # Callers test `if err:`, so an empty stderr must not look like success.
        return None, r.stderr[:200] or f"psql exited with status {r.returncode}"
    return r.stdout.strip(), None
def esc(val):
    """Render *val* as a SQL literal.

    ``None`` becomes the bare keyword ``NULL``; anything else is converted
    with ``str`` and wrapped in single quotes, with embedded single quotes
    doubled.
    """
    if val is None:
        return "NULL"
    doubled = str(val).replace("'", "''")
    return f"'{doubled}'"
def _child_chunk_id(correction, position, child):
    """Return the chunk ID of a corrected child.

    The child's explicit ``new_chunk_id`` wins; otherwise the ID is
    ``<parent_chunk_index>-NN`` with NN the 1-based, zero-padded position.
    """
    return child.get(
        "new_chunk_id", f"{correction['parent_chunk_index']}-{position + 1:02d}"
    )


def _chunk_insert_sql(chunk_id, start_frame, end_frame, text, fps):
    """Build the INSERT statement for one sentence chunk."""
    start_time = round(start_frame / fps, 3)
    end_time = round(end_frame / fps, 3)
    # NOTE(review): the commit message mentions renaming dev.chunks -> dev.chunk
    # and dropping old_chunk_id/chunk_index; confirm this statement still
    # matches the live schema.
    return (
        "INSERT INTO dev.chunks "
        "(file_uuid, chunk_id, old_chunk_id, chunk_index, chunk_type, "
        "start_time, end_time, start_frame, end_frame, text_content, fps, content) "
        "VALUES ("
        f"{esc(UUID)}, {esc(chunk_id)}, {esc(chunk_id)}, 0, 'sentence', "
        f"{esc(start_time)}, {esc(end_time)}, {start_frame}, {end_frame}, {esc(text)}, {fps}, "
        "'{\"source\": \"asr-1\"}'::jsonb"
        ");"
    )


def _clone_vector(old_id, new_ids):
    """Copy the embedding stored under *old_id* to every ID in *new_ids*.

    The new rows are inserted first and the old row is removed only when
    every insert succeeded, so a failure never loses the embedding.

    Returns:
        ``(cloned, failed)``: number of child IDs that now carry the
        embedding, and number of psql calls that failed.
    """
    where = f"uuid={esc(UUID)} AND chunk_id={esc(old_id)}"
    out, err = psql(f"SELECT embedding FROM dev.chunk_vectors WHERE {where} LIMIT 1")
    if out is None:
        print(f" ERROR reading vector {old_id}: {(err or '')[:120]}")
        return 0, 1
    if not out:
        # No row, or a NULL embedding: nothing to clone, leave the table alone.
        return 0, 0
    cloned = failed = 0
    for new_id in new_ids:
        if new_id == old_id:
            cloned += 1  # the existing row already carries this ID
            continue
        res, err = psql(
            "INSERT INTO dev.chunk_vectors (chunk_id, uuid, chunk_type, embedding) "
            f"VALUES ({esc(new_id)}, {esc(UUID)}, 'sentence', {esc(out)}::jsonb)"
        )
        if res is None:
            failed += 1
            print(f" ERROR cloning vector {old_id} -> {new_id}: {(err or '')[:120]}")
        else:
            cloned += 1
    if failed == 0 and old_id not in new_ids:
        res, err = psql(f"DELETE FROM dev.chunk_vectors WHERE {where}")
        if res is None:
            failed += 1
            print(f" ERROR deleting old vector {old_id}: {(err or '')[:120]}")
    return cloned, failed


def main():
    """Apply the asr-1.json corrections for UUID to the database.

    Steps: (1) delete the existing sentence chunks, (2) clone each corrected
    parent's chunk_vectors row onto its child IDs, (3+4) insert the kept and
    corrected chunks, then print verification counts. With ``--dry-run`` no
    SQL is executed and vectors are assumed to exist.

    The steps run as separate psql invocations, not as one transaction.

    Raises:
        SystemExit: if the initial DELETE fails, to avoid inserting duplicates.
    """
    t0 = time.time()
    fps = 24.0
    errors = 0
    with open(os.path.join(OUTPUT_DIR, f"{UUID}.asr-1.json")) as fh:
        d = json.load(fh)
    kept = d["kept"]
    corrections = d["corrections"]
    corrected_total = sum(len(c["corrected"]) for c in corrections)
    total = len(kept) + corrected_total
    print(f"Kept: {len(kept)}, Corrected chunks: {corrected_total}, Total: {total}\n")

    # Step 1: DELETE old sentence chunks
    if not DRY_RUN:
        out, err = psql(
            f"DELETE FROM dev.chunks WHERE file_uuid={esc(UUID)} AND chunk_type='sentence';"
        )
        if out is None:
            raise SystemExit(f"Step 1/4 failed, nothing applied: {err}")
    print(f"Step 1/4: Deleted old chunks (dry_run={DRY_RUN})")

    # Step 2: clone chunk_vectors rows from each corrected parent to its children.
    # Kept chunks keep their chunk_id, so their vectors need no action.
    vec_renamed = 0
    for c in corrections:
        old_id = str(c["parent_chunk_index"])
        new_ids = [
            _child_chunk_id(c, si, child) for si, child in enumerate(c["corrected"])
        ]
        if DRY_RUN:
            vec_renamed += len(new_ids)  # assume the parent vector exists
            continue
        cloned, failed = _clone_vector(old_id, new_ids)
        vec_renamed += cloned
        errors += failed
    print(f"Step 2/4: chunk_vectors renamed: {vec_renamed} new entries (dry_run={DRY_RUN})")

    # Step 3: INSERT statements for kept chunks
    batch = [
        _chunk_insert_sql(
            str(k["chunk_index"]), k["start_frame"], k["end_frame"], k["text_content"], fps
        )
        for k in kept
    ]
    # Step 4: INSERT statements for corrected chunks
    for c in corrections:
        for si, child in enumerate(c["corrected"]):
            batch.append(
                _chunk_insert_sql(
                    _child_chunk_id(c, si, child),
                    child["start_frame"],
                    child["end_frame"],
                    child["text_content"],
                    fps,
                )
            )

    # Execute in groups of 100 so progress can be reported
    insert_errors = 0
    for bs in range(0, len(batch), 100):
        be = min(bs + 100, len(batch))
        if not DRY_RUN:
            for s in batch[bs:be]:
                out, err = psql(s)
                if out is None:
                    errors += 1
                    insert_errors += 1
                    if insert_errors <= 3:  # only show the first few failures
                        print(f" ERROR: {(err or '')[:120]}")
        pct = be * 100 // len(batch)
        print(f" Steps 3+4/4: [{be}/{len(batch)}] {pct}% err={errors} [{time.time()-t0:.0f}s]")

    # Verify
    if not DRY_RUN:
        def _count(sql):
            out, err = psql(sql)
            return out.strip() if out is not None else f"? ({err})"

        sc = _count(
            f"SELECT count(*) FROM dev.chunks WHERE file_uuid={esc(UUID)} AND chunk_type='sentence'"
        )
        vc = _count(f"SELECT count(*) FROM dev.chunk_vectors WHERE uuid={esc(UUID)}")
        mc = _count(
            "SELECT count(*) FROM dev.chunk_vectors cv "
            "JOIN dev.chunks c ON c.file_uuid=cv.uuid AND c.chunk_id=cv.chunk_id "
            f"WHERE cv.uuid={esc(UUID)}"
        )
        print(f"\n Verify: {sc} chunks, {vc} vectors, {mc} matched")

    print(f"\n{'='*50}")
    print("DRY RUN" if DRY_RUN else "APPLIED")
    print(f" Total chunks: {len(batch)}")
    print(f" Vectors renamed: {vec_renamed}")
    print(f" Errors: {errors}")
    print(f" Time: {time.time()-t0:.1f}s")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,83 @@
#!/opt/homebrew/bin/python3.11
"""
Comprehensive ASR Model Selection Benchmark
Tests 5 models × 2 VAD settings across 3 test clips.
Output: JSON results file (RESULTS_FILE) + JSON summary printed to stdout
"""
import json, time, os, gc, sys
from faster_whisper import WhisperModel
CLIPS = {
"A_rapid": {"path": "/tmp/asr_clip_A.mp4", "offset": 1540},
"B_normal": {"path": "/tmp/asr_clip_B.mp4", "offset": 600},
"C_complex": {"path": "/tmp/asr_clip_C.mp4", "offset": 4400},
}
MODELS = ["tiny", "base", "small", "medium", "large-v3"]
VAD_SETTINGS = [200, 500] # min_silence_duration_ms
RESULTS_FILE = "/tmp/asr_benchmark_results.json"
def run_transcribe(model, clip_path, clip_name, vad_ms):
    """Transcribe one clip and time it.

    Args:
        model: object exposing ``transcribe(path, beam_size=, vad_filter=,
            vad_parameters=)`` returning ``(segments, info)``.
        clip_path: path of the media file to transcribe.
        clip_name: label of the clip (not used by this function).
        vad_ms: value passed as ``min_silence_duration_ms`` to the VAD filter.

    Returns:
        ``(segs, info, elapsed)`` where segs is a list of
        ``{"start", "end", "text"}`` dicts (times rounded to 2 decimals, text
        stripped) and elapsed is wall-clock seconds including consumption of
        the segment iterator.
    """
    started = time.time()
    segments, info = model.transcribe(
        clip_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters={"min_silence_duration_ms": vad_ms},
    )
    segs = [
        {"start": round(s.start, 2), "end": round(s.end, 2), "text": s.text.strip()}
        for s in segments
    ]
    elapsed = time.time() - started
    return segs, info, elapsed
# Load existing results so already-completed combinations are skipped
all_results = {}
if os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE) as fh:
        all_results = json.load(fh)
    print(f"Loaded {sum(len(v) for v in all_results.values())} existing results")
total = len(CLIPS) * len(MODELS) * len(VAD_SETTINGS)
done = sum(len(v) for v in all_results.values())
print(f"Total: {total} tests, {done} already done, {total-done} remaining\n")

for clip_name, clip_cfg in CLIPS.items():
    clip_results = all_results.setdefault(clip_name, {})
    for model_size in MODELS:
        pending = [v for v in VAD_SETTINGS if f"{model_size}_vad{v}" not in clip_results]
        if not pending:
            continue
        # Load each model once per clip instead of once per VAD setting.
        t_load = time.time()
        model = WhisperModel(model_size, device="cpu", compute_type="int8")
        load_time = time.time() - t_load
        for vad_ms in pending:
            key = f"{model_size}_vad{vad_ms}"
            print(f"[{clip_name}] {model_size} VAD={vad_ms}ms ...", end=" ", flush=True)
            segs, info, trans_time = run_transcribe(model, clip_cfg["path"], clip_name, vad_ms)
            total_chars = sum(len(s["text"]) for s in segs)
            clip_results[key] = {
                "model": model_size,
                "vad_ms": vad_ms,
                "segments": segs,
                "segment_count": len(segs),
                "total_chars": total_chars,
                "runtime_secs": round(trans_time, 1),
                "load_time_secs": round(load_time, 1),
                "language": info.language,
            }
            print(f"{len(segs)} segs, {total_chars} chars, {trans_time:.1f}s")
            # Save incrementally; write-then-rename so an interrupted run
            # never leaves a truncated results file behind.
            tmp_path = RESULTS_FILE + ".tmp"
            with open(tmp_path, "w") as fh:
                json.dump(all_results, fh)
            os.replace(tmp_path, RESULTS_FILE)
        # Free memory between models
        del model
        gc.collect()

print("\n=== All tests complete ===")
# Print everything except the bulky per-segment transcripts
summary = {
    clip: {
        key: {field: value for field, value in result.items() if field != "segments"}
        for key, result in runs.items()
    }
    for clip, runs in all_results.items()
}
print(json.dumps(summary, indent=2))

View File

@@ -0,0 +1,173 @@
#!/opt/homebrew/bin/python3.11
"""
LLM-clean all 4188 sentence texts, re-embed, update momentry_dev_v1 + sentence_story.
"""
import json, time, os
from urllib.request import Request, urlopen
import psycopg2
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
CHECKPOINT = f"/tmp/sentence_clean_{UUID}.json"
def call_llm(prompt):
    """Send *prompt* as a single user message to LLM_URL and return the reply.

    Returns:
        The stripped ``choices[0].message.content`` string of the response.

    Raises:
        Whatever ``urlopen`` / ``json.loads`` raise on network, HTTP or
        decoding failures, and ``KeyError``/``IndexError`` on an unexpected
        response shape.
    """
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 80}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Context manager closes the HTTP response; this is called once per sentence.
    with urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["choices"][0]["message"]["content"].strip()
def call_embed(text):
    """Request an embedding for *text* from EMBED_URL.

    Returns:
        The ``data[0].embedding`` value of the JSON response.

    Raises:
        Whatever ``urlopen`` / ``json.loads`` raise on network, HTTP or
        decoding failures, and ``KeyError``/``IndexError`` on an unexpected
        response shape.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Context manager closes the HTTP response; this is called once per sentence.
    with urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["data"][0]["embedding"]
VECTOR_SIZE = 768  # dimension used for both Qdrant collections and the error fallback


def _qdrant(method, path, body=None):
    """Issue one request to Qdrant and return the raw response bytes."""
    data = None if body is None else json.dumps(body).encode()
    headers = {"Content-Type": "application/json"} if body is not None else {}
    req = Request(f"{QDRANT_URL}{path}", data=data, headers=headers, method=method)
    with urlopen(req) as resp:
        return resp.read()


def _rebuild_collection(name, entries, include_original):
    """Drop and recreate collection *name*, then upsert one point per entry.

    Point IDs are the 1-based position in *entries*. Returns the number of
    batches whose upsert failed.
    """
    try:
        _qdrant("DELETE", f"/collections/{name}")
        time.sleep(0.5)
    except OSError as e:
        # Best effort: the collection may simply not exist yet.
        print(f" {name}: delete skipped ({e})")
    _qdrant("PUT", f"/collections/{name}",
            {"vectors": {"size": VECTOR_SIZE, "distance": "Cosine"}})
    time.sleep(0.5)

    points = []
    for pi, r in enumerate(entries):
        payload = {
            "chunk_type": "sentence",
            "uuid": UUID,
            "chunk_id": r["chunk_id"],
            "text": r["cleaned"],
        }
        if include_original:
            payload["original"] = r["original"]
        points.append({"id": pi + 1, "vector": r["embedding"], "payload": payload})

    batch_size = 100
    failed_batches = 0
    for start in range(0, len(points), batch_size):
        batch = points[start:start + batch_size]
        try:
            _qdrant("PUT", f"/collections/{name}/points?wait=true", {"points": batch})
        except Exception as e:
            failed_batches += 1
            print(f" batch {start}: {e}")
        if (start // batch_size) % 5 == 0:
            print(f" {name}: {start+len(batch)}/{len(points)}")
    print(f" {name} done")
    return failed_batches


print("=== Step 1: Load all sentences ===")
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()
    cur.execute("""
SELECT id, chunk_id, text_content
FROM dev.chunks
WHERE file_uuid = %s AND chunk_type = 'sentence'
ORDER BY id
""", (UUID,))
    rows = cur.fetchall()
finally:
    conn.close()
print(f"Loaded {len(rows)} sentences")

# Reset checkpoint (incompatible with old chunk_index format)
if os.path.exists(CHECKPOINT):
    os.remove(CHECKPOINT)
    print("Old checkpoint removed (format changed)")

results = []
errors = 0
print("\n=== Step 2: LLM clean + embed ===")
for i, (cid, chunk_id, text_content) in enumerate(rows):
    input_text = text_content
    prompt = f"""Clean this movie dialogue line. Fix truncated words, capitalize, add punctuation.
Return: SPEAKER: "clean text"
Input: [Cary Grant] can't you do something constructive like start
Return: Cary Grant: "Can't you do something constructive like start?"
Input: [Audrey Hepburn] qui se présente influence d'une manière vitale la proposition l
Return: Audrey Hepburn: "Qui se présente influence d'une manière vitale la proposition..."
Input: {input_text}
Return:"""
    try:
        cleaned = call_llm(prompt)
        embedding = call_embed(cleaned)
        time.sleep(0.1)
    except Exception as e:
        print(f" [{i+1}/{len(rows)}] id={cid} chunk={chunk_id} ERROR: {e}")
        cleaned = input_text
        # NOTE(review): a zero vector is stored for failed rows; confirm that
        # is acceptable for a Cosine-distance collection.
        embedding = [0.0] * VECTOR_SIZE
        errors += 1
    results.append({
        "index": i,
        "chunk_id": chunk_id,
        "original": input_text,
        "cleaned": cleaned,
        "embedding": embedding,
    })
    # Progress marker only: results are not persisted, so the run cannot resume.
    with open(CHECKPOINT, "w") as fh:
        json.dump({"last": i}, fh)
    if (i + 1) % 50 == 0:
        print(f" [{i+1}/{len(rows)}] chunk={chunk_id} errors={errors}")
results.sort(key=lambda x: x["index"])
print(f"\nDone: {len(results)} cleaned, {errors} errors")

print("\n=== Step 3: Rebuild momentry_dev_v1 ===")
failed_batches = _rebuild_collection("momentry_dev_v1", results, include_original=True)

print("\n=== Step 4: Rebuild sentence_story ===")
failed_batches += _rebuild_collection("sentence_story", results, include_original=False)
if failed_batches:
    print(f"WARNING: {failed_batches} upsert batches failed")

# Verify
for col in ["momentry_dev_v1", "sentence_story"]:
    info = json.loads(_qdrant("GET", f"/collections/{col}"))["result"]
    print(f"Verified {col}: {info['points_count']} pts, {info['config']['params']['vectors'].get('size','?')}D")
print("\n=== Done ===")

View File

@@ -0,0 +1,138 @@
#!/opt/homebrew/bin/python3.11
"""
Comparison test: Grounding DINO Base vs Florence-2 Base vs Florence-2 Large
Tests on 8 known timepoints with gun prompts.
"""
import json, os, sys, time, cv2, torch
from PIL import Image
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/model_comparison"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# (seconds into the video, label used as key prefix in the results)
TIMEPOINTS = [
    (2646, "2646s"), (3188, "3188s"), (3697, "3697s"),
    (5341, "5341s"), (5461, "5461s"), (6309, "6309s"),
    (6377, "6377s"), (6479, "6479s"),
]
PROMPTS = {"gun": "gun.", "pistol": "pistol."}
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Grab one frame per timepoint up front so every model sees identical input.
cap = cv2.VideoCapture(VIDEO)
if not cap.isOpened():
    sys.exit(f"Cannot open video: {VIDEO}")
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0  # fall back when the container reports 0
frames = {}
for t_sec, label in TIMEPOINTS:
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    ret, frame = cap.read()
    if ret:
        frames[label] = frame
    else:
        print(f"WARNING: could not read frame at {label}")
cap.release()
print(f"Loaded {len(frames)} frames")
if not frames:
    # Nothing to compare; stop before loading the detection models.
    sys.exit("No frames could be read from the video")
all_results = {}
# ========== Grounding DINO Base ==========
print("\n" + "="*60)
print("Grounding DINO Base")
print("="*60)
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
t0 = time.time()  # elapsed time below includes model loading
gd_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
gd_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base").to(device)
gd_dets = {}
for label, frame in frames.items():
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    # PIL size is (width, height); the post-processor wants (height, width)
    target = torch.tensor([img.size[::-1]])
    for pname, prompt in PROMPTS.items():
        inputs = gd_proc(images=img, text=prompt, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = gd_model(**inputs)
        dets = gd_proc.post_process_grounded_object_detection(outputs, threshold=0.1, target_sizes=target)[0]
        # Key format "<label>_<prompt name>", value = list of confidence scores
        gd_dets[f"{label}_{pname}"] = [round(s.item(), 3) for s in dets["scores"]]
all_results["grounding-dino-base"] = {"elapsed": round(time.time()-t0, 1), "detections": gd_dets}
print(f" Done in {all_results['grounding-dino-base']['elapsed']}s")
del gd_model
if device == "mps":
    # Only touch the MPS allocator when MPS is actually in use (cpu fallback).
    torch.mps.empty_cache()
# ========== Florence-2 Base / Large ==========
import re
from transformers import AutoProcessor, AutoModelForCausalLM

_LOC_TOKEN = re.compile(r'<loc_\d+>')


def _florence_scores(decoded, prompt_text):
    """Turn a decoded Florence-2 generation into a list of pseudo-scores.

    Florence-2 emits no confidence values, so every detection is scored 1.0.
    Four ``<loc_N>`` tokens are counted as one box; when no box is found but
    the prompt word appears in the text, one detection is assumed.
    """
    n_boxes = len(_LOC_TOKEN.findall(decoded)) // 4  # 4 coords per bbox
    if n_boxes > 0:
        return [1.0] * n_boxes
    if prompt_text.replace('.', '') in decoded:
        return [1.0]
    return []


def _run_florence(model_id, result_key, title):
    """Run one Florence-2 checkpoint over all frames and store its results.

    Both checkpoints go through the same parsing so their counts are
    comparable.
    """
    print("\n" + "="*60)
    print(title)
    print("="*60)
    t0 = time.time()  # elapsed time includes model loading
    proc = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to(device)
    found = {}
    for label, frame in frames.items():
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        for pname, prompt_text in PROMPTS.items():
            # NOTE(review): the prompt word is appended to the <OD> task token
            # and boxes are not filtered by label; confirm against the
            # Florence-2 model card that this measures gun detections.
            text = f"<OD>{prompt_text}"
            inputs = proc(text=text, images=img, return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = model.generate(**inputs, max_new_tokens=100, num_beams=3)
            decoded = proc.decode(outputs[0], skip_special_tokens=False)
            found[f"{label}_{pname}"] = _florence_scores(decoded, prompt_text)
    all_results[result_key] = {"elapsed": round(time.time()-t0, 1), "detections": found}
    print(f" Done in {all_results[result_key]['elapsed']}s")
    del model
    if device == "mps":
        # Only touch the MPS allocator when MPS is actually in use (cpu fallback).
        torch.mps.empty_cache()


_run_florence("microsoft/Florence-2-base", "florence2-base", "Florence-2 Base")
_run_florence("microsoft/Florence-2-large", "florence2-large", "Florence-2 Large")
# ========== Summary ==========
print("\n" + "="*60)
print(f"{'Model':<25} {'Time':>8} {'Gun hits':>10} {'Gun best':>10} {'Pistol hits':>12} {'Pistol best':>10}")
print("-"*75)
n_points = len(TIMEPOINTS)
for model_name in ["grounding-dino-base", "florence2-base", "florence2-large"]:
    d = all_results[model_name]
    dets = d["detections"]
    gun_scores = []
    pistol_scores = []
    gun_hits = 0
    pistol_hits = 0
    # TIMEPOINTS holds (seconds, label) pairs and detections are keyed
    # "<label>_<prompt name>", e.g. "2646s_gun".
    for _t_sec, label in TIMEPOINTS:
        g = dets.get(f"{label}_gun", [])
        p = dets.get(f"{label}_pistol", [])
        gun_scores.extend(g)
        pistol_scores.extend(p)
        # A "hit" is a timepoint with at least one detection, so the value
        # is bounded by the number of timepoints shown after the slash.
        gun_hits += any(s > 0 for s in g)
        pistol_hits += any(s > 0 for s in p)
    gun_best = max(gun_scores) if gun_scores else 0
    pistol_best = max(pistol_scores) if pistol_scores else 0
    print(f"{model_name:<25} {d['elapsed']:>7.1f}s {gun_hits:>6d}/{n_points} {gun_best:>8.3f} {pistol_hits:>6d}/{n_points} {pistol_best:>8.3f}")
with open(os.path.join(OUTPUT_DIR, "model_comparison.json"), "w") as fh:
    json.dump(all_results, fh, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")

78
scripts/coreml_embed_server.py Executable file
View File

@@ -0,0 +1,78 @@
"""
Simple Flask-like HTTP server for CoreML ANE embedding inference.
Replaces /api/embeddings endpoint that comic_embed.rs calls.
"""
import json, os, argparse
from http.server import HTTPServer, BaseHTTPRequestHandler
import numpy as np
from transformers import AutoTokenizer
# Global model
MODEL = None
TOKENIZER = None
MODEL_PATH = "/Users/accusys/models/mxbai-embed-large-v1.mlpackage"
class EmbeddingHandler(BaseHTTPRequestHandler):
    """HTTP handler exposing ``POST /api/embeddings``.

    Request body: JSON object with a ``prompt`` string. Response: JSON
    ``{"embedding": [...]}`` on success, ``{"error": "..."}`` with status
    400 for a malformed request or 500 when inference fails. Any other path
    returns 404. Relies on the module globals MODEL and TOKENIZER being set
    before requests arrive.
    """

    # Prefixes removed from the prompt before tokenization.
    _PREFIXES = ("search_document: ", "search_query: ")

    def do_POST(self):
        """Handle one POST request."""
        if self.path != "/api/embeddings":
            self.send_response(404)
            self.end_headers()
            return
        # Client errors (bad length, bad JSON, non-object body) -> 400
        try:
            length = int(self.headers.get("Content-Length", 0))
            data = json.loads(self.read(length))
            if not isinstance(data, dict):
                raise ValueError("request body must be a JSON object")
        except ValueError as e:
            self._send_json(400, {"error": str(e)})
            return
        try:
            prompt = data.get("prompt", "")
            # Strip search_document: or search_query: prefix (at most one)
            for prefix in self._PREFIXES:
                if prompt.startswith(prefix):
                    prompt = prompt[len(prefix):]
                    break
            tokens = TOKENIZER(prompt, return_tensors="np", padding="max_length", truncation=True, max_length=512)
            input_ids = tokens["input_ids"].astype(np.int32)
            attention_mask = tokens["attention_mask"].astype(np.int32)
            result = MODEL.predict({"input_ids": input_ids, "attention_mask": attention_mask})
            embedding = result["embedding"][0].tolist()
        except Exception as e:
            # Top-level boundary: report any inference failure to the client.
            self._send_json(500, {"error": str(e)})
            return
        self._send_json(200, {"embedding": embedding})

    def _send_json(self, status, payload):
        """Write *payload* as a JSON response with the given status code."""
        resp = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(resp)))
        self.end_headers()
        self.wfile.write(resp)

    def read(self, length):
        """Read exactly *length* bytes of request body."""
        return self.rfile.read(length)
def main():
    """Parse CLI options, load the CoreML model and tokenizer into the module
    globals, then serve embedding requests on 127.0.0.1 until interrupted."""
    global MODEL, TOKENIZER
    cli = argparse.ArgumentParser()
    cli.add_argument("--port", type=int, default=11435)
    cli.add_argument("--model", default=MODEL_PATH)
    opts = cli.parse_args()

    # Imported here so the module can be imported without coremltools loaded.
    import coremltools as ct

    print(f"Loading CoreML model from {opts.model}...")
    MODEL = ct.models.MLModel(opts.model, compute_units=ct.ComputeUnit.ALL)
    print(f"Model loaded (compute: {MODEL.compute_unit})")

    print("Loading tokenizer...")
    TOKENIZER = AutoTokenizer.from_pretrained("mixedbread-ai/mxbai-embed-large-v1")
    print("Tokenizer loaded")

    httpd = HTTPServer(("127.0.0.1", opts.port), EmbeddingHandler)
    banner = [
        f"ANE Embedding server running on port {opts.port}",
        f"API: POST http://127.0.0.1:{opts.port}/api/embeddings",
        ' Body: {"model": "...", "prompt": "..."}',
        ' Response: {"embedding": [...]}',
    ]
    for line in banner:
        print(line)
    httpd.serve_forever()
if __name__ == "__main__":
main()

View File

@@ -1,176 +1,281 @@
#!/opt/homebrew/bin/python3.11
"""
Momentry Dashboard — Flask web app
Reads pipeline status + Redis + system health on demand
Momentry Dashboard v2 — Direct DB/Qdrant/Redis queries, no subprocess blocking
"""
import json, os, subprocess, sys, platform
import json, os, platform, time
from pathlib import Path
from flask import Flask, jsonify, render_template_string
import psycopg2
import urllib.request
app = Flask(__name__)
PROJECT = Path(__file__).resolve().parent.parent
# System role detection
HOSTNAME = platform.node()
IS_M5 = "MacBook" in HOSTNAME or "M5" in HOSTNAME
IS_M5 = "MacBook" in HOSTNAME
SYSTEM_ROLE = "M5 (MacBook Pro)" if IS_M5 else "M4 (Mac Mini)"
SYSTEM_COLOR = "#58a6ff" if IS_M5 else "#f0883e"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
def run_status_json():
"""Run pipeline_status.py and return parsed JSON"""
r = subprocess.run(
[sys.executable, str(PROJECT / "scripts/pipeline_status.py"), "--json"],
capture_output=True, text=True, timeout=30,
)
return json.loads(r.stdout)
COLLECTIONS = [
"momentry_dev_v1", "momentry_dev_stories", "momentry_dev_voice",
"momentry_dev_faces", "sentence_story", "sentence_summary",
"momentry_dev_rule1_v2",
]
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
def run_redis_info():
"""Fetch key Redis metrics"""
result = {}
def db_query(sql, params=None):
    """Run one query on a fresh connection and return all rows.

    Args:
        sql: SQL text, using ``%s`` placeholders for parameters.
        params: optional sequence of parameter values.

    Returns:
        The list of row tuples from ``fetchall()``.

    Raises:
        Whatever psycopg2 raises on connection or query failure; the
        connection is closed in every case.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        cur.execute(sql, params or ())
        return cur.fetchall()
    finally:
        # Close even when execute/fetch raises, so failed dashboard polls
        # do not leak connections.
        conn.close()
def qdrant_get(path):
try:
r = subprocess.run(
["redis-cli", "-a", "accusys", "INFO", "all"],
capture_output=True, text=True, timeout=5,
)
for line in r.stdout.split("\n"):
line = line.strip()
if ":" not in line or line.startswith("#"):
continue
k, v = line.split(":", 1)
if k in ("total_system_memory_human", "used_memory_human",
"used_memory_peak_human", "total_connections_received",
"total_commands_processed", "keyspace_hits", "keyspace_misses",
"connected_clients", "uptime_in_seconds"):
result[k] = v if not v.endswith("_human") else v
result["keyspace_hits"] = int(result.get("keyspace_hits", 0))
result["keyspace_misses"] = int(result.get("keyspace_misses", 0))
hit_rate = result["keyspace_hits"] / max(result["keyspace_hits"] + result["keyspace_misses"], 1) * 100
result["hit_rate_pct"] = round(hit_rate, 1)
except Exception as e:
result["error"] = str(e)
# Get momentry keys
try:
r = subprocess.run(
["redis-cli", "-a", "accusys", "KEYS", "momentry_dev:*"],
capture_output=True, text=True, timeout=5,
)
keys = [k for k in r.stdout.strip().split("\n") if k]
result["momentry_keys"] = len(keys)
# Sample a few interesting keys
sample = {}
for k in keys:
if k.endswith(":health") or k.endswith(":job:") or ":processor:" in k:
pass
if len(sample) >= 5:
break
result["key_sample"] = keys[:10]
resp = urllib.request.urlopen(f"{QDRANT_URL}{path}", timeout=5)
return json.loads(resp.read())
except:
result["momentry_keys"] = 0
result["key_sample"] = []
return None
def qdrant_count(col):
    """Return the ``points_count`` of collection *col*.

    Yields 0 when the response lacks the field and -1 when ``qdrant_get``
    returned nothing usable.
    """
    reply = qdrant_get(f"/collections/{col}")
    if not reply:
        return -1
    result = reply.get("result", {})
    return result.get("points_count", 0)
def qdrant_dim(col):
    """Return the configured vector size of collection *col*, or ``"?"``
    when it is missing or ``qdrant_get`` returned nothing usable."""
    reply = qdrant_get(f"/collections/{col}")
    if not reply:
        return "?"
    params = reply.get("result", {}).get("config", {}).get("params", {})
    vectors = params.get("vectors", {})
    return vectors.get("size", "?")
@app.route("/")
def index():
    """Serve the dashboard page rendered from TEMPLATE."""
    page = render_template_string(TEMPLATE, SYSTEM_ROLE=SYSTEM_ROLE)
    return page


@app.route("/api/all")
def api_all():
    """Return system identity plus status, Qdrant, DB and process info as one JSON document."""
    system = {"hostname": HOSTNAME, "role": SYSTEM_ROLE, "is_m5": IS_M5}
    payload = {
        "system": system,
        "status": get_status(),
        "qdrant": get_qdrant_info(),
        "db": get_db_info(),
        "processes": get_processes(),
    }
    return jsonify(payload)


@app.route("/api/status")
def api_status():
    """Return the pipeline checklist as JSON."""
    status = get_status()
    return jsonify(status)


@app.route("/api/qdrant")
def api_qdrant():
    """Return per-collection Qdrant info as JSON."""
    collections = get_qdrant_info()
    return jsonify(collections)


@app.route("/api/db")
def api_db():
    """Return database info as JSON."""
    db_info = get_db_info()
    return jsonify(db_info)


@app.route("/api/processes")
def api_processes():
    """Return the watched background scripts' process info as JSON."""
    procs = get_processes()
    return jsonify(procs)
def _segment_count(path):
    """Return the number of entries under ``segments`` in the JSON file at
    *path*, or 0 when the file is missing or unreadable."""
    try:
        with open(path) as fh:
            return len(json.load(fh).get("segments", []))
    except (OSError, ValueError, AttributeError):
        return 0


def _uuid_count(sql):
    """Run a single-value count query parameterized by UUID; 0 on any failure
    so one unavailable table does not take the dashboard down."""
    try:
        return db_query(sql, (UUID,))[0][0]
    except Exception:
        return 0


def get_status():
    """Pipeline checklist built from direct file, DB and Qdrant checks.

    Returns:
        Dict with ``uuid``, overall ``passed`` flag, the list of ``stages``
        (each ``{"name", "passed", "detail", "elapsed"}``), ``checked_at``
        (UTC timestamp), ``total_elapsed`` seconds and ``health``.
    """
    t0 = time.time()
    stages = []

    def add(name, passed, detail):
        stages.append({"name": name, "passed": passed, "detail": detail, "elapsed": 0.0})

    # 1. ASR file
    asr_segs = _segment_count(f"/Users/accusys/momentry/output_dev/{UUID}.asr.json")
    add("ASR", asr_segs > 0, f"{asr_segs} seg")

    # 2. ASRX file
    asrx_segs = _segment_count(f"/Users/accusys/momentry/output_dev/{UUID}.asrx.json")
    add("ASRX", asrx_segs > 0, f"{asrx_segs} seg")

    # 3. Sentence chunks
    cnt = _uuid_count("SELECT count(*) FROM dev.chunks WHERE file_uuid=%s AND chunk_type='sentence'")
    add("Sentence", cnt > 0, f"{cnt} chunks")

    # 4. Vectorization (Qdrant); qdrant_count returns -1 when Qdrant is unreachable
    v1 = qdrant_count("momentry_dev_v1")
    add("Vectorize", v1 > 0, f"{v1} Qdrant")

    # 5. Face traces
    traces = _uuid_count("SELECT count(DISTINCT trace_id) FROM dev.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL")
    faces = _uuid_count("SELECT count(*) FROM dev.face_detections WHERE file_uuid=%s AND trace_id IS NOT NULL")
    add("FaceTrace", traces > 0, f"{traces} traces, {faces} faces")

    # 6. TKG
    nodes = _uuid_count("SELECT count(*) FROM dev.tkg_nodes WHERE file_uuid=%s")
    edges = _uuid_count("SELECT count(*) FROM dev.tkg_edges WHERE file_uuid=%s")
    add("TKG", nodes > 0, f"{nodes} nodes, {edges} edges")

    # 7. Trace chunks
    tc = _uuid_count("SELECT count(*) FROM dev.chunks WHERE file_uuid=%s AND chunk_type='trace'")
    add("TraceChunks", tc > 0, f"{tc} chunks")

    # 8. Phase 1 release: directory must exist and contain RELEASE_INFO.txt
    p1 = PROJECT / "release" / "phase1" / "latest"
    p1_ok = p1.exists() and (p1 / "RELEASE_INFO.txt").exists()
    p1_size = sum(f.stat().st_size for f in p1.rglob("*") if f.is_file()) // (1024*1024) if p1.exists() else 0
    add("Phase1", p1_ok, f"{p1_size}MB")

    all_passed = all(s["passed"] for s in stages)
    return {
        "uuid": UUID,
        "passed": all_passed,
        "stages": stages,
        "checked_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "total_elapsed": round(time.time() - t0, 1),
        "health": get_health(),
    }
def get_health():
    """Collect best-effort host and service health figures.

    Every probe is independent: a failing probe leaves its keys at the
    fallback value (or absent, for memory and disk) and never raises.

    Returns:
        Dict that may contain ``cpu_load_1m``, ``cpu_load_5m``,
        ``memory_used_mb``, ``disk_use_pct``, ``disk_avail``,
        ``gpu_available`` and always contains ``services`` (booleans for
        postgresql, qdrant, embedding, llm).
    """
    # Imported locally: this module does not import subprocess at file level.
    import subprocess

    h = {}
    try:
        load = os.getloadavg()
        h["cpu_load_1m"] = round(load[0], 1)
        h["cpu_load_5m"] = round(load[1], 1)
    except Exception:
        h["cpu_load_1m"] = h["cpu_load_5m"] = -1

    try:
        # Sum of the RSS column (as printed by ps) over all processes.
        out = subprocess.run(["ps", "-A", "-o", "rss="], capture_output=True, text=True, timeout=5).stdout
        rss = sum(int(line.strip()) for line in out.strip().split("\n") if line.strip())
        h["memory_used_mb"] = rss // 1024 if rss else 0
    except Exception:
        pass

    try:
        d = subprocess.run(["df", "-h", "/Users/accusys/momentry/output_dev"],
                           capture_output=True, text=True, timeout=5).stdout.strip().split("\n")[-1].split()
        h["disk_use_pct"] = d[4] if len(d) > 4 else "?"
        h["disk_avail"] = d[3] if len(d) > 3 else "?"
    except Exception:
        pass

    try:
        import torch
        h["gpu_available"] = torch.backends.mps.is_available()
    except Exception:
        h["gpu_available"] = False

    services = {"postgresql": False, "qdrant": False, "embedding": False, "llm": False}
    try:
        conn = psycopg2.connect(DB_URL)
        conn.close()
        services["postgresql"] = True
    except Exception:
        pass
    try:
        services["qdrant"] = qdrant_get("/collections") is not None
    except Exception:
        pass
    try:
        with urllib.request.urlopen("http://localhost:11436/health", timeout=3) as resp:
            services["embedding"] = resp.status == 200
    except Exception:
        pass
    try:
        # The LLM probe is a real one-token completion request.
        req = urllib.request.Request(LLM_URL,
            data=json.dumps({"model":"google_gemma-4-26B-A4B-it-Q5_K_M.gguf","messages":[{"role":"user","content":"ping"}],"max_tokens":1}).encode(),
            headers={"Content-Type":"application/json"}, method="POST")
        with urllib.request.urlopen(req, timeout=3) as resp:
            services["llm"] = resp.status == 200
    except Exception:
        pass
    h["services"] = services
    return h
def get_qdrant_info():
    """Return one ``{"name", "points", "dim"}`` dict per entry in COLLECTIONS.

    A collection for which ``qdrant_get`` returned nothing usable is
    reported with ``points`` -1 and ``dim`` "?".
    """
    summary = []
    for name in COLLECTIONS:
        reply = qdrant_get(f"/collections/{name}")
        if not reply:
            summary.append({"name": name, "points": -1, "dim": "?"})
            continue
        details = reply.get("result", {})
        vectors = details.get("config", {}).get("params", {}).get("vectors", {})
        entry = {
            "name": name,
            "points": details.get("points_count", 0),
            "dim": vectors.get("size", "?"),
        }
        summary.append(entry)
    return summary
def run_db_info():
"""Fetch DB metrics + current processing file"""
psql = "/Users/accusys/pgsql/18.3/bin/psql"
cmd = [psql, "-U", "accusys", "-d", "momentry", "-t", "-A"]
def get_db_info():
result = {}
try:
r = subprocess.run(cmd + ["-c", """
rows = db_query("""
SELECT 'videos', count(*) FROM dev.videos
UNION ALL SELECT 'chunks', count(*) FROM dev.chunks
UNION ALL SELECT 'face_detections', count(*) FROM dev.face_detections
UNION ALL SELECT 'identities', count(*) FROM dev.identities
UNION ALL SELECT 'tkg_nodes', count(*) FROM dev.tkg_nodes
UNION ALL SELECT 'tkg_edges', count(*) FROM dev.tkg_edges
"""], capture_output=True, text=True, timeout=10)
for line in r.stdout.strip().split("\n"):
if not line.strip() or "|" not in line:
continue
parts = line.split("|")
result[parts[0].strip()] = int(parts[1])
""")
for r in rows:
result[r[0]] = r[1]
except:
pass
# 所有檔案的 pipeline 進度(依檔案名去重,取最新)
try:
r = subprocess.run(cmd + ["-c", """
SELECT DISTINCT ON (v.file_name)
v.file_uuid, v.file_name, v.status,
COALESCE(v.processing_status::text, '{}') as pstatus,
m.status as job_status
FROM dev.videos v
LEFT JOIN dev.monitor_jobs m ON m.uuid = v.file_uuid
WHERE v.status IN ('completed', 'processing')
OR m.status IS NOT NULL
ORDER BY v.file_name, GREATEST(
COALESCE(v.registration_time::timestamp, '1970-01-01'),
COALESCE(m.updated_at, '1970-01-01')
) DESC
LIMIT 20
"""], capture_output=True, text=True, timeout=10)
seen_names = set()
files = []
for line in r.stdout.strip().split("\n"):
if not line.strip() or "|" not in line:
continue
parts = line.split("|", 4)
if len(parts) < 5:
continue
name = parts[1].strip()
if name in seen_names:
continue
seen_names.add(name)
f = {"uuid": parts[0].strip(), "name": name,
"status": parts[2].strip(), "job_status": parts[4].strip()}
try:
ps = json.loads(parts[3]) if parts[3] and parts[3] != '{}' else {}
f["progress"] = ps.get("progress", {})
except:
f["progress"] = {}
files.append(f)
result["files"] = files
except Exception as e:
result["files_error"] = str(e)
return result
@app.route("/")
def index():
    """Serve the dashboard page by rendering the inline TEMPLATE string."""
    page = render_template_string(TEMPLATE)
    return page
@app.route("/api/status")
def api_status():
    """Return the result of run_status_json() as a JSON response."""
    payload = run_status_json()
    return jsonify(payload)
@app.route("/api/redis")
def api_redis():
    """Return the result of run_redis_info() as a JSON response."""
    payload = run_redis_info()
    return jsonify(payload)
@app.route("/api/db")
def api_db():
    """Return the result of run_db_info() as a JSON response."""
    payload = run_db_info()
    return jsonify(payload)
@app.route("/api/all")
def api_all():
    """Return system identity, pipeline status, Redis info and DB info in one JSON response."""
    system_info = {"hostname": HOSTNAME, "role": SYSTEM_ROLE, "is_m5": IS_M5}
    # Sections are gathered in the same order as the keys appear in the payload.
    payload = {
        "system": system_info,
        "status": run_status_json(),
        "redis": run_redis_info(),
        "db": run_db_info(),
    }
    return jsonify(payload)
def get_processes():
    """Report whether the known background scripts are currently running.

    Each script name is looked up with ``pgrep -f``; for the first matching
    PID the elapsed run time is read with ``ps -o etime=``.

    Returns:
        dict mapping each script file name to ``{"pid": int, "elapsed": str}``
        when a matching process was found, or to ``None`` when none was found
        or the lookup failed.
    """
    import subprocess

    scripts = ["clean_sentence_text.py", "generate_sentence_summaries.py"]
    result = {}
    for s in scripts:
        try:
            r = subprocess.run(["pgrep", "-f", s], capture_output=True, text=True, timeout=3)
            pids = [ln.strip() for ln in r.stdout.splitlines() if ln.strip()]
            if not pids:
                result[s] = None
                continue
            r2 = subprocess.run(["ps", "-o", "etime=", "-p", pids[0]],
                                capture_output=True, text=True, timeout=3)
            result[s] = {"pid": int(pids[0]), "elapsed": r2.stdout.strip()}
        except (OSError, subprocess.SubprocessError, ValueError):
            # Best effort: a missing pgrep/ps binary, a timeout or unparsable
            # output is reported as "not running". Unlike a bare ``except:``
            # this lets KeyboardInterrupt/SystemExit propagate.
            result[s] = None
    return result
TEMPLATE = """<!DOCTYPE html>
<html lang="zh-TW">
@@ -193,10 +298,6 @@ th, td { padding: 8px 12px; text-align: left; border-bottom: 1px solid #21262d;
th { color: #8b949e; font-weight: 600; }
.pass { color: #3fb950; font-weight: bold; }
.fail { color: #f85149; font-weight: bold; }
.badge { display: inline-block; padding: 2px 8px; border-radius: 12px; font-size: 12px; font-weight: 600; }
.badge-ok { background: #1b3a1b; color: #3fb950; }
.badge-err { background: #3a1b1b; color: #f85149; }
.badge-warn { background: #3a321b; color: #d29922; }
.stat-value { font-size: 28px; font-weight: 700; }
.stat-label { font-size: 12px; color: #8b949e; margin-top: 4px; }
.stat-card { background: #0d1117; border: 1px solid #30363d; border-radius: 6px; padding: 16px; text-align: center; }
@@ -204,275 +305,167 @@ th { color: #8b949e; font-weight: 600; }
.last-updated { color: #8b949e; font-size: 13px; }
button { background: #238636; color: white; border: none; padding: 8px 20px; border-radius: 6px; cursor: pointer; font-size: 14px; }
button:hover { background: #2ea043; }
.progress-bar { height: 6px; background: #21262d; border-radius: 3px; margin-top: 8px; }
.progress-fill { height: 100%; border-radius: 3px; background: #238636; transition: width 0.5s; }
#error { display: none; background: #3a1b1b; border: 1px solid #f85149; border-radius: 6px; padding: 12px; margin-bottom: 16px; color: #f85149; font-size: 13px; }
@media (max-width: 768px) { .col { min-width: 100%; } }
</style>
</head>
<body>
<div class="container">
<div class="refresh-bar">
<h1>Momentry Dashboard <span style="font-size:14px;background:#1f2937;color:#{{'58a6ff' if IS_M5 else 'f0883e'}};padding:4px 12px;border-radius:12px;margin-left:8px;vertical-align:middle">🤖 {{ SYSTEM_ROLE }}</span></h1>
<div class="refresh-bar">
<h1>Momentry Dashboard <span id="roleBadge" style="font-size:14px;background:#1f2937;padding:4px 12px;border-radius:12px;margin-left:8px">\U0001F4BB {{ SYSTEM_ROLE }}</span></h1>
<div style="display:flex;align-items:center;gap:8px">
<span class="last-updated" id="lastUpdated"></span>
<button onclick="copyStatus()" style="background:#1f6feb;padding:6px 14px;font-size:13px">📋 Copy</button>
<button onclick="fetchAll()" style="background:#238636;padding:6px 14px;font-size:13px">⟳ Refresh</button>
<span class="last-updated" id="lastUpdated">\u2014</span>
<button onclick="load()" style="background:#238636;padding:6px 14px;font-size:13px">\u27F3 Refresh</button>
</div>
</div>
<div id="error"></div>
<div class="row">
<div class="col">
<div class="section">
<h2> Pipeline Checklist</h2>
<table id="checklist"><tr><td colspan="3">Loading...</td></tr></table>
<h2>\u2705 Pipeline Checklist</h2>
<table id="checklist"><tr><td>Loading...</td></tr></table>
</div>
</div>
<div class="col">
<div class="section">
<h2>💻 System Health</h2>
<h2>\U0001F4BB System Health</h2>
<div id="health" style="font-size:14px">Loading...</div>
</div>
<div class="section">
<h2>🛠 Services</h2>
<h2>\U0001F6E0 Services</h2>
<div id="services" style="font-size:14px">Loading...</div>
</div>
</div>
</div>
<div class="section" id="fileProgressSection">
<h2>📁 Pipeline Progress</h2>
<div id="fileProgress" style="font-size:14px">Loading...</div>
<div class="row">
<div class="col">
<div class="section">
<h2>\U0001F4CA Qdrant Collections</h2>
<div id="qdrant" style="font-size:14px">Loading...</div>
</div>
</div>
<div class="col">
<div class="section">
<h2>\u2699\uFE0F Background Processes</h2>
<div id="processes" style="font-size:14px">Loading...</div>
</div>
</div>
</div>
<div class="row">
<div class="col">
<div class="section">
<h2>⚡ Redis</h2>
<div id="redis" style="font-size:14px">Loading...</div>
</div>
</div>
<div class="col">
<div class="section">
<h2>🗄 Database</h2>
<h2>\U0001F4DB Database</h2>
<div id="db" style="font-size:14px">Loading...</div>
</div>
</div>
</div>
<div class="section">
<h2>⏱ Processor Timing</h2>
<table id="timing"><tr><td>Loading...</td></tr></table>
</div>
</div>
<script>
async function fetchAll() {
async function load() {
const ts = new Date().toISOString().slice(11,19);
document.getElementById('lastUpdated').textContent = '🔄 ' + ts;
document.getElementById("lastUpdated").textContent = "\U0001F504 " + ts;
document.getElementById("error").style.display = "none";
try {
const all = await (await fetch('/api/all')).json();
_lastData = all;
const status = all.status;
renderChecklist(status.job);
renderHealth(status.health);
renderTiming(status.health?.processors);
if (all.redis) renderRedis(all.redis);
if (all.db) { renderDb(all.db); renderFileProgress(all.db); }
document.getElementById('lastUpdated').textContent = ' ' + ts;
const resp = await fetch("/api/all");
if (!resp.ok) throw new Error("HTTP " + resp.status);
const d = await resp.json();
renderChecklist(d.status);
renderHealth(d.status.health);
renderQdrant(d.qdrant);
renderProcesses(d.processes);
renderDb(d.db);
document.getElementById("lastUpdated").textContent = "\u2705 " + ts;
} catch(e) {
document.getElementById('checklist').innerHTML = '<tr><td class="fail">Error: ' + e.message + '</td></tr>';
// Fallback: try separate endpoints
try {
const s = await (await fetch('/api/status')).json(); renderChecklist(s.job); renderHealth(s.health); renderTiming(s.health?.processors);
} catch(e2) {}
try {
const r = await (await fetch('/api/redis')).json(); renderRedis(r);
} catch(e2) {}
try {
const d = await (await fetch('/api/db')).json(); renderDb(d); renderFileProgress(d);
} catch(e2) {}
showError(e.message);
document.getElementById("lastUpdated").textContent = "\u274C " + ts;
}
}
function renderChecklist(job) {
if (!job || !job.stages) return;
let h = '<tr><th>Stage</th><th>Status</th><th>Detail</th><th>Time</th></tr>';
for (const s of job.stages) {
const cls = s.passed ? 'pass' : 'fail';
const icon = s.passed ? '' : '';
h += '<tr><td>' + s.name + '</td><td class="' + cls + '">' + icon + '</td><td>' + s.detail + '</td><td>' + s.elapsed + 's</td></tr>';
function showError(msg) {
document.getElementById("error").innerHTML = "\u26A0\uFE0F " + msg;
document.getElementById("error").style.display = "block";
}
function renderChecklist(status) {
const job = status || {};
const stages = job.stages || [];
let h = "<tr><th>Stage</th><th>Status</th><th>Detail</th></tr>";
for (const s of stages) {
h += "<tr><td>" + s.name + '</td><td class="' + (s.passed ? "pass" : "fail") + '">' + (s.passed ? "\u2705" : "\u274C") + "</td><td>" + s.detail + "</td></tr>";
}
const totalCls = job.passed ? 'pass' : 'fail';
h += '<tr style="font-weight:bold;border-top:2px solid #30363d"><td>TOTAL</td><td class="' + totalCls + '">' + (job.passed ? '' : '') + '</td><td></td><td>' + job.total_elapsed + 's</td></tr>';
document.getElementById('checklist').innerHTML = h;
h += '<tr style="font-weight:bold;border-top:2px solid #30363d"><td>TOTAL</td><td class="' + (job.passed ? "pass" : "fail") + '">' + (job.passed ? "\u2705" : "\u274C") + "</td><td></td></tr>";
document.getElementById("checklist").innerHTML = h;
}
function renderHealth(h) {
if (!h) return;
const memPct = h.memory_used_mb ? (h.memory_used_mb / 49152 * 100).toFixed(1) : '?';
const memBar = Math.min(parseFloat(memPct), 100);
const barColor = memBar > 85 ? '#f85149' : memBar > 70 ? '#d29922' : '#3fb950';
document.getElementById('health').innerHTML = `
<div class="row">
<div class="col"><div class="stat-card"><div class="stat-value">${h.cpu_load_1m ?? '?'}</div><div class="stat-label">CPU Load (1m)</div></div></div>
<div class="col"><div class="stat-card"><div class="stat-value">${memPct}%</div><div class="stat-label">Memory</div><div class="progress-bar"><div class="progress-fill" style="width:${memBar}%;background:${barColor}"></div></div></div></div>
<div class="col"><div class="stat-card"><div class="stat-value">${h.disk_use_pct ?? '?'}</div><div class="stat-label">Disk Used</div></div></div>
</div>
`;
let cards = '<div class="row">';
cards += '<div class="col"><div class="stat-card"><div class="stat-value">' + (h.cpu_load_1m ?? "?") + '</div><div class="stat-label">CPU Load (1m)</div></div></div>';
const memPct = h.memory_used_mb ? (h.memory_used_mb / 49152 * 100).toFixed(1) : "?";
cards += '<div class="col"><div class="stat-card"><div class="stat-value">' + memPct + '%</div><div class="stat-label">Memory</div></div></div>';
cards += '<div class="col"><div class="stat-card"><div class="stat-value">' + (h.disk_use_pct ?? "?") + '</div><div class="stat-label">Disk</div></div></div>';
cards += "</div>";
document.getElementById("health").innerHTML = cards;
const svc = h.services || {};
document.getElementById('services').innerHTML = Object.entries(svc).map(([k,v]) =>
'<span style="margin-right:16px">' + (v ? '' : '') + ' ' + k + '</span>'
).join('');
let svcHtml = "";
for (const [k, v] of Object.entries(svc)) {
svcHtml += '<span style="margin-right:16px">' + (v ? "\u2705" : "\u274C") + " " + k + "</span>";
}
document.getElementById("services").innerHTML = svcHtml;
}
function renderTiming(procs) {
function renderQdrant(cols) {
if (!cols) return;
let h = "<table><tr><th>Collection</th><th>Points</th><th>Dim</th></tr>";
for (let i = 0; i < cols.length; i++) {
const c = cols[i];
h += "<tr><td>" + c.name + "</td><td>" + (c.points >= 0 ? Number(c.points).toLocaleString() : "err") + "</td><td>" + c.dim + "</td></tr>";
}
h += "</table>";
document.getElementById("qdrant").innerHTML = h;
}
function renderProcesses(procs) {
if (!procs) return;
let h = '<tr><th>Processor</th><th>Duration</th></tr>';
for (const p of procs) {
const d = p.duration_secs;
const dur = d ? (d < 60 ? d + 's' : d < 3600 ? Math.floor(d/60) + 'm ' + (d%60) + 's' : Math.floor(d/3600) + 'h ' + Math.floor((d%3600)/60) + 'm') : 'running';
h += '<tr><td>' + p.name + '</td><td>' + dur + '</td></tr>';
}
document.getElementById('timing').innerHTML = h;
}
function renderRedis(r) {
if (!r) return;
let h = '<div class="row">';
const cards = [
{k:'used_memory_human', l:'Memory Used'},
{k:'total_system_memory_human', l:'System Memory'},
{k:'connected_clients', l:'Clients'},
{k:'hit_rate_pct', l:'Hit Rate'},
{k:'momentry_keys', l:'Momentry Keys'},
{k:'uptime_in_seconds', l:'Uptime'},
];
for (const c of cards) {
let v = r[c.k] ?? '';
if (c.k === 'uptime_in_seconds' && typeof v === 'number') {
v = v > 86400 ? Math.round(v/86400) + 'd' : Math.round(v/3600) + 'h';
let h = "<table><tr><th>Script</th><th>Status</th></tr>";
for (const name in procs) {
const info = procs[name];
if (info) {
h += "<tr><td>" + name + "</td><td>\u25B6 running " + info.elapsed + "</td></tr>";
} else {
h += '<tr style="color:#8b949e"><td>' + name + "</td><td>\u23F3 idle</td></tr>";
}
if (c.k === 'hit_rate_pct' && typeof v === 'number') v = v.toFixed(1) + '%';
h += '<div class="col"><div class="stat-card"><div class="stat-value">' + v + '</div><div class="stat-label">' + c.l + '</div></div></div>';
}
h += '</div>';
if (r.key_sample && r.key_sample.length) {
h += '<div style="margin-top:12px;font-size:12px;color:#8b949e">Recent keys: ' + r.key_sample.slice(0,6).join(', ') + '</div>';
}
document.getElementById('redis').innerHTML = h;
}
const PIPELINE_STAGES = ['cut','scene','asr','asrx','yolo','ocr','face','pose','visual_chunk','story'];
function renderFileProgress(d) {
const el = document.getElementById('fileProgress');
if (!d || !d.files || d.files.length === 0) {
el.innerHTML = '<div style="color:#8b949e">No files found</div>';
return;
}
let h = '<table><tr><th>File</th><th>Status</th>';
for (const s of PIPELINE_STAGES) h += '<th style="font-size:11px">' + s.slice(0,4) + '</th>';
h += '</tr>';
for (const f of d.files) {
const name = f.name.length > 50 ? f.name.slice(0,50) + '...' : f.name;
const statusIcon = f.job_status === 'running' ? '▶️' : f.job_status === 'pending' ? '' : f.status === 'completed' ? '' : '';
const progress = f.progress || {};
h += '<tr><td style="max-width:300px;overflow:hidden;text-overflow:ellipsis;white-space:nowrap" title="' + f.name + '">' + name + '</td>'
+ '<td>' + statusIcon + ' ' + (f.job_status || f.status) + '</td>';
for (const s of PIPELINE_STAGES) {
const ps = progress[s.toUpperCase()] || {};
const st = ps.status || '';
let icon = '';
if (st === 'completed') icon = '';
else if (st === 'running') icon = '';
else if (st === 'failed') icon = '';
h += '<td style="text-align:center;font-size:13px">' + icon + '</td>';
}
h += '</tr>';
}
h += '</table>';
el.innerHTML = h;
h += "</table>";
document.getElementById("processes").innerHTML = h;
}
function renderDb(d) {
if (!d) return;
const rows = ['videos','chunks','face_detections','identities','tkg_nodes','tkg_edges'];
const keys = ["videos","chunks","face_detections","identities","tkg_nodes","tkg_edges"];
let h = '<div class="row">';
for (const key of rows) {
const v = d[key] ?? 0;
h += '<div class="col"><div class="stat-card"><div class="stat-value">' + v.toLocaleString() + '</div><div class="stat-label">' + key.replace(/_/g,' ') + '</div></div></div>';
for (let i = 0; i < keys.length; i++) {
const v = d[keys[i]] ?? 0;
h += '<div class="col"><div class="stat-card"><div class="stat-value">' + Number(v).toLocaleString() + '</div><div class="stat-label">' + keys[i].replace(/_/g," ") + '</div></div></div>';
}
h += '</div>';
document.getElementById('db').innerHTML = h;
h += "</div>";
document.getElementById("db").innerHTML = h;
}
let _lastData = null;
function copyStatus() {
if (!_lastData) { alert('No data loaded yet'); return; }
const d = _lastData;
const job = d.status?.job;
const h = d.status?.health;
const db = d.db;
const r = d.redis;
let lines = [];
lines.push('Momentry Pipeline Status');
lines.push('='.repeat(50));
lines.push('System: ' + (d.system?.role || '?') + ' | ' + new Date().toISOString().slice(0,19).replace('T',' '));
lines.push('');
if (job?.stages) {
lines.push('── Checklist ──');
for (const s of job.stages) {
lines.push(' ' + (s.passed ? '' : '') + ' ' + s.name.padEnd(14) + s.detail);
}
lines.push(' ' + (job.passed ? '' : '') + ' TOTAL'.padEnd(14) + job.total_elapsed + 's');
lines.push('');
}
if (h) {
lines.push('── Health ──');
lines.push(' CPU: ' + (h.cpu_load_1m ?? '?') + ' Memory: ' + (h.memory_used_mb ?? '?') + 'MB GPU: ' + (h.gpu_available ? '' : ''));
if (h.services) {
lines.push(' Services: ' + Object.entries(h.services).map(([k,v]) => k + '=' + (v ? '' : '')).join(' '));
}
lines.push('');
}
if (r) {
lines.push('── Redis ──');
lines.push(' Keys: ' + (r.momentry_keys ?? '?') + ' Hit Rate: ' + (r.hit_rate_pct ?? '?') + '% Uptime: ' + (r.uptime_in_seconds ? Math.round(r.uptime_in_seconds/3600)+'h' : '?'));
lines.push('');
}
if (db) {
lines.push('── Database ──');
const tbls = ['videos','chunks','face_detections','identities','tkg_nodes','tkg_edges'];
for (const t of tbls) {
if (db[t] !== undefined) lines.push(' ' + t + ': ' + db[t].toLocaleString());
}
if (db.files) {
lines.push('');
lines.push('── Files ──');
for (const f of db.files) {
lines.push(' ' + (f.job_status === 'running' ? '▶️' : f.job_status === 'pending' ? '' : f.status === 'completed' ? '' : '') + ' ' + f.name.slice(0,60));
}
}
lines.push('');
}
const text = lines.join('\n');
navigator.clipboard.writeText(text).then(() => {
const btn = event.target;
const orig = btn.textContent;
btn.textContent = '✅ Copied!';
setTimeout(() => btn.textContent = orig, 2000);
}).catch(() => alert('Copy failed'));
}
fetchAll();
setInterval(fetchAll, 15000);
load();
setInterval(load, 30000);
</script>
</body>
</html>"""
if __name__ == "__main__":
port = int(os.environ.get("DASHBOARD_PORT", 5050))
print(f"Momentry Dashboard: http://0.0.0.0:{port}")
app.run(host="0.0.0.0", port=port, debug=False)
print(f"Momentry Dashboard v2: http://0.0.0.0:{port}")
app.run(host="0.0.0.0", port=port, threaded=True)

View File

@@ -0,0 +1,324 @@
#!/opt/homebrew/bin/python3.11
"""
Dense Scan Traces - Re-scan frame-by-frame for traces with < 4 detections.
Flow:
1. Query face_detections for traces with < 4 rows for a file_uuid
2. For each short trace:
a. Extract video segment (ffmpeg)
b. Run face_processor.py with --sample-interval 1
c. Match new detections to trace by embedding similarity
d. Insert new rows into face_detections
Usage:
python dense_scan_traces.py --file-uuid <uuid> [--video-path <path>]
"""
import sys
import os
import json
import argparse
import subprocess
import time
import tempfile
import numpy as np
import psycopg2
import psycopg2.extras
from typing import List, Dict, Optional
# PostgreSQL connection string; override with the DATABASE_URL environment variable.
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
# Schema name interpolated into every SQL statement in this script; override with MOMENTRY_DB_SCHEMA.
SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
# Pipeline output directory; override with MOMENTRY_OUTPUT_DIR.
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
# Directory containing this script; face_processor.py is expected to sit next to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
FACE_PROCESSOR = os.path.join(SCRIPT_DIR, "face_processor.py")
# Interpreter used to launch face_processor.py as a subprocess.
PYTHON_BIN = "/opt/homebrew/bin/python3.11"
# Traces with fewer detections than this are re-scanned (default for --min-detections).
MIN_DETECTIONS = 4
def get_conn():
    """Open and return a new psycopg2 connection to DB_URL; the caller closes it."""
    connection = psycopg2.connect(DB_URL)
    return connection
def get_video_path(file_uuid: str) -> Optional[str]:
    """Look up ``file_path`` in the videos table for *file_uuid*.

    Returns:
        The stored path, or ``None`` when no row matches the UUID.
    """
    query = f"SELECT file_path FROM {SCHEMA}.videos WHERE file_uuid = %s"
    connection = get_conn()
    cursor = connection.cursor()
    try:
        cursor.execute(query, (file_uuid,))
        record = cursor.fetchone()
        if not record:
            return None
        return record[0]
    finally:
        cursor.close()
        connection.close()
def get_short_traces(file_uuid: str, min_det: int = MIN_DETECTIONS) -> List[Dict]:
    """List the traces of *file_uuid* that have fewer than *min_det* detection rows.

    Returns:
        One dict per trace with keys ``trace_id``, ``cnt``, ``start_frame``
        and ``end_frame`` (min/max frame_number of the trace), ordered by
        trace_id.
    """
    query = f"""
        SELECT trace_id, COUNT(*) as cnt,
               MIN(frame_number) as start_frame,
               MAX(frame_number) as end_frame
        FROM {SCHEMA}.face_detections
        WHERE file_uuid = %s AND trace_id IS NOT NULL
        GROUP BY trace_id
        HAVING COUNT(*) < %s
        ORDER BY trace_id
        """
    connection = get_conn()
    cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        cursor.execute(query, (file_uuid, min_det))
        traces = []
        for record in cursor.fetchall():
            traces.append(dict(record))
        return traces
    finally:
        cursor.close()
        connection.close()
def get_trace_embeddings(file_uuid: str, trace_id: int) -> List[Dict]:
    """Fetch the detections of one trace that already carry an embedding.

    Returns:
        One dict per row with ``frame_number``, ``x``, ``y``, ``width``,
        ``height`` and ``embedding``, ordered by frame_number.
    """
    query = f"""
        SELECT frame_number, x, y, width, height, embedding
        FROM {SCHEMA}.face_detections
        WHERE file_uuid = %s AND trace_id = %s AND embedding IS NOT NULL
        ORDER BY frame_number
        """
    connection = get_conn()
    cursor = connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        cursor.execute(query, (file_uuid, trace_id))
        references = []
        for record in cursor.fetchall():
            references.append(dict(record))
        return references
    finally:
        cursor.close()
        connection.close()
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (list, tuple or numpy array of numbers).
        b: Second vector, same length as *a*.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a float, or ``0.0`` when either input
        is ``None``, empty, or has zero norm.
    """
    if a is None or b is None:
        return 0.0
    # Explicit size checks instead of ``not a``: truth-testing a numpy array
    # with more than one element raises ValueError.
    v1 = np.asarray(a, dtype=float)
    v2 = np.asarray(b, dtype=float)
    if v1.size == 0 or v2.size == 0:
        return 0.0
    n1, n2 = np.linalg.norm(v1), np.linalg.norm(v2)
    if n1 == 0 or n2 == 0:
        return 0.0
    return float(np.dot(v1, v2) / (n1 * n2))
def extract_video_segment(video_path: str, start_frame: int, end_frame: int, output_path: str, fps: float = 59.94):
    """Extract frames ``start_frame..end_frame`` (inclusive) of a video with ffmpeg.

    The previous implementation combined an input-side ``-ss`` seek with
    ``select=between(n, start_frame, end_frame)``. After an input seek the
    filter's frame counter ``n`` restarts at 0, so absolute frame numbers
    selected the wrong part of the video. This version seeks to the first
    wanted frame and limits the output with ``-frames:v`` instead.

    Args:
        video_path: Source video file.
        start_frame: First frame to keep (0-based, absolute in the source).
        end_frame: Last frame to keep (inclusive).
        output_path: Destination file for the extracted segment.
        fps: Frame rate used to convert frame numbers to seek time; assumes a
            constant frame rate.

    Raises:
        ValueError: If ``fps`` is not positive or ``end_frame < start_frame``.
        subprocess.CalledProcessError: If ffmpeg exits non-zero; ``stderr``
            holds ffmpeg's error output as bytes.
        subprocess.TimeoutExpired: If ffmpeg runs longer than 120 seconds.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps}")
    frame_count = end_frame - start_frame + 1
    if frame_count <= 0:
        raise ValueError(f"empty frame range: {start_frame}-{end_frame}")
    # Seek half a frame early so float rounding can never skip the first
    # wanted frame; accurate seek then starts output exactly on start_frame.
    seek_time = max(0.0, (start_frame - 0.5) / fps)
    cmd = [
        "ffmpeg", "-y",
        "-ss", f"{seek_time:.6f}",
        "-i", video_path,
        "-frames:v", str(frame_count),
        "-vsync", "0",
        "-an", output_path,
    ]
    # Capture stderr so a CalledProcessError carries ffmpeg's message.
    subprocess.run(cmd, check=True, timeout=120, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)
def match_new_detections(new_face_json: str, ref_embeddings: List[Dict],
                         similarity_threshold: float = 0.7) -> List[Dict]:
    """Keep the dense-scan faces whose embedding resembles one of the trace's references.

    Args:
        new_face_json: Path of the JSON file written by the face processor;
            read as ``{"frames": [{"frame": int, "faces": [...]}]}``.
        ref_embeddings: Reference rows, each with an ``embedding`` entry.
        similarity_threshold: Minimum best cosine similarity for a face to be kept.

    Returns:
        One dict per accepted face with its frame number, box, confidence,
        embedding and the best similarity found. Empty when there are no
        references.
    """
    with open(new_face_json) as handle:
        payload = json.load(handle)

    if not ref_embeddings:
        return []

    raw_frames = payload.get("frames")
    frame_entries = raw_frames if isinstance(raw_frames, list) else []

    accepted = []
    for entry in frame_entries:
        frame_no = entry.get("frame", 0)
        for face in entry.get("faces", []):
            vector = face.get("embedding")
            if not vector:
                continue

            # Highest similarity against any reference (never below 0.0)
            top_score = 0.0
            for ref in ref_embeddings:
                score = cosine_similarity(vector, ref["embedding"])
                if score > top_score:
                    top_score = score

            if not (top_score >= similarity_threshold):
                continue

            accepted.append({
                "frame_number": frame_no,
                "x": face["x"],
                "y": face["y"],
                "width": face["width"],
                "height": face["height"],
                "confidence": face.get("confidence", 0.5),
                "embedding": vector,
                "similarity": top_score,
            })
    return accepted
def insert_detections(file_uuid: str, trace_id: int, detections: List[Dict]):
    """Insert detections for a trace, skipping frames the trace already has.

    All rows are written in one transaction. On any error the transaction is
    rolled back, the error is printed, and 0 is returned.

    Returns:
        Number of rows inserted (0 for an empty input or on error).
    """
    if not detections:
        return 0

    exists_sql = (
        f"SELECT 1 FROM {SCHEMA}.face_detections "
        f"WHERE file_uuid=%s AND frame_number=%s AND trace_id=%s"
    )
    insert_sql = f"""
        INSERT INTO {SCHEMA}.face_detections
        (file_uuid, frame_number, face_id, trace_id,
        x, y, width, height, confidence, embedding)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """

    connection = get_conn()
    cursor = connection.cursor()
    try:
        added = 0
        for det in detections:
            # Skip when this trace already has a row for the frame
            cursor.execute(exists_sql, (file_uuid, det["frame_number"], trace_id))
            if cursor.fetchone():
                continue
            vector = det.get("embedding") or None
            params = (
                file_uuid, det["frame_number"], None, trace_id,
                det["x"], det["y"], det["width"], det["height"],
                det.get("confidence", 0.5), vector,
            )
            cursor.execute(insert_sql, params)
            added += 1
        connection.commit()
        return added
    except Exception as e:
        connection.rollback()
        print(f" [DENSE] DB error: {e}")
        return 0
    finally:
        cursor.close()
        connection.close()
def dense_scan_trace(file_uuid: str, trace_id: int, video_path: str,
                     start_frame: int, end_frame: int):
    """Re-scan one trace's frame range frame-by-frame and store the new detections.

    The range is padded by 15 frames on each side, extracted to a temporary
    segment, processed by face_processor.py with ``--sample-interval 1``, and
    the resulting faces are matched against the trace's existing embeddings.

    Args:
        file_uuid: UUID of the video the trace belongs to.
        trace_id: Trace to supplement.
        video_path: Path of the source video.
        start_frame: First frame of the existing trace.
        end_frame: Last frame of the existing trace.

    Returns:
        Number of detections inserted; 0 when the trace has no reference
        embeddings, a subprocess fails, or nothing new matches.
    """
    pad = 15
    seg_start = max(0, start_frame - pad)
    seg_end = end_frame + pad

    # Reference embeddings are needed for matching; without them there is
    # nothing to compare against, so skip the expensive extraction.
    refs = get_trace_embeddings(file_uuid, trace_id)
    if not refs:
        return 0

    with tempfile.TemporaryDirectory() as tmpdir:
        # Extract segment
        segment_path = os.path.join(tmpdir, f"seg_{trace_id}.mp4")
        try:
            extract_video_segment(video_path, seg_start, seg_end, segment_path)
        except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as e:
            # stderr may be bytes, str or None depending on how the child ran
            raw = getattr(e, "stderr", None)
            if isinstance(raw, bytes):
                err = raw.decode(errors="replace")
            else:
                err = raw or str(e)
            print(f" [DENSE] ffmpeg failed: {err[:200]}")
            return 0

        # Run face_processor with sample_interval=1
        face_out = os.path.join(tmpdir, f"face_{trace_id}.json")
        try:
            subprocess.run(
                [PYTHON_BIN, FACE_PROCESSOR, segment_path, face_out,
                 "--sample-interval", "1", "--uuid", file_uuid],
                check=True, timeout=120,
                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
            )
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
            print(f" [DENSE] face_processor failed for trace {trace_id}: {e}")
            return 0

        if not os.path.exists(face_out):
            return 0

        # Match while the temp dir (and face_out) still exists
        new_detections = match_new_detections(face_out, refs)

    if not new_detections:
        return 0

    # Map segment-relative frame numbers back to source frame numbers.
    # NOTE(review): the "- 1" assumes face_processor numbers frames from 1 — confirm.
    known_frames = {r["frame_number"] for r in refs}
    adjusted = []
    for d in new_detections:
        d["frame_number"] = seg_start + d["frame_number"] - 1
        if d["frame_number"] not in known_frames:
            adjusted.append(d)

    if not adjusted:
        return 0

    count = insert_detections(file_uuid, trace_id, adjusted)
    print(f" [DENSE] Trace {trace_id}: added {count} new detections (range {seg_start}-{seg_end})")
    return count
def main():
    """Command-line entry point: find short traces of a video and dense-scan each one."""
    parser = argparse.ArgumentParser(description="Dense re-scan for short face traces")
    parser.add_argument("--file-uuid", required=True, help="Video file UUID")
    parser.add_argument("--video-path", help="Video file path (auto-detect if omitted)")
    parser.add_argument("--min-detections", type=int, default=MIN_DETECTIONS,
                        help=f"Minimum detections per trace (default: {MIN_DETECTIONS})")
    parser.add_argument("--dry-run", action="store_true", help="Only list short traces")
    args = parser.parse_args()
    min_det = args.min_detections

    # Resolve the video: an explicit path wins over the database lookup
    video_path = args.video_path or get_video_path(args.file_uuid)
    if not (video_path and os.path.exists(video_path)):
        print(f"[DENSE] Video not found: {video_path}", file=sys.stderr)
        sys.exit(1)
    print(f"[DENSE] Video: {video_path}")

    short_traces = get_short_traces(args.file_uuid, min_det)
    print(f"[DENSE] Traces with < {min_det} detections: {len(short_traces)}")

    if args.dry_run:
        for trace in short_traces:
            print(f" Trace {trace['trace_id']}: {trace['cnt']} detections "
                  f"(frames {trace['start_frame']}-{trace['end_frame']})")
        return

    # Dense-scan each short trace and tally what was added
    started = time.time()
    total_added = 0
    total_traces = 0
    for trace in short_traces:
        added = dense_scan_trace(
            args.file_uuid, trace["trace_id"], video_path,
            trace["start_frame"], trace["end_frame"],
        )
        if added <= 0:
            continue
        total_added += added
        total_traces += 1

    elapsed = time.time() - started
    print(f"\n[DENSE] Done: {total_traces} traces supplemented, "
          f"{total_added} new detections added, {elapsed:.1f}s")
if __name__ == "__main__":
main()

327
scripts/export_file.py Executable file
View File

@@ -0,0 +1,327 @@
#!/opt/homebrew/bin/python3.11
"""
momentry-export — 打包檔案歷程
將單一 file_uuid 的所有產出打包成可攜帶的 tar.gz
Usage:
python3 scripts/export_file.py <uuid> [--output <path>] [--include-video]
Example:
python3 scripts/export_file.py fa182e9c26145b2c1a932f73d1d484e5 --output /tmp/test_export.tar.gz
"""
import sys, os, json, argparse, tarfile, io, time
from pathlib import Path
from datetime import datetime
import psycopg2
import psycopg2.extras
# PostgreSQL connection string; override with the DATABASE_URL environment variable.
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
# Schema name interpolated into the SQL statements of this script; override with MOMENTRY_DB_SCHEMA.
SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
# Directory scanned for "<uuid>.*.json" output files; override with MOMENTRY_OUTPUT_DIR.
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
# Tables dumped by export_file(); fetch_table() only returns rows from tables
# that have a "file_uuid" or "uuid" column matching the exported UUID.
# NOTE(review): "api_keys" is in this list — confirm that exporting it is intended.
TABLES = [
"pre_chunks", "chunks", "face_detections",
"processor_results", "processor_versions",
"videos", "api_keys",
]
def get_conn():
    """Open and return a new psycopg2 connection to DB_URL; the caller closes it."""
    connection = psycopg2.connect(DB_URL)
    return connection
def fetch_table(conn, table: str, uuid: str) -> list[dict]:
    """Fetch the rows of *table* that reference *uuid*.

    The table's columns are read from ``information_schema.columns``; the
    rows are filtered on ``file_uuid`` when that column exists, otherwise on
    ``uuid``.

    Args:
        conn: Open psycopg2 connection.
        table: Table name inside SCHEMA (interpolated into the SQL, so it
            must come from trusted code such as TABLES).
        uuid: File UUID to filter on.

    Returns:
        The matching rows as plain dicts; an empty list when the table has
        neither column (or does not exist in the schema).
    """
    # Preference order makes the choice deterministic when a table has both
    # columns, instead of depending on the order the catalog returns rows.
    uuid_columns = ("file_uuid", "uuid")

    # The context manager closes the cursor even if a query raises.
    with conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cur:
        cur.execute(
            "SELECT column_name, data_type FROM information_schema.columns "
            "WHERE table_schema = %s AND table_name = %s",
            (SCHEMA, table),
        )
        present = {c["column_name"] for c in cur.fetchall()}
        uuid_col = next((name for name in uuid_columns if name in present), None)
        if not uuid_col:
            return []

        cur.execute(
            f"SELECT * FROM {SCHEMA}.{table} WHERE {uuid_col} = %s",
            (uuid,),
        )
        return [dict(r) for r in cur.fetchall()]
def fetch_video_row(conn, uuid: str) -> dict | None:
    """Return the videos-table row for *uuid* as a dict, or ``None`` when absent."""
    query = f"SELECT * FROM {SCHEMA}.videos WHERE file_uuid = %s"
    cursor = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    cursor.execute(query, (uuid,))
    record = cursor.fetchone()
    cursor.close()
    if not record:
        return None
    return dict(record)
def serialize_value(v):
    """Convert a database value into something ``json.dumps`` can encode.

    Args:
        v: A value as returned by the database driver.

    Returns:
        * ISO-8601 text for ``datetime``, ``date`` and ``time`` values;
        * a list of ints for binary values (``bytes``, ``bytearray`` and
          ``memoryview`` — psycopg2 returns bytea columns as ``memoryview``);
        * *v* unchanged for everything else, including lists.
    """
    from datetime import date, time as dt_time

    if isinstance(v, (datetime, date, dt_time)):
        return v.isoformat()
    if isinstance(v, (bytes, bytearray, memoryview)):
        # bytes() first so a memoryview of any item format yields byte values
        return list(bytes(v))
    return v
def export_file(uuid: str, output_path: str, include_video: bool = False):
    """Export everything recorded for one file UUID into a tar.gz archive.

    The archive contains ``manifest.json``, ``data/video.json``, one
    ``data/<table>.json`` per non-empty table in TABLES, face-detection
    metadata, the identities referenced by the file, the ``<uuid>.*.json``
    files from OUTPUT_DIR under ``output/``, and optionally the original
    video under ``original/``.

    The archive is streamed to ``<output_path>.partial`` and renamed on
    success, so it is never held in memory (the optional video can be large)
    and a failed export leaves no truncated file at *output_path*.

    Args:
        uuid: File UUID to export.
        output_path: Destination path of the tar.gz file.
        include_video: Also pack the original video file when it exists.

    Returns:
        True on success; False when the UUID is not in the videos table or
        when the job is not completed and no core output exists.
    """
    t0 = time.time()
    print(f"[EXPORT] Exporting {uuid}...")
    conn = get_conn()
    partial_path = f"{output_path}.partial"
    try:
        # Check the latest job status first
        cur = conn.cursor()
        cur.execute(
            f"SELECT status FROM {SCHEMA}.monitor_jobs WHERE uuid = %s ORDER BY id DESC LIMIT 1",
            (uuid,),
        )
        row = cur.fetchone()
        job_status = row[0] if row else "unknown"
        cur.close()
        if job_status == "completed":
            print(f" [EXPORT] Job status: ✅ {job_status}")
        elif job_status == "failed":
            print(f" [EXPORT] ⚠️ Job status: ❌ {job_status} (仍可匯出部分資料)")
        elif job_status == "running":
            print(f" [EXPORT] ⚠️ Job status: ⏳ {job_status} (處理中,產出不完全)")
        else:
            print(f" [EXPORT] ⚠️ Job status: {job_status}")

        video = fetch_video_row(conn, uuid)
        if not video:
            print(f"[EXPORT] UUID {uuid} not found in videos table")
            return False

        # History completeness check
        print(f"\n ── 歷程完整性檢查 ──")
        # Job status
        completeness = {"job": job_status == "completed"}

        # Processors: each expected processor should be "completed"
        cur = conn.cursor()
        cur.execute(
            f"SELECT processor, status FROM {SCHEMA}.processor_results "
            f"WHERE file_uuid = %s ORDER BY processor",
            (uuid,),
        )
        procs = {r[0]: r[1] for r in cur.fetchall()}
        cur.close()
        expected = ["asr", "asrx", "cut", "face", "ocr", "pose", "yolo"]
        for p in expected:
            st = procs.get(p, "missing")
            completeness[f"proc_{p}"] = st == "completed"
        completeness["processors"] = f"{sum(1 for p in expected if procs.get(p)=='completed')}/{len(expected)}"

        # Output JSON files (globbed once and reused when packing below)
        output_dir = Path(OUTPUT_DIR)
        json_files = sorted(output_dir.glob(f"{uuid}.*.json"))
        completeness["output_jsons"] = len(json_files)

        # Face detections
        cur = conn.cursor()
        cur.execute(
            f"SELECT count(*) FROM {SCHEMA}.face_detections WHERE file_uuid = %s",
            (uuid,),
        )
        completeness["face_detections"] = cur.fetchone()[0]
        cur.close()

        # Chunks (Rule 1)
        cur = conn.cursor()
        cur.execute(
            f"SELECT count(*) FROM {SCHEMA}.chunks WHERE file_uuid = %s",
            (uuid,),
        )
        completeness["chunks"] = cur.fetchone()[0]
        cur.close()

        # Print completeness report.
        # NOTE(review): all three icon literals are empty in this copy of the
        # file — confirm whether glyphs were lost and restore them if so.
        for k, v in completeness.items():
            icon = "" if v is True else ("" if v is False else "")
            print(f" {icon} {k}: {v}")

        # Decide if export is viable
        has_core_data = completeness["output_jsons"] > 0 or completeness["face_detections"] > 0 or completeness["chunks"] > 0
        if not has_core_data and job_status != "completed":
            print(f"\n ⛔ 歷程不完整,無核心產出,中止匯出")
            return False
        print(f" ─────────────────\n")

        with tarfile.open(partial_path, mode="w:gz") as tar:
            # 1. Manifest
            manifest = {
                "exported_at": datetime.now().isoformat(),
                "version": "1.0",
                "file_uuid": uuid,
                "file_name": video.get("file_name"),
                "duration": video.get("duration"),
                "fps": float(video.get("fps") or 0),
                "width": video.get("width"),
                "height": video.get("height"),
                "total_frames": video.get("total_frames"),
                "include_video": include_video,
                "completeness": {k: str(v) if not isinstance(v, (bool, int, str)) else v
                                 for k, v in completeness.items()},
                "merge_policy": {
                    "identities": "merge_by_name",
                    "description": "匯入時 identity 依名稱比對,已存在則合併(保留 target 的 identity_id不存在則新增",
                },
            }
            _add_json(tar, "manifest.json", manifest)

            # 2. Video metadata (videos table row)
            _add_json(tar, "data/video.json", video)

            # 3. DB tables
            for table in TABLES:
                rows = fetch_table(conn, table, uuid)
                if rows:
                    _add_json(tar, f"data/{table}.json", rows)
                    print(f" [EXPORT] {table}: {len(rows)} rows")
                else:
                    print(f" [EXPORT] {table}: (empty)")

            # 4. Face detection metadata (embedding column left out)
            cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
            cur.execute(
                f"SELECT id, file_uuid, frame_number, trace_id, x, y, width, height, "
                f"confidence, identity_id FROM {SCHEMA}.face_detections WHERE file_uuid = %s",
                (uuid,),
            )
            fd_rows = [dict(r) for r in cur.fetchall()]
            cur.close()
            if fd_rows:
                _add_json(tar, "data/face_detections_meta.json", fd_rows)
                print(f" [EXPORT] face_detections (meta): {len(fd_rows)} rows")
            else:
                print(f" [EXPORT] face_detections: (empty)")

            # 5. Identity-related data
            cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
            try:
                # Collect every identity_id referenced by this file_uuid
                cur.execute(
                    f"SELECT DISTINCT identity_id FROM {SCHEMA}.face_detections "
                    f"WHERE file_uuid = %s AND identity_id IS NOT NULL",
                    (uuid,),
                )
                identity_ids = [r["identity_id"] for r in cur.fetchall()]
                if identity_ids:
                    # identities table
                    placeholders = ",".join(["%s"] * len(identity_ids))
                    cur.execute(
                        f"SELECT * FROM {SCHEMA}.identities WHERE id IN ({placeholders})",
                        identity_ids,
                    )
                    ident_rows = [dict(r) for r in cur.fetchall()]
                    _add_json(tar, "data/identities.json", ident_rows)
                    print(f" [EXPORT] identities: {len(ident_rows)} rows")

                    # identity_bindings
                    cur.execute(
                        f"SELECT * FROM {SCHEMA}.identity_bindings "
                        f"WHERE identity_id IN ({placeholders})",
                        identity_ids,
                    )
                    bind_rows = [dict(r) for r in cur.fetchall()]
                    if bind_rows:
                        _add_json(tar, "data/identity_bindings.json", bind_rows)
                        print(f" [EXPORT] identity_bindings: {len(bind_rows)} rows")

                    # file_identities (the table may not exist)
                    try:
                        cur.execute(
                            f"SELECT * FROM {SCHEMA}.file_identities WHERE file_uuid = %s",
                            (uuid,),
                        )
                        fi_rows = [dict(r) for r in cur.fetchall()]
                        if fi_rows:
                            _add_json(tar, "data/file_identities.json", fi_rows)
                            print(f" [EXPORT] file_identities: {len(fi_rows)} rows")
                    except psycopg2.Error:
                        # A failed statement aborts the transaction; roll back
                        # so the connection stays usable.
                        conn.rollback()
                else:
                    print(f" [EXPORT] identities: (none bound to this file)")
            finally:
                cur.close()

            # 6. Output JSON files
            for jf in json_files:
                arcname = f"output/{jf.name}"
                tar.add(str(jf), arcname=arcname)
                print(f" [EXPORT] output/{jf.name} ({jf.stat().st_size / 1024:.0f}KB)")
            print(f" [EXPORT] output JSONs: {len(json_files)} files")

            # 7. Original video file (optional)
            if include_video and video.get("file_path"):
                src = video["file_path"]
                if os.path.exists(src):
                    tar.add(src, arcname="original/" + os.path.basename(src))
                    print(f" [EXPORT] original video: {src}")
                else:
                    print(f" [WARN] Video file not found: {src}")

        # Publish the finished archive under its final name
        os.replace(partial_path, output_path)
    finally:
        conn.close()
        # Never leave a truncated archive behind after a failure
        if os.path.exists(partial_path):
            os.remove(partial_path)

    size_mb = os.path.getsize(output_path) / 1e6
    elapsed = time.time() - t0
    print(f"\n[EXPORT] Done: {output_path} ({size_mb:.1f}MB, {elapsed:.1f}s)")
    return True
def _add_json(tar: tarfile.TarFile, arcname: str, data):
    """Serialize *data* to JSON and store it in *tar* as member *arcname*.

    The payload is pretty-printed with a 2-space indent, keeps non-ASCII
    characters unescaped, and falls back to ``str()`` for values the JSON
    encoder cannot serialize itself. The member's mtime is the current time.
    """
    payload = json.dumps(
        data,
        indent=2,
        ensure_ascii=False,
        default=str,
    ).encode()
    member = tarfile.TarInfo(name=arcname)
    member.mtime = int(time.time())
    # The declared size must equal the number of bytes handed to addfile().
    member.size = len(payload)
    tar.addfile(member, fileobj=io.BytesIO(payload))
def main():
    """Parse the command line, run the export, and exit 0 on success or 1 on failure."""
    cli = argparse.ArgumentParser(description="Export file processing history")
    cli.add_argument("uuid", help="File UUID to export")
    cli.add_argument(
        "--output", "-o",
        default=None,
        help="Output tar.gz path (default: {uuid}.tar.gz)",
    )
    cli.add_argument(
        "--include-video",
        action="store_true",
        help="Include original video file in export",
    )
    opts = cli.parse_args()

    # Fall back to "<uuid>.tar.gz" in the current directory when no path is given.
    destination = opts.output if opts.output else f"{opts.uuid}.tar.gz"
    if export_file(opts.uuid, destination, opts.include_video):
        sys.exit(0)
    sys.exit(1)


if __name__ == "__main__":
    main()

114
scripts/fix_asr_text.py Normal file
View File

@@ -0,0 +1,114 @@
#!/opt/homebrew/bin/python3.11
"""
Redo ASR word-timestamp mapping correctly.
Save words first, then map to fine segments with independent scanning.
"""
import json, sys, os, time, subprocess, tempfile, shutil
from faster_whisper import WhisperModel
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"

# Seconds an entry may extend past a fine segment's end and still be assigned to it.
END_SLACK = 0.05


def _load_json(path):
    """Parse the JSON file at *path*, closing the handle afterwards."""
    with open(path) as fh:
        return json.load(fh)


def _transcribe_words(video_path):
    """Transcribe *video_path* with word timestamps and return the entries.

    Audio is extracted with ffmpeg (16 kHz, mono, s16) into a temporary
    directory that is removed even when ffmpeg or the model fails. The result
    holds one dict per word plus one dict per Whisper segment flagged with
    ``"_seg": True``, which serves as a coarser fallback.
    """
    tmp_dir = tempfile.mkdtemp(prefix="asr_fix_")
    try:
        wav_path = os.path.join(tmp_dir, "audio.wav")
        subprocess.run(["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
                        "-ar", "16000", "-ac", "1", "-sample_fmt", "s16", wav_path],
                       check=True, capture_output=True, timeout=300)
        print("Loading model...")
        model = WhisperModel("small", device="cpu", compute_type="int8")
        print("Transcribing with word_timestamps...")
        t0 = time.time()
        segments, _ = model.transcribe(
            wav_path, beam_size=5, vad_filter=True,
            vad_parameters={"min_silence_duration_ms": 500},
            word_timestamps=True
        )
        entries = []
        # Iterate inside the try block so the WAV file still exists while
        # the segments are being read.
        for seg in segments:
            if seg.words:
                for w in seg.words:
                    wt = w.word.strip()
                    if wt:
                        entries.append({"word": wt, "start": w.start, "end": w.end})
            # Also save segment-level as fallback
            entries.append({"word": seg.text.strip(), "start": seg.start, "end": seg.end, "_seg": True})
        elapsed = time.time() - t0
        print(f" {len(entries)} entries in {elapsed:.1f}s")
        return entries
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)


def _texts_within(entries, seg_start, seg_end):
    """Return the text of every entry lying inside [seg_start, seg_end + END_SLACK]."""
    picked = []
    for entry in entries:
        if entry["start"] >= seg_start and entry["end"] <= seg_end + END_SLACK:
            picked.append(entry["word"])
        elif entry["start"] > seg_end:
            break  # entries are sorted by time, nothing later can match
    return picked


print("Load fine segments...")
fine = _load_json(f"{BASE}/{UUID}.asrx_fine.json")
fine_segs = fine["segments"]
print(f"{len(fine_segs)} segments")

# Reuse a previous transcription when present; only extract audio and load
# the Whisper model when a new transcription is actually needed.
words_file = f"{BASE}/{UUID}.words.json"
if os.path.exists(words_file):
    print("Loading saved words...")
    words = _load_json(words_file)
else:
    words = _transcribe_words(VIDEO)
    with open(words_file, "w") as fh:
        json.dump(words, fh)

# Separate word-level and segment-level
word_entries = [w for w in words if not w.get("_seg")]
seg_entries = [w for w in words if w.get("_seg")]
print(f"Word-level: {len(word_entries)}, Segment-level: {len(seg_entries)}")

# Map: for each fine segment, find ALL word entries within its time range
print("Mapping words to segments...")
assigned = 0
for si, fs in enumerate(fine_segs):
    fstart = fs["start_time"]
    fend = fs["end_time"]
    # Use word-level entries first (more precise), then fall back to segment-level.
    seg_words = _texts_within(word_entries, fstart, fend)
    if not seg_words:
        seg_words = _texts_within(seg_entries, fstart, fend)
    text = " ".join(seg_words) if seg_words else ""
    fs["text"] = text
    if text:
        assigned += 1
    if (si + 1) % 500 == 0:
        print(f" {si+1}/{len(fine_segs)}")
print(f"Segments with text: {assigned}/{len(fine_segs)}")

# Fix empty segments: use original ASR text when the boundaries match exactly
asr = _load_json(f"{BASE}/{UUID}.asr.json")
asr_segs = asr["segments"]
asr_bounds = {(s['start'], s['end']): s['text'] for s in asr_segs}
for fs in fine_segs:
    if not fs.get('text', '').strip():
        fs['text'] = asr_bounds.get((fs['start_time'], fs['end_time']), "")
with_text = sum(1 for fs in fine_segs if fs.get('text', '').strip())
print(f"After fallback: {with_text}/{len(fine_segs)} with text")

# Save; create the metadata dict if the input file did not carry one.
fine.setdefault("_asr_meta", {})["word_file"] = words_file
with open(f"{BASE}/{UUID}.asrx_fine.json", "w") as fh:
    json.dump(fine, fh, indent=2)
print("Saved")

View File

@@ -0,0 +1,142 @@
#!/opt/homebrew/bin/python3.11
"""
Grounding DINO Base vs Large comparison test.
Both use Swin-B backbone; Large trained on 7 datasets vs Base's 3.
"""
import json, os, sys, time, cv2, torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/gdino_comparison"
LARGE_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# (seconds into the video, label used in result keys)
TIMEPOINTS = [
    (2646, "2646s"), (3188, "3188s"), (3697, "3697s"), (5341, "5341s"),
    (5461, "5461s"), (6309, "6309s"), (6377, "6377s"), (6479, "6479s"),
]
PROMPTS = ["gun", "pistol", "rifle", "weapon"]

cap = cv2.VideoCapture(VIDEO)
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0  # fall back to 25 when no frame rate is reported


def get_frame(t_sec):
    """Seek to *t_sec* seconds and return that frame, or None if it cannot be read."""
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    ret, frame = cap.read()
    return frame if ret else None


def _result_key(label, prompt):
    """Build the key under which detections for one (timepoint, prompt) pair are stored."""
    return f"{label}_prompt-{prompt}"


def _best_detection(dets, label):
    """Return ``(prompt, score)`` of the top-scoring detection at *label*, or None."""
    best = None
    for p in PROMPTS:
        for dd in dets.get(_result_key(label, p), []):
            # Strict ">" keeps the first of equal scores.
            if best is None or dd["score"] > best[1]:
                best = (p, dd["score"])
    return best


models = {
    "base": {"path": "IDEA-Research/grounding-dino-base", "label": "Base (3 datasets)"},
    "large": {"path": LARGE_PATH, "label": "Large (7 datasets)"},
}
all_results = {}
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Device: {device}")

for model_name, model_info in models.items():
    print(f"\n{'='*60}")
    print(f"Loading {model_info['label']} ({model_name})...")
    print(f"{'='*60}")
    t_load = time.time()
    processor = AutoProcessor.from_pretrained(model_info["path"])
    model = AutoModelForZeroShotObjectDetection.from_pretrained(model_info["path"]).to(device)
    load_time = time.time() - t_load
    print(f" Loaded in {load_time:.1f}s")

    model_dets = {}
    t0 = time.time()
    for t_sec, label in TIMEPOINTS:
        frame = get_frame(t_sec)
        if frame is None:
            continue
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        for prompt in PROMPTS:
            inputs = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            target = torch.tensor([img.size[::-1]])
            dets = processor.post_process_grounded_object_detection(
                outputs, threshold=0.05, target_sizes=target
            )[0]
            det_list = []
            for i in range(len(dets["boxes"])):
                det_list.append({
                    "bbox": [round(v, 1) for v in dets["boxes"][i].tolist()],
                    "score": round(dets["scores"][i].item(), 3),
                    "label": prompt,
                })
            model_dets[_result_key(label, prompt)] = det_list
    elapsed = time.time() - t0
    all_results[model_name] = {"elapsed": round(elapsed, 1), "detections": model_dets}
    print(f" Inference: {elapsed:.1f}s")

    del model
    # Only touch the MPS allocator when the model actually ran on MPS.
    if device == "mps":
        torch.mps.empty_cache()

cap.release()

# ========== Summary ==========
print(f"\n{'='*60}")
print("COMPARISON SUMMARY")
print(f"{'='*60}")
n_pairs = len(TIMEPOINTS) * len(PROMPTS)
for model_name in models:
    d = all_results[model_name]
    dets = d["detections"]
    hits = sum(1 for v in dets.values() if v)
    total = sum(len(v) for v in dets.values())
    print(f"\n{model_name.upper()} ({d['elapsed']}s): {hits}/{n_pairs} prompt-timepoint hits, {total} total detections")
    for t_sec, label in TIMEPOINTS:
        best = _best_detection(dets, label)
        if best:
            print(f" {t_sec}s ({(t_sec//60)}:{t_sec%60:02d}): best={best[1]:.3f} (prompt='{best[0]}')")
        else:
            print(f" {t_sec}s: no detections")

# Per-timepoint comparison
print(f"\n{'='*60}")
print("PER-TIMEPOINT COMPARISON")
print(f"{'='*60}")
for t_sec, label in TIMEPOINTS:
    base_best = _best_detection(all_results["base"]["detections"], label)
    large_best = _best_detection(all_results["large"]["detections"], label)
    b_str = f"base={base_best[1]:.3f} ({base_best[0]})" if base_best else "base=no det"
    l_str = f"large={large_best[1]:.3f} ({large_best[0]})" if large_best else "large=no det"
    delta = ""
    if base_best and large_best:
        diff = large_best[1] - base_best[1]
        delta = f" ({'+'if diff>0 else ''}{diff:.3f})"
    print(f" {t_sec}s: {b_str:30s} | {l_str:30s}{delta}")

# Save
with open(os.path.join(OUTPUT_DIR, "comparison_results.json"), "w") as fh:
    json.dump(all_results, fh, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")

343
scripts/gdino_frame_api.py Normal file
View File

@@ -0,0 +1,343 @@
#!/opt/homebrew/bin/python3.11
"""
Grounding DINO Frame API v2 — Zero-shot detection + natural language range search.
Usage:
python3 scripts/gdino_frame_api.py # Start server (port 5051)
curl http://localhost:5051/detect -d '{"time":5461,"prompt":"gun"}'
curl http://localhost:5051/search -d '{"query":"find the gun","range":"0-6780"}'
"""
import json, os, sys, time, cv2, torch, re, psycopg2, threading
from PIL import Image, ImageDraw
from flask import Flask, request, jsonify, send_file
from datetime import datetime, timezone
# Flask application; the routes are registered on it further down the file.
app = Flask(__name__)
# Identity values written to dev.resources by register_resource().
RESOURCE_ID = "grounding-dino-v1"
RESOURCE_TYPE = "vision_detector"
CATEGORY = "zero_shot_detection"
# Checkpoint name passed to from_pretrained() in get_model().
MODEL_NAME = "IDEA-Research/grounding-dino-base"
# "mps" when torch reports the MPS backend as available, otherwise "cpu".
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
BASE_DIR = "/Users/accusys/momentry/output_dev"
# Annotated frames are saved here by /detect and served back by /shots/<filename>.
SHOTS_DIR = os.path.join(BASE_DIR, "api_shots")
os.makedirs(SHOTS_DIR, exist_ok=True)
# Connection string handed to psycopg2.connect().
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
# Listening port; can be overridden with the GDINO_API_PORT environment variable.
PORT = int(os.environ.get("GDINO_API_PORT", 5051))
# file_uuid -> video path; find_video() adds entries for paths it discovers.
VIDEO_PATHS = {
"aeed71342a899fe4b4c57b7d41bcb692":
"/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4",
}
# Model and processor singletons, populated on first use by get_model().
_model = None
_processor = None
def register_resource():
    """Register this service as a resource in dev.resources.

    Inserts a row keyed by ``RESOURCE_ID``. When the row already exists, its
    ``status``, ``last_heartbeat`` and ``config`` are refreshed instead.
    Failures are printed rather than raised.
    """
    capabilities = json.dumps({
        "detect": "Single-frame object detection",
        "search": "Time-range search with natural language query",
        "target_formats": ["file_uuid:chunk_id", "file_uuid:trace_id", "file_uuid:chunk_index", "range"],
    })
    # One config document for both the INSERT and the ON CONFLICT update, so a
    # re-registration no longer drops the "host" field.
    config = json.dumps({"port": PORT, "device": DEVICE, "model": MODEL_NAME, "host": "localhost"})
    metadata = json.dumps({"version": "2.0", "docs": "/health"})
    try:
        conn = psycopg2.connect(DB_URL)
        try:
            cur = conn.cursor()
            cur.execute("""
                INSERT INTO dev.resources (resource_id, resource_type, category, capabilities, config, metadata, status, last_heartbeat)
                VALUES (%s, %s, %s, %s::jsonb, %s::jsonb, %s::jsonb, %s, NOW())
                ON CONFLICT (resource_id)
                DO UPDATE SET status = %s, last_heartbeat = NOW(), config = %s::jsonb
            """, (
                RESOURCE_ID, RESOURCE_TYPE, CATEGORY,
                capabilities,
                config,
                metadata,
                "online", "online", config,
            ))
            conn.commit()
            cur.close()
        finally:
            # Close the connection even when the statement fails.
            conn.close()
        print(f"[Resource] Registered as '{RESOURCE_ID}' (type={RESOURCE_TYPE})")
    except Exception as e:
        print(f"[Resource] Registration failed: {e}")
def heartbeat_loop():
    """Update heartbeat every 60 seconds.

    Loops forever, setting ``last_heartbeat = NOW()`` on this service's row in
    ``dev.resources``. A failed update is logged and retried on the next tick.
    """
    while True:
        try:
            conn = psycopg2.connect(DB_URL)
            try:
                cur = conn.cursor()
                cur.execute("UPDATE dev.resources SET last_heartbeat = NOW() WHERE resource_id = %s", (RESOURCE_ID,))
                conn.commit()
                cur.close()
            finally:
                # Do not leak one connection per failed heartbeat.
                conn.close()
        except Exception as e:
            # Best effort: keep the loop alive, but make the failure visible.
            print(f"[Resource] Heartbeat failed: {e}")
        time.sleep(60)
def get_model():
    """Return the shared ``(model, processor)`` pair, loading both on first call."""
    global _model, _processor
    if _model is not None:
        return _model, _processor

    # Imported here rather than at module level, so it only runs on first load.
    from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
    print(f"[GDINO] Loading model on {DEVICE}...")
    started = time.time()
    _processor = AutoProcessor.from_pretrained(MODEL_NAME)
    _model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_NAME).to(DEVICE)
    print(f"[GDINO] Loaded in {time.time()-started:.1f}s")
    return _model, _processor
def find_video(uuid):
    """Return the video path for *uuid*, or None when nothing matches.

    Known paths come from ``VIDEO_PATHS``. Otherwise the demo directory is
    globbed recursively and the first ``.mp4``/``.mov``/``.avi`` match is
    cached in ``VIDEO_PATHS`` and returned.
    """
    try:
        return VIDEO_PATHS[uuid]
    except KeyError:
        pass

    import glob
    base = "/Users/accusys/momentry/var/sftpgo/data/demo"
    video_exts = (".mp4", ".mov", ".avi")

    def first_match(pattern):
        for candidate in glob.glob(pattern, recursive=True):
            if candidate.endswith(video_exts):
                return candidate
        return None

    # NOTE(review): this first pattern does not depend on uuid, so any unknown
    # uuid resolves to a "Charade*" video when one exists. Confirm this is intended.
    found = first_match(f"{base}/**/Charade*")
    if found is None:
        found = first_match(f"{base}/**/*{uuid[:8]}*")
    if found is None:
        return None
    VIDEO_PATHS[uuid] = found
    return found
def resolve_target(target_str):
    """Resolve 'file_uuid:chunk_id' or 'file_uuid:trace_id' to (file_uuid, start_time, end_time).

    The part after the first ``:`` is tried, in order, as an exact
    ``chunk_id``, as a numeric index (``chunk_id = '<uuid>_<n>'``), and as a
    trace id (``chunk_type='trace'`` with ``chunk_id`` ending in ``_trace_<id>``).

    Returns (uuid, start_sec, end_sec, label) or None.
    None is also returned for empty, non-string, or colon-less input, without
    opening a database connection.
    """
    if not isinstance(target_str, str) or ":" not in target_str:
        return None
    # maxsplit=1 with a ":" present always yields exactly two parts.
    uuid, identifier = target_str.split(":", 1)

    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        # Try chunk_id first
        cur.execute("""
            SELECT start_time, end_time, chunk_id FROM dev.chunks
            WHERE file_uuid=%s AND chunk_id=%s LIMIT 1
        """, (uuid, identifier))
        row = cur.fetchone()
        if row:
            return (uuid, float(row[0]), float(row[1]), identifier)

        # Try chunk_index
        if identifier.isdigit():
            cid = f"{uuid}_{identifier}"
            cur.execute("""
                SELECT start_time, end_time, chunk_id FROM dev.chunks
                WHERE file_uuid=%s AND chunk_id=%s LIMIT 1
            """, (uuid, cid))
            row = cur.fetchone()
            if row:
                return (uuid, float(row[0]), float(row[1]), cid)

        # Try trace_id
        if identifier.startswith("trace_") or identifier.isdigit():
            trace_id = identifier.replace("trace_", "")
            cur.execute("""
                SELECT MIN(start_time), MAX(end_time), chunk_id FROM dev.chunks
                WHERE file_uuid=%s AND chunk_type='trace' AND chunk_id LIKE %s
                GROUP BY chunk_id LIMIT 1
            """, (uuid, f"%_trace_{trace_id}"))
            row = cur.fetchone()
            if row:
                return (uuid, float(row[0]), float(row[1]), f"trace_{trace_id}")
        return None
    finally:
        # Close the connection on every exit path, including query errors.
        conn.close()
def parse_query(query):
    """Extract search object from natural language query.

    The query is lower-cased, then leading command phrases ("find ",
    "where is ", ...) and articles are removed, followed by trailing
    punctuation and trailing filler phrases (" in the image", " please", ...).

    Returns the remaining text, or ``""`` when nothing is left or *query* is
    not a string.
    """
    if not isinstance(query, str):
        return ""
    trailing_punct = ".?!,"
    query = query.lower().strip()
    # Direct object name
    articles = ["a ", "an ", "the ", "some ", "any "]
    prefixes = ["find ", "show ", "search ", "where is ", "where are ",
                "looking for ", "detect ", "locate ", "spot ", "scan for "]
    for p in prefixes:
        if query.startswith(p):
            query = query[len(p):]
    for a in articles:
        if query.startswith(a):
            query = query[len(a):]
    # Remove trailing punctuation and extra words
    query = query.rstrip(trailing_punct)
    for suffix in [" in the image", " in this scene", " in the picture",
                   " being held", " in hand", " in frame", " please"]:
        if query.endswith(suffix):
            # Strip again: removing the suffix can expose punctuation,
            # e.g. "gun, please" -> "gun,".
            query = query[: -len(suffix)].rstrip(trailing_punct + " ")
    return query.strip()
def infer_frame(img, prompt, threshold=0.1):
    """Run Grounding DINO on a PIL image. Returns list of detections.

    Each detection is a dict with ``bbox`` (coordinates rounded to 1 decimal),
    ``score`` (rounded to 3 decimals) and ``label`` (the prompt itself).
    """
    model, processor = get_model()
    # The text query is the prompt followed by a period.
    batch = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        raw_outputs = model(**batch)
    reversed_size = img.size[::-1]
    processed = processor.post_process_grounded_object_detection(
        raw_outputs, threshold=threshold, target_sizes=[reversed_size])[0]
    return [
        {
            "bbox": [round(v, 1) for v in box.tolist()],
            "score": round(processed["scores"][pos].item(), 3),
            "label": prompt,
        }
        for pos, box in enumerate(processed["boxes"])
    ]
@app.route("/detect", methods=["POST"])
def detect():
    """Detect objects in a single frame.

    Input: {"uuid","time","prompt","threshold"}

    Saves an annotated copy of the frame under ``SHOTS_DIR`` and returns the
    detections, inference time, and the URL of the saved shot.
    Responds 400 for malformed input or an unreadable frame, 404 when the
    video cannot be found.
    """
    # force=True parses the body as JSON whatever the Content-Type, so the
    # plain `curl -d '{...}'` usage from the module docstring works.
    data = request.get_json(force=True, silent=True) or {}
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400
    uuid = data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692")
    t_sec = data.get("time", 0)
    prompt = data.get("prompt", "gun")
    threshold = data.get("threshold", 0.1)

    if not isinstance(uuid, str) or not isinstance(prompt, str) or not prompt.strip():
        return jsonify({"error": "uuid must be a string and prompt a non-empty string"}), 400
    for field, value in (("time", t_sec), ("threshold", threshold)):
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return jsonify({"error": f"{field} must be a number"}), 400
    if not 0 <= t_sec < float("inf"):  # also rejects NaN
        return jsonify({"error": "time must be a non-negative number of seconds"}), 400

    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404
    cap = cv2.VideoCapture(video)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
        ret, frame = cap.read()
    finally:
        cap.release()
    if not ret:
        return jsonify({"error": f"Cannot read frame at {t_sec}s"}), 400

    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    t0 = time.time()
    detections = infer_frame(img, prompt, threshold)
    infer_ms = (time.time() - t0) * 1000

    draw = ImageDraw.Draw(img)
    for d in detections:
        b = d["bbox"]
        draw.rectangle(b, outline="lime", width=3)
        draw.text((b[0], b[1]-18), f"{d['label']} {d['score']:.2f}", fill="lime")

    def _slug(text):
        # uuid and prompt are client-supplied: keep only filename-safe
        # characters so the shot cannot be written outside SHOTS_DIR.
        return re.sub(r"[^A-Za-z0-9_-]+", "_", text).strip("_") or "x"

    shot_name = f"{_slug(uuid[:8])}_{int(t_sec)}s_{_slug(prompt)}.jpg"
    img.save(os.path.join(SHOTS_DIR, shot_name))
    return jsonify({
        "detections": detections,
        "time_ms": round(infer_ms, 1),
        "n_detections": len(detections),
        "shot_url": f"/shots/{shot_name}",
    })
@app.route("/search", methods=["POST"])
def search():
    """Search across a time range with natural language query.

    Input: {"uuid","target":"file_uuid:chunk_id","query":"find the gun","range":"0-6780","interval":30,"threshold":0.15}
    target: 'file_uuid:chunk_id' or 'file_uuid:trace_id' — resolves to time range automatically
    range: manual time range (used if target not provided)

    One frame is sampled every ``interval`` seconds and the scan stops after
    100 hits. Responds 400 for malformed input, 404 when the target or the
    video cannot be resolved.
    """
    # force=True parses the body as JSON whatever the Content-Type, so the
    # plain `curl -d '{...}'` usage from the module docstring works.
    data = request.get_json(force=True, silent=True) or {}
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400
    uuid = data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692")
    target_str = data.get("target", "")
    query = data.get("query", "find the gun")
    range_str = data.get("range", "0-6780")
    interval = data.get("interval", 30)
    threshold = data.get("threshold", 0.15)

    for field, value in (("interval", interval), ("threshold", threshold)):
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            return jsonify({"error": f"{field} must be a number"}), 400
    if not interval > 0:
        return jsonify({"error": "interval must be greater than 0"}), 400
    if not isinstance(uuid, str) or not isinstance(query, str):
        return jsonify({"error": "uuid and query must be strings"}), 400

    prompt = parse_query(query)
    if not prompt:
        return jsonify({"error": f"Cannot parse query: {query}"}), 400

    # Resolve target → time range
    resolved_label = ""
    if target_str:
        resolved = resolve_target(target_str) if isinstance(target_str, str) else None
        if resolved:
            uuid, range_start, range_end, resolved_label = resolved
        else:
            return jsonify({"error": f"Cannot resolve target: {target_str}"}), 404
    else:
        # Parse manual range
        if not isinstance(range_str, str):
            return jsonify({"error": "range must be a string like '0-6780'"}), 400
        range_start = 0
        range_end = 6780
        if "-" in range_str:
            parts = range_str.split("-")
            try:
                range_start = float(parts[0])
                # An open-ended range ("120-") falls back to the default end.
                range_end = float(parts[1]) if parts[1].strip() else 6780
            except ValueError:
                return jsonify({"error": f"Cannot parse range: {range_str}"}), 400
            if not all(v == v and abs(v) != float("inf") for v in (range_start, range_end)):
                return jsonify({"error": f"Cannot parse range: {range_str}"}), 400

    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404

    cap = cv2.VideoCapture(video)
    hits = []
    scanned = 0
    t_start = time.time()
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # A step of 0 would make range() raise; always advance at least one frame.
        frame_step = max(1, int(interval * fps))
        first_frame = max(0, int(range_start * fps))
        last_frame = min(int(range_end * fps), total_frames)
        for frame_num in range(first_frame, last_frame, frame_step):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                continue
            scanned += 1
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            detections = infer_frame(img, prompt, threshold)
            if detections:
                ts = frame_num / fps
                best = max(d["score"] for d in detections)
                hits.append({
                    "time": round(ts, 1),
                    "time_str": f"{int(ts//60)}:{int(ts%60):02d}.{int((ts%1)*fps):02d}",
                    "frame": frame_num,
                    "detections": detections,
                    "best_score": best,
                })
            if len(hits) >= 100:  # safety limit
                break
    finally:
        # Release the capture even when decoding or inference raises.
        cap.release()
    elapsed = time.time() - t_start
    return jsonify({
        "query": query,
        "object": prompt,
        "target": target_str or None,
        "resolved_target": resolved_label or None,
        "range": f"{range_start:.0f}-{range_end:.0f}",
        "interval_secs": interval,
        # Frames actually decoded and run through the detector.
        "scanned_frames": scanned,
        "hits": hits,
        "n_hits": len(hits),
        "elapsed_secs": round(elapsed, 1),
    })
@app.route("/shots/<filename>")
def serve_shot(filename):
    """Serve a saved screenshot from ``SHOTS_DIR`` as JPEG, or 404 if it is not a file there."""
    # Accept plain file names only. "." and ".." exist on disk but are
    # directories, so an existence check alone would pass them to send_file.
    if filename in (".", "..") or filename != os.path.basename(filename):
        return jsonify({"error": "Not found"}), 404
    path = os.path.join(SHOTS_DIR, filename)
    if not os.path.isfile(path):
        return jsonify({"error": "Not found"}), 404
    return send_file(path, mimetype="image/jpeg")
@app.route("/health")
def health():
    """Return the service's identity and configuration constants as JSON."""
    payload = dict(
        status="ok",
        resource_id=RESOURCE_ID,
        resource_type=RESOURCE_TYPE,
        model=MODEL_NAME,
        device=DEVICE,
        port=PORT,
    )
    return jsonify(payload)
if __name__ == "__main__":
    # Announce the service in dev.resources before serving.
    register_resource()
    # Refresh the heartbeat from a background daemon thread.
    threading.Thread(target=heartbeat_loop, daemon=True).start()
    # Load the model before the server starts accepting requests.
    get_model()
    print(f"[GDINO] Frame API v2: http://0.0.0.0:{PORT}")
    print(f"[GDINO] Resource: {RESOURCE_ID} (type={RESOURCE_TYPE})")
    app.run(host="0.0.0.0", port=PORT, threaded=True)

155
scripts/generate_asr1.py Normal file
View File

@@ -0,0 +1,155 @@
#!/opt/homebrew/bin/python3.11
"""
Generate {uuid}.asr-1.json by comparing asr.json (3417) with DB chunks (4188).
Identifies which ASR segments were split and records corrections.
"""
import json, os, subprocess, sys, time
# Directory containing the psql executable invoked by psql().
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
# Database user and database name passed to psql (-U / -d).
DB_USER = "accusys"
DB_NAME = "momentry"
# Directory from which {UUID}.asr.json is read and to which {UUID}.asr-1.json is written.
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
# file_uuid of the single file this script processes.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
def psql(sql):
    """Run *sql* through the ``psql`` CLI and return its stripped stdout.

    Output is tuples-only (``-t``) and unaligned (``-A``), with fields
    separated by ``chr(31)`` (``-F``).

    Raises:
        RuntimeError: if psql exits with a non-zero status. Without this
            check a failed query would look like an empty result set.
        subprocess.TimeoutExpired: if psql runs longer than 30 seconds.
    """
    r = subprocess.run(
        [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME, "-t", "-A", "-F", chr(31), "-c", sql],
        capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        raise RuntimeError(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}")
    return r.stdout.strip()
def main():
    """Build ``{UUID}.asr-1.json`` from the ASR segments and the DB sentence chunks.

    Every DB sentence chunk is assigned to the ASR segment whose time range
    (±0.1 s) contains it; when several qualify, the segment whose midpoint is
    closest to the chunk's midpoint wins. ASR segments with exactly one chunk
    go to ``kept``; segments with two or more become a ``corrections`` entry
    whose children receive ``new_chunk_id`` values ``"<asr_idx>-NN"``.
    """
    from collections import defaultdict

    t0 = time.time()
    print(f"Loading ASR segments from {UUID}.asr.json...")
    asr_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr.json")
    with open(asr_path) as f:
        asr_data = json.load(f)
    asr_segs = asr_data["segments"]
    print(f" {len(asr_segs)} ASR segments")

    print("Loading DB sentence chunks...")
    raw = psql(
        f"SELECT chunk_index, start_frame, end_frame, start_time, end_time, chunk_id, text_content "
        f"FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='sentence' "
        f"ORDER BY chunk_index"
    )
    rows = [line.split(chr(31)) for line in raw.split("\n") if line.strip()]
    db_chunks = []
    for r in rows:
        db_chunks.append({
            "chunk_index": int(r[0]),
            "start_frame": int(r[1]),
            "end_frame": int(r[2]),
            "start_time": float(r[3]),
            "end_time": float(r[4]),
            "chunk_id": r[5],
            "text_content": r[6] if len(r) > 6 and r[6] else "",
        })
    print(f" {len(db_chunks)} DB chunks")

    # Look chunks up by their chunk_index value, not by list position: the two
    # only coincide when indices are contiguous and start at 0.
    chunk_by_index = {dc["chunk_index"]: dc for dc in db_chunks}

    # For each DB chunk, find the best-matching ASR segment.
    # A DB chunk belongs to ASR segment i if chunk's time range
    # falls WITHIN ASR segment i's time range.
    asr_of_chunk = {}  # chunk_index -> asr_idx
    for dc in db_chunks:
        ct_mid = (dc["start_time"] + dc["end_time"]) / 2
        best_asr = None
        best_dist = None
        for ai, a in enumerate(asr_segs):
            if a["start"] - 0.1 <= dc["start_time"] and dc["end_time"] <= a["end"] + 0.1:
                dist = abs(ct_mid - (a["start"] + a["end"]) / 2)
                # Strictly closer wins, so ties keep the earlier ASR segment.
                if best_asr is None or dist < best_dist:
                    best_asr, best_dist = ai, dist
        if best_asr is not None:
            asr_of_chunk[dc["chunk_index"]] = best_asr
    print(f" Mapped: {len(asr_of_chunk)} / {len(db_chunks)} chunks to ASR segments")
    unmapped = len(db_chunks) - len(asr_of_chunk)
    if unmapped > 0:
        # Chunks without a containing ASR segment appear in neither list.
        print(f" WARNING: {unmapped} chunks matched no ASR segment and are omitted from the output")

    # Group DB chunks by ASR index
    chunks_by_asr = defaultdict(list)
    for ci, ai in asr_of_chunk.items():
        chunks_by_asr[ai].append(ci)

    # Build kept + corrections
    corrections = []
    kept = []
    for ai, child_indices in sorted(chunks_by_asr.items()):
        if len(child_indices) < 2:
            dc = chunk_by_index[child_indices[0]]
            kept.append({
                "chunk_index": ai,
                "start_frame": dc["start_frame"],
                "end_frame": dc["end_frame"],
                "text_content": dc["text_content"],
            })
            continue
        a = asr_segs[ai]
        children = []
        for ci in child_indices:
            dc = chunk_by_index[ci]
            children.append({
                "chunk_id": dc["chunk_id"],
                "start_frame": dc["start_frame"],
                "end_frame": dc["end_frame"],
                "text_content": dc["text_content"],
            })
        children_sorted = sorted(children, key=lambda x: x["start_frame"])
        # Assign new chunk_id format based on chunk_index
        # The first child of parent ASR idx N gets "N-01", second "N-02", etc.
        for si, child in enumerate(children_sorted):
            child["new_chunk_id"] = f"{ai}-{si+1:02d}"
        corrections.append({
            "parent_chunk_index": ai,
            "reason": "split",
            "original": {
                # Seconds are converted to frames with a fixed factor of 24.
                "start_frame": int(a["start"] * 24),
                "end_frame": int(a["end"] * 24),
                "text_content": a["text"],
            },
            "corrected": children_sorted
        })

    total_corrected = sum(len(c["corrected"]) for c in corrections)
    print(f" Kept chunks: {len(kept)}")
    print(f" Corrected chunks: {total_corrected}")
    print(f" Total: {len(kept) + total_corrected} (should be {len(db_chunks)})\n")

    # Write output
    output = {
        "file_uuid": UUID,
        "asr_version": 1,
        "kept": kept,
        "corrections": corrections
    }
    output_path = os.path.join(OUTPUT_DIR, f"{UUID}.asr-1.json")
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nSaved: {output_path} ({os.path.getsize(output_path) / 1024:.0f} KB)")

    # Stats
    split_sizes = {}
    for c in corrections:
        n = len(c["corrected"])
        split_sizes[n] = split_sizes.get(n, 0) + 1
    print(f"\nSplit distribution:")
    for n in sorted(split_sizes):
        print(f" {n} children: {split_sizes[n]} ASR segments → {n * split_sizes[n]} chunks")
    elapsed = time.time() - t0
    print(f"\nElapsed: {elapsed:.1f}s")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,198 @@
#!/opt/homebrew/bin/python3.11
"""
Generate sentence-level summaries using parent story context.
Each sentence gets an LLM summary informed by the parent chunk scene overview.
"""
import json, time, sys, os
from urllib.request import Request, urlopen
import psycopg2
# file_uuid of the single file this script processes.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# Connection string handed to psycopg2.connect().
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
# Base URL of the Qdrant HTTP API; the "sentence_summary" collection is rebuilt there.
QDRANT_URL = "http://localhost:6333"
# Chat-completions endpoint used by call_llm().
LLM_URL = "http://localhost:8082/v1/chat/completions"
# Embeddings endpoint used by call_embed().
EMBED_URL = "http://localhost:11436/v1/embeddings"
# JSON checkpoint file used to resume an interrupted run.
CHECKPOINT = f"/tmp/sentence_summaries_{UUID}.json"
def call_llm(prompt):
    """Send *prompt* as a single user message to the chat endpoint at ``LLM_URL``.

    Returns the stripped content of the first choice, or ``""`` when the
    request or the response parsing fails (the failure is printed).
    """
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 80}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response once it has been read.
        with urlopen(req, timeout=30) as resp:
            data = json.loads(resp.read())
        return data["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Best effort: callers treat "" as "no summary"; log why instead of hiding it.
        print(f" [LLM] request failed: {e}")
        return ""
def call_embed(text):
    """Request an embedding for *text* from the endpoint at ``EMBED_URL``.

    Returns the ``embedding`` value of the first item in the response's
    ``data`` list, or ``None`` when the request or the response parsing
    fails (the failure is printed).
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # The context manager closes the HTTP response once it has been read.
        with urlopen(req, timeout=30) as resp:
            return json.loads(resp.read())["data"][0]["embedding"]
    except Exception as e:
        # Best effort: callers substitute a zero vector for None; log the cause.
        print(f" [EMBED] request failed: {e}")
        return None
def _save_checkpoint(results_by_index):
    """Write every finished result to CHECKPOINT so an interrupted run can resume."""
    ordered = [results_by_index[k] for k in sorted(results_by_index)]
    with open(CHECKPOINT, "w") as fh:
        json.dump({"completed": [r["index"] for r in ordered], "results": ordered}, fh)


print("=== Step 1: Build sentence→parent mapping ===")
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()
    # Get all story chunks with their child_chunk_ids
    cur.execute("""
        SELECT chunk_index, summary_text, child_chunk_ids
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'story'
        ORDER BY chunk_index
    """, (UUID,))
    stories = cur.fetchall()
    print(f"Loaded {len(stories)} story chunks")
    # Get all sentence chunks
    cur.execute("""
        SELECT chunk_index, text_content, metadata->>'new_speaker_name' as speaker
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'sentence'
        ORDER BY chunk_index
    """, (UUID,))
    all_sentences = {r[0]: {"text": r[1], "speaker": r[2]} for r in cur.fetchall()}
    print(f"Loaded {len(all_sentences)} sentence chunks")
finally:
    conn.close()

# Build: sentence_index → (parent_summary, sentence_text, speaker)
sentence_map = {}
unparsed_ids = 0
for story_idx, summary_text, child_ids in stories:
    if not child_ids:
        continue
    for cid in child_ids:
        try:
            child_idx = int(cid.split("_")[-1])
        except ValueError:
            # The child id does not end in "_<integer>"; it cannot be mapped to
            # a sentence index, so skip it instead of aborting the run.
            unparsed_ids += 1
            continue
        if child_idx in all_sentences:
            sentence_map[child_idx] = {
                "parent_summary": summary_text or "",
                "sentence_text": all_sentences[child_idx]["text"] or "",
                "speaker": all_sentences[child_idx]["speaker"] or "Unknown",
            }
if unparsed_ids:
    print(f"Skipped {unparsed_ids} child ids without a numeric suffix")

# Load checkpoint if exists. Only indices whose result is present count as
# completed, so every resumed summary is available for the upload step.
results_by_index = {}
if os.path.exists(CHECKPOINT):
    with open(CHECKPOINT) as f:
        old = json.load(f)
    results_by_index = {r["index"]: r for r in old.get("results", [])}
    print(f"Loaded checkpoint: {len(results_by_index)} already completed")

print("\n=== Step 2: Generate summaries ===")
generated = 0
errors = 0
sorted_indices = sorted(sentence_map.keys())
for i, idx in enumerate(sorted_indices):
    if idx in results_by_index:
        continue
    info = sentence_map[idx]
    parent_summary = info["parent_summary"]
    sent_text = info["sentence_text"]
    speaker = info["speaker"]
    if not parent_summary or not sent_text:
        # Nothing to summarize against: keep the raw text with a zero vector.
        summary = sent_text or ""
        embedding = [0.0] * 768
    else:
        prompt = f"Context: {parent_summary}\nUtterance: {sent_text}\n\nIn one short sentence, explain what the speaker communicates with this line within the context above."
        summary = call_llm(prompt)
        if not summary:
            errors += 1
            summary = sent_text
            embedding = [0.0] * 768
        else:
            embedding = call_embed(summary)
            if embedding is None:
                errors += 1
                embedding = [0.0] * 768
        time.sleep(0.15)
    results_by_index[idx] = {
        "index": idx,
        "chunk_id": f"{UUID}_{idx}",
        "speaker_name": speaker,
        "utterance": sent_text,
        "summary": summary,
        "embedding": embedding,
    }
    generated += 1
    if generated % 50 == 0:
        print(f" [{i+1}/{len(sorted_indices)}] idx={idx} summary_len={len(summary)} errs={errors}")
        _save_checkpoint(results_by_index)
# Persist the tail of the run as well, not only every 50th result.
_save_checkpoint(results_by_index)
print(f"Generated {generated} summaries, {errors} errors")

# All results: resumed from the checkpoint plus generated in this run
all_results = [results_by_index[k] for k in sorted(results_by_index)]
print(f"\nTotal summaries: {len(all_results)}")

print("\n=== Step 3: Update Qdrant sentence_summary ===")
# Delete old collection
req = Request(f"{QDRANT_URL}/collections/sentence_summary", method="DELETE")
try:
    urlopen(req)
    time.sleep(0.5)
except OSError as e:
    # Best effort: the delete may fail (URLError/HTTPError are OSError subclasses).
    print(f" Delete skipped: {e}")
# Recreate
req = Request(f"{QDRANT_URL}/collections/sentence_summary",
              data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
              headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req)
time.sleep(0.5)

# Upload
batch_size = 100
points = []
for r in all_results:
    points.append({
        "id": r["index"] + 1,
        "vector": r["embedding"],
        "payload": {
            "chunk_type": "sentence",
            "uuid": UUID,
            "chunk_id": r["chunk_id"],
            "speaker_name": r["speaker_name"],
            "utterance": r["utterance"],
            "summary": r["summary"],
        }
    })
for start in range(0, len(points), batch_size):
    batch = points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/sentence_summary/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req)
    except Exception as e:
        print(f" Batch {start}: {e}")
    if (start // batch_size) % 5 == 0:
        print(f" Uploaded {start + len(batch)}/{len(points)}")
print(f"Done: {len(points)} points in sentence_summary")

# Verify
resp = json.loads(urlopen(f"{QDRANT_URL}/collections/sentence_summary").read())
info = resp["result"]
print(f"Verified: points={info['points_count']}, dim={info['config']['params']['vectors'].get('size','?')}")

View File

@@ -0,0 +1,161 @@
#!/opt/homebrew/bin/python3.11
"""
Gun Detector Scan — YOLOv8n fine-tuned gun detector on Charade (1963).
Scans at ASR "gun" trigger points + fixed intervals, saves annotated screenshots.
"""
import json, os, sys, time, cv2, re
import numpy as np
from pathlib import Path
from collections import defaultdict
from ultralytics import YOLO
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
MODEL = "/Users/accusys/momentry_core_0.1/models/gun/gun_detector/weights/best.pt"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/gun_detections"
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
CLASS_NAMES = {0: "grenade", 1: "knife", 2: "pistol", 3: "rifle"}
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Load model
print(f"Loading model: {MODEL}")
model = YOLO(MODEL)
print(f"Classes: {model.names}")
# Open video
cap = cv2.VideoCapture(VIDEO)
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"Video: {fps:.1f} fps, {total_frames} frames ({total_frames/fps/60:.1f} min)")
# === Collect scan timepoints ===
print("\n=== Collecting scan timepoints ===")

# Source 1: start times of transcript sentences whose text contains "gun".
import psycopg2
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()
gun_sentence_query = """
SELECT DISTINCT start_time FROM dev.chunks
WHERE file_uuid=%s AND chunk_type='sentence'
AND text_content ILIKE CONCAT('%%', %s, '%%')
ORDER BY start_time
"""
cur.execute(gun_sentence_query, (UUID, 'gun'))
asr_times = []
for (sentence_start,) in cur.fetchall():
    asr_times.append(sentence_start)
conn.close()
print(f"ASR 'gun' mentions: {len(asr_times)} timepoints")

# Source 2: one probe every 60 seconds across the whole video.
video_seconds = int(total_frames / fps)
fixed_times = list(range(0, video_seconds, 60))
print(f"Fixed interval (60s): {len(fixed_times)} timepoints")

# Source 3: the original 5 pistol timestamps (3188, 5461, 6309, 6377, 6479).
original_hits = [3188, 5461, 6309, 6377, 6479]

# Union of all sources, rounded to whole seconds, in ascending order.
all_times = sorted({int(round(t)) for t in asr_times + fixed_times + original_hits})
print(f"Total unique scan points: {len(all_times)}")
print(f"Range: {all_times[0]}s - {all_times[-1]}s")
# === Scan ===
print("\n=== Scanning ===")
results = []
frame_step = 30  # half-width (in frames) of the window scanned around each timepoint
t0 = time.time()
for scan_idx, t_sec in enumerate(all_times):
    # Scan frames around this timepoint
    center_frame = int(t_sec * fps)
    start_frame = max(0, center_frame - frame_step)
    end_frame = min(total_frames, center_frame + frame_step)
    for frame_num in range(start_frame, end_frame + 1, 3):  # every 3rd frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        ret, frame = cap.read()
        if not ret:
            break
        dets = model(frame, conf=0.25, verbose=False)[0]
        ts = frame_num / fps
        # Frame-within-second is taken with an exact modulo on the frame number.
        # The previous int((ts % 1) * fps) truncated float round-off and came out
        # one frame low for many frames (e.g. frame 83 at 25 fps gave 07, not 08).
        time_str = f"{int(ts//60)}:{int(ts%60):02d}.{int(frame_num % fps):02d}"
        for det in dets.boxes.data:
            cls_id = int(det[5])
            conf = float(det[4])
            class_name = CLASS_NAMES.get(cls_id, f"class_{cls_id}")
            # Draw annotation. The same frame buffer is reused for every detection,
            # so a later screenshot of this frame also shows the earlier boxes.
            x1, y1, x2, y2 = map(int, det[:4])
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)
            label = f"{class_name} {conf:.2f}"
            cv2.putText(frame, label, (x1, y1-5), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)
            filename = f"{int(ts)}s_{class_name}_{conf:.3f}.jpg"
            filepath = os.path.join(OUTPUT_DIR, filename)
            cv2.imwrite(filepath, frame, [cv2.IMWRITE_JPEG_QUALITY, 85])
            results.append({
                "timestamp": round(ts, 1),
                "time_str": time_str,
                "frame": frame_num,
                "class": class_name,
                "confidence": round(conf, 3),
                "image": filename,
            })
    # Progress line every 20 scan points.
    if (scan_idx + 1) % 20 == 0:
        elapsed = time.time() - t0
        print(f" [{scan_idx+1}/{len(all_times)}] {len(results)} detections so far [{elapsed:.0f}s]")
cap.release()
print(f"\n=== Scan Complete ===")
print(f"Scan points: {len(all_times)}")
print(f"Total detections: {len(results)}")
# Deduplicate nearby detections (same class within 2 seconds), keeping the
# most confident one of each cluster.
# The comparison is made against the last kept detection OF THE SAME CLASS.
# Comparing only against the last kept detection overall (as before) never
# merged anything when two classes alternated in the time-sorted list.
results.sort(key=lambda r: (r["timestamp"], r["class"]))
deduped = []
last_kept_index = {}  # class name -> index in `deduped` of its latest kept detection
for r in results:
    kept_at = last_kept_index.get(r["class"])
    if kept_at is not None and r["timestamp"] - deduped[kept_at]["timestamp"] < 2:
        if r["confidence"] > deduped[kept_at]["confidence"]:
            deduped[kept_at] = r
    else:
        last_kept_index[r["class"]] = len(deduped)
        deduped.append(r)
print(f"After dedup: {len(deduped)} detections")
# Group by class and print the five most confident detections of each class.
by_class = defaultdict(list)
for r in deduped:
    by_class[r["class"]].append(r)
print(f"\nDetections by class:")
for cls, items in sorted(by_class.items()):
    print(f" {cls}: {len(items)}")
    for r in sorted(items, key=lambda x: -x["confidence"])[:5]:
        print(f" {r['time_str']} conf={r['confidence']:.3f} frame={r['frame']} {r['image']}")

# Check if original 5 were found (a pistol detection within 3 seconds counts).
print(f"\nOriginal 5 pistol timestamps:")
for t in original_hits:
    found = [r for r in deduped if abs(r["timestamp"] - t) < 3 and r["class"] == "pistol"]
    if found:
        best = max(found, key=lambda x: x["confidence"])
        print(f" {t}s: ✅ FOUND conf={best['confidence']:.3f} {best['image']}")
    else:
        print(f" {t}s: ❌ NOT FOUND")

# Save JSON
output = {
    "uuid": UUID,
    "model": str(MODEL),
    "scan_points": len(all_times),
    "total_detections": len(results),
    "after_dedup": len(deduped),
    "detections": sorted(deduped, key=lambda x: x["timestamp"]),
}
json_path = os.path.join(OUTPUT_DIR, "gun_detections.json")
# Context manager so the report is flushed and the handle closed deterministically.
with open(json_path, "w") as json_file:
    json.dump(output, json_file, indent=2)
print(f"\nSaved: {json_path}")
print(f"Images: {OUTPUT_DIR}/")

259
scripts/import_file.py Normal file
View File

@@ -0,0 +1,259 @@
#!/opt/homebrew/bin/python3.11
"""
momentry-import — 匯入檔案歷程封包
將 export_file.py 產出的 tar.gz 匯入到目標 Momentry 系統
Usage:
python3 scripts/import_file.py <package.tar.gz> [--schema <schema>]
Example:
python3 scripts/import_file.py /tmp/charade_export.tar.gz --schema dev
"""
import sys, os, json, argparse, tarfile, io, tempfile, shutil
from pathlib import Path
import psycopg2
import psycopg2.extras
# Connection string passed to psycopg2.connect(); overridable via DATABASE_URL.
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
# Default value of the --schema CLI option; overridable via MOMENTRY_DB_SCHEMA.
SCHEMA = os.environ.get("MOMENTRY_DB_SCHEMA", "dev")
# Directory that receives the package's output/ files; overridable via MOMENTRY_OUTPUT_DIR.
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
def get_conn():
    """Open and return a new psycopg2 connection to the database at DB_URL."""
    connection = psycopg2.connect(DB_URL)
    return connection
def json_loads(data: bytes):
    """Decode *data* with the default codec and parse the text as JSON."""
    text = data.decode()
    return json.loads(text)
def _read_package_json(tar, member_name):
    """Parse one JSON member of the open package archive.

    Raises KeyError (from tarfile) when the member is missing and ValueError
    when it exists but is not a regular file.
    """
    handle = tar.extractfile(member_name)
    if handle is None:
        raise ValueError(f"{member_name} is not a regular file in the package")
    with handle:
        return json_loads(handle.read())


def _execute_guarded(cur, query, params):
    """Run one statement inside a savepoint; return None, or the DB error.

    PostgreSQL aborts the whole transaction after a failed statement, so merely
    catching the exception would make every later statement fail as well.
    Rolling back to a savepoint keeps the surrounding transaction usable.
    """
    cur.execute("SAVEPOINT import_row")
    try:
        cur.execute(query, params)
    except psycopg2.Error as exc:
        cur.execute("ROLLBACK TO SAVEPOINT import_row")
        return exc
    cur.execute("RELEASE SAVEPOINT import_row")
    return None


def _bulk_insert(cur, query, param_rows, label, show_progress=False):
    """Best-effort insert of every parameter tuple; report failures once.

    Returns the number of statements that executed without error (rows skipped
    by ON CONFLICT DO NOTHING are still counted, as before).
    """
    total = len(param_rows)
    succeeded = 0
    failed = 0
    first_error = None
    for params in param_rows:
        error = _execute_guarded(cur, query, params)
        if error is not None:
            failed += 1
            if first_error is None:
                first_error = error
            continue
        succeeded += 1
        if show_progress and succeeded % 1000 == 0:
            print(f" ... {succeeded}/{total}", end="\r")
    if failed:
        print(f" ⚠️ {label}: {failed} rows failed (first error: {first_error})")
    return succeeded


def _merge_identities(cur, schema, identities):
    """Return {exported id: target id}, matching by name and creating missing identities."""
    identity_map = {}
    print(f"\n ── Identity Merge ──")
    for ident in identities:
        old_id = ident["id"]
        name = ident.get("name", "")
        # Match on name.
        cur.execute(
            f"SELECT id FROM {schema}.identities WHERE name = %s",
            (name,),
        )
        row = cur.fetchone()
        if row:
            # Already present in the target system: merge into it.
            identity_map[old_id] = row[0]
            print(f" 🔗 '{name}' → 已存在 (id={row[0]}), 合併")
        else:
            # Not present: create it.
            cur.execute(
                f"INSERT INTO {schema}.identities (name) VALUES (%s) RETURNING id",
                (name,),
            )
            new_id = cur.fetchone()[0]
            identity_map[old_id] = new_id
            print(f"'{name}' → 新增 (id={new_id})")
    return identity_map


def _import_identity_bindings(cur, schema, bindings, identity_map):
    """Insert identity bindings after remapping identity_id through identity_map."""
    query = (
        f"INSERT INTO {schema}.identity_bindings "
        f"(identity_id, identity_type, identity_value, metadata, confidence) "
        f"VALUES (%s, %s, %s, %s, %s) ON CONFLICT DO NOTHING"
    )
    for b in bindings:
        b["identity_id"] = identity_map.get(b["identity_id"], b["identity_id"])
        try:
            params = (
                b["identity_id"], b["identity_type"], b["identity_value"],
                json.dumps(b.get("metadata", {})), b.get("confidence", 1.0),
            )
        except (KeyError, TypeError, ValueError) as e:
            print(f" ⚠️ binding 匯入失敗: {e}")
            continue
        error = _execute_guarded(cur, query, params)
        if error is not None:
            print(f" ⚠️ binding 匯入失敗: {error}")


def _upsert_video(cur, schema, uuid, video_data):
    """Insert the videos row, or refresh path/name/status when file_uuid already exists."""
    cur.execute(
        f"""
INSERT INTO {schema}.videos
(file_uuid, file_path, file_name, file_type, duration, width, height,
fps, total_frames, probe_json, status)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, 'completed')
ON CONFLICT (file_uuid) DO UPDATE SET
file_path = EXCLUDED.file_path,
file_name = EXCLUDED.file_name,
status = 'completed'
""",
        (
            uuid,
            video_data.get("file_path", ""),
            video_data.get("file_name", ""),
            video_data.get("file_type", "video"),
            video_data.get("duration"),
            video_data.get("width"),
            video_data.get("height"),
            float(video_data.get("fps") or 0),
            video_data.get("total_frames"),
            json.dumps(video_data.get("probe_json", {})),
        ),
    )


def _extract_output_files(tar, members):
    """Copy every regular file under output/ in the archive into OUTPUT_DIR.

    Member names come from the package, so each destination is checked to stay
    inside OUTPUT_DIR; entries such as output/../../x or absolute paths are skipped.
    """
    output_dir = Path(OUTPUT_DIR)
    output_root = output_dir.resolve()
    prefix = "output/"
    for member in members:
        if not (member.name.startswith(prefix) and member.isfile()):
            continue
        # Strip only the leading prefix; str.replace would also remove any later
        # "output/" path component.
        fname = member.name[len(prefix):]
        dst = output_dir / fname
        if output_root not in dst.resolve().parents:
            print(f" [IMPORT] skipped unsafe path: {member.name}")
            continue
        dst.parent.mkdir(parents=True, exist_ok=True)
        with tar.extractfile(member) as src_f, open(dst, "wb") as dst_f:
            shutil.copyfileobj(src_f, dst_f)
        print(f" [IMPORT] output/{fname} ({member.size // 1024}KB)")


def _import_pre_chunks(conn, cur, schema, uuid, pre_chunks):
    """Insert pre_chunks rows linked to the videos row of *uuid* and commit."""
    # file_id is the id column of the videos table.
    cur.execute(f"SELECT id FROM {schema}.videos WHERE file_uuid = %s", (uuid,))
    file_row = cur.fetchone()
    if not file_row:
        print(f" [IMPORT] pre_chunks: 無法取得 file_id")
        return
    file_id = file_row[0]
    query = (
        f"INSERT INTO {schema}.pre_chunks "
        f"(file_id, file_uuid, processor_type, coordinate_type, "
        f"coordinate_index, start_frame, end_frame, start_time, end_time, "
        f"fps, data) "
        f"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
        f"ON CONFLICT DO NOTHING"
    )
    param_rows = [
        (
            file_id, uuid,
            pc.get("processor_type"), pc.get("coordinate_type"),
            pc.get("coordinate_index"),
            pc.get("start_frame"), pc.get("end_frame"),
            pc.get("start_time"), pc.get("end_time"),
            pc.get("fps"), json.dumps(pc.get("data", {})),
        )
        for pc in pre_chunks
    ]
    inserted = _bulk_insert(cur, query, param_rows, "pre_chunks", show_progress=True)
    conn.commit()
    print(f" [IMPORT] pre_chunks: {inserted} rows \n")


def _import_face_detections(conn, cur, schema, uuid, fds, identity_map):
    """Insert face_detections rows (identity ids remapped) and commit."""
    query = (
        f"INSERT INTO {schema}.face_detections "
        f"(file_uuid, face_id, frame_number, x, y, width, height, "
        f"confidence, identity_id, trace_id) "
        f"VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) "
        f"ON CONFLICT DO NOTHING"
    )
    param_rows = [
        (
            uuid,
            fd.get("face_id"),
            fd.get("frame_number"),
            fd.get("x"), fd.get("y"),
            fd.get("width"), fd.get("height"),
            fd.get("confidence"),
            identity_map.get(fd.get("identity_id"), fd.get("identity_id")),
            fd.get("trace_id"),
        )
        for fd in fds
    ]
    inserted = _bulk_insert(cur, query, param_rows, "face_detections", show_progress=True)
    conn.commit()
    print(f" [IMPORT] face_detections: {inserted} rows \n")


def import_package(package_path: str, schema: str):
    """Import an exported file-history package (.tar.gz) into the target schema.

    Steps: identity merge, identity bindings, videos row, output files,
    pre_chunks, processor_results and face_detections. Each step is committed
    on its own, and row-level insert failures are reported instead of aborting.

    Args:
        package_path: path of the .tar.gz package.
        schema: target database schema name; it is interpolated into SQL and
            must therefore be a plain identifier.

    Raises:
        ValueError: if *schema* is not a plain identifier.
        KeyError: if a required archive member (manifest.json, data/video.json)
            or the manifest's file_uuid is missing.
    """
    if not schema.isidentifier():
        raise ValueError(f"Invalid schema name: {schema!r}")
    print(f"[IMPORT] Opening {package_path}...")
    with tarfile.open(package_path, "r:gz") as tar:
        # Scan the archive once instead of rebuilding the name list per step.
        members = tar.getmembers()
        member_names = {m.name for m in members}

        # Read the manifest.
        manifest = _read_package_json(tar, "manifest.json")
        uuid = manifest["file_uuid"]
        print(f"[IMPORT] File: {manifest.get('file_name','?')} ({uuid})")
        print(f"[IMPORT] Exported at: {manifest.get('exported_at','?')}")
        print(f"[IMPORT] Completeness: {manifest.get('completeness',{})}")
        print(f"[IMPORT] Merge policy: {manifest.get('merge_policy',{})}")

        conn = get_conn()
        try:
            cur = conn.cursor()

            # Step 1: check whether the target system already has this file_uuid.
            cur.execute(
                f"SELECT file_uuid FROM {schema}.videos WHERE file_uuid = %s",
                (uuid,),
            )
            existing = cur.fetchone()
            if existing:
                print(f" ⚠️ UUID {uuid} 已存在於目標系統")
                # TODO: support overwrite or skip

            # Step 2: import identities (identity merge has to happen first).
            identity_map = {}  # old_id -> new_id
            if "data/identities.json" in member_names:
                identities = _read_package_json(tar, "data/identities.json")
                identity_map = _merge_identities(cur, schema, identities)
                conn.commit()
                print(f" ────────────────")
            else:
                print(f" [IMPORT] identities: (package 無 identity 資料)")

            # Step 3: import identity_bindings (if present).
            if "data/identity_bindings.json" in member_names:
                bindings = _read_package_json(tar, "data/identity_bindings.json")
                _import_identity_bindings(cur, schema, bindings, identity_map)
                conn.commit()
                print(f" [IMPORT] identity_bindings: {len(bindings)} rows")

            # Step 4: import the videos row.
            video_data = _read_package_json(tar, "data/video.json")
            _upsert_video(cur, schema, uuid, video_data)
            conn.commit()
            print(f" [IMPORT] videos: ✅")

            # Step 5: import output JSON files.
            _extract_output_files(tar, members)
            print(f" [IMPORT] output files: 完成")

            # Step 6: import pre_chunks.
            if "data/pre_chunks.json" in member_names:
                pre_chunks = _read_package_json(tar, "data/pre_chunks.json")
                _import_pre_chunks(conn, cur, schema, uuid, pre_chunks)

            # Step 7: import processor_results.
            if "data/processor_results.json" in member_names:
                results = _read_package_json(tar, "data/processor_results.json")
                query = (
                    f"INSERT INTO {schema}.processor_results "
                    f"(job_id, file_uuid, processor, status, chunks_produced, frames_processed) "
                    f"VALUES (0, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING"
                )
                param_rows = [
                    (uuid, r.get("processor"), r.get("status"),
                     r.get("chunks_produced", 0), r.get("frames_processed", 0))
                    for r in results
                ]
                _bulk_insert(cur, query, param_rows, "processor_results")
                conn.commit()
                print(f" [IMPORT] processor_results: {len(results)} rows")

            # Step 8: import face_detections; the first candidate member present is used.
            face_detections_src = None
            for candidate in ["data/face_detections.json", "data/face_detections_meta.json"]:
                if candidate in member_names:
                    face_detections_src = candidate
                    break
            if face_detections_src:
                fds = _read_package_json(tar, face_detections_src)
                _import_face_detections(conn, cur, schema, uuid, fds, identity_map)

            cur.close()
        finally:
            # Close the connection even when a step raises.
            conn.close()
    print(f"\n[IMPORT] ✅ 完成: {manifest.get('file_name','?')} 已匯入 (file_uuid={uuid})")
def main():
    """CLI entry point: parse arguments and import the given package.

    Exits with status 1 when the package path does not exist.
    """
    cli = argparse.ArgumentParser(description="Import file processing history package")
    cli.add_argument("package", help="Path to .tar.gz package")
    cli.add_argument("--schema", default=SCHEMA, help="Target DB schema")
    opts = cli.parse_args()

    package = opts.package
    if os.path.exists(package):
        import_package(package, opts.schema)
        return
    print(f"[IMPORT] ❌ Package not found: {package}")
    sys.exit(1)


if __name__ == "__main__":
    main()

138
scripts/lip_analyzer.py Normal file
View File

@@ -0,0 +1,138 @@
#!/opt/homebrew/bin/python3.11
"""
Lip Analyzer — from face_test.json (Apple Vision outer_lips 14pts) + ASRX
Computes lip_openness per frame, compares with speaker segments.
"""
import json, sys, os
from pathlib import Path
from collections import defaultdict
def calc_lip_height(face):
    """Return max(y) - min(y) over a face's lip points, or None.

    face["lips"] may be a dict holding the points under "outer_lips" or a list
    of points. Each point is either a sequence (y at index 1) or a mapping
    (y under key "y", defaulting to 0). None is returned for any other "lips"
    type and when fewer than three points are available.
    """
    raw = face.get("lips", {})
    if not isinstance(raw, (dict, list)):
        return None
    points = raw.get("outer_lips", []) if isinstance(raw, dict) else raw
    if not points or len(points) < 3:
        return None

    def y_of(point):
        if isinstance(point, (list, tuple)):
            return point[1]
        return point.get("y", 0)

    y_values = list(map(y_of, points))
    return max(y_values) - min(y_values)
def main():
    """Compare lip openness with ASRX speaker segments and write a JSON report.

    CLI options:
        --face       face-landmark JSON: either {"frames": [...]} or a bare list of frames
        --asrx       ASRX JSON with a "segments" list
        --output     path of the JSON report to write
        --threshold  minimum relative lip-height change counted as speaking (default 0.05)
        --fps        frame rate used to map segment times to frame numbers (default 25.0)
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--face", required=True)
    parser.add_argument("--asrx", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--threshold", type=float, default=0.05)
    parser.add_argument("--fps", type=float, default=25.0,
                        help="Video frame rate used to convert times to frames")
    args = parser.parse_args()

    # Load face data. Both layouts are accepted; calling .get() on a bare list
    # (as the previous code did) raised AttributeError.
    with open(args.face) as f:
        face_data = json.load(f)
    if isinstance(face_data, dict):
        frames_data = face_data.get("frames", [])
        frame_count = face_data.get("frame_count", "?")
    else:
        frames_data = face_data if isinstance(face_data, list) else []
        frame_count = "?"
    print(f"\nFace data: {len(frames_data)} frames, {frame_count} total")

    # Extract lip openness per frame, per face
    lip_by_frame = {}
    for fdata in frames_data:
        if not isinstance(fdata, dict):
            # Entries without frame/faces fields carry no lip data.
            continue
        fn = fdata.get("frame", 0)
        faces = fdata.get("faces", fdata.get("detections", []))
        heights = []
        for face in faces:
            h = calc_lip_height(face)
            if h is not None:
                heights.append(h)
        if heights:
            lip_by_frame[fn] = {"heights": heights, "avg": sum(heights)/len(heights), "count": len(heights)}
    print(f"Frames with lip data: {len(lip_by_frame)}")

    # Load ASRX speaker segments
    with open(args.asrx) as f:
        asrx = json.load(f)
    segs = asrx.get("segments", [])
    fps = args.fps
    print(f"ASRX segments: {len(segs)}")

    # Analyze each ASR segment
    results = []
    speakable = 0
    for seg in segs:
        st = seg.get("start_time", 0)
        et = seg.get("end_time", 0)
        speaker = seg.get("speaker_id", "?")
        text = seg.get("text", "") or ""
        # Frames in this segment's window
        start_frame = int(st * fps)
        end_frame = int(et * fps) + 10  # allow some after
        # Baseline sample.
        # NOTE(review): the window is +/-10 frames around the segment start, so it
        # overlaps the first frames of the "during" sample — confirm this is intended.
        baseline_frames = [fn for fn in lip_by_frame if abs(fn - start_frame) <= 10]
        # Sample from segment start to segment end (plus the 10-frame tail)
        during_frames = [fn for fn in lip_by_frame if fn >= start_frame and fn <= end_frame]
        # max(..., 1) guards the division when no frame falls inside a window.
        baseline_avg = sum(lip_by_frame[fn]["avg"] for fn in baseline_frames) / max(len(baseline_frames), 1)
        during_avg = sum(lip_by_frame[fn]["avg"] for fn in during_frames) / max(len(during_frames), 1)
        # Number of frames in the window that have lip data
        any_face = len(during_frames)
        motion = (during_avg - baseline_avg) / max(baseline_avg, 1)
        is_speaking = motion > args.threshold
        r = {
            "start_time": st, "end_time": et, "speaker": speaker,
            "text": text[:40],
            "baseline_avg": round(baseline_avg, 2),
            "during_avg": round(during_avg, 2),
            "motion_ratio": round(motion, 4),
            "is_speaking": is_speaking,
            "baseline_frames": len(baseline_frames),
            "during_frames": any_face,
        }
        results.append(r)
        if any_face > 0:
            speakable += 1

    # Summary
    print(f"\n=== Results ===")
    print(f"ASRX segments analyzed: {len(results)}")
    print(f"With face data: {speakable} ({speakable*100//max(len(results),1)}%)")
    speech_detected = sum(1 for r in results if r["is_speaking"] and r["during_frames"] > 0)
    print(f"Lip motion detected: {speech_detected} ({speech_detected*100//max(speakable,1)}% of face-present)")
    print(f"\n=== Sample: first 5 segments ===")
    for r in results[:5]:
        icon = "🗣" if r["is_speaking"] else "🤐"
        print(f" {icon} {r['start_time']:.0f}s {r['speaker']:12s} motion={r['motion_ratio']:.3f} baseline={r['baseline_avg']:.1f} during={r['during_avg']:.1f} faces={r['during_frames']}")

    # Save
    output = {
        "fps": fps,
        "total_asrx_segments": len(results),
        "segments_with_faces": speakable,
        "segments_with_lip_motion": speech_detected,
        "lip_by_frame_count": len(lip_by_frame),
        "results": results,
    }
    with open(args.output, "w") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\nSaved: {args.output}")


if __name__ == "__main__":
    main()

137
scripts/map_speakers_v2.py Normal file
View File

@@ -0,0 +1,137 @@
#!/opt/homebrew/bin/python3.11
"""
Build new ASRX speaker_id → character name mapping using:
1. Old DB sentence chunk metadata (speaker_name from face-to-TMDb match)
2. New ASRX segments (1:1 aligned with ASR, each with speaker_id + voice embedding)
"""
import json, sys, psycopg2
from collections import Counter, defaultdict
import numpy as np
from urllib.request import Request, urlopen
# file_uuid of the movie whose speakers are being mapped.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
ASRX_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.asrx.json"
QDRANT_URL = "http://localhost:6333"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"

# Character name normalization: placeholder diarization labels
# ("Speaker_0", "SPEAKER_0" ... "SPEAKER_9") all map to "Unknown".
NAME_MAP = dict.fromkeys(
    ["Speaker_0", *(f"SPEAKER_{n}" for n in range(10))],
    "Unknown",
)
print("=== Step 1: Load DB sentence chunks ===")
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()
    cur.execute("""
SELECT chunk_index, metadata->>'speaker_id' as old_sid,
metadata->>'speaker_name' as old_name
FROM dev.chunks
WHERE file_uuid = %s AND chunk_type = 'sentence'
ORDER BY chunk_index
""", (UUID,))
    rows = cur.fetchall()
    cur.close()
finally:
    # Release the connection even when the query fails.
    conn.close()
print(f"Loaded {len(rows)} sentence chunks from DB")

# Lookup keyed by chunk_index: {"old_sid": ..., "old_name": ...}
db_by_idx = {
    chunk_index: {"old_sid": old_sid, "old_name": old_name}
    for chunk_index, old_sid, old_name in rows
}

print("=== Step 2: Load new ASRX ===")
# Context manager so the file handle is closed after parsing.
with open(ASRX_PATH) as asrx_file:
    asrx = json.load(asrx_file)
segs = asrx["segments"]
embeddings = asrx.get("embeddings", [])
print(f"Loaded {len(segs)} ASRX segments, {len(embeddings)} embeddings")
# Build mapping: new_speaker_id --> old_name distribution.
# Segment i is paired with the DB sentence chunk whose chunk_index is i.
new_to_old = defaultdict(list)
old_name_counter = defaultdict(Counter)
unmapped = 0
total = 0
for i, seg in enumerate(segs):
    new_sid = seg["speaker_id"]
    total += 1
    if i in db_by_idx:
        # A NULL or empty speaker_name carries no information; treat it as
        # "Unknown" so it cannot become a None / "" label in the output.
        old_name = db_by_idx[i].get("old_name") or "Unknown"
        # Collapse placeholder diarization labels into "Unknown".
        if old_name in NAME_MAP:
            old_name = NAME_MAP[old_name]
        elif old_name.startswith(("Speaker_", "SPEAKER_")):
            old_name = "Unknown"
        new_to_old[new_sid].append(old_name)
        old_name_counter[new_sid][old_name] += 1
    else:
        unmapped += 1
        new_to_old[new_sid].append("Unknown")
print(f"\nMapped {total - unmapped} segments, {unmapped} unmapped")
print(f"\nMapping {len(new_to_old)} new speaker IDs:")

# Determine best character name for each new speaker (majority vote).
speaker_identity = {}
for sid in sorted(new_to_old.keys()):
    counter = old_name_counter[sid]
    total_for_speaker = sum(counter.values())
    if counter:
        best_name, best_count = counter.most_common(1)[0]
        pct = best_count / total_for_speaker * 100
    else:
        # Every segment of this speaker was unmapped: there is nothing to vote
        # on (previously IndexError / ZeroDivisionError).
        best_name, pct = "Unknown", 0.0
    speaker_identity[sid] = {
        "name": best_name,
        "confidence": round(pct, 1),
        "count": total_for_speaker,
        "distribution": dict(counter.most_common(5))
    }
    print(f" {sid}: {best_name} ({pct:.0f}%, {total_for_speaker} segs)")
    for nm, cnt in counter.most_common(5):
        if nm != best_name:
            print(f" {nm}: {cnt}")
print("\n=== Step 3: Assign names to all new ASRX segments ===")
# One record per segment, carrying the name chosen for its speaker id.
assignments = [
    {
        "index": seg_no,
        "speaker_id": segment["speaker_id"],
        "speaker_name": speaker_identity[segment["speaker_id"]]["name"],
        "start_time": segment["start_time"],
        "end_time": segment["end_time"],
    }
    for seg_no, segment in enumerate(segs)
]

# Save mapping
output = {
    "uuid": UUID,
    "total_segments": len(segs),
    "speaker_identity": speaker_identity,
    "assignments": assignments,
}
with open(f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map_v2.json", "w") as f:
    json.dump(output, f, indent=2)
print(f"\nSaved speaker mapping to output_dev/{UUID}.speaker_map_v2.json")

print("\n=== Summary ===")
for sid in sorted(speaker_identity):
    info = speaker_identity[sid]
    print(f" {sid} ({info['count']} segs, {info['confidence']}% confidence): {info['name']}")

185
scripts/migrate_to_4188.py Normal file
View File

@@ -0,0 +1,185 @@
#!/opt/homebrew/bin/python3.11
"""
Full pipeline migration: delete old chunks, create 4188 fine-grained chunks
with yolo_objects, face_ids, metadata per (recalculated) frame range.
"""
import json, sys, time, psycopg2
from collections import defaultdict
# file_uuid whose chunks are rebuilt.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
# Frame rate used to convert segment times to frame numbers.
FPS = 25.0
# Written to the file_id column of every inserted chunk.
# NOTE(review): hard-coded; presumably the videos-table id of UUID — confirm.
FILE_ID = 242

print("=== Load asrx_fine ===")
# Context manager so the file handle is closed after parsing.
with open(f"{BASE}/{UUID}.asrx.json") as asrx_file:
    fine = json.load(asrx_file)
segs = fine["segments"]
print(f"Segments: {len(segs)}")

conn = psycopg2.connect(DB_URL)
cur = conn.cursor()

# Step 2: Delete old chunks
print("\n=== Step 2: Delete old chunks ===")
for ctype in ['sentence', 'story', 'trace']:
    cur.execute(
        "DELETE FROM dev.chunks WHERE file_uuid=%s AND chunk_type=%s",
        (UUID, ctype))
    print(f" Deleted {cur.rowcount} {ctype} chunks")
# NOTE(review): the delete is committed before any new chunk is inserted, so a
# later failure leaves the file without these chunk types.
conn.commit()
# Step 3: Build frame → data lookup for YOLO and faces
print("\n=== Step 3: Load yolo + face data ===")

# YOLO: start_frame → set of class names whose confidence exceeds 0.5
print(" Loading YOLO data...")
t0 = time.time()
cur.execute(
    "SELECT start_frame, data FROM dev.pre_chunks "
    "WHERE file_uuid=%s AND processor_type='yolo' "
    "ORDER BY start_frame", (UUID,))
yolo_by_frame = {}  # frame → set of class names
row_count = 0
for frame_no, payload in cur:
    row_count += 1
    if not payload or "objects" not in payload:
        continue
    confident_names = {
        detected.get("class_name", "")
        for detected in payload["objects"]
        if detected.get("confidence", 0) > 0.5
    }
    # Frames without any confident object get no entry.
    if confident_names:
        yolo_by_frame[frame_no] = confident_names
print(f" YOLO: {row_count} entries, {len(yolo_by_frame)} frames with objects ({time.time()-t0:.1f}s)")

# Face: frame_number → set of face_ids (rows with a trace_id only)
print(" Loading face data...")
t0 = time.time()
cur.execute(
    "SELECT frame_number, face_id FROM dev.face_detections "
    "WHERE file_uuid=%s AND trace_id IS NOT NULL "
    "ORDER BY frame_number", (UUID,))
face_by_frame = defaultdict(set)  # frame → set of face_ids
row_count = 0
for frame_no, face_id in cur:
    row_count += 1
    if face_id:
        face_by_frame[frame_no].add(face_id)
print(f" Faces: {row_count} entries, {len(face_by_frame)} frames ({time.time()-t0:.1f}s)")
# Step 4: Create new chunks
import bisect

print("\n=== Step 4: Create 4188 sentence chunks ===")
t0 = time.time()
batch_size = 100
inserted = 0
yolo_hit = 0
face_hit = 0
yolo_frames_sorted = sorted(yolo_by_frame.keys())
face_frames_sorted = sorted(face_by_frame.keys())


def _values_in_frame_range(sorted_frames, by_frame, first_frame, last_frame):
    """Return the de-duplicated values of all frames within [first_frame, last_frame]."""
    lo = bisect.bisect_left(sorted_frames, first_frame)
    hi = bisect.bisect_right(sorted_frames, last_frame)
    collected = []
    for frame_no in sorted_frames[lo:hi]:
        collected.extend(by_frame[frame_no])
    return list(set(collected))


insert_sql = """
INSERT INTO dev.chunks
(file_uuid, old_chunk_id, chunk_index, chunk_type,
start_time, end_time, content, metadata,
text_content, fps, start_frame, end_frame, frame_count,
file_id, chunk_id, pre_chunk_ids, child_chunk_ids)
VALUES (%s,%s,%s,%s,%s,%s,%s::jsonb,%s::jsonb,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""

for batch_start in range(0, len(segs), batch_size):
    batch = segs[batch_start:batch_start + batch_size]
    values = []
    for idx, s in enumerate(batch, start=batch_start):
        st = s["start_time"]
        et = s["end_time"]
        sf = int(st * FPS)
        ef = int(et * FPS)
        spk_name = s.get("speaker_name", "Unknown")
        spk_id = s.get("speaker_id", "SPEAKER_?")
        raw_text = s.get("text", "")

        # Object classes and face ids seen inside the segment's frame range
        # (binary search over the sorted frame lists).
        yolo_objs = _values_in_frame_range(yolo_frames_sorted, yolo_by_frame, sf, ef)
        if yolo_objs:
            yolo_hit += 1
        face_ids = _values_in_frame_range(face_frames_sorted, face_by_frame, sf, ef)
        if face_ids:
            face_hit += 1

        chunk_id = f"{UUID}_{idx}"
        content_json = json.dumps({"data": {"text": raw_text, "text_normalized": raw_text.lower()}, "rule": "rule_1"})
        metadata_json = json.dumps({
            "speaker_id": spk_id,
            "speaker_name": spk_name,
            "yolo_objects": yolo_objs,
            "face_ids": face_ids,
            "language": "en",
        })
        values.append((
            UUID,                         # file_uuid
            chunk_id,                     # old_chunk_id
            idx,                          # chunk_index
            "sentence",                   # chunk_type
            st,                           # start_time
            et,                           # end_time
            content_json,                 # content
            metadata_json,                # metadata
            f"[{spk_name}] {raw_text}",   # text_content
            FPS,                          # fps
            sf,                           # start_frame
            ef,                           # end_frame
            ef - sf,                      # frame_count
            FILE_ID,                      # file_id
            chunk_id,                     # chunk_id
            [],                           # pre_chunk_ids
            [],                           # child_chunk_ids
        ))
    # One executemany + commit per batch of segments.
    cur.executemany(insert_sql, values)
    conn.commit()
    inserted += len(batch)
    # Progress line on every fifth batch.
    if (batch_start // batch_size) % 5 == 0:
        pct = inserted * 100 // len(segs)
        print(f" {inserted}/{len(segs)} ({pct}%) yolo_hit={yolo_hit} face_hit={face_hit} [{time.time()-t0:.0f}s]")

print(f"\n Inserted: {inserted} chunks")
print(f" Chunks with YOLO objects: {yolo_hit}/{inserted}")
print(f" Chunks with face IDs: {face_hit}/{inserted}")
print(f" Time: {time.time()-t0:.1f}s")
# Verify: count the sentence chunks now stored for this file.
sentence_count_sql = "SELECT COUNT(*) FROM dev.chunks WHERE file_uuid=%s AND chunk_type='sentence'"
cur.execute(sentence_count_sql, (UUID,))
(cnt,) = cur.fetchone()
print(f"\n DB sentence chunks: {cnt}")

# Sentence chunks per speaker name, most frequent first.
cur.execute(
    "SELECT metadata->>'speaker_name', COUNT(*) FROM dev.chunks "
    "WHERE file_uuid=%s AND chunk_type='sentence' "
    "GROUP BY 1 ORDER BY 2 DESC", (UUID,))
print(" Speaker distribution:")
for speaker_name, chunk_total in cur.fetchall():
    print(f" {speaker_name}: {chunk_total}")
conn.close()
print("\n=== Done ===")

View File

@@ -0,0 +1,324 @@
#!/opt/homebrew/bin/python3.11
"""
Object Search Agent — searches across YOLO, OCR, ASR, and TKG.
Usage: python3 scripts/object_search_agent.py --keyword stamp [--uuid <UUID>]
"""
import json, sys, argparse
from collections import defaultdict
import psycopg2
# Default file_uuid searched when --uuid is not given.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# Connection string passed to psycopg2.connect().
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
# Frame rate used to convert pre_chunk frame numbers into seconds.
FPS = 25.0
# YOLO class aliases for common search terms.
# Maps a search keyword to the detector class names that are looked up for it;
# a keyword without an entry is used as the class name itself by search_yolo.
ALIASES = {
"stamp": ["stamp"],
"gun": ["knife", "pistol", "rifle", "grenade"],
"weapon": ["knife", "pistol", "rifle", "grenade"],
"knife": ["knife"],
"person": ["person"],
"letter": ["book"],
"envelope": ["book"],
"car": ["car"],
"tie": ["tie"],
"phone": ["cell phone"],
"bottle": ["bottle", "wine glass", "cup"],
"chair": ["chair"],
"umbrella": ["umbrella"],
}
def search_yolo(cur, keyword, uuid):
    """Search YOLO detections for matching object classes.

    Args:
        cur: open DB cursor.
        keyword: search term; expanded through ALIASES, otherwise used as the
            class name itself.
        uuid: file_uuid whose pre_chunks are searched.

    Returns:
        A list of dicts (frame, timestamp, time_str, class, confidence, source)
        for rows whose best confidence for the class exceeds 0.3. At most 100
        rows are read per class.
    """
    classes = ALIASES.get(keyword, [keyword])
    results = []
    for cls in classes:
        cur.execute("""
SELECT start_frame, end_frame, data
FROM dev.pre_chunks
WHERE file_uuid=%s AND processor_type='yolo'
AND data->'objects' IS NOT NULL
AND data->'objects' @> jsonb_build_array(
jsonb_build_object('class_name', %s)
)
ORDER BY start_frame
LIMIT 100
""", (uuid, cls))
        for sf, _ef, data in cur.fetchall():
            objects = [o for o in data.get("objects", []) if o.get("class_name") == cls]
            top_conf = max((o.get("confidence", 0) for o in objects), default=0)
            if top_conf > 0.3:
                ts = sf / FPS
                results.append({
                    "frame": int(sf),
                    "timestamp": ts,
                    # Frame-within-second via an exact modulo on the frame number.
                    # int((ts % 1) * 25) truncated float round-off (frame 83 gave
                    # 07 instead of 08) and duplicated FPS as a magic number.
                    "time_str": f"{int(ts//60)}:{int(ts%60):02d}.{int(sf % FPS):02d}",
                    "class": cls,
                    "confidence": round(top_conf, 3),
                    "source": "yolo",
                })
    return results
def search_ocr(cur, keyword, uuid):
    """Search OCR text for keyword.

    Args:
        cur: open DB cursor.
        keyword: substring matched case-insensitively (ILIKE) against the OCR text.
        uuid: file_uuid whose pre_chunks are searched.

    Returns:
        Up to 50 dicts (frame, timestamp, time_str, text, source), ordered by
        start frame. The text is truncated to 100 characters.
    """
    cur.execute("""
SELECT start_frame, end_frame, data
FROM dev.pre_chunks
WHERE file_uuid=%s AND processor_type='ocr'
AND data->>'text' ILIKE %s
ORDER BY start_frame
LIMIT 50
""", (uuid, f"%{keyword}%"))
    results = []
    for sf, _ef, data in cur.fetchall():
        ts = sf / FPS
        results.append({
            "frame": sf,
            "timestamp": ts,
            # Same minutes:seconds.frame layout as search_yolo. The previous
            # format applied the integer code 'd' to a float (sf // FPS % 60)
            # and raised ValueError for every OCR hit.
            "time_str": f"{int(ts//60)}:{int(ts%60):02d}.{int(sf % FPS):02d}",
            "text": data.get("text", "")[:100],
            "source": "ocr",
        })
    return results
def search_asr(cur, keyword, uuid):
    """Search ASR/sentence text for keyword.

    Runs a case-insensitive substring match over the sentence chunks of *uuid*
    and returns up to 100 dicts (chunk_index, timestamp, time_str, text,
    source) ordered by start time. The text is truncated to 120 characters.
    """
    like_pattern = f"%{keyword}%"
    cur.execute("""
SELECT chunk_index, start_time, end_time, text_content
FROM dev.chunks
WHERE file_uuid=%s AND chunk_type='sentence'
AND text_content ILIKE %s
ORDER BY start_time
LIMIT 100
""", (uuid, like_pattern))
    return [
        {
            "chunk_index": chunk_index,
            "timestamp": start,
            "time_str": f"{int(start//60)}:{start%60:05.2f}",
            "text": (sentence or "")[:120],
            "source": "asr",
        }
        for chunk_index, start, _end, sentence in cur.fetchall()
    ]
# Weights file loaded by search_gun_detector.
GUN_MODEL_PATH = "/Users/accusys/momentry_core_0.1/models/gun/gun_detector/weights/best.pt"
# Class id -> class name for the custom gun detector.
GUN_CLASSES = {0: "grenade", 1: "knife", 2: "pistol", 3: "rifle"}
# Grounding DINO — Zero-shot gun detector (Large: 7 datasets, confirmed best on Charade)
GDINO_MODEL_NAME = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
# Text prompts tried one by one when the keyword is a gun/weapon term.
GDINO_PROMPTS = ["gun", "pistol", "rifle", "weapon", "firearm"]
# Lazily initialised by init_gdino(); None until the first call.
_gdino_processor = None
_gdino_model = None
_gdino_device = None
def init_gdino():
    """Load the Grounding DINO processor and model once into the module globals.

    Later calls do nothing once the model is set. The model is moved to "mps"
    when torch reports it as available, otherwise to "cpu".
    """
    global _gdino_processor, _gdino_model, _gdino_device
    if _gdino_model is None:
        # Heavy dependencies are imported only when the detector is first needed.
        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
        import torch
        _gdino_processor = AutoProcessor.from_pretrained(GDINO_MODEL_NAME)
        _gdino_model = AutoModelForZeroShotObjectDetection.from_pretrained(GDINO_MODEL_NAME)
        if torch.backends.mps.is_available():
            _gdino_device = "mps"
        else:
            _gdino_device = "cpu"
        _gdino_model.to(_gdino_device)
def search_zero_shot(video_path, keyword, threshold=0.05):
    """Search for objects using Grounding DINO zero-shot detection.

    Samples one frame every 1500 frames, runs each text prompt on it and
    records every returned box. Sampling stops once 50 or more results have
    been collected (checked after each sampled frame).

    Args:
        video_path: path of the video to sample.
        keyword: search term; gun/weapon terms expand to GDINO_PROMPTS,
            anything else is used as the single prompt.
        threshold: score threshold passed to the post-processing step.

    Returns:
        A list of dicts (frame, timestamp, time_str, class, confidence, source).
    """
    import cv2
    from PIL import Image
    import torch

    # Determine prompts based on keyword
    gun_terms = ("gun", "weapon", "pistol", "rifle", "firearm")
    prompts = GDINO_PROMPTS if keyword in gun_terms else [keyword]
    init_gdino()

    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    results = []
    for frame_num in range(0, total_frames, 1500):  # every ~60s
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
        grabbed, frame = cap.read()
        if not grabbed:
            break
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        for prompt in prompts:
            inputs = _gdino_processor(images=img, text=prompt, return_tensors="pt").to(_gdino_device)
            with torch.no_grad():
                outputs = _gdino_model(**inputs)
            # PIL reports (width, height); the reversed pair is passed as target size.
            target = torch.tensor([img.size[::-1]])
            dets = _gdino_processor.post_process_grounded_object_detection(
                outputs, threshold=threshold, target_sizes=target)[0]
            for box_no in range(len(dets["boxes"])):
                score = dets["scores"][box_no].item()
                ts = frame_num / fps
                results.append({
                    "frame": frame_num,
                    "timestamp": ts,
                    "time_str": f"{int(ts//60)}:{int(ts%60):02d}",
                    "class": prompt,
                    "confidence": round(score, 3),
                    "source": "grounding-dino",
                })
        if len(results) >= 50:
            break
    cap.release()
    return results
def search_gun_detector(video_path, keyword, frame_step=150, confidence=0.25):
    """Run custom gun detector model on keyframes.

    Args:
        video_path: path of the video to sample.
        keyword: search term; only the GUN_CLASSES listed for it in ALIASES
            are reported.
        frame_step: distance in frames between sampled frames.
        confidence: minimum detection confidence passed to the model.

    Returns:
        [] when the keyword maps to no gun class; a one-element list holding an
        {"error": ...} dict when ultralytics or cv2 cannot be imported;
        otherwise a list of detection dicts. Sampling stops once 50 or more
        results have been collected (checked after each sampled frame).
    """
    classes = ALIASES.get(keyword, [])
    target_ids = [cid for cid, cname in GUN_CLASSES.items() if cname in classes]
    if not target_ids:
        return []
    try:
        from ultralytics import YOLO
        import cv2
    except ImportError:
        return [{"error": "ultralytics or cv2 not available"}]
    model = YOLO(GUN_MODEL_PATH)
    cap = cv2.VideoCapture(video_path)
    results = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        for frame_num in range(0, total_frames, frame_step):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                break
            dets = model(frame, conf=confidence, verbose=False)[0]
            ts = frame_num / fps
            # Frame-within-second via an exact modulo; int((ts % 1) * fps)
            # truncated float round-off and came out one frame low for many frames.
            time_str = f"{int(ts//60)}:{int(ts%60):02d}.{int(frame_num % fps):02d}"
            for det in dets.boxes.data:
                cls_id = int(det[5])
                if cls_id in target_ids:
                    conf_val = float(det[4])
                    results.append({
                        "frame": frame_num,
                        "timestamp": ts,
                        "time_str": time_str,
                        "class": GUN_CLASSES[cls_id],
                        "confidence": round(conf_val, 3),
                        "source": "gun_detector",
                    })
            if len(results) >= 50:
                break
    finally:
        # Release the capture even when inference raises.
        cap.release()
    return results
def search_tkg(cur, keyword, uuid):
    """Search TKG for related entities.

    Matches *keyword* case-insensitively against the label and external_id of
    the tkg_nodes rows of *uuid* and returns up to 20 dicts
    (type, id, label, properties, source).
    """
    like_pattern = f"%{keyword}%"
    cur.execute("""
SELECT node_type, external_id, label, properties
FROM dev.tkg_nodes
WHERE file_uuid=%s
AND (label ILIKE %s OR external_id ILIKE %s)
LIMIT 20
""", (uuid, like_pattern, like_pattern))
    return [
        {
            "type": node_type,
            "id": external_id,
            "label": label,
            "properties": properties,
            "source": "tkg",
        }
        for node_type, external_id, label, properties in cur.fetchall()
    ]
def find_video(uuid):
    """Find Charade video file.

    Looks under the demo data directory first for a file whose name starts
    with "Charade", then for one whose name contains the first 8 characters of
    *uuid*. Only .mp4/.mov/.avi files count. Returns the first matching path,
    or None.
    """
    import glob
    base = "/Users/accusys/momentry/var/sftpgo/data/demo"
    video_suffixes = (".mp4", ".mov", ".avi")

    def first_video(pattern):
        candidates = glob.glob(pattern, recursive=True)
        return next((path for path in candidates if path.endswith(video_suffixes)), None)

    # Find Charade by name
    match = first_video(f"{base}/**/Charade*")
    if match is None:
        # Fallback: search by uuid pattern
        match = first_video(f"{base}/**/*{uuid[:8]}*")
    return match
def main():
    """CLI entry point: search one movie for an object across several sources.

    Queries the database-backed sources (yolo, ocr, asr, tkg) selected with
    ``--sources``, optionally runs the video-based searches, prints a short
    human-readable summary and finally dumps all results as JSON.
    """
    parser = argparse.ArgumentParser(description="Movie Object Search Agent")
    parser.add_argument("--keyword", required=True, help="Object to search for")
    parser.add_argument("--uuid", default=UUID)
    parser.add_argument("--sources", default="all", help="yolo,ocr,asr,tkg,gun_custom,all")
    parser.add_argument("--video", help="Path to video file (for gun detector)")
    args = parser.parse_args()
    kw = args.keyword.lower()
    if args.sources == "all":
        src = ["yolo", "ocr", "asr", "tkg"]
    else:
        # Tolerate spaces and stray commas, e.g. "yolo, ocr".
        src = [s.strip() for s in args.sources.split(",") if s.strip()]
    results = {}
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        if "yolo" in src:
            r = search_yolo(cur, kw, args.uuid)
            results["yolo"] = {"count": len(r), "results": r[:30]}
        if "ocr" in src:
            r = search_ocr(cur, kw, args.uuid)
            results["ocr"] = {"count": len(r), "results": r[:20]}
        if "asr" in src:
            r = search_asr(cur, kw, args.uuid)
            results["asr"] = {"count": len(r), "results": r[:20]}
        if "tkg" in src:
            r = search_tkg(cur, kw, args.uuid)
            results["tkg"] = {"count": len(r), "results": r[:10]}
        # Weapon keywords always trigger the zero-shot search.
        if "zero_shot" in src or kw in ("gun", "weapon", "pistol", "rifle", "firearm"):
            video_path = args.video or find_video(args.uuid)
            if video_path:
                print(" Running Grounding DINO zero-shot search...")
                r = search_zero_shot(video_path, kw)
                results["zero_shot"] = {"count": len(r), "results": r[:20]}
            else:
                results["zero_shot"] = {"count": 0, "results": [], "error": "Video not found"}
        # "gun_custom" is advertised in --sources; only run it on explicit request.
        if "gun_custom" in src:
            video_path = args.video or find_video(args.uuid)
            if not video_path:
                results["gun_custom"] = {"count": 0, "results": [], "error": "Video not found"}
            else:
                r = search_gun_detector(video_path, kw)
                if r and "error" in r[0]:
                    # The detector reports missing dependencies as an error entry.
                    results["gun_custom"] = {"count": 0, "results": [], "error": r[0]["error"]}
                else:
                    results["gun_custom"] = {"count": len(r), "results": r[:20]}
    finally:
        # Close the connection even when one of the searches raises.
        conn.close()
    # Print summary
    print(f"\n=== Object Search: \"{args.keyword}\" ===\n")
    for src_name, data in results.items():
        print(f"[{src_name.upper()}] {data['count']} matches" + (" — top results:" if data['results'] else ""))
        for i, r in enumerate(data['results'][:5]):
            if src_name == "yolo":
                print(f" {i+1}. {r['time_str']} frame={r['frame']} \"{r['class']}\" conf={r['confidence']}")
            elif src_name == "ocr":
                print(f" {i+1}. {r['time_str']} frame={r['frame']} \"{r['text'][:60]}\"")
            elif src_name == "asr":
                print(f" {i+1}. {r['time_str']} \"{r['text'][:60]}\"")
            elif src_name == "tkg":
                # properties may be NULL in the database; treat it as empty.
                props = r.get('properties') or {}
                print(f" {i+1}. {r['type']}: {r['label']} ({props.get('total_detections','?')} detections)")
        print()
    # Output as JSON for machine parsing
    print(json.dumps({"keyword": args.keyword, "sources": results}, indent=2))
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,121 @@
#!/opt/homebrew/bin/python3.11
"""
Full comparison: Grounding DINO Base vs PaliGemma 3B mix-224
Tests on 8 known timepoints with gun/stamp prompts.
"""
import json, os, sys, time, cv2, torch, re
from PIL import Image
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/paligemma_vs_gdino"
os.makedirs(OUTPUT_DIR, exist_ok=True)
TIMEPOINTS = [
(2646, "2646s"), (3188, "3188s"), (3697, "3697s"),
(5341, "5341s"), (5461, "5461s"), (6309, "6309s"),
(6377, "6377s"), (6479, "6479s"),
]
PROMPTS = ["gun", "pistol", "stamp", "envelope", "passport"]
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Device: {device}")
# Grab one frame per timepoint; timepoints that cannot be read are left out.
all_results = {}
frames = {}
cap = cv2.VideoCapture(VIDEO)
# Fall back to 25 fps when the container reports 0.
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
for t_sec, label in TIMEPOINTS:
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    ret, frame = cap.read()
    if not ret:
        continue
    frames[label] = frame
cap.release()
print(f"Loaded {len(frames)} frames")
# ===== Grounding DINO Base =====
# Runs every prompt against every loaded frame and stores the detection
# scores under the key "<timepoint label>_<prompt>".
print("\n" + "="*60)
print("Grounding DINO Base")
print("="*60)
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
t0 = time.time()
gd_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
gd_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base").to(device)
gd_dets = {}
for label, frame in frames.items():
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for pname in PROMPTS:
        inputs = gd_proc(images=img, text=f"{pname}.", return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = gd_model(**inputs)
        # PIL reports (width, height); reversed here to (height, width).
        target = torch.tensor([img.size[::-1]])
        dets = gd_proc.post_process_grounded_object_detection(outputs, threshold=0.1, target_sizes=target)[0]
        scores = [round(s.item(), 3) for s in dets["scores"]] if len(dets["boxes"]) > 0 else []
        gd_dets[f"{label}_{pname}"] = scores
all_results["grounding-dino-base"] = {"elapsed": round(time.time()-t0, 1), "detections": gd_dets}
print(f" Done: {all_results['grounding-dino-base']['elapsed']}s")
del gd_model
# The script falls back to CPU when MPS is unavailable; only touch the MPS
# allocator when it is the device in use.
if device == "mps":
    torch.mps.empty_cache()
# ===== PaliGemma 3B mix-224 =====
# Runs "detect <prompt>" on every loaded frame. PaliGemma gives no scores, so
# each detection is recorded with a placeholder score of 1.0.
print("\n" + "="*60)
print("PaliGemma 3B mix-224")
print("="*60)
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
t0 = time.time()
pg_proc = AutoProcessor.from_pretrained("google/paligemma-3b-mix-224")
pg_model = PaliGemmaForConditionalGeneration.from_pretrained(
    "google/paligemma-3b-mix-224", dtype=torch.bfloat16
).to(device)
print(f" Model loaded: {sum(p.numel() for p in pg_model.parameters())/1e6:.0f}M params")
pg_dets = {}
for label, frame in frames.items():
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for pname in PROMPTS:
        t_infer = time.time()
        prompt = f"detect {pname}"
        inputs = pg_proc(text=prompt, images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = pg_model.generate(**inputs, max_new_tokens=100)
        result = pg_proc.decode(outputs[0], skip_special_tokens=True)
        infer_time = time.time() - t_infer
        # Parse bboxes from output: four <locNNNN> tokens describe one box.
        locs = re.findall(r'<loc(\d+)>', result)
        n_dets = len(locs) // 4
        has_detection = n_dets > 0 or (pname in result.lower() and 'detect' not in result.lower())
        # A textual-only hit (no boxes) still counts as one detection.
        scores = [1.0] * (n_dets if n_dets > 0 else 1) if has_detection else []
        pg_dets[f"{label}_{pname}"] = scores
        if has_detection:
            print(f" {label} prompt={pname:10s}: {n_dets} det ({infer_time:.1f}s) result={result[:80]}")
all_results["paligemma-3b-mix-224"] = {"elapsed": round(time.time()-t0, 1), "detections": pg_dets}
del pg_model
# The script falls back to CPU when MPS is unavailable; only touch the MPS
# allocator when it is the device in use.
if device == "mps":
    torch.mps.empty_cache()
# ===== Summary =====
# For each model, count at how many timepoints each prompt produced at least
# one detection, print the table and save the raw results as JSON.
print("\n" + "="*70)
print(f"{'Model':<28} {'Time':>8} {'Params':>8} {'Gun hits':>12} {'Pistol hits':>14} {'Stamp h':>10}")
print("-"*80)
total = len(TIMEPOINTS)
for model_name in ["grounding-dino-base", "paligemma-3b-mix-224"]:
    d = all_results[model_name]
    dets = d["detections"]
    summary = {}
    for pname in PROMPTS:
        # TIMEPOINTS entries are (seconds, label) pairs; detections are keyed
        # by the label.
        summary[pname] = sum(
            1 for _, label in TIMEPOINTS if dets.get(f"{label}_{pname}")
        )
    params = "232M" if "grounding" in model_name else "2923M"
    gun_h = summary.get("gun", 0)
    pistol_h = summary.get("pistol", 0)
    stamp_h = summary.get("stamp", 0)
    print(f"{model_name:<28} {d['elapsed']:>7.1f}s {params:>8} {gun_h:>6d}/{total} {pistol_h:>6d}/{total} {stamp_h:>6d}/{total}")
with open(os.path.join(OUTPUT_DIR, "comparison.json"), "w") as out_file:
    json.dump(all_results, out_file, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")

View File

@@ -108,7 +108,7 @@ def check_job(uuid: str) -> dict:
stages = []
t0 = time.time()
# 1. ASR
# 1. ASR (pass 1: faster-whisper small)
t = time.time()
f = OUTPUT_DIR / f"{uuid}.asr.json"
ok = f.exists() and f.stat().st_size > 0
@@ -118,10 +118,10 @@ def check_job(uuid: str) -> dict:
with open(f) as fh: d = json.load(fh)
segs = len(d.get("segments", []))
except: ok = False
stages.append({"name": "ASR", "passed": ok and segs > 0, "detail": f"{segs} seg" if ok else file_size(str(f)),
stages.append({"name": "ASR", "passed": ok and segs > 0, "detail": f"faster-whisper ({segs})" if ok else file_size(str(f)),
"elapsed": round(time.time() - t, 1)})
# 2. ASRX
# 2. ASRX (ECAPA-TDNN speaker diarization)
t = time.time()
f = OUTPUT_DIR / f"{uuid}.asrx.json"
ok = f.exists() and f.stat().st_size > 0
@@ -131,15 +131,28 @@ def check_job(uuid: str) -> dict:
with open(f) as fh: d = json.load(fh)
segs = len(d.get("segments", []))
except: ok = False
stages.append({"name": "ASRX", "passed": ok and segs > 0, "detail": f"{segs} seg" if ok else file_size(str(f)),
stages.append({"name": "ASRX", "passed": ok and segs > 0, "detail": f"ECAPA-TDNN ({segs})" if ok else file_size(str(f)),
"elapsed": round(time.time() - t, 1)})
# 3. Sentence Chunks
# 3. ASR2 (pass 2: correct split segments)
t = time.time()
cnt = int(psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{uuid}' AND chunk_type='sentence'"))
stages.append({"name": "Sentence", "passed": cnt > 0, "detail": f"{cnt} chunks", "elapsed": round(time.time() - t, 1)})
f2 = OUTPUT_DIR / f"{uuid}.asr-1.json"
ok2 = f2.exists() and f2.stat().st_size > 0
cnt2 = 0
if ok2:
try:
with open(f2) as fh: d2 = json.load(fh)
cnt2 = len(d2.get("kept", [])) + sum(len(c["corrected"]) for c in d2.get("corrections", []))
except: ok2 = False
stages.append({"name": "ASR2", "passed": ok2 and cnt2 > 0, "detail": f"{cnt2} chunks (asr-1.json)" if ok2 else file_size(str(f2)),
"elapsed": round(time.time() - t, 1)})
# 4. Vectorization
# 4. Sentence Chunks (DB)
t = time.time()
cnt = int(psql(f"SELECT count(*) FROM dev.chunk WHERE file_uuid='{uuid}' AND chunk_type='sentence'"))
stages.append({"name": "Sentence", "passed": cnt > 0, "detail": f"{cnt} DB", "elapsed": round(time.time() - t, 1)})
# 5. Vectorization
t = time.time()
vec = int(psql(f"SELECT count(*) FROM dev.chunk_vectors WHERE uuid='{uuid}'"))
qdrant_ok = False
@@ -161,7 +174,7 @@ def check_job(uuid: str) -> dict:
"detail": f"{vec} PG, Qdrant={'ok' if qdrant_ok else '?'}",
"elapsed": round(time.time() - t, 1)})
# 5. Face Trace
# 6. Face Trace
t = time.time()
traces = int(psql(f"SELECT count(DISTINCT trace_id) FROM dev.face_detections WHERE file_uuid='{uuid}' AND trace_id IS NOT NULL"))
faces = int(psql(f"SELECT count(*) FROM dev.face_detections WHERE file_uuid='{uuid}' AND trace_id IS NOT NULL"))
@@ -169,7 +182,7 @@ def check_job(uuid: str) -> dict:
"detail": f"{traces} traces, {faces} faces",
"elapsed": round(time.time() - t, 1)})
# 6. TKG
# 7. TKG
t = time.time()
nodes = int(psql(f"SELECT count(*) FROM dev.tkg_nodes WHERE file_uuid='{uuid}'"))
edges = int(psql(f"SELECT count(*) FROM dev.tkg_edges WHERE file_uuid='{uuid}'"))
@@ -177,16 +190,16 @@ def check_job(uuid: str) -> dict:
"detail": f"{nodes} nodes, {edges} edges",
"elapsed": round(time.time() - t, 1)})
# 7. Trace Chunks
# 8. Trace Chunks
t = time.time()
tc = int(psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{uuid}' AND chunk_type='trace'"))
tc = int(psql(f"SELECT count(*) FROM dev.chunk WHERE file_uuid='{uuid}' AND chunk_type='trace'"))
stages.append({"name": "TraceChunks", "passed": tc > 0, "detail": f"{tc} chunks",
"elapsed": round(time.time() - t, 1)})
# 8. Phase 1 Release
# 9. Phase 1 Release
t = time.time()
p1 = PROJECT / "release" / "phase1" / "latest"
p1_files = [p1 / "RELEASE_INFO.txt", p1 / "chunks.csv", p1 / "vectors.csv"]
p1_files = [p1 / "RELEASE_INFO.txt", p1 / "schema.sql", p1 / "snapshots"]
p1_ok = all(f.exists() for f in p1_files)
p1_size = sum(f.stat().st_size for f in p1.rglob("*") if f.is_file()) // 1024 if p1.exists() else 0
stages.append({"name": "Phase1", "passed": p1_ok,

204
scripts/rebuild_parents.py Normal file
View File

@@ -0,0 +1,204 @@
#!/opt/homebrew/bin/python3.11
"""
Rebuild parent/story chunks (280 × 15 children) + LLM summaries + Qdrant momentry_dev_stories.
"""
import json, sys, time, psycopg2
from collections import Counter
from urllib.request import Request, urlopen
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
FPS = 25.0
FILE_ID = 242
CHILDREN_PER_PARENT = 15
print("=== Step 1: Load sentence chunks sorted by time ===")
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()
# Row layout: (chunk_index, chunk_id, start_time, end_time, text_content,
# speaker_name, file_uuid)
cur.execute("""
    SELECT chunk_index, chunk_id, start_time, end_time, text_content,
    metadata->>'speaker_name', file_uuid
    FROM dev.chunks
    WHERE file_uuid=%s AND chunk_type='sentence'
    ORDER BY start_time, chunk_index
""", (UUID,))
children = cur.fetchall()
print(f"Loaded {len(children)} sentence chunks")
# Group into parents of 15
parents = []
for i in range(0, len(children), CHILDREN_PER_PARENT):
    group = children[i:i+CHILDREN_PER_PARENT]
    if not group:
        continue
    p_start = group[0][2]
    p_end = group[-1][3]
    child_ids = [c[1] for c in group]
    # Speaker breakdown by speaker name
    spk_names = Counter(c[5] for c in group)
    # The most common "name" is None when most chunks carry no speaker, so
    # the fallback must also cover that case.
    primary = (spk_names.most_common(1)[0][0] if spk_names else None) or "Unknown"
    parents.append({
        "start": p_start, "end": p_end,
        "child_ids": child_ids,
        "child_indices": [c[0] for c in group],
        "speakers": dict(spk_names.most_common()),
        "primary": primary,
        "texts": [c[4] for c in group],
    })
print(f"Parent chunks: {len(parents)}")
print(f"Speakers per parent: {[len(p['speakers']) for p in parents[:5]]}")
# Drop the story rows of a previous run before inserting the new ones
cur.execute("DELETE FROM dev.chunks WHERE file_uuid=%s AND chunk_type='story'", (UUID,))
print(f"Deleted old story chunks: {cur.rowcount}")

# One row per parent; the tuple order must match the INSERT column list below
print("\n=== Step 2: Insert parent chunks ===")
parent_records = []
for pi, p in enumerate(parents):
    pid = f"{UUID}_story_{pi}"
    dialogue = " ".join(t or "" for t in p["texts"])
    sf, ef = int(p["start"] * FPS), int(p["end"] * FPS)
    fc = ef - sf
    metadata = json.dumps({
        "method": "fixed_15",
        "seg_count": len(p["child_ids"]),
        "speakers": p["speakers"],
        "speaker_count": len(p["speakers"]),
        "primary_speaker": p["primary"],
        "words": len(dialogue.split()),
    })
    record = (
        UUID, pid, pi, "story", p["start"], p["end"],
        json.dumps({"type": "story_parent"}),
        dialogue, FPS, sf, ef, fc, FILE_ID, pid,
        metadata, p["child_ids"], [], None,
    )
    parent_records.append(record)
cur.executemany("""
    INSERT INTO dev.chunks
    (file_uuid, chunk_id, chunk_index, chunk_type,
    start_time, end_time, content, text_content,
    fps, start_frame, end_frame, frame_count,
    file_id, old_chunk_id, metadata, child_chunk_ids, pre_chunk_ids, summary_text)
    VALUES (%s,%s,%s,%s,%s,%s,%s::jsonb,%s,%s,%s,%s,%s,%s,%s,%s::jsonb,%s,%s,%s)
""", parent_records)
conn.commit()
print(f"Inserted {len(parent_records)} parent chunks")

# Point every sentence chunk at the story chunk that now contains it
child_links = [
    (f"{UUID}_story_{pi}", cid)
    for pi, p in enumerate(parents)
    for cid in p["child_ids"]
]
cur.executemany("UPDATE dev.chunks SET parent_chunk_id=%s WHERE chunk_id=%s", child_links)
conn.commit()
print("Updated child parent references")
print("\n=== Step 3: Generate LLM summaries ===")
def call_llm(prompt):
    """Send ``prompt`` as a single user message to the chat endpoint at LLM_URL.

    Returns the stripped content of the first choice. Network and parsing
    errors propagate to the caller.
    """
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaving it to GC.
    with urlopen(req, timeout=120) as resp:
        payload = json.loads(resp.read())
    return payload["choices"][0]["message"]["content"].strip()
def call_embed(text):
    """Request an embedding for ``text`` from the endpoint at EMBED_URL.

    Returns the ``embedding`` field of the first item in the response's
    ``data`` list. Network and parsing errors propagate to the caller.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaving it to GC.
    with urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["data"][0]["embedding"]
# Summarize each parent's dialogue, embed the summary and store the text in
# the DB; parents with (almost) no dialogue or a failed call get a zero vector.
t0 = time.time()
summaries = []
for pi, p in enumerate(parents):
    story_id = f"{UUID}_story_{pi}"
    dialogue = " ".join(t or "" for t in p["texts"])
    if len(dialogue) >= 10:
        try:
            prompt = f"Act as a film scene analyst. Summarize this dialogue in 50 words:\n{dialogue[:3000]}"
            summary = call_llm(prompt)
            time.sleep(0.2)
            embedding = call_embed(summary)
        except Exception as e:
            print(f" P{pi} ERROR: {e}")
            summary = "[error]"
            embedding = [0.0] * 768
    else:
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    cur.execute("UPDATE dev.chunks SET summary_text=%s, updated_at=NOW() WHERE chunk_id=%s",
                (summary, story_id))
    summaries.append({
        "index": pi,
        "chunk_id": story_id,
        "summary": summary,
        "start": p["start"],
        "end": p["end"],
        "embedding": embedding,
    })
    done = pi + 1
    if done % 20 == 0:
        print(f" [{done}/{len(parents)}] [{time.time()-t0:.0f}s]")
conn.commit()
print(f"Summaries: {len(summaries)}")
print("\n=== Step 4: Update Qdrant momentry_dev_stories ===")
# Delete old
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories", method="DELETE")
try:
    urlopen(req)
    time.sleep(0.5)
except Exception:
    # Best effort: the collection may simply not exist yet.
    pass
# Create
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories",
              data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
              headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req)
time.sleep(0.5)
# Upload dialogue + summary points: dialogue ids are 1..N, summary ids are
# N+1..2N where N is the number of parents.
points = []
for pi, p in enumerate(parents):
    # Dialogue point (zero vector)
    points.append({
        "id": pi + 1,
        "vector": [0.0] * 768,
        "payload": {"chunk_id": f"{UUID}_story_{pi}", "file_uuid": UUID,
                    "start_time": p["start"], "end_time": p["end"],
                    # text_content can be NULL; guard like the other joins do.
                    "type": "story_dialogue", "text": " ".join(t or "" for t in p["texts"])[:500]},
    })
    # Summary point
    s = summaries[pi]
    points.append({
        "id": pi + 1 + len(parents),
        "vector": s["embedding"],
        "payload": {"chunk_id": s["chunk_id"], "file_uuid": UUID,
                    "start_time": s["start"], "end_time": s["end"],
                    "type": "story_summary", "summary": s["summary"]},
    })
batch_size = 100
for start in range(0, len(points), batch_size):
    batch = points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req)
    except Exception as e:
        print(f" batch {start}: {e}")
print(f"Uploaded {len(points)} points")
# Verify
resp = json.loads(urlopen(f"{QDRANT_URL}/collections/momentry_dev_stories").read())
info = resp["result"]
print(f"Verifed: {info['points_count']} pts, {info['config']['params']['vectors'].get('size','?')}D")
conn.close()
print(f"\n=== Done [{time.time()-t0:.0f}s] ===")

View File

@@ -0,0 +1,320 @@
#!/opt/homebrew/bin/python3.11
"""
Rebuild story chunk text_content and regenerate summaries using new ASRX speaker assignments.
Then updates Qdrant momentry_dev_stories and sentence_story/sentence_summary collections.
"""
import json, sys, time, urllib.request
from urllib.request import Request, urlopen
import psycopg2
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
def call_llm(dialogue_text):
    """Ask the chat endpoint at LLM_URL for a 50-word summary of a dialogue.

    Returns the stripped content of the first choice, or an empty string when
    the request or the response parsing fails (the error is printed).
    """
    prompt = f"Dialogue:\n{dialogue_text}\n\n50-word summary:"
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # Close the HTTP response deterministically instead of leaving it to GC.
        with urlopen(req, timeout=120) as resp:
            payload = json.loads(resp.read())
        return payload["choices"][0]["message"]["content"].strip()
    except Exception as e:
        # Deliberately best-effort: one failed summary must not stop the run.
        print(f" LLM error: {e}")
        return ""
def call_embed(text):
    """Request an embedding for ``text`` from the endpoint at EMBED_URL.

    Returns the ``embedding`` field of the first item in the response's
    ``data`` list, or a 768-element zero vector when the request or the
    response parsing fails (the error is printed).
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        # Close the HTTP response deterministically instead of leaving it to GC.
        with urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
        return payload["data"][0]["embedding"]
    except Exception as e:
        # Deliberately best-effort: one failed embedding must not stop the run.
        print(f" Embed error: {e}")
        return [0.0] * 768
import re

print("=== Step 1: Load sentence chunks with new speaker info ===")
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()
cur.execute("""
    SELECT chunk_index, text_content, metadata->>'new_speaker_name',
    metadata->>'speaker_name', content
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'sentence'
    ORDER BY chunk_index
""", (UUID,))
sentence_rows = cur.fetchall()
print(f"Loaded {len(sentence_rows)} sentence chunks")

# Lookup keyed by chunk_index; the new speaker name falls back to the old
# one and finally to "Unknown"
sentences = {
    row[0]: {
        "old_text": row[1] or "",
        "new_name": row[2] or row[3] or "Unknown",
        "old_name": row[3] or "Unknown",
        "content": row[4] or {},
    }
    for row in sentence_rows
}

# Rewrite every sentence as "[<speaker>] <text>" using the new speaker names
print("\n=== Step 2: Rebuild sentence text_content ===")
updated_sentences = 0
for r in sentence_rows:
    idx, old_text, new_name, old_name, content = r
    new_name = new_name or old_name or "Unknown"
    # Prefer the raw text stored in content; otherwise strip the old
    # "[Speaker] " prefix from text_content
    raw_text = ""
    if content and isinstance(content, dict):
        raw_text = content.get("data", {}).get("text", "")
    if not raw_text and old_text:
        m = re.search(r'\]\s*(.*)', old_text)
        raw_text = m.group(1) if m else old_text
    new_text = f"[{new_name}] {raw_text}"
    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE file_uuid = %s AND chunk_type = 'sentence' AND chunk_index = %s
    """, (new_text, UUID, idx))
    updated_sentences += 1
conn.commit()
print(f"Updated {updated_sentences} sentence chunks text_content")
import re

print("\n=== Step 3: Rebuild story chunk text_content ===")
cur.execute("""
    SELECT id, chunk_id, chunk_index, child_chunk_ids, start_time, end_time,
    text_content, summary_text
    FROM dev.chunks
    WHERE file_uuid = %s AND chunk_type = 'story'
    ORDER BY chunk_index
""", (UUID,))
story_rows = cur.fetchall()
print(f"Loaded {len(story_rows)} story chunks")

# For every story, join its children's lines as "(<speaker>) <text>"
story_dialogue_texts = []
for r in story_rows:
    db_id, cid, idx, child_ids, st, et, old_text, old_summary = r
    dialogue_parts = []
    for child_cid in (child_ids or []):
        # The sentence index is the numeric suffix after the last underscore
        child_idx = int(child_cid.rsplit("_", 1)[-1])
        if child_idx not in sentences:
            continue
        s = sentences[child_idx]
        raw = ""
        if s["content"] and isinstance(s["content"], dict):
            raw = s["content"].get("data", {}).get("text", "")
        if not raw:
            # Fall back to the old text without its "[Speaker] " prefix
            m = re.search(r'\]\s*(.*)', s["old_text"])
            raw = m.group(1) if m else s["old_text"]
        if not raw:
            continue
        dialogue_parts.append(f'({s["new_name"]}) {raw}')
    dialogue_text = " ".join(dialogue_parts)
    story_dialogue_texts.append((db_id, cid, idx, st, et, dialogue_text, old_summary))
print(f"Built {len(story_dialogue_texts)} story dialogue texts")

# Store the new dialogue text; summaries are regenerated in a later step
for item in story_dialogue_texts:
    db_id, cid, idx, st, et, dialogue_text, old_summary = item
    cur.execute("""
        UPDATE dev.chunks
        SET text_content = %s, updated_at = NOW()
        WHERE id = %s
    """, (dialogue_text, db_id))
conn.commit()
print("Updated story chunk dialogue texts")
print("\n=== Step 4: Generate LLM summaries (all 228 stories) ===")
# Summarize each story's dialogue, embed the summary and store the text in
# the DB; stories with (almost) no dialogue get a zero vector.
summaries = []
for i, item in enumerate(story_dialogue_texts):
    db_id, cid, idx, st, et, dialogue_text, old_summary = item
    if len(dialogue_text) < 10:
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        print(f" [{i+1}/{len(story_dialogue_texts)}] {cid}: {len(dialogue_text)} chars", end="")
        try:
            summary = call_llm(dialogue_text[:3000])
            print(f" -> {len(summary)} chars")
            time.sleep(0.3)
            embedding = call_embed(summary)
        except Exception as e:
            print(f" ERROR: {e}")
            summary = "[error]"
            embedding = [0.0] * 768
    # Update DB. The summary is LLM output, so pass it as a bound parameter
    # instead of splicing it into the SQL text.
    cur.execute("""
        UPDATE dev.chunks
        SET summary_text = %s, updated_at = NOW()
        WHERE id = %s
    """, (summary, db_id))
    summaries.append({
        "db_id": db_id,
        "chunk_id": cid,
        "chunk_index": idx,
        "start_time": st,
        "end_time": et,
        "dialogue": dialogue_text,
        "summary": summary,
        "embedding": embedding,
    })
conn.commit()
print(f"\nGenerated {len(summaries)} summaries")
print("\n=== Step 5: Rebuild Qdrant momentry_dev_stories ===")
# Delete existing
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories", method="DELETE")
try:
    urlopen(req)
    time.sleep(0.3)
except Exception:
    # Best effort: the collection may simply not exist yet.
    pass
# Recreate
req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories",
              data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
              headers={"Content-Type": "application/json"}, method="PUT")
urlopen(req)
time.sleep(0.3)
# Dialogue points use ids chunk_index+1; summary points follow after the
# highest dialogue id. Deriving the offset from the data (228 for the
# original 228 stories) keeps the two id ranges disjoint for any story count.
summary_id_offset = max((s["chunk_index"] for s in summaries), default=-1) + 1
dialogue_points = []
summary_points = []
for s in summaries:
    idx = s["chunk_index"]
    dialogue_points.append({
        "id": idx + 1,
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_dialogue",
            "text": s["dialogue"][:500],
        }
    })
    summary_points.append({
        "id": idx + 1 + summary_id_offset,
        "vector": s["embedding"],
        "payload": {
            "chunk_id": s["chunk_id"],
            "file_uuid": UUID,
            "start_time": s["start_time"],
            "end_time": s["end_time"],
            "type": "story_summary",
            "summary": s["summary"],
        }
    })
all_story_points = dialogue_points + summary_points
batch_size = 100
for start in range(0, len(all_story_points), batch_size):
    batch = all_story_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_stories/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req)
    except Exception as e:
        print(f" Batch {start}: {e}")
    if (start // batch_size) % 3 == 0:
        print(f" Uploaded {start + len(batch)}/{len(all_story_points)}")
print(f"Uploaded {len(all_story_points)} points to momentry_dev_stories")
print("\n=== Step 6: Populate sentence_story and sentence_summary ===")
# These are the per-sentence template + summary collections
# sentence_story: one 768D point per sentence, template payloads
# sentence_summary: one 768D point per sentence, LLM summary payloads
for col_name in ["sentence_story", "sentence_summary"]:
    req = Request(f"{QDRANT_URL}/collections/{col_name}", method="DELETE")
    try:
        urlopen(req)
        time.sleep(0.2)
    except Exception:
        # Best effort: the collection may simply not exist yet.
        pass
    req = Request(f"{QDRANT_URL}/collections/{col_name}",
                  data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    urlopen(req)
    time.sleep(0.2)
# Build points for sentence_story and sentence_summary
story_sentence_points = []
summary_sentence_points = []
for idx in sorted(sentences.keys()):
    s = sentences[idx]
    raw_text = ""
    if s["content"] and isinstance(s["content"], dict):
        raw_text = s["content"].get("data", {}).get("text", "")
    dialog_line = f'({s["new_name"]}) {raw_text}'
    story_sentence_points.append({
        "id": idx + 1,
        "vector": [0.0] * 768,
        "payload": {
            "chunk_id": f"{UUID}_{idx}",
            "file_uuid": UUID,
            # Times are not loaded by the sentence query; stored as 0.
            "start_time": 0,
            "end_time": 0,
            "text": dialog_line,
            "speaker_name": s["new_name"],
            "chunk_type": "sentence",
        }
    })
# Upload sentence_story (dialogue template)
batch_size = 200
total_points = len(story_sentence_points)
for start in range(0, total_points, batch_size):
    batch = story_sentence_points[start:start+batch_size]
    req = Request(f"{QDRANT_URL}/collections/sentence_story/points?wait=true",
                  data=json.dumps({"points": batch}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req)
    except Exception as e:
        print(f" sentence_story batch {start}: {e}")
    if (start // batch_size) % 5 == 0:
        # Report against the real point count rather than a fixed 3417.
        print(f" Uploaded {start + len(batch)}/{total_points} sentence_story")
print("Uploaded sentence_story points")
# sentence_summary will be populated when we generate per-sentence summaries
# For now, mark as TODO
print("sentence_summary: SKIPPED (needs per-sentence LLM summaries)")
cur.close()
conn.close()
print("\n=== Done ===")

View File

@@ -0,0 +1,180 @@
#!/opt/homebrew/bin/python3.11
"""
Rescan cut scenes at 1-frame interval to find more face detections
for single-frame traces.
Usage:
python3 scripts/rescan_single_frame_traces.py --file-uuid <uuid> [--video-path <path>]
"""
import os, sys, json, subprocess, tempfile, argparse, time, psycopg2
from pathlib import Path
from collections import defaultdict
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
OUTPUT_DIR = os.environ.get("MOMENTRY_OUTPUT_DIR", "/Users/accusys/momentry/output_dev")
SCRIPTS_DIR = os.environ.get("MOMENTRY_SCRIPTS_DIR", "/Users/accusys/momentry_core_0.1/scripts")
VENV_PYTHON = "/Users/accusys/momentry_core_0.1/venv/bin/python"
def get_cut_scenes_with_single_traces(file_uuid):
    """Return the cut scenes of a file that contain single-frame face traces.

    A single-frame trace is a ``trace_id`` with exactly one row in
    ``dev.face_detections`` for the file.

    Returns:
        Rows of ``(chunk_id, start_frame, end_frame, start_time, end_time,
        single_traces)``, ordered by the number of single-frame traces,
        highest first.
    """
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        cur.execute("SET search_path TO dev")
        cur.execute("""
            SELECT c.chunk_id, c.start_frame, c.end_frame, c.start_time, c.end_time,
            COUNT(DISTINCT s.trace_id) as single_traces
            FROM dev.chunks c
            JOIN dev.face_detections fd ON fd.file_uuid=c.file_uuid
            AND fd.frame_number >= c.start_frame AND fd.frame_number <= c.end_frame
            JOIN (
            SELECT trace_id FROM dev.face_detections
            WHERE file_uuid=%s AND trace_id IS NOT NULL
            GROUP BY trace_id HAVING COUNT(*) = 1
            ) s ON s.trace_id = fd.trace_id
            WHERE c.file_uuid=%s AND c.chunk_type='cut'
            GROUP BY c.id, c.chunk_id, c.start_frame, c.end_frame, c.start_time, c.end_time
            ORDER BY single_traces DESC
        """, (file_uuid, file_uuid))
        scenes = cur.fetchall()
        cur.close()
    finally:
        # Do not leak the connection when the query fails.
        conn.close()
    return scenes
def process_scene(file_uuid, video_path, chunk_id, start_frame, end_frame, start_time, end_time):
    """Re-run face detection on every frame of one cut scene.

    The scene is cut out of the video with ffmpeg (padded by one second on
    each side), ``face_processor.py`` is run on the segment with a sample
    interval of 1, and the frame numbers it reports are mapped back to frame
    numbers of the full video.

    Returns:
        ``(detections, None)`` on success, where ``detections`` is a list of
        ``{"frame": int, "faces": [...]}`` limited to frames inside
        ``[start_frame, end_frame]``; ``(None, message)`` on failure.
    """
    temp_dir = Path(OUTPUT_DIR) / f"rescan_{file_uuid[:8]}"
    temp_dir.mkdir(parents=True, exist_ok=True)
    seg_path = temp_dir / f"{chunk_id}.mp4"
    out_path = temp_dir / f"{chunk_id}.face.json"

    seek_time = max(0, start_time - 1)  # segment starts one second early
    duration = end_time - start_time + 2  # pad 2 seconds
    # Frame 0 of the segment is the source frame at seek_time, i.e. roughly
    # one second (not one frame) before start_frame. The frame rate is taken
    # from the chunk's own frame/time pair.
    # NOTE(review): assumes start_frame == start_time * fps - confirm.
    if start_time > 0:
        frame_offset = round(start_frame * float(seek_time) / float(start_time))
    else:
        frame_offset = 0

    try:
        # Extract segment
        result = subprocess.run([
            "ffmpeg", "-y", "-i", video_path,
            "-ss", str(seek_time),
            "-t", str(duration),
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "28",
            "-an",  # no audio
            str(seg_path)
        ], capture_output=True, text=True)
        if not seg_path.exists():
            return None, f"ffmpeg failed: {result.stderr[:200]}"

        # Run face processor; a stale result of an earlier run must not be
        # mistaken for success.
        out_path.unlink(missing_ok=True)
        try:
            subprocess.run([
                VENV_PYTHON, str(Path(SCRIPTS_DIR) / "face_processor.py"),
                str(seg_path), str(out_path),
                "--sample-interval", "1",
                "--uuid", file_uuid,
            ], capture_output=True, text=True, timeout=180)
        except subprocess.TimeoutExpired:
            return None, "face processor timed out"
        if not out_path.exists():
            return None, f"face processor failed"

        # Read results
        try:
            with open(out_path) as f:
                data = json.load(f)
        except json.JSONDecodeError as e:
            return None, f"face processor wrote invalid JSON: {e}"
    finally:
        # Cleanup temp files on every path
        seg_path.unlink(missing_ok=True)
        out_path.unlink(missing_ok=True)

    # Re-map frame numbers and drop the padding around the scene
    new_detections = []
    for entry in data.get("frames", []):
        orig_frame = int(entry.get("frame", 0)) + frame_offset
        if orig_frame < start_frame or orig_frame > end_frame:
            continue
        faces = entry.get("faces", [])
        if faces:
            new_detections.append({"frame": orig_frame, "faces": faces})
    return new_detections, None
def merge_into_face_json(file_uuid, scene_detections):
    """Merge rescanned detections into ``<file_uuid>.face.json`` in OUTPUT_DIR.

    Faces for a frame that already exists are appended only when their
    ``face_id`` is not among that frame's existing ids; unknown frames are
    added whole. The file is rewritten with frames sorted by frame number.

    Returns:
        The number of faces added.
    """
    face_path = Path(OUTPUT_DIR) / f"{file_uuid}.face.json"
    with open(face_path) as f:
        face_data = json.load(f)

    # frame number -> position in the frames list (positions as loaded)
    existing = {
        entry["frame"]: pos
        for pos, entry in enumerate(face_data.get("frames", []))
    }
    frames = face_data["frames"]

    new_faces = 0
    for entry in scene_detections:
        fn = entry["frame"]
        if fn not in existing:
            frames.append({"frame": fn, "faces": entry["faces"]})
            new_faces += len(entry["faces"])
            continue
        # Add new faces not already present
        known_faces = frames[existing[fn]]["faces"]
        existing_face_ids = {face.get("face_id") for face in known_faces}
        for face in entry["faces"]:
            if face.get("face_id") in existing_face_ids:
                continue
            known_faces.append(face)
            new_faces += 1

    # Re-sort by frame
    frames.sort(key=lambda item: item["frame"])
    with open(face_path, "w") as f:
        json.dump(face_data, f)
    return new_faces
def main():
    """CLI entry point: rescan all cut scenes that hold single-frame traces.

    The video path comes from ``--video-path`` or, when omitted, from
    ``dev.videos``. Every scene is rescanned and its detections are merged
    into the file's face JSON; progress and totals are printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--file-uuid", required=True)
    parser.add_argument("--video-path", default=None)
    args = parser.parse_args()
    UUID = args.file_uuid

    video_path = args.video_path
    if not video_path:
        # Try to find video path from DB
        conn = psycopg2.connect(DB_URL)
        cur = conn.cursor()
        cur.execute("SET search_path TO dev")
        cur.execute("SELECT file_path FROM dev.videos WHERE file_uuid=%s", (UUID,))
        row = cur.fetchone()
        cur.close()
        conn.close()
        if not row:
            print(f"Video not found for UUID {UUID}")
            return
        video_path = row[0]

    print(f"Scanning for single-frame traces in {UUID}")
    scenes = get_cut_scenes_with_single_traces(UUID)
    scene_count = len(scenes)
    print(f"Found {scene_count} cut scenes with single-frame traces")

    total_new = 0
    start_time = time.time()
    for pos, scene in enumerate(scenes, start=1):
        chunk_id, sf, ef, st, et, n_traces = scene
        t0 = time.time()
        detections, error = process_scene(UUID, video_path, chunk_id, sf, ef, st, et)
        if error:
            print(f"[{pos}/{scene_count}] {chunk_id}: ERROR - {error}")
            continue
        if not detections:
            print(f"[{pos}/{scene_count}] {chunk_id}: no new detections")
            continue
        added = merge_into_face_json(UUID, detections)
        total_new += added
        elapsed = time.time() - t0
        # ETA extrapolates from the duration of this scene only
        eta = (scene_count - pos) * elapsed
        print(f"[{pos}/{scene_count}] {chunk_id}: +{added} faces ({len(detections)} frames, {elapsed:.0f}s, ETA {eta/60:.0f}min)")

    print(f"\nDone! Added {total_new} new face detections across {scene_count} scenes")
    print(f"Total time: {(time.time()-start_time)/60:.1f} min")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,164 @@
#!/opt/homebrew/bin/python3.11
"""
Scan Charade for hand-held objects using YOLO spatial overlap + pose wrist verification.
Strategy:
1. Sample frames at regular intervals
2. For each person, check if non-person objects overlap with hand area
3. Use pose wrist keypoints to verify hand position
4. Classify with Grounding DINO
"""
import json, sys, time, psycopg2
from collections import defaultdict, Counter
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
FPS = 25.0
SAMPLE_INTERVAL = 300 # every 300 frames = every 12s
HAND_RADIUS = 100 # pixels around wrist to check for objects
def iou(box1, box2):
    """Return the intersection-over-union of two boxes given as [x, y, w, h].

    Yields the integer 0 when the boxes do not intersect (or the union is not
    positive); otherwise the ratio intersection / union.
    """
    ax, ay, aw, ah = box1
    bx, by, bw, bh = box2

    # Corners of the intersection rectangle.
    left = max(ax, bx)
    top = max(ay, by)
    right = min(ax + aw, bx + bw)
    bottom = min(ay + ah, by + bh)

    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    inter = overlap_w * overlap_h
    if inter == 0:
        return 0

    union = aw * ah + bw * bh - inter
    if union > 0:
        return inter / union
    return 0
# Script start: print a banner and open the DB connection/cursor that the
# rest of the script keeps using.
print("=== Hand-held Object Scanner ===")
conn = psycopg2.connect(DB_URL)
cur = conn.cursor()
# Load pose wrist data (frame → wrist positions)
print("Loading pose wrist data...")
t0 = time.time()
# Fetch all 'pose' pre-chunks of this video that carry a 'persons' entry,
# ordered by start frame.
cur.execute("""
SELECT start_frame, data
FROM dev.pre_chunks
WHERE file_uuid=%s AND processor_type='pose'
AND data->'persons' IS NOT NULL
ORDER BY start_frame
""", (UUID,))
pose_wrists = {} # frame → list of (x, y) wrist positions
for r in cur.fetchall():
frame = r[0]
persons = r[1].get("persons", [])
wrists = []
for p in persons:
for kp in p.get("keypoints", []):
name = kp.get("name", "")
# Only left/right wrist keypoints with confidence above 0.1 are kept.
if name in ("left_wrist", "right_wrist") and kp.get("confidence", 0) > 0.1:
wrists.append((kp["x"], kp["y"]))
# Frames with no usable wrist keypoint are left out of the map.
if wrists:
pose_wrists[frame] = wrists
print(f" {len(pose_wrists)} frames with wrists ({time.time()-t0:.1f}s)")
# Scan YOLO frames
# Every SAMPLE_INTERVAL-th frame is looked up individually; for each sampled
# frame, non-person objects whose box overlaps a person box are recorded.
print(f"Scanning YOLO data (interval={SAMPLE_INTERVAL})...")
t0 = time.time()
# Get total frames
cur.execute("SELECT MAX(start_frame) FROM dev.pre_chunks WHERE file_uuid=%s AND processor_type='yolo'", (UUID,))
# MAX() is NULL (None) when there are no YOLO rows; fall back to 0.
max_frame = cur.fetchone()[0] or 0
results = []
for frame_num in range(0, max_frame + 1, SAMPLE_INTERVAL):
# Get YOLO detections for this frame
# NOTE(review): this issues one query per sampled frame.
cur.execute("""
SELECT data->'objects'
FROM dev.pre_chunks
WHERE file_uuid=%s AND processor_type='yolo' AND start_frame=%s
""", (UUID, frame_num))
yolo_row = cur.fetchone()
if not yolo_row or not yolo_row[0]:
continue
objects = yolo_row[0]
# Find persons
persons = [o for o in objects if o.get("class_name") == "person" and o.get("confidence", 0) > 0.5]
if not persons:
continue
# Find non-person objects
items = [o for o in objects if o.get("class_name") != "person" and o.get("confidence", 0) > 0.3]
if not items:
continue
# Get wrist positions for this frame
# Only frames that appear as a pose start_frame have wrists; otherwise empty.
wrists = pose_wrists.get(frame_num, [])
ts = frame_num / FPS
frame_results = []
for item in items:
item_box = (item["x"], item["y"], item["width"], item["height"])
item_center_x = item["x"] + item["width"] / 2
item_center_y = item["y"] + item["height"] / 2
# Check if item is near any person
for person in persons:
person_box = (person["x"], person["y"], person["width"], person["height"])
overlap = iou(item_box, person_box)
if overlap > 0.01:
# Check if near a wrist (if pose data available)
# Euclidean distance from the item's box centre to each wrist point.
near_hand = False
for wx, wy in wrists:
dist = ((item_center_x - wx) ** 2 + (item_center_y - wy) ** 2) ** 0.5
if dist < HAND_RADIUS:
near_hand = True
break
cls = item["class_name"]
conf = item.get("confidence", 0)
# NOTE(review): near_hand is only recorded; it does not filter the result.
# An item overlapping several persons is appended once per person.
frame_results.append({
"frame": frame_num,
"timestamp": round(ts, 1),
"time_str": f"{int(ts//60)}:{int(ts%60):02d}",
"object": cls,
"confidence": round(conf, 3),
"near_hand": near_hand,
"overlap": round(overlap, 3),
})
if frame_results:
results.extend(frame_results)
elapsed = time.time() - t0
print(f" Scanned in {elapsed:.1f}s")
# Deduplicate: keep the first detection seen for each (frame, object class).
seen = set()
deduped = []
for r in results:
    key = (r["frame"], r["object"])
    if key not in seen:
        seen.add(key)
        deduped.append(r)

# Group by object type
by_object = defaultdict(list)
for r in deduped:
    by_object[r["object"]].append(r)

print(f"\n=== Results: {len(deduped)} hand-held object detections ===")
print(f"{'Object':<20} {'Count':>6} {'Near hand':>12} {'Timestamps':<40}")
print("-"*80)
# Most frequent object classes first.
for obj, items in sorted(by_object.items(), key=lambda x: -len(x[1])):
    near_hand = sum(1 for i in items if i["near_hand"])
    # Show at most five timestamps, then a "+N" remainder.
    ts_list = ", ".join(i["time_str"] for i in items[:5])
    if len(items) > 5:
        ts_list += f" ... (+{len(items)-5})"
    print(f"{obj:<20} {len(items):>6} {near_hand:>8d} {ts_list:<40}")

# Save
# Use a context manager so the output file is flushed and closed deterministically.
with open("/Users/accusys/momentry/output_dev/handheld_objects.json", "w") as out_file:
    json.dump(deduped, out_file, indent=2)
print(f"\nSaved to output_dev/handheld_objects.json")
cur.close()
conn.close()

169
scripts/speaker_bind_lip.py Normal file
View File

@@ -0,0 +1,169 @@
#!/opt/homebrew/bin/python3.11
"""
Speaker Binding with Lip Verification
Reads face.json (8Hz outer_lips) + asrx.json + identity_bindings
For each ASR segment with face data + lip motion, create speaker→identity binding.
"""
import json, subprocess, sys
from pathlib import Path
from collections import defaultdict
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
OUTPUT_DIR = Path("/Users/accusys/momentry/output_dev")
PSQL = ["/Users/accusys/pgsql/18.3/bin/psql", "-U", "accusys", "-d", "momentry", "-t", "-A"]
def psql(sql: str) -> str:
    """Run *sql* through the psql command line and return its stripped stdout.

    The command is ``PSQL + ["-c", sql]`` with a 30 second timeout. A non-zero
    exit status is reported on stderr instead of being dropped silently; the
    (possibly empty) stdout is still returned so existing callers keep working.

    Raises:
        subprocess.TimeoutExpired: when psql does not finish within 30 seconds.
    """
    r = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        # Without this, a failed query is indistinguishable from an empty result.
        print(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}", file=sys.stderr)
    return r.stdout.strip()
def calc_lip_height(face_data):
    """Return the vertical extent (max y - min y) of a face's outer-lip contour.

    ``face_data["lips"]`` may be a dict holding an ``"outer_lips"`` list of
    [x, y] points, or the point list itself. Returns ``None`` when the contour
    is missing or has fewer than three points.
    """
    lips = face_data.get("lips", {})
    if isinstance(lips, dict):
        contour = lips.get("outer_lips", [])
    else:
        contour = lips

    if not contour or len(contour) < 3:
        return None

    y_values = [point[1] for point in contour]
    return max(y_values) - min(y_values)
print("=== Speaker Binding with Lip Verification ===")
# Step 1: Load face traces with identity_id
# psql output is parsed as one row per line with '|' between columns.
traces = psql(f"""
SELECT trace_id, identity_id FROM dev.face_detections
WHERE file_uuid='{UUID}' AND trace_id IS NOT NULL AND identity_id IS NOT NULL
GROUP BY trace_id, identity_id
""")
trace_identity = {}
for line in traces.strip().split('\n'):
if not line.strip() or '|' not in line: continue
p = line.split('|')
# If a trace appears with several identities, the last row read wins.
trace_identity[int(p[0])] = int(p[1])
print(f"Traces with identity: {len(trace_identity)}")
# Step 2: Load trace frame ranges
tf = psql(f"""
SELECT trace_id, MIN(frame_number), MAX(frame_number), MIN(timestamp_secs), MAX(timestamp_secs)
FROM dev.face_detections WHERE file_uuid='{UUID}' AND trace_id IS NOT NULL
GROUP BY trace_id
""")
trace_ranges = {}
for line in tf.strip().split('\n'):
if not line.strip() or '|' not in line: continue
p = line.split('|')
tid = int(p[0])
trace_ranges[tid] = {
'min_frame': int(p[1]), 'max_frame': int(p[2]),
'min_ts': float(p[3]), 'max_ts': float(p[4])
}
# Step 3: Load lip analysis per frame from face.json
print("Loading face.json lips data...")
face = json.load(open(OUTPUT_DIR / f"{UUID}.face.json"))
# frame number -> list of {"height": lip height} for faces with a usable contour
frame_faces = {}
for fr in face.get("frames", []):
fn = fr["frame"]
faces_data = []
for face_data in fr.get("faces", []):
h = calc_lip_height(face_data)
if h is not None:
# NOTE(review): only the lip height is kept; no trace or face id is stored,
# so later steps cannot tell which face in the frame a height belongs to.
faces_data.append({"height": h})
if faces_data:
frame_faces[fn] = faces_data
print(f"Frames with lip data: {len(frame_faces)}")
# Step 4: Load ASRX segments
asrx = json.load(open(OUTPUT_DIR / f"{UUID}.asrx.json"))
segments = asrx.get("segments", [])
# Step 5: For each ASR segment with face overlap, compute lip motion
# Result: (speaker_id, trace_id) -> list of per-segment scores in [0, 1].
from collections import defaultdict
speaker_trace_scores = defaultdict(list)
for seg in segments:
st = seg.get("start_time", 0)
et = seg.get("end_time", 0)
speaker = seg.get("speaker_id", "")
if not speaker:
continue
fps = 25.0
# NOTE(review): start_frame and end_frame are computed but not used below.
start_frame = int(st * fps)
end_frame = int(et * fps) + 10
# Find overlapping traces
# A trace overlaps when its [min_ts, max_ts] interval intersects [st, et].
overlapping_traces = []
for tid, tr in trace_ranges.items():
if tr['min_ts'] <= et and tr['max_ts'] >= st:
overlapping_traces.append(tid)
if not overlapping_traces:
continue
# Compute lip motion for each overlapping trace
for tid in overlapping_traces:
tr = trace_ranges[tid]
# Baseline frames before ASR start
baseline = []
# During frames
during = []
# NOTE(review): every frame with lip data is scanned for every
# (segment, trace) pair, and all faces in a frame count toward the trace.
for fn in frame_faces:
fn_ts = fn / fps
if fn_ts >= tr['min_ts'] and fn_ts <= tr['max_ts']:
if fn_ts < st - 1.0: # Before (baseline)
for fd in frame_faces[fn]:
baseline.append(fd["height"])
elif fn_ts >= st and fn_ts <= et: # During
for fd in frame_faces[fn]:
during.append(fd["height"])
# Both a baseline and a speaking sample are required to score the pair.
if not baseline or not during:
continue
baseline_avg = sum(baseline) / len(baseline)
during_avg = sum(during) / len(during)
# Relative increase of mean lip height; denominator floored at 0.1.
motion = (during_avg - baseline_avg) / max(baseline_avg, 0.1)
score = max(0, min(1.0, motion * 5)) # Normalize: 20% motion → 1.0
speaker_trace_scores[(speaker, tid)].append(score)
# Step 6: Create speaker bindings
# Speakers that already have a binding to any identity seen in this video are skipped.
existing = psql(f"SELECT identity_value FROM dev.identity_bindings WHERE identity_type='speaker' AND identity_id IN (SELECT identity_id FROM dev.face_detections WHERE file_uuid='{UUID}' AND identity_id IS NOT NULL GROUP BY identity_id)")
existing_speakers = set(existing.strip().split('\n')) if existing.strip() else set()
new_bindings = 0
for (speaker, tid), scores in speaker_trace_scores.items():
    if tid not in trace_identity:
        continue
    identity_id = trace_identity[tid]
    avg_score = sum(scores) / len(scores) if scores else 0
    if speaker in existing_speakers:
        continue
    if avg_score < 0.3:  # Threshold: need meaningful lip motion
        continue
    # The speaker label comes from a JSON file; double any single quote so it
    # cannot terminate the SQL string literal.
    speaker_sql = str(speaker).replace("'", "''")
    # NOTE(review): a speaker may be bound to several identities when more than
    # one of its traces passes the threshold; the count below is incremented
    # without checking whether psql reported success.
    r = psql(f"""
INSERT INTO dev.identity_bindings (identity_id, identity_type, identity_value, confidence, metadata)
VALUES ({identity_id}, 'speaker', '{speaker_sql}', {avg_score:.3f}, '{{"source":"lip_analysis","trace_id":{tid},"segments":{len(scores)},"avg_score":{avg_score:.3f}}}'::jsonb)
ON CONFLICT (identity_id, identity_type, identity_value) DO UPDATE SET confidence=EXCLUDED.confidence
""")
    new_bindings += 1

print(f"\n=== Done ===")
print(f"ASR segments analyzed: {len(segments)}")
print(f"Segments with face+lip data: {len(speaker_trace_scores)}")
print(f"New speaker bindings: {new_bindings}")

# Verify
binds = psql(f"SELECT ib.identity_value, i.name FROM dev.identity_bindings ib JOIN dev.identities i ON i.id=ib.identity_id WHERE ib.identity_type='speaker' AND i.id IN (SELECT identity_id FROM dev.face_detections WHERE file_uuid='{UUID}') ORDER BY ib.identity_value")
print(f"\nSpeaker bindings:")
for line in binds.strip().split('\n'):
    if line.strip() and '|' in line:
        p = line.split('|')
        print(f" {p[0]:15s}{p[1]}")

View File

@@ -0,0 +1,204 @@
#!/opt/homebrew/bin/python3.11
"""
Split ASR segments at detected speaker change points.
Uses ECAPA-TDNN sub-window classification against reference centroids.
Output: new asrx_fine.json with fine-grained segments + parent_asr_idx reference.
"""
import json, sys, os, time, argparse, subprocess, tempfile, shutil
import numpy as np
from collections import Counter
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "asrx_self"))
from main_fixed import SelfASRXFixed
from speaker_encoder import extract_speaker_embedding, normalize_embeddings
import torchaudio, psycopg2
SUB_WIN = 0.5
SUB_STRIDE = 0.25
CHANGE_CONFIRM = 2
MIN_DUR = 0.7
BATCH_SIZE = 500
def load_reference(uuid, db_url):
    """Build reference speaker centroids and a name -> speaker_id map.

    Speaker names per sentence chunk are read from the database (keyed by
    ``chunk_index``), and segment embeddings from ``<uuid>.asrx.json``. For
    each of the reference names "Cary Grant", "Audrey Hepburn" and "Unknown",
    the embeddings of the segments carrying that name are averaged and scaled
    to unit length.

    Args:
        uuid: file UUID used in the query and in the ASRX file name.
        db_url: connection string passed to ``psycopg2.connect``.

    Returns:
        ``(centroids, name_to_speaker)`` where ``centroids`` maps a reference
        name to its unit-length mean embedding (names with no embeddings are
        omitted) and ``name_to_speaker`` maps each name to the ``speaker_id``
        of the first segment carrying it.
    """
    # NOTE(review): the query reads dev.chunks / chunk_index; confirm both
    # still exist after the schema migration.
    conn = psycopg2.connect(db_url)
    try:
        cur = conn.cursor()
        cur.execute("SELECT chunk_index, metadata->>'new_speaker_name' FROM dev.chunks WHERE file_uuid=%s AND chunk_type='sentence' ORDER BY chunk_index", (uuid,))
        name_by_idx = dict(cur.fetchall())
    finally:
        # Release the connection even when the query fails.
        conn.close()

    asrx_path = f"/Users/accusys/momentry/output_dev/{uuid}.asrx.json"
    with open(asrx_path) as f:
        asrx_full = json.load(f)

    embeddings = asrx_full.get("embeddings", [])
    ref = {"Cary Grant": [], "Audrey Hepburn": [], "Unknown": []}
    name_to_speaker = {}
    for i, seg in enumerate(asrx_full["segments"]):
        # Segments without a DB row are treated as "Unknown".
        name = name_by_idx.get(i, "Unknown")
        if name in ref and i < len(embeddings):
            ref[name].append(np.array(embeddings[i]))
        # First segment seen for a name decides its speaker_id.
        name_to_speaker.setdefault(name, seg["speaker_id"])

    centroids = {}
    for name, el in ref.items():
        if el:
            c = np.mean(el, axis=0)
            # Epsilon guards against division by zero for an all-zero mean.
            centroids[name] = c / (np.linalg.norm(c) + 1e-10)
    return centroids, name_to_speaker
def extract_audio(video_path, sr=16000):
    """Extract a mono 16-bit WAV from *video_path* with ffmpeg and load it.

    Args:
        video_path: path of the input video.
        sr: sample rate requested from ffmpeg (``-ar``).

    Returns:
        ``(wav_data, sr_actual, tmp)``: the loaded audio, the sample rate
        reported by ``torchaudio.load``, and the temporary directory holding
        the WAV file. The caller is responsible for removing ``tmp``.

    Raises:
        subprocess.CalledProcessError / subprocess.TimeoutExpired: when ffmpeg
        fails or exceeds 300 seconds. The temporary directory is removed
        before the exception propagates.
    """
    tmp = tempfile.mkdtemp(prefix="asr_split_")
    try:
        wav = os.path.join(tmp, "audio.wav")
        subprocess.run(["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
                        "-ar", str(sr), "-ac", "1", "-sample_fmt", "s16", wav],
                       check=True, capture_output=True, timeout=300)
        wav_data, sr_actual = torchaudio.load(wav)
    except Exception:
        # The caller never receives `tmp` on failure, so clean it up here.
        shutil.rmtree(tmp, ignore_errors=True)
        raise
    # Down-mix if more than one channel was loaded.
    if wav_data.shape[0] > 1:
        wav_data = wav_data.mean(dim=0, keepdim=True)
    return wav_data, sr_actual, tmp
def classify(emb, centroids):
    """Return the name in *centroids* whose vector has the largest dot product with *emb*.

    On ties the first name in the mapping's iteration order is returned.
    Raises ``ValueError`` when *centroids* is empty.
    """
    scores = {name: float(np.dot(emb, vec)) for name, vec in centroids.items()}
    return max(scores, key=scores.get)
def process_batch(asr_segs, wav, sr, centroids, encoder, offset_start=0):
"""Label ASR segments by speaker and split them where the speaker changes.

Segments shorter than 1.0 s, or yielding fewer than 3 sub-windows, are
embedded as a whole and classified once. Longer segments are cut into
SUB_WIN-second windows every SUB_STRIDE seconds; each window is classified
against *centroids*, the labels are smoothed, and a split is placed where
the label changes.

Args:
asr_segs: list of dicts with "start" and "end" times in seconds.
wav: audio indexed as wav[:, a:b]; slices must provide .numpy().
sr: sample rate used to convert seconds to sample positions.
centroids: name -> reference vector, as consumed by classify().
encoder: passed through to extract_speaker_embedding().
offset_start: seconds subtracted from segment times before indexing wav.

Returns:
A list of (start, end, name, si) tuples, where si is the index of the
source segment within *asr_segs* as passed in.
"""
ws = int(SUB_WIN * sr)
sw = int(SUB_STRIDE * sr)
results = []
for si, s in enumerate(asr_segs):
st = s["start"] - offset_start
et = s["end"] - offset_start
dur = et - st
# Short segment: a single embedding for the whole span.
if dur < 1.0:
a = wav[:, int(st*sr):int(et*sr)]
e = extract_speaker_embedding(encoder, a.numpy(), sr)
e /= np.linalg.norm(e) + 1e-10
results.append((s["start"], s["end"], classify(e, centroids), si))
continue
ss = int(st*sr); se = int(et*sr)
# One embedding per sliding window; sub_t holds each window's start time
# expressed back in the original (un-offset) timeline.
sub_e, sub_t = [], []
for wpos in range(ss, se-ws+1, sw):
chunk = wav[:, wpos:wpos+ws]
sub_e.append(extract_speaker_embedding(encoder, chunk.numpy(), sr))
sub_t.append(wpos/sr + offset_start)
# Too few windows to detect a change: fall back to whole-segment labelling.
if len(sub_e) < 3:
a = wav[:, ss:se]
e = extract_speaker_embedding(encoder, a.numpy(), sr)
e /= np.linalg.norm(e) + 1e-10
results.append((s["start"], s["end"], classify(e, centroids), si))
continue
sub_e = normalize_embeddings(np.array(sub_e))
names = []
for i in range(len(sub_e)):
names.append(classify(sub_e[i], centroids))
# Smooth
# Interior labels become the majority of the 3-window neighbourhood;
# the first and last labels are left untouched.
sm = list(names)
for i in range(1, len(names)-1):
sm[i] = Counter(names[max(0,i-1):min(len(names),i+2)]).most_common(1)[0][0]
# Find splits
# A change is accepted when the new label persists for the next
# CHANGE_CONFIRM windows, or when too few windows remain to confirm it.
splits = []
prev = sm[0]
for i in range(1, len(sm)):
if sm[i] != prev:
if i+CHANGE_CONFIRM < len(sm) and all(sm[i]==sm[j] for j in range(i, i+CHANGE_CONFIRM+1)):
splits.append(sub_t[i]); prev = sm[i]
elif i+CHANGE_CONFIRM >= len(sm):
splits.append(sub_t[i]); prev = sm[i]
if not splits:
results.append((s["start"], s["end"], Counter(names).most_common(1)[0][0], si))
else:
boundaries = [s["start"]] + splits + [s["end"]]
for pi in range(len(boundaries)-1):
ps, pe = boundaries[pi], boundaries[pi+1]
# NOTE(review): parts shorter than MIN_DUR are dropped, not merged.
if pe-ps < MIN_DUR: continue
# Label the part by majority of the unsmoothed window labels starting
# inside it; fall back to the whole segment's majority when none do.
sub_i = [i for i, t in enumerate(sub_t) if ps <= t < pe]
lbl = Counter([names[i] for i in sub_i]).most_common(1)[0][0] if sub_i else Counter(names).most_common(1)[0][0]
results.append((round(ps,2), round(pe,2), lbl, si))
return results
def main():
    """Split ASR segments at speaker changes and write the fine-grained ASRX JSON.

    Command line:
        --uuid    file UUID (defaults to the built-in one).
        --output  output path; defaults to ``<BASE>/<uuid>.asrx_fine.json``.

    Loads reference centroids and the ASR segments, extracts the audio track
    of the hard-coded video, classifies/splits the segments in batches of
    BATCH_SIZE and writes one output record per resulting segment, each with
    ``parent_asr_idx`` pointing at its source index in the ASR segment list.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--uuid", default="aeed71342a899fe4b4c57b7d41bcb692")
    parser.add_argument("--output", help="Output path for fine ASRX JSON")
    args = parser.parse_args()
    UUID = args.uuid
    BASE = "/Users/accusys/momentry/output_dev"
    DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
    # NOTE(review): the video path is fixed and does not follow --uuid.
    VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"

    print(f"Processing {UUID}")
    centroids, name_to_speaker = load_reference(UUID, DB_URL)
    print(f"Centroids: {list(centroids.keys())}")

    with open(f"{BASE}/{UUID}.asr.json") as f:
        asr = json.load(f)
    asr_segs = asr["segments"]
    print(f"ASR segments: {len(asr_segs)}")

    print("Extracting audio...")
    wav, sr, tmp_dir = extract_audio(VIDEO)
    all_results = []
    try:
        print(f"Audio: {wav.shape[1]/sr:.0f}s")
        inst = SelfASRXFixed()
        encoder = inst.speaker_encoder
        t0 = time.time()
        for batch_start in range(0, len(asr_segs), BATCH_SIZE):
            batch = asr_segs[batch_start:batch_start + BATCH_SIZE]
            segs = process_batch(batch, wav, sr, centroids, encoder)
            # process_batch reports indices relative to the batch it received;
            # shift them so parent_asr_idx refers to the full ASR segment list.
            all_results.extend(
                (start, end, name, batch_start + local_idx)
                for start, end, name, local_idx in segs
            )
            pct = (batch_start + len(batch)) * 100 // len(asr_segs)
            print(f" {batch_start+len(batch)}/{len(asr_segs)} ({pct}%) -> {len(all_results)} segments [{time.time()-t0:.0f}s]")
    finally:
        # Remove the extracted WAV even when processing fails part-way.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # Build output
    spk_stats = {}
    out_segs = []
    # Assign sequential SPEAKER_X IDs based on name order
    name_order = {name: i for i, name in enumerate(sorted(set(s[2] for s in all_results)))}
    for start, end, name, asr_idx in all_results:
        sid = f"SPEAKER_{name_order[name]}"
        dur = end - start
        spk_stats.setdefault(sid, {"count": 0, "duration": 0})
        spk_stats[sid]["count"] += 1
        spk_stats[sid]["duration"] += dur
        out_segs.append({
            "start_time": start,
            "end_time": end,
            "speaker_id": sid,
            "speaker_name": name,
            "parent_asr_idx": asr_idx,
        })

    output = {
        "uuid": UUID,
        "language": "en",
        "segments": out_segs,
        "speaker_stats": spk_stats,
        "total_asr_segments": len(asr_segs),
        "total_fine_segments": len(out_segs),
    }
    output_path = args.output or f"{BASE}/{UUID}.asrx_fine.json"
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)

    print(f"\nSaved: {output_path}")
    print(f"Segments: {len(out_segs)} (was {len(asr_segs)}, +{len(out_segs)-len(asr_segs)})")
    print(f"Speakers: {len(spk_stats)}")
    for sid, st in sorted(spk_stats.items()):
        print(f" {sid}: {st['count']} segs, {st['duration']:.0f}s")


if __name__ == "__main__":
    main()

98
scripts/step3_asr_fine.py Normal file
View File

@@ -0,0 +1,98 @@
#!/opt/homebrew/bin/python3.11
"""
Step 3: Re-run ASR with word_timestamps on full audio.
Map words to 4188 fine segments for accurate text.
"""
import json, sys, os, time, subprocess, tempfile, shutil
from faster_whisper import WhisperModel
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
# Load the fine-grained segments that will receive text.
print("=== Load fine ASRX ===")
fine = json.load(open(f"{BASE}/{UUID}.asrx_fine.json"))
fine_segs = fine["segments"]
print(f"Fine segments: {len(fine_segs)}")
print("\n=== Extract audio WAV ===")
# NOTE(review): tmp_dir is only removed at the very end of the script, so it
# is left behind if any step in between raises.
tmp_dir = tempfile.mkdtemp(prefix="asr_step3_")
wav_path = os.path.join(tmp_dir, "audio.wav")
# 16 kHz, mono, 16-bit WAV extracted with ffmpeg (5 minute timeout).
subprocess.run(["ffmpeg", "-y", "-v", "quiet", "-i", VIDEO,
"-ar", "16000", "-ac", "1", "-sample_fmt", "s16", wav_path],
check=True, capture_output=True, timeout=300)
print("Loading model with word_timestamps...")
t0 = time.time()
model = WhisperModel("small", device="cpu", compute_type="int8")
print(f" Model loaded in {time.time()-t0:.1f}s")
print("Transcribing with word_timestamps=True...")
t0 = time.time()
segments, info = model.transcribe(
wav_path, beam_size=5, vad_filter=True,
vad_parameters={"min_silence_duration_ms": 500},
word_timestamps=True
)
# Collect all word-level data
words = []
for seg in segments:
if seg.words:
for w in seg.words:
wt = w.word.strip()
if wt:
words.append({"word": wt, "start": w.start, "end": w.end})
# NOTE(review): presumably this else pairs with `if seg.words`, i.e. a segment
# without word timings is added as one entry; confirm against the original indentation.
else:
words.append({"word": seg.text.strip(), "start": seg.start, "end": seg.end})
elapsed = time.time() - t0
print(f" Done in {elapsed:.1f}s, {len(words)} words")
# Map words to fine segments
# Single forward pass: `wi` only advances, so each word is assigned at most
# once. Words ending at or before a segment's start are skipped.
print("\n=== Map words to fine segments ===")
wi = 0
assigned = 0
for si, fs in enumerate(fine_segs):
fstart = fs["start_time"]
fend = fs["end_time"]
seg_words = []
while wi < len(words):
w = words[wi]
if w["end"] <= fstart:
wi += 1
continue
# Word starts at or after this segment's end: leave it for a later segment.
if w["start"] >= fend:
break
seg_words.append(w["word"])
wi += 1
text = " ".join(seg_words)
fs["text"] = text
if text:
assigned += 1
print(f" Segments with text: {assigned}/{len(fine_segs)}")
# Show examples
print("\nSplit segment examples:")
# Print the first fine segment whose text is longer than 10 characters.
for fs in fine_segs:
    if len(fs.get('text','')) > 10:
        print(f" [{fs['start_time']:.1f}-{fs['end_time']:.1f}] {fs['speaker_name']:15s} \"{fs['text'][:60]}\"")
        break  # just one for now

# Count text lengths
text_lens = [len(fs.get('text','')) for fs in fine_segs]
# Guard against an empty segment list, which would otherwise divide by zero.
avg_len = sum(text_lens) / len(text_lens) if text_lens else 0
print(f"\n Avg text length: {avg_len:.0f} chars")
print(f" Empty texts: {sum(1 for l in text_lens if l == 0)}")

# Save
# The input file is overwritten in place with the added "text" fields.
fine["_asr_meta"] = {"word_timestamps": True, "asr_runtime_secs": round(elapsed, 1)}
with open(f"{BASE}/{UUID}.asrx_fine.json", "w") as out_file:
    json.dump(fine, out_file, indent=2)
print(f"\nSaved")
shutil.rmtree(tmp_dir, ignore_errors=True)

87
scripts/story_embed.py Normal file
View File

@@ -0,0 +1,87 @@
#!/opt/homebrew/bin/python3.11
"""
Story Embedding Pipeline:
1. Read story chunks → LLM summary (Gemma4)
2. Embed summary (EmbeddingGemma)
3. Store in chunks table + Qdrant
"""
import json, urllib.request, subprocess, sys, time, os
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
PSQL = ["/Users/accusys/pgsql/18.3/bin/psql", "-U", "accusys", "-d", "momentry", "-t", "-A"]
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
QDRANT_URL = "http://localhost:6333"
QDRANT_COL = "momentry_dev_stories"
def psql(sql):
    """Run *sql* through the psql command line and return its stripped stdout.

    The command is ``PSQL + ["-c", sql]`` with a 30 second timeout. A non-zero
    exit status is reported on stderr instead of being dropped silently; the
    (possibly empty) stdout is still returned so existing callers keep working.
    """
    r = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        # Without this, a failed statement looks the same as an empty result.
        print(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}", file=sys.stderr)
    return r.stdout.strip()
def call_llm(dialogue):
    """Ask the chat-completions endpoint at LLM_URL for a 50-word summary of *dialogue*.

    Returns the stripped content of the first choice's message. Network and
    HTTP errors from ``urllib`` propagate to the caller (120 second timeout).
    """
    prompt = f"Dialogue: {dialogue}\n\n50-word summary:"
    body = json.dumps({"model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
                       "messages": [{"role": "user", "content": prompt}],
                       "temperature": 0.1, "max_tokens": 100}).encode()
    req = urllib.request.Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=120) as resp:
        payload = json.loads(resp.read())
    return payload["choices"][0]["message"]["content"].strip()
def call_embed(text):
    """Return the embedding vector for *text* from the endpoint at EMBED_URL.

    The first entry of the response's ``data`` list is used. Network and HTTP
    errors from ``urllib`` propagate to the caller (30 second timeout).
    """
    body = json.dumps({"input": text}).encode()
    req = urllib.request.Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["data"][0]["embedding"]
# Step 0: Ensure Qdrant collection exists (768 dims)
subprocess.run(["curl", "-s", "-X", "PUT", f"{QDRANT_URL}/collections/{QDRANT_COL}",
                "-H", "Content-Type: application/json",
                "-d", '{"vectors":{"size":768,"distance":"Cosine"}}'], capture_output=True)

# Step 1: Get all story chunks that need summaries
# NOTE(review): rows are parsed by splitting psql output on newlines and '|';
# a text_content containing a newline would be split into broken rows.
lines = [l for l in psql(f"SELECT chunk_id, chunk_index, start_time, end_time, text_content FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story' AND (summary_text IS NULL OR summary_text = '') ORDER BY chunk_index").split('\n') if l.strip() and '|' in l]
print(f"Chunks to process: {len(lines)}")

total = len(lines)
errors = 0
qdrant_errors = 0
for i, line in enumerate(lines):
    # maxsplit=4 keeps any '|' inside the dialogue text in the last field.
    parts = line.split('|', 4)
    cid, idx, st, et, dialogue = parts[0].strip(), int(parts[1]), float(parts[2]), float(parts[3]), parts[4] if len(parts) > 4 else ""
    if len(dialogue) < 10:
        summary = "[no dialogue]"
        embedding = [0.0] * 768
    else:
        try:
            summary = call_llm(dialogue)
            time.sleep(0.3)
            embedding = call_embed(summary)
        except Exception as e:
            print(f"[{i+1}/{total}] Error: {cid} - {e}")
            errors += 1
            # NOTE(review): the placeholder is stored as summary_text, so this
            # chunk no longer matches the "needs summary" query on a re-run.
            summary = "[error]"
            embedding = [0.0] * 768

    # Update DB
    s_esc = summary.replace("'", "''")
    psql(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE chunk_id='{cid}'")

    # Store in Qdrant
    point = json.dumps({"points": [{"id": idx + 1, "vector": embedding,
        "payload": {"chunk_id": cid, "file_uuid": UUID, "start_time": st, "end_time": et,
                    "summary": summary, "type": "story_summary"}
    }]}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}/collections/{QDRANT_COL}/points?wait=true",
                                 data=point, headers={"Content-Type": "application/json"}, method="PUT")
    try:
        with urllib.request.urlopen(req, timeout=10):
            pass
    except Exception as e:
        # Still best-effort (the loop continues), but the failure is now
        # visible and Ctrl-C is no longer swallowed by a bare except.
        qdrant_errors += 1
        print(f"[{i+1}/{total}] Qdrant upsert failed: {cid} - {e}")

    if (i+1) % 20 == 0:
        print(f"[{i+1}/{total}] {errors} errors so far")

print(f"\nDone. Processed: {total}, Errors: {errors}")
if qdrant_errors:
    print(f"Qdrant upsert failures: {qdrant_errors}")
print(f"Qdrant: {QDRANT_COL}")

View File

@@ -0,0 +1,230 @@
#!/opt/homebrew/bin/python3.11
"""
Story Pipeline Full — Speaker + Story + Summary
Step 1: Update sentence chunks with speaker name
Step 2: Rebuild story chunks + re-embed
Step 3: LLM summary × 228 + embed
"""
import json, urllib.request, subprocess, sys, time, os
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
DIR = "/Users/accusys/momentry/output_dev"
PSQL = ["/Users/accusys/pgsql/18.3/bin/psql", "-U", "accusys", "-d", "momentry", "-t", "-A"]
LLM_URL = "http://localhost:8082/v1/chat/completions"
EMBED_URL = "http://localhost:11436/v1/embeddings"
QDRANT_URL = "http://localhost:6333/collections/momentry_dev_stories/points"
def psql(sql):
    """Run *sql* through the psql command line and return its stripped stdout.

    The command is ``PSQL + ["-c", sql]`` with a 30 second timeout. A non-zero
    exit status is reported on stderr instead of being dropped silently; the
    (possibly empty) stdout is still returned so existing callers keep working.
    """
    r = subprocess.run(PSQL + ["-c", sql], capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        # Without this, a failed statement looks the same as an empty result.
        print(f"psql failed (exit {r.returncode}): {r.stderr.strip()[:200]}", file=sys.stderr)
    return r.stdout.strip()
def psql_file(path):
    """Execute the SQL script at *path* with psql and return the process exit code.

    When stderr contains the text "ERROR", its first 200 characters are printed.
    """
    command = PSQL + ["-f", path]
    proc = subprocess.run(command, capture_output=True, text=True, timeout=60)
    err_text = proc.stderr
    if err_text and "ERROR" in err_text:
        print(f"SQL Error: {err_text[:200]}")
    return proc.returncode
def embed_text(text):
    """Return the embedding vector for the first 1024 characters of *text*.

    Posts to EMBED_URL and uses the first entry of the response's ``data``
    list. Network and HTTP errors from ``urllib`` propagate (30 second timeout).
    """
    body = json.dumps({"input": text[:1024]}).encode()
    req = urllib.request.Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=30) as resp:
        payload = json.loads(resp.read())
    return payload["data"][0]["embedding"]
def llm_summary(dialogue):
    """Ask the chat-completions endpoint at LLM_URL for a concise 50-word summary.

    Returns the stripped content of the first choice's message. Network and
    HTTP errors from ``urllib`` propagate to the caller (120 second timeout).
    """
    body = json.dumps({
        "model": "google_gemma-4-26B-A4B-it-Q5_K_M.gguf",
        "messages": [{"role": "user", "content": f"Summarize concisely:\n{dialogue}\n\n50-word summary:"}],
        "temperature": 0.1, "max_tokens": 100,
    }).encode()
    req = urllib.request.Request(LLM_URL, data=body, headers={"Content-Type": "application/json"})
    # Close the HTTP response deterministically instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=120) as resp:
        payload = json.loads(resp.read())
    return payload["choices"][0]["message"]["content"].strip()
fps = 25.0
FILE_ID = 242
# ═══════════════════════════════════════════════════
# Step 0: Load ASR + ASRX + speaker map
# ═══════════════════════════════════════════════════
print("=" * 60)
print("Step 0: Loading data...")
with open(f"{DIR}/{UUID}.asr.json") as f:
    asr = json.load(f)
segs = asr["segments"]
with open(f"{DIR}/{UUID}.asrx.json") as f:
    asrx = json.load(f)
asrx_segs = asrx["segments"]

# Speaker map from identity_bindings
# NOTE(review): the query is not restricted to this video's identities.
r = psql("SELECT ib.identity_value, i.name FROM dev.identity_bindings ib JOIN dev.identities i ON i.id=ib.identity_id WHERE ib.identity_type='speaker'")
speaker_map = {}
for line in r.strip().split('\n'):
    if line.strip() and '|' in line:
        p = line.split('|')
        speaker_map[p[0].strip()] = p[1].strip()
# Fallback for unbounded: only applies when SPEAKER_0 has no real binding, so
# a name loaded from identity_bindings is not overwritten.
speaker_map.setdefault("SPEAKER_0", "Speaker_0")
# ═══════════════════════════════════════════════════
# Step 1: Update sentence chunks with speaker
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Step 1: Updating sentence chunks with speaker...")
sql = ["BEGIN;"]
chunk_meta = {}  # idx → {speaker_id, speaker_name}
for idx, seg in enumerate(segs):
    st, et = seg["start"], seg["end"]
    text = seg["text"].strip()
    if not text:
        continue
    # Find overlapping ASRX segment → speaker_id
    # The first ASRX segment that fully contains [st, et] wins; otherwise the
    # default SPEAKER_0 is kept.
    spk_id = "SPEAKER_0"
    for ax in asrx_segs:
        if ax.get("start_time", 0) <= st and ax.get("end_time", 0) >= et:
            spk_id = ax.get("speaker_id", "SPEAKER_0")
            break
    spk_name = speaker_map.get(spk_id, spk_id)
    new_text = f"[{spk_name}] {text}"
    meta = json.dumps({"speaker_id": spk_id, "speaker_name": spk_name})
    # json.dumps leaves single quotes untouched; double them so a name such as
    # O'Brien cannot terminate the SQL string literal.
    meta_esc = meta.replace("'", "''")
    esc = new_text.replace("'", "''")
    sql.append(f"UPDATE dev.chunks SET text_content='{esc}', metadata='{meta_esc}'::jsonb WHERE file_uuid='{UUID}' AND chunk_id='{UUID}_{idx}';")
    chunk_meta[idx] = {"speaker_id": spk_id, "speaker_name": spk_name}
sql.append("COMMIT;")

# All updates are written to one script and executed in a single transaction.
with open("/tmp/s1_speaker.sql", "w") as f:
    f.write("\n".join(sql))
psql_file("/tmp/s1_speaker.sql")
print(f" Updated {len(chunk_meta)} sentence chunks with speaker")
# ═══════════════════════════════════════════════════
# Step 2: Rebuild story chunks + re-embed
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Step 2: Rebuilding story chunks...")
# Delete old story chunks
# NOTE(review): this delete runs outside the INSERT transaction below.
psql(f"DELETE FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story';")

# Recreate
# NOTE(review): the INSERT names old_chunk_id and chunk_index; confirm both
# columns (and the dev.chunks table name) still exist after the schema migration.
CHUNK_SIZE = 15
sql2 = ["BEGIN;"]
story_meta = []
for i in range(0, len(segs), CHUNK_SIZE):
    group = segs[i:i+CHUNK_SIZE]
    st, et = group[0]["start"], group[-1]["end"]
    idx = i // CHUNK_SIZE
    chunk_id = f"{UUID}_story_{idx}"
    # Build speaker text from individual sentences
    texts = []
    speakers_used = {}
    for j, seg in enumerate(group):
        seg_idx = i + j
        # Only sentences that received speaker metadata in step 1 contribute.
        if seg_idx in chunk_meta:
            cm = chunk_meta[seg_idx]
            text = seg["text"].strip()
            if text:
                texts.append(f"[{cm['speaker_name']}] {text}")
                speakers_used[cm['speaker_name']] = speakers_used.get(cm['speaker_name'], 0) + 1
    dialogue = " ".join(texts)
    child_ids = ", ".join([f"'{UUID}_{j}'" for j in range(i, min(i+CHUNK_SIZE, len(segs)))])
    words = sum(len(t.split()) for t in texts)
    meta = json.dumps({"method": "fixed_15", "seg_count": len(group), "words": words, "speakers": speakers_used})
    # json.dumps leaves single quotes untouched; double them so a speaker name
    # containing an apostrophe cannot terminate the SQL string literal.
    meta_esc = meta.replace("'", "''")
    esc = dialogue.replace("'", "''")
    sql2.append(f"""INSERT INTO dev.chunks (file_id,file_uuid,chunk_id,old_chunk_id,chunk_index,chunk_type,start_time,end_time,fps,start_frame,end_frame,text_content,content,metadata,frame_count,child_chunk_ids)
VALUES ({FILE_ID},'{UUID}','{chunk_id}','{chunk_id}',{idx},'story',{st},{et},{fps},{int(st*fps)},{int(et*fps)},'{esc}','{{"type":"story_parent"}}'::jsonb,'{meta_esc}'::jsonb,{int((et-st)*fps)},ARRAY[{child_ids}]);""")
    story_meta.append({"idx": idx, "st": st, "et": et, "dialogue": dialogue, "words": words, "speakers": speakers_used})
sql2.append("COMMIT;")

with open("/tmp/s2_story.sql", "w") as f:
    f.write("\n".join(sql2))
psql_file("/tmp/s2_story.sql")
print(f" Created {len(story_meta)} story chunks")

# Embed + upsert to Qdrant
print("\n Embedding story chunks...")
points_dialogue = []
for sm in story_meta:
    # Story chunks with almost no dialogue are not embedded.
    if len(sm["dialogue"]) < 10:
        continue
    vec = embed_text(sm["dialogue"])
    points_dialogue.append({"id": sm["idx"] + 1, "vector": vec, "payload": {
        "chunk_id": f"{UUID}_story_{sm['idx']}", "file_uuid": UUID,
        "start_time": sm["st"], "end_time": sm["et"], "type": "story_dialogue"
    }})

# Upsert in batches of 100 points.
for i in range(0, len(points_dialogue), 100):
    batch = points_dialogue[i:i+100]
    data = json.dumps({"points": batch, "wait": True}).encode()
    req = urllib.request.Request(f"{QDRANT_URL}?wait=true", data=data, headers={"Content-Type": "application/json"}, method="PUT")
    # Close each HTTP response instead of leaving it to the GC.
    with urllib.request.urlopen(req, timeout=30):
        pass
print(f" Qdrant: {len(points_dialogue)} dialogue vectors")
# ═══════════════════════════════════════════════════
# Step 3: LLM summaries + embed
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Step 3: LLM summaries...")
points_summary = []
summary_sql = ["BEGIN;"]
for i, sm in enumerate(story_meta):
# Story chunks with almost no dialogue get neither a summary nor a vector.
if len(sm["dialogue"]) < 10:
continue
try:
summary = llm_summary(sm["dialogue"])
time.sleep(0.3)
vec = embed_text(summary)
time.sleep(0.1)
except Exception as e:
print(f" Error on story {sm['idx']}: {e}")
# NOTE(review): on failure the placeholder text and an all-zero vector are
# stored like a real summary in both the database and Qdrant.
summary = "[error]"
vec = [0.0] * 768
s_esc = summary.replace("'", "''")
summary_sql.append(f"UPDATE dev.chunks SET summary_text='{s_esc}', updated_at=CURRENT_TIMESTAMP WHERE file_uuid='{UUID}' AND chunk_id='{UUID}_story_{sm['idx']}';")
# Summary point ids are offset by 100000 so they differ from the dialogue
# point ids (idx + 1) used for the same story chunks.
points_summary.append({"id": 100000 + sm["idx"] + 1, "vector": vec, "payload": {
"chunk_id": f"{UUID}_story_{sm['idx']}", "file_uuid": UUID,
"start_time": sm["st"], "end_time": sm["et"],
"summary": summary, "type": "story_summary"
}})
if (i + 1) % 50 == 0:
print(f" {i+1}/{len(story_meta)}")
# Update DB with summaries
# All summary updates are written to one script and run in a single transaction.
summary_sql.append("COMMIT;")
with open("/tmp/s3_summary.sql", "w") as f:
f.write("\n".join(summary_sql))
psql_file("/tmp/s3_summary.sql")
# Upsert summary vectors to Qdrant
# Sent in batches of 100 points.
for i in range(0, len(points_summary), 100):
batch = points_summary[i:i+100]
data = json.dumps({"points": batch, "wait": True}).encode()
req = urllib.request.Request(f"{QDRANT_URL}?wait=true", data=data, headers={"Content-Type": "application/json"}, method="PUT")
urllib.request.urlopen(req, timeout=30)
print(f" Qdrant: {len(points_summary)} summary vectors")
# ═══════════════════════════════════════════════════
# Step 4: Verify
# ═══════════════════════════════════════════════════
print("\n" + "=" * 60)
print("Done.")
# Counts are read back from the database as plain psql output strings.
r1 = psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='sentence' AND text_content LIKE '[%'")
r2 = psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story'")
# NOTE(review): this count also includes rows whose summary_text is "[error]".
r3 = psql(f"SELECT count(*) FROM dev.chunks WHERE file_uuid='{UUID}' AND chunk_type='story' AND summary_text IS NOT NULL")
print(f"Sentence chunks with speaker: {r1}")
print(f"Story chunks: {r2}")
print(f"Story chunks with summary: {r3}")

View File

@@ -0,0 +1,74 @@
#!/opt/homebrew/bin/python3.11
"""
Compare ASR small vs large-v3 on a short test clip.
"""
import json, time, sys, os
from faster_whisper import WhisperModel
# Short test clip that both models transcribe.
CLIP = "/tmp/charade_test_clip.mp4"
# Constant (seconds) added to every segment timestamp before it is saved.
# NOTE(review): presumably the clip's start position inside the full video — confirm.
TIME_OFFSET_SECS = 1540
# Model name -> faster-whisper constructor settings.
models = {
    "small": {"size": "small", "device": "cpu", "compute": "int8"},
    "large-v3": {"size": "large-v3", "device": "cpu", "compute": "int8"},
}


def result_path(name):
    """Return the JSON result file for model *name* (a key of ``models``).

    Writer and readers all go through this helper so the file names cannot
    drift apart (the comparison used to read ``asr_large_v3_result.json``
    while the transcription wrote ``asr_large-v3_result.json``).
    """
    return f"/tmp/asr_{name}_result.json"


def load_result(name):
    """Load the saved transcription result dict for model *name*."""
    with open(result_path(name), encoding="utf-8") as fh:
        return json.load(fh)


for name, cfg in models.items():
    outfile = result_path(name)
    # Results are cached on disk: a model that already has a result is skipped.
    if os.path.exists(outfile):
        print(f"{name}: already done, skip")
        continue
    print(f"\n=== Loading {name} model ===")
    t0 = time.time()
    model = WhisperModel(cfg["size"], device=cfg["device"], compute_type=cfg["compute"])
    print(f" Loaded in {time.time()-t0:.1f}s")
    print(f" Transcribing...")
    t0 = time.time()
    segments, info = model.transcribe(CLIP, beam_size=5, vad_filter=True,
                                      vad_parameters={"min_silence_duration_ms": 500})
    segs = []
    # The elapsed time is taken after this loop, so it includes iterating
    # over every segment.
    for seg in segments:
        segs.append({"start": round(seg.start + TIME_OFFSET_SECS, 2),
                     "end": round(seg.end + TIME_OFFSET_SECS, 2),
                     "text": seg.text.strip()})
    elapsed = time.time() - t0
    result = {
        "model": name,
        "language": info.language,
        "segments": segs,
        "segment_count": len(segs),
        "duration_secs": round(elapsed, 1),
    }
    with open(outfile, "w", encoding="utf-8") as fh:
        json.dump(result, fh, indent=2, ensure_ascii=False)
    print(f" Done: {len(segs)} segs in {elapsed:.1f}s")
    del model  # free memory

print("\n=== Comparison ===")
for name in models:
    r = load_result(name)
    print(f"{name}: {r['segment_count']} segs, {r['duration_secs']}s runtime")

# Show differences
small = load_result("small")["segments"]
large = load_result("large-v3")["segments"]
small_texts = {s["text"] for s in small}
large_texts = {s["text"] for s in large}
only_small = small_texts - large_texts
only_large = large_texts - small_texts
print(f"\nTexts only in small: {len(only_small)}")
for t in sorted(only_small)[:10]:
    print(f" SMALL: \"{t}\"")
print(f"\nTexts only in large: {len(only_large)}")
for t in sorted(only_large)[:10]:
    print(f" LARGE: \"{t}\"")

# Compare segment boundaries
print(f"\nSegment time differences (large has more/fewer):")
print(f" Small: {len(small)} segments")
print(f" Large: {len(large)} segments")
print(f" Diff: {len(large) - len(small)}")

View File

@@ -0,0 +1,81 @@
#!/opt/homebrew/bin/python3.11
"""
Update DB sentence chunks with fine-grained ASRX speaker assignments.
Each ASR segment gets the majority speaker_name from overlapping fine segments.
"""
import json, psycopg2
from collections import Counter
# File whose sentence chunks are updated, and where its fine ASRX JSON lives.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
BASE = "/Users/accusys/momentry/output_dev"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"

print("=== Step 1: Load fine ASRX ===")
with open(f"{BASE}/{UUID}.asrx_fine.json") as fh:
    fine = json.load(fh)
fine_segs = fine["segments"]
print(f"Fine segments: {len(fine_segs)}")

print("\n=== Step 2: Load existing sentence chunks ===")
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()
    cur.execute("""
        SELECT id, chunk_index, start_time, end_time, metadata
        FROM dev.chunks
        WHERE file_uuid=%s AND chunk_type='sentence'
        ORDER BY chunk_index
    """, (UUID,))
    chunks = cur.fetchall()
    print(f"DB sentence chunks: {len(chunks)}")

    # For each chunk, find overlapping fine segments
    print("\n=== Step 3: Update speaker assignments ===")
    updated = 0
    for db_id, idx, st, et, meta in chunks:
        # metadata may arrive as NULL, as a JSON string, or already decoded.
        if meta is None:
            meta = {}
        elif isinstance(meta, str):
            try:
                meta = json.loads(meta)
            except ValueError:
                # Unparseable metadata is replaced by an empty dict.
                meta = {}

        # A fine segment overlaps when the two intervals intersect.
        overlapping = [s for s in fine_segs if s["start_time"] < et and s["end_time"] > st]
        if overlapping:
            # Majority vote: name and id are each voted independently.
            names = Counter(s["speaker_name"] for s in overlapping)
            ids = Counter(s["speaker_id"] for s in overlapping)
            best_name = names.most_common(1)[0][0]
            best_id = ids.most_common(1)[0][0]
            meta["speaker_name"] = best_name
            meta["speaker_id"] = best_id
            meta["fine_speaker_name"] = best_name
            meta["fine_speaker_id"] = best_id
            meta["fine_details"] = dict(names)
        else:
            # No overlap: copy the existing speaker (or "Unknown") to the fine fields.
            meta["fine_speaker_name"] = meta.get("speaker_name", "Unknown")
            meta["fine_speaker_id"] = meta.get("speaker_id", "Unknown")
        cur.execute("""
            UPDATE dev.chunks SET metadata=%s::jsonb, updated_at=NOW()
            WHERE id=%s
        """, (json.dumps(meta), db_id))
        updated += 1
    conn.commit()
    print(f"Updated {updated} chunks")

    # Verify distribution
    cur.execute("""
        SELECT metadata->>'fine_speaker_name', COUNT(*)
        FROM dev.chunks
        WHERE file_uuid=%s AND chunk_type='sentence'
        GROUP BY 1 ORDER BY 2 DESC
    """, (UUID,))
    print("\nNew speaker distribution:")
    for name, cnt in cur.fetchall():
        print(f" {name}: {cnt}")
finally:
    # Close the connection even when a query or the JSON handling fails.
    conn.close()
print("\n=== Done ===")

View File

@@ -0,0 +1,192 @@
#!/opt/homebrew/bin/python3.11
"""
Update sentence chunk metadata with new ASRX speaker_id and speaker_name.
Also update Qdrant momentry_dev_v1 and momentry_dev_voice collections.
"""
import json, sys, time
import psycopg2
import numpy as np
from urllib.request import Request, urlopen
# File being processed and the inputs / services this script uses.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
ASRX_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.asrx.json"
SPEAKER_MAP_PATH = f"/Users/accusys/momentry/output_dev/{UUID}.speaker_map_v2.json"
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
QDRANT_URL = "http://localhost:6333"

print("=== Loading data ===")
with open(ASRX_PATH) as fh:
    asrx = json.load(fh)
segs = asrx["segments"]
embeddings = asrx.get("embeddings", [])
with open(SPEAKER_MAP_PATH) as fh:
    speaker_map = json.load(fh)
assignments = speaker_map["assignments"]
speaker_identity = speaker_map["speaker_identity"]
print(f"Loaded {len(segs)} segments, {len(embeddings)} embeddings")

print("\n=== Step 1: Update DB chunks with new speaker info ===")
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()
    # Get existing chunks
    cur.execute("""
        SELECT id, chunk_index, metadata
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'sentence'
        ORDER BY chunk_index
    """, (UUID,))
    db_chunks = cur.fetchall()
    print(f"Found {len(db_chunks)} DB sentence chunks")
    updated = 0
    for db_id, chunk_idx, old_meta in db_chunks:
        # assignments is indexed by chunk_index.
        if chunk_idx >= len(assignments):
            print(f"WARNING: chunk_idx {chunk_idx} out of range for assignments ({len(assignments)})")
            continue
        a = assignments[chunk_idx]
        new_sid = a["speaker_id"]
        new_name = a["speaker_name"]
        # Preserve old metadata but update speaker fields
        if old_meta is None:
            old_meta = {}
        elif isinstance(old_meta, str):
            old_meta = json.loads(old_meta)
        old_meta["new_speaker_id"] = new_sid
        old_meta["new_speaker_name"] = new_name
        old_meta["old_speaker_id"] = old_meta.get("speaker_id", "")
        old_meta["old_speaker_name"] = old_meta.get("speaker_name", "")
        # Update
        cur.execute("""
            UPDATE dev.chunks
            SET metadata = %s::jsonb, updated_at = NOW()
            WHERE id = %s
        """, (json.dumps(old_meta), db_id))
        updated += 1
    conn.commit()
    print(f"Updated {updated} DB chunks")

    # Also update story chunks with new aggregated speaker info
    print("\n=== Step 2: Update story chunk aggregates ===")
    cur.execute("""
        SELECT id, chunk_index, metadata, child_chunk_ids
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'story'
        ORDER BY chunk_index
    """, (UUID,))
    stories = cur.fetchall()
    print(f"Found {len(stories)} story chunks")
    # Get all sentence chunks with their new speaker info
    cur.execute("""
        SELECT chunk_index, metadata->>'new_speaker_name' as speaker_name
        FROM dev.chunks
        WHERE file_uuid = %s AND chunk_type = 'sentence'
        ORDER BY chunk_index
    """, (UUID,))
    sent_names = {row[0]: row[1] for row in cur.fetchall()}
    for db_id, idx, meta, child_ids in stories:
        if meta is None:
            meta = {}
        elif isinstance(meta, str):
            meta = json.loads(meta)
        if child_ids:
            # Aggregate speaker info from child chunks
            speaker_counts = {}
            for cid in child_ids:
                # The sentence chunk_index is taken from the text after the
                # last "_" of the child chunk_id.
                try:
                    child_idx = int(cid.split("_")[-1])
                except ValueError:
                    print(f"WARNING: cannot parse chunk index from child id {cid!r}; skipped")
                    continue
                if child_idx in sent_names:
                    name = sent_names[child_idx]
                    speaker_counts[name] = speaker_counts.get(name, 0) + 1
            meta["speaker_breakdown"] = speaker_counts
            primary = max(speaker_counts, key=speaker_counts.get) if speaker_counts else "Unknown"
            meta["primary_speaker"] = primary
            meta["speaker_count"] = len(speaker_counts)
        cur.execute("""
            UPDATE dev.chunks
            SET metadata = %s::jsonb, updated_at = NOW()
            WHERE id = %s
        """, (json.dumps(meta), db_id))
    conn.commit()
    print(f"Updated {len(stories)} story chunks")
    cur.close()
finally:
    # The database is not used by Step 3, so release it here even on failure.
    conn.close()

print("\n=== Step 3: Update Qdrant momentry_dev_voice ===")
import urllib.request
# Point i pairs assignments[i] with embeddings[i]; only indices that have both
# can be uploaded. Check this BEFORE dropping the existing collection.
n_voice = min(len(assignments), len(embeddings))
total_uploaded = 0
if n_voice == 0:
    print("WARNING: no voice embeddings to upload; momentry_dev_voice left untouched")
else:
    if n_voice < len(assignments):
        print(f"WARNING: {len(embeddings)} embeddings for {len(assignments)} assignments; "
              f"uploading the first {n_voice} only")
    # Delete old voice collection and recreate
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_voice", method="DELETE")
    try:
        urlopen(req).close()
        print("Deleted old momentry_dev_voice collection")
    except Exception as e:
        # Best effort: the collection may simply not exist yet.
        print(f"Could not delete or doesn't exist ({e})")
    time.sleep(0.5)
    # Create collection
    req = Request(f"{QDRANT_URL}/collections/momentry_dev_voice",
                  data=json.dumps({"vectors": {"size": 192, "distance": "Cosine"}}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    try:
        urlopen(req).close()
        print("Created momentry_dev_voice collection (192D)")
    except Exception as e:
        print(f"Create collection error: {e}")
    # Upload in batches
    batch_size = 100
    for start in range(0, n_voice, batch_size):
        stop = min(start + batch_size, n_voice)
        points = []
        for idx in range(start, stop):
            a = assignments[idx]
            points.append({
                "id": idx + 1,  # point ids are the 1-based segment index
                "vector": embeddings[idx],
                "payload": {
                    "file_uuid": UUID,
                    "speaker_id": a["speaker_id"],
                    "speaker_name": a["speaker_name"],
                    "start_time": a["start_time"],
                    "end_time": a["end_time"],
                    "segment_index": idx,
                }
            })
        req = Request(f"{QDRANT_URL}/collections/momentry_dev_voice/points?wait=true",
                      data=json.dumps({"points": points}).encode(),
                      headers={"Content-Type": "application/json"}, method="PUT")
        try:
            urlopen(req).close()
            total_uploaded += len(points)
        except Exception as e:
            print(f" Batch {start} error: {e}")
        if (start // batch_size) % 5 == 0:
            print(f" Uploaded {total_uploaded}/{len(assignments)} voice embeddings")
print(f"\nUploaded {total_uploaded} voice embeddings to momentry_dev_voice")
print("\n=== Done ===")

139
scripts/vectorize_4188.py Normal file
View File

@@ -0,0 +1,139 @@
#!/opt/homebrew/bin/python3.11
"""
Vectorize 4188 sentence chunks via EmbeddingGemma (768D) + rebuild Qdrant collections.
"""
import json, sys, time
from urllib.request import Request, urlopen
import psycopg2
import urllib.request
# file_uuid whose sentence chunks are loaded and vectorized below.
UUID = "aeed71342a899fe4b4c57b7d41bcb692"
# PostgreSQL connection string passed to psycopg2.connect().
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
# Base URL of the Qdrant REST API (collections are deleted/created/filled through it).
QDRANT_URL = "http://localhost:6333"
# Embeddings endpoint; requests carry {"input": ...} and the vector is read
# from response["data"][i]["embedding"].
EMBED_URL = "http://localhost:11436/v1/embeddings"
# Qdrant collections that are rebuilt and filled with the same sentence vectors.
COLLECTIONS = ["momentry_dev_v1", "sentence_story", "sentence_summary"]
def call_embed(text):
    """Return the embedding vector for a single *text*.

    POSTs ``{"input": text}`` as JSON to ``EMBED_URL`` (30 s timeout) and
    returns ``data[0]["embedding"]`` from the JSON reply. Network and HTTP
    errors raised by ``urlopen`` propagate to the caller.
    """
    body = json.dumps({"input": text}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    # Context manager closes the HTTP response (and its socket) after reading.
    with urlopen(req, timeout=30) as resp:
        return json.loads(resp.read())["data"][0]["embedding"]
print("=== Step 1: Load chunks ===")
conn = psycopg2.connect(DB_URL)
try:
    cur = conn.cursor()
    cur.execute("""
        SELECT chunk_index, chunk_id, text_content, metadata->>'speaker_name',
               start_time, end_time, metadata->>'speaker_id'
        FROM dev.chunks
        WHERE file_uuid=%s AND chunk_type='sentence'
        ORDER BY chunk_index
    """, (UUID,))
    chunks = cur.fetchall()
finally:
    conn.close()
print(f"Loaded {len(chunks)} chunks")
if not chunks:
    # Nothing to embed: stop before Step 3 deletes the existing collections
    # (and before the per-embedding average below divides by zero).
    raise SystemExit("No sentence chunks found; Qdrant collections left untouched")

print("\n=== Step 2: Vectorize (EmbeddingGemma 768D) ===")
# Generate cleaned text for embedding: "Speaker: text" format
texts_for_embed = []
for r in chunks:
    spk = r[3] or "Unknown"
    txt = r[2] or ""
    # Remove [Speaker] prefix if present
    if txt.startswith("["):
        txt = txt.split("]", 1)[-1].strip()
    texts_for_embed.append(f"{spk}: \"{txt}\"")

t0 = time.time()
embeddings = []
batch_size = 50
for start in range(0, len(texts_for_embed), batch_size):
    batch = texts_for_embed[start:start+batch_size]
    # Try batch embed
    body = json.dumps({"input": batch}).encode()
    req = Request(EMBED_URL, data=body, headers={"Content-Type": "application/json"})
    try:
        with urlopen(req, timeout=60) as http_resp:
            resp = json.loads(http_resp.read())
        batch_embs = [d["embedding"] for d in resp["data"]]
        # embeddings[i] must line up with chunks[i]; a short or long reply
        # would silently shift every later vector.
        if len(batch_embs) != len(batch):
            raise ValueError(f"expected {len(batch)} embeddings, got {len(batch_embs)}")
    except Exception as e:
        # Fallback to single
        print(f" Batch at {start} failed ({e}); embedding one text at a time")
        batch_embs = [call_embed(t) for t in batch]
    embeddings.extend(batch_embs)
    if (start // batch_size) % 10 == 0:
        pct = (start + len(batch)) * 100 // len(texts_for_embed)
        print(f" {start+len(batch)}/{len(texts_for_embed)} ({pct}%) [{time.time()-t0:.0f}s]")
elapsed = time.time() - t0
print(f" Done: {len(embeddings)} embeddings in {elapsed:.1f}s ({elapsed/len(embeddings):.2f}s each)")

print("\n=== Step 3: Rebuild Qdrant collections ===")
for col in COLLECTIONS:
    # Delete (best effort: the collection may not exist yet)
    req = Request(f"{QDRANT_URL}/collections/{col}", method="DELETE")
    try:
        urlopen(req).close()
        time.sleep(0.3)
    except Exception as e:
        print(f" Delete {col} skipped: {e}")
    # Create
    req = Request(f"{QDRANT_URL}/collections/{col}",
                  data=json.dumps({"vectors": {"size": 768, "distance": "Cosine"}}).encode(),
                  headers={"Content-Type": "application/json"}, method="PUT")
    urlopen(req).close()
    time.sleep(0.3)
    print(f" Created {col}")

# Upload
print("\n=== Step 4: Upload points ===")
batch_size = 100
for col in COLLECTIONS:
    points = []
    for i, r in enumerate(chunks):
        idx = r[0]
        cid = r[1]
        spk_name = r[3] or "Unknown"
        spk_id = r[6] or "Unknown"
        txt = r[2] or ""
        st = r[4]
        et = r[5]
        payload = {
            "chunk_type": "sentence", "uuid": UUID,
            "chunk_id": cid, "start_time": st, "end_time": et,
            "speaker_name": spk_name, "speaker_id": spk_id,
        }
        # sentence_summary stores the text under "summary"; the other two
        # collections store it under "text".
        if col == "momentry_dev_v1":
            payload["text"] = txt
        elif col == "sentence_story":
            payload["text"] = txt
        elif col == "sentence_summary":
            payload["summary"] = txt
        points.append({
            "id": idx + 1,  # point id is chunk_index + 1
            "vector": embeddings[i],
            "payload": payload,
        })
    for start in range(0, len(points), batch_size):
        batch = points[start:start+batch_size]
        req = Request(f"{QDRANT_URL}/collections/{col}/points?wait=true",
                      data=json.dumps({"points": batch}).encode(),
                      headers={"Content-Type": "application/json"}, method="PUT")
        try:
            urlopen(req).close()
        except Exception as e:
            print(f" {col} batch {start}: {e}")
        if (start // batch_size) % 5 == 0:
            print(f" {col}: {start+len(batch)}/{len(points)}")
    print(f" {col}: done")

# Verify
print("\n=== Verify ===")
for col in COLLECTIONS:
    with urlopen(f"{QDRANT_URL}/collections/{col}") as http_resp:
        resp = json.loads(http_resp.read())
    info = resp["result"]
    print(f" {col}: {info['points_count']} pts, {info['config']['params']['vectors'].get('size','?')}D")
print("\n=== Done ===")

573
scripts/vision_agent.py Normal file
View File

@@ -0,0 +1,573 @@
#!/opt/homebrew/bin/python3.11
"""
Momentry Eye — Multi-model vision detection agent
Models: grounding-dino (default), paligemma
Usage:
python3 scripts/vision_agent.py
curl localhost:5052/health
curl localhost:5052/detect -d '{"time":5461,"prompt":"gun","model":"grounding-dino"}'
curl localhost:5052/search -d '{"query":"find the gun","model":"paligemma"}'
"""
import json, os, sys, time, cv2, torch, re, psycopg2, threading
from PIL import Image, ImageDraw
from flask import Flask, request, jsonify, send_file
# Flask application; the @app.route handlers below are registered on it.
app = Flask(__name__)
# PostgreSQL connection string used by every psycopg2.connect() call in this file.
DB_URL = "postgresql://accusys@localhost:5432/momentry?host=/tmp"
BASE_DIR = "/Users/accusys/momentry/output_dev"
# Annotated detection screenshots are written here by /detect and served by /shots/<filename>.
SHOTS_DIR = os.path.join(BASE_DIR, "vision_shots")
# Created at import time so saving a shot never fails on a missing directory.
os.makedirs(SHOTS_DIR, exist_ok=True)
# Listening port; overridable through the VISION_AGENT_PORT environment variable.
PORT = int(os.environ.get("VISION_AGENT_PORT", 5052))
# Torch device: Apple "mps" backend when available, otherwise CPU.
DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
# file_uuid -> video file path. find_video() reads this first and stores any
# path it discovers back into it.
VIDEO_PATHS = {
"aeed71342a899fe4b4c57b7d41bcb692":
"/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4",
}
# ======================== Model Registry ========================
# Cache of loaded models, filled lazily by get_model().
MODELS = {} # name -> {"model": obj, "processor": obj, "info": dict}
def load_gdino():
    """Load the Grounding DINO base checkpoint onto ``DEVICE``.

    Returns:
        dict with keys ``"model"``, ``"processor"`` and ``"info"`` (static
        metadata about the model), the shape stored in ``MODELS``.
    """
    from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

    checkpoint = "IDEA-Research/grounding-dino-base"
    print("[GDINO] Loading...")
    started = time.time()
    processor = AutoProcessor.from_pretrained(checkpoint)
    detector = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint)
    detector = detector.to(DEVICE)
    print(f"[GDINO] Loaded in {time.time()-started:.1f}s")

    metadata = {
        "name": "grounding-dino",
        "params_m": 232,
        "size_mb": 891,
        "resolution": 384,
        "has_confidence": True,
        "license": "Apache 2.0",
    }
    return {"model": detector, "processor": processor, "info": metadata}
def load_paligemma():
    """Load the PaliGemma 3B mix-224 checkpoint (bfloat16) onto ``DEVICE``.

    Returns:
        dict with keys ``"model"``, ``"processor"`` and ``"info"`` (static
        metadata about the model), the shape stored in ``MODELS``.
    """
    from transformers import PaliGemmaForConditionalGeneration, AutoProcessor

    checkpoint = "google/paligemma-3b-mix-224"
    print("[PaliGemma] Loading...")
    started = time.time()
    processor = AutoProcessor.from_pretrained(checkpoint)
    generator = PaliGemmaForConditionalGeneration.from_pretrained(
        checkpoint, dtype=torch.bfloat16
    )
    generator = generator.to(DEVICE)
    print(f"[PaliGemma] Loaded in {time.time()-started:.1f}s")

    metadata = {
        "name": "paligemma",
        "params_m": 2923,
        "size_mb": 3000,
        "resolution": 224,
        "has_confidence": False,
        "license": "Gemma license",
    }
    return {"model": generator, "processor": processor, "info": metadata}
# Model name -> zero-argument loader. get_model() calls the loader the first
# time a name is requested; /models and /health list these names as available.
MODEL_REGISTRY = {
"grounding-dino": load_gdino,
"paligemma": load_paligemma,
}
def get_model(name):
    """Return the cached entry for model *name*, loading it on first use.

    Returns ``None`` when *name* is neither cached in ``MODELS`` nor known to
    ``MODEL_REGISTRY``; otherwise the dict produced by the registered loader.
    """
    if name in MODELS:
        return MODELS[name]
    if name not in MODEL_REGISTRY:
        return None
    entry = MODEL_REGISTRY[name]()
    MODELS[name] = entry
    return entry
# ======================== Inference ========================
def infer_gdino(img, prompt, threshold=0.1):
    """Run Grounding DINO on one image.

    Args:
        img: image object exposing ``.size`` as ``(width, height)``.
        prompt: text to look for; a trailing "." is appended for the model.
        threshold: passed to the processor's post-processing step.

    Returns:
        list of ``{"bbox": [4 floats rounded to 0.1], "score": float rounded
        to 3 places, "label": prompt}``, one per detection.
    """
    entry = get_model("grounding-dino")
    processor = entry["processor"]
    detector = entry["model"]

    inputs = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = detector(**inputs)

    # img.size reversed is what the original handed over as the target size.
    dets = processor.post_process_grounded_object_detection(
        outputs, threshold=threshold, target_sizes=[img.size[::-1]])[0]

    return [
        {
            "bbox": [round(v, 1) for v in box.tolist()],
            "score": round(score.item(), 3),
            "label": prompt,
        }
        for box, score in zip(dets["boxes"], dets["scores"])
    ]
def infer_paligemma(img, prompt, threshold=0.1):
    """Run PaliGemma ``detect <prompt>`` on one image.

    Args:
        img: image object exposing ``.size`` as ``(width, height)``.
        prompt: object name inserted into the ``detect ...`` instruction.
        threshold: accepted for signature parity with the other inference
            functions; unused because PaliGemma produces no confidence.

    Returns:
        list of ``{"bbox": [x1, y1, x2, y2], "score": 1.0, "label": prompt}``
        in pixel coordinates of *img*. ``score`` is a constant placeholder.
    """
    m = get_model("paligemma")
    inputs = m["processor"](text=f"detect {prompt}", images=img, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        outputs = m["model"].generate(**inputs, max_new_tokens=100)
    decoded = m["processor"].decode(outputs[0], skip_special_tokens=True)

    # Each detection is four location tokens followed by a label. Per the
    # PaliGemma documentation the tokens are ordered y_min, x_min, y_max,
    # x_max (NOT x first), each a value on a 0-1024 grid.
    locs = [int(v) for v in re.findall(r'<loc(\d+)>', decoded)]
    img_w, img_h = img.size
    results = []
    # Consume the tokens in complete groups of four; a trailing partial group
    # (output cut off by max_new_tokens) is ignored.
    for i in range(0, len(locs) - 3, 4):
        y1, x1, y2, x2 = locs[i:i + 4]
        results.append({
            "bbox": [
                round(x1 / 1024 * img_w, 1),
                round(y1 / 1024 * img_h, 1),
                round(x2 / 1024 * img_w, 1),
                round(y2 / 1024 * img_h, 1),
            ],
            "score": 1.0,
            "label": prompt,
        })
    return results
# Model name -> inference function, each called as fn(img, prompt, threshold)
# and returning a list of detection dicts. The route handlers validate the
# requested model name against these keys.
INFERENCE = {
"grounding-dino": infer_gdino,
"paligemma": infer_paligemma,
}
# ======================== Utilities ========================
def find_video(uuid):
    """Return the video file path for *uuid*, or ``None`` if none is found.

    Looks in ``VIDEO_PATHS`` first. Otherwise searches the demo directory
    recursively, first for files whose name starts with "Charade", then for
    files whose name contains the first 8 characters of *uuid*; the first
    match ending in .mp4/.mov/.avi is stored in ``VIDEO_PATHS`` and returned.
    """
    if uuid in VIDEO_PATHS:
        return VIDEO_PATHS[uuid]

    import glob
    base = "/Users/accusys/momentry/var/sftpgo/data/demo"
    video_exts = (".mp4", ".mov", ".avi")

    def first_video(pattern):
        # First glob hit (in glob order) that has a video extension.
        return next(
            (path for path in glob.glob(pattern, recursive=True)
             if path.endswith(video_exts)),
            None,
        )

    found = first_video(f"{base}/**/Charade*")
    if found is None:
        # The second pattern is only built when the first search found nothing.
        found = first_video(f"{base}/**/*{uuid[:8]}*")
    if found is not None:
        VIDEO_PATHS[uuid] = found
    return found
def parse_query(query):
    """Reduce a natural-language request to the bare object name.

    The text is lower-cased and trimmed; then, in this fixed order: each
    leading command phrase is removed if present, each leading article is
    removed if present, trailing ``.?!,`` characters are stripped, and each
    known trailing phrase is removed if present. Every phrase is tried once,
    in list order.

    Returns:
        The remaining text, stripped of surrounding whitespace (may be "").
    """
    lead_ins = ("find ", "show ", "search ", "where is ", "where are ",
                "looking for ", "detect ", "locate ", "spot ", "scan for ")
    articles = ("a ", "an ", "the ", "some ", "any ")
    trailers = (" in the image", " in this scene", " in the picture",
                " being held", " in hand", " in frame", " please")

    text = query.lower().strip()
    for lead in lead_ins:
        text = text.removeprefix(lead)
    for article in articles:
        text = text.removeprefix(article)
    text = text.rstrip(".?!,")
    for tail in trailers:
        text = text.removesuffix(tail)
    return text.strip()
def resolve_target(target_str):
    """Resolve ``"<file_uuid>:<identifier>"`` to ``(uuid, start, end)``.

    Lookups against ``dev.chunks``, in order, first hit wins:
      1. a chunk whose ``chunk_id`` equals the identifier;
      2. if the identifier is all digits, a chunk whose ``chunk_id`` is
         ``"<uuid>_<identifier>"``;
      3. the MIN(start_time)/MAX(end_time) over ``trace`` chunks whose
         ``chunk_id`` ends in ``_trace_<identifier without "trace_">``.

    Returns:
        ``(uuid, start_seconds, end_seconds)`` with float times, or ``None``
        when the string has no ":" or nothing with non-NULL times matches.
    """
    if not target_str or ":" not in target_str:
        return None
    # A ":" is guaranteed, so a single split always yields exactly two parts.
    uuid, identifier = target_str.split(":", 1)

    by_chunk_id = ("SELECT start_time, end_time FROM dev.chunks "
                   "WHERE file_uuid=%s AND chunk_id=%s LIMIT 1")
    by_trace = ("SELECT MIN(start_time), MAX(end_time) FROM dev.chunks "
                "WHERE file_uuid=%s AND chunk_type='trace' AND chunk_id LIKE %s")

    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()

        def lookup(sql, params):
            # Returns (start, end) as floats, or None if no row / NULL times.
            cur.execute(sql, params)
            row = cur.fetchone()
            if row and row[0] is not None and row[1] is not None:
                return float(row[0]), float(row[1])
            return None

        span = lookup(by_chunk_id, (uuid, identifier))
        if span is None and identifier.isdigit():
            span = lookup(by_chunk_id, (uuid, f"{uuid}_{identifier}"))
        if span is None:
            tid = identifier.replace("trace_", "")
            span = lookup(by_trace, (uuid, f"%_trace_{tid}"))
        cur.close()
    finally:
        # Always release the connection, including when a query raises.
        conn.close()

    if span is None:
        return None
    return (uuid, span[0], span[1])
def register_resource(resource_id, name, info):
    """Upsert this agent's model as an ``online`` row in ``dev.resources``.

    Args:
        resource_id: primary identifier of the resource row.
        name: model name stored inside the ``config`` JSON.
        info: dict read for ``has_confidence``, ``params_m``, ``resolution``
            and ``license``.

    Best effort: any failure is printed, never raised, so the caller can
    continue without the database.
    """
    capabilities = json.dumps({
        "detect": "Single-frame detection",
        "search": "Range search with NL query",
        "has_confidence": info.get("has_confidence", True),
    })
    config = json.dumps({
        "name": name, "port": PORT, "device": DEVICE,
        "params_m": info.get("params_m"),
        "resolution": info.get("resolution"),
        "license": info.get("license"),
    })
    metadata = json.dumps({"version": "2.0", "docs": "/health"})

    conn = None
    try:
        conn = psycopg2.connect(DB_URL)
        cur = conn.cursor()
        # On conflict only status, heartbeat and config are refreshed.
        cur.execute("""
            INSERT INTO dev.resources (resource_id, resource_type, category, capabilities, config, metadata, status, last_heartbeat)
            VALUES (%s, %s, %s, %s::jsonb, %s::jsonb, %s::jsonb, %s, NOW())
            ON CONFLICT (resource_id) DO UPDATE SET status=%s, last_heartbeat=NOW(), config=EXCLUDED.config
        """, (
            resource_id, "vision_model", "object_detection",
            capabilities, config, metadata,
            "online", "online"))
        conn.commit()
        cur.close()
        print(f"[Resource] Registered '{resource_id}'")
    except Exception as e:
        print(f"[Resource] Register '{resource_id}' failed: {e}")
    finally:
        # Close the connection on the failure path too.
        if conn is not None:
            try:
                conn.close()
            except Exception as e:
                print(f"[Resource] Closing connection failed: {e}")
def heartbeat_loop(resource_ids):
    """Refresh ``last_heartbeat`` for every resource id once a minute, forever.

    A failed round is reported and the loop carries on; a fresh connection is
    opened for each round. Never returns.
    """
    while True:
        conn = None
        try:
            conn = psycopg2.connect(DB_URL)
            cur = conn.cursor()
            for rid in resource_ids:
                cur.execute("UPDATE dev.resources SET last_heartbeat = NOW() WHERE resource_id = %s", (rid,))
            conn.commit()
            cur.close()
        except Exception as e:
            # Keep the loop alive, but report why the heartbeat was missed.
            print(f"[Resource] Heartbeat failed: {e}")
        finally:
            if conn is not None:
                try:
                    conn.close()
                except Exception as e:
                    print(f"[Resource] Closing heartbeat connection failed: {e}")
        time.sleep(60)
# ======================== Annotate ========================
def annotate_image(img, detections, prompt):
    """Draw every detection onto *img* in place and return *img*.

    Each detection gets a 3 px lime rectangle around its ``"bbox"`` and the
    caption ``"<prompt> <score>"`` (two decimals; score defaults to 1.0)
    placed 18 px above the box's top-left corner.
    """
    canvas = ImageDraw.Draw(img)
    for det in detections:
        box = det["bbox"]
        confidence = det.get("score", 1.0)
        canvas.rectangle(box, outline="lime", width=3)
        anchor = (box[0], box[1]-18)
        canvas.text(anchor, f"{prompt} {confidence:.2f}", fill="lime")
    return img
# ======================== API Routes ========================
@app.route("/models", methods=["GET"])
def list_models():
    """Report every registered model and whether it is currently loaded.

    Loaded models return their full ``info`` dict plus ``"loaded": True``;
    the rest return only ``{"name": ..., "loaded": False}``.
    """
    def describe(model_name):
        is_loaded = model_name in MODELS
        if is_loaded:
            # Copy so the cached info dict itself is never mutated.
            summary = dict(MODELS[model_name]["info"])
        else:
            summary = {"name": model_name, "loaded": False}
        summary["loaded"] = is_loaded
        return summary

    return jsonify({"models": [describe(model_name) for model_name in MODEL_REGISTRY]})
# Default fusion weights: GDINO 0.6, PaliGemma 0.4
# Used by /detect when model == "fusion" and the request carries no "weights";
# each detection's fused_score is its model score multiplied by that model's weight.
FUSION_WEIGHTS = {"grounding-dino": 0.6, "paligemma": 0.4}
@app.route("/detect", methods=["POST"])
def detect():
    """Detect ``prompt`` in the single video frame at ``time`` seconds.

    JSON body fields (all optional): ``uuid``, ``time`` (seconds), ``prompt``,
    ``model`` (a key of ``INFERENCE`` or ``"fusion"``), ``threshold``,
    ``weights`` (model name -> weight, overrides the defaults).

    In ``"fusion"`` mode every model in ``INFERENCE`` runs; results are sorted
    by fused score and a detection is dropped when it overlaps an already
    kept one with IoU > 0.5. An annotated screenshot is saved in
    ``SHOTS_DIR`` and its URL is returned as ``shot_url``.

    Returns 400 for an unknown model, a non-numeric time or an unreadable
    frame, and 404 when no video is found.
    """
    # force=True: the documented `curl -d '{...}'` usage sends no JSON content
    # type; silent=True: malformed bodies fall back to the defaults.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        data = {}
    uuid = str(data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692"))
    t_sec = data.get("time", 0)
    prompt = data.get("prompt", "gun")
    model_name = data.get("model", "grounding-dino")
    threshold = data.get("threshold", 0.1)
    weights = data.get("weights", None)  # e.g. {"grounding-dino":0.7,"paligemma":0.3}

    # Determine which models to run
    if model_name == "fusion":
        models_to_run = list(INFERENCE.keys())
    elif isinstance(model_name, str) and model_name in INFERENCE:
        models_to_run = [model_name]
    else:
        return jsonify({"error": f"Unknown model: {model_name}"}), 400

    if isinstance(weights, dict) and weights:
        fusion_weights = weights
    elif model_name == "fusion":
        fusion_weights = FUSION_WEIGHTS
    else:
        fusion_weights = {model_name: 1.0}

    try:
        seconds = float(t_sec)
    except (TypeError, ValueError):
        return jsonify({"error": f"time must be a number of seconds, got: {t_sec}"}), 400

    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404

    cap = cv2.VideoCapture(video)
    try:
        # Fall back to 25 fps when the container reports none.
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(seconds * fps))
        ret, frame = cap.read()
    finally:
        cap.release()
    if not ret:
        return jsonify({"error": f"Cannot read frame at {t_sec}s"}), 400
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    all_detections = {}
    fusion_results = []
    t0 = time.time()
    for mn in models_to_run:
        detections = INFERENCE[mn](img, prompt, threshold)
        all_detections[mn] = detections
        w = fusion_weights.get(mn, 0.5)
        for d in detections:
            # PaliGemma has no score, treat detected=1.0
            model_score = d.get("score", 1.0) if mn == "grounding-dino" else 1.0
            fusion_results.append({
                "bbox": d["bbox"],
                "label": d["label"],
                "score": model_score,
                "fused_score": round(model_score * w, 3),
                "source_model": mn,
            })
    infer_ms = (time.time() - t0) * 1000

    # Deduplicate by bbox IOU for fusion mode: keep the highest fused score of
    # any group of boxes overlapping with IoU > 0.5.
    if model_name == "fusion" and len(fusion_results) > 1:
        deduped = []
        fusion_results.sort(key=lambda x: -x["fused_score"])
        for r in fusion_results:
            if not any(calc_iou(r["bbox"], kept["bbox"]) > 0.5 for kept in deduped):
                deduped.append(r)
        fusion_results = deduped

    # Annotate with best result
    if model_name == "fusion":
        display_dets = [{"bbox": r["bbox"], "score": r["fused_score"], "label": prompt}
                        for r in fusion_results]
    else:
        display_dets = all_detections.get(model_name, [])
    img_ann = annotate_image(img.copy(), display_dets, prompt)

    def safe_part(value):
        # Request-supplied text must not inject path separators (or other odd
        # characters) into the file name written under SHOTS_DIR.
        return re.sub(r"[^\w.\- ]+", "_", str(value))

    shot_name = f"{safe_part(uuid[:8])}_{int(seconds)}s_{safe_part(prompt)}_{model_name}.jpg"
    img_ann.save(os.path.join(SHOTS_DIR, shot_name))

    return jsonify({
        "model": model_name,
        "fusion_weights": fusion_weights,
        "models_used": models_to_run,
        "per_model": {mn: {"detections": all_detections.get(mn, []),
                           "n_detections": len(all_detections.get(mn, []))}
                      for mn in models_to_run},
        "fusion": fusion_results if model_name == "fusion" else None,
        "detections": display_dets,
        "time_ms": round(infer_ms, 1),
        "n_detections": len(display_dets),
        "shot_url": f"/shots/{shot_name}",
    })
def calc_iou(b1, b2):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    A tiny constant (1e-10) is added to the union so two zero-area boxes do
    not cause a division by zero.
    """
    # Corners of the overlap rectangle.
    left = max(b1[0], b2[0])
    top = max(b1[1], b2[1])
    right = min(b1[2], b2[2])
    bottom = min(b1[3], b2[3])

    # A negative extent means the boxes do not overlap on that axis.
    overlap_w = max(0, right - left)
    overlap_h = max(0, bottom - top)
    intersection = overlap_w * overlap_h

    area_first = (b1[2]-b1[0])*(b1[3]-b1[1])
    area_second = (b2[2]-b2[0])*(b2[3]-b2[1])
    union = area_first + area_second - intersection + 1e-10
    return intersection / union
@app.route("/search", methods=["POST"])
def search():
    """Scan a time range of the video for the object named in ``query``.

    JSON body fields (all optional): ``uuid``, ``target``
    (``"file_uuid:chunk_id"``, takes precedence over ``range``), ``query``
    (natural language, reduced by ``parse_query``), ``range``
    (``"start-end"`` seconds), ``interval`` (seconds between sampled frames),
    ``threshold``, ``model`` (a key of ``INFERENCE``).

    One frame is sampled every ``interval`` seconds; frames with at least one
    detection become a hit (at most 100 hits, top 3 detections each).

    Returns 400 for an unknown model, an unparseable query, a malformed range
    or a non-positive interval, and 404 for an unresolvable target or
    missing video.
    """
    # force=True: the documented `curl -d '{...}'` usage sends no JSON content
    # type; silent=True: malformed bodies fall back to the defaults.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        data = {}
    uuid = data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692")
    target_str = data.get("target", "")
    query = data.get("query", "find the gun")
    range_str = data.get("range", "0-6780")
    interval = data.get("interval", 30)
    threshold = data.get("threshold", 0.15)
    model_name = data.get("model", "grounding-dino")
    if model_name not in INFERENCE:
        return jsonify({"error": f"Unknown model: {model_name}. Available: {list(INFERENCE.keys())}"}), 400

    # Parse query → object name
    prompt = parse_query(query)
    if not prompt:
        return jsonify({"error": f"Cannot parse query: {query}"}), 400

    try:
        interval_secs = float(interval)
    except (TypeError, ValueError):
        interval_secs = 0.0
    if not interval_secs > 0:
        return jsonify({"error": f"interval must be a positive number of seconds: {interval}"}), 400

    # Resolve target → time range
    if target_str:
        resolved = resolve_target(target_str)
        if not resolved:
            return jsonify({"error": f"Cannot resolve target: {target_str}"}), 404
        uuid, range_start, range_end = resolved
    else:
        range_text = str(range_str)
        parts = range_text.split("-") if "-" in range_text else ["0", "6780"]
        try:
            range_start = float(parts[0])
            range_end = float(parts[1]) if len(parts) > 1 else 6780
        except ValueError:
            return jsonify({"error": f"Invalid range (expected 'start-end' seconds): {range_str}"}), 400

    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404

    cap = cv2.VideoCapture(video)
    hits = []
    t_start = time.time()
    try:
        # Fall back to 25 fps when the container reports none.
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        infer_fn = INFERENCE[model_name]
        # range() rejects a zero step, which a sub-frame interval would give.
        frame_step = max(1, int(interval_secs * fps))
        first_frame = int(range_start * fps)
        last_frame = min(int(range_end * fps), total_frames)
        for frame_num in range(first_frame, last_frame, frame_step):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                continue
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            detections = infer_fn(img, prompt, threshold)
            if detections:
                ts = frame_num / fps
                best = max(d.get("score", 1.0) for d in detections)
                hits.append({
                    "time": round(ts, 1),
                    "time_str": f"{int(ts//60)}:{int(ts%60):02d}.{int((ts%1)*fps):02d}",
                    "frame": frame_num,
                    "n_detections": len(detections),
                    "best_score": best,
                    "detections": detections[:3],
                })
                if len(hits) >= 100:
                    break
    finally:
        cap.release()
    elapsed = time.time() - t_start

    return jsonify({
        "model": model_name,
        "query": query, "object": prompt,
        "target": target_str or None,
        "range": f"{range_start:.0f}-{range_end:.0f}",
        "interval_secs": interval,
        "hits": hits,
        "n_hits": len(hits),
        "elapsed_secs": round(elapsed, 1),
    })
@app.route("/multimodal", methods=["POST"])
def multimodal_search():
    """Multi-modal search across all chunk types.

    For sentence chunks: ASR text + visual confirmation.
    For trace/story/cut chunks: visual detection only (no ASR text).

    Input:
        {"keyword":"gun"} — find chunks mentioning "gun" in ASR + visually confirm
        {"keyword":"gun","chunk_type":"trace"} — search trace chunks visually (no ASR)
        {"target":"file_uuid:chunk_id"} — search a specific chunk visually

    Other optional fields: ``uuid``, ``prompt`` (defaults to the keyword),
    ``threshold``, ``range`` (``"start-end"`` seconds, used when neither a
    target nor a sentence keyword search applies).

    Each selected chunk is checked visually with Grounding DINO on the frame
    at the midpoint of its time span. Returns 400 for a malformed range and
    404 when the target cannot be resolved, no chunk matches or the video is
    missing.
    """
    # force=True: accept bodies sent without a JSON content type;
    # silent=True: malformed bodies fall back to the defaults.
    data = request.get_json(force=True, silent=True)
    if not isinstance(data, dict):
        data = {}
    uuid = data.get("uuid", "aeed71342a899fe4b4c57b7d41bcb692")
    keyword = data.get("keyword", "")
    prompt = data.get("prompt", keyword or "")
    target_str = data.get("target", "")
    chunk_type = data.get("chunk_type", "sentence")  # sentence, trace, story, cut
    threshold = data.get("threshold", 0.15)
    model_name = "grounding-dino"

    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        # Resolve target first if provided
        if target_str:
            resolved = resolve_target(target_str)
            if not resolved:
                return jsonify({"error": f"Cannot resolve target: {target_str}"}), 404
            uuid, st, et = resolved
            cur.execute("SELECT chunk_id, chunk_index, chunk_type, text_content FROM dev.chunks WHERE file_uuid=%s AND start_time=%s AND end_time=%s LIMIT 1",
                        (uuid, st, et))
            chunks = [(r[0], r[1], r[2], st, et, r[3] or "") for r in cur.fetchall()]
        elif keyword and chunk_type == "sentence":
            # Search sentence chunks by ASR text keyword
            cur.execute("""
                SELECT chunk_id, chunk_index, chunk_type, start_time, end_time, text_content
                FROM dev.chunks
                WHERE file_uuid=%s AND chunk_type='sentence'
                AND text_content ILIKE CONCAT('%%', %s, '%%')
                ORDER BY start_time
            """, (uuid, keyword))
            chunks = cur.fetchall()
        else:
            # Search any chunk type by time range (visual only, no ASR)
            range_text = str(data.get("range", "0-6780"))
            parts = range_text.split("-") if "-" in range_text else ["0", "6780"]
            try:
                range_start = float(parts[0])
                range_end = float(parts[1]) if len(parts) > 1 else 6780
            except ValueError:
                return jsonify({"error": f"Invalid range (expected 'start-end' seconds): {range_text}"}), 400
            cur.execute("""
                SELECT chunk_id, chunk_index, chunk_type, start_time, end_time, COALESCE(text_content, '')
                FROM dev.chunks
                WHERE file_uuid=%s AND chunk_type=%s
                AND start_time BETWEEN %s AND %s
                ORDER BY start_time
            """, (uuid, chunk_type, range_start, range_end))
            chunks = cur.fetchall()
    finally:
        # Runs on the early returns above and on query errors as well.
        conn.close()

    if not chunks:
        return jsonify({"error": f"No matching chunks found"}), 404

    # Visual confirmation
    video = find_video(uuid)
    if not video:
        return jsonify({"error": "Video not found"}), 404

    cap = cv2.VideoCapture(video)
    results = []
    t_start = time.time()
    try:
        # Fall back to 25 fps when the container reports none.
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        infer_fn = INFERENCE.get(model_name)
        for chunk_id, chunk_idx, ctype, st, et, text in chunks:
            # Sample the frame at the middle of the chunk's time span.
            center = (st + et) / 2
            frame_num = int(center * fps)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                continue
            img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            detections = infer_fn(img, prompt or keyword, threshold)
            entry = {
                "chunk_id": chunk_id,
                "chunk_index": chunk_idx,
                "chunk_type": ctype,
                "time_range": f"{st:.1f}-{et:.1f}",
                "time_str": f"{int(st//60)}:{int(st%60):02d}-{int(et//60)}:{int(et%60):02d}",
                "visual_confirmed": len(detections) > 0,
                "best_score": round(max(d.get("score", 1.0) for d in detections), 3) if detections else 0,
                "n_visual_dets": len(detections),
            }
            if keyword and ctype == "sentence":
                entry["asr_text"] = text[:150]
                entry["asr_matched"] = keyword.lower() in text.lower()
            results.append(entry)
    finally:
        cap.release()
    elapsed = time.time() - t_start

    return jsonify({
        "keyword": keyword or prompt,
        "chunk_type": chunk_type,
        "target": target_str or None,
        "total_chunks": len(chunks),
        "visual_confirmed": sum(1 for r in results if r["visual_confirmed"]),
        "asr_matched": sum(1 for r in results if r.get("asr_matched")),
        "elapsed_secs": round(elapsed, 1),
        "results": results,
    })
@app.route("/shots/<filename>")
def serve_shot(filename):
    """Serve a saved screenshot from SHOTS_DIR as a JPEG.

    Args:
        filename: File name taken from the URL path.

    Returns:
        The file sent with an ``image/jpeg`` mimetype, or a JSON
        ``{"error": "Not found"}`` body with status 404 when the name is not
        a plain file name or no such regular file exists.
    """
    # Only bare file names are accepted: "..", "." or anything containing a
    # path separator must not be joined onto SHOTS_DIR.
    if filename in ("", ".", "..") or os.path.basename(filename) != filename:
        return jsonify({"error": "Not found"}), 404
    path = os.path.join(SHOTS_DIR, filename)
    # isfile rather than exists: a directory is "not found" here instead of
    # being handed to send_file.
    if not os.path.isfile(path):
        return jsonify({"error": "Not found"}), 404
    return send_file(path, mimetype="image/jpeg")
@app.route("/health")
def health():
    """Return a JSON status report: loaded models, registered models, device, port."""
    payload = {
        "status": "ok",
        # Keys of MODELS / MODEL_REGISTRY, materialised as plain lists for JSON.
        "models_loaded": list(MODELS.keys()),
        "models_available": list(MODEL_REGISTRY.keys()),
        "device": DEVICE,
        "port": PORT,
    }
    return jsonify(payload)
if __name__ == "__main__":
    # Describe both detector backends, then register each one as a resource.
    gdino_info = dict(params_m=232, resolution=384, has_confidence=True, license="Apache 2.0")
    pg_info = dict(params_m=2923, resolution=224, has_confidence=False, license="Gemma license")
    for resource_id, model_key, info in (
        ("eye-gdino", "grounding-dino", gdino_info),
        ("eye-paligemma", "paligemma", pg_info),
    ):
        register_resource(resource_id, model_key, info)

    # Heartbeat for both resources runs on a daemon thread in the background.
    t = threading.Thread(
        target=heartbeat_loop,
        args=(["eye-gdino", "eye-paligemma"],),
        daemon=True,
    )
    t.start()

    # Startup banner.
    print(f"\n{'='*60}")
    print(f" 👁️ Momentry Eye — port {PORT}")
    print(f"{'='*60}")
    print(f" Models: {', '.join(MODEL_REGISTRY.keys())}")
    print(f" Device: {DEVICE}")
    print(f" Resources: eye-gdino, eye-paligemma")

    # grounding-dino is loaded up front; it is the default model.
    print(f" Loading default model...")
    get_model("grounding-dino")
    print(f" 👁️ Ready: http://localhost:{PORT}")

    app.run(host="0.0.0.0", port=PORT, threaded=True)

View File

@@ -0,0 +1,84 @@
#!/opt/homebrew/bin/python3.11
"""
Test Grounding DINO Large with COMBINED prompts — one inference per frame.
"""
import json, os, time, cv2, torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
# --- Configuration -----------------------------------------------------------
MODEL_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/zero_shot_objects"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# (time in seconds, label printed next to the result for that frame)
TIMEPOINTS = [
    (429, "stamp"),
    (691, "stamp_letter"),
    (762, "passport"),
    (3491, "passport"),
    (5054, "passport"),
    (5434, "letter"),
    (5443, "stamp_envelope"),
    (5467, "envelope"),
    (5500, "stamp"),
    (5506, "stamp"),
    (5783, "letter"),
    (5786, "envelope"),
]
# All object phrases in a single period-separated query: one inference per frame.
COMBINED_PROMPT = "stamp. postage stamp. envelope. passport. identification. letter."

# --- Model -------------------------------------------------------------------
print("Loading Large model...")
t0 = time.time()
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_PATH)
device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)
print(f"Loaded in {time.time()-t0:.1f}s")

# --- Inference over the chosen timepoints --------------------------------------
cap = cv2.VideoCapture(VIDEO)
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0  # fall back to 25 when the FPS reads as 0
print(f"\nTesting {len(TIMEPOINTS)} timepoints with combined prompt...")
t_infer = time.time()
for t_sec, label in TIMEPOINTS:
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    _, frame = cap.read()
    if frame is None:
        continue
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

    # ONE inference with ALL prompts
    inputs = processor(images=img, text=COMBINED_PROMPT, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    # PIL reports (width, height); reversed to (height, width) for post-processing.
    target = torch.tensor([img.size[::-1]])
    dets = processor.post_process_grounded_object_detection(
        outputs, threshold=0.1, target_sizes=target
    )[0]

    has_labels = "labels" in dets
    det_list = [
        {
            "bbox": [round(v, 1) for v in box.tolist()],
            "score": round(dets["scores"][idx].item(), 3),
            "label": str(dets["labels"][idx]) if has_labels else "object",
        }
        for idx, box in enumerate(dets["boxes"])
    ]

    # Which of the expected object names occur in any detection label
    found = {
        obj
        for det in det_list
        for obj in ("stamp", "envelope", "passport", "letter")
        if obj in det["label"].lower()
    }
    found_str = ", ".join(sorted(found)) if found else "none"
    print(f" {t_sec//60}:{t_sec%60:02d} {label:20s} | {len(det_list)} dets | found: [{found_str}]")

    # Draw every detection on the frame, then save the annotated frame
    green = (0, 255, 0)
    for d in det_list:
        x1, y1, x2, y2 = (int(v) for v in d["bbox"])
        cv2.rectangle(frame, (x1, y1), (x2, y2), green, 2)
        cv2.putText(frame, f"{d['label']} {d['score']:.2f}", (x1, y1-5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)
    out_path = os.path.join(OUTPUT_DIR, f"combined_{t_sec}s.jpg")
    cv2.imwrite(out_path, frame, [cv2.IMWRITE_JPEG_QUALITY, 85])

cap.release()
print(f"\nDone in {time.time()-t_infer:.0f}s")
print(f"Screenshots: {OUTPUT_DIR}/")

View File

@@ -0,0 +1,156 @@
#!/opt/homebrew/bin/python3.11
"""
Zero-shot Gun Detection Test — OWL-ViT vs Grounding DINO
Tests on 8 known timepoints: 5 original pistol frames + 3 ASR gun mentions.
"""
import json, os, sys, time, cv2
import torch
from PIL import Image
import numpy as np
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/zero_shot_test"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Each entry is (time in seconds, "<seconds>s" label used in keys/file names,
# human-readable description). The label is derived from the time.
TIMEPOINTS = [
    (t_sec, f"{t_sec}s", desc)
    for t_sec, desc in (
        (2646, "ASR: He has a gun"),
        (3188, "Original pistol"),
        (3697, "ASR: Where's your gun"),
        (5341, "ASR: He already killed 3 men"),
        (5461, "Original pistol"),
        (6309, "Original pistol"),
        (6377, "Original gun"),
        (6479, "Original pistol"),
    )
]
# Text prompts tried one at a time against every frame.
PROMPTS = ["gun", "pistol", "rifle", "weapon"]

# Shared capture handle; fps falls back to 25 when the reported value is 0.
cap = cv2.VideoCapture(VIDEO)
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
def get_frame(t_sec):
    """Seek the shared capture to ``t_sec`` seconds and read one frame.

    Returns the frame on a successful read, otherwise None.
    """
    frame_index = int(t_sec * fps)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
    ok, frame = cap.read()
    if not ok:
        return None
    return frame
def save_annotated(frame, detections, prompt, model_name, label):
    """Draw detections on a copy of ``frame`` and write it to OUTPUT_DIR as a JPEG.

    Args:
        frame: Image array to annotate; it is copied, not modified.
        detections: Iterable of dicts with "bbox", "score" and "label" keys.
        prompt: Prompt text, embedded in the output file name.
        model_name: Model identifier, embedded in the output file name.
        label: Timepoint label, embedded in the output file name.

    Returns:
        The file name (without directory) that was written.
    """
    green = (0, 255, 0)
    canvas = frame.copy()
    for det in detections:
        left, top, right, bottom = (int(coord) for coord in det["bbox"])
        score = det["score"]
        name = det["label"]
        cv2.rectangle(canvas, (left, top), (right, bottom), green, 2)
        # Caption sits 5 px above the box's top-left corner.
        cv2.putText(canvas, f"{name} {score:.2f}", (left, top - 5),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)
    filename = f"{label}_{model_name}_prompt-{prompt}.jpg"
    destination = os.path.join(OUTPUT_DIR, filename)
    cv2.imwrite(destination, canvas, [cv2.IMWRITE_JPEG_QUALITY, 85])
    return filename
# Per-model results, filled in by each section and dumped to JSON at the end.
all_results = {}

# ========== OWL-ViT ==========
print("=" * 60)
print("OWL-ViT (google/owlvit-base-patch32)")
print("=" * 60)
from transformers import OwlViTProcessor, OwlViTForObjectDetection

owl_proc = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
owl_model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
device = "mps" if torch.backends.mps.is_available() else "cpu"
owl_model.to(device)
print(f"Device: {device}")

# Maps "<label>_prompt-<prompt>" to the list of detections for that pair.
owl_dets = {}
t0 = time.time()
for t_sec, label, desc in TIMEPOINTS:
    frame = get_frame(t_sec)
    if frame is None:
        continue
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for prompt in PROMPTS:
        inputs = owl_proc(text=[[prompt]], images=img, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = owl_model(**inputs)
        # PIL reports (width, height); reversed to (height, width).
        target = torch.tensor([img.size[::-1]])
        dets = owl_proc.post_process_grounded_object_detection(
            outputs, threshold=0.05, target_sizes=target
        )[0]
        det_list = [
            {
                "bbox": [round(v, 1) for v in box.tolist()],
                "score": round(dets["scores"][idx].item(), 3),
                "label": prompt,
            }
            for idx, box in enumerate(dets["boxes"])
        ]
        # An annotated frame is written for every prompt, even with no detections.
        save_annotated(frame, det_list, prompt, "owlvit", label)
        owl_dets[f"{label}_prompt-{prompt}"] = det_list
        if not det_list:
            continue
        best = max(d["score"] for d in det_list)
        print(f" [{desc}] prompt='{prompt}': {len(det_list)} det best={best:.3f}")

all_results["owlvit"] = {"elapsed": round(time.time()-t0, 1), "detections": owl_dets}
# ========== Grounding DINO ==========
print("\n" + "=" * 60)
print("Grounding DINO (IDEA-Research/grounding-dino-base)")
print("=" * 60)
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

gd_proc = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-base")
gd_model = AutoModelForZeroShotObjectDetection.from_pretrained("IDEA-Research/grounding-dino-base")
gd_model.to(device)

# Maps "<label>_prompt-<prompt>" to the list of detections for that pair.
gd_dets = {}
t0 = time.time()
for t_sec, label, desc in TIMEPOINTS:
    frame = get_frame(t_sec)
    if frame is None:
        continue
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    for prompt in PROMPTS:
        # Grounding DINO text queries are expected to be lowercase and to end
        # with a period. Only the query gets the "."; the bare prompt is still
        # used for labels, result keys and file names.
        query = f"{prompt.lower()}."
        inputs = gd_proc(images=img, text=query, return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = gd_model(**inputs)
        # PIL reports (width, height); reversed to (height, width).
        target = torch.tensor([img.size[::-1]])
        dets = gd_proc.post_process_grounded_object_detection(
            outputs, threshold=0.05, target_sizes=target
        )[0]
        det_list = [
            {
                "bbox": [round(v, 1) for v in box.tolist()],
                "score": round(dets["scores"][idx].item(), 3),
                "label": prompt,
            }
            for idx, box in enumerate(dets["boxes"])
        ]
        # An annotated frame is written for every prompt, even with no detections.
        save_annotated(frame, det_list, prompt, "grounding-dino", label)
        key = f"{label}_prompt-{prompt}"
        gd_dets[key] = det_list
        if det_list:
            best = max(d["score"] for d in det_list)
            print(f" [{desc}] prompt='{prompt}': {len(det_list)} det best={best:.3f}")

all_results["grounding-dino"] = {"elapsed": round(time.time()-t0, 1), "detections": gd_dets}
cap.release()
# ========== Summary ==========
print("\n" + "=" * 60)
print("SUMMARY")
print("=" * 60)
for model in ["owlvit", "grounding-dino"]:
    d = all_results[model]
    dets = d["detections"]
    # A timepoint is a hit when at least one prompt produced a detection for
    # it. Counting non-empty (timepoint, prompt) entries instead would exceed
    # the number of timepoints.
    hits = sum(
        1
        for _, label, _ in TIMEPOINTS
        if any(dets.get(f"{label}_prompt-{p}") for p in PROMPTS)
    )
    total = sum(len(v) for v in dets.values())
    print(f"\n{model} ({d['elapsed']}s): {hits}/{len(TIMEPOINTS)} timepoints, {total} total detections")
    for t_sec, label, desc in TIMEPOINTS:
        # (prompt, score) for every detection at this timepoint, all prompts.
        candidates = [
            (p, dd["score"])
            for p in PROMPTS
            for dd in dets.get(f"{label}_prompt-{p}") or []
        ]
        if candidates:
            best = max(candidates, key=lambda x: x[1])
            print(f" {desc}: best={best[1]:.3f} (prompt='{best[0]}')")
        else:
            print(f" {desc}: no detections")

# Context manager so the results file is flushed and closed deterministically.
with open(os.path.join(OUTPUT_DIR, "zero_shot_results.json"), "w") as fh:
    json.dump(all_results, fh, indent=2)
print(f"\nSaved to {OUTPUT_DIR}/")

View File

@@ -0,0 +1,103 @@
#!/opt/homebrew/bin/python3.11
"""
Test Grounding DINO Large on stamps, envelopes, passports, letters.
"""
import json, os, time, cv2, torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
# --- Configuration -----------------------------------------------------------
MODEL_PATH = "/Users/accusys/momentry_core_0.1/models/gun/grounding-dino-large-hf"
VIDEO = "/Users/accusys/momentry/var/sftpgo/data/demo/Charade (1963) Cary Grant & Audrey Hepburn \uff5c Comedy Mystery Romance Thriller \uff5c Full Movie.mp4"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev/zero_shot_objects"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Timepoints per object type: (label, time_sec, prompts)
TESTS = [
    ("stamp_001", 429, ["stamp", "postage stamp"]),
    ("stamp_002", 691, ["stamp", "envelope", "letter"]),
    ("stamp_003", 5443, ["stamp", "envelope"]),
    ("stamp_004", 5500, ["stamp"]),
    ("stamp_005", 5506, ["stamp"]),
    ("envelope_001", 5443, ["envelope"]),
    ("envelope_002", 5467, ["envelope"]),
    ("envelope_003", 5786, ["envelope"]),
    ("passport_001", 762, ["passport", "identification"]),
    ("passport_002", 3491, ["passport", "identification"]),
    ("passport_003", 5054, ["passport"]),
    ("letter_001", 691, ["letter", "envelope"]),
    ("letter_002", 5434, ["letter", "envelope"]),
    ("letter_003", 5783, ["letter", "stamp"]),
]

# --- Model -------------------------------------------------------------------
print(f"Loading Large model...")
t0 = time.time()
processor = AutoProcessor.from_pretrained(MODEL_PATH)
model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_PATH)
device = "mps" if torch.backends.mps.is_available() else "cpu"
model.to(device)
print(f"Loaded in {time.time()-t0:.1f}s, device={device}")

# --- Inference ---------------------------------------------------------------
cap = cv2.VideoCapture(VIDEO)
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0  # fall back to 25 when the FPS reads as 0
# Maps "<label>_<time>s" to {"time", "time_str", "prompts": {prompt: detections}}.
results = {}
t_infer = time.time()
for label, t_sec, prompts in TESTS:
    cap.set(cv2.CAP_PROP_POS_FRAMES, int(t_sec * fps))
    _, frame = cap.read()
    if frame is None:
        continue
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    key = f"{label}_{t_sec}s"
    entry = {"time": t_sec, "time_str": f"{t_sec//60}:{t_sec%60:02d}", "prompts": {}}
    results[key] = entry
    for prompt in prompts:
        # One inference per prompt; the query is the prompt plus a trailing period.
        inputs = processor(images=img, text=f"{prompt}.", return_tensors="pt").to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # PIL reports (width, height); reversed to (height, width).
        target = torch.tensor([img.size[::-1]])
        dets = processor.post_process_grounded_object_detection(
            outputs, threshold=0.1, target_sizes=target
        )[0]
        det_list = [
            {
                "bbox": [round(v, 1) for v in box.tolist()],
                "score": round(dets["scores"][idx].item(), 3),
            }
            for idx, box in enumerate(dets["boxes"])
        ]
        entry["prompts"][prompt] = det_list

        # Save an annotated frame only when this prompt detected something
        if not det_list:
            continue
        green = (0, 255, 0)
        cv2_img = frame.copy()
        for d in det_list:
            x1, y1, x2, y2 = (int(v) for v in d["bbox"])
            cv2.rectangle(cv2_img, (x1, y1), (x2, y2), green, 2)
            cv2.putText(cv2_img, f"{prompt} {d['score']:.2f}", (x1, y1-5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, green, 2)
        shot_path = os.path.join(OUTPUT_DIR, f"{label}_{t_sec}s_{prompt}.jpg")
        cv2.imwrite(shot_path, cv2_img, [cv2.IMWRITE_JPEG_QUALITY, 85])

cap.release()
elapsed = time.time() - t_infer
# Summary: one line per tested frame with its best-scoring prompt, then the
# full results are written next to the screenshots as results.json.
print(f"\n{'='*60}")
print(f"Results ({elapsed:.0f}s)")
print(f"{'='*60}")
for key, data in sorted(results.items()):
    # (prompt, score) for every detection recorded for this frame.
    scored = [
        (p, d["score"])
        for p, dets in data["prompts"].items()
        for d in dets
    ]
    if scored:
        best = max(scored, key=lambda x: x[1])
        print(f" {data['time_str']} {key:20s} ✅ {best[1]:.3f} ({best[0]})")
    else:
        print(f" {data['time_str']} {key:20s} ❌ none")

# Context manager so the results file is flushed and closed deterministically.
with open(os.path.join(OUTPUT_DIR, "results.json"), "w") as fh:
    json.dump(results, fh, indent=2)
print(f"\nScreenshots saved to {OUTPUT_DIR}/")