feat: ASRX hybrid pipeline, identity history, worker fixes, checkpoint system

This commit is contained in:
Accusys
2026-06-02 07:13:23 +08:00
parent e3066c3f49
commit e1572907ae
198 changed files with 43705 additions and 8910 deletions

View File

@@ -1,124 +1,320 @@
#!/opt/homebrew/bin/python3.11
"""
ASRX Processor - Speaker Diarization
Uses whisperx for speaker diarization (local model)
ASRX Processor - Hybrid Pipeline Wrapper
Pipeline:
1. ffprobe → select best audio track → ffmpeg → 16kHz mono WAV
2. SelfASRXFixed.process() (7-step hybrid speaker diarization)
3. Convert to Rust-expected format
"""
import sys
import json
import argparse
import os
import subprocess
import tempfile
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(
0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "asrx_self")
)
from redis_publisher import RedisPublisher
def process_asrx(video_path: str, output_path: str, uuid: str = ""):
"""Process video for speaker diarization using whisperx"""
def probe_audio_tracks(video_path: str) -> list:
    """List the audio streams of a media file using ffprobe.

    Args:
        video_path: Path of the video/audio file to inspect.

    Returns:
        One dict per audio stream with the keys ``index`` (stream index as
        reported by ffprobe), ``codec``, ``language`` (``"und"`` when the
        stream carries no language tag), ``channels`` and ``sample_rate``.
        An empty list is returned when ffprobe cannot be run, times out, or
        prints output that is not valid JSON.
    """
    cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-select_streams", "a", video_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        data = json.loads(result.stdout)
        return [
            {
                "index": stream.get("index"),
                "codec": stream.get("codec_name"),
                # "tags" may be absent or null; both mean "no language tag".
                "language": (stream.get("tags") or {}).get("language", "und"),
                # Normalise a missing/null channel count to 0 so callers can
                # compare channel counts numerically.
                "channels": stream.get("channels") or 0,
                "sample_rate": stream.get("sample_rate", "0"),
            }
            for stream in data.get("streams", [])
        ]
    except Exception as e:
        # Probing is best-effort: callers fall back to track 0 on an empty
        # list. Diagnostics go to stderr like the rest of the pipeline logs.
        print(f"[ASRX] ffprobe failed: {e}", file=sys.stderr)
        return []
def select_best_track(tracks: list) -> int:
    """Return the position of the preferred audio track within ``tracks``.

    Preference order: the first track tagged English (``eng`` or ``en``),
    otherwise the first track with the highest channel count. An empty list
    yields 0. The returned value is a position in ``tracks``, not the
    ``index`` field stored in each track dict.
    """
    if not tracks:
        return 0
    english_pos = next(
        (pos for pos, track in enumerate(tracks)
         if track["language"] in ("eng", "en")),
        None,
    )
    if english_pos is not None:
        return english_pos
    # max() keeps the earliest candidate on ties, matching a strict ">" scan.
    return max(range(len(tracks)), key=lambda pos: tracks[pos]["channels"])
def extract_audio_to_wav(video_path: str, track_index: int, output_wav: str) -> bool:
    """Extract one stream of ``video_path`` to a 16 kHz mono s16 WAV file.

    ``track_index`` is passed to ffmpeg as ``-map 0:<track_index>``. Returns
    True when ffmpeg exits successfully and False on any failure (missing
    binary, non-zero exit status, or exceeding the 300-second timeout).
    """
    ffmpeg_args = ["ffmpeg", "-y", "-v", "quiet", "-i", video_path]
    ffmpeg_args += ["-map", f"0:{track_index}"]
    ffmpeg_args += ["-ar", "16000", "-ac", "1", "-sample_fmt", "s16", output_wav]
    try:
        subprocess.run(ffmpeg_args, check=True, capture_output=True, timeout=300)
    except Exception as e:
        print(f"[ASRX] ffmpeg extraction failed: {e}")
        return False
    return True
def _cleanup(tmp_dir):
if tmp_dir and os.path.exists(tmp_dir):
import shutil
shutil.rmtree(tmp_dir, ignore_errors=True)
def _atomic_write(path: str, data: dict):
tmp = path + ".tmp"
with open(tmp, "w") as f:
json.dump(data, f, indent=2)
os.rename(tmp, path)
def _shared_audio_setup(video_path):
    """Extract the audio of ``video_path``; return ``(tmp_dir, audio_path)``.

    A fresh ``asrx_``-prefixed temp directory is always created and returned
    so the caller can remove it later. ``audio_path`` is the extracted
    ``audio.wav`` inside that directory, or ``video_path`` itself when the
    extraction fails.
    """
    tracks = probe_audio_tracks(video_path)
    if tracks:
        # Translate the position in the probed list to the stream index.
        actual_track_index = tracks[select_best_track(tracks)]["index"]
    else:
        actual_track_index = 0

    tmp_dir = tempfile.mkdtemp(prefix="asrx_")
    wav_path = os.path.join(tmp_dir, "audio.wav")
    if not extract_audio_to_wav(video_path, actual_track_index, wav_path):
        print("[ASRX] Audio extraction failed, falling back to original file",
              file=sys.stderr)
        return tmp_dir, video_path
    return tmp_dir, wav_path
def _convert_result(result, output_path):
"""Stage 3: 將 SelfASRXFixed result 轉為 Rust-expected format"""
fps = 30.0
base_name = os.path.basename(output_path)
uuid_part = base_name.split(".")[0]
probe_path = os.path.join(os.path.dirname(output_path),
f"{uuid_part}.probe.json")
if os.path.exists(probe_path):
try:
with open(probe_path) as pf:
probe_data = json.load(pf)
if "fps" in probe_data:
fps = float(probe_data["fps"])
except Exception:
pass
output_result = {
"language": result.get("language"),
"segments": [],
"n_speakers": result.get("n_speakers", 0),
"speaker_stats": result.get("speaker_stats", {}),
}
for seg in result.get("segments", []):
start_sec = seg["start"]
end_sec = seg["end"]
output_result["segments"].append({
"start_time": start_sec,
"end_time": end_sec,
"start_frame": int(start_sec * fps),
"end_frame": int(end_sec * fps),
"text": seg.get("text", ""),
"speaker_id": seg.get("speaker_id", seg.get("speaker", "")),
"language": seg.get("language", ""),
"lang_prob": seg.get("lang_prob", 0.0),
"quality": seg.get("quality", 0.0),
})
if "references" in result:
output_result["references"] = result["references"]
return output_result
# NOTE(review): this span comes from a rendered diff hunk (@@ -1,124 +1,320 @@).
# Removed whisperx-era lines and added SelfASRXFixed lines are interleaved and
# the indentation has been lost, so the text below is not runnable as shown.
def process_asrx(video_path: str, output_path: str, uuid: str = "",
file_uuid: str = "", resume: bool = False):
"""Main processing function.

Runs the ASRX pipeline for ``video_path`` and writes the converted result
as JSON to ``output_path``. When ``uuid`` is non-empty, progress is
reported through ``RedisPublisher``. With ``resume=True`` and an existing
``<output_path>.stage1.json`` checkpoint, processing continues through
``SelfASRXFixed.resume_from_checkpoint``; otherwise ``SelfASRXFixed.process``
is called. On failure an empty result
(``{"language": None, "segments": []}``) is written and returned.
"""
publisher = RedisPublisher(uuid) if uuid else None
if publisher:
publisher.info("asrx", "ASRX_START")
try:
import whisperx
import torch
except ImportError:
if publisher:
publisher.error("asrx", "whisperx not installed")
result = {"language": None, "segments": []}
if publisher:
publisher.complete("asrx", "0 segments")
with open(output_path, "w") as f:
json.dump(result, f, indent=2)
return result
# Checkpoint file lives next to the output and is shared by both phases.
checkpoint_path = output_path + ".stage1.json"
if publisher:
publisher.info("asrx", "ASRX_LOADING_MODEL")
# ── Phase 2: Resume from checkpoint (Steps 4-7 only) ──
if resume and os.path.exists(checkpoint_path):
print(f"[ASRX] Found checkpoint, resuming from Step 4...")
tmp_dir, audio_input = _shared_audio_setup(video_path)
try:
from asrx_self.main_fixed import SelfASRXFixed
asrx = SelfASRXFixed()
result = asrx.resume_from_checkpoint(
checkpoint_path, audio_input, output_path=output_path,
)
# The pipeline reports failure through an "error" key in its result.
if "error" in result:
if publisher:
publisher.error("asrx", result["error"])
output_result = {"language": None, "segments": []}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
_cleanup(tmp_dir)
return output_result
output_result = _convert_result(result, output_path)
if publisher:
publisher.info("asrx",
f"ASRX_COMPLETE:{len(output_result['segments'])}")
_atomic_write(output_path, output_result)
if publisher:
publisher.complete(
"asrx", f"{len(output_result['segments'])} segments")
print(f"[ASRX] Saved {len(output_result['segments'])} segments "
f"to {output_path}", file=sys.stderr)
# Remove the checkpoint once the run has completed (cleanup)
try:
os.remove(checkpoint_path)
print(f"[ASRX] Removed checkpoint: {checkpoint_path}")
except Exception:
pass
_cleanup(tmp_dir)
return output_result
except Exception as e:
if publisher:
publisher.error("asrx", str(e))
import traceback
traceback.print_exc()
output_result = {"language": None, "segments": []}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
_cleanup(tmp_dir)
return output_result
# ── Phase 1: Full 7-step pipeline ──
tmp_dir = None
try:
# Fix for PyTorch 2.6+ compatibility
# Allow omegaconf types in torch.load
import omegaconf
# Stage 1: Audio Track Preprocessing
tmp_dir, audio_input = _shared_audio_setup(video_path)
torch.serialization.add_safe_globals(
[omegaconf.listconfig.ListConfig, omegaconf.dictconfig.DictConfig]
)
# Stage 2: SelfASRXFixed 7-step pipeline
from asrx_self.main_fixed import SelfASRXFixed
# Load model - using faster-whisper for better performance
# You can also use: "large-v3", "medium", "small", "base", "tiny"
model = whisperx.load_model("base", device="cpu", compute_type="int8")
if publisher:
publisher.info("asrx", "ASRX_LOADING_MODEL")
asrx = SelfASRXFixed()
if publisher:
publisher.info("asrx", "ASRX_TRANSCRIBING")
# Transcribe audio
result = model.transcribe(video_path, language="en")
# Align timestamps
model_a, metadata = whisperx.load_align_model(language_code=result["language"])
result = whisperx.align(
result["segments"], model_a, metadata, video_path, device="cpu"
result = asrx.process(
audio_input,
output_path=None,
file_uuid=file_uuid or None,
max_speakers=10,
quality_threshold=0.85,
checkpoint_path=checkpoint_path,
)
# Diarization (speaker segmentation)
try:
from whisperx.diarize import DiarizationPipeline
# DiarizationPipeline parameters: model_name, token, device, cache_dir
diarize_model = DiarizationPipeline(
model_name="pyannote/speaker-diarization",
token=None, # HuggingFace token (None for public models)
device="cpu",
)
diarize_segments = diarize_model(video_path)
# Assign speaker labels
result = whisperx.assign_word_speakers(diarize_segments, result)
except Exception as e:
if "error" in result:
if publisher:
publisher.info("asrx", f"Diarization skipped: {e}")
publisher.error("asrx", result["error"])
output_result = {"language": None, "segments": []}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
_cleanup(tmp_dir)
return output_result
# Build output
segments = []
for seg in result.get("segments", []):
text = seg.get("text", "").strip()
if text:
segments.append(
{
"start": seg.get("start", 0.0),
"end": seg.get("end", 0.0),
"text": text,
"speaker_id": seg.get("speaker", None),
}
)
output_result = {"language": result.get("language"), "segments": segments}
# Stage 3: Convert to Rust-expected format
output_result = _convert_result(result, output_path)
if publisher:
publisher.complete("asrx", f"{len(segments)} segments")
publisher.info("asrx", f"ASRX_COMPLETE:{len(output_result['segments'])}")
with open(output_path, "w") as f:
json.dump(output_result, f, indent=2)
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx",
f"{len(output_result['segments'])} segments")
print(f"[ASRX] Saved {len(output_result['segments'])} segments "
f"to {output_path}", file=sys.stderr)
_cleanup(tmp_dir)
return output_result
except Exception as e:
if publisher:
publisher.error("asrx", f"Error: {e}")
result = {"language": None, "segments": []}
publisher.error("asrx", str(e))
import traceback
traceback.print_exc()
output_result = {"language": None, "segments": []}
_atomic_write(output_path, output_result)
if publisher:
publisher.complete("asrx", "0 segments")
with open(output_path, "w") as f:
json.dump(result, f, indent=2)
return result
# If a checkpoint already exists (crash after Step 3 completed), keep the WAV for resume
if not os.path.exists(checkpoint_path):
_cleanup(tmp_dir)
else:
print(f"[ASRX] Checkpoint saved, keeping temp dir for resume: {tmp_dir}")
return output_result
if __name__ == "__main__":
# NOTE(review): removed and added diff lines are interleaved below; the
# first parser / process_asrx call pair appears to be the removed (old) CLI
# and the second pair the added (new) CLI — confirm against the real file.
parser = argparse.ArgumentParser(description="ASRX Speaker Diarization")
parser.add_argument("video_path", help="Path to video file")
parser.add_argument("output_path", help="Output JSON path")
parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
parser = argparse.ArgumentParser(description="ASRX Processor (Hybrid Pipeline)")
parser.add_argument("video_path", help="Path to video/audio file")
parser.add_argument("output_path", help="Path to output JSON file")
parser.add_argument("--uuid", help="UUID for Redis publishing", default="")
parser.add_argument("--file-uuid", help="File UUID for Qdrant storage", default="")
parser.add_argument("--resume", action="store_true",
help="Resume from checkpoint (skip Steps 1-3)")
args = parser.parse_args()
process_asrx(args.video_path, args.output_path, args.uuid)
# The input file is only required to exist when not resuming.
if not args.resume and not Path(args.video_path).exists():
print(f"Error: Video file not found: {args.video_path}")
sys.exit(1)
result = process_asrx(args.video_path, args.output_path, args.uuid,
args.file_uuid, resume=args.resume)
# Human-readable summary of the returned result dict.
print("\n[Summary]")
print(f" Total segments: {len(result.get('segments', []))}")
if "speaker_stats" in result:
print(f" Detected speakers: {len(result['speaker_stats'])}")
for speaker, stats in result["speaker_stats"].items():
print(f" {speaker}: {stats['count']} segments")

View File

@@ -1,584 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
ASRX Processor - AI-Driven Processor Contract Version 1.0
Compliant with AI-Driven Processor Contract v1.0
Effective Date: 2025-03-27
Features:
1. Standardized command-line interface
2. Redis progress reporting
3. Signal handling (SIGTERM, SIGINT)
4. Health check mode
5. Resource monitoring
6. Contract-compliant JSON output
7. Unified configuration
"""
import sys
import json
import os
import argparse
import signal
import time
import subprocess
import traceback
from datetime import datetime
from typing import Dict, Any
# Redis Publisher for progress reporting
try:
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
REDIS_AVAILABLE = True
except ImportError:
REDIS_AVAILABLE = False
print(
"WARNING: RedisPublisher not available, progress reporting disabled",
file=sys.stderr,
)
# Contract version
# These identifiers are copied into every result dict and health report.
CONTRACT_VERSION = "1.0"
# NOTE(review): this is an absolute script path, yet it is used as the
# processor label for Redis messages and is upper-cased in CLI output —
# confirm whether a short name such as "asrx" was intended.
PROCESSOR_NAME = (
"/Users/accusys/momentry_core_0.1/scripts/asrx_processor_contract_v1.py"
)
PROCESSOR_VERSION = "1.0.0"
MODEL_NAME = "pyannote"
MODEL_VERSION = "3.1"
# Unified configuration defaults
# Each value is the default of the matching process_asrx parameter / CLI flag.
DEFAULT_TIMEOUT = 7200 # 2 hours for speaker diarization
DEFAULT_MODEL_SIZE = "base"
DEFAULT_DEVICE = "cpu"
DEFAULT_LANGUAGE = "auto"  # "auto" is passed to whisperx as language=None
DEFAULT_BATCH_SIZE = 16
DEFAULT_DIARIZATION = True
DEFAULT_MIN_SPEAKERS = 1
DEFAULT_MAX_SPEAKERS = 10
# Signal handling with timeout support
class SignalHandler:
    """Record SIGTERM/SIGINT so a running job can shut down gracefully.

    Creating an instance installs ``handle_signal`` as the process-wide
    handler for both signals. The handler only sets flags; callers poll
    ``should_stop()`` and read ``exit_code`` (128 + signal number).
    """

    def __init__(self):
        self.should_exit = False
        self.exit_code = 0
        for sig in (signal.SIGTERM, signal.SIGINT):
            signal.signal(sig, self.handle_signal)

    def handle_signal(self, signum, frame):
        """Note that a termination signal arrived and remember its exit code."""
        print(f"\n收到信号 {signum},正在优雅关闭...")
        self.should_exit = True
        self.exit_code = 128 + signum

    def should_stop(self):
        """Return True once a termination signal has been received."""
        return self.should_exit
# Timeout manager
class TimeoutManager:
    """Track how much of a fixed time budget has been used.

    The clock starts when the instance is created; ``timer`` is initialised
    to ``None`` and not used by the methods of this class.
    """

    def __init__(self, timeout_seconds: int):
        self.timeout_seconds = timeout_seconds
        self.start_time = time.time()
        self.timer = None

    def _elapsed(self) -> float:
        """Seconds of wall-clock time since the manager was created."""
        return time.time() - self.start_time

    def check_timeout(self) -> bool:
        """Return True when more than ``timeout_seconds`` have elapsed."""
        return self._elapsed() > self.timeout_seconds

    def get_remaining_time(self) -> float:
        """Return the seconds left in the budget, never below zero."""
        return max(0, self.timeout_seconds - self._elapsed())

    def format_remaining_time(self) -> str:
        """Return the remaining time as zero-padded ``HH:MM:SS``."""
        whole_seconds = int(self.get_remaining_time())
        minutes, seconds = divmod(whole_seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
# Health check functions
def check_environment() -> Dict[str, Any]:
    """Build the health report: dependency checks plus processor metadata.

    Five entries are produced, in order: ``whisperx``, ``ffprobe``, ``redis``,
    ``python`` and ``cuda``. Each entry has ``name``, ``status`` and
    ``version``. Redis and CUDA report ``"optional"`` instead of
    ``"missing"`` when unavailable.
    """

    def entry(name, status, version=None):
        return {"name": name, "status": status, "version": version}

    checks = []

    # Check 1: whisperx for speaker diarization
    try:
        import whisperx
    except ImportError:
        checks.append(entry("whisperx", "missing"))
    else:
        checks.append(
            entry("whisperx", "available",
                  getattr(whisperx, "__version__", "unknown"))
        )

    # Check 2: FFmpeg/FFprobe
    try:
        ffprobe_result = subprocess.run(
            ["ffprobe", "-version"], capture_output=True, text=True, timeout=5
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        checks.append(entry("ffprobe", "missing"))
    else:
        if ffprobe_result.returncode == 0:
            # The first output line carries the version banner.
            banner = ffprobe_result.stdout.split("\n")[0]
            checks.append(entry("ffprobe", "available", banner))
        else:
            checks.append(entry("ffprobe", "error"))

    # Check 3: Redis (optional)
    checks.append(entry("redis", "available" if REDIS_AVAILABLE else "optional"))

    # Check 4: Python version
    py = sys.version_info
    checks.append(entry("python", "available", f"{py.major}.{py.minor}.{py.micro}"))

    # Check 5: CUDA/GPU availability (optional)
    try:
        import torch

        cuda_available = torch.cuda.is_available()
        checks.append(
            entry(
                "cuda",
                "available" if cuda_available else "optional",
                torch.version.cuda if cuda_available else None,
            )
        )
    except ImportError:
        checks.append(entry("cuda", "optional"))

    return {
        "timestamp": datetime.now().isoformat(),
        "processor_name": PROCESSOR_NAME,
        "processor_version": PROCESSOR_VERSION,
        "contract_version": CONTRACT_VERSION,
        "model_name": MODEL_NAME,
        "model_version": MODEL_VERSION,
        "checks": checks,
    }
def check_video_file(video_path: str) -> Dict[str, Any]:
    """Probe the first video stream and container of ``video_path`` with ffprobe.

    Returns ``{"valid": False, "error": ...}`` when ffprobe exits non-zero or
    anything raises. Otherwise returns ``valid=True`` together with
    ``video_info`` (codec, width, height, duration, frame_rate),
    ``format_info`` (format_duration, file_size), ``exists`` and the on-disk
    ``file_size``.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=codec_name,width,height,duration,r_frame_rate",
        "-show_entries", "format=duration,size",
        "-of", "json",
        video_path,
    ]
    try:
        probe = subprocess.run(
            probe_cmd, capture_output=True, text=True, timeout=10
        )
        if probe.returncode != 0:
            message = probe.stderr[:200] if probe.stderr else "Unknown error"
            return {"valid": False, "error": message}

        info = json.loads(probe.stdout)

        video_info = {}
        if "streams" in info and len(info["streams"]) > 0:
            first = info["streams"][0]
            video_info = {
                "codec": first.get("codec_name", "unknown"),
                "width": int(first.get("width", 0)),
                "height": int(first.get("height", 0)),
                "duration": float(first.get("duration", 0)),
                "frame_rate": first.get("r_frame_rate", "0/0"),
            }

        format_info = {}
        if "format" in info:
            container = info["format"]
            format_info = {
                "format_duration": float(container.get("duration", 0)),
                "file_size": int(container.get("size", 0)),
            }

        exists = os.path.exists(video_path)
        size_on_disk = os.path.getsize(video_path) if os.path.exists(video_path) else 0
        return {
            "valid": True,
            "video_info": video_info,
            "format_info": format_info,
            "exists": exists,
            "file_size": size_on_disk,
        }
    except Exception as e:
        return {"valid": False, "error": str(e)}
# Main processing function
def process_asrx(
    video_path: str,
    output_path: str,
    uuid: str = "",
    model_size: str = DEFAULT_MODEL_SIZE,
    device: str = DEFAULT_DEVICE,
    language: str = DEFAULT_LANGUAGE,
    batch_size: int = DEFAULT_BATCH_SIZE,
    diarization: bool = DEFAULT_DIARIZATION,
    min_speakers: int = DEFAULT_MIN_SPEAKERS,
    max_speakers: int = DEFAULT_MAX_SPEAKERS,
    timeout: int = DEFAULT_TIMEOUT,
) -> Dict[str, Any]:
    """Transcribe, align and optionally diarize a video with whisperx.

    Args:
        video_path: Input media file.
        output_path: Where the result JSON is written.
        uuid: When non-empty (and Redis is available) progress is published
            under this id.
        model_size: whisperx model name passed to ``whisperx.load_model``.
        device: ``"cpu"`` selects int8 compute, anything else float16.
        language: Language code, or ``"auto"`` to let whisperx detect it.
        batch_size: Transcription batch size.
        diarization: Whether to run speaker diarization.
        min_speakers: Lower bound passed to the diarization model.
        max_speakers: Upper bound passed to the diarization model.
        timeout: Time budget in seconds, checked between stages.

    Returns:
        The contract result dict. Processing failures do not raise; they are
        recorded in ``result["error"]`` with ``result["success"]`` False.
    """
    # Initialize
    signal_handler = SignalHandler()
    timeout_manager = TimeoutManager(timeout)
    publisher = RedisPublisher(uuid) if REDIS_AVAILABLE and uuid else None

    def publish(stage: str, message: str, data: Dict = None):
        if publisher:
            publisher.info(PROCESSOR_NAME, stage, message, data)

    def ensure_can_continue():
        """Abort between stages once the deadline passed or a signal arrived."""
        if timeout_manager.check_timeout():
            raise TimeoutError(f"超时 ({timeout} 秒)")
        if signal_handler.should_stop():
            raise KeyboardInterrupt("收到停止信号")

    publish("ASRX_START", f"开始处理: {os.path.basename(video_path)}")

    result = {
        "processor_name": PROCESSOR_NAME,
        "processor_version": PROCESSOR_VERSION,
        "contract_version": CONTRACT_VERSION,
        "model_name": MODEL_NAME,
        "model_version": MODEL_VERSION,
        "video_path": video_path,
        "output_path": output_path,
        "uuid": uuid,
        "timestamp": datetime.now().isoformat(),
        "parameters": {
            "model_size": model_size,
            "device": device,
            "language": language,
            "batch_size": batch_size,
            "diarization": diarization,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
            "timeout": timeout,
        },
        "success": False,
        "error": None,
        "segments": [],
        "speakers": [],
        "processing_time": 0,
        "resource_usage": {},
    }

    start_time = time.time()

    try:
        ensure_can_continue()

        # Check video file
        publish("ASRX_CHECK_VIDEO", "检查视频文件")
        video_check = check_video_file(video_path)
        if not video_check.get("valid", False):
            raise ValueError(f"无效的视频文件: {video_check.get('error', '未知错误')}")
        result["video_info"] = video_check.get("video_info", {})
        result["format_info"] = video_check.get("format_info", {})

        # Import whisperx
        publish("ASRX_LOAD_MODEL", f"加载模型: {model_size}")
        try:
            import whisperx
        except ImportError as e:
            raise ImportError(f"whisperx 未安装: {e}")

        # Load model
        ensure_can_continue()
        publish("ASRX_LOADING", f"加载 whisperx 模型 ({model_size}, {device})")
        model = whisperx.load_model(
            model_size,
            device=device,
            compute_type="int8" if device == "cpu" else "float16",
        )

        # Transcribe
        ensure_can_continue()
        publish("ASRX_TRANSCRIBING", "转录音频")
        transcript = model.transcribe(
            video_path,
            language=language if language != "auto" else None,
            batch_size=batch_size,
        )
        # Remember the language now: the dict produced by whisperx.align()
        # is rebuilt from the segments and may no longer carry it.
        detected_language = transcript["language"]

        # Align timestamps
        ensure_can_continue()
        publish("ASRX_ALIGNING", "对齐时间戳")
        # load_align_model needs the target device as well as the language.
        model_a, metadata = whisperx.load_align_model(
            language_code=detected_language, device=device
        )
        transcript = whisperx.align(
            transcript["segments"],
            model_a,
            metadata,
            video_path,
            device,
            return_char_alignments=False,
        )

        # Speaker diarization
        if diarization:
            ensure_can_continue()
            publish("ASRX_DIARIZATION", "说话人分离")
            # NOTE(review): the location and keyword names of
            # DiarizationPipeline differ between whisperx releases — confirm
            # against the installed version.
            diarize_model = whisperx.DiarizationPipeline(
                use_auth_token=None, device=device
            )
            diarize_segments = diarize_model(
                video_path,
                min_speakers=min_speakers,
                max_speakers=max_speakers,
            )
            transcript = whisperx.assign_word_speakers(diarize_segments, transcript)

            # Aggregate per-speaker statistics over the labelled segments.
            speakers = {}
            for segment in transcript["segments"]:
                if "speaker" not in segment:
                    continue
                speaker_id = segment["speaker"]
                stats = speakers.setdefault(
                    speaker_id,
                    {
                        "id": speaker_id,
                        "segment_count": 0,
                        "total_words": 0,
                        "total_duration": 0.0,
                    },
                )
                stats["segment_count"] += 1
                stats["total_words"] += len(segment.get("text", "").split())
                stats["total_duration"] += segment.get("end", 0) - segment.get(
                    "start", 0
                )
            result["speakers"] = list(speakers.values())

        # Format segments
        segments = [
            {
                "start": segment.get("start", 0.0),
                "end": segment.get("end", 0.0),
                "text": segment.get("text", ""),
                "speaker": segment.get("speaker", None),
                "words": segment.get("words", []),
                "confidence": segment.get("confidence", 0.0),
            }
            for segment in transcript.get("segments", [])
        ]

        result["segments"] = segments
        result["language"] = transcript.get("language", detected_language)
        result["success"] = True

        publish("ASRX_COMPLETE", f"完成: {len(segments)} 个片段")

    except TimeoutError as e:
        result["error"] = f"处理超时: {e}"
        publish("ASRX_TIMEOUT", f"超时: {e}")
    except KeyboardInterrupt:
        result["error"] = "处理被用户中断"
        publish("ASRX_INTERRUPTED", "处理被中断")
    except ImportError as e:
        result["error"] = f"依赖缺失: {e}"
        publish("ASRX_MISSING_DEPS", f"缺少依赖: {e}")
    except Exception as e:
        result["error"] = f"处理错误: {str(e)}"
        publish("ASRX_ERROR", f"错误: {str(e)}")
        traceback.print_exc()

    # Calculate processing time
    result["processing_time"] = time.time() - start_time

    # Add resource usage
    try:
        import psutil

        process = psutil.Process()
        memory_info = process.memory_info()
        cpu_times = process.cpu_times()
        result["resource_usage"] = {
            "cpu_percent": process.cpu_percent(),
            "memory_mb": memory_info.rss / (1024 * 1024),
            "user_time": cpu_times.user,
            "system_time": cpu_times.system,
        }
    except ImportError:
        result["resource_usage"] = {"error": "psutil not available"}

    # Save result
    try:
        with open(output_path, "w") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        publish("ASRX_SAVED", f"结果保存到: {output_path}")
    except Exception as e:
        # A run whose output could not be stored must not report success.
        result["success"] = False
        result["error"] = f"保存结果失败: {str(e)}"
        publish("ASRX_SAVE_ERROR", f"保存失败: {str(e)}")

    return result
def main():
    """Command-line entry point; returns the process exit code.

    Modes: ``--health-check`` prints the environment report, ``--check-video``
    prints the probe of the input file, otherwise the file is processed and a
    short summary is printed. Returns 0 on success and 1 on failure.
    """
    parser = argparse.ArgumentParser(
        description=f"{PROCESSOR_NAME.upper()} Processor v{PROCESSOR_VERSION} - Speaker Diarization"
    )
    # Declared as data and registered in order, so --help lists them the same.
    option_specs = [
        (("video_path",), {"help": "Path to input video file"}),
        (("output_path",), {"help": "Path to output JSON file"}),
        (("--uuid",), {"help": "UUID for progress tracking", "default": ""}),
        (("--model-size",), {
            "help": f"Model size (default: {DEFAULT_MODEL_SIZE})",
            "default": DEFAULT_MODEL_SIZE,
            "choices": ["tiny", "base", "small", "medium", "large-v3"],
        }),
        (("--device",), {
            "help": f"Device to use (default: {DEFAULT_DEVICE})",
            "default": DEFAULT_DEVICE,
            "choices": ["cpu", "cuda"],
        }),
        (("--language",), {
            "help": f"Language code or 'auto' (default: {DEFAULT_LANGUAGE})",
            "default": DEFAULT_LANGUAGE,
        }),
        (("--batch-size",), {
            "help": f"Batch size for processing (default: {DEFAULT_BATCH_SIZE})",
            "type": int,
            "default": DEFAULT_BATCH_SIZE,
        }),
        (("--no-diarization",), {
            "help": "Disable speaker diarization",
            "action": "store_true",
        }),
        (("--min-speakers",), {
            "help": f"Minimum number of speakers (default: {DEFAULT_MIN_SPEAKERS})",
            "type": int,
            "default": DEFAULT_MIN_SPEAKERS,
        }),
        (("--max-speakers",), {
            "help": f"Maximum number of speakers (default: {DEFAULT_MAX_SPEAKERS})",
            "type": int,
            "default": DEFAULT_MAX_SPEAKERS,
        }),
        (("--timeout",), {
            "help": f"Timeout in seconds (default: {DEFAULT_TIMEOUT})",
            "type": int,
            "default": DEFAULT_TIMEOUT,
        }),
        (("--health-check",), {
            "help": "Run health check and exit",
            "action": "store_true",
        }),
        (("--check-video",), {
            "help": "Check video file and exit",
            "action": "store_true",
        }),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    args = parser.parse_args()

    # Health check mode
    if args.health_check:
        health = check_environment()
        print(json.dumps(health, indent=2, ensure_ascii=False))
        healthy = all(
            c["status"] in ["available", "optional"] for c in health["checks"]
        )
        return 0 if healthy else 1

    # Video check mode
    if args.check_video:
        video_check = check_video_file(args.video_path)
        print(json.dumps(video_check, indent=2, ensure_ascii=False))
        return 0 if video_check.get("valid", False) else 1

    # Normal processing mode
    result = process_asrx(
        video_path=args.video_path,
        output_path=args.output_path,
        uuid=args.uuid,
        model_size=args.model_size,
        device=args.device,
        language=args.language,
        batch_size=args.batch_size,
        diarization=not args.no_diarization,
        min_speakers=args.min_speakers,
        max_speakers=args.max_speakers,
        timeout=args.timeout,
    )

    # Print result summary
    if not result.get("success", False):
        print(f"{PROCESSOR_NAME.upper()} 处理失败")
        print(f" 错误: {result.get('error', '未知错误')}")
        return 1

    print(f"{PROCESSOR_NAME.upper()} 处理成功")
    print(f" 片段数: {len(result.get('segments', []))}")
    print(f" 说话人数: {len(result.get('speakers', []))}")
    print(f" 处理时间: {result.get('processing_time', 0):.1f}")
    print(f" 输出文件: {args.output_path}")
    return 0


if __name__ == "__main__":
    sys.exit(main())

View File

@@ -1,328 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
ASRX Processor - Custom Implementation Wrapper
Uses SpeechBrain ECAPA-TDNN (no HuggingFace token required)
Pipeline:
1. Preprocess: ffprobe audio tracks → select best track → extract WAV
2. Process: VAD (Silero) → Speaker embedding (ECAPA-TDNN) → Spectral clustering
3. Output: segments with speaker_id
"""
import sys
import json
import argparse
import os
import subprocess
import tempfile
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(
0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "asrx_self")
)
from redis_publisher import RedisPublisher
def probe_audio_tracks(video_path: str) -> list:
    """Use ffprobe to list all audio tracks in the video file.

    Args:
        video_path: Path of the video/audio file to inspect.

    Returns:
        One dict per audio stream with the keys ``index``, ``codec``,
        ``language`` (``"und"`` when untagged), ``channels`` and
        ``sample_rate``. An empty list is returned when ffprobe cannot be
        run, times out, or prints output that is not valid JSON.
    """
    cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-select_streams", "a", video_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        data = json.loads(result.stdout)
        tracks = []
        for stream in data.get("streams", []):
            # "tags" may be absent or null; both mean "no language tag".
            tags = stream.get("tags") or {}
            tracks.append({
                "index": stream.get("index"),
                "codec": stream.get("codec_name"),
                "language": tags.get("language", "und"),
                # A missing/null channel count becomes 0 so that channel
                # counts stay numerically comparable.
                "channels": stream.get("channels") or 0,
                "sample_rate": stream.get("sample_rate", "0"),
            })
        return tracks
    except Exception as e:
        # Best-effort probe; the message goes to stderr like the other
        # diagnostics of this script.
        print(f"[ASRX] ffprobe failed: {e}", file=sys.stderr)
        return []
def select_best_track(tracks: list) -> int:
    """Choose an audio track and return its position within ``tracks``.

    The first English-tagged track (``eng`` or ``en``) wins; otherwise the
    first track with the most channels is chosen. An empty list yields 0.
    The choice is reported on stdout.
    """
    if not tracks:
        return 0

    # Priority 1: English track
    for pos, track in enumerate(tracks):
        if track["language"] in ("eng", "en"):
            print(f"[ASRX] Selected English track (index {track['index']})")
            return pos

    # Priority 2: most channels; max() keeps the earliest candidate on ties.
    best = max(range(len(tracks)), key=lambda pos: tracks[pos]["channels"])
    chosen = tracks[best]
    print(f"[ASRX] Selected track {best} (lang={chosen['language']}, ch={chosen['channels']})")
    return best
def extract_audio_to_wav(video_path: str, track_index: int, output_wav: str) -> bool:
    """Run ffmpeg to write the selected track as a 16 kHz mono s16 WAV.

    ``track_index`` is used as ``-map 0:<track_index>``. Returns True when
    ffmpeg succeeds, False on any error (including the 300-second timeout).
    """
    input_args = ["-i", video_path, "-map", f"0:{track_index}"]
    format_args = ["-ar", "16000", "-ac", "1", "-sample_fmt", "s16"]
    command = ["ffmpeg", "-y", "-v", "quiet", *input_args, *format_args, output_wav]
    try:
        subprocess.run(command, check=True, capture_output=True, timeout=300)
    except Exception as e:
        print(f"[ASRX] ffmpeg extraction failed: {e}")
        return False
    return True
def _cleanup(tmp_dir):
"""Clean up temporary directory."""
if tmp_dir and os.path.exists(tmp_dir):
import shutil
shutil.rmtree(tmp_dir, ignore_errors=True)
def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
"""Process video for speaker diarization using custom implementation"""
publisher = RedisPublisher(uuid) if uuid else None
if publisher:
publisher.info("asrx", "ASRX_START")
tmp_dir = None
try:
# Ensure working directory is the scripts dir for model loading
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)
# Debug: check ffmpeg availability
import shutil
ffmpeg_path = shutil.which("ffmpeg")
print(f"[ASRX] ffmpeg: {ffmpeg_path}", file=sys.stderr)
print(f"[ASRX] CWD: {os.getcwd()}", file=sys.stderr)
# ---- Stage 1: Audio Track Preprocessing ----
print("\n[ASRX] ===== Stage 1: Audio Track Analysis =====", file=sys.stderr)
print(f"[ASRX] Input: {video_path}", file=sys.stderr)
tracks = probe_audio_tracks(video_path)
if tracks:
print(f"[ASRX] Found {len(tracks)} audio track(s):", file=sys.stderr)
for t in tracks:
print(f" Track {t['index']}: {t['codec']} {t['channels']}ch {t['sample_rate']}Hz lang={t['language']}", file=sys.stderr)
else:
print("[ASRX] No audio tracks found via ffprobe, using raw file", file=sys.stderr)
# Select best track
track_idx = select_best_track(tracks) if tracks else 0
actual_track_index = tracks[track_idx]["index"] if tracks else track_idx
# Extract audio to WAV
tmp_dir = tempfile.mkdtemp(prefix="asrx_")
wav_path = os.path.join(tmp_dir, "audio.wav")
if extract_audio_to_wav(video_path, actual_track_index, wav_path):
wav_size = os.path.getsize(wav_path)
print(f"[ASRX] Audio extracted: {wav_path} ({wav_size / 1024 / 1024:.1f}MB)", file=sys.stderr)
audio_input = wav_path
else:
print("[ASRX] Audio extraction failed, falling back to original file", file=sys.stderr)
audio_input = video_path
# ---- Stage 2: Load ASR segments for time alignment ----
# Try multiple paths to find ASR JSON
asr_segments = []
asr_fallback_reason = ""
asr_candidates = [
output_path.replace(".asrx.json", ".asr.json") if output_path else "",
os.path.join(os.path.dirname(output_path) if output_path else ".", os.path.basename(video_path).rsplit(".", 1)[0] + ".asr.json"),
os.path.join(os.path.dirname(output_path) if output_path else ".", "dd61fda85fee441fdd00ab5528213ff7.asr.json"),
]
asr_path = ""
for candidate in asr_candidates:
if candidate and os.path.exists(candidate):
asr_path = candidate
break
if asr_path:
try:
with open(asr_path) as f:
asr_data = json.load(f)
asr_segments = asr_data.get("segments", [])
print(f"[ASRX] Loaded {len(asr_segments)} ASR segments from {asr_path}", file=sys.stderr)
asr_fallback_reason = f"loaded_{len(asr_segments)}_segments"
except Exception as e:
asr_fallback_reason = f"load_error_{e}"
print(f"[ASRX] Failed to load ASR segments: {e}", file=sys.stderr)
else:
asr_fallback_reason = f"asr_json_not_found_tried_{len(asr_candidates)}_paths"
print(f"[ASRX] ASR output not found, tried {len(asr_candidates)} paths. First candidate: {asr_candidates[0]}", file=sys.stderr)
# ---- Stage 3: ASRX Processing ----
from asrx_self.main_fixed import SelfASRXFixed
if publisher:
publisher.info("asrx", "ASRX_LOADING_MODEL")
asrx = SelfASRXFixed()
if publisher:
publisher.info("asrx", "ASRX_TRANSCRIBING")
if asr_segments:
# Use ASR segment boundaries for speaker embedding extraction
print(f"[ASRX] Using {len(asr_segments)} ASR segments for diarization", file=sys.stderr)
result = asrx.process_with_segments(
audio_input,
asr_segments,
output_path=None,
)
else:
# Fallback: VAD-based diarization
result = asrx.process(
audio_input,
output_path=None,
min_speech_duration_ms=500,
max_speakers=10,
)
if "error" in result:
if publisher:
publisher.error("asrx", result["error"])
# Return empty result
output_result = {"language": None, "segments": []}
with open(output_path, "w") as f:
json.dump(output_result, f, indent=2)
if publisher:
publisher.complete("asrx", "0 segments")
_cleanup(tmp_dir)
return output_result
# Convert to Rust-expected format (start_frame/end_frame/speaker)
# Read fps from probe json ({file_uuid}.probe.json)
_debug = {"asr_fallback": asr_fallback_reason, "asr_path": asr_path}
fps = 30.0
output_dir = os.path.dirname(output_path) if output_path else "."
base_name = os.path.basename(output_path) if output_path else ""
# Extract uuid from {uuid}.{type}.json format
uuid_part = base_name.split(".")[0] if base_name else ""
probe_candidates = [
os.path.join(output_dir, f"{uuid_part}.probe.json"),
]
for p in probe_candidates:
if os.path.exists(p):
try:
with open(p) as pf:
probe_data = json.load(pf)
if "fps" in probe_data:
fps = float(probe_data["fps"])
print(f"[ASRX] FPS from probe: {fps}", file=sys.stderr)
break
except:
pass
output_result = {
"language": None,
"segments": [],
}
# Convert segments
for seg in result["segments"]:
start_sec = seg["start"]
end_sec = seg["end"]
output_result["segments"].append(
{
"start_time": start_sec,
"end_time": end_sec,
"start_frame": int(start_sec * fps),
"end_frame": int(end_sec * fps),
"text": "",
"speaker_id": seg["speaker"],
}
)
# Add speaker_stats as optional metadata
if "speaker_stats" in result:
output_result["speaker_stats"] = result["speaker_stats"]
# 傳遞 embeddings每個 segment 對應的 192-D speaker embedding
if "embeddings" in result:
output_result["embeddings"] = result["embeddings"]
if publisher:
publisher.info("asrx", f"ASRX_COMPLETE:{len(output_result['segments'])}")
# Save output
output_result["_debug"] = _debug
with open(output_path, "w") as f:
json.dump(output_result, f, indent=2)
if publisher:
publisher.complete("asrx", f"{len(output_result['segments'])} segments")
print(f"[ASRX-Custom] Saved {len(output_result['segments'])} segments to {output_path}", file=sys.stderr)
_cleanup(tmp_dir)
return output_result
except Exception as e:
if publisher:
publisher.error("asrx", str(e))
import traceback
traceback.print_exc()
# Return empty result on error
output_result = {"language": None, "segments": []}
with open(output_path, "w") as f:
json.dump(output_result, f, indent=2)
if publisher:
publisher.complete("asrx", "0 segments")
_cleanup(tmp_dir)
return output_result
if __name__ == "__main__":
    # CLI entry point: validate the input path, run the ASRX pipeline once,
    # then print a short summary of what was produced.
    parser = argparse.ArgumentParser(
        description="ASRX Processor (Custom Implementation)"
    )
    parser.add_argument("video_path", help="Path to video/audio file")
    parser.add_argument("output_path", help="Path to output JSON file")
    parser.add_argument("--uuid", default="", help="UUID for Redis publishing")
    args = parser.parse_args()

    input_exists = Path(args.video_path).exists()
    if not input_exists:
        print(f"Error: Video file not found: {args.video_path}")
        sys.exit(1)

    result = process_asrx_custom(
        args.video_path,
        args.output_path,
        args.uuid,
    )

    segment_total = len(result['segments'])
    print("\n[Summary]")
    print(f" Total segments: {segment_total}")

    # speaker_stats is optional in the result, so only report it when present.
    if "speaker_stats" in result:
        per_speaker = result["speaker_stats"]
        print(f" Detected speakers: {len(per_speaker)}")
        for speaker, stats in per_speaker.items():
            print(f" {speaker}: {stats['count']} segments")

View File

@@ -1,177 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
ASRX 處理器 - 簡化版
先做轉錄,說話人分離可選
修復 PyTorch 2.6 兼容性問題
"""
# Fix for PyTorch 2.6+ compatibility - MUST be set before importing torch
import os
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
import sys
import json
import argparse
import signal
import subprocess
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
def signal_handler(signum, frame):
    """Announce the received signal number and terminate with exit status 1.

    Takes the (signum, frame) pair of a signal handler; ``frame`` is unused.
    """
    print("ASRX: Received signal {}, exiting...".format(signum))
    raise SystemExit(1)
def has_audio_stream(video_path):
    """Report whether ffprobe lists at least one audio stream in *video_path*.

    Returns False when ffprobe exits with a non-zero status. When the ffprobe
    executable cannot be found, prints a warning and returns True.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "a",
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        video_path,
    ]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        print("WARNING: ffprobe not found, assuming audio exists")
        return True
    # Any non-blank output means at least one audio stream line was printed.
    return probe.stdout.strip() != ""
def _write_result(output_path, payload, publisher, summary, ensure_ascii=True):
    """Write *payload* as JSON to *output_path*, then publish the completion summary."""
    # Explicit UTF-8 so non-ASCII transcript text does not depend on the locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=ensure_ascii)
    # Publish only after the file exists, so a listener reacting to the
    # completion message never finds the output missing.
    if publisher:
        publisher.complete("asrx", summary)


def process_asrx(video_path: str, output_path: str, uuid: str = "", skip_diarization: bool = True):
    """
    Transcribe a video with the whisperx "base" model (CPU, int8) and write JSON.

    No diarization code runs in this function: every segment is written with
    ``speaker_id`` set to None. The function never returns normally; it ends
    the process with ``sys.exit`` (0 on success or when no audio stream is
    found, 1 on a missing dependency or any processing error).

    Args:
        video_path: Path to video file
        output_path: Path to output JSON
        uuid: UUID for Redis progress; when empty, nothing is published
        skip_diarization: Only affects the ``diarization_enabled`` flag in the
            output, which is set to ``not skip_diarization``
    """
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    try:
        import whisperx
        import torch  # noqa: F401 - not used below; presumably kept as a dependency check
    except ImportError as e:
        if publisher:
            publisher.error("asrx", f"Missing dependency: {e}")
        _write_result(output_path, {"language": None, "segments": []}, publisher, "0 segments")
        sys.exit(1)

    # Check for audio stream
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asrx", "No audio stream detected, skipping transcription")
        _write_result(
            output_path,
            {"language": "", "language_probability": 0.0, "segments": []},
            publisher,
            "0 segments (no audio)",
        )
        sys.stderr.write("ASRX: No audio stream, skipping transcription\n")
        sys.stderr.flush()
        sys.exit(0)

    if publisher:
        publisher.info("asrx", "ASRX_LOADING_MODEL")

    try:
        # Load model
        if publisher:
            publisher.info("asrx", "Loading whisperx base model (this may take a while)...")
        model = whisperx.load_model("base", device="cpu", compute_type="int8")

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        # Transcribe with language detection
        result = model.transcribe(video_path)

        if publisher:
            publisher.info("asrx", f"ASRX_LANGUAGE:{result.get('language', 'unknown')}")

        # Build output (without diarization for now); blank-text segments are dropped.
        segments = []
        for seg in result.get("segments", []):
            text = seg.get("text", "").strip()
            if text:
                segments.append(
                    {
                        "start": seg.get("start", 0.0),
                        "end": seg.get("end", 0.0),
                        "text": text,
                        "speaker_id": None,  # Will be added when diarization is enabled
                    }
                )

        output_result = {
            "language": result.get("language"),
            "language_probability": result.get("language_probability", 0),
            "segments": segments,
            # NOTE(review): reports True when skip_diarization is False even
            # though no diarization ran above - confirm consumers expect this.
            "diarization_enabled": not skip_diarization
        }

        _write_result(
            output_path,
            output_result,
            publisher,
            f"{len(segments)} segments",
            ensure_ascii=False,
        )

        sys.stderr.write(
            f"ASRX: Transcription complete, {len(segments)} segments written to {output_path}\n"
        )
        sys.stderr.flush()
        # SystemExit is not an Exception subclass, so the handler below does not catch it.
        sys.exit(0)

    except Exception as e:
        if publisher:
            publisher.error("asrx", f"Error: {e}")
        import traceback

        traceback.print_exc()
        _write_result(
            output_path,
            {"language": None, "segments": [], "error": str(e)},
            publisher,
            "0 segments (error)",
        )
        sys.exit(1)
if __name__ == "__main__":
    # CLI entry point: parse the arguments and hand them to process_asrx,
    # which terminates the process itself.
    parser = argparse.ArgumentParser(description="ASRX Speaker Diarization (Simplified)")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", default="", help="UUID for Redis progress")
    parser.add_argument(
        "--skip-diarization",
        help="Skip speaker diarization (only transcription)",
        action="store_true",
    )
    args = parser.parse_args()

    process_asrx(
        video_path=args.video_path,
        output_path=args.output_path,
        uuid=args.uuid,
        skip_diarization=args.skip_diarization,
    )

View File

@@ -1,212 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
ASRX 處理器 v2 - 說話人分離
使用 whisperx 進行轉錄和說話人分離
需要 PyTorch 2.5.0 + torchvision 0.20.0 + torchaudio 2.5.0
"""
# Fix for PyTorch 2.5 compatibility
import os
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
import sys
import json
import argparse
import signal
import subprocess
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
def signal_handler(signum, frame):
    """Announce the received signal number and terminate with exit status 1.

    Takes the (signum, frame) pair of a signal handler; ``frame`` is unused.
    """
    print("ASRX: Received signal {}, exiting...".format(signum))
    raise SystemExit(1)
def has_audio_stream(video_path):
    """Report whether ffprobe lists at least one audio stream in *video_path*.

    Returns False when ffprobe exits with a non-zero status. When the ffprobe
    executable cannot be found, prints a warning and returns True.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "a",
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        video_path,
    ]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        print("WARNING: ffprobe not found, assuming audio exists")
        return True
    # Any non-blank output means at least one audio stream line was printed.
    return probe.stdout.strip() != ""
def _write_result(output_path, payload, publisher, summary, ensure_ascii=True):
    """Write *payload* as JSON to *output_path*, then publish the completion summary."""
    # Explicit UTF-8 so non-ASCII transcript text does not depend on the locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=ensure_ascii)
    # Publish only after the file exists, so a listener reacting to the
    # completion message never finds the output missing.
    if publisher:
        publisher.complete("asrx", summary)


def process_asrx(video_path: str, output_path: str, uuid: str = "", skip_diarization: bool = False):
    """
    Transcribe, align and (optionally) diarize a video with whisperx, writing JSON.

    Uses the whisperx "base" model on CPU with int8 compute. A diarization
    failure is logged and otherwise ignored, so segments may carry a
    ``speaker_id`` of None. The function never returns normally; it ends the
    process with ``sys.exit`` (0 on success or when no audio stream is found,
    1 on a missing dependency or any other processing error).

    Args:
        video_path: Path to video file
        output_path: Path to output JSON
        uuid: UUID for Redis progress; when empty, nothing is published
        skip_diarization: Skip speaker diarization (only transcription)
    """
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    # Check for audio stream
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asrx", "No audio stream detected, skipping transcription")
        _write_result(
            output_path,
            {"language": "", "language_probability": 0.0, "segments": []},
            publisher,
            "0 segments (no audio)",
        )
        sys.stderr.write("ASRX: No audio stream, skipping transcription\n")
        sys.stderr.flush()
        sys.exit(0)

    if publisher:
        publisher.info("asrx", "ASRX_LOADING_MODEL")

    try:
        import whisperx
        import torch  # noqa: F401 - not used below; presumably kept as a dependency check
    except ImportError as e:
        if publisher:
            publisher.error("asrx", f"Missing dependency: {e}")
        _write_result(
            output_path,
            {"language": None, "segments": [], "error": str(e)},
            publisher,
            "0 segments",
        )
        sys.exit(1)

    try:
        # Load model
        if publisher:
            publisher.info("asrx", "Loading whisperx base model (this may take a while)...")
        model = whisperx.load_model("base", device="cpu", compute_type="int8")

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        # Transcribe with language detection
        result = model.transcribe(video_path)

        # `result` is rebound to the aligned output below. Keep the detected
        # language here so it is not lost if that output omits it.
        detected_language = result.get("language")
        detected_probability = result.get("language_probability", 0)

        if publisher:
            publisher.info("asrx", f"ASRX_LANGUAGE:{result.get('language', 'unknown')}")

        # Align timestamps
        if publisher:
            publisher.info("asrx", "ASRX_ALIGNING_TIMESTAMPS")
        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"],
            device="cpu"
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            video_path,
            device="cpu"
        )

        # Diarization (speaker segmentation)
        if not skip_diarization:
            if publisher:
                publisher.info("asrx", "ASRX_DIARIZATION")
            try:
                diarize_model = whisperx.DiarizationPipeline(use_auth_token=None)
                diarize_segments = diarize_model(video_path)
                # Assign speaker labels
                result = whisperx.assign_word_speakers(diarize_segments, result)
                if publisher:
                    publisher.info("asrx", "Diarization completed")
            except Exception as e:
                # Best effort: keep the transcription when diarization fails.
                if publisher:
                    publisher.info("asrx", f"Diarization skipped: {e}")
                sys.stderr.write(f"ASRX: Diarization failed: {e}\n")

        # Build output; blank-text segments are dropped.
        segments = []
        for seg in result.get("segments", []):
            text = seg.get("text", "").strip()
            if text:
                segments.append(
                    {
                        "start": seg.get("start", 0.0),
                        "end": seg.get("end", 0.0),
                        "text": text,
                        "speaker_id": seg.get("speaker", None),
                    }
                )

        output_result = {
            "language": result.get("language") or detected_language,
            "language_probability": result.get("language_probability", detected_probability),
            "segments": segments,
            "diarization_enabled": not skip_diarization
        }

        _write_result(
            output_path,
            output_result,
            publisher,
            f"{len(segments)} segments",
            ensure_ascii=False,
        )

        sys.stderr.write(
            f"ASRX: Transcription complete, {len(segments)} segments written to {output_path}\n"
        )
        sys.stderr.flush()
        # SystemExit is not an Exception subclass, so the handler below does not catch it.
        sys.exit(0)

    except Exception as e:
        if publisher:
            publisher.error("asrx", f"Error: {e}")
        import traceback

        traceback.print_exc()
        _write_result(
            output_path,
            {"language": None, "segments": [], "error": str(e)},
            publisher,
            "0 segments (error)",
        )
        sys.exit(1)
if __name__ == "__main__":
    # CLI entry point: parse the arguments and hand them to process_asrx,
    # which terminates the process itself.
    parser = argparse.ArgumentParser(description="ASRX Speaker Diarization v2")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", default="", help="UUID for Redis progress")
    parser.add_argument(
        "--skip-diarization",
        help="Skip speaker diarization (only transcription)",
        action="store_true",
    )
    args = parser.parse_args()

    process_asrx(
        video_path=args.video_path,
        output_path=args.output_path,
        uuid=args.uuid,
        skip_diarization=args.skip_diarization,
    )

View File

@@ -1,184 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
ASRX 處理器 v2 - 快速版(跳過對齊)
使用 whisperx 進行轉錄和說話人分離
跳過時間戳對齊以避開 PyTorch 版本問題
"""
import os
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
import sys
import json
import argparse
import signal
import subprocess
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
def signal_handler(signum, frame):
    """Announce the received signal number and terminate with exit status 1.

    Takes the (signum, frame) pair of a signal handler; ``frame`` is unused.
    """
    print("ASRX: Received signal {}, exiting...".format(signum))
    raise SystemExit(1)
def has_audio_stream(video_path):
    """Report whether ffprobe lists at least one audio stream in *video_path*.

    Returns False when ffprobe exits with a non-zero status. When the ffprobe
    executable cannot be found, prints a warning and returns True.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "a",
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        video_path,
    ]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        print("WARNING: ffprobe not found, assuming audio exists")
        return True
    # Any non-blank output means at least one audio stream line was printed.
    return probe.stdout.strip() != ""
def _write_result(output_path, payload, publisher, summary, ensure_ascii=True):
    """Write *payload* as JSON to *output_path*, then publish the completion summary."""
    # Explicit UTF-8 so non-ASCII transcript text does not depend on the locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=ensure_ascii)
    # Publish only after the file exists, so a listener reacting to the
    # completion message never finds the output missing.
    if publisher:
        publisher.complete("asrx", summary)


def process_asrx(video_path: str, output_path: str, uuid: str = ""):
    """
    Transcribe and diarize a video with whisperx (no alignment), writing JSON.

    Uses the whisperx "base" model on CPU with int8 compute. Timestamp
    alignment is skipped. A diarization failure is logged and otherwise
    ignored, so segments may carry a ``speaker_id`` of None. The function
    never returns normally; it ends the process with ``sys.exit`` (0 on
    success or when no audio stream is found, 1 on a missing dependency or
    any other processing error).

    Args:
        video_path: Path to video file
        output_path: Path to output JSON
        uuid: UUID for Redis progress; when empty, nothing is published
    """
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    # Check for audio stream
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asrx", "No audio stream detected")
        _write_result(
            output_path,
            {"language": "", "language_probability": 0.0, "segments": []},
            publisher,
            "0 segments (no audio)",
        )
        sys.exit(0)

    if publisher:
        publisher.info("asrx", "ASRX_LOADING_MODEL")

    try:
        import whisperx
        import torch  # noqa: F401 - not used below; presumably kept as a dependency check
    except ImportError as e:
        if publisher:
            publisher.error("asrx", f"Missing dependency: {e}")
        _write_result(
            output_path,
            {"language": None, "segments": [], "error": str(e)},
            publisher,
            "0 segments",
        )
        sys.exit(1)

    try:
        # Load model
        if publisher:
            publisher.info("asrx", "Loading whisperx base model...")
        model = whisperx.load_model("base", device="cpu", compute_type="int8")

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        # Transcribe with language detection
        result = model.transcribe(video_path)

        if publisher:
            publisher.info("asrx", f"ASRX_LANGUAGE:{result.get('language', 'unknown')}")

        # Skip alignment (requires PyTorch 2.6+)
        # Go directly to diarization
        if publisher:
            publisher.info("asrx", "ASRX_DIARIZATION")
        try:
            diarize_model = whisperx.DiarizationPipeline(use_auth_token=None)
            diarize_segments = diarize_model(video_path)
            # Assign speaker labels
            result = whisperx.assign_word_speakers(diarize_segments, result)
            if publisher:
                publisher.info("asrx", "Diarization completed")
        except Exception as e:
            # Best effort: keep the transcription when diarization fails.
            if publisher:
                publisher.info("asrx", f"Diarization info: {e}")
            sys.stderr.write(f"ASRX: Diarization note: {e}\n")

        # Build output; blank-text segments are dropped.
        segments = []
        for seg in result.get("segments", []):
            text = seg.get("text", "").strip()
            if text:
                segments.append(
                    {
                        "start": seg.get("start", 0.0),
                        "end": seg.get("end", 0.0),
                        "text": text,
                        "speaker_id": seg.get("speaker", None),
                    }
                )

        output_result = {
            "language": result.get("language"),
            "language_probability": result.get("language_probability", 0),
            "segments": segments,
            "diarization_enabled": True,
            "alignment_enabled": False,
            "note": "Alignment skipped due to PyTorch version compatibility"
        }

        _write_result(
            output_path,
            output_result,
            publisher,
            f"{len(segments)} segments",
            ensure_ascii=False,
        )

        sys.stderr.write(
            f"ASRX: Transcription complete, {len(segments)} segments written to {output_path}\n"
        )
        sys.stderr.flush()
        # SystemExit is not an Exception subclass, so the handler below does not catch it.
        sys.exit(0)

    except Exception as e:
        if publisher:
            publisher.error("asrx", f"Error: {e}")
        import traceback

        traceback.print_exc()
        _write_result(
            output_path,
            {"language": None, "segments": [], "error": str(e)},
            publisher,
            "0 segments (error)",
        )
        sys.exit(1)
if __name__ == "__main__":
    # CLI entry point: parse the arguments and hand them to process_asrx,
    # which terminates the process itself.
    parser = argparse.ArgumentParser(description="ASRX Speaker Diarization v2 (No Alignment)")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", default="", help="UUID for Redis progress")
    args = parser.parse_args()

    process_asrx(
        video_path=args.video_path,
        output_path=args.output_path,
        uuid=args.uuid,
    )

View File

@@ -1,165 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
ASRX 處理器 v2 - 轉錄版
使用 whisperx 進行轉錄(不含說話人分離)
說話人分離需要額外安裝 pyannote.audio 並配置 HuggingFace token
"""
import os
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
import sys
import json
import argparse
import signal
import subprocess
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
def signal_handler(signum, frame):
    """Announce the received signal number and terminate with exit status 1.

    Takes the (signum, frame) pair of a signal handler; ``frame`` is unused.
    """
    print("ASRX: Received signal {}, exiting...".format(signum))
    raise SystemExit(1)
def has_audio_stream(video_path):
    """Report whether ffprobe lists at least one audio stream in *video_path*.

    Returns False when ffprobe exits with a non-zero status. When the ffprobe
    executable cannot be found, prints a warning and returns True.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "a",
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        video_path,
    ]
    try:
        probe = subprocess.run(probe_cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        print("WARNING: ffprobe not found, assuming audio exists")
        return True
    # Any non-blank output means at least one audio stream line was printed.
    return probe.stdout.strip() != ""
def _write_result(output_path, payload, publisher, summary, ensure_ascii=True):
    """Write *payload* as JSON to *output_path*, then publish the completion summary."""
    # Explicit UTF-8 so non-ASCII transcript text does not depend on the locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=ensure_ascii)
    # Publish only after the file exists, so a listener reacting to the
    # completion message never finds the output missing.
    if publisher:
        publisher.complete("asrx", summary)


def process_asrx(video_path: str, output_path: str, uuid: str = ""):
    """
    Transcribe a video with the whisperx "base" model (CPU, int8) and write JSON.

    Neither alignment nor diarization is performed: every segment is written
    with ``speaker_id`` set to None. The function never returns normally; it
    ends the process with ``sys.exit`` (0 on success or when no audio stream
    is found, 1 on a missing dependency or any processing error).

    Args:
        video_path: Path to video file
        output_path: Path to output JSON
        uuid: UUID for Redis progress; when empty, nothing is published
    """
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    # Check for audio stream
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asrx", "No audio stream detected")
        _write_result(
            output_path,
            {"language": "", "language_probability": 0.0, "segments": []},
            publisher,
            "0 segments (no audio)",
        )
        sys.exit(0)

    if publisher:
        publisher.info("asrx", "ASRX_LOADING_MODEL")

    try:
        import whisperx
        import torch  # noqa: F401 - not used below; presumably kept as a dependency check
    except ImportError as e:
        if publisher:
            publisher.error("asrx", f"Missing dependency: {e}")
        _write_result(
            output_path,
            {"language": None, "segments": [], "error": str(e)},
            publisher,
            "0 segments",
        )
        sys.exit(1)

    try:
        # Load model
        if publisher:
            publisher.info("asrx", "Loading whisperx base model...")
        model = whisperx.load_model("base", device="cpu", compute_type="int8")

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        # Transcribe with language detection
        result = model.transcribe(video_path)

        if publisher:
            publisher.info("asrx", f"ASRX_LANGUAGE:{result.get('language', 'unknown')}")

        # Build output (without alignment and diarization due to PyTorch version);
        # blank-text segments are dropped.
        segments = []
        for seg in result.get("segments", []):
            text = seg.get("text", "").strip()
            if text:
                segments.append(
                    {
                        "start": seg.get("start", 0.0),
                        "end": seg.get("end", 0.0),
                        "text": text,
                        "speaker_id": None,  # Requires pyannote.audio + HuggingFace token
                    }
                )

        output_result = {
            "language": result.get("language"),
            "language_probability": result.get("language_probability", 0),
            "segments": segments,
            "diarization_enabled": False,
            "alignment_enabled": False,
            "note": "PyTorch 2.5.0 compatibility - alignment and diarization require additional setup"
        }

        _write_result(
            output_path,
            output_result,
            publisher,
            f"{len(segments)} segments",
            ensure_ascii=False,
        )

        sys.stderr.write(
            f"ASRX: Transcription complete, {len(segments)} segments written to {output_path}\n"
        )
        sys.stderr.flush()
        # SystemExit is not an Exception subclass, so the handler below does not catch it.
        sys.exit(0)

    except Exception as e:
        if publisher:
            publisher.error("asrx", f"Error: {e}")
        import traceback

        traceback.print_exc()
        _write_result(
            output_path,
            {"language": None, "segments": [], "error": str(e)},
            publisher,
            "0 segments (error)",
        )
        sys.exit(1)
if __name__ == "__main__":
    # CLI entry point: parse the arguments and hand them to process_asrx,
    # which terminates the process itself.
    parser = argparse.ArgumentParser(description="ASRX Transcription (PyTorch 2.5.0)")
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", default="", help="UUID for Redis progress")
    args = parser.parse_args()

    process_asrx(
        video_path=args.video_path,
        output_path=args.output_path,
        uuid=args.uuid,
    )

View File

@@ -1,178 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
整合 Face + ASRX 說話人分離(版本 3 - 修復 face_detected 檢查)
"""
import json
import argparse
from pathlib import Path
from typing import Dict, List
def load_json(path: str):
    """Read the UTF-8 encoded JSON file at *path* and return the parsed value."""
    with open(path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def match_face_with_speaker_v3(face_data: Dict, asrx_data: Dict,
                               time_threshold: float = 3.0) -> List[Dict]:
    """
    Match each ASRX speaker segment with the nearest frame that contains a face.

    A frame is a candidate for a segment when its timestamp lies within
    ``[start - time_threshold, end + time_threshold]`` and its ``faces`` list
    is non-empty (the face data has no ``face_detected`` field, so the list
    itself is the test). Among candidates, the frame closest to the segment
    midpoint wins; ties go to the earlier timestamp.

    Args:
        face_data: Dict with a ``frames`` list; each frame is read for
            ``timestamp`` (default 0) and ``faces`` (default empty).
        asrx_data: Dict with a ``segments`` list; each segment must have
            ``start``, ``end`` and ``speaker``, and may have ``duration``.
        time_threshold: Seconds by which the segment window is widened on
            both sides.

    Returns:
        One dict per segment with start, end, duration, speaker, has_face,
        face_timestamp, face_location (first face of the chosen frame) and
        face_count_in_range.

    Raises:
        KeyError: If a segment lacks ``start``, ``end`` or ``speaker``.
    """
    # Local import keeps this block self-contained.
    from bisect import bisect_left, bisect_right

    face_frames = face_data.get('frames', [])
    asrx_segments = asrx_data.get('segments', [])

    # Sort frames by time once. The parallel timestamp list lets every segment
    # find its window by binary search instead of rescanning all frames.
    face_frames_sorted = sorted(face_frames, key=lambda x: x.get('timestamp', 0))
    frame_times = [frame.get('timestamp', 0) for frame in face_frames_sorted]

    print(f" Face frames: {len(face_frames_sorted)}")
    print(f" ASRX segments: {len(asrx_segments)}")

    integrated = []
    for i, seg in enumerate(asrx_segments):
        start = seg['start']
        end = seg['end']
        speaker = seg['speaker']
        mid_time = (start + end) / 2

        # Inclusive window: start - threshold <= ts <= end + threshold.
        lo = bisect_left(frame_times, start - time_threshold)
        hi = bisect_right(frame_times, end + time_threshold)

        # Keep only frames that actually contain at least one face.
        faces_in_range = [
            {
                'timestamp': ts,
                'faces': frame.get('faces', []),
                'distance_from_mid': abs(ts - mid_time),
            }
            for ts, frame in zip(frame_times[lo:hi], face_frames_sorted[lo:hi])
            if frame.get('faces', [])
        ]

        # min() returns the first minimal element, so ties go to the earlier
        # timestamp, as a stable sort would.
        best_face = (
            min(faces_in_range, key=lambda x: x['distance_from_mid'])
            if faces_in_range else None
        )

        integrated.append({
            'start': start,
            'end': end,
            'duration': seg.get('duration', end - start),
            'speaker': speaker,
            'has_face': best_face is not None,
            'face_timestamp': best_face['timestamp'] if best_face else None,
            'face_location': best_face['faces'][0] if best_face and best_face['faces'] else None,
            'face_count_in_range': len(faces_in_range)
        })

        # Progress output every 200 segments
        if (i + 1) % 200 == 0:
            print(f" Processed {i+1}/{len(asrx_segments)} segments...")

    return integrated
def analyze_speaker_face(integrated: List[Dict]):
    """Aggregate per-speaker segment counts, face coverage and total duration.

    Returns a dict keyed by speaker label; each value holds total_segments,
    with_face, without_face and total_duration.
    """
    totals = {}
    for entry in integrated:
        bucket = totals.setdefault(entry['speaker'], {
            'total_segments': 0,
            'with_face': 0,
            'without_face': 0,
            'total_duration': 0
        })
        bucket['total_segments'] += 1
        bucket['total_duration'] += entry['duration']
        # Exactly one of the two face counters advances per segment.
        counter_key = 'with_face' if entry['has_face'] else 'without_face'
        bucket[counter_key] += 1
    return totals
def main():
    """CLI entry point: integrate face-detection and ASRX speaker results.

    Loads both JSON files, matches faces to speaker segments, prints
    per-speaker and overall statistics, and writes the integrated result to
    ``--output`` when given.

    Returns:
        Tuple ``(integrated, speaker_stats)``.
    """
    parser = argparse.ArgumentParser(description='整合 Face + ASRX 說話人')
    parser.add_argument('face_json', help='Face 檢測結果 JSON')
    parser.add_argument('asrx_json', help='ASRX 說話人分離 JSON')
    parser.add_argument('-o', '--output', help='輸出整合結果 JSON')
    parser.add_argument('--threshold', type=float, default=3.0,
                        help='時間閾值(秒)')
    # NOTE(review): --stats is parsed but never read below - confirm whether
    # it was meant to suppress saving.
    parser.add_argument('--stats', action='store_true', help='只显示統計')

    args = parser.parse_args()

    # Load inputs
    print(f"[Load] Face: {args.face_json}")
    face_data = load_json(args.face_json)

    print(f"[Load] ASRX: {args.asrx_json}")
    asrx_data = load_json(args.asrx_json)

    # Match
    print(f"\n[Match] Matching faces with speakers (threshold={args.threshold}s)...")
    integrated = match_face_with_speaker_v3(face_data, asrx_data, args.threshold)

    # Analyze
    print("\n[Analyze] Analyzing speaker-face correspondence...")
    speaker_stats = analyze_speaker_face(integrated)

    # Print statistics
    print(f"\n{'='*70}")
    print("說話人 - 人臉對應統計")
    print(f"{'='*70}")

    total_segments = len(integrated)
    total_with_face = sum(1 for item in integrated if item['has_face'])

    for speaker, stats in sorted(speaker_stats.items()):
        with_face_pct = stats['with_face'] / stats['total_segments'] * 100 if stats['total_segments'] > 0 else 0
        print(f"\n🔊 {speaker}:")
        print(f" 總片段:{stats['total_segments']}")
        print(f" 有人臉:{stats['with_face']} ({with_face_pct:.1f}%)")
        print(f" 無人臉:{stats['without_face']}")
        print(f" 總時長:{stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f}分鐘)")

    # Guard the overall percentage: with no ASRX segments the division would
    # raise ZeroDivisionError before any result could be saved.
    total_pct = total_with_face / total_segments * 100 if total_segments > 0 else 0

    print(f"\n{'='*70}")
    print(f"總計:{total_segments} 片段,{total_with_face} 片段有人臉 ({total_pct:.1f}%)")
    print(f"{'='*70}")

    # Save the result
    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        result = {
            'face_source': str(args.face_json),
            'asrx_source': str(args.asrx_json),
            'time_threshold': args.threshold,
            'integrated_segments': integrated,
            'speaker_stats': speaker_stats
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        print(f"\n[Save] Results saved to: {output_path}")

    return integrated, speaker_stats


if __name__ == "__main__":
    main()

View File

@@ -1,268 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Self-implemented ASRX - 自實作說話人分離系統
基於聲紋嵌入 + 譜聚類
技術架構:
1. VAD (Silero VAD) - 語音活動檢測
2. Speaker Encoder (ECAPA-TDNN) - 聲紋特徵提取
3. Spectral Clustering - 譜聚類
4. Post-processing - 後處理
流程:
音頻 → VAD → 語音片段 → 聲紋嵌入 → 相似度矩陣 → 譜聚類 → 說話人 ID
"""
import sys
import json
import time
from pathlib import Path
# 導入自定義模組
from vad import load_vad_model, extract_speech_segments
from speaker_encoder import (
load_speaker_encoder,
extract_speaker_embeddings_batch,
compute_similarity_matrix,
normalize_embeddings,
)
from speaker_cluster import spectral_clustering_speaker, smooth_speaker_labels
class SelfASRX:
    """
    Self-implemented speaker diarization system.

    Holds a VAD model and a speaker encoder, and chains them with spectral
    clustering in process().
    """

    def __init__(self):
        """Load the VAD model and the speaker encoder."""
        print("[SelfASRX] Initializing models...")

        # VAD model
        print("[SelfASRX] Loading VAD model (Silero)...")
        vad_bundle = load_vad_model()
        self.vad_model, self.vad_utils = vad_bundle

        # Speaker embedding model
        print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
        self.speaker_encoder = load_speaker_encoder()

        print("[SelfASRX] Models loaded successfully")

    def process(
        self,
        audio_path,
        output_path=None,
        min_speech_duration_ms=500,
        n_speakers=None,
        smooth_window=5,
    ):
        """
        Run speaker diarization on one audio file.

        Args:
            audio_path: Path of the audio file
            output_path: Optional path of the JSON file to write
            min_speech_duration_ms: Minimum speech duration passed to the VAD step
            n_speakers: Number of speakers (None = estimate automatically)
            smooth_window: Label smoothing window; smoothing runs only when > 1

        Returns:
            The diarization result dict, or an ``{"error": ..., "segments": []}``
            dict when no speech is detected (nothing is written in that case).
        """
        t_begin = time.time()

        print(f"\n[SelfASRX] Processing: {audio_path}")
        print("=" * 60)

        # Step 1: voice activity detection
        print("\n[Step 1] Voice Activity Detection...")
        t_vad = time.time()

        speech_segments, wav, sample_rate = extract_speech_segments(
            audio_path,
            self.vad_model,
            self.vad_utils,
            min_speech_duration_ms=min_speech_duration_ms,
        )

        vad_elapsed = time.time() - t_vad
        print(f" Speech segments: {len(speech_segments)}")
        print(f" Total duration: {len(wav) / sample_rate:.2f}s")
        print(f" VAD time: {vad_elapsed:.2f}s")

        if not len(speech_segments):
            print("[SelfASRX] No speech detected!")
            return {"error": "No speech detected", "segments": []}

        # Step 2: speaker embeddings
        print("\n[Step 2] Speaker embedding extraction...")
        t_embed = time.time()

        # Cut the waveform into one slice per detected speech segment.
        audio_segments = [
            wav[int(seg_start * sample_rate):int(seg_end * sample_rate)]
            for seg_start, seg_end in speech_segments
        ]

        # Batch extraction followed by normalization
        embeddings = normalize_embeddings(
            extract_speaker_embeddings_batch(
                self.speaker_encoder, audio_segments, sample_rate
            )
        )

        embed_elapsed = time.time() - t_embed
        print(f" Embedding shape: {embeddings.shape}")
        print(f" Embedding time: {embed_elapsed:.2f}s")

        # Step 3: similarity matrix
        print("\n[Step 3] Computing similarity matrix...")
        t_sim = time.time()

        similarity_matrix = compute_similarity_matrix(embeddings, method="cosine")

        sim_elapsed = time.time() - t_sim
        print(f" Similarity matrix shape: {similarity_matrix.shape}")
        print(f" Similarity time: {sim_elapsed:.2f}s")

        # Step 4: spectral clustering
        print("\n[Step 4] Spectral clustering...")
        t_cluster = time.time()

        # The speaker count is estimated automatically only when none was given.
        estimate_count = n_speakers is None
        speaker_labels, estimated_n_speakers = spectral_clustering_speaker(
            similarity_matrix, n_speakers=n_speakers, auto_estimate=estimate_count
        )

        # Smooth labels
        if smooth_window > 1:
            speaker_labels = smooth_speaker_labels(
                speaker_labels, window_size=smooth_window
            )

        cluster_elapsed = time.time() - t_cluster
        print(f" Estimated speakers: {estimated_n_speakers}")
        print(f" Clustering time: {cluster_elapsed:.2f}s")

        # Step 5: assemble the result
        print("\n[Step 5] Building output...")

        segment_rows = [
            {
                "index": idx,
                "start": round(seg_start, 3),
                "end": round(seg_end, 3),
                "duration": round(seg_end - seg_start, 3),
                "speaker": f"SPEAKER_{int(label)}",
            }
            for idx, ((seg_start, seg_end), label) in enumerate(
                zip(speech_segments, speaker_labels)
            )
        ]

        result = {
            "audio_path": str(audio_path),
            "total_duration": len(wav) / sample_rate,
            "n_speech_segments": len(speech_segments),
            "n_speakers": int(estimated_n_speakers),
            "segments": segment_rows,
        }

        # Per-speaker segment count and summed duration
        speaker_stats = {}
        for row in segment_rows:
            tally = speaker_stats.setdefault(row["speaker"], {"count": 0, "duration": 0})
            tally["count"] += 1
            tally["duration"] += row["duration"]

        result["speaker_stats"] = speaker_stats

        total_time = time.time() - t_begin
        result["processing_time"] = round(total_time, 2)
        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

        print("\n[SelfASRX] Processing completed!")
        print(f" Total time: {total_time:.2f}s")
        print(f" Realtime factor: {result['realtime_factor']:.2f}x")
        print(f" Detected speakers: {estimated_n_speakers}")

        # Save the result
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, "w", encoding="utf-8") as out_file:
                json.dump(result, out_file, indent=2, ensure_ascii=False)

            print(f" Results saved to: {output_path}")

        print("=" * 60)

        return result
def main():
    """CLI entry point: diarize one audio file and print a summary."""
    import argparse

    parser = argparse.ArgumentParser(
        description="Self-implemented ASRX - Speaker Diarization"
    )
    parser.add_argument("audio_path", help="Path to audio file")
    parser.add_argument("-o", "--output", help="Output JSON path")
    parser.add_argument(
        "--min-speech-duration",
        default=500,
        type=int,
        help="Minimum speech duration in ms (default: 500)",
    )
    parser.add_argument(
        "--n-speakers",
        default=None,
        type=int,
        help="Number of speakers (default: auto-estimate)",
    )
    parser.add_argument(
        "--smooth-window",
        default=5,
        type=int,
        help="Smoothing window size (default: 5)",
    )

    args = parser.parse_args()

    # Refuse to start when the input file is missing.
    audio_exists = Path(args.audio_path).exists()
    if not audio_exists:
        print(f"Error: Audio file not found: {args.audio_path}")
        sys.exit(1)

    # Build the pipeline and run it.
    asrx = SelfASRX()
    result = asrx.process(
        args.audio_path,
        args.output,
        min_speech_duration_ms=args.min_speech_duration,
        n_speakers=args.n_speakers,
        smooth_window=args.smooth_window,
    )

    # No summary when processing reported an error.
    if "error" in result:
        return

    print("\n[Summary]")
    print(f" Audio duration: {result['total_duration']:.2f}s")
    print(f" Speech segments: {result['n_speech_segments']}")
    print(f" Detected speakers: {result['n_speakers']}")
    print(f" Processing time: {result['processing_time']:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")

    print("\n[Speaker Statistics]")
    for speaker, stats in result["speaker_stats"].items():
        pct = stats["duration"] / result["total_duration"] * 100
        print(
            f" {speaker}: {stats['count']} segments, {stats['duration']:.2f}s ({pct:.1f}%)"
        )


if __name__ == "__main__":
    main()

View File

@@ -1,308 +1,728 @@
#!/opt/homebrew/bin/python3.11
"""
Self-implemented ASRX - Fixed Version
使用魯棒的聚類算法
SelfASRXFixed - 7 步 Hybrid Speaker Diarization Pipeline
Pipeline:
1. whisper.transcribe(full_audio) → rough segments + text + language
2. VAD scan each rough segment → refined segments
3. whisper per refined segment → {text, language, lang_prob}
4. ECAPA-TDNN per refined segment → 192-dim embeddings
5. AgglomerativeClustering → speaker_labels
6. Store all embeddings in Qdrant (payload: file_uuid, speaker_id, text, ...)
7. High-quality embeddings → gender classify + store reference in Qdrant
"""
import sys
import json
import time
import os
import numpy as np
from pathlib import Path
from urllib.request import Request, urlopen
from urllib.error import URLError
# 導入自定義模組
from vad import load_vad_model, extract_speech_segments
from speaker_encoder import (
load_speaker_encoder,
extract_speaker_embeddings_batch,
normalize_embeddings
)
from speaker_cluster_fixed import robust_speaker_clustering
def _load_audio(path):
    """Read an audio file and return ``(samples, sample_rate)``.

    Input with more than one dimension is reduced to a single channel by
    averaging along axis 1.
    """
    import soundfile as sf

    samples, rate = sf.read(path)
    is_multichannel = samples.ndim > 1
    if is_multichannel:
        samples = samples.mean(axis=1)
    return samples, rate
def _load_whisper_model(size="small"):
    """Load a whisper model of the requested size via ``whisper_local.load_model``."""
    # Imported lazily so the module is only needed once a model is requested.
    import whisper_local

    return whisper_local.load_model(size)
def _load_vad():
    """Return whatever ``vad.load_vad_model()`` returns (unpacked by callers as model, utils)."""
    import vad

    return vad.load_vad_model()
def _load_speaker_encoder():
    """Return the speaker encoder produced by ``speaker_encoder.load_speaker_encoder()``."""
    import speaker_encoder

    return speaker_encoder.load_speaker_encoder()
def _load_gender_classifier():
try:
from speechbrain.inference.classifiers import EncoderClassifier
classifier = EncoderClassifier.from_hparams(
source="speechbrain/gender-recognition-ecapa",
run_opts={"device": "cpu"},
)
print("[Gender] Classifier loaded: speechbrain/gender-recognition-ecapa")
return classifier
except Exception as e:
print(f"[Gender] Classifier not available: {e}")
return None
def _ensure_speaker_collection(qdrant_url, api_key, collection):
"""確認 Qdrant speaker collection 存在,不存在則建立 (dim=192, cosine)"""
try:
url = f"{qdrant_url}/collections/{collection}"
req = Request(url, method="GET",
headers={"api-key": api_key} if api_key else {})
try:
urlopen(req)
return True
except URLError as e:
if getattr(e, "code", None) == 404:
body = json.dumps({
"vectors": {
"size": 192,
"distance": "Cosine"
}
}).encode()
req = Request(url, data=body, method="PUT",
headers={"Content-Type": "application/json",
**({"api-key": api_key} if api_key else {})})
urlopen(req)
print(f"[Qdrant] Created collection: {collection} (dim=192)")
return True
raise
except Exception as e:
print(f"[Qdrant] Cannot access Qdrant: {e}")
return False
def _qdrant_upsert(qdrant_url, api_key, collection, points):
"""批量寫入 Qdrant points"""
try:
url = f"{qdrant_url}/collections/{collection}/points?wait=true"
body = json.dumps({"points": points}).encode()
headers = {"Content-Type": "application/json"}
if api_key:
headers["api-key"] = api_key
req = Request(url, data=body, headers=headers, method="PUT")
urlopen(req)
return True
except Exception as e:
print(f"[Qdrant] Upsert failed: {e}")
return False
def _hash_point_id(file_uuid, label):
"""產生一致的 point ID"""
s = f"{file_uuid}_{label}"
return hash(s) & 0x7FFFFFFFFFFFFFFF
def _save_checkpoint(path: str, data: dict):
"""原子寫入 checkpoint先 .tmp 再 rename"""
tmp = path + ".tmp"
Path(tmp).parent.mkdir(parents=True, exist_ok=True)
with open(tmp, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
os.replace(tmp, path)
def compute_embedding_quality(embeddings, labels):
    """Cosine similarity of every embedding to the centroid of its cluster.

    Args:
        embeddings: 2-D array-like, one embedding per row.
        labels: 1-D array-like of cluster labels, one per embedding.

    Returns:
        1-D ``numpy`` array with one similarity per embedding. Rows with a
        zero-norm embedding, or whose cluster centroid has zero norm, get 0.
    """
    embeddings = np.asarray(embeddings, dtype=float)
    # Coerce so that ``labels == label`` is an element-wise mask even when a
    # plain list is passed in.
    labels = np.asarray(labels)
    if embeddings.size == 0:
        return np.array([])

    # Normalise all rows once instead of calling a pairwise routine per row.
    row_norms = np.linalg.norm(embeddings, axis=1)
    safe_norms = np.where(row_norms > 0, row_norms, 1.0)
    unit_rows = embeddings / safe_norms[:, None]

    qualities = np.zeros(len(embeddings))
    for label in np.unique(labels):
        mask = labels == label
        # Centroid is the mean of the raw (not re-normalised) members.
        centroid = embeddings[mask].mean(axis=0)
        centroid_norm = np.linalg.norm(centroid)
        if centroid_norm > 0:
            qualities[mask] = unit_rows[mask] @ (centroid / centroid_norm)
    return qualities
class SelfASRXFixed:
"""自實作說話人分離系統(修復版)"""
"""7 步 Hybrid Speaker Diarization Pipeline"""
def __init__(self):
    """Load the models used by the pipeline and keep them on the instance.

    Sets ``self.vad_model`` / ``self.vad_utils`` from ``load_vad_model()``
    and ``self.speaker_encoder`` from ``load_speaker_encoder()``, printing
    progress messages along the way.
    """
    print("[SelfASRX-Fixed] Initializing models...")
    # Load the VAD model
    print("[SelfASRX-Fixed] Loading VAD model (Silero)...")
    self.vad_model, self.vad_utils = load_vad_model()
    # Load the speaker-embedding (voiceprint) model
    print("[SelfASRX-Fixed] Loading speaker encoder (ECAPA-TDNN)...")
    self.speaker_encoder = load_speaker_encoder()
    print("[SelfASRX-Fixed] Models loaded successfully")
def process(self, audio_path, output_path=None,
            min_speech_duration_ms=500,
            n_speakers=None,
            max_speakers=10):
    """Diarize one audio file: VAD, speaker embeddings, clustering, report.

    Args:
        audio_path: Path of the audio file to process.
        output_path: Optional path; when given the result is saved as JSON.
        min_speech_duration_ms: Forwarded to ``extract_speech_segments``.
        n_speakers: Forwarded to ``robust_speaker_clustering``.
        max_speakers: Forwarded to ``robust_speaker_clustering``.

    Returns:
        Result dict with ``segments``, ``speaker_stats``, ``n_speakers``,
        ``total_duration`` and timing fields, or
        ``{"error": ..., "segments": []}`` when no speech was detected.
    """
    run_started = time.time()
    print(f"\n[SelfASRX-Fixed] Processing: {audio_path}")
    print("=" * 60)

    # Step 1: voice activity detection
    print("\n[Step 1] Voice Activity Detection...")
    vad_started = time.time()
    speech_segments, wav, sample_rate = extract_speech_segments(
        audio_path, self.vad_model, self.vad_utils,
        min_speech_duration_ms=min_speech_duration_ms,
    )
    vad_elapsed = time.time() - vad_started
    print(f" Speech segments: {len(speech_segments)}")
    print(f" Total duration: {len(wav)/sample_rate:.2f}s")
    print(f" VAD time: {vad_elapsed:.2f}s")
    if len(speech_segments) == 0:
        print("[SelfASRX-Fixed] No speech detected!")
        return {"error": "No speech detected", "segments": []}

    # Step 2: speaker embedding extraction
    print("\n[Step 2] Speaker embedding extraction...")
    embed_started = time.time()
    # Cut the waveform into one clip per detected speech segment.
    clips = [
        wav[int(begin * sample_rate):int(finish * sample_rate)]
        for begin, finish in speech_segments
    ]
    # Embed all clips in one batch, then normalise the vectors.
    embeddings = normalize_embeddings(
        extract_speaker_embeddings_batch(
            self.speaker_encoder, clips, sample_rate
        )
    )
    embed_elapsed = time.time() - embed_started
    print(f" Embedding shape: {embeddings.shape}")
    print(f" Embedding time: {embed_elapsed:.2f}s")

    # Step 3: robust clustering
    print("\n[Step 3] Robust speaker clustering...")
    cluster_started = time.time()
    speaker_labels, estimated_n_speakers = robust_speaker_clustering(
        embeddings,
        n_speakers=n_speakers,
        max_speakers=max_speakers,
    )
    cluster_elapsed = time.time() - cluster_started
    print(f" Clustering time: {cluster_elapsed:.2f}s")

    # Step 4: assemble the output
    print("\n[Step 4] Building output...")
    result = {
        "audio_path": str(audio_path),
        "total_duration": len(wav) / sample_rate,
        "n_speech_segments": len(speech_segments),
        "n_speakers": int(estimated_n_speakers),
        "segments": [],
    }
    segment_rows = result["segments"]
    for position, ((begin, finish), label) in enumerate(
            zip(speech_segments, speaker_labels)):
        row = {
            "index": position,
            "start": round(begin, 3),
            "end": round(finish, 3),
            "duration": round(finish - begin, 3),
            "speaker": f"SPEAKER_{int(label)}",
        }
        segment_rows.append(row)

    # Per-speaker totals: number of segments and summed duration.
    speaker_stats = {}
    for row in segment_rows:
        totals = speaker_stats.setdefault(
            row["speaker"], {"count": 0, "duration": 0}
        )
        totals["count"] += 1
        totals["duration"] += row["duration"]
    result["speaker_stats"] = speaker_stats

    total_time = time.time() - run_started
    result["processing_time"] = round(total_time, 2)
    result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
    print("\n[SelfASRX-Fixed] Processing completed!")
    print(f" Total time: {total_time:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")
    print(f" Detected speakers: {estimated_n_speakers}")

    # Persist the result when an output path was requested.
    if output_path:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as sink:
            json.dump(result, sink, indent=2, ensure_ascii=False)
        print(f" Results saved to: {output_path}")
    print("=" * 60)
    return result
print("[SelfASRX] Initializing models...")
print("[SelfASRX] Loading whisper model...")
self.whisper = _load_whisper_model("small")
print("[SelfASRX] Loading VAD model (Silero)...")
self.vad_model, self.vad_utils = _load_vad()
print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
self.speaker_encoder = _load_speaker_encoder()
print("[SelfASRX] Loading gender classifier...")
self.gender_classifier = _load_gender_classifier()
# Qdrant 設定
self.qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333")
self.qdrant_api_key = os.environ.get("QDRANT_API_KEY", "")
schema = os.environ.get("DATABASE_SCHEMA", "public")
self.qdrant_collection = os.environ.get(
"QDRANT_SPEAKER_COLLECTION",
f"momentry_{schema}_speaker"
)
self._qdrant_ok = False
print("[SelfASRX] Models loaded successfully")
def process(self, audio_path, output_path=None, file_uuid=None,
max_speakers=10, quality_threshold=0.85,
checkpoint_path=None):
"""7 步 speaker diarization pipeline
def process_with_segments(self, audio_path, asr_segments, output_path=None):
"""
使用 ASR segment 邊界進行 speaker diarization取代 VAD 步驟。
Args:
audio_path: 音頻文件路徑WAV
asr_segments: ASR segment 列表,每個包含 start/end
output_path: 輸出 JSON 路徑(可選)
audio_path: 音頻文件路徑 (WAV 16kHz mono)
output_path: 輸出 JSON 路徑 (可選)
file_uuid: 檔案 UUID (用於 Qdrant 儲存)
max_speakers: 最大說話人數
quality_threshold: 高品質聲紋門檻 (0-1)
checkpoint_path: Step 3 完成後儲存 checkpoint 路徑
Returns:
dict: segments, speaker_stats, n_speakers, total_duration, references
"""
start_time = time.time()
print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
print(f"\n[SelfASRX] Processing: {audio_path}")
print("=" * 60)
# 載入完整音頻
import soundfile as sf
wav, sample_rate = sf.read(audio_path)
if len(wav.shape) > 1:
wav = np.mean(wav, axis=1) # 轉 mono
print(f" Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")
# 載入音頻
wav, sample_rate = _load_audio(audio_path)
total_duration = len(wav) / sample_rate
print(f" Audio: {total_duration:.2f}s, {sample_rate}Hz")
# 使用 ASR segments 取代 VAD (audio处理用time)
speech_segments = [(s["start_time"], s["end_time"]) for s in asr_segments]
print(f" Speech segments from ASR: {len(speech_segments)}")
# ── Step 1: whisper 粗略定位 (faster-whisper) ──
print("\n[Step 1] Initial whisper transcription...")
t1 = time.time()
seg_gen, info = self.whisper.transcribe(audio_path)
rough_segments = []
for seg in seg_gen:
rough_segments.append({"start": seg.start, "end": seg.end, "text": seg.text})
language = info.language if info else None
print(f" Rough segments: {len(rough_segments)}")
print(f" Language: {language}")
print(f" Step 1 time: {time.time() - t1:.2f}s")
if len(speech_segments) == 0:
print("[SelfASRX-Fixed] No ASR segments provided!")
return {"error": "No ASR segments", "segments": []}
if not rough_segments:
print("[SelfASRX] No speech detected by whisper!")
return {"error": "No speech detected", "segments": []}
# 提取語音片段
audio_segments = []
for start_sec, end_sec in speech_segments:
start_sample = int(start_sec * sample_rate)
end_sample = int(end_sec * sample_rate)
if start_sample >= len(wav):
# ── Step 2: VAD scan 每個 rough segment 細切 ──
print("\n[Step 2] VAD scan for refined segmentation...")
t2 = time.time()
refined_segments = []
for seg in rough_segments:
s = seg["start"]
e = seg["end"]
sub = self._vad_scan_segment(wav, sample_rate, s, e)
if sub:
refined_segments.extend(sub)
else:
refined_segments.append((s, e))
print(f" Refined segments: {len(refined_segments)}")
print(f" Step 2 time: {time.time() - t2:.2f}s")
if not refined_segments:
return {"error": "No segments after VAD scan", "segments": []}
# ── Step 3: whisper per refined segment ──
print("\n[Step 3] Per-segment transcription...")
t3 = time.time()
CHECKPOINT_INTERVAL = 50
segment_texts = []
resume_from = 0
# 載入既有 partial checkpoint中斷續接
if checkpoint_path and os.path.exists(checkpoint_path):
try:
with open(checkpoint_path, "r") as f:
cp = json.load(f)
if cp.get("checkpoint_version") == 2 and not cp.get("step3_completed"):
saved = cp.get("segment_texts", [])
if saved:
resume_from = len(saved)
segment_texts = saved
print(f"[Step 3] Resuming from #{resume_from}/{len(refined_segments)}")
except Exception:
pass
for i, (start_sec, end_sec) in enumerate(refined_segments):
if i < resume_from:
continue
audio_segments.append(wav[start_sample:min(end_sample, len(wav))])
seg_text = self._transcribe_segment(wav, sample_rate, start_sec, end_sec)
segment_texts.append(seg_text)
print(f" Audio segments extracted: {len(audio_segments)}")
if checkpoint_path and (i + 1) % CHECKPOINT_INTERVAL == 0:
_save_checkpoint(checkpoint_path, {
"checkpoint_version": 2,
"step3_completed": False,
"step3_progress": i + 1,
"language": language,
"total_duration": total_duration,
"refined_segments": [[s, e] for s, e in refined_segments],
"segment_texts": [{
"text": st["text"],
"language": st["language"],
"lang_prob": st["lang_prob"],
} for st in segment_texts],
"file_uuid": file_uuid,
"max_speakers": max_speakers,
"quality_threshold": quality_threshold,
})
print(f"[Checkpoint] Step 3: {i+1}/{len(refined_segments)}")
# 批量提取聲紋嵌入
print("\n[Step 2] Speaker embedding extraction...")
step2_start = time.time()
print(f" Step 3 time: {time.time() - t3:.2f}s")
# ── Save final checkpoint after Step 3 ──
if checkpoint_path:
_save_checkpoint(checkpoint_path, {
"checkpoint_version": 2,
"step3_completed": True,
"language": language,
"total_duration": total_duration,
"refined_segments": [[s, e] for s, e in refined_segments],
"segment_texts": [{
"text": st["text"],
"language": st["language"],
"lang_prob": st["lang_prob"],
} for st in segment_texts],
"file_uuid": file_uuid,
"max_speakers": max_speakers,
"quality_threshold": quality_threshold,
})
print(f"[Checkpoint] Step 3 complete, saved to {checkpoint_path}")
# ── Step 4: ECAPA-TDNN per refined segment ──
print("\n[Step 4] Speaker embedding extraction...")
t4 = time.time()
audio_segments = []
for start_sec, end_sec in refined_segments:
s = int(start_sec * sample_rate)
e = int(end_sec * sample_rate)
audio_segments.append(wav[s:min(e, len(wav))])
from speaker_encoder import extract_speaker_embeddings_batch, normalize_embeddings
embeddings = extract_speaker_embeddings_batch(
self.speaker_encoder, audio_segments, sample_rate
)
embeddings = normalize_embeddings(embeddings)
step2_time = time.time() - step2_start
print(f" Embedding shape: {embeddings.shape}")
print(f" Embedding time: {step2_time:.2f}s")
print(f" Embeddings: {embeddings.shape}")
print(f" Step 4 time: {time.time() - t4:.2f}s")
# 聚類
print("\n[Step 3] Robust speaker clustering...")
step3_start = time.time()
# ── Step 5: AgglomerativeClustering ──
print("\n[Step 5] Speaker clustering...")
t5 = time.time()
from speaker_cluster_fixed import robust_speaker_clustering
speaker_labels, estimated_n_speakers = robust_speaker_clustering(
embeddings, n_speakers=None, max_speakers=10
embeddings, n_speakers=None, max_speakers=max_speakers
)
step3_time = time.time() - step3_start
print(f" Clustering time: {step3_time:.2f}s")
print(f" Speakers: {estimated_n_speakers}")
print(f" Step 5 time: {time.time() - t5:.2f}s")
# 建立輸出
result = {
"audio_path": str(audio_path),
"total_duration": len(wav) / sample_rate,
"n_speech_segments": len(speech_segments),
"n_speakers": int(estimated_n_speakers),
"segments": []
}
# 品質計算
qualities = compute_embedding_quality(embeddings, speaker_labels)
for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
result["segments"].append({
"index": i,
"start": round(start, 3),
"end": round(end, 3),
"duration": round(end - start, 3),
"speaker": f"SPEAKER_{int(label)}"
})
# 加入 embeddings每個 segment 對應的 192-D speaker embedding
result["embeddings"] = []
for emb in embeddings:
result["embeddings"].append(emb.tolist())
# 建立輸出 segments
segments = []
for i, ((start_sec, end_sec), label) in enumerate(
zip(refined_segments, speaker_labels)):
seg = {
"start": round(start_sec, 3),
"end": round(end_sec, 3),
"start_frame": int(start_sec * 30),
"end_frame": int(end_sec * 30),
"text": segment_texts[i]["text"],
"language": segment_texts[i]["language"],
"lang_prob": segment_texts[i]["lang_prob"],
"speaker": f"SPEAKER_{int(label)}",
"speaker_id": f"SPEAKER_{int(label)}",
"quality": float(qualities[i]),
}
segments.append(seg)
# 統計
speaker_stats = {}
for seg in result["segments"]:
speaker = seg["speaker"]
if speaker not in speaker_stats:
speaker_stats[speaker] = {"count": 0, "duration": 0}
speaker_stats[speaker]["count"] += 1
speaker_stats[speaker]["duration"] += seg["duration"]
result["speaker_stats"] = speaker_stats
for seg in segments:
spk = seg["speaker_id"]
dur = seg["end"] - seg["start"]
if spk not in speaker_stats:
speaker_stats[spk] = {"count": 0, "duration": 0}
speaker_stats[spk]["count"] += 1
speaker_stats[spk]["duration"] += dur
result = {
"language": language or "",
"segments": segments,
"n_speakers": int(estimated_n_speakers),
"speaker_stats": speaker_stats,
"total_duration": total_duration,
"n_segments": len(segments),
}
# ── Step 6: Store embeddings in Qdrant ──
if file_uuid:
print("\n[Step 6] Storing embeddings in Qdrant...")
t6 = time.time()
self._store_speaker_embeddings(segments, embeddings, speaker_labels,
file_uuid)
print(f" Step 6 time: {time.time() - t6:.2f}s")
# ── Step 7: High-quality classification ──
if file_uuid:
print("\n[Step 7] Classifying high-quality embeddings...")
t7 = time.time()
references = self._classify_high_quality_speakers(
segments, embeddings, speaker_labels, file_uuid,
wav, sample_rate, quality_threshold
)
if references:
result["references"] = references
print(f" Step 7 time: {time.time() - t7:.2f}s")
total_time = time.time() - start_time
result["processing_time"] = round(total_time, 2)
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
print("\n[SelfASRX-Fixed] Processing completed!")
print(f" Total time: {total_time:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f" Detected speakers: {estimated_n_speakers}")
if total_duration > 0:
result["realtime_factor"] = round(total_duration / total_time, 2)
# 保存輸出
if output_path:
import json
with open(output_path, 'w', encoding='utf-8') as f:
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
json.dump(result, f, indent=2, ensure_ascii=False)
print(f" Results saved to: {output_path}")
print(f"\n[SelfASRX] Saved to: {output_path}")
print(f"\n[SelfASRX] Done! {len(segments)} segments, "
f"{estimated_n_speakers} speakers, "
f"{total_time:.2f}s")
print("=" * 60)
return result
def resume_from_checkpoint(self, checkpoint_path, audio_path,
                           output_path=None):
    """Load the Step 1-3 results from a checkpoint and run Steps 4-7.

    Args:
        checkpoint_path: JSON checkpoint written after Step 3.
        audio_path: Audio file the checkpoint was produced from.
        output_path: Optional path; when given the result is saved as JSON.

    Returns:
        Result dict (``language``, ``segments``, ``n_speakers``,
        ``speaker_stats``, ``total_duration``, ``n_segments``, timing fields
        and optionally ``references``), or ``{"error": ..., "segments": []}``
        when the checkpoint is incomplete or structurally inconsistent.
    """
    print(f"\n[SelfASRX] Resuming from checkpoint: {checkpoint_path}")
    print("=" * 60)
    with open(checkpoint_path, "r", encoding="utf-8") as f:
        cp = json.load(f)

    def _fail(message):
        # Same error shape the pipeline uses elsewhere.
        print(f"[SelfASRX] {message}")
        return {"error": message, "segments": []}

    if not cp.get("step3_completed"):
        return _fail(
            f"Checkpoint step3 not completed (progress: {cp.get('step3_progress', '?')})"
        )

    # Validate the checkpoint before any expensive work: a missing key or a
    # length mismatch would otherwise only surface as KeyError/IndexError
    # after embedding extraction and clustering.
    raw_segments = cp.get("refined_segments") or []
    segment_texts = cp.get("segment_texts") or []
    if not raw_segments:
        return _fail("Checkpoint contains no refined segments")
    if len(segment_texts) != len(raw_segments):
        return _fail(
            f"Checkpoint is inconsistent: {len(raw_segments)} segments "
            f"but {len(segment_texts)} transcriptions"
        )
    refined_segments = [tuple(s) for s in raw_segments]

    wav, sample_rate = _load_audio(audio_path)
    language = cp.get("language", "")
    # Fall back to the real audio length when the checkpoint lacks a duration.
    total_duration = cp.get("total_duration") or len(wav) / sample_rate
    file_uuid = cp.get("file_uuid")
    max_speakers = cp.get("max_speakers", 10)
    quality_threshold = cp.get("quality_threshold", 0.85)
    print(f" Loaded checkpoint: {len(refined_segments)} segments, "
          f"language={language}, duration={total_duration:.2f}s")
    start_time = time.time()

    # ── Step 4: ECAPA-TDNN per refined segment ──
    print("\n[Step 4] Speaker embedding extraction...")
    t4 = time.time()
    audio_segments = []
    for start_sec, end_sec in refined_segments:
        s = int(start_sec * sample_rate)
        e = int(end_sec * sample_rate)
        # Clamp the end so a segment never reads past the waveform.
        audio_segments.append(wav[s:min(e, len(wav))])
    from speaker_encoder import extract_speaker_embeddings_batch, normalize_embeddings
    embeddings = extract_speaker_embeddings_batch(
        self.speaker_encoder, audio_segments, sample_rate
    )
    embeddings = normalize_embeddings(embeddings)
    print(f" Embeddings: {embeddings.shape}")
    print(f" Step 4 time: {time.time() - t4:.2f}s")

    # ── Step 5: AgglomerativeClustering ──
    print("\n[Step 5] Speaker clustering...")
    t5 = time.time()
    from speaker_cluster_fixed import robust_speaker_clustering
    speaker_labels, estimated_n_speakers = robust_speaker_clustering(
        embeddings, n_speakers=None, max_speakers=max_speakers
    )
    print(f" Speakers: {estimated_n_speakers}")
    print(f" Step 5 time: {time.time() - t5:.2f}s")

    # Per-segment quality: similarity to the cluster centroid.
    qualities = compute_embedding_quality(embeddings, speaker_labels)

    # Build the output segments.
    segments = []
    for (start_sec, end_sec), label, text, quality in zip(
            refined_segments, speaker_labels, segment_texts, qualities):
        speaker_name = f"SPEAKER_{int(label)}"
        segments.append({
            "start": round(start_sec, 3),
            "end": round(end_sec, 3),
            # Frame indices use a hard-coded factor of 30
            # (presumably frames per second — confirm with the consumer).
            "start_frame": int(start_sec * 30),
            "end_frame": int(end_sec * 30),
            "text": text["text"],
            "language": text["language"],
            "lang_prob": text["lang_prob"],
            "speaker": speaker_name,
            "speaker_id": speaker_name,
            "quality": float(quality),
        })

    # Per-speaker totals.
    speaker_stats = {}
    for seg in segments:
        totals = speaker_stats.setdefault(
            seg["speaker_id"], {"count": 0, "duration": 0}
        )
        totals["count"] += 1
        totals["duration"] += seg["end"] - seg["start"]

    result = {
        "language": language or "",
        "segments": segments,
        "n_speakers": int(estimated_n_speakers),
        "speaker_stats": speaker_stats,
        "total_duration": total_duration,
        "n_segments": len(segments),
    }

    # Steps 6 and 7 only run when the checkpoint recorded a file UUID.
    # ── Step 6: Store embeddings in Qdrant ──
    if file_uuid:
        print("\n[Step 6] Storing embeddings in Qdrant...")
        t6 = time.time()
        self._store_speaker_embeddings(segments, embeddings, speaker_labels,
                                       file_uuid)
        print(f" Step 6 time: {time.time() - t6:.2f}s")

    # ── Step 7: High-quality classification ──
    if file_uuid:
        print("\n[Step 7] Classifying high-quality embeddings...")
        t7 = time.time()
        references = self._classify_high_quality_speakers(
            segments, embeddings, speaker_labels, file_uuid,
            wav, sample_rate, quality_threshold
        )
        if references:
            result["references"] = references
        print(f" Step 7 time: {time.time() - t7:.2f}s")

    total_time = time.time() - start_time
    result["processing_time"] = round(total_time, 2)
    if total_duration > 0:
        result["realtime_factor"] = round(total_duration / total_time, 2)

    # Save the output.
    if output_path:
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"\n[SelfASRX] Saved to: {output_path}")
    print(f"\n[SelfASRX] Done! {len(segments)} segments, "
          f"{estimated_n_speakers} speakers, "
          f"{total_time:.2f}s")
    return result
# ── Internal helpers ──
def _vad_scan_segment(self, wav, sample_rate, start_sec, end_sec):
    """Run ``vad.scan_within_segment`` on one time range of *wav* and return its result."""
    import vad

    time_range = (start_sec, end_sec)
    return vad.scan_within_segment(
        wav, sample_rate, *time_range,
        self.vad_model, self.vad_utils
    )
def _transcribe_segment(self, wav, sample_rate, start_sec, end_sec):
    """Transcribe one time range of *wav* with ``whisper_local.transcribe_segment``."""
    import whisper_local

    model = self.whisper
    return whisper_local.transcribe_segment(
        wav, sample_rate, start_sec, end_sec, model
    )
def _store_speaker_embeddings(self, segments, embeddings, labels, file_uuid):
"""Step 6: 所有 embedding 存入 Qdrant"""
if not self._ensure_qdrant():
return
points = []
for i, (seg, emb, label) in enumerate(
zip(segments, embeddings, labels)):
point_id = _hash_point_id(file_uuid, f"{i}")
points.append({
"id": point_id,
"vector": emb.tolist(),
"payload": {
"type": "speaker_embedding",
"file_uuid": file_uuid,
"speaker_id": seg["speaker_id"],
"text": seg["text"],
"language": seg["language"],
"start_time": seg["start"],
"end_time": seg["end"],
}
})
ok = _qdrant_upsert(self.qdrant_url, self.qdrant_api_key,
self.qdrant_collection, points)
if ok:
print(f" Stored {len(points)} speaker embeddings to Qdrant")
return ok
def _classify_high_quality_speakers(self, segments, embeddings, labels,
                                    file_uuid, wav, sample_rate,
                                    threshold=0.85):
    """Step 7: build one reference per speaker from high-quality embeddings.

    For every cluster that has at least one embedding whose quality is
    ``>= threshold``, the normalised centroid of those embeddings is stored
    in Qdrant as a ``speaker_reference`` point together with a gender guess
    taken from the best-quality segment.

    Args:
        segments: Output segment dicts aligned with *embeddings*.
        embeddings: 2-D array, one embedding per segment.
        labels: Cluster label per segment.
        file_uuid: File identifier stored in the payload / point ID.
        wav: Full waveform (used for gender classification).
        sample_rate: Sample rate of *wav*.
        threshold: Minimum quality for an embedding to count.

    Returns:
        List of ``{"speaker_id", "n_segments", "avg_quality", "gender"}``
        dicts; empty when no embedding reaches the threshold.
    """
    embeddings = np.asarray(embeddings)
    # Coerce so ``labels == label`` is an element-wise mask for lists too.
    labels = np.asarray(labels)
    qualities = compute_embedding_quality(embeddings, labels)
    high_mask = qualities >= threshold
    if not np.any(high_mask):
        print(" No high-quality embeddings found")
        return []

    references = []
    points = []
    for label in np.unique(labels):
        mask = (labels == label) & high_mask
        if not np.any(mask):
            continue
        high_indices = np.flatnonzero(mask)
        high_segs = [segments[i] for i in high_indices]
        # Segment with the highest quality inside this cluster.
        best_idx = int(high_indices[np.argmax(qualities[high_indices])])
        best_seg = segments[best_idx]

        centroid = np.mean(embeddings[mask], axis=0)
        norm = np.linalg.norm(centroid)
        if norm > 0:
            centroid = centroid / norm

        avg_quality = float(np.mean(qualities[mask]))
        speaker_id = f"SPEAKER_{int(label)}"
        gender, gender_conf = self._classify_gender(wav, sample_rate, best_seg)
        ref_payload = {
            "type": "speaker_reference",
            "file_uuid": file_uuid,
            "speaker_id": speaker_id,
            "n_segments": int(np.sum(mask)),
            "avg_quality": avg_quality,
            "total_duration": round(
                sum(s["end"] - s["start"] for s in high_segs), 2),
            "language": best_seg.get("language", ""),
            # Up to five non-empty text samples from the first five segments.
            "text_samples": [s["text"] for s in high_segs[:5] if s["text"]],
            "gender": gender,
            "gender_conf": gender_conf,
        }
        points.append({
            "id": _hash_point_id(file_uuid, f"ref_{label}"),
            "vector": centroid.tolist(),
            "payload": ref_payload,
        })
        references.append({
            "speaker_id": speaker_id,
            "n_segments": int(np.sum(mask)),
            "avg_quality": avg_quality,
            "gender": gender,
        })
        print(f" Ref: {speaker_id}, gender={gender}"
              f" ({gender_conf:.2f}), q={avg_quality:.3f}")

    # One guarded batch write instead of one unguarded request per speaker:
    # skip Qdrant entirely when the collection is not available.
    if points and self._ensure_qdrant():
        _qdrant_upsert(self.qdrant_url, self.qdrant_api_key,
                       self.qdrant_collection, points)
    return references

def _classify_gender(self, wav, sample_rate, seg):
    """Return ``(gender, confidence)`` for one segment's audio.

    Yields ``("unknown", 0.0)`` when no classifier is loaded, when it
    returns fewer than two classes, or when classification fails.
    """
    if self.gender_classifier is None:
        return "unknown", 0.0
    try:
        import torch

        s = int(seg["start"] * sample_rate)
        e = int(seg["end"] * sample_rate)
        seg_wav = wav[s:min(e, len(wav))]
        seg_tensor = torch.from_numpy(seg_wav).float().unsqueeze(0)
        out = self.gender_classifier.classify_batch(seg_tensor)
        # NOTE(review): out[0] is passed through softmax — presumably raw
        # scores; confirm against the classifier's output contract.
        probs = torch.softmax(out[0], dim=-1).squeeze().cpu().detach().numpy()
        if len(probs) >= 2:
            idx = int(np.argmax(probs))
            # NOTE(review): index 0 is treated as "male" — confirm the
            # classifier's label order.
            return ("male" if idx == 0 else "female"), float(probs[idx])
    except Exception as exc:
        # Best-effort: a gender failure must not abort reference creation.
        print(f"[Gender] Classify error: {exc}")
    return "unknown", 0.0
def _ensure_qdrant(self):
"""確保 Qdrant collection 可用"""
if not self._qdrant_ok:
ok = _ensure_speaker_collection(
self.qdrant_url, self.qdrant_api_key, self.qdrant_collection
)
self._qdrant_ok = ok
return self._qdrant_ok
def main():
"""Command-line entry point: parse arguments, run or resume the pipeline, print a summary."""
# NOTE(review): this span appears to interleave removed and added lines of a
# diff (two `parser =` assignments, and an `asrx.process(...)` call with
# `min_speech_duration_ms` / `n_speakers` followed by the resume/process
# branch). Lines are kept verbatim and unindented; confirm against the real
# file before relying on it.
import argparse
parser = argparse.ArgumentParser(description="Self-implemented ASRX (Fixed)")
parser.add_argument("audio_path", help="Path to audio file")
parser = argparse.ArgumentParser(description="SelfASRX - Hybrid Speaker Diarization")
parser.add_argument("audio_path", help="Path to audio file (WAV)")
parser.add_argument("-o", "--output", help="Output JSON path")
parser.add_argument("--min-speech-duration", type=int, default=500)
parser.add_argument("--n-speakers", type=int, default=None)
parser.add_argument("--file-uuid", help="File UUID for Qdrant storage")
parser.add_argument("--max-speakers", type=int, default=10)
parser.add_argument("--quality-threshold", type=float, default=0.85)
parser.add_argument("--resume", help="Checkpoint path to resume from")
parser.add_argument("--checkpoint", help="Save checkpoint path after Step 3")
args = parser.parse_args()
# Exit with status 1 when the input audio file does not exist.
if not Path(args.audio_path).exists():
print(f"Error: Audio file not found: {args.audio_path}")
sys.exit(1)
asrx = SelfASRXFixed()
result = asrx.process(
args.audio_path,
args.output,
min_speech_duration_ms=args.min_speech_duration,
n_speakers=args.n_speakers,
max_speakers=args.max_speakers
)
# With --resume, continue from the checkpoint; otherwise run the full pipeline.
if args.resume:
if not Path(args.resume).exists():
print(f"Error: Checkpoint not found: {args.resume}")
sys.exit(1)
result = asrx.resume_from_checkpoint(
args.resume, args.audio_path,
output_path=args.output,
)
else:
if not Path(args.audio_path).exists():
print(f"Error: Audio file not found: {args.audio_path}")
sys.exit(1)
result = asrx.process(
args.audio_path,
output_path=args.output,
file_uuid=args.file_uuid,
max_speakers=args.max_speakers,
quality_threshold=args.quality_threshold,
checkpoint_path=args.checkpoint,
)
# Print the summary only when the result carries no "error" key.
if "error" not in result:
print("\n[Summary]")
print(f" Audio duration: {result['total_duration']:.2f}s")
print(f" Speech segments: {result['n_speech_segments']}")
print(f" Detected speakers: {result['n_speakers']}")
print(f" Processing time: {result['processing_time']:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print("\n[Speaker Statistics]")
for speaker, stats in result['speaker_stats'].items():
pct = stats['duration'] / result['total_duration'] * 100
print(f" {speaker}: {stats['count']} segments, " +
f"{stats['duration']:.2f}s ({pct:.1f}%)")
print(f" Duration: {result['total_duration']:.2f}s")
print(f" Segments: {result['n_segments']}")
print(f" Speakers: {result['n_speakers']}")
if "references" in result:
for ref in result["references"]:
print(f" {ref['speaker_id']}: gender={ref['gender']}, "
f"quality={ref['avg_quality']:.3f}")
if __name__ == "__main__":

View File

@@ -1,280 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Speaker Audio Player - 說話人語音播放器
從 ASRX 結果中提取並播放每個說話人的語音片段
"""
import json
import argparse
import subprocess
import tempfile
import os
from pathlib import Path
from typing import List, Dict
def load_asrx_result(result_path: str) -> Dict:
    """Read the ASRX result file at *result_path* (UTF-8 JSON) and return the parsed value."""
    raw_text = Path(result_path).read_text(encoding="utf-8")
    return json.loads(raw_text)
def extract_audio_segment(
    audio_path: str, start_sec: float, end_sec: float, output_path: str
) -> bool:
    """Cut ``[start_sec, end_sec]`` out of an audio file with ffmpeg.

    The clip is written to *output_path* as 16 kHz mono 16-bit PCM,
    overwriting any existing file.

    Args:
        audio_path: Source audio path.
        start_sec: Clip start in seconds.
        end_sec: Clip end in seconds.
        output_path: Destination path.

    Returns:
        True when ffmpeg exited with status 0, False otherwise (including
        when ffmpeg could not be started).
    """
    clip_length = end_sec - start_sec
    command = ["ffmpeg", "-y", "-i", audio_path]
    # Seek to the start and keep only the clip length.
    command += ["-ss", str(start_sec), "-t", str(clip_length)]
    # Output format: 16-bit PCM, 16 kHz, one channel.
    command += ["-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    command.append(output_path)
    try:
        completed = subprocess.run(command, capture_output=True, text=True)
    except Exception as e:
        print(f"Error extracting audio: {e}")
        return False
    return completed.returncode == 0
def play_audio(audio_path: str) -> bool:
    """Play an audio file with ``afplay`` (macOS) or ``aplay`` (Linux).

    The player is looked up on ``PATH`` — the same way it is then launched —
    instead of probing hard-coded ``/usr/bin`` locations, so players
    installed elsewhere are found too. ``afplay`` is preferred when both
    exist.

    Args:
        audio_path: Path of the audio file to play.

    Returns:
        True when the player ran and exited with status 0; False when no
        player is installed or playback failed.
    """
    import shutil

    player = shutil.which("afplay") or shutil.which("aplay")
    if player is None:
        print(
            "No audio player found. Please install afplay (macOS) or aplay (Linux)"
        )
        return False
    try:
        subprocess.run([player, audio_path], check=True)
    except Exception as e:
        # Covers a non-zero exit status as well as failure to launch.
        print(f"Error playing audio: {e}")
        return False
    return True
def group_segments_by_speaker(segments: List[Dict]) -> Dict[str, List[Dict]]:
    """Bucket segments by their ``"speaker"`` value.

    Args:
        segments: Segment dicts; each must provide ``"speaker"`` and
            ``"start"`` keys.

    Returns:
        A dict mapping each speaker (in order of first appearance) to its
        segments sorted by ascending ``"start"``.
    """
    grouped: Dict[str, List[Dict]] = {}
    for segment in segments:
        grouped.setdefault(segment["speaker"], []).append(segment)
    # Order every bucket chronologically.
    for bucket in grouped.values():
        bucket.sort(key=lambda item: item["start"])
    return grouped
def play_speaker_segments(
    audio_path: str,
    result_path: str,
    speaker_id: str = None,
    limit: int = None,
    temp_dir: str = None,
):
    """Extract and play the speech segments of one or all speakers.

    Args:
        audio_path: Source audio file the segments refer to.
        result_path: ASRX result JSON with a ``"segments"`` list.
        speaker_id: Speaker to play; a falsy value plays every speaker in
            sorted order.
        limit: Maximum number of segments per speaker; a falsy value (None or
            0) plays all of them.
        temp_dir: Directory for the extracted clips. When omitted, a private
            temporary directory is created and removed again before
            returning. A caller-supplied directory is created if missing and
            its clips are left in place.
    """
    import shutil

    print(f"[Load] Loading ASRX result: {result_path}")
    result = load_asrx_result(result_path)
    segments = result.get("segments", [])
    total_duration = result.get("total_duration", 0)
    print(f"[Info] Total segments: {len(segments)}")
    print(f"[Info] Total duration: {total_duration / 60:.1f} minutes")

    speaker_segments = group_segments_by_speaker(segments)
    if speaker_id:
        speakers_to_play = [speaker_id]
    else:
        speakers_to_play = sorted(speaker_segments.keys())

    # Only a directory created here is ours to delete afterwards.
    owns_temp_dir = temp_dir is None
    if owns_temp_dir:
        temp_dir = tempfile.mkdtemp(prefix="speaker_audio_")
    else:
        os.makedirs(temp_dir, exist_ok=True)
    print(f"\n[Info] Temp directory: {temp_dir}")
    print(f"[Info] Speakers to play: {speakers_to_play}")
    print("=" * 60)

    try:
        for speaker in speakers_to_play:
            if speaker not in speaker_segments:
                print(f"\n[Warning] Speaker {speaker} not found!")
                continue
            segs = speaker_segments[speaker]
            if limit:
                segs = segs[:limit]
            print(f"\n▶️ {speaker} ({len(segs)} segments)")
            print("-" * 60)
            for i, seg in enumerate(segs, 1):
                start = seg["start"]
                end = seg["end"]
                # Fall back to the computed length when the key is absent.
                duration = seg.get("duration", end - start)
                temp_audio = os.path.join(temp_dir, f"{speaker}_{i:03d}.wav")
                print(
                    f" [{i:3d}] {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s) ... ",
                    end="",
                    flush=True,
                )
                if extract_audio_segment(audio_path, start, end, temp_audio):
                    print("", end="", flush=True)
                    if play_audio(temp_audio):
                        print(" ▶️ Played")
                    else:
                        print(" ❌ Play failed")
                else:
                    print(" ❌ Extract failed")
        print()
    finally:
        # Runs on normal return, on errors and on Ctrl-C alike.
        if owns_temp_dir:
            shutil.rmtree(temp_dir, ignore_errors=True)
def show_speaker_stats(result_path: str):
    """Print a per-speaker table of segment count, speaking time and share.

    Speakers are listed by descending total ``"duration"``. The percentage is
    relative to the result's ``"total_duration"`` and is shown as 0 when that
    value is missing or not positive.

    Args:
        result_path: ASRX result JSON to summarise.
    """
    result = load_asrx_result(result_path)
    by_speaker = group_segments_by_speaker(result.get("segments", []))

    rule = "=" * 60
    print("\n" + rule)
    print("說話人統計")
    print(rule)

    rows = sorted(
        (
            (name, len(items), sum(item["duration"] for item in items))
            for name, items in by_speaker.items()
        ),
        key=lambda row: row[2],
        reverse=True,
    )
    overall = result.get("total_duration", 0)
    for name, n_items, spoken in rows:
        share = spoken / overall * 100 if overall > 0 else 0
        print(f"{name:12} {n_items:4} segments {spoken:8.1f}s ({share:5.1f}%)")
    print(rule)
def main():
    """Command-line entry point for the speaker audio player.

    With ``--stats`` only the result file is needed and a statistics table is
    printed. Otherwise both the audio file and the result file are required
    and the selected segments are played.

    Returns:
        0 on success, 1 when a required argument or input file is missing.
    """
    parser = argparse.ArgumentParser(
        description="Speaker Audio Player - 播放說話人語音片段",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# 顯示說話人統計
python3 speaker_audio_player.py --stats result.json
# 播放所有說話人的前 3 個片段
python3 speaker_audio_player.py audio.wav result.json --limit 3
# 播放特定說話人的所有片段
python3 speaker_audio_player.py audio.wav result.json --speaker SPEAKER_0
# 播放 SPEAKER_1 的前 5 個片段
python3 speaker_audio_player.py audio.wav result.json --speaker SPEAKER_1 --limit 5
""",
    )
    parser.add_argument("audio_path", nargs="?", help="原始音頻文件路徑")
    parser.add_argument("result_path", help="ASRX 結果 JSON 路徑")
    parser.add_argument("--stats", action="store_true", help="只显示說話人統計")
    parser.add_argument("--speaker", type=str, help="指定說話人 ID如 SPEAKER_0")
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="每個說話人最多播放幾個片段None=全部)",
    )
    parser.add_argument("--temp-dir", type=str, default=None, help="臨時目錄路徑")
    args = parser.parse_args()

    if args.stats:
        # Check the file first so a wrong path gives a message, not a traceback.
        if not Path(args.result_path).exists():
            print(f"Error: Result file not found: {args.result_path}")
            return 1
        show_speaker_stats(args.result_path)
        return 0

    if not args.audio_path:
        print("Error: audio_path is required unless --stats is specified")
        parser.print_help()
        return 1
    if not Path(args.audio_path).exists():
        print(f"Error: Audio file not found: {args.audio_path}")
        return 1
    if not Path(args.result_path).exists():
        print(f"Error: Result file not found: {args.result_path}")
        return 1

    play_speaker_segments(
        args.audio_path,
        args.result_path,
        speaker_id=args.speaker,
        limit=args.limit,
        temp_dir=args.temp_dir,
    )
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shell callers can detect failures.
    raise SystemExit(main())

View File

@@ -0,0 +1,65 @@
"""
Speaker Classifier - 聲紋品質評估與性別分類
提供品質計算與性別分類功能,作為 main_fixed.py 的輔助模組。
"""
import numpy as np
def compute_embedding_quality(embeddings, labels):
    """Score each embedding by cosine similarity to its cluster centroid.

    The centroid of a cluster is the mean of the embeddings carrying that
    label. Both inputs are converted with ``np.asarray``, so plain lists work
    as well as arrays.

    Args:
        embeddings: ``[n_segments, dim]`` matrix of speaker embeddings.
        labels: ``[n_segments]`` cluster label for each embedding.

    Returns:
        ``[n_segments]`` float64 array of cosine similarities. Values lie in
        ``[-1, 1]``; an all-zero embedding or centroid scores 0. An empty
        input yields an empty array.
    """
    emb = np.asarray(embeddings, dtype=np.float64)
    lab = np.asarray(labels)
    if emb.size == 0 or lab.size == 0:
        return np.array([])

    # cluster_idx[i] is the row of `centroids` that embedding i belongs to.
    _, cluster_idx = np.unique(lab, return_inverse=True)
    cluster_idx = cluster_idx.ravel()
    n_clusters = int(cluster_idx.max()) + 1

    sums = np.zeros((n_clusters, emb.shape[1]), dtype=np.float64)
    np.add.at(sums, cluster_idx, emb)
    counts = np.bincount(cluster_idx, minlength=n_clusters)
    centroids = sums / counts[:, None]

    def _unit_rows(matrix):
        """Scale rows to unit length; zero rows stay zero."""
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return np.divide(
            matrix, norms, out=np.zeros_like(matrix), where=norms > 0
        )

    # Row-wise dot product of unit vectors == cosine similarity.
    return np.einsum(
        "ij,ij->i", _unit_rows(emb), _unit_rows(centroids)[cluster_idx]
    )
def classify_gender(audio_wav, sample_rate, classifier):
    """Classify the speaker gender of one audio segment.

    Args:
        audio_wav: Waveform as a numpy array.
        sample_rate: Sampling rate of ``audio_wav``. Accepted for interface
            compatibility; this function does not use it.
        classifier: Object exposing ``classify_batch(tensor)`` whose first
            returned element holds the per-class scores, or None.

    Returns:
        ``{"gender": "male" | "female" | "unknown", "confidence": float}``.
        ``"unknown"`` with confidence 0.0 is returned when there is no
        classifier, the audio is empty, fewer than two class scores come
        back, or classification raises.
    """
    default = {"gender": "unknown", "confidence": 0.0}
    if classifier is None or len(audio_wav) == 0:
        return default
    try:
        import torch

        seg_tensor = torch.from_numpy(audio_wav).float().unsqueeze(0)
        out = classifier.classify_batch(seg_tensor)
        probs = torch.softmax(out[0], dim=-1).squeeze().cpu().detach().numpy()
        # squeeze() can leave a 0-d array when only one score is returned.
        probs = np.atleast_1d(probs)
        if len(probs) >= 2:
            idx = int(np.argmax(probs))
            # NOTE(review): index 0 is treated as "male"; confirm this matches
            # the label order of the model that is actually loaded.
            label = "male" if idx == 0 else "female"
            return {"gender": label, "confidence": float(probs[idx])}
    except Exception as e:
        # Best effort: keep returning "unknown", but leave a trace so a broken
        # model or input does not go unnoticed.
        import logging

        logging.getLogger(__name__).warning("classify_gender failed: %s", e)
    return default

View File

@@ -1,310 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Speaker Clustering - 說話人聚類
使用譜聚類算法將聲紋嵌入分組
技術來源:
- 譜聚類Shi & Malik (2000), IEEE TPAMI
- 論文https://ieeexplore.ieee.org/document/868688
- 應用於說話人分離Wooters & Huijbregts (2008), ICASSP
"""
import numpy as np
from sklearn.cluster import SpectralClustering, AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
def estimate_n_speakers_eigengap(similarity_matrix, max_speakers=10):
    """Estimate the speaker count from the largest eigenvalue gap.

    The eigenvalues of the similarity matrix are sorted in descending order,
    only the first ``max_speakers`` are kept, and the position of the largest
    absolute difference between neighbours is taken as the estimate.

    Args:
        similarity_matrix: Symmetric ``[n, n]`` similarity matrix.
        max_speakers: Upper bound for the estimate.

    Returns:
        The estimate, clamped to at least 2 and at most ``max_speakers``.
    """
    spectrum = np.sort(np.linalg.eigvalsh(similarity_matrix))[::-1][:max_speakers]
    gaps = np.diff(spectrum)
    if len(gaps) == 0:
        # Fewer than two eigenvalues: nothing to compare.
        n_speakers = 1
    else:
        n_speakers = np.argmax(np.abs(gaps)) + 1
    return max(2, min(n_speakers, max_speakers))
def estimate_n_speakers_silhouette(embeddings, max_speakers=10):
    """Estimate the speaker count by maximising the silhouette score.

    Agglomerative clustering is run for every cluster count from 2 up to
    ``min(max_speakers, len(embeddings) - 1)`` and the count with the highest
    silhouette score wins; the first such count is kept on ties.

    Args:
        embeddings: ``[n, d]`` embedding matrix.
        max_speakers: Largest cluster count to try.

    Returns:
        The best cluster count, or 2 when no candidate could be scored.
    """
    from sklearn.metrics import silhouette_score

    top_score, chosen = -1, 2
    upper = min(max_speakers + 1, len(embeddings))
    for candidate in range(2, upper):
        assignment = AgglomerativeClustering(n_clusters=candidate).fit_predict(
            embeddings
        )
        # The silhouette score needs at least two distinct labels.
        if len(np.unique(assignment)) <= 1:
            continue
        candidate_score = silhouette_score(embeddings, assignment)
        if candidate_score > top_score:
            top_score, chosen = candidate_score, candidate
    return chosen
def spectral_clustering_speaker(
    similarity_matrix, n_speakers=None, auto_estimate=True, max_speakers=10
):
    """Assign speaker labels with spectral clustering on a similarity matrix.

    Before clustering, a cleaned copy of the matrix is made: NaN becomes 0.5,
    infinities become +/-1, the diagonal is set to 1 and all values are
    clipped to ``[-1, 1]``.

    Args:
        similarity_matrix: ``[n, n]`` pairwise similarity matrix.
        n_speakers: Number of speakers; estimated or defaulted when None.
        auto_estimate: When ``n_speakers`` is None, estimate it with
            ``estimate_n_speakers_eigengap``; otherwise 2 is used.
        max_speakers: Upper bound passed to the estimator.

    Returns:
        ``(speaker_labels, n_speakers)``. If spectral clustering raises, the
        first half of the segments is labelled 0, the rest 1, and 2 is
        returned as the speaker count.
    """
    n_segments = len(similarity_matrix)

    # nan_to_num returns a new array, so the steps below work on a copy.
    affinity = np.nan_to_num(similarity_matrix, nan=0.5, posinf=1.0, neginf=-1.0)
    np.fill_diagonal(affinity, 1.0)
    affinity = np.clip(affinity, -1.0, 1.0)

    if n_speakers is None:
        if auto_estimate:
            n_speakers = estimate_n_speakers_eigengap(
                affinity, max_speakers=max_speakers
            )
            print(f"[Clustering] Estimated n_speakers: {n_speakers}")
        else:
            n_speakers = 2

    # Never request more clusters than there are samples.
    n_speakers = min(n_speakers, n_segments)
    print(f"[Clustering] Running spectral clustering with {n_speakers} clusters...")

    try:
        model = SpectralClustering(
            n_clusters=int(n_speakers),
            affinity="precomputed",
            assign_labels="kmeans",
            random_state=42,
            n_init=10,
        )
        speaker_labels = model.fit_predict(affinity)
        print("[Clustering] Spectral clustering completed")
        print(f"[Clustering] n_speakers: {n_speakers}")
        print(f"[Clustering] n_segments: {n_segments}")
        return speaker_labels, n_speakers
    except Exception as e:
        print(f"[Clustering] Spectral clustering failed: {e}")
        print("[Clustering] Using fallback: 2 speakers")
        # Naive split: first half is speaker 0, second half is speaker 1.
        half = n_segments // 2
        speaker_labels = np.array([0] * half + [1] * (n_segments - half))
        return speaker_labels, 2
def agglomerative_clustering_speaker(
    embeddings, n_speakers=None, threshold=0.5, max_speakers=10
):
    """Assign speaker labels with average-linkage agglomerative clustering.

    When ``n_speakers`` is None it is derived from the data: the mean
    nearest-neighbour cosine distance of the first (up to) 100 embeddings is
    divided by ``threshold``, truncated to an int, and clamped to
    ``[2, max_speakers]``. With a single embedding the default of 2 is used.
    In every case the count is finally capped at the number of embeddings.

    Args:
        embeddings: ``[n, d]`` embedding matrix.
        n_speakers: Number of clusters, or None to derive it.
        threshold: Divisor applied to the mean nearest-neighbour distance.
        max_speakers: Upper bound for a derived cluster count.

    Returns:
        ``(speaker_labels, n_speakers)``.
    """
    n_segments = len(embeddings)

    if n_speakers is None:
        from sklearn.metrics.pairwise import cosine_distances

        pairwise = cosine_distances(embeddings)
        if n_segments > 1:
            # After sorting a row, column 0 is the smallest distance (the
            # point itself) and column 1 its nearest neighbour.
            nearest = np.sort(pairwise[:100], axis=1)[:, 1]
            derived = max(2, int(np.mean(nearest) / threshold))
            n_speakers = min(derived, max_speakers)
        else:
            n_speakers = 2

    n_speakers = min(n_speakers, n_segments)

    model = AgglomerativeClustering(
        n_clusters=n_speakers, metric="cosine", linkage="average"
    )
    speaker_labels = model.fit_predict(embeddings)
    print("[Clustering] Agglomerative clustering completed")
    print(f"[Clustering] n_speakers: {n_speakers}")
    return speaker_labels, n_speakers
def smooth_speaker_labels(speaker_labels, window_size=5):
    """Smooth a label sequence with a sliding majority vote.

    Each position is replaced by the most frequent label inside a window of
    ``window_size`` positions centred on it (truncated at the sequence
    ends). On ties the smallest label wins. Votes are always taken from the
    original labels, never from already smoothed ones.

    Only NumPy is used, so the result does not depend on the installed SciPy
    version.

    Args:
        speaker_labels: Sequence of labels (list or array).
        window_size: Width of the voting window.

    Returns:
        A new array of the same length holding the smoothed labels.
    """
    labels = np.asarray(speaker_labels)
    smoothed = labels.copy()
    half_window = window_size // 2
    n_labels = len(labels)
    for i in range(n_labels):
        window = labels[max(0, i - half_window):min(n_labels, i + half_window + 1)]
        # np.unique returns sorted values, and argmax picks the first maximum,
        # so ties resolve to the smallest label.
        values, counts = np.unique(window, return_counts=True)
        smoothed[i] = values[np.argmax(counts)]
    return smoothed
def compute_diarization_purity(speaker_labels, ground_truth_labels=None):
    """Score predicted speaker labels against a reference, if one is given.

    Args:
        speaker_labels: Predicted speaker labels.
        ground_truth_labels: Reference labels, or None when unavailable.

    Returns:
        With a reference: the adjusted Rand score between the reference and
        the prediction. Without one: the constant placeholder 0.5 (nothing is
        computed from ``speaker_labels`` in that case).
    """
    if ground_truth_labels is not None:
        from sklearn.metrics import adjusted_rand_score

        return adjusted_rand_score(ground_truth_labels, speaker_labels)
    # No reference available: fixed placeholder value.
    return 0.5
if __name__ == "__main__":
    # Self-test: run the clustering helpers on synthetic embeddings.
    print("[Test] Testing speaker clustering algorithms")
    # Fixed seed so the synthetic data is reproducible.
    np.random.seed(42)
    n_speakers = 3
    n_segments_per_speaker = 20
    # Build embeddings for 3 synthetic speakers.
    embeddings = []
    for i in range(n_speakers):
        # Each speaker gets its own centre, shifted by i * 3.
        center = np.random.randn(192) * 2 + i * 3
        # Scatter the speaker's segments around that centre with noise.
        for _ in range(n_segments_per_speaker):
            emb = center + np.random.randn(192) * 0.5
            embeddings.append(emb)
    embeddings = np.array(embeddings)
    print(f"[Test] Generated {len(embeddings)} embeddings for {n_speakers} speakers")
    # Pairwise cosine similarity between all embeddings.
    similarity = cosine_similarity(embeddings)
    print(f"[Test] Similarity matrix shape: {similarity.shape}")
    # Estimate the speaker count with both estimators.
    estimated_n = estimate_n_speakers_eigengap(similarity, max_speakers=10)
    print(f"[Test] Estimated n_speakers (eigengap): {estimated_n}")
    estimated_n_silhouette = estimate_n_speakers_silhouette(embeddings, max_speakers=10)
    print(f"[Test] Estimated n_speakers (silhouette): {estimated_n_silhouette}")
    # Spectral clustering with an automatically estimated speaker count.
    labels, n_clusters = spectral_clustering_speaker(
        similarity, n_speakers=None, auto_estimate=True
    )
    print("\n[Test] Clustering results:")
    print(f" True n_speakers: {n_speakers}")
    print(f" Estimated n_speakers: {n_clusters}")
    print(f" Unique labels: {np.unique(labels)}")
    # Report how many segments landed in each cluster.
    for label in np.unique(labels):
        count = np.sum(labels == label)
        print(f" Cluster {label}: {count} segments")

View File

@@ -1,431 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Speaker Player GUI - 說話人語音播放器(圖形界面)
使用 tkinter 顯示播放進度和 Speaker ID
"""
import json
import subprocess
import tempfile
import os
import threading
import time
from pathlib import Path
try:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
HAS_TKINTER = True
except ImportError:
HAS_TKINTER = False
class SpeakerPlayerGUI:
    """Tkinter window for browsing ASRX speaker segments and playing them.

    The user picks an audio file and an ASRX result JSON. Speakers are listed
    on the left (longest total speaking time first), the selected speaker's
    segments on the right. Playback runs in a background thread that extracts
    each segment with ffmpeg and plays it with ``afplay`` or ``aplay``.
    """

    def __init__(self, root):
        """Store the Tk root, initialise the state and build the widgets."""
        self.root = root
        self.root.title("🎬 Speaker Audio Player - Face Integration")
        self.root.geometry("1100x800")
        # Data
        self.audio_path = None
        self.result_path = None
        self.face_path = None
        self.result_data = None
        self.face_data = None
        self.integrated_data = None
        self.speaker_segments = {}
        self.speakers = []
        self.current_speaker_idx = 0
        self.is_playing = False
        # True from the moment the user presses stop until the next playback
        # starts; on_play_done reads it to choose the status message.
        self.stop_flag = False
        # Bumped whenever a playback starts or is stopped, so a worker thread
        # can tell that it has been superseded.
        self._playback_id = 0
        # Player subprocess currently running, if any (terminated on stop).
        self._player_proc = None
        # Build the UI
        self.create_widgets()

    def create_widgets(self):
        """Create and lay out all widgets."""
        # Top: file selection
        top_frame = ttk.Frame(self.root, padding="10")
        top_frame.pack(fill=tk.X)
        ttk.Label(top_frame, text="📁 Audio:").pack(side=tk.LEFT)
        self.audio_label = ttk.Label(top_frame, text="未選擇", width=50)
        self.audio_label.pack(side=tk.LEFT, padx=5)
        ttk.Button(top_frame, text="選擇音頻", command=self.select_audio).pack(
            side=tk.LEFT, padx=5
        )
        ttk.Label(top_frame, text=" 📊 Result:").pack(side=tk.LEFT, padx=(20, 0))
        self.result_label = ttk.Label(top_frame, text="未選擇", width=50)
        self.result_label.pack(side=tk.LEFT, padx=5)
        ttk.Button(top_frame, text="選擇結果", command=self.select_result).pack(
            side=tk.LEFT, padx=5
        )
        # Middle: speaker list and segment list
        mid_frame = ttk.Frame(self.root, padding="10")
        mid_frame.pack(fill=tk.BOTH, expand=True)
        # Left: speaker list
        left_frame = ttk.LabelFrame(mid_frame, text="📢 說話人列表", padding="10")
        left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=False)
        self.speaker_listbox = tk.Listbox(
            left_frame, width=35, height=20, font=("Arial", 11)
        )
        self.speaker_listbox.pack(fill=tk.BOTH, expand=True)
        self.speaker_listbox.bind("<<ListboxSelect>>", self.on_speaker_select)
        # Right: segment list
        right_frame = ttk.LabelFrame(mid_frame, text="🎵 語音片段", padding="10")
        right_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=10)
        # Segment list with a scrollbar
        list_frame = ttk.Frame(right_frame)
        list_frame.pack(fill=tk.BOTH, expand=True)
        scrollbar = ttk.Scrollbar(list_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.segment_listbox = tk.Listbox(
            list_frame,
            width=50,
            height=20,
            font=("Courier", 10),
            yscrollcommand=scrollbar.set,
        )
        self.segment_listbox.pack(fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.segment_listbox.yview)
        self.segment_listbox.bind("<Double-Button-1>", self.on_segment_double_click)
        # Bottom: playback controls and progress
        bottom_frame = ttk.Frame(self.root, padding="10")
        bottom_frame.pack(fill=tk.X)
        # Playback controls; the play buttons stay disabled until check_ready
        # sees both an audio file and a result file.
        control_frame = ttk.Frame(bottom_frame)
        control_frame.pack(fill=tk.X)
        self.play_button = ttk.Button(
            control_frame, text="▶️ 播放所選", command=self.play_selected, width=15
        )
        self.play_button.pack(side=tk.LEFT, padx=5)
        self.play_button.config(state=tk.DISABLED)
        self.stop_button = ttk.Button(
            control_frame, text="⏹️ 停止", command=self.stop_playing, width=10
        )
        self.stop_button.pack(side=tk.LEFT, padx=5)
        self.stop_button.config(state=tk.DISABLED)
        self.play_all_button = ttk.Button(
            control_frame, text="▶️▶️ 播放全部", command=self.play_all, width=15
        )
        self.play_all_button.pack(side=tk.LEFT, padx=5)
        self.play_all_button.config(state=tk.DISABLED)
        # Progress bar
        progress_frame = ttk.Frame(bottom_frame)
        progress_frame.pack(fill=tk.X, pady=(10, 0))
        ttk.Label(progress_frame, text="⏱️ 進度:").pack(side=tk.LEFT)
        self.progress_bar = ttk.Progressbar(progress_frame, mode="determinate")
        self.progress_bar.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=10)
        self.progress_label = ttk.Label(progress_frame, text="0:00 / 0:00", width=20)
        self.progress_label.pack(side=tk.LEFT)
        # Status bar
        self.status_label = ttk.Label(
            bottom_frame, text="就緒", relief=tk.SUNKEN, anchor=tk.W
        )
        self.status_label.pack(fill=tk.X, pady=(10, 0))

    def select_audio(self):
        """Ask for the audio file and remember the choice."""
        filename = filedialog.askopenfilename(
            title="選擇音頻文件",
            filetypes=[("WAV files", "*.wav"), ("All files", "*.*")],
        )
        if filename:
            self.audio_path = filename
            self.audio_label.config(text=Path(filename).name)
            self.check_ready()

    def select_result(self):
        """Ask for the ASRX result file and load it."""
        filename = filedialog.askopenfilename(
            title="選擇 ASRX 結果文件",
            filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
        )
        if filename:
            self.result_path = filename
            self.result_label.config(text=Path(filename).name)
            self.load_result()
            self.check_ready()

    def load_result(self):
        """Load the ASRX result JSON and refresh the speaker list.

        On any failure an error dialog is shown and ``result_path`` is reset
        to None.
        """
        try:
            with open(self.result_path, "r", encoding="utf-8") as f:
                self.result_data = json.load(f)
            # Group segments by speaker
            self.speaker_segments = {}
            for seg in self.result_data.get("segments", []):
                speaker = seg["speaker"]
                if speaker not in self.speaker_segments:
                    self.speaker_segments[speaker] = []
                self.speaker_segments[speaker].append(seg)
            # Sort each speaker's segments by start time
            for speaker in self.speaker_segments:
                self.speaker_segments[speaker].sort(key=lambda x: x["start"])
            # Speakers ordered by total speaking time, longest first
            self.speakers = sorted(
                self.speaker_segments.keys(),
                key=lambda s: sum(seg["duration"] for seg in self.speaker_segments[s]),
                reverse=True,
            )
            # The previous selection may point past the end of the new list,
            # and the segment list still shows the old result: reset both.
            self.current_speaker_idx = 0
            self.segment_listbox.delete(0, tk.END)
            # Refresh the speaker list box
            self.speaker_listbox.delete(0, tk.END)
            for speaker in self.speakers:
                segs = self.speaker_segments[speaker]
                total_dur = sum(seg["duration"] for seg in segs)
                total_dur_min = total_dur / 60
                self.speaker_listbox.insert(
                    tk.END,
                    f"🔊 {speaker:12} | {len(segs):4d}段 | {total_dur_min:5.1f}分鐘",
                )
            self.status_label.config(
                text=f"載入成功:{len(self.speakers)} 個說話人,{len(self.result_data.get('segments', []))} 個片段"
            )
        except Exception as e:
            messagebox.showerror("錯誤", f"載入結果文件失敗:{e}")
            self.result_path = None
            self.result_label.config(text="載入失敗")

    def check_ready(self):
        """Enable the play buttons only when audio and result are both set."""
        if self.audio_path and self.result_path:
            self.status_label.config(text="✅ 就緒 - 請選擇說話人並播放")
            self.play_button.config(state=tk.NORMAL)
            self.play_all_button.config(state=tk.NORMAL)
        else:
            self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
            self.play_button.config(state=tk.DISABLED)
            self.play_all_button.config(state=tk.DISABLED)

    def on_speaker_select(self, event):
        """Show the segments of the speaker selected in the list box."""
        selection = self.speaker_listbox.curselection()
        if not selection:
            return
        self.current_speaker_idx = selection[0]
        speaker = self.speakers[self.current_speaker_idx]
        # Refresh the segment list
        self.segment_listbox.delete(0, tk.END)
        for i, seg in enumerate(self.speaker_segments[speaker], 1):
            start = seg["start"]
            end = seg["end"]
            duration = seg["duration"]
            self.segment_listbox.insert(
                tk.END,
                f"[{i:4d}] {speaker:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s)",
            )
        self.status_label.config(
            text=f"選擇:{speaker} - {len(self.speaker_segments[speaker])} 個片段"
        )

    def on_segment_double_click(self, event):
        """Play the segment that was double-clicked."""
        self.play_selected()

    def extract_and_play(self, start_sec: float, end_sec: float) -> bool:
        """Extract one time range from the audio file and play it.

        Blocks until the player exits. The temporary clip is always deleted.

        Returns:
            True when the clip was extracted and a player was started; False
            when ffmpeg failed, no player is installed, a program could not
            be launched, or playback was stopped before the player started.
        """
        duration = end_sec - start_sec
        temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        temp_path = temp_file.name
        temp_file.close()
        try:
            # Extract
            cmd = [
                "ffmpeg",
                "-y",
                "-loglevel",
                "quiet",
                "-i",
                self.audio_path,
                "-ss",
                str(start_sec),
                "-t",
                str(duration),
                "-acodec",
                "pcm_s16le",
                "-ar",
                "16000",
                "-ac",
                "1",
                temp_path,
            ]
            result = subprocess.run(cmd, capture_output=True)
            if result.returncode != 0:
                return False
            # Pick a player
            if os.path.exists("/usr/bin/afplay"):
                player = "afplay"
            elif os.path.exists("/usr/bin/aplay"):
                player = "aplay"
            else:
                return False
            # Stop may have been pressed while ffmpeg was running.
            if self.stop_flag:
                return False
            # Popen (not run) so stop_playing can terminate the player.
            proc = subprocess.Popen(
                [player, temp_path],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            self._player_proc = proc
            try:
                proc.wait()
            finally:
                self._player_proc = None
            return True
        except OSError:
            # ffmpeg or the player could not be launched. Report a failure
            # instead of letting the exception kill the worker thread, which
            # would leave the buttons disabled for good.
            return False
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    def play_segment(self, speaker: str, seg: dict, seg_idx: int, total: int):
        """Play one segment and post progress updates to the UI thread.

        Args:
            speaker: Speaker label shown in the status line.
            seg: Segment dict with ``"start"`` and ``"end"`` in seconds.
            seg_idx: 1-based position of the segment.
            total: Number of segments, used to scale the progress bar.

        Returns:
            True when the segment was played; False when playback had been
            stopped or extraction/playback failed.
        """
        if self.stop_flag:
            return False
        start = seg["start"]
        end = seg["end"]
        # Update the status line
        self.root.after(
            0,
            lambda: self.status_label.config(
                text=f"▶️ {speaker} [{seg_idx}/{total}] {start:.2f}s - {end:.2f}s"
            ),
        )
        # Update the progress bar and counter
        progress = (seg_idx / total) * 100
        self.root.after(0, lambda: self.progress_bar.config(value=progress))
        self.root.after(
            0, lambda: self.progress_label.config(text=f"{seg_idx} / {total}")
        )
        # Play
        if self.extract_and_play(start, end):
            return True
        # A user-requested stop is not a failure worth a warning dialog.
        if not self.stop_flag:
            self.root.after(
                0,
                lambda: messagebox.showwarning(
                    "警告", f"播放失敗:{speaker} [{seg_idx}]"
                ),
            )
        return False

    def _begin_playback(self) -> int:
        """Mark playback as running, lock the buttons, return a worker id."""
        self._playback_id += 1
        self.is_playing = True
        self.stop_flag = False
        self.play_button.config(state=tk.DISABLED)
        self.play_all_button.config(state=tk.DISABLED)
        self.stop_button.config(state=tk.NORMAL)
        return self._playback_id

    def _finish_playback(self, playback_id: int):
        """Called by a worker when done; ignored if it has been superseded."""
        if playback_id == self._playback_id:
            self.root.after(0, self.on_play_done)

    def play_selected(self):
        """Play the selected segment, or the whole speaker if none is selected."""
        if self.is_playing:
            return
        if not self.audio_path:
            self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
            return
        selection = self.segment_listbox.curselection()
        if not selection:
            # Nothing selected: play the current speaker's segments instead.
            if self.speakers:
                speaker = self.speakers[self.current_speaker_idx]
                segs = self.speaker_segments[speaker]
                if segs:
                    self.play_all()
            return
        # Play the selection
        seg_idx = selection[0]
        speaker = self.speakers[self.current_speaker_idx]
        segs = self.speaker_segments[speaker]
        seg = segs[seg_idx]
        playback_id = self._begin_playback()

        # Play in a background thread
        def play_thread():
            try:
                # Pass the real segment count so the progress stays <= 100 %.
                self.play_segment(speaker, seg, seg_idx + 1, len(segs))
            finally:
                self._finish_playback(playback_id)

        thread = threading.Thread(target=play_thread, daemon=True)
        thread.start()

    def play_all(self):
        """Play every segment of the currently selected speaker."""
        if self.is_playing or not self.speakers:
            return
        if not self.audio_path:
            self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
            return
        speaker = self.speakers[self.current_speaker_idx]
        segs = self.speaker_segments[speaker]
        if not segs:
            return
        playback_id = self._begin_playback()

        # Play in a background thread
        def play_thread():
            try:
                for i, seg in enumerate(segs, 1):
                    if self.stop_flag or playback_id != self._playback_id:
                        break
                    self.play_segment(speaker, seg, i, len(segs))
                    time.sleep(0.3)  # pause between segments
            finally:
                self._finish_playback(playback_id)

        thread = threading.Thread(target=play_thread, daemon=True)
        thread.start()

    def stop_playing(self):
        """Stop playback: invalidate the worker and kill the running player."""
        self.stop_flag = True
        self.is_playing = False
        # Any worker still running now holds a stale id and will exit.
        self._playback_id += 1
        proc = self._player_proc
        if proc is not None and proc.poll() is None:
            proc.terminate()
        self.on_play_done()

    def on_play_done(self):
        """Reset the controls after playback finished or was stopped.

        ``stop_flag`` is deliberately left untouched here: it is read to pick
        the status text and is cleared when the next playback starts.
        """
        self.is_playing = False
        self.play_button.config(state=tk.NORMAL)
        self.play_all_button.config(state=tk.NORMAL)
        self.stop_button.config(state=tk.DISABLED)
        self.progress_bar.config(value=0)
        self.progress_label.config(text="0:00 / 0:00")
        if self.stop_flag:
            self.status_label.config(text="⏹️ 已停止")
        else:
            self.status_label.config(text="✅ 播放完成")
def main():
    """Start the speaker player window.

    Prints installation instructions and returns without opening a window
    when tkinter could not be imported.
    """
    if not HAS_TKINTER:
        print("❌ tkinter 未安裝")
        print("請使用以下命令安裝:")
        # The hint must name the Tk build for the interpreter this script
        # runs under (python3.11 per the shebang), not an older one.
        print(" brew install python-tk@3.11")
        return
    root = tk.Tk()
    app = SpeakerPlayerGUI(root)
    root.mainloop()


if __name__ == "__main__":
    main()

View File

@@ -1,522 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Speaker Player GUI - 說話人語音播放器Face 整合版)
使用 tkinter 顯示播放進度、Speaker ID 和人臉信息
"""
import json
import subprocess
import tempfile
import os
import threading
import time
from pathlib import Path
try:
import tkinter as tk
from tkinter import ttk, filedialog, messagebox
HAS_TKINTER = True
except ImportError:
HAS_TKINTER = False
class SpeakerPlayerGUI:
"""說話人語音播放器 GUIFace 整合版)"""
def __init__(self, root):
self.root = root
self.root.title("🎬 Speaker Player - Face Integration")
self.root.geometry("1200x800")
# 數據
self.audio_path = None
self.result_path = None
self.face_path = None
self.result_data = None
self.face_data = None
self.integrated_data = None
self.speaker_segments = {}
self.speakers = []
self.current_speaker_idx = 0
self.is_playing = False
self.stop_flag = False
# 創建界面
self.create_widgets()
def create_widgets(self):
    """Create and lay out all widgets of the window.

    Layout, top to bottom: two file-selection rows (audio + ASRX result, then
    the optional face result with its integrate button), the speaker list and
    segment list side by side, and a bottom area with playback buttons, the
    progress bar and the status line. The play, stop, play-all and integrate
    buttons all start disabled.
    """
    # Top: file selection
    top_frame = ttk.Frame(self.root, padding="10")
    top_frame.pack(fill=tk.X)
    # Row 1: audio file and ASRX result
    row1_frame = ttk.Frame(top_frame)
    row1_frame.pack(fill=tk.X)
    ttk.Label(row1_frame, text="📁 Audio:").pack(side=tk.LEFT)
    self.audio_label = ttk.Label(row1_frame, text="未選擇", width=50)
    self.audio_label.pack(side=tk.LEFT, padx=5)
    ttk.Button(row1_frame, text="選擇音頻", command=self.select_audio).pack(
        side=tk.LEFT, padx=5
    )
    ttk.Label(row1_frame, text=" 📊 ASRX:").pack(side=tk.LEFT, padx=(20, 0))
    self.result_label = ttk.Label(row1_frame, text="未選擇", width=50)
    self.result_label.pack(side=tk.LEFT, padx=5)
    ttk.Button(row1_frame, text="選擇結果", command=self.select_result).pack(
        side=tk.LEFT, padx=5
    )
    # Row 2: face result (optional)
    row2_frame = ttk.Frame(top_frame)
    row2_frame.pack(fill=tk.X, pady=(5, 0))
    ttk.Label(row2_frame, text="👤 Face:").pack(side=tk.LEFT)
    self.face_label = ttk.Label(row2_frame, text="未選擇 (可選)", width=50)
    self.face_label.pack(side=tk.LEFT, padx=5)
    ttk.Button(row2_frame, text="選擇 Face", command=self.select_face).pack(
        side=tk.LEFT, padx=5
    )
    # Enabled by select_face once a face file has been chosen.
    self.integrate_button = ttk.Button(
        row2_frame,
        text="🔗 整合 Face",
        command=self.integrate_face,
        state=tk.DISABLED,
    )
    self.integrate_button.pack(side=tk.LEFT, padx=5)
    # Middle: speaker list and segment list
    mid_frame = ttk.Frame(self.root, padding="10")
    mid_frame.pack(fill=tk.BOTH, expand=True)
    # Left: speaker list (with face statistics)
    left_frame = ttk.LabelFrame(mid_frame, text="📢 說話人列表", padding="10")
    left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=False)
    self.speaker_listbox = tk.Listbox(
        left_frame, width=45, height=20, font=("Arial", 11)
    )
    self.speaker_listbox.pack(fill=tk.BOTH, expand=True)
    self.speaker_listbox.bind("<<ListboxSelect>>", self.on_speaker_select)
    # Right: segment list (with face information)
    right_frame = ttk.LabelFrame(
        mid_frame, text="🎵 語音片段 + 👥 人臉", padding="10"
    )
    right_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=10)
    # Segment list with a scrollbar
    list_frame = ttk.Frame(right_frame)
    list_frame.pack(fill=tk.BOTH, expand=True)
    scrollbar = ttk.Scrollbar(list_frame)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    self.segment_listbox = tk.Listbox(
        list_frame,
        width=65,
        height=20,
        font=("Courier", 9),
        yscrollcommand=scrollbar.set,
    )
    self.segment_listbox.pack(fill=tk.BOTH, expand=True)
    scrollbar.config(command=self.segment_listbox.yview)
    self.segment_listbox.bind("<Double-Button-1>", self.on_segment_double_click)
    # Bottom: playback controls and progress
    bottom_frame = ttk.Frame(self.root, padding="10")
    bottom_frame.pack(fill=tk.X)
    # Playback controls
    control_frame = ttk.Frame(bottom_frame)
    control_frame.pack(fill=tk.X)
    self.play_button = ttk.Button(
        control_frame, text="▶️ 播放所選", command=self.play_selected, width=15
    )
    self.play_button.pack(side=tk.LEFT, padx=5)
    self.play_button.config(state=tk.DISABLED)
    self.stop_button = ttk.Button(
        control_frame, text="⏹️ 停止", command=self.stop_playing, width=10
    )
    self.stop_button.pack(side=tk.LEFT, padx=5)
    self.stop_button.config(state=tk.DISABLED)
    self.play_all_button = ttk.Button(
        control_frame, text="▶️▶️ 播放全部", command=self.play_all, width=15
    )
    self.play_all_button.pack(side=tk.LEFT, padx=5)
    self.play_all_button.config(state=tk.DISABLED)
    # Progress bar
    progress_frame = ttk.Frame(bottom_frame)
    progress_frame.pack(fill=tk.X, pady=(10, 0))
    ttk.Label(progress_frame, text="⏱️ 進度:").pack(side=tk.LEFT)
    self.progress_bar = ttk.Progressbar(progress_frame, mode="determinate")
    self.progress_bar.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=10)
    self.progress_label = ttk.Label(progress_frame, text="0:00 / 0:00", width=20)
    self.progress_label.pack(side=tk.LEFT)
    # Status bar
    self.status_label = ttk.Label(
        bottom_frame, text="就緒", relief=tk.SUNKEN, anchor=tk.W
    )
    self.status_label.pack(fill=tk.X, pady=(10, 0))
def select_audio(self):
    """Let the user pick the audio file and record the choice.

    Nothing changes when the dialog is cancelled. Otherwise the path is
    stored, its file name is shown, and readiness is re-evaluated.
    """
    chosen = filedialog.askopenfilename(
        title="選擇音頻文件",
        filetypes=[("WAV files", "*.wav"), ("All files", "*.*")],
    )
    if not chosen:
        return
    self.audio_path = chosen
    self.audio_label.config(text=Path(chosen).name)
    self.check_ready()
def select_result(self):
    """Let the user pick the ASRX result file, then load it.

    Nothing changes when the dialog is cancelled. Otherwise the path is
    stored, its file name is shown, the result is loaded and readiness is
    re-evaluated.
    """
    chosen = filedialog.askopenfilename(
        title="選擇 ASRX 結果文件",
        filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
    )
    if not chosen:
        return
    self.result_path = chosen
    self.result_label.config(text=Path(chosen).name)
    self.load_result()
    self.check_ready()
def select_face(self):
    """Let the user pick the face detection result file.

    Nothing changes when the dialog is cancelled. Otherwise the path is
    stored, its file name is shown, the integrate button is enabled and the
    status line asks the user to start the integration.
    """
    chosen = filedialog.askopenfilename(
        title="選擇 Face 檢測結果",
        filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
    )
    if not chosen:
        return
    self.face_path = chosen
    self.face_label.config(text=Path(chosen).name)
    self.integrate_button.config(state=tk.NORMAL)
    self.status_label.config(text="✅ Face 已選擇 - 請點擊整合")
def integrate_face(self):
"""整合 Face 與 ASRX"""
if not self.face_path or not self.result_path:
messagebox.showwarning("警告", "請先選擇 Face 和 ASRX 文件")
return
self.status_label.config(text="🔄 整合中...")
self.root.update()
try:
# 載入 Face 數據
with open(self.face_path, "r", encoding="utf-8") as f:
self.face_data = json.load(f)
# 重新載入 ASRX 數據並整合
self.load_result(integrate_with_face=True)
self.status_label.config(text="✅ Face 整合完成")
self.integrate_button.config(state=tk.DISABLED)
except Exception as e:
messagebox.showerror("錯誤", f"整合失敗:{e}")
self.status_label.config(text="❌ 整合失敗")
def load_result(self, integrate_with_face=False):
"""載入 ASRX 結果"""
try:
with open(self.result_path, "r", encoding="utf-8") as f:
self.result_data = json.load(f)
# 分組
self.speaker_segments = {}
for seg in self.result_data.get("segments", []):
speaker = seg["speaker"]
if speaker not in self.speaker_segments:
self.speaker_segments[speaker] = []
self.speaker_segments[speaker].append(seg)
# 排序
for speaker in self.speaker_segments:
self.speaker_segments[speaker].sort(key=lambda x: x["start"])
# 說話人列表(按時長排序)
self.speakers = sorted(
self.speaker_segments.keys(),
key=lambda s: sum(seg["duration"] for seg in self.speaker_segments[s]),
reverse=True,
)
# 更新列表框
self.speaker_listbox.delete(0, tk.END)
for speaker in self.speakers:
segs = self.speaker_segments[speaker]
total_dur = sum(seg["duration"] for seg in segs)
total_dur_min = total_dur / 60
# 如果有 Face 數據,計算有人臉的片段數
face_info = ""
if integrate_with_face and self.integrated_data:
speaker_integrated = [
item
for item in self.integrated_data
if item["speaker"] == speaker
]
with_face = sum(
1 for item in speaker_integrated if item.get("has_face", False)
)
face_info = f" | 👥 {with_face}/{len(segs)}"
self.speaker_listbox.insert(
tk.END,
f"🔊 {speaker:12} | {len(segs):4d}段 | {total_dur_min:5.1f}分鐘{face_info}",
)
total_segments = len(self.result_data.get("segments", []))
self.status_label.config(
text=f"載入成功:{len(self.speakers)} 個說話人,{total_segments} 個片段"
)
except Exception as e:
messagebox.showerror("錯誤", f"載入結果文件失敗:{e}")
self.result_path = None
self.result_label.config(text="載入失敗")
def check_ready(self):
"""檢查是否就緒"""
if self.audio_path and self.result_path:
self.status_label.config(text="✅ 就緒 - 請選擇說話人並播放")
self.play_button.config(state=tk.NORMAL)
self.play_all_button.config(state=tk.NORMAL)
else:
self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
self.play_button.config(state=tk.DISABLED)
self.play_all_button.config(state=tk.DISABLED)
def on_speaker_select(self, event):
"""說話人選擇事件"""
selection = self.speaker_listbox.curselection()
if not selection:
return
self.current_speaker_idx = selection[0]
speaker = self.speakers[self.current_speaker_idx]
# 更新片段列表
self.segment_listbox.delete(0, tk.END)
for i, seg in enumerate(self.speaker_segments[speaker], 1):
start = seg["start"]
end = seg["end"]
duration = seg["duration"]
# 如果有整合 Face 數據
face_info = ""
if self.integrated_data:
matching = [
item
for item in self.integrated_data
if abs(item["start"] - start) < 0.1 and item["speaker"] == speaker
]
if matching and matching[0].get("has_face", False):
face_info = " 👥✅"
elif matching:
face_info = " 👥❌"
self.segment_listbox.insert(
tk.END,
f"[{i:4d}] {speaker:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s){face_info}",
)
self.status_label.config(
text=f"選擇:{speaker} - {len(self.speaker_segments[speaker])} 個片段"
)
def on_segment_double_click(self, event):
"""片段雙擊事件"""
self.play_selected()
def extract_and_play(self, start_sec: float, end_sec: float) -> bool:
"""提取並播放音頻"""
duration = end_sec - start_sec
temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
temp_path = temp_file.name
temp_file.close()
try:
# 提取
cmd = [
"ffmpeg",
"-y",
"-loglevel",
"quiet",
"-i",
self.audio_path,
"-ss",
str(start_sec),
"-t",
str(duration),
"-acodec",
"pcm_s16le",
"-ar",
"16000",
"-ac",
"1",
temp_path,
]
result = subprocess.run(cmd, capture_output=True)
if result.returncode != 0:
return False
# 播放
if os.path.exists("/usr/bin/afplay"):
subprocess.run(["afplay", temp_path], capture_output=True)
elif os.path.exists("/usr/bin/aplay"):
subprocess.run(["aplay", temp_path], capture_output=True)
else:
return False
return True
finally:
if os.path.exists(temp_path):
os.unlink(temp_path)
def play_segment(self, speaker: str, seg: dict, seg_idx: int, total: int):
    """Play one segment and schedule the matching UI updates.

    Widget updates are queued with ``self.root.after`` rather than applied
    directly, so this method can be called from a playback worker thread.

    Args:
        speaker: Speaker label shown in the status line.
        seg: Segment dict; ``"start"`` and ``"end"`` (seconds) are read.
        seg_idx: 1-based position of the segment, used for display.
        total: Number of segments in the current run, used for display.

    Returns:
        True when the segment was played; False when a stop was requested
        before playback started or when extraction/playback failed.
    """
    if self.stop_flag:
        return False

    start = seg["start"]
    end = seg["end"]

    status_text = f"▶️ {speaker} [{seg_idx}/{total}] {start:.2f}s - {end:.2f}s"
    counter_text = f"{seg_idx}:{total}"
    # Keep the bar inside 0..100 even if a caller passes seg_idx > total,
    # and avoid dividing by zero for an empty run.
    progress = min(100.0, (seg_idx / total) * 100) if total > 0 else 0.0

    self.root.after(0, lambda: self.status_label.config(text=status_text))
    self.root.after(0, lambda: self.progress_bar.config(value=progress))
    self.root.after(0, lambda: self.progress_label.config(text=counter_text))

    if self.extract_and_play(start, end):
        return True

    self.root.after(
        0,
        lambda: messagebox.showwarning(
            "警告", f"播放失敗:{speaker} [{seg_idx}]"
        ),
    )
    # Report the failure instead of claiming success.
    return False
def play_selected(self):
    """Play the segment highlighted in the segment list.

    When nothing is highlighted, falls back to ``play_all`` for the current
    speaker (if that speaker has any segments). Otherwise the highlighted
    segment is played on a daemon thread so the Tk main loop stays
    responsive; ``on_play_done`` is scheduled on the UI thread afterwards.
    """
    selection = self.segment_listbox.curselection()
    if not selection:
        # No explicit choice: play the whole speaker instead.
        if self.speakers:
            speaker = self.speakers[self.current_speaker_idx]
            if self.speaker_segments[speaker]:
                self.play_all()
        return

    seg_idx = selection[0]
    speaker = self.speakers[self.current_speaker_idx]
    segs = self.speaker_segments[speaker]
    seg = segs[seg_idx]
    # Report the position within the speaker's list; passing a total of 1
    # produced "[5/1]" and a progress value above 100%.
    total = len(segs)

    self.is_playing = True
    self.stop_flag = False
    self.play_button.config(state=tk.DISABLED)
    # Also lock "play all" so a second playback cannot start concurrently.
    self.play_all_button.config(state=tk.DISABLED)
    self.stop_button.config(state=tk.NORMAL)

    def play_thread():
        try:
            self.play_segment(speaker, seg, seg_idx + 1, total)
        finally:
            # Always restore the buttons, even if playback raised.
            self.root.after(0, self.on_play_done)

    threading.Thread(target=play_thread, daemon=True).start()
def play_all(self):
    """Play every segment of the currently selected speaker in order.

    Returns immediately when there are no speakers or the current speaker has
    no segments. Otherwise the buttons are switched to the "playing" state and
    a daemon thread plays the segments one by one, checking ``self.stop_flag``
    before each and pausing 0.3 s after each; ``on_play_done`` is scheduled on
    the UI thread when the loop ends.
    """
    if not self.speakers:
        return

    speaker = self.speakers[self.current_speaker_idx]
    queue = self.speaker_segments[speaker]
    if not queue:
        return

    self.is_playing = True
    self.stop_flag = False
    for widget, new_state in (
        (self.play_button, tk.DISABLED),
        (self.play_all_button, tk.DISABLED),
        (self.stop_button, tk.NORMAL),
    ):
        widget.config(state=new_state)

    def _worker():
        for position, segment in enumerate(queue, start=1):
            if self.stop_flag:
                break
            self.play_segment(speaker, segment, position, len(queue))
            time.sleep(0.3)  # short gap between consecutive segments
        self.root.after(0, lambda: self.on_play_done())

    worker = threading.Thread(target=_worker, daemon=True)
    worker.start()
def stop_playing(self):
    """Request that playback stop and reset the controls.

    The playback loop checks ``self.stop_flag`` before each segment; a
    segment that is already playing runs to its end.
    """
    self.stop_flag = True
    self.is_playing = False
    self.on_play_done()
    # Re-assert the request after the UI reset: if the reset clears the flag,
    # the worker loop would otherwise never observe the stop and keep playing.
    self.stop_flag = True
def on_play_done(self):
    """Return the controls to the idle state and report how playback ended.

    The status line distinguishes a user-requested stop from a normal finish.
    ``self.stop_flag`` is read here but deliberately not cleared: a worker
    thread may still need to see it, and ``play_selected`` / ``play_all``
    clear it themselves when the next playback starts.
    """
    # Capture the stop request before touching any state; clearing the flag
    # first made the "stopped" message unreachable.
    was_stopped = self.stop_flag

    self.is_playing = False
    self.play_button.config(state=tk.NORMAL)
    self.play_all_button.config(state=tk.NORMAL)
    self.stop_button.config(state=tk.DISABLED)
    self.progress_bar.config(value=0)
    self.progress_label.config(text="0:00 / 0:00")

    if was_stopped:
        self.status_label.config(text="⏹️ 已停止")
    else:
        self.status_label.config(text="✅ 播放完成")
def main():
    """Launch the speaker player window, or print install hints without tkinter."""
    if HAS_TKINTER:
        window = tk.Tk()
        app = SpeakerPlayerGUI(window)
        window.mainloop()
        return

    # tkinter is unavailable: explain how to install it and exit quietly.
    for hint in (
        "❌ tkinter 未安裝",
        "請使用以下命令安裝:",
        " brew install python-tk@3.9",
    ):
        print(hint)


if __name__ == "__main__":
    main()

View File

@@ -1,267 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
Interactive Speaker Audio Player - 交互式說話人語音播放器
可以選擇播放哪個說話人的哪些片段
"""
import json
import subprocess
import tempfile
import os
from pathlib import Path
from typing import List, Dict
def load_asrx_result(result_path: str) -> Dict:
    """Read the ASRX result file at ``result_path`` (UTF-8 JSON) and return it parsed."""
    raw_text = Path(result_path).read_text(encoding="utf-8")
    return json.loads(raw_text)
def extract_and_play(audio_path: str, start_sec: float, end_sec: float) -> bool:
    """Cut ``[start_sec, end_sec)`` out of ``audio_path`` and play it.

    The clip is written as 16 kHz mono PCM WAV into a temporary directory that
    is removed when the call returns, then handed to ``afplay`` or ``aplay``.
    The call blocks until the player exits.

    Args:
        audio_path: Path of the source audio file.
        start_sec: Clip start, in seconds from the beginning of the audio.
        end_sec: Clip end, in seconds.

    Returns:
        True when both extraction and playback exit successfully. False when
        the range is empty or reversed, when ffmpeg or a player is not on
        PATH, or when either tool reports a failure.
    """
    import shutil  # function-scope: not needed by the rest of the module

    duration = end_sec - start_sec
    if duration <= 0:
        # Nothing to extract; do not hand ffmpeg a zero or negative "-t".
        return False

    # Resolve the tools through PATH. The previous check looked only at
    # /usr/bin/<player> but then executed the bare name, and a missing ffmpeg
    # raised FileNotFoundError and ended the interactive session.
    ffmpeg = shutil.which("ffmpeg")
    if ffmpeg is None:
        return False
    player = shutil.which("afplay") or shutil.which("aplay")
    if player is None:
        print(" ⚠️ No audio player found")
        return False

    with tempfile.TemporaryDirectory() as work_dir:
        clip_path = os.path.join(work_dir, "segment.wav")
        extract_cmd = [
            ffmpeg,
            "-y",
            "-loglevel",
            "quiet",
            # "-ss" before "-i" seeks in the input instead of decoding and
            # discarding everything up to the start position.
            "-ss",
            str(start_sec),
            "-i",
            audio_path,
            "-t",
            str(duration),
            "-acodec",
            "pcm_s16le",
            "-ar",
            "16000",
            "-ac",
            "1",
            clip_path,
        ]
        if subprocess.run(extract_cmd, capture_output=True).returncode != 0:
            return False

        playback = subprocess.run([player, clip_path], capture_output=True)
        return playback.returncode == 0
def show_menu(speaker_segments: Dict[str, List[Dict]], speaker_id: str):
"""顯示選單"""
segs = speaker_segments[speaker_id]
total_duration = sum(seg["duration"] for seg in segs)
print(f"\n{'=' * 70}")
print(f"🔊 {speaker_id}")
print(f"{'=' * 70}")
print(f" Segments: {len(segs)}")
print(
f" Total duration: {total_duration / 60:.1f} minutes ({total_duration:.1f}s)"
)
print(f"{'=' * 70}")
# 顯示前 20 個片段
for i, seg in enumerate(segs[:20], 1):
start = seg["start"]
end = seg["end"]
duration = seg["duration"]
print(
f" [{i:3d}] {speaker_id:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s)"
)
if len(segs) > 20:
print(f" ... and {len(segs) - 20} more segments")
print(f"\n{'=' * 70}")
print("Commands:")
print(f" [1-{min(20, len(segs))}] Play specific segment")
print(" all Play all segments (may take a while)")
print(" first N Play first N segments")
print(" next Next speaker")
print(" prev Previous speaker")
print(" list List all speakers")
print(" quit Exit")
print(f"{'=' * 70}")
def interactive_player(audio_path: str, result_path: str):
    """Run the interactive command loop for browsing and playing speaker segments.

    Segments from the result file are grouped by speaker and sorted by start
    time; speakers are ordered by total speaking time, longest first. The loop
    then reads commands until ``quit``/``exit``/``q``, EOF or Ctrl-C:

    * ``<number>``  play that segment of the current speaker
    * ``all``       play every segment of the current speaker
    * ``first N``   play the first N segments (N must be positive)
    * ``next`` / ``prev``  switch speaker (wraps around)
    * ``list``      show all speakers with their share of the total duration

    Args:
        audio_path: Path of the source audio file.
        result_path: Path of the ASRX result JSON.
    """
    result = load_asrx_result(result_path)
    segments = result.get("segments", [])
    total_duration = result.get("total_duration", 0)

    # Group by speaker, each group ordered by start time.
    speaker_segments = {}
    for seg in segments:
        speaker_segments.setdefault(seg["speaker"], []).append(seg)
    for group in speaker_segments.values():
        group.sort(key=lambda item: item["start"])

    # Speakers with the most speaking time come first.
    speakers = sorted(
        speaker_segments,
        key=lambda name: sum(seg["duration"] for seg in speaker_segments[name]),
        reverse=True,
    )

    print("\n🎬 Speaker Audio Player")
    print(f"📁 Audio: {audio_path}")
    print(f"📊 Speakers: {len(speakers)}")
    print(f"{'=' * 70}")

    if not speakers:
        # Without this guard the first speakers[0] lookup raises IndexError.
        print(" ⚠️ No speaker segments found in result file")
        return

    def play_batch(speaker, batch, shown_total):
        """Play ``batch`` in order, printing one progress line per segment."""
        print("=" * 70)
        for i, seg in enumerate(batch, 1):
            print(
                f" [{i:3d}/{shown_total}] {speaker} | "
                + f"{seg['start']:7.2f}s - {seg['end']:7.2f}s ({seg['duration']:5.2f}s)",
                end="",
                flush=True,
            )
            # Finish the progress line with a visible outcome marker.
            if extract_and_play(audio_path, seg["start"], seg["end"]):
                print(" ✅")
            else:
                print(" ❌")
        print("=" * 70)

    current_speaker_idx = 0
    while True:
        current_speaker = speakers[current_speaker_idx]
        current_segs = speaker_segments[current_speaker]
        show_menu(speaker_segments, current_speaker)

        try:
            cmd = input(f"\n▶️ {current_speaker} > ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            print("\n\nExiting...")
            break

        if not cmd:
            continue

        if cmd.isdecimal():
            # isdecimal() only accepts characters int() can parse; isdigit()
            # also accepts e.g. superscript digits, which made int() raise.
            idx = int(cmd) - 1
            if 0 <= idx < len(current_segs):
                seg = current_segs[idx]
                print(f"\n 🔊 {current_speaker} - Segment {idx + 1}")
                print(
                    f" ⏱️ {seg['start']:.2f}s - {seg['end']:.2f}s ({seg['duration']:.2f}s)"
                )
                print(" ▶️ Playing...", end="", flush=True)
                if extract_and_play(audio_path, seg["start"], seg["end"]):
                    print(" ✅ Done")
                else:
                    print(" ❌ Failed")
            else:
                print(f" Invalid segment number (1-{len(current_segs)})")

        elif cmd == "all":
            print(
                f"\n 🔊 {current_speaker} - Playing all {len(current_segs)} segments..."
            )
            play_batch(current_speaker, current_segs, len(current_segs))

        elif cmd.startswith("first "):
            try:
                n = int(cmd.split()[1])
                if n <= 0:
                    # A negative N would silently slice from the end.
                    raise ValueError(n)
            except (IndexError, ValueError):
                print(" Usage: first N")
            else:
                print(f"\n 🔊 {current_speaker} - Playing first {n} segments...")
                play_batch(current_speaker, current_segs[:n], n)

        elif cmd == "next":
            current_speaker_idx = (current_speaker_idx + 1) % len(speakers)

        elif cmd == "prev":
            current_speaker_idx = (current_speaker_idx - 1) % len(speakers)

        elif cmd == "list":
            print(f"\n{'=' * 70}")
            print("📢 All speakers:")
            print(f"{'=' * 70}")
            for i, speaker in enumerate(speakers, 1):
                segs = speaker_segments[speaker]
                total_dur = sum(seg["duration"] for seg in segs)
                pct = total_dur / total_duration * 100 if total_duration > 0 else 0
                print(
                    f" {i:2d}. 🔊 {speaker:12} | {len(segs):4d} segments, "
                    + f"{total_dur:7.1f}s ({pct:5.1f}%)"
                )
            print(f"{'=' * 70}")
            print(f" Current: 🔊 {speakers[current_speaker_idx]}")
            print(f"{'=' * 70}")

        elif cmd in ("quit", "exit", "q"):
            print("\nExiting...")
            break

        else:
            print(f" Unknown command: {cmd}")
def main():
    """Parse the command line, verify both input files exist, then start the player."""
    import argparse

    cli = argparse.ArgumentParser(description="Interactive Speaker Audio Player")
    cli.add_argument("audio_path", help="原始音頻文件路徑")
    cli.add_argument("result_path", help="ASRX 結果 JSON 路徑")
    options = cli.parse_args()

    # Check the audio file first, then the result file; stop at the first miss.
    for label, candidate in (
        ("Audio", options.audio_path),
        ("Result", options.result_path),
    ):
        if not Path(candidate).exists():
            print(f"Error: {label} file not found: {candidate}")
            return

    interactive_player(options.audio_path, options.result_path)


if __name__ == "__main__":
    main()

View File

@@ -1,164 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
GUI Face Player 自動化測試腳本
測試所有功能並生成測試報告
"""
import json
import subprocess
from pathlib import Path
def check_file_exists(path, description):
    """Report whether ``path`` exists, printing its size in MB.

    Args:
        path: File path to check.
        description: Human-readable label printed in front of the path.

    Returns:
        True when the path exists, False otherwise.
    """
    target = Path(path)
    exists = target.exists()
    # Both branches used to yield an empty string, so the printed line did
    # not show whether the check passed.
    status = "✅" if exists else "❌"
    size = target.stat().st_size / 1024 / 1024 if exists else 0
    print(f"{status} {description}: {path} ({size:.1f} MB)")
    return exists
def check_process_running(pattern):
    """Report whether any process command line matches ``pattern`` (via ``pgrep -f``).

    Args:
        pattern: Pattern handed to ``pgrep -f``.

    Returns:
        True when pgrep exits with status 0, False otherwise, including when
        pgrep itself cannot be executed.
    """
    try:
        result = subprocess.run(['pgrep', '-f', pattern], capture_output=True, text=True)
        running = result.returncode == 0
    except OSError:
        # pgrep is not available: treat as "not running" instead of aborting.
        running = False
    # Both branches used to yield an empty string, hiding the outcome.
    status = "✅" if running else "❌"
    print(f"{status} 進程:{pattern} ({'運行中' if running else '未運行'})")
    return running
def test_json_structure(path, required_keys, description):
    """Check that the JSON document at ``path`` contains every key in ``required_keys``.

    Prints one line describing the outcome. Any error while opening or parsing
    the file is printed and counted as a failure.

    Returns:
        True when no required key is missing, False otherwise.
    """
    try:
        with open(path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)
        absent = [name for name in required_keys if name not in payload]
        if not absent:
            print(f"{description}: 結構正確")
            return True
        print(f"{description}: 缺少鍵 {absent}")
        return False
    except Exception as exc:
        print(f"{description}: {exc}")
        return False
def test_integration_script():
    """Run the Face + ASRX integration script and check its reported match rate.

    The script is started relative to the current working directory with a
    120 s timeout. The check passes only when the literal text ``99.8%``
    appears in its standard output.

    Returns:
        True when the expected match rate is printed; False otherwise,
        including when the script times out or cannot be started.
    """
    print("\n" + "="*70)
    print("測試整合腳本")
    print("="*70)

    cmd = [
        'python3',
        'integrate_face_asrx_speaker.py',
        '/tmp/face_long.json',
        '/tmp/asrx_charade_optimized.json',
        '--threshold', '3.0',
        '--stats'
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except subprocess.TimeoutExpired:
        # A hung script must fail this check, not abort the whole test run.
        print("❌ 整合腳本:執行逾時 (120s)")
        return False
    except OSError as exc:
        print(f"❌ 整合腳本:無法執行 ({exc})")
        return False

    # The expected rate is specific to the Charade fixture files.
    if '99.8%' in result.stdout:
        print("✅ 整合腳本:匹配率正確 (99.8%)")
        return True

    print("❌ 整合腳本:匹配率異常")
    print(result.stdout)
    return False
def test_gui_startup():
    """Check that a ``speaker_player_gui_face`` process is currently running.

    Returns:
        True when the process is found, False otherwise.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("測試 GUI 啟動")
    print(banner)

    # Delegate the process lookup to the shared helper.
    if not check_process_running('speaker_player_gui_face'):
        print("❌ GUI 進程:未運行")
        return False

    print("✅ GUI 進程:正常運行")
    return True
def main():
    """Run every check in sequence and print a summary.

    All checks are executed even after an earlier one fails, so the summary
    lists every failing group.

    Returns:
        True when every check group passed, False otherwise.
    """
    rule = "=" * 70
    print(rule)
    print("GUI Face Player 自動化測試")
    print(rule)

    # Fixture files. A list (not a generator) so every check runs and prints.
    print("\n" + rule)
    print("測試文件")
    print(rule)
    files_ok = all([
        check_file_exists('/tmp/charade_audio.wav', '音頻文件'),
        check_file_exists('/tmp/asrx_charade_optimized.json', 'ASRX 結果'),
        check_file_exists('/tmp/face_long.json', 'Face 結果'),
        check_file_exists('/tmp/charade_integrated.json', '整合結果'),
    ])

    # JSON structure of each result file.
    print("\n" + rule)
    print("測試 JSON 結構")
    print(rule)
    json_ok = all([
        test_json_structure(
            '/tmp/asrx_charade_optimized.json',
            ['segments', 'n_speakers'],
            'ASRX 結果'
        ),
        test_json_structure(
            '/tmp/face_long.json',
            ['frames', 'frame_count'],
            'Face 結果'
        ),
        test_json_structure(
            '/tmp/charade_integrated.json',
            ['integrated_segments', 'speaker_stats'],
            '整合結果'
        ),
    ])

    integration_ok = test_integration_script()
    gui_ok = test_gui_startup()

    # Summary.
    print("\n" + rule)
    print("測試總結")
    print(rule)

    all_ok = files_ok and json_ok and integration_ok and gui_ok
    if all_ok:
        print("✅ 所有測試通過!")
    else:
        print("❌ 部分測試失敗")
        if not files_ok:
            print(" - 文件測試失敗")
        if not json_ok:
            print(" - JSON 結構測試失敗")
        if not integration_ok:
            print(" - 整合腳本測試失敗")
        if not gui_ok:
            print(" - GUI 啟動測試失敗")

    print("\n" + rule)
    return all_ok


if __name__ == "__main__":
    success = main()
    # exit() is injected by the site module for interactive use and may be
    # absent (e.g. "python -S"); SystemExit is always available.
    raise SystemExit(0 if success else 1)

View File

@@ -1,240 +0,0 @@
#!/opt/homebrew/bin/python3.11
"""
長影片(Charade 1963,114 分鐘)完整測試腳本
"""
import json
import subprocess
from pathlib import Path
from datetime import datetime
def print_header(title):
    """Print ``title`` framed by two 70-character rules, preceded by a blank line."""
    rule = "=" * 70
    for line in ("\n" + rule, f" {title}", rule):
        print(line)
def test_data_files():
    """Check that each fixture file exists and print its size in MB.

    Returns:
        True when all four files exist, False otherwise.
    """
    print_header("1. 數據文件測試")

    files = {
        '音頻文件': '/tmp/charade_audio.wav',
        'ASRX 結果': '/tmp/asrx_charade_optimized.json',
        'Face 結果': '/tmp/face_long.json',
        '整合結果': '/tmp/charade_integrated.json'
    }

    all_ok = True
    for name, path in files.items():
        target = Path(path)
        exists = target.exists()
        size = target.stat().st_size / 1024 / 1024 if exists else 0
        # Both branches used to yield an empty string, so a missing file
        # looked the same as a present one with size 0.0 MB.
        status = "✅" if exists else "❌"
        print(f"{status} {name}: {size:.1f} MB")
        all_ok = all_ok and exists
    return all_ok
def test_asrx_results():
    """Print the ASRX summary and per-speaker distribution, then sanity-check it.

    Returns:
        True when the result reports at least 2 speakers and more than 100
        speech segments; False otherwise, including when the result file
        cannot be read or parsed.
    """
    print_header("2. ASRX 結果測試")

    try:
        with open('/tmp/asrx_charade_optimized.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as exc:
        # Fail this check instead of aborting the whole report.
        print(f"❌ 無法讀取 ASRX 結果:{exc}")
        return False

    total_duration = data.get('total_duration', 0)
    n_speakers = data.get('n_speakers', 0)
    n_segments = data.get('n_speech_segments', 0)

    print(f"📊 影片時長:{total_duration/60:.1f} 分鐘 ({total_duration:.1f}秒)")
    print(f" 說話人數量:{n_speakers}")
    print(f"📊 語音片段:{n_segments}")

    # Per-speaker statistics, longest speaking time first.
    print("\n📢 說話人分佈:")
    speaker_stats = data.get('speaker_stats', {})
    ranked = sorted(
        speaker_stats.items(),
        # .get() matches the lookups below; ['duration'] raised KeyError for
        # an entry without that field.
        key=lambda item: item[1].get('duration', 0),
        reverse=True,
    )
    for speaker, stats in ranked:
        duration = stats.get('duration', 0)
        count = stats.get('count', 0)
        pct = duration / total_duration * 100 if total_duration > 0 else 0
        print(f" {speaker}: {count} 片段,{duration/60:.1f}分鐘 ({pct:.1f}%)")

    return n_speakers >= 2 and n_segments > 100
def test_face_results():
    """Print face-detection statistics and check that some frames were detected.

    Returns:
        True when the result lists at least one frame; False otherwise,
        including when the result file cannot be read or parsed.
    """
    print_header("3. Face 結果測試")

    try:
        with open('/tmp/face_long.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as exc:
        # Fail this check instead of aborting the whole report.
        print(f"❌ 無法讀取 Face 結果:{exc}")
        return False

    total_frames = data.get('frame_count', 0)
    detected_frames = data.get('frames', [])
    fps = data.get('fps', 0)

    # frame_count defaults to 0, so the rate must not divide unconditionally.
    detection_rate = len(detected_frames) / total_frames * 100 if total_frames > 0 else 0

    print(f"📊 總數:{total_frames:,}")
    print(f"📊 檢測到人臉:{len(detected_frames):,}")
    print(f"📊 FPS: {fps:.2f}")
    print(f"📊 檢測率:{detection_rate:.2f}%")

    return len(detected_frames) > 0
def test_integration():
    """Print Face + ASRX matching statistics and check the overall match rate.

    Returns:
        True when at least 95% of the integrated segments have a face;
        False otherwise, including when the file cannot be read or parsed.
    """
    print_header("4. Face + ASRX 整合測試")

    try:
        with open('/tmp/charade_integrated.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, json.JSONDecodeError) as exc:
        # Fail this check instead of aborting the whole report.
        print(f"❌ 無法讀取整合結果:{exc}")
        return False

    segments = data.get('integrated_segments', [])
    total = len(segments)
    with_face = sum(1 for seg in segments if seg.get('has_face', False))
    match_rate = with_face / total * 100 if total > 0 else 0

    print(f"📊 總片段:{total}")
    print(f"📊 有人臉:{with_face}")
    print(f"📊 匹配率:{match_rate:.2f}%")

    # Per-speaker match details.
    print("\n📢 說話人匹配詳情:")
    speaker_stats = data.get('speaker_stats', {})
    for speaker, stats in sorted(speaker_stats.items()):
        total_seg = stats.get('total_segments', 0)
        with_face_seg = stats.get('with_face', 0)
        rate = with_face_seg / total_seg * 100 if total_seg > 0 else 0
        # The best and worst tiers used to share an empty marker.
        if rate >= 99:
            status = "✅"
        elif rate >= 50:
            status = "⚠️"
        else:
            status = "❌"
        print(f" {status} {speaker}: {with_face_seg}/{total_seg} ({rate:.1f}%)")

    return match_rate >= 95
def test_gui_process():
    """Check for a running ``speaker_player_gui_face`` process and print its usage.

    When the process is found, the third and fourth columns of the matching
    ``ps aux`` rows are printed as CPU and memory percentages.

    Returns:
        True when ``pgrep`` finds the process, False otherwise.
    """
    print_header("5. GUI 進程測試")

    lookup = subprocess.run(['pgrep', '-f', 'speaker_player_gui_face'],
                            capture_output=True, text=True)
    running = lookup.returncode == 0
    if not running:
        print("❌ GUI 進程未運行")
        return running

    print(f"✅ GUI 進程運行中 (PID: {lookup.stdout.strip()})")

    # Resource usage: scan the process table for the GUI's rows.
    listing = subprocess.run(['ps', 'aux'], capture_output=True, text=True)
    for row in listing.stdout.split('\n'):
        if 'speaker_player_gui_face' not in row or 'grep' in row:
            continue
        fields = row.split()
        if len(fields) < 8:
            continue
        print(f" CPU: {fields[2]}%, 記憶體:{fields[3]}%")

    return running
def test_playback():
    """Check the playback tool chain and extract the first ASRX segment with ffmpeg.

    Nothing is actually played: the check verifies that ``ffmpeg`` and
    ``afplay`` are on PATH and that ffmpeg can cut the first segment out of
    the fixture audio.

    Returns:
        True when both tools are available and extraction succeeds; False
        otherwise, including when the ASRX result is unreadable or empty.
    """
    import shutil    # function-scope: only this check needs them
    import tempfile

    print_header("6. 播放功能測試")

    # Tool availability. Both outcomes used to print an empty marker.
    ffmpeg_ok = shutil.which('ffmpeg') is not None
    print(f"{'✅' if ffmpeg_ok else '❌'} ffmpeg: {'可用' if ffmpeg_ok else '不可用'}")

    afplay_ok = shutil.which('afplay') is not None
    print(f"{'✅' if afplay_ok else '❌'} afplay: {'可用' if afplay_ok else '不可用'}")

    # First segment of the ASRX result.
    try:
        with open('/tmp/asrx_charade_optimized.json', 'r', encoding='utf-8') as f:
            asrx_data = json.load(f)
    except (OSError, json.JSONDecodeError) as exc:
        print(f"❌ 無法讀取 ASRX 結果:{exc}")
        return False

    segments = asrx_data.get('segments') or []
    if not segments:
        # segments[0] used to raise on an empty or missing list.
        print("❌ ASRX 結果沒有任何片段")
        return False

    first_seg = segments[0]
    start = first_seg['start']
    end = first_seg['end']
    duration = end - start

    print("\n🎵 測試提取第一個片段:")
    print(f" 時間:{start:.2f}s - {end:.2f}s ({duration:.2f}s)")

    extract_ok = False
    if ffmpeg_ok:
        # A private scratch directory replaces the fixed /tmp file, which was
        # left behind whenever extraction failed part-way.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file = Path(work_dir) / 'test_segment.wav'
            cmd = [
                'ffmpeg', '-y', '-loglevel', 'quiet',
                '-i', '/tmp/charade_audio.wav',
                '-ss', str(start),
                '-t', str(duration),
                str(temp_file)
            ]
            result = subprocess.run(cmd, capture_output=True)
            extract_ok = result.returncode == 0 and temp_file.exists()
            print(f"{'✅' if extract_ok else '❌'} 音頻提取: {'成功' if extract_ok else '失敗'}")
            if extract_ok:
                size = temp_file.stat().st_size / 1024
                print(f" 文件大小:{size:.1f} KB")
    else:
        print("❌ 音頻提取: 失敗")

    return ffmpeg_ok and afplay_ok and extract_ok
def generate_report():
    """Run all checks, print a summary and write a Markdown report.

    The report is written to ``/tmp/long_movie_test_report.md``.

    Returns:
        True when every check passed, False otherwise.
    """
    print_header("測試報告")

    # Each check runs while the list is built, in this order.
    tests = [
        ("數據文件", test_data_files()),
        ("ASRX 結果", test_asrx_results()),
        ("Face 結果", test_face_results()),
        ("整合結果", test_integration()),
        ("GUI 進程", test_gui_process()),
        ("播放功能", test_playback())
    ]

    passed = sum(1 for _, result in tests if result)
    total = len(tests)

    print("\n" + "="*70)
    print(f" 測試總結:{passed}/{total} 通過")
    print("="*70)

    for name, result in tests:
        # Both outcomes used to print an empty marker, so the summary did not
        # show which checks failed.
        status = "✅" if result else "❌"
        print(f"{status} {name}")

    if passed == total:
        print("\n🎉 所有測試通過!")
    else:
        print(f"\n⚠️ {total - passed} 個測試失敗")

    # Persist the same summary as Markdown.
    report_path = '/tmp/long_movie_test_report.md'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# 長影片測試報告\n\n")
        f.write(f"**測試時間**: {datetime.now().isoformat()}\n")
        f.write("**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
        f.write("## 結果\n\n")
        f.write(f"**通過**: {passed}/{total}\n\n")
        for name, result in tests:
            status = "✅" if result else "❌"
            f.write(f"- {status} {name}\n")

    print(f"\n📄 報告已保存:{report_path}")
    return passed == total


if __name__ == "__main__":
    success = generate_report()
    # exit() is injected by the site module for interactive use and may be
    # absent (e.g. "python -S"); SystemExit is always available.
    raise SystemExit(0 if success else 1)

View File

@@ -126,6 +126,52 @@ def extract_speech_audio(audio_path, model, utils, output_dir=None):
return speech_audios, speech_segments
def scan_within_segment(wav, sample_rate, start_sec, end_sec, model, utils,
                        min_speech_duration_ms=500, min_silence_duration_ms=300):
    """Run VAD inside one time range and return the speech sub-segments found.

    Intended for splitting a coarse time span (e.g. one reported by Whisper)
    at the pauses between sentences.

    Args:
        wav: Full audio waveform; must support ``len()`` and slicing.
        sample_rate: Sampling rate of ``wav``.
        start_sec: Start of the range to scan, in seconds.
        end_sec: End of the range to scan, in seconds.
        model: VAD model, passed through to the timestamp function.
        utils: VAD utilities; the first element is the timestamp function.
        min_speech_duration_ms: Minimum speech duration passed to the VAD.
        min_silence_duration_ms: Minimum silence duration passed to the VAD.

    Returns:
        A list of ``(start_sec, end_sec)`` tuples on the original time axis.
        Empty when the range lies outside the audio or is empty/reversed.
    """
    get_speech_timestamps = utils[0]

    # Clamp to the audio bounds: a negative start index would otherwise slice
    # from the END of the array and scan the wrong audio.
    total_samples = len(wav)
    start_sample = min(max(int(start_sec * sample_rate), 0), total_samples)
    end_sample = min(max(int(end_sec * sample_rate), 0), total_samples)
    if end_sample <= start_sample:
        # Nothing to scan; do not hand the VAD an empty buffer.
        return []
    segment_wav = wav[start_sample:end_sample]

    speech_ts = get_speech_timestamps(
        segment_wav,
        model,
        sampling_rate=sample_rate,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        return_seconds=True,
    )

    # Shift back to the original time axis. When the start was clamped the
    # slice begins at 0 s, so that is the offset to add.
    offset_sec = start_sec if start_sec >= 0 else 0.0
    return [
        (ts["start"] + offset_sec, ts["end"] + offset_sec)
        for ts in speech_ts
    ]
if __name__ == "__main__":
# 測試 VAD
import sys

View File

@@ -0,0 +1,35 @@
"""
Whisper Local - uses faster-whisper for per-segment transcription
"""
import numpy as np
def load_model(size="small"):
    """Create a faster-whisper model of the given size, on CPU with int8 compute.

    The import is deferred so that importing this module does not require
    faster-whisper to be installed.
    """
    from faster_whisper import WhisperModel

    runtime_options = {"device": "cpu", "compute_type": "int8"}
    return WhisperModel(size, **runtime_options)
def transcribe_segment(wav, sample_rate, start_sec, end_sec, model):
    """Transcribe the ``[start_sec, end_sec)`` slice of ``wav`` with ``model``.

    Args:
        wav: Full audio waveform; must support ``len()`` and slicing.
        sample_rate: Sampling rate of ``wav``.
        start_sec: Slice start in seconds.
        end_sec: Slice end in seconds; clipped to the end of the audio.
        model: Object whose ``transcribe(audio, language=None)`` returns a
            ``(segments_iterable, info)`` pair.

    Returns:
        A dict with ``"text"`` (segment texts joined by spaces, stripped),
        ``"language"``, ``"lang_prob"`` and ``"segments"`` (the model's
        segment objects as a list). All fields are empty when the requested
        slice contains no samples.
    """
    total_samples = len(wav)
    # Clamp both ends: a negative start would slice from the end of the
    # array, and a reversed or out-of-range span yields no audio at all.
    start_sample = max(int(start_sec * sample_rate), 0)
    end_sample = min(int(end_sec * sample_rate), total_samples)
    if start_sample >= end_sample:
        return {"text": "", "language": "", "lang_prob": 0.0, "segments": []}

    segment_wav = wav[start_sample:end_sample]
    # language=None lets the model detect the language for this slice.
    segments_generator, info = model.transcribe(segment_wav, language=None)

    # The model yields segments lazily; materialise them so they can be both
    # joined into text and returned to the caller.
    segs = list(segments_generator)
    text = " ".join(seg.text for seg in segs).strip()

    return {
        "text": text,
        "language": info.language if info else "",
        "lang_prob": info.language_probability if info else 0.0,
        "segments": segs,
    }

View File

@@ -1,346 +1,293 @@
2bfe6a1c1263f35916d4a28981814515fc40cb473f7bbc801f84842904c888f6 ./add_yolo_to_chunks.py
f61f7126698018b346c8bafc45501708c17e3b45d9db54be5f0109afeee63176 ./age_benchmark.py
8efb13239db2a25a728abbdebd92affe685b69402a277cceb0d76e62ed9451ac ./analyze_asr_lip.py
432b3e3b30578e71ef973aca758bd1964102cbbb19530620df8ac02df00eefb8 ./analyze_video_faces.py
732609ef1882e14dc7ed60488697f6ae7e2607ec90b240a86ea9e585f052b9be ./apply_asr_corrections.py
790bd25424e93ca5a0743ea1a740a9a70f6ae6f8a9ca411012eb1e9b03907eb4 ./asr_benchmark_runner.py
18744dc3bebdce0d89ea7076b5e43febd35ad3c84064bb52adde4d128d50bc9f ./asr_face_stats.py
1577d055328a73561f9ccfaf0c54727532e3dddcd1bf0f33e3c38081415cced8 ./asr_model_benchmark.py
fcbb81639f53e9e08bee436853c84d918c0eeac09d985b34634d5ddc00055b61 ./asr_processor_base.py
25948a204e45ce844d43606b7e45c9532321d48df44887d261fc886748276b10 ./asr_processor_contract_v1.py
e9209cf028a11bdc45514124826374e58458ee06b054cfedffe8013d751735ea ./asr_processor_contract_v2.py
407dd0ec772027e0df27af0b66ea8130cb390595ccdeca4350e7bdc210acee6c ./asr_processor_debug.py
dcee1b80071b47c974bcffe3d27ec2f2269f4b8de7e7409ceaec7e6f271d31aa ./asr_processor_legacy_v2.py
10728a05a6ff2d56a70bb831abb51e05b03309e45bc5fa068c5a0702a4c73769 ./asr_processor_legacy.py
9106bfe07de9cfc920f4f4d2f821dc024df612f4c2a8f5f75d35f012d26440f0 ./asr_processor_simplified.py
7eabdcf7320302ee65c67e801f3ac7ca5801abc76165faa182348d30a8113e9f ./asr_processor_small_multilingual.py
2714f7be88f286635ea8465daf8fa969e6b27d2b2d1f73ac5e98f5e496139cad ./asr_processor_small.py
1089ff10b9b0a9f528cac79580aec25e33f8eeea485ac44b6aaf8c7c0cab5b42 ./asr_processor_v2.py
e9e622d737990bea8ecc139fa310a7cb4b0ca0309d6783f8105e74f864dfb850 ./asr_processor.py
5431b57d4369a841d51a6d6c5e1fb5e6c2932cb97cb4601f5e1b41ffe9f7ecaf ./asr_side_by_side_comparison.py
6c11efc3d40e559bfbeadcbf4f51eb353b744cc4f765bd8abc472a701e3f33cb ./asrx_processor_contract_v1.py
93501463af84d6541405057da3783d40492aec5e536b4210dcaffe460cdb5503 ./asrx_processor_custom.py
6adfbee842d134b9d180e2d1104694ed5cdc1fa4febcd0c502801b8f87b3ce66 ./asrx_processor_simplified.py
60fc3465f9c461583f8d0b888e85b3a6e04e1f252a1e1c21d036b52e1ce4b43c ./asrx_processor_v2_noalign.py
82d65b71bd86874e484870c40214d3fbd9343c39d5d635896fb4d257d13a410f ./asrx_processor_v2_transcribe.py
5a0c9905a2e10c847aa74f108e4054de4704bbafb2004589db15bf33833ea3c7 ./asrx_processor_v2.py
b16b00cf9e5de96abc512022af9bb81196405b10988f5a39dfd3a9b6471f1155 ./asrx_processor.py
d570fbe89bf84c50f180e8f3ec26c30092e07e3fa4883fb83a644670c13b8588 ./asrx_self/__init__.py
3b7a788e5fe2fa1a7518bf2a639ccd09b304b264b952c88a3e6612aba30faef5 ./asrx_self/integrate_face_asrx_speaker.py
1fe4b9ac1d04c2f2ef5361d8325cf9333e434b126be6a53a4c0d40a04f32a34d ./asrx_self/main_fixed.py
e4a2894bd4207f6d034c86e1d232001e2e0f9e65856c89d84d8a038473a5e50b ./asrx_self/main.py
46f61075b403729e4ff9bf0b05367b5319acf5d8c696a0517033699dcba36276 ./asrx_self/speaker_audio_player.py
2a072521662906e5ca84ec54cb1963930a1c795f8d64906b66e889c0f442198b ./asrx_self/speaker_cluster_fixed.py
db4ddc98d563bf4a8c34fcd1fe40edd34fab63fa8c293644a8a40ae87be521dd ./asrx_self/speaker_cluster.py
a50d0ae549b733532f940332e4656a4dcf0623703240eb74832524eedf54f888 ./asrx_self/speaker_encoder.py
42f325168e1f6edd514eb00321f18ce581f7b61d18c50798271c3da8410cb248 ./asrx_self/speaker_player_gui_face.py
54a847a8862e2f7400c4d8425f4bebaeb230fd50932933734819fbb6729bb560 ./asrx_self/speaker_player_gui.py
43508b714f2f1aa8bacdb9c4f52152f3fd14f6c2e2529460e5b24b29846c8c37 ./asrx_self/speaker_player_interactive.py
e25e789552fef129bd6f536140ec4deead8e242091ab60ab679b544ff9d43307 ./asrx_self/test_gui_face_player.py
788014df1faf7cfa09fbce16781f8bf9da1acef75e8891592b3b4d51b91e93f2 ./asrx_self/test_long_movie.py
8bac63ea24cd06b9d398c2650ac396e10db64e33f0686a01bd460e17286e7574 ./asrx_self/vad.py
f11b67ada6167540d2f95cb2af93d0e3a0de55bce659745baa37c4aa4805212e ./audio_taxonomy_processor_v2.py
ded810b81cda24e31e82de14ba9846770ee2b18d84d52b9d570de5877e9e2513 ./audio_taxonomy_processor.py
f7c53be5a031a8bff15c3165543586529932d81c4312521654d132b1f0ed6bc3 ./auto_identify_persons.py
5497a6f1f7ae267c796a398a9f020ea485aa45f980f2eca932b904ad61ce9b40 ./backfill_demographics.py
39a479ca4f8986f3255b0bcd0d9162a1f2ae339bb4dcf081f931ff9b304797a1 ./backfill_frame_data.py
308c8e3f3d45ee273504f9f415eaf6c025f06aaf1cca33156a66431ed6e64f43 ./build_semantic_index_poc.py
4eb37768edd252d94f0d751f219c317e905bc093f414b2a6350efb8294131138 ./build_semantic_index.py
debbd058957d09c2397f3f4c028edaa0a658002921dcca95eae2a20070ba95fb ./caption_processor_contract_v1.py
7236cdb5deaeada266cc246ee11380248bb9f2255888c25a152b2f6ab1f981cc ./caption_processor.py
e73cbb688dade5c5b6fc4276f0c78b377903ff83f3830b63d8bcdacd8da8aecf ./check_all_stamps.py
7ecdbd4b1f94be8ebab9935ea210a868330e7030b6e19c73229c579c1189fd5c ./check_architecture_all.py
7179ed1a87241904af29542f9018398f8afd9b9dd89af7bb11909310ab7b49e0 ./check_architecture_docs.py
7e6bd7d14582e494baf8b28354bbded3f79b43f0bd271ab33874da55b9086311 ./check_code_document_consistency.py
5ffca7c55edafad755e84499981553fcb48ce6056ca7b04130acafb9e6a9b1c3 ./check_frame_112_36.py
f49c7b0cfa53b657f69b2ad97a6e18393741cc2151b32c9d7dde2e078b75953f ./check_frame_91_59.py
d2cb7475262ee711a4b06e53559f0927242be4a924a56e7fe212225f318f4193 ./chinese_vector_test.py
ecde3d3df773916f62de4e34f8d8693feaedf112a3ef9955e22417c8421722bd ./chunk_statistics.py
2588ecf27c13020d894e46ba70a76de89f09556b475f555dae59db36da0b90a0 ./clean_sentence_text.py
98ab1129032f42fddc020f9b3492d1fc133851d1af33ddeb57e2385d88425af4 ./clip_logo_integration.py
bf6f74c09b8f8c7f25c5fffb9c36f16a8afb483a7b65903cfc75e2ea641bdf49 ./compare_asr_content.py
1f2caadcded724aa04a929018a35ace53dd79d172f5ee2720308fbd4581b0c6c ./compare_asr_models.py
1ed8a9530f40e304b556ff76c7cac40468c86a0cd32ff2a8bc7bf2a69669121d ./compare_models_gun_test.py
6bf790fe75a7a2a5220052ca14c31e90a97eabc4558cd5e9059280913862a81e ./compare_search.py
875e7a598982c8ad7222a51b7b147e91cd5e1a930f41214b3942107cb932fc5c ./compare_segmentation.py
e432b6f2364d5a9aaf207a1de0dca3fb14ab8d118c53ee34306abfe6fd211ba8 ./comprehensive_search_test.py
43df85cf860ac28e083de35b511bb2a7b91ed48f596757f52f19487768987500 ./coreml_embed_server.py
9149ccc8de5adfec69c6f3f2ec502ae7d5e7844518a228ba587af2e08cb38805 ./crop_opencv_stamp.py
fc36ecbb1455d959456945266e193b601a29c4210b4938a3f0d4a9aaf44b5cee ./crop_real_stamps.py
34a694624ce94d916b06a847bc4d41e7665985b85e55a626a4bc3a4370c21acf ./crop_stamp_112_36.py
27099dc9c8ee52a6949ce18c505089afef1720fe70858b90d0801972c3b43fff ./crop_stamp_closeup.py
01b5a3b091ebcffc0c1e2637b7af8192ba597239fa80d152738e3b8cfdf8174d ./crop_stamp.py
71b2a362b5395c6e4d70e62766820db92d94eaf140d98eecb2880bcd98d55be9 ./crop_top_candidates.py
60f18c5fa03ffbc80c209337cd1c8b6acd0b8471e600119340aa8cdfeef14f5b ./cut_benchmark_runner.py
deba86a1645ca5b1acf413dd9edfad77b93ff213897d739a32de1ba629bfce52 ./cut_processor_contract_v1.py
01024f947f0326c124293a30e4f2cdb859f21cfb2d4c07f9c1030e2934f7bc44 ./cut_processor.py
ff092ad2373b57321f87d1dd123fff8a99c8207057591e8526e56cb1424d47c6 ./dashboard.py
f184bf3e546db0253ffb71895e8d42aeb06588c71c4914c2fe656f42ef463c9a ./debug_face_registration.py
a9acce1ebd6ea821a8dc5009b8fc40586a98d31c23e93c97fd844bdadbda4ed2 ./deep_analysis_112_36.py
7767ee7455a956d14d286ad558c4c312c2ad3ccee1c73adc1bc8f761c96ad72a ./demo_dashboard.py
425290c12161c5cfcb0c505a737ba3951656b39e425e792919d4812e15b9b8e3 ./demo_face_learning.py
d7e3e27e6a65b1fa62530ee954c227dbb4f97593c5a5dcc48b39e5ebae4656e5 ./dense_scan_traces.py
df79b7fc7a03a8e754de5123a23bb33b1d5c23d832adc1886fb846ca517dd24d ./detect_language.py
f6f8047e24ebbec81ef27dd38f4242e63385f8ebe5be471cae156b8aa5fc4477 ./detect_objects_keyframes.py
e61d2ef5043bda3674a0050d83ba3bc6a70c47f54e456124a736b4328f0c0638 ./detect_stamp_shapes.py
f23a382113e9c7de2ec3b24e95160daef48f9336ae6d4ec9ee7a18f4bf529f6d ./download_places365_classes.py
a747e5e17960b972549714786bb9e28ea578e10e6c80788e298a0149c970bcc5 ./embed_faces.py
f1a2b3820e1a763eba6d8d905a5bb87f5a9b4a2f005e709e313bb7505ba7ddaa ./embeddinggemma_server.py
43c540c02c1be992e7d44ab4fc76a759815db3ed5f25bcbb594328b50ed7c73b ./export_file_package.py
19d23e4604d5532928412afe4d5d39ff49194ab4a046825286ae1be154326a1f ./export_file.py
5f10bab1dcb0b5fad233a74069f9e2f89043e7c848c9c38ae7e2806e6940c75d ./export_identities.py
2a1d0a1b853fd2c28f9a404871d33912f93521358576833be0999271bae02bcb ./export_person_thumbnails.py
a81bf1d6af78c052e638f5d5677b4edb512d0de5441025d86fd970d3e7993922 ./export_sqlite.py
8b5cc0ff437fb4dd0df28b7b20a78469cdca3621e2eeb4b6d46ad2391acb0596 ./extract_female_faces.py
bdecbaf0496bf536dce2ef4897f7090749820d15dcca03492d4d736ab0f8c6c5 ./face_benchmark_runner.py
22319a38bd684fb235fec681ddc60f45821e4bb2181f2b31fdf945f7ad9a1b85 ./face_clustering_processor.py
5adce4e444743331fa592e13d71e52f26554eadb9744d350a7654a449a8fb8a3 ./face_count_comparison.py
3574454c74eaf11021f9052f77d93044cca4ae0285d0f2630b4016c2ec0df783 ./face_cross_validate.py
4f09b3b66b14a5eefb14fcf915a1ad1e9147010f6ae7671731566679b1cae461 ./face_embedding_extractor.py
87f1b69affbac03fbd87331a99cd7c4faba6c72d359ffcfebb62d6ad8f70445b ./face_landmark_qc.py
28776dfcc6ac40e9481c25467438745fed60fecdfd4fc19f9f4c7396397591a7 ./face_mediapipe_test.py
f4d1b4334a49357b74b80e390ad5a3d16263e51cbe5cab661af92bd2e9721f02 ./face_processor_contract_v1.py
802015c73dfce0866f2a0bc94c645aa35ba30a6de78244af23090bb1f1828c6e ./face_processor_mps.py
96ffdbde3f4d87e9942f9e1f4c93cbd999dc404b43e00d4cdcbb22de3c0f16b7 ./face_processor_optimized.py
17e7d0bd142bddfead94b1dd959c1f41c0dad7063ffc677dff1a99d62aab6cf8 ./face_processor_v1.py
d6ddad29a5e53b43b887554072d7965f0535e47fb62dad1a8b87e44fa1be6015 ./face_processor.py
8edab61189ad1a8fa60c203077e814e82d46c5bae67054fa2ab1958e199c05f9 ./face_recognition_processor.py
9ea19f357b3fcec6c8b3875c538e53cb46e407ab188cd544963e0123e535fa03 ./face_registration.py
72648816de611fd9b84d2b98c177b8b4f24374024b69184e8151c06cf44d633b ./face_statistics_report.py
499f197a06f50839ebd5350af380fa56506ce08f073ba40c0e863b8e02b34133 ./fast_face_clustering_processor.py
0191781635b98d0675969fb87733af19525d7b5c148723346c5378c08a00fe33 ./fast_stamp_search.py
00e7e8ed06f6a0f2c46c84a47d7e7f5d366acee941d546a52c4b1b7885c71e08 ./filter_stamp_colors.py
5341fd648cffafc77568070313b06417636943d50ff3b4380a61381260acaafa ./final_face_validation.py
213793ab719f4ef42ec9b22f351dd86d4739211c17be486a46b76ba7e64fd8f1 ./find_blue_stamp_opencv.py
e1490317c0f56b895f73cfbb6f57c8e3ea5c65304bfdd7663f103f6b564e148c ./find_kids_pose.py
08d4cba0650f6a22fc134d07fd15fe8784c8472c3ba687b587e31e0b980e2b1c ./find_kids_refined.py
aecec0784ce5d0e98176c15798f05d4f67ab6a686f9ffafba71fbd82157027f8 ./find_magnifying_glass.py
620db08dd84f00af0c6d744dac54c68360548dd5b2cc26b12ddcefd936239b2e ./find_pink_stamp.py
1f4555b3578f4dc6bc08aa37e34eda1d91ea25d8134439771678d1a57bfdaeb9 ./find_realistic_stamp_opencv.py
277aa3b48eec2e739de3bb95ef501ffbd24104aa2a1bdef28c844ef44fd75013 ./find_small_stamp_opencv.py
fc73bbc9605938db495bd33ea74955e454e9384130531a16d42f25dbd9b515d8 ./find_stamp_in_hands.py
c6ed0f12e78c12df977ddca5d699f58edb174b47199f584e7a24dbdc3b7d02b1 ./find_stamp_in_magnifier_scene.py
ecf12e346619c27a985452e9f84ee262c2da25de9df0ff6e0b293279ccba559b ./find_stamp_opencv.py
4ff93cbcc781a5cff023f78006f1aebbe2d954405ae7d00a473fef6b41b2ebee ./fix_asr_text.py
4090cb892115843a909aa41426c0f39c5a53d8d88a5db69499ec8bafcb780d77 ./florence2_scan_stamps.py
e90e4447db3328b64a2062ca13ed41f6a045220d8fb640542dff5b790d3c4d3b ./gdino_comparison_test.py
7071a9999057c347e2275381f1f0c58e19aa8581d70a572d3170ed14a295a48d ./gdino_frame_api.py
891410310b415ff68a0f7ee0aa39e84eef7f2c75887487bdb88b8f4718d40e94 ./generate_asr1.py
24efe7db016387b40bd9caae449f0445a3d47eb878c00399803bb6e78e6dd5fc ./generate_benchmark_summary.py
dc956a78a3ed26686f45dd6d6d9cb42c023751fcd9b8789585450b6df63670a1 ./generate_chunk_summaries.py
8a0922d75fdc7c5994ebfb31881d765db4b105cbcddfcaa4b4c49d11950b8df4 ./generate_chunk_visual_stats.py
4860bfd00cc6c1c842c2f8e17e725eebca191d81067af3cb5a28661b45d74bd3 ./generate_parent_chunks_gemma4.py
e9fca223a8329ff6bdcb8552fecedb2d8b4607c6516c373c3023f29edfd42e06 ./generate_sentence_summaries.py
cbae7c3e85457274e8c284005196c39dc97f9d9200ed6b0e4ea266e48a381d3a ./generate_synonyms_llamacpp.py
57512cd7a5ec2f52813717fd3d81dec1aaa69dc9c91a9edbca847e7012b1c86f ./generate_synonyms_ollama.py
dc495cb8127858fa03a5f8b8bb4a772c5934ada1abecf97459bf71de80417672 ./gun_detector_scan.py
1a7cfb72723b3b94e3f4fe368477ba693ac3d20ac7af7351962bc548c700b451 ./head_shoulder_bench.py
b2fe8e4d8d7d1057ba928fc5e190f4a06cb60e83e2a02c5d7c423791596c11b8 ./head_shoulder_quick.py
ba5e67a97cb465e6a1a942c2f7342406031759ffcea2b897ae963bee4bc551c4 ./hybrid_stamp_search.py
f5847b6c8ed4c7c51290df9032d5a192317b5f03b5ff418ead1181a6e1b655f2 ./identity_agent.py
61bea1980af5861a02d6e9b47ac5ad0bd04a4fd633af477d2179b7361ae58c01 ./identity_bind.py
046aa90eb4a4b830910912362a9865d1e6170f5bc176fae42be630f967f9d3ff ./import_file_package.py
7cc260d4411ab13559803686f8b645afa07738d652d9459830aecac268597fa7 ./import_file.py
071e3a5141d04cb9e6bd31489a835c778608785896b18ea7fa65e8db9f1547e5 ./insert_chunks.py
d3d53f44daa7f1526488677b141e90fbf4aa5625369b96a3ca275b802414802f ./integrate_face_asrx.py
4cb6a93ef8006cb69e8bdb1bc72899ee9bab1bf7eceaafe9896923bb7023bbd5 ./integrate_rule3_markers.py
75aa3e4bffc9f9cb8b9254db19095c93c3efb43d465fb5dcca8c7b9b730f5c59 ./integrated_body_action_decoder.py
f4dd2e21fb6b668bdf0c51cc56e214188b46937b96a2b4a10d13783e171d0472 ./language_router.py
bef426641645fcf7dcc68c87e3325a6edf3f70925febaf1df84f7c6ff87681e5 ./lip_analyzer.py
7f98b0cc8379b3759cc7e805dd56f736cc518093e83f43b2e5ecf559a19b95f0 ./lip_processor_cv.py
a1473eeba17fce25e4678234fe4e8793a132514e0566b03b36a0bec04eb93acb ./lip_processor_media.py
0df61396756ee22d35356776c189b354458661916c8baf85bcef97c9f8b62ec8 ./lip_processor_mp.py
3202aeca29e651ef1a54f47681c6b3b2d0680555fe3c6d318a932bb12b49e58c ./lip_processor_simple.py
fed15bafb5e09715cc03962f465b2ff618bf05ebeafdf932643690c9635c9840 ./lip_processor.py
1773054e8d563b493865880d0d8bda105e3eb6fb536a25817517237b3bb76afe ./magnifying_glass_analyze.py
7d4d048c452bf273f4a6d96da13eb7bab6aa60ca9dd51de5ca0fb0a01e587b13 ./magnifying_glass_extract.py
8528bbf89d2770fa5a23f461274038898be251fb6e48c5d3adece5aab3bf976d ./magnifying_glass_owl.py
cb645f5e29ee5a36b2f97812039abfdaed7328386bcd25ad7b742af6a6b16399 ./map_speakers_v2.py
a90bd3fb729a05010c29a213134c60cc0bdd17769e27a7d3f1250919b7bf1613 ./match_face_identity.py
2d864dc831c2fd0142b19b8ad2cda169c2a05facd9662d31861d29bb710c4979 ./match_face_with_pose_filtering.py
889d4853707896885ed96ab945d4266acb213f4b122e2ba7c4563eb0e3e9e865 ./match_identities_to_tmdb.py
b34ec373bcf65139e08e41967f58a2fc8ebb67a59c361074d3590cd16541415a ./match_speakers_to_chunks.py
fe6260a94d01d8b43d0d3b59eb820cfd7b4711c907343a1261c69f9010ae990d ./mediapipe_holistic_processor.py
bb36844b4d13bba8edc1b7f0703f02081b62bea795535b8cd8dcbfdb4281f402 ./migrate_asr_to_children.py
819312cbfce6e68a0d8d731e02d283946f79de6044f207991ddf9a28ac853d79 ./migrate_face_results.py
c418f6e50054fa7eae1d0d879e28997b98f57437acec48b53ecb09f332728867 ./migrate_to_4188.py
6f60aa899e06f05e575cb5b461ea517481119cc32644566245d74c96eccde722 ./multi_stage_stamp_search.py
b24e2289c00f803c8339f59c34d44ed6c53a3c19dafc13e72c4b260d6bb312a6 ./music_segmentation_processor.py
da2546f84d0dbd711c8800ae4e32e59d9c38de9e62e1b423c4518fa1fda1dbea ./natural_language_top10.py
78c3d1a9302dbfacdf9b3655dab07348957fd9dbb4af94aae83eefecd5343a33 ./natural_language_vector_detailed.py
e924f04d68c9a8211ad373da811aa6671d2c5654281c1634dbf8b1e5e5b51533 ./natural_language_vector_test.py
df6ac92367b1afb50c0af958e362d87555fe569f608a8d213e0a593e2a43cde8 ./object_search_agent.py
fd39b779a0337f521940f3f7b159931f1f207f200eefd610183781fdcf3dfafd ./object_search.py
42d2952fc78b57302b0d12bc3d45790a2c2c46d4ffa3c713a82686134bd63f13 ./ocr_benchmark_runner.py
7b3ccb5c4ddd4c62c5ad04d0e3aafaecc2c1441012b6a98613cdcf055e2e50e8 ./ocr_processor_contract_v1.py
271023eec42d6be4a1ce6ae2ce3f29e825210a57e6bb37554a6f7fdf54616f9a ./ocr_processor_mps.py
2e73c41285e52ef013594fcd4d20df9f5781bfc26bcf62e54dd2c04ec44200c3 ./ocr_processor.py
62196108cb3337b5f9a873d70d2981ac8f49152369afbcc8a12b3a13de579e80 ./opencv_stamp_search.py
b2e8d552c272fd173c77693e9453a85fe16dfc12f7c2cd304d299c6188c14077 ./paligemma_vs_gdino.py
1534d5b7617dbae77f7a37a2c33a89b90f965247a6828f00b73ea6b720f6f4fc ./parent_chunk_5w1h.py
5208c738d4b615282813d351daf09872ce516121bb604caa64968ef5e52c53d3 ./pipeline_checklist.py
8f80c3a2be5c330e2d1853d9250a171c75db84598dbf3304280c42237ed4fb1f ./pipeline_status.py
94db44c0f49115a677d117d4901a1b7991c1517905300eaa495dd62b8ac1c79c ./pose_processor_contract_v1.py
167dee5e42c6bd46674bcffcfd92f368fc0b48a1f42c459c806853b281bc6482 ./pose_processor_mps.py
a6ef3a785ef5c6dc47fa38dbed80d76bc7d4bf48cbaf0f7edb3d26df98d7262c ./pose_processor.py
45e6798dc5900f2f7c8776a2d260c122aae5068a075256b8a5c02e8d0be6c131 ./probe_file.py
139a68b5915680ec697d4bb5420adbd20b89637de2c16a15d68aca4fc22da02b ./qa/executor.py
4a59b36c29e1ee6e2b169db3b0201d2f7088c6ccbfdf642a3b522aeb182bbeea ./qa/judges/facenet.py
0dcea0258ae3309cdec93dc4dd534d1a42511c327d528a117c8e3085f5b30386 ./qa/judges/gdino.py
7c9392436477662bc1b49d719f0c78f96e8e7e180fd281d4c59c36fd241a3e6a ./qa/judges/gemma4.py
84c6f793538981bdafdc08bb9bd5f12401b442441fae54936f610a758d18e972 ./qa/judges/maskformer.py
2f9b5dd3373fdec77a84f117ab620230e208f96d015c960275ab60a0656575b6 ./qa/judges/paligemma.py
52dedc276f6f9806710f1ef510aabd88032afe4abad364f5963fd2bd5b6cf14d ./qa/judges/yolo.py
c4e4424aad1847d822e9cf7dc98a1b2e903735a61e8ec056c6a9be75f79486bd ./qa/pipeline.py
96f5ab509622118db307641082a19daff6b9a36bcc66451c35ed2abee4fe4249 ./qa/query_generator.py
00b1716423a184856bbe44d4132fd6d84ca13f3ae018964caa6f3389c1ab98a5 ./qa/scorer.py
01c7b3c30c1531224f9605f0ee633285fe8489ab2d0a3c9c6a41f2b2b60d6626 ./quick_stamp_search.py
e3143673a2bff6139e05c82446fd8770c4b7e59a854a42c3b29662f5ac75efe2 ./rebuild_parents.py
4aa98981632d4f8a11039c510e86aa296ae1cd4b399fc871ed664ac11e445bd9 ./rebuild_story_content.py
205cfc47b603b5ab94d97dae8c25486b342b7c2858afe6d6dae27615ca0b2aeb ./redis_publisher.py
750f778946b56bc57c47d9d2295332bb0f8cec2c1aa03c6b882d39ef4432673d ./refine_search.py
0f8a6a6866a5797e964d3b17e2b7ef146fe7a798f09fcea982fcda6f629b4d06 ./regenerate_parent_5w1h.py
3ee192b623f290136b36bd63abd018aad6e6639a9543970c3415734628b33bd6 ./register_sample_faces.py
334782f0f66d0ad3818a51adf6343186a2de65467378ab68a81ade806e496af9 ./release_manager.py
9a44cdd155953778b52ac0cfb118504c56eb6b1141984365ffbb717e28f3e65b ./release_pack.py
3906b48f3a7764d19605def2bf8ef84a54a6afe64c9291a7cc0881a91472a826 ./render_face_heatmap.py
44e432c31a35211a37dd26695772b7e250487ac42ba4f16a56f843277c2fabbf ./render_offline_report.py
3fac1e6a4125042185a2ce82771f695c562b3137c7aa58a912bada00ad8ecf78 ./rescan_single_frame_traces.py
9c3212cb455c2a6230be918448560fee00c153a8956ffd04fcb62974d5e1abff ./resume_framework.py
7c95ec08daf4f980bd53233503b7a4fa01afc08660e8fe8cd031ea3613ead8f7 ./save_events_to_db.py
24795e1531fe05e33d515104e4fb2f9567b46d802ef1b5a38f11268cf105be76 ./scan_charade_stamps.py
cad2da5073577f851c5cb2abdbd7cab05b39caa0d1179ccc89c378a7df2736c8 ./scan_full_video_stamps.py
03ae71470331fe5b7f8e394f7f789eee08cad4ed5ec9196b46ab2c9dbefa7fec ./scan_handheld_objects.py
d3935ba498786cf260d9d5370ca60d3af7bc4fd438f6be33ce23cfd0b7bab593 ./scan_keyframes_opencv.py
12c9b35212f587f5adb37584bf3c3844804d2bc642ebfc5d82b86b44f46d2472 ./scan_keyframes.py
f386130ac203308c904ba7efea09ce0ca0d640d36762b113bf0cfedc24d7f885 ./scene_classifier.py
482edae04e5467a68c77729760db53d3653e8d7654fa49e5ec9a36f1f8f22616 ./search_blue_stamp.py
e3786422932138272d1096ad4c800594e62c9640952a286a9158372a1e5443e3 ./search_envelope.py
2df1e259c2e52d10d79b20856cb94ffff5a9bfdbe47cee587b1148b2f1c16101 ./search_objects_in_hands.py
9fd49be8ab16f94fd82efc5ae035c029372a7ddeb7fd779b557f1917cdc14592 ./search_vase.py
7a6d8e7c435368f6218db972c04a7be16d7d6680d8d4374f82c05b7162716b9d ./select_face_reference_vectors_v2.py
2bcf7c1b3c407b51a134a5ee4982713f0ea387cfd6df01ed75554c94603971a6 ./select_face_reference_vectors_v3.py
d52098fcf1f9f7ba14f31a9a90bc5b3bc933e1a5e5697e3d09eff389c153cb18 ./select_face_reference_vectors.py
a02cb37639275d86ae0b4504d21f50963b45aaf94630c59472ba30d07722e50c ./simple_api_test.py
02516ab1616c1756c4f8041f48ff12811cc5d672c53b34850b84ce682fefdff1 ./simple_face_stats.py
b024d9bfe244d0d058daae0acd314b9344d6f0912e4f3b02dbc618f9fe3e4949 ./simple_test.py
af8703506769f3cdb89ff7849b071c2421307717850596dd86d2fe0b053e7809 ./smart_stamp_v2.py
5e5f86d47ea2b75bcaa8662689f73af1963645149c0da688dc43482616aa4e76 ./sound_event_detector.py
bab7697e4b4b05e93babc116e0c5b13cbaf1f4d419a65acd5dc1de5bdfc510dc ./speaker_assign.py
381ff240ce806ead7d6463ee40c5b830035eb6252180b4b0901b3c8313fa4bbd ./speaker_bind_lip.py
5eede29fa0966974c1943792d7fcca2dd9179d4f23570cf1a3964dc97bc9ac1e ./specific_stamp_search.py
d5363d832272bdb3c1d6f6d93eee7b7894893b9164a3f5ad5fa08a4a0eaeeb47 ./split_asr_segments.py
8e1269f173f2c72de78857c2d83d3111b62ec89bd79f4fb00c3f57390986ae4f ./step3_asr_fine.py
7592df8be5dc58376b33960bfa7fc0003c51114b70ebc01f1589f39ee9568d3b ./store_traced_faces.py
7ac32c1e2146a19e6654ab3e4bbbfd42e1a6540fb8717d40d55c61e9f5d1bf71 ./story_embed.py
74cc24b328a075f48b1f44a465611157f44eadc8f5dabf6d95cd5cc5f80dd9dc ./story_pipeline_full.py
97628f0f1270825dabafdf0a69f10ef12c4ffe2be4ac12941315f06bfb084e7c ./story_processor_contract_v1.py
1b1f42fc4bbff26551f26f4ac1e8a995dfe3ff98b940a29c9e130410965d0fa0 ./story_processor.py
cdbc7ef88551e2b3a3771eac5be5e0360989e71fa009ac28c97e548507e08a5e ./sync_face_speaker_to_chunks.py
8b08e9a33f5917aad10e070d6aa48805f5e7c23f905ba8fff3b8697b2109d962 ./sync_to_mongodb.py
f64cc6dcb72f54d3e97aa981b40591aef4804ca769e1f14628d901b98bc6aeac ./terminology_manager.py
455546b9bb3a2c2c877c7720229b254e75b28eea33b3715d1731c02ca85294ae ./test_api_correct_usage.py
b03dc1bbb091672e7da2b131850b17badac896b4fbba92fe9bce76c232c99be4 ./test_api_with_key_id.py
7d295c77d5bcd4c72c5673370af48cc89bbccf9292c3b82aad3a230d242547a9 ./test_args.py
f474ec88e6634decbf178da497443fa709096b174bb4a4320a07256f516b1044 ./test_asr_large_model.py
aa952524dd86f346740ffe555075b74adf2e60bb822bb04a943a51b1fd262445 ./test_birth_uuid.py
db87badad7948527325a528400d67a4eeef76abf8d13f5c4254c812e944e4e0c ./test_end_to_end.py
e191c98a82f7e089f7dccfc4c536244da2bf14339f982a3afef05d33332c3755 ./test_face_api_final.py
1b97c9aae2e1744aa7aefb192eaef86c64e6134efc8f08ffa9a274bff16a58d3 ./test_face_api_with_correct_key.py
f7e4078f31b1ca8494c18878219cf2f90c301f19fc851b9e7084657b71a5e150 ./test_face_api.py
9eafc49f8fa42b4cd58109e9b725b3aec3b06943ec426919b1788838ccf1ed92 ./test_face_db_fix.py
38bce82b167e0c97b257cc6b955fdc2e9ded581ce2d39eb0fd2c60249275394b ./test_face_direct.py
24e82bf0af82407e6c04361e9a671770cbfb0b05d92df589bd0d5a0118bb5a98 ./test_face_learning.py
8dcdb144c4253fbb466f220359b42c2a9579193865e320a56e682e384c2ae176 ./test_face_recognition_integration.py
b921e3256fdea176d4391116d1ead472c4f3ca8aac6999140367818818c35ec3 ./test_face_registration_api.py
9af6c6ff0c766b3de92185c3602f2b8b62b815bf88dcb0e3251c2676e61e0a48 ./test_face_tracker.py
4f70eadb6a8b80eb8febe32b17b77e58d1a4823cc5d598e5ea45555342d2d4cb ./test_florence2_direct.py
0588be0acea540950d737943073f71e769b6301374eaa4ff7fdb96a80145c4e0 ./test_florence2_pipeline.py
694c15193616157ddae4bdb0a45feada2a8f8490f01d290a28aa77a4b24eabb2 ./test_florence2_stamps.py
2c281f698616a83e9eeccd610555d9f9ab657b2deac65ae9e3dbfba0b450d9b0 ./test_identity_db.py
7a73e8314ea7e91ca9dad3867a83b9c1101fdab09bdc0fdac0f798d0a7a204f3 ./test_llm_capabilities.py
68300f87b96a474f06a3071a833e6b3ae48d1db5fb8a7e5a3ec1834fd878d808 ./test_multilingual.py
c17cdd0f4ffb7a151a634add08d13cc576ba7a848bb20f54fb97d0c1d9d81cc0 ./test_object_search.py
d07bd363a2878259fbf4ffcba40e367f7f1bf4171b5a5dfdda97f7a53b450d0e ./test_ollama_feasibility.py
8421003b1f66cbd21c6fe5d3aff0a526897753e959b23905ca8f502f644f66a5 ./test_owl_vit_debug.py
6f9e8b7947229ea4aa0a62b59bda5fcec05bd74f6c00dc4a7b06d932bd1b730f ./test_owl_vit_stamps.py
da91a7c97466ce7f03cde13aa9bf6e691b3e482d2cac74519a2e1a61a2abb05a ./test_parent_chunk_generation.py
19d9f2492d3b04b7dafa008f106767d3107dd36b0c8e4601765dca30131027cd ./test_places365_scene.py
de44553023067362e8b2223f03e1bff55fcbd2f11ddf3d01060dc02c4675a744 ./test_probe_file.py
c0e987ba06a61cc0426ffbca8af1eb51a97bd79acab59b70453cfbb18eaee093 ./test_processor_performance.py
7b4b55e23dff35ba107b3da5b0560d03b1b41dfdea1d3a59eac777b4be4d4033 ./test_pyannote_audio.py
5cb8b42033ffba41f25e7ef74ef04cf352c0c277a9971e9eaef53fd673902712 ./test_pyannote_multilingual.py
8580e689ae148754e03d958419e108241040a012584ba49e8a90db114a9f8c13 ./test_scene_api.py
1194d450070b1f42e045d98e532f41205bb3e52fc48ba26e7c9b72a188fe1b2c ./test_segment_count.py
147bfffeac9561cfa407207b04a825862ac623ba97deecf5ed7c6257432dc62c ./test_speechbrain.py
22e4b865bc769329c1146c2f914395044a9bc84cd2a13acf68fb374a57fe1e3e ./test_v2_detailed.py
a616570a2a080b5b19f4bf783877147e714a014103b274143dd37984a946ca08 ./test_v2_model.py
7b83611f6b3028500c91c62197f774c0769e299136eca8dc4b612a7b5743e3d6 ./test_v2_with_text.py
1dd983c78074a61ceec26d7e3623d40772ca55fd6ee63ba368afe756c66ae091 ./test_with_real_image.py
1b738cc0d69d33e967cbb775def0a7f58dc02f1911404af56a5825bd60a5b75b ./text_semantic_analysis.py
a4221417ae00add76881c6c715ee4257c263e2dfd0a846a8887738682dfe8cda ./thumbnail_extractor.py
0d188a738a0df79ead10065d9f17c366fe159c862bd4bafa2860d0e6ba2640c3 ./tkg_builder.py
8b97f0fdfc0899460bf23d420dba0a51a34737c74ebad0519856909d198662bf ./tmdb_cast_fetcher.py
4858909a0beaf8397becf4103be17fcc350841217afcdc1d917c48c512a9041b ./tmdb_embed_extractor.py
54d8321dfe0f8caa669e4a9d1b48dc772a5b25817eab95b552944140c91f457d ./tmdb_identity_integration.py
2a84aa2dcfb83ac385d2c394f884926f306c81798e4277a26dbd1f3c5506be46 ./trace_face_aggregator.py
61d3b4b362722ce24326a204f1b72cc7b1dcc20cf3264a4f526d4ea343a8d33d ./transcribe.py
ede9a184fd51ef4c87eb3e2541f09b91739a49986cb588591a7c6fbb33433020 ./unified_synonym_processor.py
a408f294c3a71eb6a0eea80b9b586f73dedcefe286c62233f713a7428a9979be ./update_all_demographics.py
e6520bb10ae6835ceade487ceb5e3fa549ca6f06de35b2c785d649921ef443f4 ./update_fine_speakers.py
a2191daff2ad228725b6a66f0e472ec659a6b4fa8f2cbbd74d1bf9c35cca63eb ./update_person_demographics.py
60060753cfd2a6d1241e55bf40a0c74f1df15739656d0349e22e8543036b2424 ./update_speaker_assignments.py
fdc61009c351263e0018801b32ad90ffd8919af611a2a0580546be7fd62c99c4 ./update_terminology.py
0d337c821b36eb7761c0e439b63b8192ff54a542c539d1279efa6854f0b0cdc2 ./utils/body_action_decoder.py
3b384a8d88f6147d1953b14bd6b55672f4f161885e29bc241a466d4cfbd50e99 ./utils/face_trace_visualizer.py
52a7b79ade15f213841c70416565d3c5e46c145c9a72724ce545143c6e0bdea8 ./utils/face_tracker.py
ecd902a4a6f1084d8396af0b4d88079105c84fa6170e3a394720a6452ff3aa3b ./utils/pose_action_decoder.py
29dd3e0f802c0347cd9d9465123915b4604c990d7250048b7ae388af03cf5f36 ./utils/pose_analyzer.py
bc6184153096e5cd8d89d02fa3279c6587f60a49c6b3366b4d82cee722bbf352 ./utils/pose_transition_analyzer.py
d0ec8f4a67c1a1eb1356ad6e9b2f466575691bd336621cdbbfd31dd10159f2dc ./utils/test_mediapipe.py
4840c11964a59eabad26b97fe01033ccaf7903e2d24edd5e1035f6dd5fc995ea ./vectorize_4188.py
078979114c5f248d2bfd43aa8df55235fa03ab812f26998b984cd485a3d2cda8 ./vectorize_chunk_summaries.py
ff98864f1b11795cc3bb64f30ccb6f8609771ddc7a5df2c003ba7c2233d16fc2 ./vectorize_chunks.py
5880c128400e6e36c8eb7dffd009dbbc99dd13f8575b0037bdc854e25ddc41fb ./video_comparison_statistics.py
0a1501ffdc027236cdf88706b3d61229e2998ab268fd57fb60e399ccb734b6a1 ./vision_agent.py
eac8f90fbbb655614abcefc4b887e346bf94db5f015d33d37bc9514fb030489d ./visual_chunk_processor.py
c165dfc5fc981dc731b25ef414184ee58e56b73b148d41a32fdce985c701efd5 ./visualize_stamp.py
6c65a82fdd1d585e20bee4fcb2d1bdec2e6220bda71d6ef9cd00d6a3cf74c4d7 ./voice_embedding_extractor.py
2b3a7b357db4ddd07ca30bf200c6600724e33441d8def0a4d9a39673e2cfb1c0 ./weather_sound_detector.py
206b61ebf3c91d7ce3f1488247b52aca6e955042d8aa979c59723e3ff10dd36a ./yolo_benchmark_runner.py
e8cb0963c90fbd1c2aa91141f80340edd3c9560d69780dd825d107c6ed14fa64 ./yolo_count_comparison.py
dad775ecdca0144bd14b7abaa7ec8fb213e8b9428e39906abce541e93db496b6 ./yolo_processor_contract_v1.py
74ff880e664ec514223a4f220b682fbc87089f8c0851c93ac68c97269b8a59b6 ./yolo_processor_mps.py
8af0a6db683b6626e07820b302135ac5960d38e3d4b3d187c640b23ce8a14f72 ./yolo_processor.py
e13cf22b9aeae96c7e28b4512dd2137743a25eb59027da446966c1aaaaf4ce71 ./zero_shot_combined_test.py
f4aaf017ff588999f06cd9ba1787517e06c6d6e6228a15a54d8aa4f54fde5eb3 ./zero_shot_gun_test.py
0a285b8ec33d7999e9d4ae8d43ce768c9f06ee1929e13a6809e98bdabe6357ce ./zero_shot_objects_test.py
5711c6d18acba76511a3f088d4d0f095b47c978a6c6ae3e086e2b7cbee7b9e55 ./backup_all.sh
c8860e3d55b99745265998abaae63efe28c83d7c1bfd91b30dfba54d146793d3 ./check_config.sh
6321793085bfb33b751b2848dddc41f13d9ead9763f6e581f9dcfceea9090f8b ./demo_identity_full_cycle.sh
77382d8671059ff99fd5ca3db42590de47ecf4e1555eea950bd3a7016b1547b0 ./deploy_package.sh
09bda12152917b969259ff3ca0bcda63f615bdf4873dbb8bb7f7ce5eec742a9f ./final_validation.sh
491e609bb43526b0c41d3dd060a3813bbeb3defc70fc88fe36f9fbbd2280e720 ./install_mongodb.sh
09e21960f0d7fdd00ff1d30334b753a8216ad17fc3644c9dbb129b4446ecc12c ./package_delivery.sh
0c2fe9288f9b51ad34aadf87093c1e1a423483ad7a972861ace811250e30204c ./package_file.sh
c233bb7b854dfd68e75808640fdea379af6952095a93cc8884d7e8b7ecbb4539 ./package_release.sh
02e85ba83e8d3da68bf9320ff25506714ce460736b8824309027a5ec375ea86b ./package_system.sh
7557f1999bde53ef397b78208713e8df8817171dfbc053d6bed130b57a229517 ./release_preflight_check.sh
091087dad7f38e8a0d98458b64fdeb0ac5770534f7dfebdbdf3b80d945ff39df ./security_check.sh
25711049adabfd179d4e19c2a4c4bd675ed9da8e8913ed1bdaac7519f6cde7ac ./setup_fresh_mac.sh
f6dae232edd5d2d111468be125609feb0dbd8db1895846f3d1c48f0e411e3a16 ./setup/01_postgresql.sh
8a405e2372ddb5958f7bfac15d330a2f189ffe2583ae37bba4c953ac45412c80 ./setup/check_momentry.sh
72dc22172a201a060a20f21b89af38c80ecb6399f594ecca81cafa8a918c764f ./setup/install_momentry.sh
5eccd14e8e4b3c91159b17756892dd03a7d26cb7bbc1961d783188ed10411770 ./setup/upgrade_momentry.sh
e48ab4673f71370dc7d4ce5c32d159bf9438e9e1dd7c9edd9c6053156fbdaa99 ./start_momentry.sh
ffe7e91a24fbfa826eb816f66cdb315097fe841a7b67a476865aec1ad7a4dda0 ./swift_processors/.build/checkouts/swift-argument-parser/Scripts/environment.sh
b2ee4f8a445a7e83f7b99ae5d4139fd525d9e3e58a360bfef054d441aa21d901 ./swift_processors/.build/checkouts/swift-argument-parser/Scripts/format.sh
9461213a77531fb3a5742fda0c9024304abe47988bb33852da55e96ae01a382a ./test_api_validation.sh
7cb98fb67007abe03bb57ef58a5e7499ae389693b33a14e015c9ef6061d6b0f5 ./test_face_recognition.sh
46bf67f794dbcd2c191f1933f1c05a1eef0ad3f5bb2e1d64e11e5f23a44ffc10 ./test_identity_agent.sh
7763d5bfbd83ede94e31eb8e44dd0d422fe2d1221b9e112d73fc637f29fdb7ea ./test_multilingual.sh
8a730fedf9252b7ed352b8447773c9c256f064fd64ca20efa05f9021766b09e5 ./test_search_modes_v2.sh
fbca5ba0783153c4e21c174b0cbf75b582514f6ef0f92750a82d3178bc170f48 ./test_search_modes.sh
f8c1647cdb4db8adef1829e41fbecd97f6b3b2e62927f195cd8e68127876069d ./troubleshoot.sh
992296b5218f3ef97ce53325be12f71848f3c3aeb3ee81d764bfe4bd61e1de05 ./verify_package.sh
b6f95fa070cc0258bc5d005f10d13025ba8b08d3ee1598bcdad405ff1d3332ed ./tmdb_agent.py
2bfe6a1c1263f35916d4a28981814515fc40cb473f7bbc801f84842904c888f6 add_yolo_to_chunks.py
f61f7126698018b346c8bafc45501708c17e3b45d9db54be5f0109afeee63176 age_benchmark.py
8efb13239db2a25a728abbdebd92affe685b69402a277cceb0d76e62ed9451ac analyze_asr_lip.py
432b3e3b30578e71ef973aca758bd1964102cbbb19530620df8ac02df00eefb8 analyze_video_faces.py
732609ef1882e14dc7ed60488697f6ae7e2607ec90b240a86ea9e585f052b9be apply_asr_corrections.py
790bd25424e93ca5a0743ea1a740a9a70f6ae6f8a9ca411012eb1e9b03907eb4 asr_benchmark_runner.py
18744dc3bebdce0d89ea7076b5e43febd35ad3c84064bb52adde4d128d50bc9f asr_face_stats.py
1577d055328a73561f9ccfaf0c54727532e3dddcd1bf0f33e3c38081415cced8 asr_model_benchmark.py
fcbb81639f53e9e08bee436853c84d918c0eeac09d985b34634d5ddc00055b61 asr_processor_base.py
25948a204e45ce844d43606b7e45c9532321d48df44887d261fc886748276b10 asr_processor_contract_v1.py
e9209cf028a11bdc45514124826374e58458ee06b054cfedffe8013d751735ea asr_processor_contract_v2.py
407dd0ec772027e0df27af0b66ea8130cb390595ccdeca4350e7bdc210acee6c asr_processor_debug.py
dcee1b80071b47c974bcffe3d27ec2f2269f4b8de7e7409ceaec7e6f271d31aa asr_processor_legacy_v2.py
10728a05a6ff2d56a70bb831abb51e05b03309e45bc5fa068c5a0702a4c73769 asr_processor_legacy.py
9106bfe07de9cfc920f4f4d2f821dc024df612f4c2a8f5f75d35f012d26440f0 asr_processor_simplified.py
7eabdcf7320302ee65c67e801f3ac7ca5801abc76165faa182348d30a8113e9f asr_processor_small_multilingual.py
2714f7be88f286635ea8465daf8fa969e6b27d2b2d1f73ac5e98f5e496139cad asr_processor_small.py
1089ff10b9b0a9f528cac79580aec25e33f8eeea485ac44b6aaf8c7c0cab5b42 asr_processor_v2.py
b9e826f23f080ae67f5961ad750ec2a6834cd18335955c3b3175b8cd06ebd6d3 asr_processor.py
5431b57d4369a841d51a6d6c5e1fb5e6c2932cb97cb4601f5e1b41ffe9f7ecaf asr_side_by_side_comparison.py
6c11efc3d40e559bfbeadcbf4f51eb353b744cc4f765bd8abc472a701e3f33cb asrx_processor_contract_v1.py
93501463af84d6541405057da3783d40492aec5e536b4210dcaffe460cdb5503 asrx_processor_custom.py
6adfbee842d134b9d180e2d1104694ed5cdc1fa4febcd0c502801b8f87b3ce66 asrx_processor_simplified.py
60fc3465f9c461583f8d0b888e85b3a6e04e1f252a1e1c21d036b52e1ce4b43c asrx_processor_v2_noalign.py
82d65b71bd86874e484870c40214d3fbd9343c39d5d635896fb4d257d13a410f asrx_processor_v2_transcribe.py
5a0c9905a2e10c847aa74f108e4054de4704bbafb2004589db15bf33833ea3c7 asrx_processor_v2.py
b16b00cf9e5de96abc512022af9bb81196405b10988f5a39dfd3a9b6471f1155 asrx_processor.py
f11b67ada6167540d2f95cb2af93d0e3a0de55bce659745baa37c4aa4805212e audio_taxonomy_processor_v2.py
ded810b81cda24e31e82de14ba9846770ee2b18d84d52b9d570de5877e9e2513 audio_taxonomy_processor.py
f7c53be5a031a8bff15c3165543586529932d81c4312521654d132b1f0ed6bc3 auto_identify_persons.py
5497a6f1f7ae267c796a398a9f020ea485aa45f980f2eca932b904ad61ce9b40 backfill_demographics.py
39a479ca4f8986f3255b0bcd0d9162a1f2ae339bb4dcf081f931ff9b304797a1 backfill_frame_data.py
77a98d9b7cb97eceae4c0fcf2c353933e0fb36ee7406b57d59b1e216b1a44601 build_docs.py
308c8e3f3d45ee273504f9f415eaf6c025f06aaf1cca33156a66431ed6e64f43 build_semantic_index_poc.py
4eb37768edd252d94f0d751f219c317e905bc093f414b2a6350efb8294131138 build_semantic_index.py
debbd058957d09c2397f3f4c028edaa0a658002921dcca95eae2a20070ba95fb caption_processor_contract_v1.py
7236cdb5deaeada266cc246ee11380248bb9f2255888c25a152b2f6ab1f981cc caption_processor.py
e73cbb688dade5c5b6fc4276f0c78b377903ff83f3830b63d8bcdacd8da8aecf check_all_stamps.py
7ecdbd4b1f94be8ebab9935ea210a868330e7030b6e19c73229c579c1189fd5c check_architecture_all.py
7179ed1a87241904af29542f9018398f8afd9b9dd89af7bb11909310ab7b49e0 check_architecture_docs.py
7e6bd7d14582e494baf8b28354bbded3f79b43f0bd271ab33874da55b9086311 check_code_document_consistency.py
5ffca7c55edafad755e84499981553fcb48ce6056ca7b04130acafb9e6a9b1c3 check_frame_112_36.py
f49c7b0cfa53b657f69b2ad97a6e18393741cc2151b32c9d7dde2e078b75953f check_frame_91_59.py
d2cb7475262ee711a4b06e53559f0927242be4a924a56e7fe212225f318f4193 chinese_vector_test.py
ecde3d3df773916f62de4e34f8d8693feaedf112a3ef9955e22417c8421722bd chunk_statistics.py
2588ecf27c13020d894e46ba70a76de89f09556b475f555dae59db36da0b90a0 clean_sentence_text.py
98ab1129032f42fddc020f9b3492d1fc133851d1af33ddeb57e2385d88425af4 clip_logo_integration.py
bf6f74c09b8f8c7f25c5fffb9c36f16a8afb483a7b65903cfc75e2ea641bdf49 compare_asr_content.py
1f2caadcded724aa04a929018a35ace53dd79d172f5ee2720308fbd4581b0c6c compare_asr_models.py
1ed8a9530f40e304b556ff76c7cac40468c86a0cd32ff2a8bc7bf2a69669121d compare_models_gun_test.py
6bf790fe75a7a2a5220052ca14c31e90a97eabc4558cd5e9059280913862a81e compare_search.py
875e7a598982c8ad7222a51b7b147e91cd5e1a930f41214b3942107cb932fc5c compare_segmentation.py
e432b6f2364d5a9aaf207a1de0dca3fb14ab8d118c53ee34306abfe6fd211ba8 comprehensive_search_test.py
43df85cf860ac28e083de35b511bb2a7b91ed48f596757f52f19487768987500 coreml_embed_server.py
9149ccc8de5adfec69c6f3f2ec502ae7d5e7844518a228ba587af2e08cb38805 crop_opencv_stamp.py
fc36ecbb1455d959456945266e193b601a29c4210b4938a3f0d4a9aaf44b5cee crop_real_stamps.py
34a694624ce94d916b06a847bc4d41e7665985b85e55a626a4bc3a4370c21acf crop_stamp_112_36.py
27099dc9c8ee52a6949ce18c505089afef1720fe70858b90d0801972c3b43fff crop_stamp_closeup.py
01b5a3b091ebcffc0c1e2637b7af8192ba597239fa80d152738e3b8cfdf8174d crop_stamp.py
71b2a362b5395c6e4d70e62766820db92d94eaf140d98eecb2880bcd98d55be9 crop_top_candidates.py
60f18c5fa03ffbc80c209337cd1c8b6acd0b8471e600119340aa8cdfeef14f5b cut_benchmark_runner.py
deba86a1645ca5b1acf413dd9edfad77b93ff213897d739a32de1ba629bfce52 cut_processor_contract_v1.py
01024f947f0326c124293a30e4f2cdb859f21cfb2d4c07f9c1030e2934f7bc44 cut_processor.py
ff092ad2373b57321f87d1dd123fff8a99c8207057591e8526e56cb1424d47c6 dashboard.py
f184bf3e546db0253ffb71895e8d42aeb06588c71c4914c2fe656f42ef463c9a debug_face_registration.py
a9acce1ebd6ea821a8dc5009b8fc40586a98d31c23e93c97fd844bdadbda4ed2 deep_analysis_112_36.py
7767ee7455a956d14d286ad558c4c312c2ad3ccee1c73adc1bc8f761c96ad72a demo_dashboard.py
425290c12161c5cfcb0c505a737ba3951656b39e425e792919d4812e15b9b8e3 demo_face_learning.py
d7e3e27e6a65b1fa62530ee954c227dbb4f97593c5a5dcc48b39e5ebae4656e5 dense_scan_traces.py
df79b7fc7a03a8e754de5123a23bb33b1d5c23d832adc1886fb846ca517dd24d detect_language.py
f6f8047e24ebbec81ef27dd38f4242e63385f8ebe5be471cae156b8aa5fc4477 detect_objects_keyframes.py
e61d2ef5043bda3674a0050d83ba3bc6a70c47f54e456124a736b4328f0c0638 detect_stamp_shapes.py
f23a382113e9c7de2ec3b24e95160daef48f9336ae6d4ec9ee7a18f4bf529f6d download_places365_classes.py
a747e5e17960b972549714786bb9e28ea578e10e6c80788e298a0149c970bcc5 embed_faces.py
f1a2b3820e1a763eba6d8d905a5bb87f5a9b4a2f005e709e313bb7505ba7ddaa embeddinggemma_server.py
43c540c02c1be992e7d44ab4fc76a759815db3ed5f25bcbb594328b50ed7c73b export_file_package.py
19d23e4604d5532928412afe4d5d39ff49194ab4a046825286ae1be154326a1f export_file.py
5f10bab1dcb0b5fad233a74069f9e2f89043e7c848c9c38ae7e2806e6940c75d export_identities.py
2a1d0a1b853fd2c28f9a404871d33912f93521358576833be0999271bae02bcb export_person_thumbnails.py
a81bf1d6af78c052e638f5d5677b4edb512d0de5441025d86fd970d3e7993922 export_sqlite.py
2fe8c0131dde21382cae1483825d489fd467c2491a0cb91d5c1881df2e402e9f extract_face_embedding.py
8b5cc0ff437fb4dd0df28b7b20a78469cdca3621e2eeb4b6d46ad2391acb0596 extract_female_faces.py
bdecbaf0496bf536dce2ef4897f7090749820d15dcca03492d4d736ab0f8c6c5 face_benchmark_runner.py
22319a38bd684fb235fec681ddc60f45821e4bb2181f2b31fdf945f7ad9a1b85 face_clustering_processor.py
5adce4e444743331fa592e13d71e52f26554eadb9744d350a7654a449a8fb8a3 face_count_comparison.py
3574454c74eaf11021f9052f77d93044cca4ae0285d0f2630b4016c2ec0df783 face_cross_validate.py
4f09b3b66b14a5eefb14fcf915a1ad1e9147010f6ae7671731566679b1cae461 face_embedding_extractor.py
d05c65221cbe787e4e29a4de1966edb9e89fed47e9e89c9d065e1d5cb46cf178 face_landmark_qc.py
28776dfcc6ac40e9481c25467438745fed60fecdfd4fc19f9f4c7396397591a7 face_mediapipe_test.py
f4d1b4334a49357b74b80e390ad5a3d16263e51cbe5cab661af92bd2e9721f02 face_processor_contract_v1.py
802015c73dfce0866f2a0bc94c645aa35ba30a6de78244af23090bb1f1828c6e face_processor_mps.py
96ffdbde3f4d87e9942f9e1f4c93cbd999dc404b43e00d4cdcbb22de3c0f16b7 face_processor_optimized.py
4c3915a7465f524e706940c9813614ec4920cd6f8647602ef32e88fdbbaf8fc0 face_processor_v1.py
d6ddad29a5e53b43b887554072d7965f0535e47fb62dad1a8b87e44fa1be6015 face_processor.py
8edab61189ad1a8fa60c203077e814e82d46c5bae67054fa2ab1958e199c05f9 face_recognition_processor.py
9ea19f357b3fcec6c8b3875c538e53cb46e407ab188cd544963e0123e535fa03 face_registration.py
72648816de611fd9b84d2b98c177b8b4f24374024b69184e8151c06cf44d633b face_statistics_report.py
499f197a06f50839ebd5350af380fa56506ce08f073ba40c0e863b8e02b34133 fast_face_clustering_processor.py
0191781635b98d0675969fb87733af19525d7b5c148723346c5378c08a00fe33 fast_stamp_search.py
00e7e8ed06f6a0f2c46c84a47d7e7f5d366acee941d546a52c4b1b7885c71e08 filter_stamp_colors.py
5341fd648cffafc77568070313b06417636943d50ff3b4380a61381260acaafa final_face_validation.py
213793ab719f4ef42ec9b22f351dd86d4739211c17be486a46b76ba7e64fd8f1 find_blue_stamp_opencv.py
e1490317c0f56b895f73cfbb6f57c8e3ea5c65304bfdd7663f103f6b564e148c find_kids_pose.py
08d4cba0650f6a22fc134d07fd15fe8784c8472c3ba687b587e31e0b980e2b1c find_kids_refined.py
aecec0784ce5d0e98176c15798f05d4f67ab6a686f9ffafba71fbd82157027f8 find_magnifying_glass.py
620db08dd84f00af0c6d744dac54c68360548dd5b2cc26b12ddcefd936239b2e find_pink_stamp.py
1f4555b3578f4dc6bc08aa37e34eda1d91ea25d8134439771678d1a57bfdaeb9 find_realistic_stamp_opencv.py
277aa3b48eec2e739de3bb95ef501ffbd24104aa2a1bdef28c844ef44fd75013 find_small_stamp_opencv.py
fc73bbc9605938db495bd33ea74955e454e9384130531a16d42f25dbd9b515d8 find_stamp_in_hands.py
c6ed0f12e78c12df977ddca5d699f58edb174b47199f584e7a24dbdc3b7d02b1 find_stamp_in_magnifier_scene.py
ecf12e346619c27a985452e9f84ee262c2da25de9df0ff6e0b293279ccba559b find_stamp_opencv.py
4ff93cbcc781a5cff023f78006f1aebbe2d954405ae7d00a473fef6b41b2ebee fix_asr_text.py
4090cb892115843a909aa41426c0f39c5a53d8d88a5db69499ec8bafcb780d77 florence2_scan_stamps.py
e90e4447db3328b64a2062ca13ed41f6a045220d8fb640542dff5b790d3c4d3b gdino_comparison_test.py
7071a9999057c347e2275381f1f0c58e19aa8581d70a572d3170ed14a295a48d gdino_frame_api.py
891410310b415ff68a0f7ee0aa39e84eef7f2c75887487bdb88b8f4718d40e94 generate_asr1.py
24efe7db016387b40bd9caae449f0445a3d47eb878c00399803bb6e78e6dd5fc generate_benchmark_summary.py
dc956a78a3ed26686f45dd6d6d9cb42c023751fcd9b8789585450b6df63670a1 generate_chunk_summaries.py
8a0922d75fdc7c5994ebfb31881d765db4b105cbcddfcaa4b4c49d11950b8df4 generate_chunk_visual_stats.py
4860bfd00cc6c1c842c2f8e17e725eebca191d81067af3cb5a28661b45d74bd3 generate_parent_chunks_gemma4.py
e9fca223a8329ff6bdcb8552fecedb2d8b4607c6516c373c3023f29edfd42e06 generate_sentence_summaries.py
cbae7c3e85457274e8c284005196c39dc97f9d9200ed6b0e4ea266e48a381d3a generate_synonyms_llamacpp.py
57512cd7a5ec2f52813717fd3d81dec1aaa69dc9c91a9edbca847e7012b1c86f generate_synonyms_ollama.py
dc495cb8127858fa03a5f8b8bb4a772c5934ada1abecf97459bf71de80417672 gun_detector_scan.py
1a7cfb72723b3b94e3f4fe368477ba693ac3d20ac7af7351962bc548c700b451 head_shoulder_bench.py
b2fe8e4d8d7d1057ba928fc5e190f4a06cb60e83e2a02c5d7c423791596c11b8 head_shoulder_quick.py
ba5e67a97cb465e6a1a942c2f7342406031759ffcea2b897ae963bee4bc551c4 hybrid_stamp_search.py
f5847b6c8ed4c7c51290df9032d5a192317b5f03b5ff418ead1181a6e1b655f2 identity_agent.py
12237fa6cc5f0d2dcdd05f26fd50c0a7bfd541d1c922a1640d131fa0c4d6f4fc identity_bind.py
046aa90eb4a4b830910912362a9865d1e6170f5bc176fae42be630f967f9d3ff import_file_package.py
7cc260d4411ab13559803686f8b645afa07738d652d9459830aecac268597fa7 import_file.py
071e3a5141d04cb9e6bd31489a835c778608785896b18ea7fa65e8db9f1547e5 insert_chunks.py
d3d53f44daa7f1526488677b141e90fbf4aa5625369b96a3ca275b802414802f integrate_face_asrx.py
4cb6a93ef8006cb69e8bdb1bc72899ee9bab1bf7eceaafe9896923bb7023bbd5 integrate_rule3_markers.py
75aa3e4bffc9f9cb8b9254db19095c93c3efb43d465fb5dcca8c7b9b730f5c59 integrated_body_action_decoder.py
f4dd2e21fb6b668bdf0c51cc56e214188b46937b96a2b4a10d13783e171d0472 language_router.py
bef426641645fcf7dcc68c87e3325a6edf3f70925febaf1df84f7c6ff87681e5 lip_analyzer.py
7f98b0cc8379b3759cc7e805dd56f736cc518093e83f43b2e5ecf559a19b95f0 lip_processor_cv.py
a1473eeba17fce25e4678234fe4e8793a132514e0566b03b36a0bec04eb93acb lip_processor_media.py
0df61396756ee22d35356776c189b354458661916c8baf85bcef97c9f8b62ec8 lip_processor_mp.py
3202aeca29e651ef1a54f47681c6b3b2d0680555fe3c6d318a932bb12b49e58c lip_processor_simple.py
fed15bafb5e09715cc03962f465b2ff618bf05ebeafdf932643690c9635c9840 lip_processor.py
b9532949bd145c0411876bdf3a8cbf1540b4233f7585465ce6389928e1bfd908 llm_metadata_enhancer.py
1773054e8d563b493865880d0d8bda105e3eb6fb536a25817517237b3bb76afe magnifying_glass_analyze.py
7d4d048c452bf273f4a6d96da13eb7bab6aa60ca9dd51de5ca0fb0a01e587b13 magnifying_glass_extract.py
8528bbf89d2770fa5a23f461274038898be251fb6e48c5d3adece5aab3bf976d magnifying_glass_owl.py
cb645f5e29ee5a36b2f97812039abfdaed7328386bcd25ad7b742af6a6b16399 map_speakers_v2.py
a90bd3fb729a05010c29a213134c60cc0bdd17769e27a7d3f1250919b7bf1613 match_face_identity.py
2d864dc831c2fd0142b19b8ad2cda169c2a05facd9662d31861d29bb710c4979 match_face_with_pose_filtering.py
889d4853707896885ed96ab945d4266acb213f4b122e2ba7c4563eb0e3e9e865 match_identities_to_tmdb.py
b34ec373bcf65139e08e41967f58a2fc8ebb67a59c361074d3590cd16541415a match_speakers_to_chunks.py
fe6260a94d01d8b43d0d3b59eb820cfd7b4711c907343a1261c69f9010ae990d mediapipe_holistic_processor.py
bb36844b4d13bba8edc1b7f0703f02081b62bea795535b8cd8dcbfdb4281f402 migrate_asr_to_children.py
819312cbfce6e68a0d8d731e02d283946f79de6044f207991ddf9a28ac853d79 migrate_face_results.py
c3d062aab67b5177ac7bf2c3ad2f0e578e12c9893e377f68339a17cc2783316c migrate_identity_files.py
c418f6e50054fa7eae1d0d879e28997b98f57437acec48b53ecb09f332728867 migrate_to_4188.py
6f60aa899e06f05e575cb5b461ea517481119cc32644566245d74c96eccde722 multi_stage_stamp_search.py
b24e2289c00f803c8339f59c34d44ed6c53a3c19dafc13e72c4b260d6bb312a6 music_segmentation_processor.py
da2546f84d0dbd711c8800ae4e32e59d9c38de9e62e1b423c4518fa1fda1dbea natural_language_top10.py
78c3d1a9302dbfacdf9b3655dab07348957fd9dbb4af94aae83eefecd5343a33 natural_language_vector_detailed.py
e924f04d68c9a8211ad373da811aa6671d2c5654281c1634dbf8b1e5e5b51533 natural_language_vector_test.py
df6ac92367b1afb50c0af958e362d87555fe569f608a8d213e0a593e2a43cde8 object_search_agent.py
fd39b779a0337f521940f3f7b159931f1f207f200eefd610183781fdcf3dfafd object_search.py
42d2952fc78b57302b0d12bc3d45790a2c2c46d4ffa3c713a82686134bd63f13 ocr_benchmark_runner.py
7b3ccb5c4ddd4c62c5ad04d0e3aafaecc2c1441012b6a98613cdcf055e2e50e8 ocr_processor_contract_v1.py
271023eec42d6be4a1ce6ae2ce3f29e825210a57e6bb37554a6f7fdf54616f9a ocr_processor_mps.py
2e73c41285e52ef013594fcd4d20df9f5781bfc26bcf62e54dd2c04ec44200c3 ocr_processor.py
62196108cb3337b5f9a873d70d2981ac8f49152369afbcc8a12b3a13de579e80 opencv_stamp_search.py
b2e8d552c272fd173c77693e9453a85fe16dfc12f7c2cd304d299c6188c14077 paligemma_vs_gdino.py
1534d5b7617dbae77f7a37a2c33a89b90f965247a6828f00b73ea6b720f6f4fc parent_chunk_5w1h.py
5208c738d4b615282813d351daf09872ce516121bb604caa64968ef5e52c53d3 pipeline_checklist.py
8f80c3a2be5c330e2d1853d9250a171c75db84598dbf3304280c42237ed4fb1f pipeline_status.py
94db44c0f49115a677d117d4901a1b7991c1517905300eaa495dd62b8ac1c79c pose_processor_contract_v1.py
167dee5e42c6bd46674bcffcfd92f368fc0b48a1f42c459c806853b281bc6482 pose_processor_mps.py
a6ef3a785ef5c6dc47fa38dbed80d76bc7d4bf48cbaf0f7edb3d26df98d7262c pose_processor.py
45e6798dc5900f2f7c8776a2d260c122aae5068a075256b8a5c02e8d0be6c131 probe_file.py
01c7b3c30c1531224f9605f0ee633285fe8489ab2d0a3c9c6a41f2b2b60d6626 quick_stamp_search.py
e3143673a2bff6139e05c82446fd8770c4b7e59a854a42c3b29662f5ac75efe2 rebuild_parents.py
4aa98981632d4f8a11039c510e86aa296ae1cd4b399fc871ed664ac11e445bd9 rebuild_story_content.py
090137a5872edfed1b89c97b537d13ad8aafda9a705ebb4c54f30352503e5e3a redis_publisher.py
750f778946b56bc57c47d9d2295332bb0f8cec2c1aa03c6b882d39ef4432673d refine_search.py
0f8a6a6866a5797e964d3b17e2b7ef146fe7a798f09fcea982fcda6f629b4d06 regenerate_parent_5w1h.py
3ee192b623f290136b36bd63abd018aad6e6639a9543970c3415734628b33bd6 register_sample_faces.py
334782f0f66d0ad3818a51adf6343186a2de65467378ab68a81ade806e496af9 release_manager.py
9a44cdd155953778b52ac0cfb118504c56eb6b1141984365ffbb717e28f3e65b release_pack.py
3906b48f3a7764d19605def2bf8ef84a54a6afe64c9291a7cc0881a91472a826 render_face_heatmap.py
44e432c31a35211a37dd26695772b7e250487ac42ba4f16a56f843277c2fabbf render_offline_report.py
3fac1e6a4125042185a2ce82771f695c562b3137c7aa58a912bada00ad8ecf78 rescan_single_frame_traces.py
9c3212cb455c2a6230be918448560fee00c153a8956ffd04fcb62974d5e1abff resume_framework.py
7c95ec08daf4f980bd53233503b7a4fa01afc08660e8fe8cd031ea3613ead8f7 save_events_to_db.py
24795e1531fe05e33d515104e4fb2f9567b46d802ef1b5a38f11268cf105be76 scan_charade_stamps.py
cad2da5073577f851c5cb2abdbd7cab05b39caa0d1179ccc89c378a7df2736c8 scan_full_video_stamps.py
03ae71470331fe5b7f8e394f7f789eee08cad4ed5ec9196b46ab2c9dbefa7fec scan_handheld_objects.py
d3935ba498786cf260d9d5370ca60d3af7bc4fd438f6be33ce23cfd0b7bab593 scan_keyframes_opencv.py
12c9b35212f587f5adb37584bf3c3844804d2bc642ebfc5d82b86b44f46d2472 scan_keyframes.py
f386130ac203308c904ba7efea09ce0ca0d640d36762b113bf0cfedc24d7f885 scene_classifier.py
482edae04e5467a68c77729760db53d3653e8d7654fa49e5ec9a36f1f8f22616 search_blue_stamp.py
e3786422932138272d1096ad4c800594e62c9640952a286a9158372a1e5443e3 search_envelope.py
2df1e259c2e52d10d79b20856cb94ffff5a9bfdbe47cee587b1148b2f1c16101 search_objects_in_hands.py
9fd49be8ab16f94fd82efc5ae035c029372a7ddeb7fd779b557f1917cdc14592 search_vase.py
7a6d8e7c435368f6218db972c04a7be16d7d6680d8d4374f82c05b7162716b9d select_face_reference_vectors_v2.py
2bcf7c1b3c407b51a134a5ee4982713f0ea387cfd6df01ed75554c94603971a6 select_face_reference_vectors_v3.py
d52098fcf1f9f7ba14f31a9a90bc5b3bc933e1a5e5697e3d09eff389c153cb18 select_face_reference_vectors.py
a02cb37639275d86ae0b4504d21f50963b45aaf94630c59472ba30d07722e50c simple_api_test.py
02516ab1616c1756c4f8041f48ff12811cc5d672c53b34850b84ce682fefdff1 simple_face_stats.py
b024d9bfe244d0d058daae0acd314b9344d6f0912e4f3b02dbc618f9fe3e4949 simple_test.py
af8703506769f3cdb89ff7849b071c2421307717850596dd86d2fe0b053e7809 smart_stamp_v2.py
5e5f86d47ea2b75bcaa8662689f73af1963645149c0da688dc43482616aa4e76 sound_event_detector.py
bab7697e4b4b05e93babc116e0c5b13cbaf1f4d419a65acd5dc1de5bdfc510dc speaker_assign.py
381ff240ce806ead7d6463ee40c5b830035eb6252180b4b0901b3c8313fa4bbd speaker_bind_lip.py
5eede29fa0966974c1943792d7fcca2dd9179d4f23570cf1a3964dc97bc9ac1e specific_stamp_search.py
d5363d832272bdb3c1d6f6d93eee7b7894893b9164a3f5ad5fa08a4a0eaeeb47 split_asr_segments.py
8e1269f173f2c72de78857c2d83d3111b62ec89bd79f4fb00c3f57390986ae4f step3_asr_fine.py
7592df8be5dc58376b33960bfa7fc0003c51114b70ebc01f1589f39ee9568d3b store_traced_faces.py
7ac32c1e2146a19e6654ab3e4bbbfd42e1a6540fb8717d40d55c61e9f5d1bf71 story_embed.py
74cc24b328a075f48b1f44a465611157f44eadc8f5dabf6d95cd5cc5f80dd9dc story_pipeline_full.py
97628f0f1270825dabafdf0a69f10ef12c4ffe2be4ac12941315f06bfb084e7c story_processor_contract_v1.py
1b1f42fc4bbff26551f26f4ac1e8a995dfe3ff98b940a29c9e130410965d0fa0 story_processor.py
cdbc7ef88551e2b3a3771eac5be5e0360989e71fa009ac28c97e548507e08a5e sync_face_speaker_to_chunks.py
8b08e9a33f5917aad10e070d6aa48805f5e7c23f905ba8fff3b8697b2109d962 sync_to_mongodb.py
869b6c56fe16cbf8973826782a17503f02b5cd757ec025b944da693d38bdb4cb sync_users_from_sftpgo.py
f64cc6dcb72f54d3e97aa981b40591aef4804ca769e1f14628d901b98bc6aeac terminology_manager.py
455546b9bb3a2c2c877c7720229b254e75b28eea33b3715d1731c02ca85294ae test_api_correct_usage.py
b03dc1bbb091672e7da2b131850b17badac896b4fbba92fe9bce76c232c99be4 test_api_with_key_id.py
7d295c77d5bcd4c72c5673370af48cc89bbccf9292c3b82aad3a230d242547a9 test_args.py
f474ec88e6634decbf178da497443fa709096b174bb4a4320a07256f516b1044 test_asr_large_model.py
aa952524dd86f346740ffe555075b74adf2e60bb822bb04a943a51b1fd262445 test_birth_uuid.py
db87badad7948527325a528400d67a4eeef76abf8d13f5c4254c812e944e4e0c test_end_to_end.py
e191c98a82f7e089f7dccfc4c536244da2bf14339f982a3afef05d33332c3755 test_face_api_final.py
1b97c9aae2e1744aa7aefb192eaef86c64e6134efc8f08ffa9a274bff16a58d3 test_face_api_with_correct_key.py
f7e4078f31b1ca8494c18878219cf2f90c301f19fc851b9e7084657b71a5e150 test_face_api.py
9eafc49f8fa42b4cd58109e9b725b3aec3b06943ec426919b1788838ccf1ed92 test_face_db_fix.py
38bce82b167e0c97b257cc6b955fdc2e9ded581ce2d39eb0fd2c60249275394b test_face_direct.py
24e82bf0af82407e6c04361e9a671770cbfb0b05d92df589bd0d5a0118bb5a98 test_face_learning.py
8dcdb144c4253fbb466f220359b42c2a9579193865e320a56e682e384c2ae176 test_face_recognition_integration.py
b921e3256fdea176d4391116d1ead472c4f3ca8aac6999140367818818c35ec3 test_face_registration_api.py
9af6c6ff0c766b3de92185c3602f2b8b62b815bf88dcb0e3251c2676e61e0a48 test_face_tracker.py
4f70eadb6a8b80eb8febe32b17b77e58d1a4823cc5d598e5ea45555342d2d4cb test_florence2_direct.py
0588be0acea540950d737943073f71e769b6301374eaa4ff7fdb96a80145c4e0 test_florence2_pipeline.py
694c15193616157ddae4bdb0a45feada2a8f8490f01d290a28aa77a4b24eabb2 test_florence2_stamps.py
2c281f698616a83e9eeccd610555d9f9ab657b2deac65ae9e3dbfba0b450d9b0 test_identity_db.py
7a73e8314ea7e91ca9dad3867a83b9c1101fdab09bdc0fdac0f798d0a7a204f3 test_llm_capabilities.py
68300f87b96a474f06a3071a833e6b3ae48d1db5fb8a7e5a3ec1834fd878d808 test_multilingual.py
c17cdd0f4ffb7a151a634add08d13cc576ba7a848bb20f54fb97d0c1d9d81cc0 test_object_search.py
d07bd363a2878259fbf4ffcba40e367f7f1bf4171b5a5dfdda97f7a53b450d0e test_ollama_feasibility.py
8421003b1f66cbd21c6fe5d3aff0a526897753e959b23905ca8f502f644f66a5 test_owl_vit_debug.py
6f9e8b7947229ea4aa0a62b59bda5fcec05bd74f6c00dc4a7b06d932bd1b730f test_owl_vit_stamps.py
da91a7c97466ce7f03cde13aa9bf6e691b3e482d2cac74519a2e1a61a2abb05a test_parent_chunk_generation.py
19d9f2492d3b04b7dafa008f106767d3107dd36b0c8e4601765dca30131027cd test_places365_scene.py
de44553023067362e8b2223f03e1bff55fcbd2f11ddf3d01060dc02c4675a744 test_probe_file.py
c0e987ba06a61cc0426ffbca8af1eb51a97bd79acab59b70453cfbb18eaee093 test_processor_performance.py
7b4b55e23dff35ba107b3da5b0560d03b1b41dfdea1d3a59eac777b4be4d4033 test_pyannote_audio.py
5cb8b42033ffba41f25e7ef74ef04cf352c0c277a9971e9eaef53fd673902712 test_pyannote_multilingual.py
8580e689ae148754e03d958419e108241040a012584ba49e8a90db114a9f8c13 test_scene_api.py
1194d450070b1f42e045d98e532f41205bb3e52fc48ba26e7c9b72a188fe1b2c test_segment_count.py
147bfffeac9561cfa407207b04a825862ac623ba97deecf5ed7c6257432dc62c test_speechbrain.py
22e4b865bc769329c1146c2f914395044a9bc84cd2a13acf68fb374a57fe1e3e test_v2_detailed.py
a616570a2a080b5b19f4bf783877147e714a014103b274143dd37984a946ca08 test_v2_model.py
7b83611f6b3028500c91c62197f774c0769e299136eca8dc4b612a7b5743e3d6 test_v2_with_text.py
1dd983c78074a61ceec26d7e3623d40772ca55fd6ee63ba368afe756c66ae091 test_with_real_image.py
1b738cc0d69d33e967cbb775def0a7f58dc02f1911404af56a5825bd60a5b75b text_semantic_analysis.py
a4221417ae00add76881c6c715ee4257c263e2dfd0a846a8887738682dfe8cda thumbnail_extractor.py
0d188a738a0df79ead10065d9f17c366fe159c862bd4bafa2860d0e6ba2640c3 tkg_builder.py
a084d3b5840e920d552515febffa22b34943b9efa8b73adab9cd193372e71592 tmdb_agent.py
8b97f0fdfc0899460bf23d420dba0a51a34737c74ebad0519856909d198662bf tmdb_cast_fetcher.py
4858909a0beaf8397becf4103be17fcc350841217afcdc1d917c48c512a9041b tmdb_embed_extractor.py
54d8321dfe0f8caa669e4a9d1b48dc772a5b25817eab95b552944140c91f457d tmdb_identity_integration.py
2a84aa2dcfb83ac385d2c394f884926f306c81798e4277a26dbd1f3c5506be46 trace_face_aggregator.py
61d3b4b362722ce24326a204f1b72cc7b1dcc20cf3264a4f526d4ea343a8d33d transcribe.py
ede9a184fd51ef4c87eb3e2541f09b91739a49986cb588591a7c6fbb33433020 unified_synonym_processor.py
a408f294c3a71eb6a0eea80b9b586f73dedcefe286c62233f713a7428a9979be update_all_demographics.py
e6520bb10ae6835ceade487ceb5e3fa549ca6f06de35b2c785d649921ef443f4 update_fine_speakers.py
a2191daff2ad228725b6a66f0e472ec659a6b4fa8f2cbbd74d1bf9c35cca63eb update_person_demographics.py
1a7dddd1db467990ee1c685d61b971babfa30c3ae3a754b5df8f3b4c320f3ed1 update_qdrant_uuid.py
60060753cfd2a6d1241e55bf40a0c74f1df15739656d0349e22e8543036b2424 update_speaker_assignments.py
fdc61009c351263e0018801b32ad90ffd8919af611a2a0580546be7fd62c99c4 update_terminology.py
4840c11964a59eabad26b97fe01033ccaf7903e2d24edd5e1035f6dd5fc995ea vectorize_4188.py
078979114c5f248d2bfd43aa8df55235fa03ab812f26998b984cd485a3d2cda8 vectorize_chunk_summaries.py
ff98864f1b11795cc3bb64f30ccb6f8609771ddc7a5df2c003ba7c2233d16fc2 vectorize_chunks.py
5880c128400e6e36c8eb7dffd009dbbc99dd13f8575b0037bdc854e25ddc41fb video_comparison_statistics.py
0a1501ffdc027236cdf88706b3d61229e2998ab268fd57fb60e399ccb734b6a1 vision_agent.py
eac8f90fbbb655614abcefc4b887e346bf94db5f015d33d37bc9514fb030489d visual_chunk_processor.py
c165dfc5fc981dc731b25ef414184ee58e56b73b148d41a32fdce985c701efd5 visualize_stamp.py
6c65a82fdd1d585e20bee4fcb2d1bdec2e6220bda71d6ef9cd00d6a3cf74c4d7 voice_embedding_extractor.py
2b3a7b357db4ddd07ca30bf200c6600724e33441d8def0a4d9a39673e2cfb1c0 weather_sound_detector.py
206b61ebf3c91d7ce3f1488247b52aca6e955042d8aa979c59723e3ff10dd36a yolo_benchmark_runner.py
e8cb0963c90fbd1c2aa91141f80340edd3c9560d69780dd825d107c6ed14fa64 yolo_count_comparison.py
dad775ecdca0144bd14b7abaa7ec8fb213e8b9428e39906abce541e93db496b6 yolo_processor_contract_v1.py
74ff880e664ec514223a4f220b682fbc87089f8c0851c93ac68c97269b8a59b6 yolo_processor_mps.py
8af0a6db683b6626e07820b302135ac5960d38e3d4b3d187c640b23ce8a14f72 yolo_processor.py
e13cf22b9aeae96c7e28b4512dd2137743a25eb59027da446966c1aaaaf4ce71 zero_shot_combined_test.py
f4aaf017ff588999f06cd9ba1787517e06c6d6e6228a15a54d8aa4f54fde5eb3 zero_shot_gun_test.py
0a285b8ec33d7999e9d4ae8d43ce768c9f06ee1929e13a6809e98bdabe6357ce zero_shot_objects_test.py

136
scripts/embed_faces_only.py Normal file
View File

@@ -0,0 +1,136 @@
#!/opt/homebrew/bin/python3.11
"""Embed faces from existing detections JSON using CoreML FaceNet."""
import json, os, sys, time
import cv2
import numpy as np
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import coremltools as ct
FACENET_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "models", "facenet512.mlpackage")
def classify_pose(roll: float, yaw: float) -> str:
    """Bucket a head pose into a coarse orientation label.

    Args:
        roll: Head roll angle; only its magnitude is used.
        yaw: Head yaw angle; its sign selects the profile side.

    Returns:
        "profile_right" / "profile_left" when |yaw| exceeds 30 (positive yaw
        maps to right), "frontal" when both |yaw| and |roll| are below 15,
        and "three_quarter" for everything in between.
    """
    # NOTE(review): thresholds look like degrees — confirm the detector's unit.
    yaw_mag = abs(yaw)
    if yaw_mag > 30:
        return "profile_left" if yaw <= 0 else "profile_right"
    if yaw_mag < 15 and abs(roll) < 15:
        return "frontal"
    return "three_quarter"
def extract_embedding(coreml_model, face_img):
    """Compute an embedding for one cropped face image.

    Args:
        coreml_model: Object exposing ``predict(dict) -> dict``; it is called
            with a single ``"input"`` entry.
        face_img: Image array accepted by ``cv2.resize`` with channels on the
            last axis (the transpose below moves axis 2 to the front).

    Returns:
        The first prediction output whose key starts with ``"var_"``,
        flattened into a plain Python list.

    Raises:
        IndexError: If the prediction has no key starting with ``"var_"``.
    """
    # NOTE(review): no channel reordering (e.g. BGR -> RGB) happens here —
    # confirm the channel order the model was exported with.
    pixels = cv2.resize(face_img, (160, 160)).astype(np.float32)
    scaled = pixels / 127.5 - 1.0  # maps 0..255 onto -1..1 for 8-bit input
    # Channels-first plus a leading batch axis: (1, C, 160, 160).
    batch = np.transpose(scaled, (2, 0, 1))[np.newaxis, ...]
    prediction = coreml_model.predict({"input": batch})
    var_keys = [name for name in prediction if name.startswith("var_")]
    return prediction[var_keys[0]].flatten().tolist()
def main():
    """Embed the faces listed in a detections JSON and write them as JSON.

    Usage: ``<script> detections_json output_json --video VIDEO``

    For every face in the detections file the matching video frame is
    decoded, the bounding box (``face["bbox"]`` if present, otherwise the
    face dict itself) is clamped to the frame, and the crop is embedded with
    ``extract_embedding``.  The output holds a ``metadata`` dict and a
    ``frames`` dict mapping ``str(frame_number)`` to the embedded faces.

    A face is counted but skipped when either side of its box is 10 or
    less, when its frame cannot be read, or when the clamped box is empty.

    Exits with status 1 if the video cannot be opened, so an unreadable
    video is never reported as a "completed" run with zero embeddings.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Embed faces only")
    parser.add_argument("detections_json")
    parser.add_argument("output_json")
    parser.add_argument("--video", required=True)
    args = parser.parse_args()

    print(f"[EMBED] Loading detections: {args.detections_json}")
    with open(args.detections_json) as f:
        detection_data = json.load(f)

    print(f"[EMBED] Loading CoreML FaceNet: {FACENET_PATH}")
    coreml_model = ct.models.MLModel(FACENET_PATH)

    print(f"[EMBED] Opening video: {args.video}")
    video = cv2.VideoCapture(args.video)
    if not video.isOpened():
        # Without this guard every read() fails silently and an empty
        # result would still be written with status "completed".
        print(f"[EMBED] Cannot open video: {args.video}", file=sys.stderr)
        sys.exit(1)

    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

        face_data = {
            "metadata": {
                "video_path": os.path.abspath(args.video),
                "fps": fps, "width": width, "height": height,
                "sample_interval": detection_data.get("sample_interval", 3),
                "detection_method": "apple_vision",
                "embedding_method": "coreml_facenet",
                "total_frames": total_frames,
            },
            "frames": {}
        }

        frames = detection_data.get("frames", [])
        n_frames = len(frames)
        t0 = time.time()
        embed_count, total_face_count = 0, 0
        # Emit roughly 20 progress lines over the whole run.
        report_every = max(1, n_frames // 20)

        for idx, frame_info in enumerate(frames):
            frame_num = frame_info["frame"]
            faces = []
            # Decode the frame lazily and at most once, instead of seeking
            # and decoding again for every face that shares it.
            frame = None
            frame_requested = False

            for face in frame_info.get("faces", []):
                total_face_count += 1
                bb = face.get("bbox", face)
                x, y, w, h = bb["x"], bb["y"], bb["width"], bb["height"]
                if w <= 10 or h <= 10:
                    continue

                if not frame_requested:
                    frame_requested = True
                    video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
                    ret, decoded = video.read()
                    frame = decoded if ret else None
                if frame is None:
                    continue

                # Clamp the box to the frame so the slice never wraps or
                # runs past the image edge.
                x1, y1 = max(0, x), max(0, y)
                x2, y2 = min(width, x + w), min(height, y + h)
                if x2 <= x1 or y2 <= y1:
                    continue
                face_img = frame[y1:y2, x1:x2]
                if face_img.size == 0:
                    continue

                emb = extract_embedding(coreml_model, face_img)
                if emb is not None:
                    embed_count += 1
                    pose_info = face.get("pose", {})
                    pose_angle = classify_pose(
                        pose_info.get("roll", 0),
                        pose_info.get("yaw", 0)
                    )
                    faces.append({
                        "x": x, "y": y, "width": w, "height": h,
                        "confidence": face.get("confidence", 0.5),
                        "embedding": emb,
                        "pose_angle": {
                            "angle": pose_angle,
                            "roll": pose_info.get("roll", 0),
                            "yaw": pose_info.get("yaw", 0),
                            "pitch": pose_info.get("pitch", 0),
                        },
                        "landmarks": face.get("landmarks", []),
                    })

            face_data["frames"][str(frame_num)] = faces

            if (idx + 1) % report_every == 0:
                pct = (idx + 1) / n_frames * 100
                elapsed = time.time() - t0
                eta = (elapsed / (idx + 1)) * (n_frames - idx - 1) if idx > 0 else 0
                print(f"[EMBED] {pct:.0f}% | {idx+1}/{n_frames} frames | "
                      f"{embed_count} embeddings | {elapsed:.0f}s elapsed | "
                      f"{eta:.0f}s ETA", flush=True)
    finally:
        # Release the capture even if embedding or decoding raised.
        video.release()

    face_data["metadata"]["status"] = "completed"

    print(f"[EMBED] Writing output: {args.output_json}")
    with open(args.output_json, "w") as f:
        json.dump(face_data, f, indent=2)

    elapsed = time.time() - t0
    print(f"[EMBED] Done: {n_frames} frames, {embed_count}/{total_face_count} embeddings, {elapsed:.0f}s")
# Run the embedding CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,121 @@
#!/opt/homebrew/bin/python3.11
"""
Extract embeddings from existing face.json using CoreML FaceNet.
Usage: python3 scripts/extract_embeddings_from_face_json.py <face_json_path> <video_path> <output_path>
"""
import sys
import os
import json
import cv2
import numpy as np
import coremltools as ct
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
FACENET_PATH = os.path.join(SCRIPT_DIR, "..", "models", "facenet512.mlpackage")
def extract_embeddings(face_json_path: str, video_path: str, output_path: str):
    """Attach a FaceNet embedding to every face listed in a face.json file.

    Args:
        face_json_path: JSON file with a ``frames`` list; each entry carries
            ``frame`` (frame number) and ``faces`` (dicts with ``x``, ``y``,
            ``width``, ``height``).
        video_path: Video the frame numbers refer to.
        output_path: Where the updated JSON document is written.

    Returns:
        None.  Returns early, writing nothing, when the JSON has no frames,
        the model file is missing, or the video cannot be opened.

    Each face box is clamped to the frame before cropping; a box that lies
    entirely outside the frame is skipped.  Embedding failures are logged
    per face and do not abort the run.  ``metadata.total_embeddings`` in the
    output records how many faces received an embedding.
    """
    with open(face_json_path, 'r') as f:
        face_data = json.load(f)

    frames = face_data.get('frames', [])
    if not frames:
        print("No frames in face.json")
        return

    # Load CoreML FaceNet
    facenet = os.path.normpath(FACENET_PATH)
    if not os.path.exists(facenet):
        print(f"FaceNet model not found: {facenet}")
        return

    coreml_model = ct.models.MLModel(facenet)
    print(f"[EMB] CoreML FaceNet loaded: {facenet}")

    # Open video
    video = cv2.VideoCapture(video_path)
    if not video.isOpened():
        print(f"Cannot open video: {video_path}")
        return

    embed_count = 0
    processed_frames = 0
    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"[EMB] Video: {fps} fps, {total_frames} frames")

        for frame_entry in frames:
            frame_num = frame_entry.get('frame', 0)
            faces = frame_entry.get('faces', [])

            # Seek to frame
            video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, img = video.read()
            if not ret:
                continue

            processed_frames += 1
            img_h, img_w = img.shape[:2]

            for face in faces:
                x = face.get('x', 0)
                y = face.get('y', 0)
                w = face.get('width', 0)
                h = face.get('height', 0)

                if w == 0 or h == 0:
                    continue

                # Clamp to the image: a negative start index would otherwise
                # wrap around in NumPy slicing and crop the wrong region.
                x1, y1 = max(0, x), max(0, y)
                x2, y2 = min(img_w, x + w), min(img_h, y + h)
                if x2 <= x1 or y2 <= y1:
                    continue

                crop = img[y1:y2, x1:x2]
                if crop.size == 0:
                    continue

                # Resize to 160x160 (FaceNet input size), then BGR -> RGB
                crop_resized = cv2.resize(crop, (160, 160))
                crop_rgb = cv2.cvtColor(crop_resized, cv2.COLOR_BGR2RGB)

                # NOTE(review): the array is passed under the 'image' input
                # name and the result is read from 'output'/'embeddings' —
                # confirm both against the exported model's spec.
                try:
                    input_dict = {'image': crop_rgb}
                    output = coreml_model.predict(input_dict)
                    emb = output.get('output', output.get('embeddings', None))
                    if emb is not None:
                        if isinstance(emb, np.ndarray):
                            emb = emb.flatten().tolist()
                        elif isinstance(emb, dict):
                            emb = list(emb.values())[0]
                            if isinstance(emb, np.ndarray):
                                emb = emb.flatten().tolist()
                        face['embedding'] = emb
                        embed_count += 1
                except Exception as e:
                    # Best effort: one bad face must not abort the whole run.
                    print(f"[EMB] Frame {frame_num} embedding failed: {e}")

            if processed_frames % 1000 == 0:
                print(f"[EMB] Processed {processed_frames} frames, {embed_count} embeddings")
    finally:
        # Release the capture even if decoding or cropping raised.
        video.release()

    # Save updated face.json; tolerate input that has no 'metadata' section.
    face_data.setdefault('metadata', {})['total_embeddings'] = embed_count
    with open(output_path, 'w') as f:
        json.dump(face_data, f)

    print(f"[EMB] Done: {processed_frames} frames, {embed_count} embeddings")
# Script entry point: expects <face_json> <video> <output>; extra arguments
# are ignored.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if len(cli_args) < 3:
        print("Usage: python3 extract_embeddings_from_face_json.py <face_json> <video> <output>")
        sys.exit(1)
    face_json_arg, video_arg, output_arg = cli_args[:3]
    extract_embeddings(face_json_arg, video_arg, output_arg)

View File

@@ -0,0 +1,397 @@
#!/usr/bin/env python3
"""
extract_face_crops.py - 批量提取 face crops
Usage:
python3 scripts/extract_face_crops.py --uuid <file_uuid>
python3 scripts/extract_face_crops.py --uuid <file_uuid> --video <video_path>
儲存位置: {OUTPUT_DIR}/.faces/{file_uuid}/{trace_id}/{frame}.jpg
條件:
- trace_id != None and trace_id != 0
- landmarks.left_eye or landmarks.right_eye
品檢:
- file_size > 500 bytes
- mean_brightness > 5
- std_deviation > 10
Retry: 最多 3 次
"""
import argparse
import json
import subprocess
import os
import sys
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Set
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
# Constants
MAX_RETRIES = 3  # ffmpeg attempts per crop before the crop is reported as failed
MIN_FILE_SIZE = 500  # bytes; smaller output files are rejected by check_quality
MIN_BRIGHTNESS = 5  # minimum mean pixel value; below this the crop is a "black_frame"
MIN_STD_DEV = 10  # minimum pixel standard deviation; below this it is "low_contrast"
FFMPEG_TIMEOUT = 30  # timeout (seconds) passed to subprocess.run for each ffmpeg call
MAX_WORKERS = 8 # Parallel threads for ffmpeg
class FaceCropExtractor:
    """Extract per-face JPEG crops from a video with ffmpeg.

    For a file UUID the extractor reads ``{uuid}.face.json`` (and
    ``{uuid}.face_traced.json`` when it exists) from ``output_dir`` and writes
    crops to ``{output_dir}/.faces/{uuid}/{trace_id}/{frame}.jpg``. Faces
    without a trace id are written under the ``unbound`` sub-directory.
    """

    # Detection gates applied before a crop is scheduled.
    MIN_CONFIDENCE = 0.6
    MIN_SIZE = 20
    # Frame rate assumed when face.json carries no usable ``metadata.fps``.
    DEFAULT_FPS = 24.0

    def __init__(self, output_dir: str):
        """Create ``{output_dir}/.faces`` if needed and reset the counters."""
        self.output_dir = Path(output_dir)
        self.faces_dir = self.output_dir / ".faces"
        self.faces_dir.mkdir(parents=True, exist_ok=True)
        self.stats = {"total_faces": 0, "qualified": 0, "successful": 0, "failed": 0, "skipped": 0, "low_confidence": 0, "too_small": 0}
        # Kept as a public attribute; the counters themselves are only
        # updated on the calling thread, never from the ffmpeg workers.
        self.stats_lock = threading.Lock()

    @staticmethod
    def _resolve_fps(face_data) -> float:
        """Return ``metadata.fps`` from face.json, or DEFAULT_FPS if unusable."""
        metadata = face_data.get("metadata") if isinstance(face_data, dict) else None
        raw = metadata.get("fps") if isinstance(metadata, dict) else None
        try:
            fps = float(raw)
        except (TypeError, ValueError):
            return FaceCropExtractor.DEFAULT_FPS
        # The chained comparison also rejects NaN and infinity.
        if 0 < fps < float("inf"):
            return fps
        return FaceCropExtractor.DEFAULT_FPS

    @staticmethod
    def _build_trace_lookup(traced_data) -> Dict[Tuple[int, int, int], int]:
        """Map ``(frame, x, y)`` to trace_id for every traced face."""
        lookup: Dict[Tuple[int, int, int], int] = {}
        frames = traced_data.get("frames", {}) if isinstance(traced_data, dict) else {}
        if not isinstance(frames, dict):
            return lookup
        for fnum, frm in frames.items():
            for face in frm.get("faces") or []:
                if face is None:
                    continue
                trace_id = face.get("trace_id")
                if trace_id:  # skips None and 0
                    lookup[(int(fnum), face.get("x", 0), face.get("y", 0))] = trace_id
        return lookup

    def _collect_tasks(self, face_data, trace_lookup, uuid_dir: Path) -> List[dict]:
        """Filter the faces in face.json and return one crop task per face.

        Updates the ``total_faces`` / ``low_confidence`` / ``too_small`` /
        ``skipped`` / ``qualified`` counters and creates the per-trace output
        directories.
        """
        frames = face_data.get("frames", {})
        if isinstance(frames, dict):
            frame_items = frames.items()
        elif isinstance(frames, list):
            frame_items = [(frm.get("frame"), frm) for frm in frames if isinstance(frm, dict)]
        else:
            frame_items = []

        tasks: List[dict] = []
        processed = set()   # crop keys already scheduled
        used_paths = set()  # output files already claimed by a task
        for fnum, frm in frame_items:
            if fnum is None or not isinstance(frm, dict):
                continue
            for face in frm.get("faces") or []:
                if face is None:
                    continue
                self.stats["total_faces"] += 1

                bb = face.get("bbox", face)
                if not isinstance(bb, dict):
                    bb = {}
                x = bb.get("x", 0)
                y = bb.get("y", 0)
                # ``or 0`` also covers keys that are present but null.
                w = bb.get("width") or 0
                h = bb.get("height") or 0
                confidence = face.get("confidence") or 0.0

                # Quality filtering: confidence + size
                if confidence < self.MIN_CONFIDENCE:
                    self.stats["low_confidence"] += 1
                    continue
                if w < self.MIN_SIZE or h < self.MIN_SIZE:
                    self.stats["too_small"] += 1
                    continue

                # At least one eye landmark is required.
                lm = face.get("landmarks")
                if isinstance(lm, dict):
                    has_eyes = bool(lm.get("left_eye") or lm.get("right_eye"))
                elif isinstance(lm, list):
                    has_eyes = len(lm) >= 2
                else:
                    has_eyes = False
                if not has_eyes:
                    self.stats["skipped"] += 1
                    continue
                self.stats["qualified"] += 1

                frame_no = int(fnum)
                trace_id = trace_lookup.get((frame_no, x, y)) or face.get("trace_id")
                if trace_id:
                    output_dir = uuid_dir / str(trace_id)
                    crop_key = (trace_id, frame_no)
                else:
                    # No trace_id -> "unbound" directory, one crop per bbox.
                    output_dir = uuid_dir / "unbound"
                    crop_key = ("unbound", frame_no, x, y)
                if crop_key in processed:
                    continue
                processed.add(crop_key)

                output_dir.mkdir(parents=True, exist_ok=True)
                output_path = output_dir / f"{fnum}.jpg"
                if output_path in used_paths:
                    # Several unbound faces in one frame used to share
                    # "{frame}.jpg", so parallel ffmpeg jobs overwrote each
                    # other. Later faces get the bbox origin in the name.
                    output_path = output_dir / f"{fnum}_{x}_{y}.jpg"
                used_paths.add(output_path)

                tasks.append({
                    "trace_id": trace_id or "unbound",
                    "frame": frame_no,
                    "x": x, "y": y, "w": w, "h": h,
                    "output_path": output_path
                })
        return tasks

    def process_video(self, uuid: str, video_path: str) -> dict:
        """Extract crops for every qualifying face of one video.

        Returns ``{"successful": [...], "failed": [...]}``, or
        ``{"error": ...}`` when face.json or the video file is missing.
        """
        face_json = self.output_dir / f"{uuid}.face.json"
        traced_json = self.output_dir / f"{uuid}.face_traced.json"
        if not face_json.exists():
            print(f"[ERROR] face.json not found: {uuid}")
            return {"error": "face.json not found"}
        if not os.path.exists(video_path):
            print(f"[ERROR] Video not found: {video_path}")
            return {"error": "video not found"}

        # Load face.json (landmarks)
        print(f"[LOAD] Reading {face_json}")
        with open(face_json) as f:
            face_data = json.load(f)

        # Load face_traced.json if exists (trace_id)
        traced_data = {}
        if traced_json.exists():
            print(f"[LOAD] Reading {traced_json}")
            with open(traced_json) as f:
                traced_data = json.load(f)

        trace_lookup = self._build_trace_lookup(traced_data)
        fps = self._resolve_fps(face_data)

        uuid_dir = self.faces_dir / uuid
        uuid_dir.mkdir(parents=True, exist_ok=True)
        tasks = self._collect_tasks(face_data, trace_lookup, uuid_dir)

        results = {"successful": [], "failed": []}
        trace_counts: Dict = {}  # trace_id (or "unbound") -> crop count

        # Parallel extraction
        print(f"[EXTRACT] Processing {len(tasks)} faces with {MAX_WORKERS} threads...")
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {
                executor.submit(
                    self.extract_face, video_path, t["frame"],
                    t["x"], t["y"], t["w"], t["h"], t["output_path"], fps
                ): t for t in tasks
            }
            for i, future in enumerate(as_completed(futures)):
                t = futures[future]
                result = future.result()
                if result["success"]:
                    self.stats["successful"] += 1
                    results["successful"].append({
                        "trace_id": t["trace_id"],
                        "frame": t["frame"],
                        "path": str(t["output_path"])
                    })
                    trace_counts[t["trace_id"]] = trace_counts.get(t["trace_id"], 0) + 1
                else:
                    self.stats["failed"] += 1
                    results["failed"].append({
                        "trace_id": t["trace_id"],
                        "frame": t["frame"],
                        "bbox": {"x": t["x"], "y": t["y"], "w": t["w"], "h": t["h"]},
                        "reason": result.get("reason", "unknown")
                    })
                # Progress every 1000
                if (i + 1) % 1000 == 0:
                    print(f"  Progress: {i+1}/{len(tasks)} ({self.stats['successful']} OK, {self.stats['failed']} fail)")

        # Write summary
        self.write_summary(uuid, trace_counts, results)
        return results

    def extract_face(self, video_path: str, frame: int, x: int, y: int,
                     w: int, h: int, output_path: Path, fps: float = 24.0) -> dict:
        """Crop one face with ffmpeg, retrying up to MAX_RETRIES times.

        ``fps`` converts the frame number into the ``-ss`` seek timestamp; a
        non-positive value falls back to DEFAULT_FPS. Returns
        ``{"success": True, "path": ...}`` or
        ``{"success": False, "reason": ...}``.
        """
        if not fps or fps <= 0:
            fps = self.DEFAULT_FPS
        ts = frame / fps
        cmd = [
            "ffmpeg", "-y", "-ss", f"{ts:.3f}",
            "-i", video_path,
            "-vf", f"crop={w}:{h}:{x}:{y}",
            "-frames:v", "1",
            "-q:v", "2",  # high-quality JPEG
            str(output_path)
        ]
        for attempt in range(MAX_RETRIES):
            is_last = attempt == MAX_RETRIES - 1
            try:
                proc = subprocess.run(
                    cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                    timeout=FFMPEG_TIMEOUT
                )
                if proc.returncode != 0:
                    failure = "ffmpeg_failed"
                else:
                    quality = self.check_quality(output_path)
                    if quality["ok"]:
                        return {"success": True, "path": str(output_path)}
                    failure = quality.get("reason", "quality_failed")
                    # Remove the rejected file before the next attempt.
                    if not is_last and output_path.exists():
                        output_path.unlink()
            except subprocess.TimeoutExpired:
                failure = "timeout"
            except Exception as e:
                # Anything else (e.g. ffmpeg not installed) is not retried.
                return {"success": False, "reason": str(e)}
            if is_last:
                return {"success": False, "reason": failure}
        return {"success": False, "reason": "max_retries"}

    def check_quality(self, path: Path) -> dict:
        """Validate a crop: file size, mean brightness and pixel std-dev.

        Returns ``{"ok": True}`` or ``{"ok": False, "reason": ...}``. When
        PIL or numpy is not installed only the file-size check is applied.
        """
        if not path.exists():
            return {"ok": False, "reason": "file_not_exist"}
        file_size = path.stat().st_size
        if file_size < MIN_FILE_SIZE:
            return {"ok": False, "reason": f"empty_file ({file_size}B)"}
        try:
            from PIL import Image
            import numpy as np
            # The context manager closes the file handle once it is decoded.
            with Image.open(path) as img:
                arr = np.array(img.convert('RGB'))
            mean_brightness = arr.mean()
            if mean_brightness < MIN_BRIGHTNESS:
                return {"ok": False, "reason": f"black_frame (mean={mean_brightness:.1f})"}
            std_dev = arr.std()
            if std_dev < MIN_STD_DEV:
                return {"ok": False, "reason": f"low_contrast (std={std_dev:.1f})"}
            return {"ok": True}
        except ImportError:
            # PIL not available, skip advanced quality check
            return {"ok": True}
        except Exception as e:
            return {"ok": False, "reason": str(e)}

    def write_summary(self, uuid: str, trace_counts: Dict, results: dict):
        """Write ``_summary.json`` into the UUID's crop directory."""
        summary_path = self.faces_dir / uuid / "_summary.json"
        summary = {
            "file_uuid": uuid,
            "timestamp": datetime.now().isoformat(),
            "stats": self.stats,
            "trace_counts": trace_counts,
            "total_traces": len(trace_counts),
            "failed_count": len(results["failed"]),
            "failed_faces": results["failed"] if results["failed"] else None
        }
        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)
        print(f"\n[SUMMARY] Written to {summary_path}")

    def print_stats(self):
        """Print the counters accumulated by process_video."""
        print("\n=== Statistics ===")
        print(f"Total faces scanned:           {self.stats['total_faces']}")
        print(f"Filtered (low confidence < {self.MIN_CONFIDENCE}): {self.stats['low_confidence']}")
        print(f"Filtered (too small < {self.MIN_SIZE}px):   {self.stats['too_small']}")
        # Labels corrected: qualification does not depend on trace_id, and
        # "skipped" only counts faces without eye landmarks.
        print(f"Qualified (confidence + size + eyes): {self.stats['qualified']}")
        print(f"Successfully extracted:        {self.stats['successful']}")
        print(f"Failed:                        {self.stats['failed']}")
        print(f"Skipped (no eyes):             {self.stats['skipped']}")
def main():
    """CLI entry point: resolve the video path and extract its face crops.

    Exits with status 1 when no video path can be resolved or when
    ``process_video`` reports an error (missing face.json or video file).
    """
    parser = argparse.ArgumentParser(
        description="Extract face crops from videos",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )
    parser.add_argument("--uuid", type=str, required=True,
                        help="File UUID to process")
    parser.add_argument("--video", type=str,
                        help="Video file path (optional, will check DB if not provided)")
    parser.add_argument("--output-dir", type=str,
                        default="/Users/accusys/momentry/output_dev",
                        help="Output directory (default: output_dev)")
    args = parser.parse_args()

    # Prefer the explicit --video path; otherwise look it up in the database.
    video_path = args.video or query_video_path_from_db(args.uuid)
    if not video_path:
        print(f"[ERROR] Video path not found for UUID: {args.uuid}")
        sys.exit(1)

    print("=== Face Crop Extraction ===")
    print(f"UUID: {args.uuid}")
    print(f"Video: {video_path}")
    print(f"Output: {args.output_dir}/.faces/{args.uuid}/")
    print()

    extractor = FaceCropExtractor(args.output_dir)
    results = extractor.process_video(args.uuid, video_path)
    extractor.print_stats()
    # process_video signals a fatal problem with an {"error": ...} dict;
    # surface it through the exit status instead of exiting 0.
    if "error" in results:
        sys.exit(1)
def query_video_path_from_db(uuid: str) -> Optional[str]:
    """Look up ``file_path`` for a file UUID in ``public.videos`` via psql.

    Returns the path, or None when psql is unavailable, the query fails,
    times out, or yields no row.
    """
    import shutil

    psql_path = "/opt/homebrew/Cellar/libpq/18.3/bin/psql"
    if not os.path.exists(psql_path):
        # The Homebrew path is version-pinned; fall back to psql on PATH.
        psql_path = shutil.which("psql")
        if not psql_path:
            return None
    # The UUID is passed as a psql variable and quoted by psql itself
    # (:'file_uuid') rather than being spliced into the SQL text. Variable
    # interpolation is not applied to -c commands, so the statement is fed
    # through stdin instead.
    cmd = [
        psql_path, "-U", "accusys", "-d", "momentry", "-t", "-A",
        "-v", f"file_uuid={uuid}"
    ]
    query = "SELECT file_path FROM public.videos WHERE file_uuid = :'file_uuid' LIMIT 1;\n"
    try:
        proc = subprocess.run(cmd, input=query, capture_output=True, text=True, timeout=5)
    except Exception:
        return None
    if proc.returncode != 0:
        return None
    path = proc.stdout.strip()
    return path if path else None


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,174 @@
#!/opt/homebrew/bin/python3.11
"""
Extract face embeddings for a video file using InsightFace + CoreML FaceNet.
Updates face_detections.embedding in PostgreSQL.
Usage: python3 scripts/extract_video_embeddings.py --file-uuid <uuid> --video-path <path>
"""
import argparse
import json
import os
import sys
import io
import warnings
import cv2
import numpy as np
import psycopg2
from psycopg2.extras import execute_values
# Silence all Python warnings for the whole process.
warnings.filterwarnings("ignore")
# Connection string; DATABASE_URL from the environment overrides the local default.
DATABASE_URL = os.getenv("DATABASE_URL", "postgres://accusys@localhost:5432/momentry")
# "models" directory, resolved as a sibling of the directory containing this script.
MODELS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "models")
# Path of the CoreML FaceNet package inside MODELS_DIR.
FACENET_PATH = os.path.join(MODELS_DIR, "facenet512.mlpackage")
def get_schema():
    """Return the database schema to use: ``"dev"`` or ``"public"``.

    ``"dev"`` is selected when DATABASE_URL contains ``search_path=dev`` or
    when the DATABASE_SCHEMA environment variable equals ``dev``.
    """
    db_url = os.getenv("DATABASE_URL", "")
    # The previous membership test looked for an environment variable that is
    # literally *named* "DATABASE_SCHEMA=dev", so it could never match.
    if "search_path=dev" in db_url or os.getenv("DATABASE_SCHEMA") == "dev":
        return "dev"
    return "public"
def _bbox_iou(x, y, w, h, det_bbox):
    """Return the IoU of an (x, y, w, h) box and an (x1, y1, x2, y2) box."""
    fx1, fy1, fx2, fy2 = det_bbox
    inter_w = max(0, min(x + w, fx2) - max(x, fx1))
    inter_h = max(0, min(y + h, fy2) - max(y, fy1))
    inter = inter_w * inter_h
    union = w * h + (fx2 - fx1) * (fy2 - fy1) - inter
    return inter / union if union > 0 else 0


def extract_video_embeddings(file_uuid: str, video_path: str, schema: str = "dev"):
    """Fill in missing ``face_detections.embedding`` values for one video.

    For every stored detection without an embedding, the frame is re-read,
    InsightFace is run on it, and the detected face with the highest IoU
    (> 0.3) against the stored box supplies the embedding.

    Args:
        file_uuid: ``file_uuid`` of the rows to update.
        video_path: Video file the detections were made on.
        schema: Database schema that holds ``face_detections``.
    """
    import contextlib
    from psycopg2 import sql

    # Suppress InsightFace verbose output while the model loads.
    with contextlib.redirect_stdout(io.StringIO()):
        from insightface.app import FaceAnalysis
        app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
        app.prepare(ctx_id=0, det_thresh=0.5)
    # NOTE(review): the CoreML FaceNet model used to be loaded here but was
    # never used; embeddings come from InsightFace only. Confirm that is the
    # embedding space the rest of the pipeline expects.

    # Schema-qualified table name, quoted as an identifier.
    table = sql.Identifier(schema, "face_detections")

    cap = cv2.VideoCapture(video_path)
    conn = None
    try:
        if not cap.isOpened():
            print(f"[EMBED] Cannot open video: {video_path}")
            return
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"[EMBED] Video: {total_frames} frames, {fps} fps")

        # Get face detections from DB (without embeddings)
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()
        cur.execute(sql.SQL("""
            SELECT id, frame_number, x, y, width, height
            FROM {table}
            WHERE file_uuid = %s AND embedding IS NULL
            ORDER BY frame_number
        """).format(table=table), (file_uuid,))
        face_records = cur.fetchall()
        print(f"[EMBED] Faces without embedding: {len(face_records)}")
        if len(face_records) == 0:
            print("[EMBED] All faces have embeddings")
            return

        # Build frame -> faces mapping
        frame_faces = {}
        for face_id, frame_num, x, y, w, h in face_records:
            frame_faces.setdefault(frame_num, []).append((face_id, x, y, w, h))

        # Extract embeddings
        batch_updates = []
        processed_frames = 0
        for frame_num in sorted(frame_faces):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                continue
            # Detect faces in this frame
            detected = app.get(frame)
            for face_id, x, y, w, h in frame_faces[frame_num]:
                # Pick the detection that overlaps the stored box the most.
                best_face = None
                best_iou = 0
                for det_face in detected:
                    iou = _bbox_iou(x, y, w, h, det_face.bbox)
                    if iou > best_iou:
                        best_iou = iou
                        best_face = det_face
                if best_face is None or best_iou <= 0.3:
                    continue
                embedding = best_face.embedding
                if embedding is not None and len(embedding) > 0:
                    batch_updates.append((embedding.tolist(), face_id))
            processed_frames += 1
            if processed_frames % 100 == 0:
                print(f"[EMBED] Progress: {processed_frames} frames, {len(batch_updates)} embeddings")

        # Update embeddings in DB
        if batch_updates:
            print(f"[EMBED] Updating {len(batch_updates)} embeddings...")
            update_stmt = sql.SQL("""
                UPDATE {table}
                SET embedding = %s
                WHERE id = %s
            """).format(table=table)
            for emb, face_id in batch_updates:
                cur.execute(update_stmt, (emb, face_id))
            conn.commit()

        # Verify
        cur.execute(sql.SQL("""
            SELECT COUNT(embedding) FROM {table}
            WHERE file_uuid = %s
        """).format(table=table), (file_uuid,))
        embed_count = cur.fetchone()[0]
        print(f"[EMBED] Done: {embed_count} faces with embeddings")
        cur.close()
    finally:
        # Release the capture and the connection on every exit path,
        # including the early returns and exceptions.
        cap.release()
        if conn is not None:
            conn.close()
def main():
    """Parse the command line and run extract_video_embeddings."""
    cli = argparse.ArgumentParser(description="Extract face embeddings from video")
    # (flag, keyword arguments) for every supported option.
    option_table = (
        ("--file-uuid", {"required": True, "help": "Video file UUID"}),
        ("--video-path", {"required": True, "help": "Video file path"}),
        ("--schema", {"default": get_schema(), "help": "Database schema"}),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()
    extract_video_embeddings(opts.file_uuid, opts.video_path, opts.schema)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,64 @@
#!/opt/homebrew/bin/python3.11
"""Insert face detections from traced JSON into DB."""
import json, os, sys
import psycopg2
import psycopg2.extras
# Connection string; DATABASE_URL from the environment overrides the local default.
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
def insert_faces(file_uuid, traced_json_path, schema):
    """Insert every face of a traced face JSON into ``{schema}.face_detections``.

    Args:
        file_uuid: Value stored in the ``file_uuid`` column of each row.
        traced_json_path: JSON file with ``frames`` (frame number -> faces)
            and optional ``metadata.fps``.
        schema: Database schema that holds ``face_detections``.

    A row that fails to insert is reported and skipped; rows inserted before
    it are kept.
    """
    from psycopg2 import sql

    with open(traced_json_path) as f:
        data = json.load(f)
    frames = data.get("frames", {})
    metadata = data.get("metadata") or {}

    # Guard against a missing, null, zero or non-numeric fps in the metadata.
    try:
        fps = float(metadata.get("fps", 24.0))
    except (TypeError, ValueError):
        fps = 24.0
    if not fps > 0:
        fps = 24.0

    insert_stmt = sql.SQL(
        """
        INSERT INTO {table}
        (file_uuid, frame_number, timestamp_secs, x, y, width, height, confidence, trace_id, embedding)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT DO NOTHING
        """
    ).format(table=sql.Identifier(schema, "face_detections"))

    total = 0
    conn = psycopg2.connect(DB_URL)
    try:
        cur = conn.cursor()
        for frame_num_str, frame_data in sorted(frames.items(), key=lambda item: int(item[0])):
            frame_num = int(frame_num_str)
            ts = frame_num / fps
            for face in frame_data.get("faces") or []:
                row = (
                    file_uuid, frame_num, ts,
                    int(face.get("x", 0)),
                    int(face.get("y", 0)),
                    int(face.get("width", 0)),
                    int(face.get("height", 0)),
                    face.get("confidence", 0.0),
                    face.get("trace_id"),
                    face.get("embedding"),
                )
                # A plain rollback here would discard every earlier insert of
                # this transaction while ``total`` kept counting them. The
                # savepoint confines the rollback to the failing row.
                cur.execute("SAVEPOINT face_row")
                try:
                    cur.execute(insert_stmt, row)
                    inserted = cur.rowcount
                    cur.execute("RELEASE SAVEPOINT face_row")
                except psycopg2.Error as e:
                    print(f"[INSERT] Error at frame {frame_num}: {e}")
                    cur.execute("ROLLBACK TO SAVEPOINT face_row")
                else:
                    if inserted > 0:
                        total += 1
        conn.commit()
        cur.close()
    finally:
        conn.close()
    print(f"[INSERT] Inserted {total} face detections into {schema}.face_detections")
if __name__ == "__main__":
    import argparse

    # Both identifying options are mandatory; the schema defaults to "public".
    cli = argparse.ArgumentParser(description="Insert face detections")
    for required_flag in ("--file-uuid", "--face-json"):
        cli.add_argument(required_flag, required=True)
    cli.add_argument("--schema", default="public")
    opts = cli.parse_args()
    insert_faces(opts.file_uuid, opts.face_json, opts.schema)

View File

@@ -0,0 +1,201 @@
#!/opt/homebrew/bin/python3.11
"""
Match face_detections against TMDb identities via face embedding similarity.
Port of match_faces_against_tmdb from src/core/tmdb/face_agent.rs
Usage: python3 scripts/match_faces_to_tmdb.py <file_uuid> [--schema dev]
"""
import sys
import psycopg2
import psycopg2.extras
import numpy as np
from collections import defaultdict
# Connection string passed to psycopg2.connect.
DATABASE_URL = "postgres://accusys@localhost:5432/momentry"
# Minimum cosine similarity for a face to count as a hit for an identity.
THRESHOLD = 0.50
QC_MIN_FACES = 4 # Minimum faces per trace for QC
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors, or 0.0 if either has zero norm."""
    vec_a, vec_b = (np.array(v, dtype=np.float64) for v in (a, b))
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    # A zero-length vector has no direction; report "no similarity".
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return np.dot(vec_a, vec_b) / (norm_a * norm_b)
def _resolve_temporal_collisions(trace_faces, matched):
    """Un-match traces that share an identity with another trace in one frame.

    For each (frame, identity) claimed by more than one matched trace, the
    trace with the highest detection confidence in that frame is kept and
    the others are removed from ``matched`` (modified in place). Returns the
    number of traces removed.
    """
    # frame -> identity -> {trace_id: best confidence of that trace in the frame}.
    # Keying by trace_id means several detections of the *same* trace in one
    # frame no longer make the trace collide with itself.
    per_frame = defaultdict(lambda: defaultdict(dict))
    for tid, faces in trace_faces.items():
        if tid not in matched:
            continue
        identity_id = matched[tid][0]
        for f in faces:
            confidence = f["confidence"] or 0.0
            by_trace = per_frame[f["frame"]][identity_id]
            if tid not in by_trace or confidence > by_trace[tid]:
                by_trace[tid] = confidence

    removed = 0
    for by_identity in per_frame.values():
        for by_trace in by_identity.values():
            # Ignore traces already removed by an earlier collision.
            contenders = [(tid, conf) for tid, conf in by_trace.items() if tid in matched]
            if len(contenders) < 2:
                continue
            contenders.sort(key=lambda item: item[1], reverse=True)
            for tid, _ in contenders[1:]:
                del matched[tid]
                removed += 1
    return removed


def match_faces_to_tmdb(file_uuid: str, schema: str = "dev"):
    """Bind traced faces of one video to TMDb identities by embedding similarity.

    Each trace is compared against every TMDb identity embedding. The
    identity with the highest mean similarity over the hits >= THRESHOLD
    wins, subject to a minimum-faces check and a temporal-collision check.
    Matching rows of ``face_detections`` get ``identity_id`` set when it is
    still NULL.

    Args:
        file_uuid: ``file_uuid`` of the detections to match.
        schema: Database schema holding ``identities`` and ``face_detections``.

    Returns:
        Number of ``face_detections`` rows updated (0 when there is nothing
        to match).
    """
    import os
    from psycopg2 import sql

    identities_tbl = sql.Identifier(schema, "identities")
    detections_tbl = sql.Identifier(schema, "face_detections")

    # Honour DATABASE_URL like the sibling scripts; fall back to the constant.
    conn = psycopg2.connect(os.environ.get("DATABASE_URL", DATABASE_URL))
    try:
        cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)

        # Step 1: Load TMDb identities with face embeddings
        cur.execute(sql.SQL("""
            SELECT id, name, tmdb_id, face_embedding::real[] as embedding
            FROM {tbl}
            WHERE source = 'tmdb' AND face_embedding IS NOT NULL
        """).format(tbl=identities_tbl))
        tmdb_identities = []
        for row in cur.fetchall():
            emb = row["embedding"]
            if emb and len(emb) > 0:
                tmdb_identities.append({
                    "id": row["id"],
                    "name": row["name"],
                    "tmdb_id": row["tmdb_id"],
                    "embedding": emb,
                })
        print(f"[TMDB-MATCH] Loaded {len(tmdb_identities)} TMDb identities")
        if not tmdb_identities:
            print("[TMDB-MATCH] No TMDb identities with embeddings")
            return 0

        # Step 2: Load face_detections with trace_id and embedding
        cur.execute(sql.SQL("""
            SELECT id, trace_id, frame_number, embedding::real[] as embedding, confidence
            FROM {tbl}
            WHERE file_uuid = %s AND trace_id IS NOT NULL AND embedding IS NOT NULL
            ORDER BY trace_id, frame_number
        """).format(tbl=detections_tbl), (file_uuid,))
        fd_rows = cur.fetchall()
        if not fd_rows:
            print(f"[TMDB-MATCH] No face detections for {file_uuid}")
            return 0

        # Group by trace_id
        grouped = defaultdict(list)
        for row in fd_rows:
            emb = row["embedding"]
            if emb:
                grouped[row["trace_id"]].append({
                    "id": row["id"],
                    "embedding": emb,
                    "frame": row["frame_number"],
                    "confidence": row["confidence"],
                })

        # Remember every detection id per trace *before* de-duplication, so
        # the final update binds the whole trace and not only the faces that
        # survive as representatives.
        trace_face_ids = {tid: [f["id"] for f in faces] for tid, faces in grouped.items()}

        # Dedup near-identical embeddings within trace (sim > 0.99)
        trace_faces = {}
        for tid, faces in grouped.items():
            faces.sort(key=lambda face: face["embedding"][0])
            unique = []
            for f in faces:
                if not unique or cosine_similarity(f["embedding"], unique[-1]["embedding"]) <= 0.99:
                    unique.append(f)
            trace_faces[tid] = unique

        total_traces = len(trace_faces)
        total_faces = len(fd_rows)
        print(f"[TMDB-MATCH] {total_traces} traces with {total_faces} faces")

        # Step 3: Single-pass matching (one round only for performance)
        matched = {}  # trace_id -> (identity_id, name, score)
        # Build reference pool from TMDb seeds only
        reference_pool = [
            {"embedding": tmdb["embedding"], "identity_id": tmdb["id"], "name": tmdb["name"]}
            for tmdb in tmdb_identities
        ]
        print(f"[TMDB-MATCH] Matching {total_traces} traces against {len(reference_pool)} TMDb identities (threshold={THRESHOLD})")

        # Match each trace against TMDb seeds
        for tid, faces in trace_faces.items():
            trace_scores = defaultdict(list)
            for f in faces:
                for ref in reference_pool:
                    sim = cosine_similarity(f["embedding"], ref["embedding"])
                    if sim >= THRESHOLD:
                        trace_scores[ref["identity_id"]].append((sim, ref["name"]))
            if not trace_scores:
                continue
            # Select identity with highest aggregate score
            best_identity = None
            best_score = 0
            best_name = None
            for identity_id, scores in trace_scores.items():
                avg_sim = np.mean([s[0] for s in scores])
                if avg_sim > best_score:
                    best_score = avg_sim
                    best_identity = identity_id
                    best_name = scores[0][1]
            if best_identity is not None:
                matched[tid] = (best_identity, best_name, best_score)

        # Step 4: Quality Control - minimum faces per trace
        qc_removed = 0
        for tid in list(matched):
            if len(trace_faces[tid]) < QC_MIN_FACES:
                del matched[tid]
                qc_removed += 1

        # Step 5: Temporal collision check
        qc_removed += _resolve_temporal_collisions(trace_faces, matched)
        if qc_removed > 0:
            print(f"[TMDB-MATCH] QC removed {qc_removed} traces")

        # Step 6: Update face_detections.identity_id
        update_stmt = sql.SQL("""
            UPDATE {tbl}
            SET identity_id = %s
            WHERE id = %s AND identity_id IS NULL
        """).format(tbl=detections_tbl)
        bindings_created = 0
        for tid, (identity_id, _name, _score) in matched.items():
            for face_id in trace_face_ids[tid]:
                cur.execute(update_stmt, (identity_id, face_id))
                bindings_created += cur.rowcount
        conn.commit()
        cur.close()
    finally:
        # Close the connection on every exit path (early return or error).
        conn.close()

    print(f"[TMDB-MATCH] {bindings_created} bindings created, {len(matched)} traces matched")
    return bindings_created
if __name__ == "__main__":
    import argparse

    # One positional UUID plus an optional schema override.
    cli = argparse.ArgumentParser()
    cli.add_argument("file_uuid", help="Video file UUID")
    cli.add_argument("--schema", default="dev", help="Database schema")
    opts = cli.parse_args()
    match_faces_to_tmdb(file_uuid=opts.file_uuid, schema=opts.schema)

View File

@@ -84,18 +84,22 @@ def process_ocr(
def _fallback(video_path, output_path, uuid, sample_interval):
"""Fallback to original PaddleOCR implementation"""
"""Fallback to MPS OCR implementation"""
import importlib
spec = importlib.util.spec_from_file_location(
"paddle_ocr",
os.path.join(os.path.dirname(__file__), "ocr_paddle.py")
"ocr_mps",
os.path.join(os.path.dirname(__file__), "ocr_processor_mps.py")
)
if spec is None:
print("[OCR] No fallback available, returning empty result", file=sys.stderr)
return {"frame_count": 0, "fps": 0, "frames": []}
paddle = importlib.util.module_from_spec(spec)
spec.loader.exec_module(paddle)
return paddle.process_ocr(video_path, output_path, uuid, sample_interval=sample_interval)
ocr_mps = importlib.util.module_from_spec(spec)
spec.loader.exec_module(ocr_mps)
return ocr_mps.process_video_ocr(
video_path=video_path,
output_path=output_path,
sample_interval=sample_interval
)
if __name__ == "__main__":

View File

@@ -0,0 +1,77 @@
#!/usr/bin/env python3
"""Sync all vectors from one Qdrant collection to another on the same instance."""
import json
import time
import urllib.request
import urllib.error
# Base URL of the Qdrant instance that hosts both collections.
QDRANT_URL = "http://localhost:6333"
# Sent as the "Api-Key" header on every request.
# NOTE(review): this credential is committed in source; consider loading it
# from the environment instead.
API_KEY = "Test3200Test3200Test3200"
# Collection that points are scrolled from.
SOURCE = "momentry_dev_rule1_v2"
# Collection that points are upserted into.
TARGET = "momentry_rule1"
# Points requested per scroll call (the "limit" field).
BATCH_SIZE = 500
# Seconds to sleep between batches.
SLEEP = 0.05
def qdrant(method, path, body=None, timeout=60):
    """Send one JSON request to Qdrant and return the decoded JSON response.

    Args:
        method: HTTP method, e.g. ``"POST"`` or ``"PUT"``.
        path: URL path appended to QDRANT_URL.
        body: JSON-serialisable payload, or None for a request without body.
        timeout: Socket timeout in seconds, so a stalled server cannot hang
            the sync forever.

    Raises:
        urllib.error.HTTPError: re-raised after the status and response body
            have been printed.
    """
    url = f"{QDRANT_URL}{path}"
    # ``is not None`` so that an explicitly empty payload ({}) is still sent.
    data = json.dumps(body).encode() if body is not None else None
    req = urllib.request.Request(url, data=data, method=method)
    req.add_header("Content-Type", "application/json")
    req.add_header("Api-Key", API_KEY)
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # Decode leniently: a non-UTF-8 error body must not mask the HTTP error.
        print(f"  HTTP {e.code}: {e.read().decode(errors='replace')}")
        raise
def scroll_batch(offset=None):
    """Fetch one page of points (with payload and vector) from SOURCE.

    Returns ``(points, next_page_offset)``; the offset is None when the
    response carries no further page.
    """
    request = {"limit": BATCH_SIZE, "with_payload": True, "with_vector": True}
    if offset is not None:
        request["offset"] = offset
    response = qdrant("POST", f"/collections/{SOURCE}/points/scroll", request)
    page = response.get("result", {})
    return page.get("points", []), page.get("next_page_offset")
def upsert_batch(points):
    """Upsert ``points`` into TARGET; return True when Qdrant reports "ok"."""
    response = qdrant("PUT", f"/collections/{TARGET}/points", {"points": points})
    # Accept the status either at the top level or nested under "result".
    if response.get("status") == "ok":
        return True
    return response.get("result", {}).get("status") == "ok"
def main():
    """Copy every point from SOURCE to TARGET, one scroll page at a time.

    Exits with status 1 when an upsert is rejected, instead of reporting the
    partial sync as done.
    """
    offset = None
    total = 0
    batch_num = 0
    failed = False
    t0 = time.time()
    while True:
        points, offset = scroll_batch(offset)
        if not points:
            break
        if not upsert_batch(points):
            print(f"  FAILED batch {batch_num} ({len(points)} pts)")
            failed = True
            break
        total += len(points)
        batch_num += 1
        if batch_num % 10 == 0:
            elapsed = time.time() - t0
            print(f"  Synced {total} points ({elapsed:.1f}s)")
        # No next-page offset means the last page has been copied.
        if offset is None:
            break
        time.sleep(SLEEP)
    elapsed = time.time() - t0
    if failed:
        print(f"Aborted: {total} points synced before failure ({elapsed:.1f}s)")
        raise SystemExit(1)
    print(f"Done: {total} points synced in {elapsed:.1f}s")


if __name__ == "__main__":
    main()

View File

@@ -4,6 +4,24 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
ENV_FILE="${PROJECT_DIR}/.env.development"
# Load env vars (silently)
source "$ENV_FILE" 2>/dev/null || true
# Path defaults (can be overridden by env vars above)
LOG_DIR="${MOMENTRY_LOG_DIR:-/Users/accusys/momentry/logs}"
PG_BIN_DIR="${MOMENTRY_PG_BIN_DIR:-/Users/accusys/pgsql/18.3/bin}"
PG_DATA_DIR="${MOMENTRY_PG_DATA_DIR:-/Users/accusys/pgsql/data}"
QDRANT_BIN="${MOMENTRY_QDRANT_BIN:-/Users/accusys/.cargo/bin/qdrant}"
QDRANT_STORAGE_DIR="${MOMENTRY_QDRANT_STORAGE_DIR:-/Users/accusys/momentry/qdrant_storage}"
LLAMACPP_BIN="${MOMENTRY_LLAMACPP_BIN:-/Users/accusys/llama/bin/llama-server}"
A4B_MODEL="${MOMENTRY_LLM_A4B_MODEL_PATH:-/Users/accusys/models/google_gemma-4-26B-A4B-it-Q5_K_M.gguf}"
A4B_MMPROJ="${MOMENTRY_LLM_A4B_MMPROJ_PATH:-/Users/accusys/models/gemma-4-26B-A4B-it.mmproj-f16.gguf}"
E4B_MODEL="${MOMENTRY_LLM_E4B_MODEL_PATH:-/Users/accusys/models/gemma-4-E4B-it-Q4_K_M.gguf}"
E4B_MMPROJ="${MOMENTRY_LLM_E4B_MMPROJ_PATH:-/Users/accusys/models/mmproj-gemma-4-E4B-it-BF16.gguf}"
OLLAMA_BIN="${MOMENTRY_OLLAMA_BIN:-/Users/accusys/bin/ollama}"
PLAYGROUND_BIN="${MOMENTRY_PLAYGROUND_BIN:-target/debug/momentry_playground}"
API_KEY="${MOMENTRY_API_KEY:-muser_68600856036340bcafc01930eb4bd839_1774418104_97221b69}"
# Colors
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
@@ -17,27 +35,23 @@ check() {
if [ $? -eq 0 ]; then echo -e " ${GREEN}${NC} $1"; else echo -e " ${RED}${NC} $1"; FAILURES+=("$1"); fi
}
echo -e "${CYAN}====================================${NC}"
echo -e "${CYAN}========================================${NC}"
echo -e "${CYAN} Momentry Core - Startup Sequence${NC}"
echo -e "${CYAN}====================================${NC}"
echo -e "${CYAN}========================================${NC}"
echo ""
LOG_DIR="/Users/accusys/momentry/logs"
# ── 1. PostgreSQL ──
echo -e "${YELLOW}[1/8] PostgreSQL${NC}"
PG_DATA="/Users/accusys/pgsql/data"
PG_BIN="/Users/accusys/pgsql/18.3/bin"
if $PG_BIN/pg_isready -q 2>/dev/null; then
echo -e "${YELLOW}[1/10] PostgreSQL${NC}"
if "$PG_BIN_DIR/pg_isready" -q 2>/dev/null; then
echo -e " ${GREEN}${NC} already running"
else
$PG_BIN/pg_ctl -D "$PG_DATA" -l "$LOG_DIR/pg.log" start 2>/dev/null
"$PG_BIN_DIR/pg_ctl" -D "$PG_DATA_DIR" -l "$LOG_DIR/pg.log" start 2>/dev/null
sleep 2
$PG_BIN/pg_isready -q 2>/dev/null; check "started"
"$PG_BIN_DIR/pg_isready" -q 2>/dev/null; check "started"
fi
# ── 2. Redis ──
echo -e "${YELLOW}[2/8] Redis${NC}"
echo -e "${YELLOW}[2/10] Redis${NC}"
if redis-cli ping 2>/dev/null | grep -q PONG; then
echo -e " ${GREEN}${NC} already running"
else
@@ -46,14 +60,22 @@ else
redis-cli ping 2>/dev/null | grep -q PONG; check "started"
fi
# ── 3. Qdrant ──
echo -e "${YELLOW}[3/8] Qdrant${NC}"
QDRANT_BIN="/Users/accusys/momentry_resources/bin/qdrant"
QDRANT_STORAGE="/Users/accusys/momentry/qdrant_storage"
# ── 3. MongoDB ──
echo -e "${YELLOW}[3/10] MongoDB${NC}"
if pgrep -q mongod 2>/dev/null; then
echo -e " ${GREEN}${NC} already running"
else
brew services start mongodb-community 2>/dev/null || mongod --dbpath /opt/homebrew/var/mongodb --logpath "$LOG_DIR/mongodb.log" --fork 2>/dev/null
sleep 2
pgrep -q mongod 2>/dev/null; check "started"
fi
# ── 4. Qdrant ──
echo -e "${YELLOW}[4/10] Qdrant${NC}"
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 http://localhost:6333/healthz 2>/dev/null | grep -q 200; then
echo -e " ${GREEN}${NC} already running"
else
mkdir -p "$QDRANT_STORAGE"
mkdir -p "$QDRANT_STORAGE_DIR"
"$QDRANT_BIN" > "$LOG_DIR/qdrant.log" 2>&1 &
for i in $(seq 1 15); do
sleep 2
@@ -64,9 +86,8 @@ else
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 http://localhost:6333/healthz 2>/dev/null | grep -q 200; check "started"
fi
# ── 4. Qdrant Collection ──
echo -e "${YELLOW}[4/8] Qdrant Collection${NC}"
source "$ENV_FILE" 2>/dev/null || true
# ── 5. Qdrant Collection ──
echo -e "${YELLOW}[5/10] Qdrant Collection${NC}"
COLLECTION="${QDRANT_COLLECTION:-momentry_dev_rule1_v2}"
EXISTS=$(curl -s "http://localhost:6333/collections/$COLLECTION" 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('result',{}).get('status','not_found'))" 2>/dev/null)
if [ "$EXISTS" = "not_found" ]; then
@@ -78,15 +99,19 @@ fi
curl -s "http://localhost:6333/collections/$COLLECTION" 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); s=d.get('result',{}).get('status','not_found'); assert s in ('green','ok'), f'unexpected status: {s}'" 2>/dev/null
check "collection '$COLLECTION' ready"
# ── 5. LLM (Gemma4 / llama.cpp) ──
echo -e "${YELLOW}[5/8] LLM Server (Gemma4)${NC}"
# ── 6a. LLM Chat (A4B, port 8082) ──
echo -e "${YELLOW}[6a/10] LLM Chat - A4B (port 8082)${NC}"
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:8082/health 2>/dev/null | grep -q 200; then
echo -e " ${GREEN}${NC} already running"
else
LLM_BIN="/Users/accusys/momentry_resources/llama/bin/llama-server"
LLM_MODEL="/Users/accusys/momentry/models/llm/google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
"$LLM_BIN" -m "$LLM_MODEL" --host 0.0.0.0 --port 8082 -ngl 99 -c 16384 --temp 0.1 --mlock --reasoning off > "$LOG_DIR/llama_server.log" 2>&1 &
echo -e " ${YELLOW}⏳ loading model (~30s)...${NC}"
LLAMA_ARGS_A4B=(
-m "$A4B_MODEL"
--mmproj "$A4B_MMPROJ"
--host 0.0.0.0 --port 8082
-ngl 99 -c 16384 --temp 0.1 --mlock --reasoning off
)
"$LLAMACPP_BIN" "${LLAMA_ARGS_A4B[@]}" > "$LOG_DIR/llama_a4b.log" 2>&1 &
echo -e " ${YELLOW}⏳ loading A4B model (~30s)...${NC}"
for i in $(seq 1 30); do
sleep 2
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://localhost:8082/health 2>/dev/null | grep -q 200; then
@@ -96,8 +121,30 @@ else
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 http://localhost:8082/health 2>/dev/null | grep -q 200; check "started"
fi
# ── 6. Embedding Server ──
echo -e "${YELLOW}[6/8] EmbeddingGemma${NC}"
# ── 6b. LLM Vision (E4B, port 8083) ──
echo -e "${YELLOW}[6b/10] LLM Vision - E4B (port 8083)${NC}"
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:8083/health 2>/dev/null | grep -q 200; then
echo -e " ${GREEN}${NC} already running"
else
LLAMA_ARGS_E4B=(
-m "$E4B_MODEL"
--mmproj "$E4B_MMPROJ"
--host 0.0.0.0 --port 8083
-ngl 99 -c 16384 --temp 0.1 --mlock
)
"$LLAMACPP_BIN" "${LLAMA_ARGS_E4B[@]}" > "$LOG_DIR/llama_e4b.log" 2>&1 &
echo -e " ${YELLOW}⏳ loading E4B model (~30s)...${NC}"
for i in $(seq 1 30); do
sleep 2
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://localhost:8083/health 2>/dev/null | grep -q 200; then
break
fi
done
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 http://localhost:8083/health 2>/dev/null | grep -q 200; check "started"
fi
# ── 7. Embedding Server ──
echo -e "${YELLOW}[7/10] EmbeddingGemma${NC}"
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:11436/health 2>/dev/null | grep -q 200; then
echo -e " ${GREEN}${NC} already running"
else
@@ -112,23 +159,22 @@ else
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:11436/health 2>/dev/null | grep -q 200; check "started"
fi
# ── 7. Playground Server ──
echo -e "${YELLOW}[7/8] Playground API Server${NC}"
if curl -s -o /dev/null -w "%{http_code}" -H "X-API-Key: muser_68600856036340bcafc01930eb4bd839_1774418104_97221b69" --connect-timeout 5 http://127.0.0.1:3003/api/v1/agents/5w1h/status 2>/dev/null | grep -q 200; then
# ── 8. Playground Server ──
echo -e "${YELLOW}[8/10] Playground API Server${NC}"
if curl -s -o /dev/null -w "%{http_code}" -H "X-API-Key: $API_KEY" --connect-timeout 5 http://127.0.0.1:3003/api/v1/agents/5w1h/status 2>/dev/null | grep -q 200; then
echo -e " ${GREEN}${NC} already running"
else
cd "$PROJECT_DIR"
target/debug/momentry_playground server > "$LOG_DIR/playground.log" 2>&1 &
$PLAYGROUND_BIN server > "$LOG_DIR/playground.log" 2>&1 &
sleep 4
curl -s -o /dev/null -w "%{http_code}" -H "X-API-Key: muser_68600856036340bcafc01930eb4bd839_1774418104_97221b69" --connect-timeout 5 http://127.0.0.1:3003/api/v1/agents/5w1h/status 2>/dev/null | grep -q 200; check "started"
curl -s -o /dev/null -w "%{http_code}" -H "X-API-Key: $API_KEY" --connect-timeout 5 http://127.0.0.1:3003/api/v1/agents/5w1h/status 2>/dev/null | grep -q 200; check "started"
fi
# ── 8. Ollama (Gemma4 E4B) ──
echo -e "${YELLOW}[8/8] Ollama (Gemma4 E4B)${NC}"
# ── 9. Ollama ──
echo -e "${YELLOW}[9/10] Ollama${NC}"
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:11434/api/tags 2>/dev/null | grep -q 200; then
echo -e " ${GREEN}${NC} already running"
else
OLLAMA_BIN="/Users/accusys/momentry_resources/bin/ollama"
if [ ! -f "$OLLAMA_BIN" ]; then
echo -e " ${YELLOW}⚠ ollama binary not found, skipping${NC}"
else
@@ -138,6 +184,16 @@ else
fi
fi
# ── 10. SFTPGo ──
echo -e "${YELLOW}[10/10] SFTPGo${NC}"
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:8080/api/v1/version 2>/dev/null | grep -q 200; then
echo -e " ${GREEN}${NC} already running"
else
/Users/accusys/bin/sftpgo serve -c /Users/accusys/momentry/etc/sftpgo > "$LOG_DIR/sftpgo.log" 2>&1 &
sleep 3
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:8080/api/v1/version 2>/dev/null | grep -q 200; check "started"
fi
echo ""
if [ ${#FAILURES[@]} -eq 0 ]; then
echo -e "${GREEN}====================================${NC}"
@@ -151,10 +207,13 @@ else
fi
echo ""
echo " Playground: http://127.0.0.1:3003"
echo " LLM: http://127.0.0.1:8082"
echo " LLM Chat: http://127.0.0.1:8082"
echo " LLM Vision: http://127.0.0.1:8083"
echo " Embedding: http://127.0.0.1:11436"
echo " Ollama: http://localhost:11434"
echo " Qdrant: http://localhost:6333"
echo " PostgreSQL: localhost:5432"
echo " Redis: localhost:6379"
echo " MongoDB: localhost:27017"
echo " SFTPGo: http://localhost:8080 (SFTP: port 2022)"
echo ""

View File

@@ -319,12 +319,13 @@ def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHE
cur.execute(
f"""
UPDATE {schema}.face_detections
SET trace_id = %s
SET trace_id = %s, embedding = %s
WHERE file_uuid = %s AND frame_number = %s
AND x = %s AND y = %s AND width = %s AND height = %s
""",
(
trace_id,
embed_vec,
file_uuid, frame_num, x, y, w, h,
),
)

View File

@@ -126,12 +126,24 @@ struct SwiftFace: ParsableCommand {
let imgH = CGFloat(cgImage.height)
// Process landmark observations FIRST (each has bbox + landmarks, self-consistent)
// Quality filtering
let MIN_CONFIDENCE = 0.6
let MIN_SIZE = 20
for lmObs in landmarkObservations {
// Confidence filter
let lmConf = Double(lmObs.confidence)
if lmConf < MIN_CONFIDENCE { continue }
let bb = lmObs.boundingBox
let faceX = Int(bb.origin.x * imgW)
let faceY = Int((1.0 - bb.origin.y - bb.size.height) * imgH)
let faceW = Int(bb.size.width * imgW)
let faceH = Int(bb.size.height * imgH)
// Size filter
if faceW < MIN_SIZE || faceH < MIN_SIZE { continue }
let faceX = Int(bb.origin.x * imgW)
let faceY = Int((1.0 - bb.origin.y - bb.size.height) * imgH)
var faceData: [String: Any] = [
"bbox": ["x": max(0, faceX), "y": max(0, faceY),
@@ -203,11 +215,21 @@ struct SwiftFace: ParsableCommand {
}
}
if matched { continue }
// Quality filtering for unmatched face rects
let MIN_CONFIDENCE = 0.6
let MIN_SIZE = 20
let faceConf = Double(faceObs.faceCaptureQuality ?? faceObs.confidence)
if faceConf < MIN_CONFIDENCE { continue }
let faceW = Int(fBB.size.width * imgW)
let faceH = Int(fBB.size.height * imgH)
if faceW < MIN_SIZE || faceH < MIN_SIZE { continue }
// Unmatched face rect: output without landmarks
let faceX = Int(fBB.origin.x * imgW)
let faceY = Int((1.0 - fBB.origin.y - fBB.size.height) * imgH)
let faceW = Int(fBB.size.width * imgW)
let faceH = Int(fBB.size.height * imgH)
var faceData: [String: Any] = [
"bbox": ["x": max(0, faceX), "y": max(0, faceY),

View File

@@ -0,0 +1,107 @@
#!/opt/homebrew/bin/python3.11
"""Update face_detections embeddings from face_traced.json"""
import json
import psycopg2
import sys
import os
DATABASE_URL = os.getenv("DATABASE_URL", "postgres://accusys@localhost:5432/momentry")
def update_embeddings(file_uuid: str, traced_json_path: str, schema: str = "dev"):
    """Backfill face_detections embeddings and trace ids from a face_traced.json file.

    Every face in the JSON that carries a non-empty ``embedding`` is matched to
    a ``{schema}.face_detections`` row by file UUID, frame number and bounding
    box. Rows whose ``embedding`` is still NULL receive the embedding and the
    face's ``trace_id``. Work is committed every 1000 faces.

    Args:
        file_uuid: UUID of the video whose detections are updated.
        traced_json_path: Path of the face_traced.json file to read.
        schema: Database schema holding ``face_detections``. It is interpolated
            into the SQL text as an identifier (placeholders cannot bind
            identifiers), so it must come from a trusted caller.

    Returns:
        The number of rows updated.
    """
    with open(traced_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    update_sql = f"""
        UPDATE {schema}.face_detections
        SET embedding = %s::real[], trace_id = %s
        WHERE file_uuid = %s AND frame_number = %s
        AND x = %s AND y = %s AND width = %s AND height = %s
        AND embedding IS NULL
    """
    commit_every = 1000
    max_reported_errors = 5

    updated = 0
    failed = 0
    pending = 0

    conn = psycopg2.connect(DATABASE_URL)
    try:
        cur = conn.cursor()
        try:
            for frame_key, frame_data in data.get("frames", {}).items():
                frame_num = int(frame_key)
                for face in frame_data.get("faces", []):
                    embedding = face.get("embedding")
                    if not embedding:
                        continue

                    params = (
                        # Pass a real list: psycopg2 renders it as ARRAY[...],
                        # which the ::real[] cast accepts. A "[...]" string is
                        # not a valid PostgreSQL array literal.
                        list(embedding),
                        face.get("trace_id"),
                        file_uuid,
                        frame_num,
                        face.get("x", 0),
                        face.get("y", 0),
                        face.get("width", 0),
                        face.get("height", 0),
                    )

                    # A failed statement aborts the whole transaction, so each
                    # row runs inside a savepoint: one bad row is skipped
                    # without poisoning the rest of the batch.
                    cur.execute("SAVEPOINT face_update")
                    try:
                        cur.execute(update_sql, params)
                        updated += cur.rowcount
                    except psycopg2.Error as exc:
                        cur.execute("ROLLBACK TO SAVEPOINT face_update")
                        failed += 1
                        if failed <= max_reported_errors:
                            print(
                                f"[UPDATE] Failed frame {frame_num}: {exc}",
                                file=sys.stderr,
                            )
                    cur.execute("RELEASE SAVEPOINT face_update")

                    pending += 1
                    if pending >= commit_every:
                        conn.commit()
                        pending = 0
                        print(
                            f"[UPDATE] Processed {updated} so far...",
                            file=sys.stderr,
                        )

            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()

    if failed:
        print(f"[UPDATE] Failed updates: {failed}", file=sys.stderr)
    print(f"[UPDATE] Total updated: {updated}")
    return updated
if __name__ == "__main__":
    import argparse

    # Command-line entry point: resolve the JSON path, then run the update.
    cli = argparse.ArgumentParser()
    cli.add_argument("--file-uuid", required=True)
    cli.add_argument("--traced-json", help="Path to face_traced.json")
    cli.add_argument("--schema", default="dev")
    opts = cli.parse_args()

    # Without an explicit path, fall back to the dev output directory.
    json_path = opts.traced_json
    if not json_path:
        json_path = f"/Users/accusys/momentry/output_dev/{opts.file_uuid}.face_traced.json"

    if os.path.exists(json_path):
        update_embeddings(opts.file_uuid, json_path, opts.schema)
    else:
        print(f"File not found: {json_path}", file=sys.stderr)
        sys.exit(1)

View File

@@ -0,0 +1,170 @@
#!/opt/homebrew/bin/python3.11
"""
Verify Charade pipeline completion.
Usage: python3 scripts/verify_charade_pipeline.py <file_uuid>
"""
import sys
import psycopg2
import subprocess
import json
DATABASE_URL = "postgres://accusys@localhost:5432/momentry"
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
def check_file_outputs(file_uuid, output_dir=None):
    """Report which expected pipeline output files exist and how large they are.

    Args:
        file_uuid: UUID used as the file-name prefix (``<uuid>.<suffix>``).
        output_dir: Directory to look in. Defaults to the module-level
            ``OUTPUT_DIR`` when omitted.

    Returns:
        A list with one dict per expected file, in a fixed order:
        ``{"file": <suffix>, "exists": <bool>, "size": <bytes>}``.
        Files that are missing or unreadable are reported with size 0.
    """
    import os  # local import: the module's top-level imports do not include os

    base_dir = OUTPUT_DIR if output_dir is None else output_dir
    expected_files = [
        "cut.json",
        "yolo.json",
        "face.json",
        "face_traced.json",
        "pose.json",
        "asrx.json",
        "visual_chunk.json",
        "scene.json",
        "scene_meta.json",
        "story_llm.json",
        "story_story.json",
        "tmdb.json",
    ]
    results = []
    for ext in expected_files:
        path = os.path.join(base_dir, f"{file_uuid}.{ext}")
        try:
            # Portable replacement for the BSD-only `stat -f%z` subprocess.
            size = os.path.getsize(path)
        except OSError:
            results.append({"file": ext, "exists": False, "size": 0})
        else:
            results.append({"file": ext, "exists": True, "size": size})
    return results
def check_db_records(file_uuid, schema="dev"):
    """Run a fixed set of status and count queries for one file.

    Args:
        file_uuid: UUID of the file to inspect. It is bound as a query
            parameter, never interpolated into the SQL text.
        schema: Database schema to query. It is interpolated as an identifier,
            so it must come from a trusted caller.

    Returns:
        A list of dicts, one per check: ``{"check": name, "value": ...}`` on
        success or ``{"check": name, "error": message}`` when the query fails.
        ``value`` is None when no row matched. For queries that select several
        columns the whole row tuple is returned; otherwise the single column.
    """
    checks = [
        ("videos", f"SELECT status FROM {schema}.videos WHERE file_uuid = %s"),
        ("monitor_jobs", f"SELECT status, completed_processors FROM {schema}.monitor_jobs WHERE uuid = %s"),
        ("pre_chunks", f"SELECT COUNT(*) FROM {schema}.pre_chunks WHERE file_uuid = %s"),
        ("face_detections_total", f"SELECT COUNT(*) FROM {schema}.face_detections WHERE file_uuid = %s"),
        ("face_detections_embedding", f"SELECT COUNT(embedding) FROM {schema}.face_detections WHERE file_uuid = %s"),
        ("face_detections_trace", f"SELECT COUNT(trace_id) FROM {schema}.face_detections WHERE file_uuid = %s"),
        ("face_detections_identity", f"SELECT COUNT(identity_id) FROM {schema}.face_detections WHERE file_uuid = %s"),
        ("chunks_total", f"SELECT COUNT(*) FROM {schema}.chunk WHERE file_uuid = %s"),
        ("chunks_embedding", f"SELECT COUNT(embedding) FROM {schema}.chunk WHERE file_uuid = %s"),
    ]
    results = []
    conn = psycopg2.connect(DATABASE_URL)
    try:
        cur = conn.cursor()
        try:
            for name, query in checks:
                try:
                    cur.execute(query, (file_uuid,))
                    row = cur.fetchone()
                except psycopg2.Error as e:
                    # A failed statement aborts the transaction; roll back so
                    # the remaining checks can still run.
                    conn.rollback()
                    results.append({"check": name, "error": str(e)})
                    continue
                if row is None:
                    value = None
                elif len(row) == 1:
                    value = row[0]
                else:
                    # Keep every selected column (e.g. status plus
                    # completed_processors) instead of dropping all but one.
                    value = row
                results.append({"check": name, "value": value})
        finally:
            cur.close()
    finally:
        conn.close()
    return results
def check_identity_bindings(file_uuid, schema="dev"):
    """Count the faces bound to known identities for one file.

    Args:
        file_uuid: UUID of the file to inspect. It is bound as a query
            parameter, never interpolated into the SQL text.
        schema: Database schema to query. It is interpolated as an identifier,
            so it must come from a trusted caller.

    Returns:
        A list of dicts, one per check. ``audrey_faces`` and ``cary_faces``
        carry an integer count (0 when no row is returned). ``top_identities``
        carries a list of ``(name, count)`` rows for the ten most frequent
        identities. A failed query yields ``{"check": name, "error": message}``.
    """
    checks = [
        ("audrey_faces", f"""
            SELECT COUNT(*) FROM {schema}.face_detections fd
            JOIN {schema}.identities i ON fd.identity_id = i.id
            WHERE fd.file_uuid = %s AND i.name = 'Audrey Hepburn'
        """),
        ("cary_faces", f"""
            SELECT COUNT(*) FROM {schema}.face_detections fd
            JOIN {schema}.identities i ON fd.identity_id = i.id
            WHERE fd.file_uuid = %s AND i.name = 'Cary Grant'
        """),
        ("top_identities", f"""
            SELECT i.name, COUNT(*) as count
            FROM {schema}.face_detections fd
            JOIN {schema}.identities i ON fd.identity_id = i.id
            WHERE fd.file_uuid = %s AND fd.identity_id IS NOT NULL
            GROUP BY i.name
            ORDER BY count DESC
            LIMIT 10
        """),
    ]
    results = []
    conn = psycopg2.connect(DATABASE_URL)
    try:
        cur = conn.cursor()
        try:
            for name, query in checks:
                try:
                    cur.execute(query, (file_uuid,))
                    if name == "top_identities":
                        value = cur.fetchall()
                    else:
                        row = cur.fetchone()
                        value = row[0] if row else 0
                except psycopg2.Error as e:
                    # A failed statement aborts the transaction; roll back so
                    # the remaining checks can still run.
                    conn.rollback()
                    results.append({"check": name, "error": str(e)})
                    continue
                results.append({"check": name, "value": value})
        finally:
            cur.close()
    finally:
        conn.close()
    return results
def print_report(file_uuid, file_outputs, db_records, identity_bindings):
    """Print the verification report to stdout.

    Args:
        file_uuid: UUID shown in the report header.
        file_outputs: Dicts with ``file``, ``exists`` and ``size`` (bytes) keys.
        db_records: Dicts with a ``check`` key plus either ``value`` or
            ``error``. List or tuple values are printed comma-separated.
        identity_bindings: Dicts with a ``check`` key plus either ``value`` or
            ``error``. A list value is printed as one ``name: count`` line
            per row.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print("Charade Pipeline Verification Report")
    print(f"File UUID: {file_uuid}")
    print(f"{rule}\n")

    print("## File Outputs")
    for f in file_outputs:
        # A file only counts as OK when it exists and is non-empty. The two
        # markers must differ, otherwise the report cannot flag missing files.
        status = "[OK]" if f["exists"] and f["size"] > 0 else "[FAIL]"
        size_kb = f["size"] / 1024
        print(f" {status} {f['file']}: {size_kb:.1f} KB")

    print("\n## Database Records")
    for r in db_records:
        value = r.get("value", r.get("error", "N/A"))
        if isinstance(value, (list, tuple)):
            value = ", ".join(str(v) for v in value)
        print(f" {r['check']}: {value}")

    print("\n## Identity Bindings")
    for r in identity_bindings:
        value = r.get("value", r.get("error", "N/A"))
        if isinstance(value, list):
            print(f" {r['check']}:")
            for row in value:
                print(f" - {row[0]}: {row[1]} faces")
        else:
            print(f" {r['check']}: {value}")

    print(f"\n{rule}\n")
def main():
    """Verify one file's pipeline outputs and print the report.

    The file UUID is taken from the first command-line argument; a built-in
    default UUID is used when none is given.
    """
    file_uuid = sys.argv[1] if len(sys.argv) >= 2 else "c3c635e3641da80dde10cc555ffcdda5"

    print("Verifying pipeline...")
    # Arguments are evaluated left to right: files, then DB, then identities.
    print_report(
        file_uuid,
        check_file_outputs(file_uuid),
        check_db_records(file_uuid),
        check_identity_bindings(file_uuid),
    )


if __name__ == "__main__":
    main()

View File

@@ -1,445 +0,0 @@
#!/usr/bin/env python3
"""
視覺分片處理器 (Phase 2.2)
從 YOLO 結果生成視覺分片,支持多種分片策略:
1. 固定幀數分片
2. 基於物件相似度分片
3. 基於場景變化分片
"""
import json
import sys
import os
import argparse
from pathlib import Path
from typing import Dict, List, Any, Optional
import numpy as np
from datetime import datetime
# 添加父目錄到路徑以導入其他模組
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.yolo_processor_contract_v1 import YOLOProcessor
class VisualChunkProcessor:
    """Group per-frame YOLO detections into visual chunks.

    Two strategies are offered: chunks of a fixed number of frames, and chunks
    that are split wherever the set of detected classes changes too much
    between adjacent frames.
    """

    def __init__(self, video_path: str, yolo_result_path: Optional[str] = None):
        """Store the video path and the optional path of a YOLO result JSON."""
        self.video_path = video_path
        self.yolo_result_path = yolo_result_path
        self.yolo_result = None

    def load_yolo_result(self):
        """Fill ``self.yolo_result`` from the JSON file, or by running YOLO."""
        if self.yolo_result_path and os.path.exists(self.yolo_result_path):
            with open(self.yolo_result_path, "r", encoding="utf-8") as f:
                self.yolo_result = json.load(f)
        else:
            # No usable result file was given: run detection on the video.
            print(f"[VisualChunk] Running YOLO detection for: {self.video_path}")
            yolo_processor = YOLOProcessor(self.video_path)
            yolo_processor.process()
            self.yolo_result = yolo_processor.to_json_dict()

    def _sorted_frames(self) -> List[Dict[str, Any]]:
        """Return the YOLO frames sorted by frame number, loading them if needed."""
        if not self.yolo_result:
            self.load_yolo_result()

        frames = self.yolo_result.get("frames", {})
        frame_list = [
            {
                "frame_number": int(frame_key),
                "timestamp": frame_data.get("time_seconds", 0),
                "objects": frame_data.get("detections", []),
            }
            for frame_key, frame_data in frames.items()
        ]
        frame_list.sort(key=lambda fr: fr["frame_number"])
        return frame_list

    @staticmethod
    def _chunk_metadata(chunk_stats: Dict[str, Any]) -> Dict[str, Any]:
        """Build the ``metadata`` sub-dict of a chunk from its statistics."""
        return {
            "object_count": chunk_stats["total_objects"],
            "unique_classes": chunk_stats["unique_classes"],
            "max_confidence": chunk_stats["max_confidence"],
            "avg_confidence": chunk_stats["avg_confidence"],
            "spatial_density": chunk_stats["spatial_density"],
        }

    def create_fixed_frame_chunks(
        self, frames_per_chunk: int = 30
    ) -> List[Dict[str, Any]]:
        """Create chunks that each hold a fixed number of frames.

        Args:
            frames_per_chunk: Number of frames per chunk; must be at least 1.

        Returns:
            The list of visual chunks (the last one may be shorter).

        Raises:
            ValueError: If ``frames_per_chunk`` is smaller than 1 and there
                are frames to split.
        """
        frame_list = self._sorted_frames()
        if not frame_list:
            return []
        if frames_per_chunk < 1:
            raise ValueError(
                f"frames_per_chunk must be >= 1, got {frames_per_chunk}"
            )

        chunks = []
        for start_idx in range(0, len(frame_list), frames_per_chunk):
            chunk_frames = frame_list[start_idx:start_idx + frames_per_chunk]
            chunk_stats = self._calculate_chunk_stats(chunk_frames)
            chunks.append(
                {
                    "start_frame": chunk_frames[0]["frame_number"],
                    "end_frame": chunk_frames[-1]["frame_number"] + 1,  # exclusive
                    "frame_count": len(chunk_frames),
                    "keyframe_objects": self._extract_keyframe_objects(chunk_frames),
                    "dominant_objects": chunk_stats["dominant_objects"],
                    "metadata": self._chunk_metadata(chunk_stats),
                }
            )
        return chunks

    def create_similarity_based_chunks(
        self, similarity_threshold: float = 0.5, min_frames_per_chunk: int = 10
    ) -> List[Dict[str, Any]]:
        """Create chunks split where adjacent frames stop being similar.

        Args:
            similarity_threshold: Similarity (0-1) at or above which a frame
                joins the current chunk.
            min_frames_per_chunk: Minimum number of frames for a run to be
                emitted; shorter runs are discarded.

        Returns:
            The list of visual chunks.
        """
        frame_list = self._sorted_frames()
        if not frame_list:
            return []

        chunks: List[Dict[str, Any]] = []
        run: List[Dict[str, Any]] = []
        for frame in frame_list:
            if run and (
                self._calculate_frame_similarity(run[-1], frame)
                >= similarity_threshold
            ):
                # Similar enough: extend the current run.
                run.append(frame)
                continue
            # First frame, or a dissimilar one: close the run and start anew.
            self._append_run(chunks, run, min_frames_per_chunk)
            run = [frame]

        # Handle the trailing run.
        self._append_run(chunks, run, min_frames_per_chunk)
        return chunks

    def _append_run(
        self, chunks: List[Dict[str, Any]], run: List[Dict], min_frames: int
    ) -> None:
        """Append a chunk for ``run`` to ``chunks`` if the run is long enough."""
        if run and len(run) >= min_frames:
            chunks.append(
                self._create_chunk_from_frames(
                    run,
                    run[0]["frame_number"],
                    run[-1]["frame_number"] + 1,
                )
            )

    def _calculate_frame_similarity(self, frame1: Dict, frame2: Dict) -> float:
        """Return the Jaccard similarity of the class names in two frames.

        Two frames without objects are fully similar (1.0); if only one of
        them has objects the similarity is 0.0.
        """
        objects1 = frame1.get("objects", [])
        objects2 = frame2.get("objects", [])

        if not objects1 and not objects2:
            return 1.0
        if not objects1 or not objects2:
            return 0.0

        classes1 = {
            obj.get("class_name", "") for obj in objects1 if obj.get("class_name")
        }
        classes2 = {
            obj.get("class_name", "") for obj in objects2 if obj.get("class_name")
        }

        union = classes1 | classes2
        if not union:
            return 0.0
        return len(classes1 & classes2) / len(union)

    def _calculate_chunk_stats(self, frames: List[Dict]) -> Dict[str, Any]:
        """Compute object statistics over the frames of one chunk."""
        all_objects = []
        for frame in frames:
            all_objects.extend(frame.get("objects", []))

        total_objects = len(all_objects)

        # Sorted so the output is deterministic across runs.
        unique_classes = sorted(
            {
                obj.get("class_name", "")
                for obj in all_objects
                if obj.get("class_name")
            }
        )

        confidences = [obj.get("confidence", 0) for obj in all_objects]
        max_confidence = max(confidences) if confidences else 0
        avg_confidence = np.mean(confidences) if confidences else 0

        # Spatial density: average number of objects per frame.
        spatial_density = total_objects / len(frames) if frames else 0

        # Dominant objects: classes present in more than half of the frames.
        object_counts: Dict[str, int] = {}
        for frame in frames:
            frame_classes = {
                obj.get("class_name", "")
                for obj in frame.get("objects", [])
                if obj.get("class_name")
            }
            for class_name in frame_classes:
                object_counts[class_name] = object_counts.get(class_name, 0) + 1

        dominant_objects = sorted(
            class_name
            for class_name, count in object_counts.items()
            if count / len(frames) > 0.5
        )

        return {
            "total_objects": total_objects,
            "unique_classes": unique_classes,
            "max_confidence": float(max_confidence),
            "avg_confidence": float(avg_confidence),
            "spatial_density": float(spatial_density),
            "dominant_objects": dominant_objects,
        }

    def _extract_keyframe_objects(self, frames: List[Dict]) -> List[Dict[str, Any]]:
        """Return the objects of every fifth frame, starting with the first."""
        keyframe_objects = []
        # Simplification: every 5th frame is treated as a keyframe.
        for frame in frames[::5]:
            objects = [
                {
                    "class_name": obj.get("class_name", ""),
                    "class_id": obj.get("class_id", 0),
                    "confidence": float(obj.get("confidence", 0)),
                    # A bbox is only emitted when the detection carries "x1".
                    "bbox": {
                        "x": obj.get("x1", 0),
                        "y": obj.get("y1", 0),
                        "width": obj.get("width", 0),
                        "height": obj.get("height", 0),
                    }
                    if "x1" in obj
                    else None,
                    "occurrence": 1,
                }
                for obj in frame.get("objects", [])
            ]
            keyframe_objects.append(
                {
                    "timestamp": float(frame.get("timestamp", 0)),
                    "frame_number": frame.get("frame_number", 0),
                    "objects": objects,
                }
            )
        return keyframe_objects

    def _create_chunk_from_frames(
        self, frames: List[Dict], start_frame: int, end_frame: int
    ) -> Dict[str, Any]:
        """Build one chunk dict from a list of frames; ``end_frame`` is exclusive."""
        chunk_stats = self._calculate_chunk_stats(frames)
        return {
            "start_frame": start_frame,
            "end_frame": end_frame,  # exclusive
            "frame_count": len(frames),
            "keyframe_objects": self._extract_keyframe_objects(frames),
            "dominant_objects": chunk_stats["dominant_objects"],
            "object_relationships": [],  # optional: relationship detection may be added later
            "scene_description": None,  # optional: LLM scene description may be added later
            "metadata": self._chunk_metadata(chunk_stats),
        }

    def process(self, strategy: str = "fixed", **kwargs) -> Dict[str, Any]:
        """Generate visual chunks with the chosen strategy.

        Args:
            strategy: Chunking strategy, "fixed" or "similarity".
            **kwargs: Strategy parameters: ``frames_per_chunk`` for "fixed";
                ``similarity_threshold`` and ``min_frames_per_chunk`` for
                "similarity".

        Returns:
            A dict with run metadata, summary totals and the ``chunks`` list.

        Raises:
            ValueError: If ``strategy`` is not a known strategy.
        """
        if not self.yolo_result:
            self.load_yolo_result()

        start_time = datetime.now()

        if strategy == "fixed":
            chunks = self.create_fixed_frame_chunks(
                kwargs.get("frames_per_chunk", 30)
            )
        elif strategy == "similarity":
            chunks = self.create_similarity_based_chunks(
                kwargs.get("similarity_threshold", 0.5),
                kwargs.get("min_frames_per_chunk", 10),
            )
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

        # Totals across all chunks.
        total_frames = sum(chunk["frame_count"] for chunk in chunks)
        total_objects = sum(chunk["metadata"]["object_count"] for chunk in chunks)

        all_unique_classes = set()
        for chunk in chunks:
            all_unique_classes.update(chunk["metadata"]["unique_classes"])

        processing_time = (datetime.now() - start_time).total_seconds()

        return {
            "metadata": {
                "video_path": self.video_path,
                "processing_time": processing_time,
                "strategy": strategy,
                "parameters": kwargs,
                "processed_at": datetime.now().isoformat(),
            },
            "chunk_count": len(chunks),
            "total_frames": total_frames,
            "total_objects": total_objects,
            "unique_classes": len(all_unique_classes),
            "chunks": chunks,
        }
def main():
    """Command-line entry point: build visual chunks and write them as JSON.

    On any processing error an empty result is written to the output path so
    that an output file always exists.
    """
    parser = argparse.ArgumentParser(description="視覺分片處理器")
    parser.add_argument("video_path", help="視頻文件路徑")
    parser.add_argument("output_path", help="輸出文件路徑")
    parser.add_argument("--yolo-result", help="YOLO 結果文件路徑(可選)")
    parser.add_argument("--uuid", help="檔案 UUID由 executor 傳入)")
    parser.add_argument(
        "--strategy", choices=["fixed", "similarity"], default="fixed", help="分片策略"
    )
    parser.add_argument(
        "--frames-per-chunk", type=int, default=30, help="固定幀數策略:每個分片的幀數"
    )
    parser.add_argument(
        "--similarity-threshold", type=float, default=0.5, help="相似度策略:相似度閾值"
    )
    parser.add_argument(
        "--min-frames-per-chunk", type=int, default=10, help="相似度策略:最小幀數"
    )
    args = parser.parse_args()

    def save(payload):
        # Write the result (or the fallback) to the requested output path.
        with open(args.output_path, "w", encoding="utf-8") as out:
            json.dump(payload, out, ensure_ascii=False, indent=2)

    print(f"[VisualChunk] Starting processing: {args.video_path}")
    print(f"[VisualChunk] Strategy: {args.strategy}")

    # Only the parameters that belong to the chosen strategy are passed on.
    if args.strategy == "fixed":
        params = {"frames_per_chunk": args.frames_per_chunk}
    else:
        params = {
            "similarity_threshold": args.similarity_threshold,
            "min_frames_per_chunk": args.min_frames_per_chunk,
        }

    try:
        processor = VisualChunkProcessor(args.video_path, args.yolo_result)
        result = processor.process(strategy=args.strategy, **params)

        # Save the result.
        save(result)

        print("[VisualChunk] Processing completed")
        print(f"[VisualChunk] Generated {result['chunk_count']} visual chunks")
        print(f"[VisualChunk] Total frames: {result['total_frames']}")
        print(f"[VisualChunk] Total objects: {result['total_objects']}")
        print(f"[VisualChunk] Unique classes: {result['unique_classes']}")
        print(f"[VisualChunk] Result saved to: {args.output_path}")
    except Exception as e:
        print(f"[VisualChunk] Error: {e}", file=sys.stderr)
        save(
            {
                "chunk_count": 0,
                "total_frames": 0,
                "total_objects": 0,
                "unique_classes": 0,
                "chunks": [],
            }
        )
        print(f"[VisualChunk] Fallback: empty result saved to {args.output_path}")


if __name__ == "__main__":
    main()

21
scripts/wrapper_embedding.sh Executable file
View File

@@ -0,0 +1,21 @@
#!/bin/bash
# Launch the embedding server script on port 11436 with the project environment.
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
# Source environment (silently). allexport (set -a) exports every variable the
# files assign, so plain KEY=value lines also reach the exec'd process.
set -a
source "$PROJECT_DIR/.env" 2>/dev/null || true
source "$PROJECT_DIR/.env.development" 2>/dev/null || true
set +a
# Ensure PATH is set
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"
# Find python: prefer the configured or venv interpreter, and fall back to the
# Homebrew one when that path is not an executable file.
PYTHON="${MOMENTRY_PYTHON_PATH:-$PROJECT_DIR/venv/bin/python}"
if [ ! -x "$PYTHON" ]; then
    PYTHON="/opt/homebrew/bin/python3.11"
fi
EMBED_SCRIPT="$PROJECT_DIR/scripts/embeddinggemma_server.py"
exec "$PYTHON" "$EMBED_SCRIPT" --port 11436

14
scripts/wrapper_playground.sh Executable file
View File

@@ -0,0 +1,14 @@
#!/bin/bash
# Launch the debug playground server with the project environment.
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
# Source environment (silently). allexport (set -a) exports every variable the
# files assign, so plain KEY=value lines also reach the exec'd process.
set -a
source "$PROJECT_DIR/.env" 2>/dev/null || true
source "$PROJECT_DIR/.env.development" 2>/dev/null || true
set +a
# Ensure PATH is set
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"
exec "$PROJECT_DIR/target/debug/momentry_playground" server --host 0.0.0.0

13
scripts/wrapper_production.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/bin/bash
# Launch the release server binary with the project environment.
set -e
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
# Source environment (silently). allexport (set -a) exports every variable the
# file assigns, so plain KEY=value lines also reach the exec'd process.
set -a
source "$PROJECT_DIR/.env" 2>/dev/null || true
set +a
# Ensure PATH is set
export PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"
exec "$PROJECT_DIR/target/release/momentry" server --host 0.0.0.0