feat: ASRX hybrid pipeline, identity history, worker fixes, checkpoint system
This commit is contained in:
Binary file not shown.
@@ -1,124 +1,320 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
ASRX Processor - Speaker Diarization
|
||||
Uses whisperx for speaker diarization (local model)
|
||||
ASRX Processor - Hybrid Pipeline Wrapper
|
||||
|
||||
Pipeline:
|
||||
1. ffprobe → select best audio track → ffmpeg → 16kHz mono WAV
|
||||
2. SelfASRXFixed.process() (7-step hybrid speaker diarization)
|
||||
3. Convert to Rust-expected format
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(
|
||||
0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "asrx_self")
|
||||
)
|
||||
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def process_asrx(video_path: str, output_path: str, uuid: str = ""):
|
||||
"""Process video for speaker diarization using whisperx"""
|
||||
def probe_audio_tracks(video_path: str) -> list:
    """List every audio stream of *video_path* using ffprobe.

    Args:
        video_path: Path to the media file to inspect.

    Returns:
        One dict per audio stream with the keys ``index``, ``codec``,
        ``language`` (``"und"`` when the stream carries no language tag),
        ``channels`` and ``sample_rate``. An empty list is returned when
        ffprobe cannot be started, times out, or prints something that is
        not a JSON object.
    """
    cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-select_streams", "a", video_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        data = json.loads(result.stdout)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: ffprobe missing / not executable; SubprocessError: timeout;
        # ValueError: empty or malformed JSON (JSONDecodeError is a subclass).
        print(f"[ASRX] ffprobe failed: {e}", file=sys.stderr)
        return []

    if not isinstance(data, dict):
        print("[ASRX] ffprobe failed: unexpected output", file=sys.stderr)
        return []

    return [
        {
            "index": stream.get("index"),
            "codec": stream.get("codec_name"),
            # "tags" may be absent or null; both mean "untagged".
            "language": (stream.get("tags") or {}).get("language", "und"),
            "channels": stream.get("channels", 0),
            "sample_rate": stream.get("sample_rate", "0"),
        }
        for stream in data.get("streams", [])
    ]
|
||||
|
||||
|
||||
def select_best_track(tracks: list) -> int:
    """Return the position in *tracks* of the preferred audio track.

    Preference order: the first track tagged English (``eng`` / ``en``,
    compared case-insensitively), otherwise the track with the most
    channels (the earliest one wins ties), otherwise ``0``.

    Args:
        tracks: Track dicts as produced by ``probe_audio_tracks``. Missing
            or null ``language`` / ``channels`` entries are tolerated.

    Returns:
        An index into *tracks*; ``0`` when *tracks* is empty.
    """
    if not tracks:
        return 0

    for i, t in enumerate(tracks):
        if str(t.get("language") or "").lower() in ("eng", "en"):
            return i

    def channel_count(i: int) -> int:
        return tracks[i].get("channels") or 0

    # max() keeps the first maximal element, matching a strict ">" scan.
    return max(range(len(tracks)), key=channel_count)
|
||||
|
||||
|
||||
def extract_audio_to_wav(video_path: str, track_index: int, output_wav: str) -> bool:
    """Extract one stream of *video_path* into a 16 kHz mono 16-bit WAV file.

    Args:
        video_path: Input media file.
        track_index: Stream index within the input, used as ``-map 0:<index>``.
        output_wav: Destination path; an existing file is overwritten (``-y``).

    Returns:
        ``True`` when ffmpeg exits successfully, ``False`` when it cannot be
        started, exits non-zero, or exceeds the 300 second timeout.
    """
    cmd = [
        "ffmpeg", "-y", "-v", "quiet",
        "-i", video_path,
        "-map", f"0:{track_index}",
        "-ar", "16000",
        "-ac", "1",
        "-sample_fmt", "s16",
        output_wav,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True, timeout=300)
    except (OSError, subprocess.SubprocessError, ValueError) as e:
        # OSError: ffmpeg missing; SubprocessError: non-zero exit or timeout;
        # ValueError: an argument the OS cannot accept (e.g. embedded NUL).
        print(f"[ASRX] ffmpeg extraction failed: {e}", file=sys.stderr)
        return False
    return True
|
||||
|
||||
|
||||
def _cleanup(tmp_dir):
|
||||
if tmp_dir and os.path.exists(tmp_dir):
|
||||
import shutil
|
||||
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
def _atomic_write(path: str, data: dict):
|
||||
tmp = path + ".tmp"
|
||||
with open(tmp, "w") as f:
|
||||
json.dump(data, f, indent=2)
|
||||
os.rename(tmp, path)
|
||||
|
||||
|
||||
def _shared_audio_setup(video_path):
    """Extract the preferred audio track into a fresh temp dir.

    Returns:
        ``(tmp_dir, audio_path)``. ``audio_path`` is the extracted
        ``audio.wav`` inside ``tmp_dir``; when extraction fails it is
        *video_path* itself (``tmp_dir`` is still returned so the caller
        can clean it up).
    """
    tracks = probe_audio_tracks(video_path)
    if tracks:
        # Translate the list position into the container's stream index.
        stream_index = tracks[select_best_track(tracks)]["index"]
    else:
        stream_index = 0

    tmp_dir = tempfile.mkdtemp(prefix="asrx_")
    wav_path = os.path.join(tmp_dir, "audio.wav")

    extracted = extract_audio_to_wav(video_path, stream_index, wav_path)
    if not extracted:
        print("[ASRX] Audio extraction failed, falling back to original file",
              file=sys.stderr)
        return tmp_dir, video_path
    return tmp_dir, wav_path
|
||||
|
||||
|
||||
def _convert_result(result, output_path):
|
||||
"""Stage 3: 將 SelfASRXFixed result 轉為 Rust-expected format"""
|
||||
fps = 30.0
|
||||
base_name = os.path.basename(output_path)
|
||||
uuid_part = base_name.split(".")[0]
|
||||
probe_path = os.path.join(os.path.dirname(output_path),
|
||||
f"{uuid_part}.probe.json")
|
||||
if os.path.exists(probe_path):
|
||||
try:
|
||||
with open(probe_path) as pf:
|
||||
probe_data = json.load(pf)
|
||||
if "fps" in probe_data:
|
||||
fps = float(probe_data["fps"])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
output_result = {
|
||||
"language": result.get("language"),
|
||||
"segments": [],
|
||||
"n_speakers": result.get("n_speakers", 0),
|
||||
"speaker_stats": result.get("speaker_stats", {}),
|
||||
}
|
||||
|
||||
for seg in result.get("segments", []):
|
||||
start_sec = seg["start"]
|
||||
end_sec = seg["end"]
|
||||
output_result["segments"].append({
|
||||
"start_time": start_sec,
|
||||
"end_time": end_sec,
|
||||
"start_frame": int(start_sec * fps),
|
||||
"end_frame": int(end_sec * fps),
|
||||
"text": seg.get("text", ""),
|
||||
"speaker_id": seg.get("speaker_id", seg.get("speaker", "")),
|
||||
"language": seg.get("language", ""),
|
||||
"lang_prob": seg.get("lang_prob", 0.0),
|
||||
"quality": seg.get("quality", 0.0),
|
||||
})
|
||||
|
||||
if "references" in result:
|
||||
output_result["references"] = result["references"]
|
||||
|
||||
return output_result
|
||||
|
||||
|
||||
def _write_empty_result(publisher, output_path, error_message):
    """Publish *error_message*, store an empty result at *output_path*, return it."""
    if publisher:
        publisher.error("asrx", error_message)
    output_result = {"language": None, "segments": []}
    _atomic_write(output_path, output_result)
    if publisher:
        publisher.complete("asrx", "0 segments")
    return output_result


def _write_final_result(publisher, result, output_path):
    """Convert *result* (Stage 3), store it at *output_path*, publish completion."""
    output_result = _convert_result(result, output_path)

    if publisher:
        publisher.info("asrx",
                       f"ASRX_COMPLETE:{len(output_result['segments'])}")

    _atomic_write(output_path, output_result)

    if publisher:
        publisher.complete(
            "asrx", f"{len(output_result['segments'])} segments")

    print(f"[ASRX] Saved {len(output_result['segments'])} segments "
          f"to {output_path}", file=sys.stderr)
    return output_result


def process_asrx(video_path: str, output_path: str, uuid: str = "",
                 file_uuid: str = "", resume: bool = False):
    """Run the hybrid ASRX pipeline on *video_path* and write JSON to *output_path*.

    Two modes:

    * Resume (``resume=True`` and ``<output_path>.stage1.json`` exists):
      audio is re-extracted and ``SelfASRXFixed.resume_from_checkpoint`` is
      called; the checkpoint file is removed after a successful run.
    * Full run: audio is extracted and ``SelfASRXFixed.process`` is called
      with the checkpoint path so a later resume is possible.

    Any failure is reported through the publisher (when one exists) and an
    empty result ``{"language": None, "segments": []}`` is written instead
    of raising.

    Args:
        video_path: Input video/audio file.
        output_path: Destination JSON path.
        uuid: When non-empty, progress is published via ``RedisPublisher``.
        file_uuid: Forwarded to ``SelfASRXFixed.process`` (``None`` if empty).
        resume: Resume from an existing checkpoint instead of a full run.

    Returns:
        The dict that was written to *output_path*.
    """
    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    checkpoint_path = output_path + ".stage1.json"

    # ── Phase 2: Resume from checkpoint (Steps 4-7 only) ──
    if resume and os.path.exists(checkpoint_path):
        print(f"[ASRX] Found checkpoint, resuming from Step 4...")
        tmp_dir, audio_input = _shared_audio_setup(video_path)
        try:
            from asrx_self.main_fixed import SelfASRXFixed
            asrx = SelfASRXFixed()

            result = asrx.resume_from_checkpoint(
                checkpoint_path, audio_input, output_path=output_path,
            )

            if "error" in result:
                output_result = _write_empty_result(
                    publisher, output_path, result["error"])
                _cleanup(tmp_dir)
                return output_result

            output_result = _write_final_result(publisher, result, output_path)

            # Remove the checkpoint once the run has completed.
            try:
                os.remove(checkpoint_path)
                print(f"[ASRX] Removed checkpoint: {checkpoint_path}")
            except Exception:
                pass  # Best effort: a stale checkpoint is harmless here.

            _cleanup(tmp_dir)
            return output_result
        except Exception as e:
            import traceback
            traceback.print_exc()
            output_result = _write_empty_result(publisher, output_path, str(e))
            _cleanup(tmp_dir)
            return output_result

    # ── Phase 1: Full 7-step pipeline ──
    tmp_dir = None

    try:
        # Stage 1: Audio Track Preprocessing
        tmp_dir, audio_input = _shared_audio_setup(video_path)

        # Stage 2: SelfASRXFixed 7-step pipeline
        from asrx_self.main_fixed import SelfASRXFixed

        if publisher:
            publisher.info("asrx", "ASRX_LOADING_MODEL")

        asrx = SelfASRXFixed()

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        result = asrx.process(
            audio_input,
            output_path=None,
            file_uuid=file_uuid or None,
            max_speakers=10,
            quality_threshold=0.85,
            checkpoint_path=checkpoint_path,
        )

        if "error" in result:
            output_result = _write_empty_result(
                publisher, output_path, result["error"])
            _cleanup(tmp_dir)
            return output_result

        # Stage 3: Convert to Rust-expected format
        output_result = _write_final_result(publisher, result, output_path)

        _cleanup(tmp_dir)
        return output_result

    except Exception as e:
        import traceback
        traceback.print_exc()

        output_result = _write_empty_result(publisher, output_path, str(e))

        # If a checkpoint already exists (crash after Step 3 finished), keep
        # the WAV for a later resume.
        # NOTE(review): the resume branch above extracts audio into a new
        # temp dir, so the retained directory does not appear to be reused
        # — confirm whether it should be cleaned up instead.
        if not os.path.exists(checkpoint_path):
            _cleanup(tmp_dir)
        else:
            print(f"[ASRX] Checkpoint saved, keeping temp dir for resume: {tmp_dir}")
        return output_result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse arguments, run the pipeline once and
    # print a short summary of the result.
    parser = argparse.ArgumentParser(description="ASRX Processor (Hybrid Pipeline)")
    parser.add_argument("video_path", help="Path to video/audio file")
    parser.add_argument("output_path", help="Path to output JSON file")
    # "-u" is kept as an alias so existing invocations keep working.
    parser.add_argument("--uuid", "-u", help="UUID for Redis publishing", default="")
    parser.add_argument("--file-uuid", help="File UUID for Qdrant storage", default="")
    parser.add_argument("--resume", action="store_true",
                        help="Resume from checkpoint (skip Steps 1-3)")

    args = parser.parse_args()

    # The input file is only mandatory for a fresh run.
    if not args.resume and not Path(args.video_path).exists():
        print(f"Error: Video file not found: {args.video_path}")
        sys.exit(1)

    result = process_asrx(args.video_path, args.output_path, args.uuid,
                          args.file_uuid, resume=args.resume)

    print("\n[Summary]")
    print(f" Total segments: {len(result.get('segments', []))}")
    if "speaker_stats" in result:
        print(f" Detected speakers: {len(result['speaker_stats'])}")
        for speaker, stats in result["speaker_stats"].items():
            print(f" {speaker}: {stats['count']} segments")
|
||||
|
||||
@@ -1,584 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
ASRX Processor - AI-Driven Processor Contract Version 1.0
|
||||
|
||||
Compliant with AI-Driven Processor Contract v1.0
|
||||
Effective Date: 2025-03-27
|
||||
|
||||
Features:
|
||||
1. Standardized command-line interface
|
||||
2. Redis progress reporting
|
||||
3. Signal handling (SIGTERM, SIGINT)
|
||||
4. Health check mode
|
||||
5. Resource monitoring
|
||||
6. Contract-compliant JSON output
|
||||
7. Unified configuration
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
import signal
|
||||
import time
|
||||
import subprocess
|
||||
import traceback
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
# Redis Publisher for progress reporting
|
||||
# Make sibling modules importable, then try the optional Redis publisher.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
try:
    from redis_publisher import RedisPublisher
except ImportError:
    # Progress reporting is optional; processing continues without it.
    REDIS_AVAILABLE = False
    print(
        "WARNING: RedisPublisher not available, progress reporting disabled",
        file=sys.stderr,
    )
else:
    REDIS_AVAILABLE = True

# Identification reported in health checks and result documents.
CONTRACT_VERSION = "1.0"
PROCESSOR_NAME = (
    "/Users/accusys/momentry_core_0.1/scripts/asrx_processor_contract_v1.py"
)
PROCESSOR_VERSION = "1.0.0"
MODEL_NAME = "pyannote"
MODEL_VERSION = "3.1"

# Defaults shared by process_asrx() and the command-line parser.
DEFAULT_TIMEOUT = 7200  # seconds (2 hours) for speaker diarization
DEFAULT_MODEL_SIZE = "base"
DEFAULT_DEVICE = "cpu"
DEFAULT_LANGUAGE = "auto"
DEFAULT_BATCH_SIZE = 16
DEFAULT_DIARIZATION = True
DEFAULT_MIN_SPEAKERS = 1
DEFAULT_MAX_SPEAKERS = 10
|
||||
|
||||
|
||||
# Signal handling with timeout support
|
||||
class SignalHandler:
    """Record SIGTERM / SIGINT so the caller can shut down gracefully.

    Constructing an instance installs ``handle_signal`` for both signals.
    The handler only sets flags; callers are expected to poll
    ``should_stop()``.
    """

    def __init__(self):
        self.should_exit = False
        self.exit_code = 0
        for signo in (signal.SIGTERM, signal.SIGINT):
            signal.signal(signo, self.handle_signal)

    def handle_signal(self, signum, frame):
        """Note that termination was requested and derive the exit code."""
        print(f"\n收到信号 {signum},正在优雅关闭...")
        self.should_exit, self.exit_code = True, 128 + signum

    def should_stop(self):
        """Return ``True`` once a termination signal has been received."""
        return self.should_exit
|
||||
|
||||
|
||||
# Timeout manager
|
||||
class TimeoutManager:
    """Track how much of a processing time budget has been used.

    Elapsed time is measured with a monotonic clock so that wall-clock
    adjustments (NTP, DST, manual changes) cannot trigger or delay a timeout.
    """

    def __init__(self, timeout_seconds: int):
        self.timeout_seconds = timeout_seconds
        # Wall-clock creation time, kept for callers that read it.
        self.start_time = time.time()
        self._started = time.monotonic()
        self.timer = None

    def _elapsed(self) -> float:
        """Return the seconds elapsed since construction."""
        return time.monotonic() - self._started

    def check_timeout(self) -> bool:
        """Return ``True`` when more than ``timeout_seconds`` have elapsed."""
        return self._elapsed() > self.timeout_seconds

    def get_remaining_time(self) -> float:
        """Return the remaining budget in seconds, never below zero."""
        return max(0, self.timeout_seconds - self._elapsed())

    def format_remaining_time(self) -> str:
        """Return the remaining budget formatted as ``HH:MM:SS``."""
        minutes, seconds = divmod(int(self.get_remaining_time()), 60)
        hours, minutes = divmod(minutes, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
|
||||
|
||||
|
||||
# Health check functions
|
||||
def check_environment() -> Dict[str, Any]:
    """Probe runtime dependencies and return a health report.

    Returns:
        A dict holding a timestamp, the processor / contract / model
        identification constants, and ``checks``: one
        ``{"name", "status", "version"}`` entry each for whisperx, ffprobe,
        redis, python and cuda, in that order.
    """
    checks = []

    def record(name, status, version=None):
        checks.append({"name": name, "status": status, "version": version})

    # 1. whisperx (required for transcription / diarization)
    try:
        import whisperx

        record("whisperx", "available",
               getattr(whisperx, "__version__", "unknown"))
    except ImportError:
        record("whisperx", "missing")

    # 2. ffprobe
    try:
        ffprobe_result = subprocess.run(
            ["ffprobe", "-version"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        if ffprobe_result.returncode != 0:
            record("ffprobe", "error")
        else:
            # The first output line carries the version banner.
            record("ffprobe", "available",
                   ffprobe_result.stdout.split("\n")[0])
    except (subprocess.TimeoutExpired, FileNotFoundError):
        record("ffprobe", "missing")

    # 3. Redis progress reporting (optional)
    record("redis", "available" if REDIS_AVAILABLE else "optional")

    # 4. Python interpreter version
    v = sys.version_info
    record("python", "available", f"{v.major}.{v.minor}.{v.micro}")

    # 5. CUDA / GPU (optional)
    try:
        import torch

        cuda_available = torch.cuda.is_available()
        if cuda_available:
            record("cuda", "available", torch.version.cuda)
        else:
            record("cuda", "optional")
    except ImportError:
        record("cuda", "optional")

    return {
        "timestamp": datetime.now().isoformat(),
        "processor_name": PROCESSOR_NAME,
        "processor_version": PROCESSOR_VERSION,
        "contract_version": CONTRACT_VERSION,
        "model_name": MODEL_NAME,
        "model_version": MODEL_VERSION,
        "checks": checks,
    }
|
||||
|
||||
|
||||
def check_video_file(video_path: str) -> Dict[str, Any]:
    """Inspect *video_path* with ffprobe and summarise what it reports.

    Returns:
        On success ``{"valid": True, "video_info", "format_info", "exists",
        "file_size"}``, where ``video_info`` describes the first video
        stream (empty when there is none). On any failure
        ``{"valid": False, "error": <message>}``.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=codec_name,width,height,duration,r_frame_rate",
        "-show_entries", "format=duration,size",
        "-of", "json",
        video_path,
    ]
    try:
        probe = subprocess.run(
            probe_cmd, capture_output=True, text=True, timeout=10
        )
        if probe.returncode != 0:
            # Keep error payloads short: first 200 characters of stderr.
            message = probe.stderr[:200] if probe.stderr else "Unknown error"
            return {"valid": False, "error": message}

        info = json.loads(probe.stdout)

        video_info = {}
        if "streams" in info and len(info["streams"]) > 0:
            first = info["streams"][0]
            video_info = {
                "codec": first.get("codec_name", "unknown"),
                "width": int(first.get("width", 0)),
                "height": int(first.get("height", 0)),
                "duration": float(first.get("duration", 0)),
                "frame_rate": first.get("r_frame_rate", "0/0"),
            }

        format_info = {}
        if "format" in info:
            container = info["format"]
            format_info = {
                "format_duration": float(container.get("duration", 0)),
                "file_size": int(container.get("size", 0)),
            }

        exists = os.path.exists(video_path)
        return {
            "valid": True,
            "video_info": video_info,
            "format_info": format_info,
            "exists": exists,
            "file_size": os.path.getsize(video_path) if exists else 0,
        }
    except Exception as e:
        return {"valid": False, "error": str(e)}
|
||||
|
||||
|
||||
# Main processing function
|
||||
def process_asrx(
    video_path: str,
    output_path: str,
    uuid: str = "",
    model_size: str = DEFAULT_MODEL_SIZE,
    device: str = DEFAULT_DEVICE,
    language: str = DEFAULT_LANGUAGE,
    batch_size: int = DEFAULT_BATCH_SIZE,
    diarization: bool = DEFAULT_DIARIZATION,
    min_speakers: int = DEFAULT_MIN_SPEAKERS,
    max_speakers: int = DEFAULT_MAX_SPEAKERS,
    timeout: int = DEFAULT_TIMEOUT,
) -> Dict[str, Any]:
    """Transcribe, align and optionally diarize *video_path* with whisperx.

    The timeout and the stop flag set by SIGTERM / SIGINT are checked before
    every stage, so a requested stop takes effect at the next stage
    boundary. A stage that is already running is not interrupted.

    Args:
        video_path: Input media file.
        output_path: Where the result document is written as JSON.
        uuid: When non-empty and Redis is available, progress is published.
        model_size: whisperx model name passed to ``load_model``.
        device: Compute device; ``"cpu"`` selects ``int8``, anything else
            ``float16``.
        language: Language code, or ``"auto"`` for detection.
        batch_size: Batch size passed to ``transcribe``.
        diarization: Whether to assign speakers to segments.
        min_speakers: Lower bound passed to the diarization model.
        max_speakers: Upper bound passed to the diarization model.
        timeout: Time budget in seconds.

    Returns:
        The result document. ``success`` is ``True`` only when processing
        finished and the document was saved; otherwise ``error`` explains
        the failure. Errors are reported in the document, not raised.
    """
    signal_handler = SignalHandler()
    timeout_manager = TimeoutManager(timeout)
    publisher = RedisPublisher(uuid) if REDIS_AVAILABLE and uuid else None

    def publish(stage: str, message: str, data: Dict = None):
        if publisher:
            publisher.info(PROCESSOR_NAME, stage, message, data)

    def ensure_can_continue():
        # The installed signal handler only sets a flag, so it has to be
        # polled; the same applies to the time budget.
        if timeout_manager.check_timeout():
            raise TimeoutError(f"超时 ({timeout} 秒)")
        if signal_handler.should_stop():
            raise KeyboardInterrupt("收到停止信号")

    publish("ASRX_START", f"开始处理: {os.path.basename(video_path)}")

    result = {
        "processor_name": PROCESSOR_NAME,
        "processor_version": PROCESSOR_VERSION,
        "contract_version": CONTRACT_VERSION,
        "model_name": MODEL_NAME,
        "model_version": MODEL_VERSION,
        "video_path": video_path,
        "output_path": output_path,
        "uuid": uuid,
        "timestamp": datetime.now().isoformat(),
        "parameters": {
            "model_size": model_size,
            "device": device,
            "language": language,
            "batch_size": batch_size,
            "diarization": diarization,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
            "timeout": timeout,
        },
        "success": False,
        "error": None,
        "segments": [],
        "speakers": [],
        "processing_time": 0,
        "resource_usage": {},
    }

    start_time = time.time()

    try:
        ensure_can_continue()

        # Check video file
        publish("ASRX_CHECK_VIDEO", "检查视频文件")
        video_check = check_video_file(video_path)
        if not video_check.get("valid", False):
            raise ValueError(f"无效的视频文件: {video_check.get('error', '未知错误')}")

        result["video_info"] = video_check.get("video_info", {})
        result["format_info"] = video_check.get("format_info", {})

        # Import whisperx
        ensure_can_continue()
        publish("ASRX_LOAD_MODEL", f"加载模型: {model_size}")
        try:
            import whisperx
        except ImportError as e:
            raise ImportError(f"whisperx 未安装: {e}") from e

        # Load model
        publish("ASRX_LOADING", f"加载 whisperx 模型 ({model_size}, {device})")
        model = whisperx.load_model(
            model_size,
            device=device,
            compute_type="int8" if device == "cpu" else "float16",
        )

        # Transcribe
        ensure_can_continue()
        publish("ASRX_TRANSCRIBING", "转录音频")
        transcript = model.transcribe(
            video_path,
            language=language if language != "auto" else None,
            batch_size=batch_size,
        )
        # Remember the language now: the aligned result that replaces
        # `transcript` below may no longer carry it.
        detected_language = transcript["language"]

        # Align timestamps
        ensure_can_continue()
        publish("ASRX_ALIGNING", "对齐时间戳")
        model_a, metadata = whisperx.load_align_model(
            language_code=detected_language
        )
        transcript = whisperx.align(
            transcript["segments"],
            model_a,
            metadata,
            video_path,
            device,
            return_char_alignments=False,
        )

        # Speaker diarization
        if diarization:
            ensure_can_continue()
            publish("ASRX_DIARIZATION", "说话人分离")
            diarize_model = whisperx.DiarizationPipeline(
                use_auth_token=None, device=device
            )
            diarize_segments = diarize_model(
                video_path,
                min_speakers=min_speakers,
                max_speakers=max_speakers,
            )
            transcript = whisperx.assign_word_speakers(diarize_segments, transcript)

            # Aggregate per-speaker statistics.
            speakers = {}
            for segment in transcript["segments"]:
                if "speaker" not in segment:
                    continue
                speaker_id = segment["speaker"]
                stats = speakers.setdefault(
                    speaker_id,
                    {
                        "id": speaker_id,
                        "segment_count": 0,
                        "total_words": 0,
                        "total_duration": 0.0,
                    },
                )
                stats["segment_count"] += 1
                stats["total_words"] += len(segment.get("text", "").split())
                stats["total_duration"] += segment.get("end", 0) - segment.get(
                    "start", 0
                )

            result["speakers"] = list(speakers.values())

        # Format segments
        ensure_can_continue()
        segments = [
            {
                "start": segment.get("start", 0.0),
                "end": segment.get("end", 0.0),
                "text": segment.get("text", ""),
                "speaker": segment.get("speaker", None),
                "words": segment.get("words", []),
                "confidence": segment.get("confidence", 0.0),
            }
            for segment in transcript.get("segments", [])
        ]

        result["segments"] = segments
        result["language"] = (
            transcript.get("language") or detected_language or "unknown"
        )
        result["success"] = True

        publish("ASRX_COMPLETE", f"完成: {len(segments)} 个片段")

    except TimeoutError as e:
        result["error"] = f"处理超时: {e}"
        publish("ASRX_TIMEOUT", f"超时: {e}")
    except KeyboardInterrupt:
        result["error"] = "处理被用户中断"
        publish("ASRX_INTERRUPTED", "处理被中断")
    except ImportError as e:
        result["error"] = f"依赖缺失: {e}"
        publish("ASRX_MISSING_DEPS", f"缺少依赖: {e}")
    except Exception as e:
        result["error"] = f"处理错误: {str(e)}"
        publish("ASRX_ERROR", f"错误: {str(e)}")
        traceback.print_exc()

    # Calculate processing time
    result["processing_time"] = time.time() - start_time

    # Add resource usage (psutil is optional)
    try:
        import psutil

        process = psutil.Process()
        memory_info = process.memory_info()
        cpu_times = process.cpu_times()
        result["resource_usage"] = {
            "cpu_percent": process.cpu_percent(),
            "memory_mb": memory_info.rss / (1024 * 1024),
            "user_time": cpu_times.user,
            "system_time": cpu_times.system,
        }
    except ImportError:
        result["resource_usage"] = {"error": "psutil not available"}

    # Save result
    try:
        with open(output_path, "w") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        publish("ASRX_SAVED", f"结果保存到: {output_path}")
    except Exception as e:
        # Without an output file the run did not succeed for the caller.
        result["success"] = False
        result["error"] = f"保存结果失败: {str(e)}"
        publish("ASRX_SAVE_ERROR", f"保存失败: {str(e)}")

    return result
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    Modes:
        ``--health-check``: print the environment report; needs no paths.
        ``--check-video``: print the ffprobe summary of ``video_path``.
        default: run ``process_asrx`` on ``video_path`` / ``output_path``.

    Returns:
        Process exit code: ``0`` on success, ``1`` on failure. Missing
        required paths end the process through ``parser.error`` (exit 2).
    """
    parser = argparse.ArgumentParser(
        description=f"{PROCESSOR_NAME.upper()} Processor v{PROCESSOR_VERSION} - Speaker Diarization"
    )
    # Positionals are optional at parse time so that --health-check can run
    # without dummy paths; they are validated per mode below.
    parser.add_argument("video_path", nargs="?", help="Path to input video file")
    parser.add_argument("output_path", nargs="?", help="Path to output JSON file")
    parser.add_argument("--uuid", help="UUID for progress tracking", default="")
    parser.add_argument(
        "--model-size",
        help=f"Model size (default: {DEFAULT_MODEL_SIZE})",
        default=DEFAULT_MODEL_SIZE,
        choices=["tiny", "base", "small", "medium", "large-v3"],
    )
    parser.add_argument(
        "--device",
        help=f"Device to use (default: {DEFAULT_DEVICE})",
        default=DEFAULT_DEVICE,
        choices=["cpu", "cuda"],
    )
    parser.add_argument(
        "--language",
        help=f"Language code or 'auto' (default: {DEFAULT_LANGUAGE})",
        default=DEFAULT_LANGUAGE,
    )
    parser.add_argument(
        "--batch-size",
        help=f"Batch size for processing (default: {DEFAULT_BATCH_SIZE})",
        type=int,
        default=DEFAULT_BATCH_SIZE,
    )
    parser.add_argument(
        "--no-diarization",
        help="Disable speaker diarization",
        action="store_true",
    )
    parser.add_argument(
        "--min-speakers",
        help=f"Minimum number of speakers (default: {DEFAULT_MIN_SPEAKERS})",
        type=int,
        default=DEFAULT_MIN_SPEAKERS,
    )
    parser.add_argument(
        "--max-speakers",
        help=f"Maximum number of speakers (default: {DEFAULT_MAX_SPEAKERS})",
        type=int,
        default=DEFAULT_MAX_SPEAKERS,
    )
    parser.add_argument(
        "--timeout",
        help=f"Timeout in seconds (default: {DEFAULT_TIMEOUT})",
        type=int,
        default=DEFAULT_TIMEOUT,
    )
    parser.add_argument(
        "--health-check",
        help="Run health check and exit",
        action="store_true",
    )
    parser.add_argument(
        "--check-video",
        help="Check video file and exit",
        action="store_true",
    )

    args = parser.parse_args()

    # Health check mode
    if args.health_check:
        health = check_environment()
        print(json.dumps(health, indent=2, ensure_ascii=False))
        healthy = all(
            c["status"] in ["available", "optional"] for c in health["checks"]
        )
        return 0 if healthy else 1

    if args.video_path is None:
        parser.error("the following arguments are required: video_path")

    # Video check mode
    if args.check_video:
        video_check = check_video_file(args.video_path)
        print(json.dumps(video_check, indent=2, ensure_ascii=False))
        return 0 if video_check.get("valid", False) else 1

    if args.output_path is None:
        parser.error("the following arguments are required: output_path")

    # Normal processing mode
    result = process_asrx(
        video_path=args.video_path,
        output_path=args.output_path,
        uuid=args.uuid,
        model_size=args.model_size,
        device=args.device,
        language=args.language,
        batch_size=args.batch_size,
        diarization=not args.no_diarization,
        min_speakers=args.min_speakers,
        max_speakers=args.max_speakers,
        timeout=args.timeout,
    )

    # Print result summary
    if result.get("success", False):
        print(f"✅ {PROCESSOR_NAME.upper()} 处理成功")
        print(f" 片段数: {len(result.get('segments', []))}")
        print(f" 说话人数: {len(result.get('speakers', []))}")
        print(f" 处理时间: {result.get('processing_time', 0):.1f} 秒")
        print(f" 输出文件: {args.output_path}")
        return 0

    print(f"❌ {PROCESSOR_NAME.upper()} 处理失败")
    print(f" 错误: {result.get('error', '未知错误')}")
    return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|
||||
@@ -1,328 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
ASRX Processor - Custom Implementation Wrapper
|
||||
Uses SpeechBrain ECAPA-TDNN (no HuggingFace token required)
|
||||
|
||||
Pipeline:
|
||||
1. Preprocess: ffprobe audio tracks → select best track → extract WAV
|
||||
2. Process: VAD (Silero) → Speaker embedding (ECAPA-TDNN) → Spectral clustering
|
||||
3. Output: segments with speaker_id
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.insert(
|
||||
0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "asrx_self")
|
||||
)
|
||||
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def probe_audio_tracks(video_path: str) -> list:
    """Return a summary of every audio stream ffprobe reports for *video_path*.

    Each entry is a dict with the keys ``index``, ``codec``, ``language``,
    ``channels`` and ``sample_rate``; stream fields that ffprobe does not
    report fall back to ``None`` / ``"und"`` / ``0`` / ``"0"``.

    Any failure (ffprobe missing, the 30 second timeout, unparsable output)
    is reported on stdout and produces an empty list.
    """
    ffprobe_cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-select_streams", "a", video_path,
    ]
    try:
        completed = subprocess.run(
            ffprobe_cmd, capture_output=True, text=True, timeout=30
        )
        streams = json.loads(completed.stdout).get("streams", [])
        return [
            {
                "index": s.get("index"),
                "codec": s.get("codec_name"),
                "language": s.get("tags", {}).get("language", "und"),
                "channels": s.get("channels", 0),
                "sample_rate": s.get("sample_rate", "0"),
            }
            for s in streams
        ]
    except Exception as e:
        print(f"[ASRX] ffprobe failed: {e}")
        return []
|
||||
|
||||
|
||||
def select_best_track(tracks: list) -> int:
    """Pick the position (within *tracks*) of the audio track to use.

    The first track tagged "eng" or "en" wins outright; otherwise the track
    with the most channels is chosen, the earliest one on ties.  An empty
    list yields 0.  The return value is a list position, not the ffprobe
    stream index stored under ``"index"``.
    """
    if not tracks:
        return 0

    # Priority 1: the first track tagged as English.
    for pos, track in enumerate(tracks):
        if track["language"] in ("eng", "en"):
            print(f"[ASRX] Selected English track (index {track['index']})")
            return pos

    # Priority 2: most channels; max() keeps the earliest position on ties.
    best = max(range(len(tracks)), key=lambda pos: tracks[pos]["channels"])

    print(f"[ASRX] Selected track {best} (lang={tracks[best]['language']}, ch={tracks[best]['channels']})")
    return best
|
||||
|
||||
|
||||
def extract_audio_to_wav(video_path: str, track_index: int, output_wav: str) -> bool:
    """Run ffmpeg to write stream ``0:<track_index>`` as a 16 kHz mono s16 WAV.

    Returns True when ffmpeg exits successfully within 300 seconds.  Any
    failure is reported on stdout and yields False.
    """
    ffmpeg_cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", video_path]
    ffmpeg_cmd += ["-map", f"0:{track_index}"]
    ffmpeg_cmd += ["-ar", "16000", "-ac", "1", "-sample_fmt", "s16", output_wav]
    try:
        subprocess.run(ffmpeg_cmd, check=True, capture_output=True, timeout=300)
    except Exception as e:
        print(f"[ASRX] ffmpeg extraction failed: {e}")
        return False
    return True
|
||||
|
||||
|
||||
def _cleanup(tmp_dir):
    """Remove the temporary working directory *tmp_dir*, ignoring errors.

    Does nothing when *tmp_dir* is falsy or does not exist.
    """
    if not tmp_dir or not os.path.exists(tmp_dir):
        return
    import shutil

    shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
def _find_asr_json(video_path: str, output_path: str):
    """Return ``(found_path, candidates)`` for the ASR JSON belonging to this video.

    ``found_path`` is the first existing candidate, or "" when none exists.
    """
    output_dir = os.path.dirname(output_path) if output_path else "."
    candidates = []
    sibling = output_path.replace(".asrx.json", ".asr.json") if output_path else ""
    # Skip the sibling when the name did not change: it would be our own
    # (possibly stale) output file rather than an ASR result.
    if sibling and sibling != output_path:
        candidates.append(sibling)
    video_stem = os.path.basename(video_path).rsplit(".", 1)[0]
    candidates.append(os.path.join(output_dir, video_stem + ".asr.json"))
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate, candidates
    return "", candidates


def _read_probe_fps(output_path: str, default: float = 30.0) -> float:
    """Return the ``fps`` stored in ``<uuid>.probe.json`` beside *output_path*, else *default*."""
    output_dir = os.path.dirname(output_path) if output_path else "."
    base_name = os.path.basename(output_path) if output_path else ""
    # Output files are named {uuid}.{type}.json, so the uuid is the first dotted part.
    uuid_part = base_name.split(".")[0] if base_name else ""
    probe_path = os.path.join(output_dir, f"{uuid_part}.probe.json")
    if not os.path.exists(probe_path):
        return default
    try:
        with open(probe_path) as pf:
            probe_data = json.load(pf)
        if "fps" in probe_data:
            fps = float(probe_data["fps"])
            print(f"[ASRX] FPS from probe: {fps}", file=sys.stderr)
            return fps
    except (OSError, ValueError, TypeError) as e:
        # An unreadable or malformed probe file only costs us the exact fps.
        print(f"[ASRX] Could not read fps from {probe_path}: {e}", file=sys.stderr)
    return default


def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
    """Run speaker diarization on *video_path* and write the result as JSON.

    Stages:
      1. Probe the audio tracks, pick one and extract it to a temporary
         16 kHz mono WAV (falling back to the original file when the
         extraction fails).
      2. Look for an ASR result next to *output_path* so that its segment
         boundaries can drive the diarization.
      3. Run ``SelfASRXFixed`` and convert its segments to the output format
         (times, frame numbers, ``speaker_id``).

    Args:
        video_path: Input video/audio file.
        output_path: Destination of the JSON result.
        uuid: When non-empty, progress is published through ``RedisPublisher``.

    Returns:
        The dict that was written to *output_path*.  When processing fails
        this is ``{"language": None, "segments": []}``.
    """
    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    def write_empty_result():
        """Write and return the empty result used on every failure path."""
        empty = {"language": None, "segments": []}
        with open(output_path, "w") as f:
            json.dump(empty, f, indent=2)
        if publisher:
            publisher.complete("asrx", "0 segments")
        return empty

    tmp_dir = None

    try:
        # Resolve caller-relative paths before the os.chdir() below; otherwise
        # they would be looked up inside the scripts directory.
        video_path = os.path.abspath(video_path)
        if output_path:
            output_path = os.path.abspath(output_path)

        # Ensure working directory is the scripts dir for model loading
        script_dir = os.path.dirname(os.path.abspath(__file__))
        os.chdir(script_dir)

        # Debug: check ffmpeg availability
        import shutil

        ffmpeg_path = shutil.which("ffmpeg")
        print(f"[ASRX] ffmpeg: {ffmpeg_path}", file=sys.stderr)
        print(f"[ASRX] CWD: {os.getcwd()}", file=sys.stderr)

        # ---- Stage 1: Audio Track Preprocessing ----
        print("\n[ASRX] ===== Stage 1: Audio Track Analysis =====", file=sys.stderr)
        print(f"[ASRX] Input: {video_path}", file=sys.stderr)

        tracks = probe_audio_tracks(video_path)
        if tracks:
            print(f"[ASRX] Found {len(tracks)} audio track(s):", file=sys.stderr)
            for t in tracks:
                print(f" Track {t['index']}: {t['codec']} {t['channels']}ch {t['sample_rate']}Hz lang={t['language']}", file=sys.stderr)
        else:
            print("[ASRX] No audio tracks found via ffprobe, using raw file", file=sys.stderr)

        # select_best_track() returns a list position; ffmpeg needs the
        # stream index that ffprobe reported for that track.
        track_idx = select_best_track(tracks) if tracks else 0
        actual_track_index = tracks[track_idx]["index"] if tracks else track_idx

        # Extract audio to WAV
        tmp_dir = tempfile.mkdtemp(prefix="asrx_")
        wav_path = os.path.join(tmp_dir, "audio.wav")

        if extract_audio_to_wav(video_path, actual_track_index, wav_path):
            wav_size = os.path.getsize(wav_path)
            print(f"[ASRX] Audio extracted: {wav_path} ({wav_size / 1024 / 1024:.1f}MB)", file=sys.stderr)
            audio_input = wav_path
        else:
            print("[ASRX] Audio extraction failed, falling back to original file", file=sys.stderr)
            audio_input = video_path

        # ---- Stage 2: Load ASR segments for time alignment ----
        asr_segments = []
        asr_path, asr_candidates = _find_asr_json(video_path, output_path)
        if asr_path:
            try:
                with open(asr_path) as f:
                    asr_data = json.load(f)
                asr_segments = asr_data.get("segments", [])
                print(f"[ASRX] Loaded {len(asr_segments)} ASR segments from {asr_path}", file=sys.stderr)
                asr_fallback_reason = f"loaded_{len(asr_segments)}_segments"
            except Exception as e:
                asr_fallback_reason = f"load_error_{e}"
                print(f"[ASRX] Failed to load ASR segments: {e}", file=sys.stderr)
        else:
            asr_fallback_reason = f"asr_json_not_found_tried_{len(asr_candidates)}_paths"
            print(f"[ASRX] ASR output not found, tried {len(asr_candidates)} paths. First candidate: {asr_candidates[0]}", file=sys.stderr)

        # ---- Stage 3: ASRX Processing ----
        from asrx_self.main_fixed import SelfASRXFixed

        if publisher:
            publisher.info("asrx", "ASRX_LOADING_MODEL")

        asrx = SelfASRXFixed()

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        if asr_segments:
            # Use ASR segment boundaries for speaker embedding extraction
            print(f"[ASRX] Using {len(asr_segments)} ASR segments for diarization", file=sys.stderr)
            result = asrx.process_with_segments(
                audio_input,
                asr_segments,
                output_path=None,
            )
        else:
            # Fallback: VAD-based diarization
            result = asrx.process(
                audio_input,
                output_path=None,
                min_speech_duration_ms=500,
                max_speakers=10,
            )

        if "error" in result:
            if publisher:
                publisher.error("asrx", result["error"])
            return write_empty_result()

        # Convert to Rust-expected format (start_frame/end_frame/speaker)
        _debug = {"asr_fallback": asr_fallback_reason, "asr_path": asr_path}
        fps = _read_probe_fps(output_path)

        output_result = {
            "language": None,
            "segments": [],
        }

        for seg in result["segments"]:
            start_sec = seg["start"]
            end_sec = seg["end"]
            output_result["segments"].append(
                {
                    "start_time": start_sec,
                    "end_time": end_sec,
                    "start_frame": int(start_sec * fps),
                    "end_frame": int(end_sec * fps),
                    "text": "",
                    "speaker_id": seg["speaker"],
                }
            )

        # Add speaker_stats as optional metadata
        if "speaker_stats" in result:
            output_result["speaker_stats"] = result["speaker_stats"]

        # Pass the embeddings through unchanged (per the original note: one
        # 192-D speaker embedding per segment).
        if "embeddings" in result:
            output_result["embeddings"] = result["embeddings"]

        if publisher:
            publisher.info("asrx", f"ASRX_COMPLETE:{len(output_result['segments'])}")

        # Save output
        output_result["_debug"] = _debug
        with open(output_path, "w") as f:
            json.dump(output_result, f, indent=2)

        if publisher:
            publisher.complete("asrx", f"{len(output_result['segments'])} segments")

        print(f"[ASRX-Custom] Saved {len(output_result['segments'])} segments to {output_path}", file=sys.stderr)

        return output_result

    except Exception as e:
        if publisher:
            publisher.error("asrx", str(e))

        import traceback

        traceback.print_exc()

        # Return empty result on error
        return write_empty_result()

    finally:
        # Single cleanup point for every exit path, including failures while
        # writing the output file.
        _cleanup(tmp_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: diarize one file and print a short summary.
    cli = argparse.ArgumentParser(
        description="ASRX Processor (Custom Implementation)"
    )
    cli.add_argument("video_path", help="Path to video/audio file")
    cli.add_argument("output_path", help="Path to output JSON file")
    cli.add_argument("--uuid", help="UUID for Redis publishing", default="")
    opts = cli.parse_args()

    # Refuse to start when the input file is missing.
    if not Path(opts.video_path).exists():
        print(f"Error: Video file not found: {opts.video_path}")
        sys.exit(1)

    summary = process_asrx_custom(opts.video_path, opts.output_path, opts.uuid)

    print("\n[Summary]")
    print(f" Total segments: {len(summary['segments'])}")
    if "speaker_stats" in summary:
        speaker_stats = summary["speaker_stats"]
        print(f" Detected speakers: {len(speaker_stats)}")
        for speaker, stats in speaker_stats.items():
            print(f" {speaker}: {stats['count']} segments")
|
||||
@@ -1,177 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
ASRX 處理器 - 簡化版
|
||||
先做轉錄,說話人分離可選
|
||||
修復 PyTorch 2.6 兼容性問題
|
||||
"""
|
||||
|
||||
# Fix for PyTorch 2.6+ compatibility - MUST be set before importing torch
|
||||
import os
|
||||
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import signal
|
||||
import subprocess
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
    """Announce the received signal on stdout and exit with status 1."""
    print(f"ASRX: Received signal {signum}, exiting...")
    raise SystemExit(1)
|
||||
|
||||
|
||||
def has_audio_stream(video_path):
    """Report whether ffprobe finds at least one audio stream in *video_path*.

    Returns False when ffprobe exits with an error or lists no audio stream,
    and True (with a warning on stdout) when the ffprobe executable itself
    is missing, so that processing is attempted anyway.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "a",
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        video_path,
    ]
    try:
        listing = subprocess.run(
            probe_cmd, capture_output=True, text=True, check=True
        ).stdout
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        print("WARNING: ffprobe not found, assuming audio exists")
        return True
    return bool(listing.strip())
|
||||
|
||||
|
||||
def process_asrx(video_path: str, output_path: str, uuid: str = "", skip_diarization: bool = True):
    """
    Transcribe *video_path* with whisperx and write the segments as JSON.

    This simplified variant never runs diarization: every segment is written
    with ``speaker_id`` set to None.  The function does not return; it always
    ends the process through ``sys.exit``.

    Args:
        video_path: Path to video file
        output_path: Path to output JSON
        uuid: UUID for Redis progress (nothing is published when empty)
        skip_diarization: Only recorded, negated, as ``diarization_enabled``
            in the output.

    Exit status:
        0 after a successful transcription or when the file has no audio
        stream; 1 when whisperx/torch cannot be imported or transcription
        fails.
    """
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None

    def announce(message):
        """Publish an informational progress message when Redis is enabled."""
        if publisher:
            publisher.info("asrx", message)

    def finish(payload, summary, ensure_ascii=True):
        """Write *payload* to output_path, then publish the completion event.

        The file is written first so that a consumer reacting to the
        completion event always finds it, and as UTF-8 so that unescaped
        non-ASCII transcript text does not depend on the locale encoding.
        """
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=ensure_ascii)
        if publisher:
            publisher.complete("asrx", summary)

    announce("ASRX_START")

    try:
        import whisperx
        import torch  # noqa: F401 - imported only to verify the dependency exists
    except ImportError as e:
        if publisher:
            publisher.error("asrx", f"Missing dependency: {e}")
        finish({"language": None, "segments": []}, "0 segments")
        sys.exit(1)

    # Check for audio stream
    if not has_audio_stream(video_path):
        announce("No audio stream detected, skipping transcription")
        finish(
            {"language": "", "language_probability": 0.0, "segments": []},
            "0 segments (no audio)",
        )
        sys.stderr.write("ASRX: No audio stream, skipping transcription\n")
        sys.stderr.flush()
        sys.exit(0)

    announce("ASRX_LOADING_MODEL")

    try:
        # Load model
        announce("Loading whisperx base model (this may take a while)...")
        model = whisperx.load_model("base", device="cpu", compute_type="int8")

        announce("ASRX_TRANSCRIBING")

        # Transcribe with language detection
        result = model.transcribe(video_path)

        announce(f"ASRX_LANGUAGE:{result.get('language', 'unknown')}")

        # Build output (without diarization for now); empty segments are dropped.
        segments = []
        for seg in result.get("segments", []):
            text = seg.get("text", "").strip()
            if text:
                segments.append(
                    {
                        "start": seg.get("start", 0.0),
                        "end": seg.get("end", 0.0),
                        "text": text,
                        "speaker_id": None,  # Will be added when diarization is enabled
                    }
                )

        output_result = {
            "language": result.get("language"),
            "language_probability": result.get("language_probability", 0),
            "segments": segments,
            "diarization_enabled": not skip_diarization
        }

        finish(output_result, f"{len(segments)} segments", ensure_ascii=False)

        sys.stderr.write(
            f"ASRX: Transcription complete, {len(segments)} segments written to {output_path}\n"
        )
        sys.stderr.flush()
        sys.exit(0)

    except Exception as e:
        # sys.exit() above raises SystemExit, which is not an Exception, so
        # only genuine failures end up here.
        if publisher:
            publisher.error("asrx", f"Error: {e}")
        import traceback

        traceback.print_exc()
        finish(
            {"language": None, "segments": [], "error": str(e)},
            "0 segments (error)",
        )
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point for the simplified (transcription-only) processor.
    cli = argparse.ArgumentParser(description="ASRX Speaker Diarization (Simplified)")
    cli.add_argument("video_path", help="Path to video file")
    cli.add_argument("output_path", help="Output JSON path")
    cli.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    cli.add_argument(
        "--skip-diarization",
        action="store_true",
        help="Skip speaker diarization (only transcription)"
    )
    opts = cli.parse_args()

    process_asrx(
        video_path=opts.video_path,
        output_path=opts.output_path,
        uuid=opts.uuid,
        skip_diarization=opts.skip_diarization,
    )
|
||||
@@ -1,212 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
ASRX 處理器 v2 - 說話人分離
|
||||
使用 whisperx 進行轉錄和說話人分離
|
||||
需要 PyTorch 2.5.0 + torchvision 0.20.0 + torchaudio 2.5.0
|
||||
"""
|
||||
|
||||
# Fix for PyTorch 2.5 compatibility
|
||||
import os
|
||||
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import signal
|
||||
import subprocess
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
    """Announce the received signal on stdout and exit with status 1."""
    print(f"ASRX: Received signal {signum}, exiting...")
    raise SystemExit(1)
|
||||
|
||||
|
||||
def has_audio_stream(video_path):
    """Report whether ffprobe finds at least one audio stream in *video_path*.

    Returns False when ffprobe exits with an error or lists no audio stream,
    and True (with a warning on stdout) when the ffprobe executable itself
    is missing, so that processing is attempted anyway.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "a",
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        video_path,
    ]
    try:
        listing = subprocess.run(
            probe_cmd, capture_output=True, text=True, check=True
        ).stdout
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        print("WARNING: ffprobe not found, assuming audio exists")
        return True
    return bool(listing.strip())
|
||||
|
||||
|
||||
def process_asrx(video_path: str, output_path: str, uuid: str = "", skip_diarization: bool = False):
    """
    Transcribe, align and optionally diarize *video_path* with whisperx.

    The function does not return; it always ends the process through
    ``sys.exit``.  A diarization failure is logged and the transcription is
    still written, with whatever speaker labels are available.

    Args:
        video_path: Path to video file
        output_path: Path to output JSON
        uuid: UUID for Redis progress (nothing is published when empty)
        skip_diarization: Skip speaker diarization (only transcription)

    Exit status:
        0 after a successful run or when the file has no audio stream;
        1 when whisperx/torch cannot be imported or processing fails.
    """
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None

    def announce(message):
        """Publish an informational progress message when Redis is enabled."""
        if publisher:
            publisher.info("asrx", message)

    def finish(payload, summary, ensure_ascii=True):
        """Write *payload* to output_path, then publish the completion event.

        The file is written first so that a consumer reacting to the
        completion event always finds it, and as UTF-8 so that unescaped
        non-ASCII transcript text does not depend on the locale encoding.
        """
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=ensure_ascii)
        if publisher:
            publisher.complete("asrx", summary)

    announce("ASRX_START")

    # Check for audio stream
    if not has_audio_stream(video_path):
        announce("No audio stream detected, skipping transcription")
        finish(
            {"language": "", "language_probability": 0.0, "segments": []},
            "0 segments (no audio)",
        )
        sys.stderr.write("ASRX: No audio stream, skipping transcription\n")
        sys.stderr.flush()
        sys.exit(0)

    announce("ASRX_LOADING_MODEL")

    try:
        import whisperx
        import torch  # noqa: F401 - imported only to verify the dependency exists
    except ImportError as e:
        if publisher:
            publisher.error("asrx", f"Missing dependency: {e}")
        finish({"language": None, "segments": [], "error": str(e)}, "0 segments")
        sys.exit(1)

    try:
        # Load model
        announce("Loading whisperx base model (this may take a while)...")
        model = whisperx.load_model("base", device="cpu", compute_type="int8")

        announce("ASRX_TRANSCRIBING")

        # Transcribe with language detection
        result = model.transcribe(video_path)

        announce(f"ASRX_LANGUAGE:{result.get('language', 'unknown')}")

        # `result` is replaced by the return value of whisperx.align() below,
        # which carries the aligned segments but not the language metadata,
        # so capture the detected language while it is still available.
        language = result.get("language")
        language_probability = result.get("language_probability", 0)

        # Align timestamps
        announce("ASRX_ALIGNING_TIMESTAMPS")

        model_a, metadata = whisperx.load_align_model(
            language_code=result["language"],
            device="cpu"
        )
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            video_path,
            device="cpu"
        )

        # Diarization (speaker segmentation)
        if not skip_diarization:
            announce("ASRX_DIARIZATION")

            try:
                diarize_model = whisperx.DiarizationPipeline(use_auth_token=None)
                diarize_segments = diarize_model(video_path)

                # Assign speaker labels
                result = whisperx.assign_word_speakers(diarize_segments, result)

                announce("Diarization completed")
            except Exception as e:
                # Best effort: keep the aligned transcription without speakers.
                announce(f"Diarization skipped: {e}")
                sys.stderr.write(f"ASRX: Diarization failed: {e}\n")

        # Build output; empty segments are dropped.
        segments = []
        for seg in result.get("segments", []):
            text = seg.get("text", "").strip()
            if text:
                segments.append(
                    {
                        "start": seg.get("start", 0.0),
                        "end": seg.get("end", 0.0),
                        "text": text,
                        "speaker_id": seg.get("speaker", None),
                    }
                )

        output_result = {
            "language": language,
            "language_probability": language_probability,
            "segments": segments,
            "diarization_enabled": not skip_diarization
        }

        finish(output_result, f"{len(segments)} segments", ensure_ascii=False)

        sys.stderr.write(
            f"ASRX: Transcription complete, {len(segments)} segments written to {output_path}\n"
        )
        sys.stderr.flush()
        sys.exit(0)

    except Exception as e:
        # sys.exit() above raises SystemExit, which is not an Exception, so
        # only genuine failures end up here.
        if publisher:
            publisher.error("asrx", f"Error: {e}")
        import traceback

        traceback.print_exc()
        finish(
            {"language": None, "segments": [], "error": str(e)},
            "0 segments (error)",
        )
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point for the v2 (transcribe + align + diarize) processor.
    cli = argparse.ArgumentParser(description="ASRX Speaker Diarization v2")
    cli.add_argument("video_path", help="Path to video file")
    cli.add_argument("output_path", help="Output JSON path")
    cli.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    cli.add_argument(
        "--skip-diarization",
        action="store_true",
        help="Skip speaker diarization (only transcription)"
    )
    opts = cli.parse_args()

    process_asrx(
        video_path=opts.video_path,
        output_path=opts.output_path,
        uuid=opts.uuid,
        skip_diarization=opts.skip_diarization,
    )
|
||||
@@ -1,184 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
ASRX 處理器 v2 - 快速版(跳過對齊)
|
||||
使用 whisperx 進行轉錄和說話人分離
|
||||
跳過時間戳對齊以避開 PyTorch 版本問題
|
||||
"""
|
||||
|
||||
import os
|
||||
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import signal
|
||||
import subprocess
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
    """Announce the received signal on stdout and exit with status 1."""
    print(f"ASRX: Received signal {signum}, exiting...")
    raise SystemExit(1)
|
||||
|
||||
|
||||
def has_audio_stream(video_path):
    """Report whether ffprobe finds at least one audio stream in *video_path*.

    Returns False when ffprobe exits with an error or lists no audio stream,
    and True (with a warning on stdout) when the ffprobe executable itself
    is missing, so that processing is attempted anyway.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "a",
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        video_path,
    ]
    try:
        listing = subprocess.run(
            probe_cmd, capture_output=True, text=True, check=True
        ).stdout
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        print("WARNING: ffprobe not found, assuming audio exists")
        return True
    return bool(listing.strip())
|
||||
|
||||
|
||||
def process_asrx(video_path: str, output_path: str, uuid: str = ""):
    """
    Transcribe and diarize *video_path* with whisperx, skipping alignment.

    The function does not return; it always ends the process through
    ``sys.exit``.  A diarization failure is logged and the transcription is
    still written, with whatever speaker labels are available.

    Args:
        video_path: Path to video file
        output_path: Path to output JSON
        uuid: UUID for Redis progress (nothing is published when empty)

    Exit status:
        0 after a successful run or when the file has no audio stream;
        1 when whisperx/torch cannot be imported or processing fails.
    """
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None

    def announce(message):
        """Publish an informational progress message when Redis is enabled."""
        if publisher:
            publisher.info("asrx", message)

    def finish(payload, summary, ensure_ascii=True):
        """Write *payload* to output_path, then publish the completion event.

        The file is written first so that a consumer reacting to the
        completion event always finds it, and as UTF-8 so that unescaped
        non-ASCII transcript text does not depend on the locale encoding.
        """
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=ensure_ascii)
        if publisher:
            publisher.complete("asrx", summary)

    announce("ASRX_START")

    # Check for audio stream
    if not has_audio_stream(video_path):
        announce("No audio stream detected")
        finish(
            {"language": "", "language_probability": 0.0, "segments": []},
            "0 segments (no audio)",
        )
        sys.exit(0)

    announce("ASRX_LOADING_MODEL")

    try:
        import whisperx
        import torch  # noqa: F401 - imported only to verify the dependency exists
    except ImportError as e:
        if publisher:
            publisher.error("asrx", f"Missing dependency: {e}")
        finish({"language": None, "segments": [], "error": str(e)}, "0 segments")
        sys.exit(1)

    try:
        # Load model
        announce("Loading whisperx base model...")
        model = whisperx.load_model("base", device="cpu", compute_type="int8")

        announce("ASRX_TRANSCRIBING")

        # Transcribe with language detection
        result = model.transcribe(video_path)

        announce(f"ASRX_LANGUAGE:{result.get('language', 'unknown')}")

        # Alignment is deliberately skipped in this variant (the original
        # note cites a PyTorch version requirement); go straight to diarization.
        announce("ASRX_DIARIZATION")

        try:
            diarize_model = whisperx.DiarizationPipeline(use_auth_token=None)
            diarize_segments = diarize_model(video_path)

            # Assign speaker labels
            result = whisperx.assign_word_speakers(diarize_segments, result)

            announce("Diarization completed")
        except Exception as e:
            # Best effort: keep the plain transcription without speakers.
            announce(f"Diarization info: {e}")
            sys.stderr.write(f"ASRX: Diarization note: {e}\n")

        # Build output; empty segments are dropped.
        segments = []
        for seg in result.get("segments", []):
            text = seg.get("text", "").strip()
            if text:
                segments.append(
                    {
                        "start": seg.get("start", 0.0),
                        "end": seg.get("end", 0.0),
                        "text": text,
                        "speaker_id": seg.get("speaker", None),
                    }
                )

        output_result = {
            "language": result.get("language"),
            "language_probability": result.get("language_probability", 0),
            "segments": segments,
            "diarization_enabled": True,
            "alignment_enabled": False,
            "note": "Alignment skipped due to PyTorch version compatibility"
        }

        finish(output_result, f"{len(segments)} segments", ensure_ascii=False)

        sys.stderr.write(
            f"ASRX: Transcription complete, {len(segments)} segments written to {output_path}\n"
        )
        sys.stderr.flush()
        sys.exit(0)

    except Exception as e:
        # sys.exit() above raises SystemExit, which is not an Exception, so
        # only genuine failures end up here.
        if publisher:
            publisher.error("asrx", f"Error: {e}")
        import traceback

        traceback.print_exc()
        finish(
            {"language": None, "segments": [], "error": str(e)},
            "0 segments (error)",
        )
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point for the no-alignment processor.
    cli = argparse.ArgumentParser(description="ASRX Speaker Diarization v2 (No Alignment)")
    cli.add_argument("video_path", help="Path to video file")
    cli.add_argument("output_path", help="Output JSON path")
    cli.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    opts = cli.parse_args()

    process_asrx(
        video_path=opts.video_path,
        output_path=opts.output_path,
        uuid=opts.uuid,
    )
|
||||
@@ -1,165 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
ASRX 處理器 v2 - 轉錄版
|
||||
使用 whisperx 進行轉錄(不含說話人分離)
|
||||
說話人分離需要額外安裝 pyannote.audio 並配置 HuggingFace token
|
||||
"""
|
||||
|
||||
import os
|
||||
os.environ["TORCH_FORCE_WEIGHTS_ONLY_LOAD"] = "0"
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import signal
|
||||
import subprocess
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
    """Announce the received signal on stdout and exit with status 1."""
    print(f"ASRX: Received signal {signum}, exiting...")
    raise SystemExit(1)
|
||||
|
||||
|
||||
def has_audio_stream(video_path):
    """Report whether ffprobe finds at least one audio stream in *video_path*.

    Returns False when ffprobe exits with an error or lists no audio stream,
    and True (with a warning on stdout) when the ffprobe executable itself
    is missing, so that processing is attempted anyway.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "a",
        "-show_entries", "stream=codec_type",
        "-of", "csv=p=0",
        video_path,
    ]
    try:
        listing = subprocess.run(
            probe_cmd, capture_output=True, text=True, check=True
        ).stdout
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        print("WARNING: ffprobe not found, assuming audio exists")
        return True
    return bool(listing.strip())
|
||||
|
||||
|
||||
def _write_asrx_json(output_path, payload, ensure_ascii=True):
    """Write ``payload`` to ``output_path`` as indented JSON in a UTF-8 file."""
    # Explicit UTF-8 keeps non-ASCII transcript text writable on any locale.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2, ensure_ascii=ensure_ascii)


def process_asrx(video_path: str, output_path: str, uuid: str = ""):
    """Transcribe a video with whisperx and write the result as JSON.

    The function always terminates the process through ``sys.exit``: status 0
    after a successful transcription or when the input has no audio stream,
    status 1 when a dependency is missing or transcription fails. In every
    case a JSON document is written to ``output_path`` first, and only then is
    the "complete" progress event published, so a consumer reacting to that
    event finds the file on disk.

    Args:
        video_path: Path to video file.
        output_path: Path to output JSON.
        uuid: UUID for Redis progress; when empty no progress is published.

    Raises:
        SystemExit: Always (see exit statuses above).
    """
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asrx", "ASRX_START")

    # Check for audio stream
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asrx", "No audio stream detected")
        _write_asrx_json(
            output_path,
            {"language": "", "language_probability": 0.0, "segments": []},
        )
        if publisher:
            publisher.complete("asrx", "0 segments (no audio)")
        sys.exit(0)

    if publisher:
        publisher.info("asrx", "ASRX_LOADING_MODEL")

    try:
        import whisperx
        # torch is not referenced below; the import appears to act as a
        # dependency check so a missing install is reported up front.
        import torch  # noqa: F401
    except ImportError as e:
        if publisher:
            publisher.error("asrx", f"Missing dependency: {e}")
        _write_asrx_json(
            output_path, {"language": None, "segments": [], "error": str(e)}
        )
        if publisher:
            publisher.complete("asrx", "0 segments")
        sys.exit(1)

    try:
        # Load model
        if publisher:
            publisher.info("asrx", "Loading whisperx base model...")

        model = whisperx.load_model("base", device="cpu", compute_type="int8")

        if publisher:
            publisher.info("asrx", "ASRX_TRANSCRIBING")

        # Transcribe with language detection
        result = model.transcribe(video_path)

        if publisher:
            publisher.info("asrx", f"ASRX_LANGUAGE:{result.get('language', 'unknown')}")

        # Build output (without alignment and diarization due to PyTorch version)
        segments = []
        for seg in result.get("segments", []):
            text = seg.get("text", "").strip()
            if not text:
                continue  # drop segments that contain only whitespace
            segments.append(
                {
                    "start": seg.get("start", 0.0),
                    "end": seg.get("end", 0.0),
                    "text": text,
                    "speaker_id": None,  # Requires pyannote.audio + HuggingFace token
                }
            )

        output_result = {
            "language": result.get("language"),
            "language_probability": result.get("language_probability", 0),
            "segments": segments,
            "diarization_enabled": False,
            "alignment_enabled": False,
            "note": "PyTorch 2.5.0 compatibility - alignment and diarization require additional setup"
        }

        # Persist first, then announce completion.
        _write_asrx_json(output_path, output_result, ensure_ascii=False)

        if publisher:
            publisher.complete("asrx", f"{len(segments)} segments")

        sys.stderr.write(
            f"ASRX: Transcription complete, {len(segments)} segments written to {output_path}\n"
        )
        sys.stderr.flush()
        sys.exit(0)

    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit(0) above
        # passes through this handler untouched.
        if publisher:
            publisher.error("asrx", f"Error: {e}")
        import traceback

        traceback.print_exc()
        _write_asrx_json(
            output_path, {"language": None, "segments": [], "error": str(e)}
        )
        if publisher:
            publisher.complete("asrx", "0 segments (error)")
        sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: two positional paths plus an optional Redis UUID.
    cli = argparse.ArgumentParser(description="ASRX Transcription (PyTorch 2.5.0)")
    cli.add_argument("video_path", help="Path to video file")
    cli.add_argument("output_path", help="Output JSON path")
    cli.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    options = cli.parse_args()

    process_asrx(options.video_path, options.output_path, options.uuid)
|
||||
@@ -1,178 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
整合 Face + ASRX 說話人分離(版本 3 - 修復 face_detected 檢查)
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
|
||||
def load_json(path: str):
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded object."""
    with open(path, encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
|
||||
|
||||
|
||||
def match_face_with_speaker_v3(face_data: Dict, asrx_data: Dict,
                               time_threshold: float = 3.0) -> List[Dict]:
    """Pair every ASRX segment with the nearest frame that contains a face.

    A frame is a candidate for a segment when its timestamp lies within
    ``[start - time_threshold, end + time_threshold]`` and its ``faces`` list
    is non-empty (the face data carries no ``face_detected`` field, so the
    list itself is the indicator). Among the candidates, the frame closest to
    the segment midpoint is chosen; ties go to the earlier frame.

    Args:
        face_data: Dict with a ``frames`` list; each frame may carry
            ``timestamp`` (treated as 0 when missing) and ``faces``.
        asrx_data: Dict with a ``segments`` list; each segment must carry
            ``start``, ``end`` and ``speaker`` and may carry ``duration``.
        time_threshold: Slack, in the same unit as the timestamps, added on
            both sides of each segment.

    Returns:
        One dict per segment with ``start``, ``end``, ``duration``,
        ``speaker``, ``has_face``, ``face_timestamp``, ``face_location``
        (first entry of the chosen frame's ``faces``) and
        ``face_count_in_range``.

    Raises:
        KeyError: If a segment lacks ``start``, ``end`` or ``speaker``.
    """
    from bisect import bisect_left, bisect_right

    face_frames = face_data.get('frames', [])
    asrx_segments = asrx_data.get('segments', [])

    # Sort frames by time once so each segment can locate its window by
    # binary search instead of rescanning every frame.
    face_frames_sorted = sorted(face_frames, key=lambda x: x.get('timestamp', 0))
    timestamps = [frame.get('timestamp', 0) for frame in face_frames_sorted]

    print(f" Face frames: {len(face_frames_sorted)}")
    print(f" ASRX segments: {len(asrx_segments)}")

    integrated = []

    for i, seg in enumerate(asrx_segments):
        start = seg['start']
        end = seg['end']
        speaker = seg['speaker']
        mid_time = (start + end) / 2

        # Inclusive window bounds, matching `lo <= ts <= hi`.
        first = bisect_left(timestamps, start - time_threshold)
        last = bisect_right(timestamps, end + time_threshold)

        faces_in_range = []
        for ts, frame in zip(timestamps[first:last], face_frames_sorted[first:last]):
            faces = frame.get('faces', [])
            if faces:
                faces_in_range.append({
                    'timestamp': ts,
                    'faces': faces,
                    'distance_from_mid': abs(ts - mid_time)
                })

        # min() returns the first minimal element, i.e. the earliest frame on ties.
        best_face = (
            min(faces_in_range, key=lambda x: x['distance_from_mid'])
            if faces_in_range else None
        )

        integrated.append({
            'start': start,
            'end': end,
            'duration': seg.get('duration', end - start),
            'speaker': speaker,
            'has_face': best_face is not None,
            'face_timestamp': best_face['timestamp'] if best_face else None,
            'face_location': best_face['faces'][0] if best_face and best_face['faces'] else None,
            'face_count_in_range': len(faces_in_range)
        })

        # Progress report every 200 segments.
        if (i + 1) % 200 == 0:
            print(f" Processed {i+1}/{len(asrx_segments)} segments...")

    return integrated
|
||||
|
||||
|
||||
def analyze_speaker_face(integrated: List[Dict]):
    """Aggregate per-speaker segment counts, face coverage and total duration.

    Args:
        integrated: Items carrying ``speaker``, ``duration`` and ``has_face``.

    Returns:
        Dict keyed by speaker; each value holds ``total_segments``,
        ``with_face``, ``without_face`` and ``total_duration``.
    """
    per_speaker = {}

    for entry in integrated:
        name = entry['speaker']
        if name not in per_speaker:
            per_speaker[name] = dict.fromkeys(
                ('total_segments', 'with_face', 'without_face', 'total_duration'), 0
            )
        tally = per_speaker[name]

        tally['total_segments'] += 1
        tally['total_duration'] += entry['duration']

        counter_key = 'with_face' if entry['has_face'] else 'without_face'
        tally[counter_key] += 1

    return per_speaker
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: merge face-detection and ASRX results.

    Loads both JSON inputs, matches faces to speaker segments, prints
    per-speaker and overall statistics, and writes the combined result when
    ``--output`` is given.

    Returns:
        Tuple ``(integrated, speaker_stats)``.
    """
    parser = argparse.ArgumentParser(description='整合 Face + ASRX 說話人')
    parser.add_argument('face_json', help='Face 檢測結果 JSON')
    parser.add_argument('asrx_json', help='ASRX 說話人分離 JSON')
    parser.add_argument('-o', '--output', help='輸出整合結果 JSON')
    parser.add_argument('--threshold', type=float, default=3.0,
                        help='時間閾值(秒)')
    # NOTE(review): --stats is accepted but never read below.
    parser.add_argument('--stats', action='store_true', help='只显示統計')

    args = parser.parse_args()

    # Load inputs
    print(f"[Load] Face: {args.face_json}")
    face_data = load_json(args.face_json)

    print(f"[Load] ASRX: {args.asrx_json}")
    asrx_data = load_json(args.asrx_json)

    # Match
    print(f"\n[Match] Matching faces with speakers (threshold={args.threshold}s)...")
    integrated = match_face_with_speaker_v3(face_data, asrx_data, args.threshold)

    # Analyze
    print("\n[Analyze] Analyzing speaker-face correspondence...")
    speaker_stats = analyze_speaker_face(integrated)

    # Report
    print(f"\n{'='*70}")
    print("說話人 - 人臉對應統計")
    print(f"{'='*70}")

    total_segments = len(integrated)
    total_with_face = sum(1 for item in integrated if item['has_face'])

    for speaker, stats in sorted(speaker_stats.items()):
        with_face_pct = stats['with_face'] / stats['total_segments'] * 100 if stats['total_segments'] > 0 else 0
        print(f"\n🔊 {speaker}:")
        print(f" 總片段:{stats['total_segments']}")
        print(f" 有人臉:{stats['with_face']} ({with_face_pct:.1f}%)")
        print(f" 無人臉:{stats['without_face']}")
        print(f" 總時長:{stats['total_duration']:.1f}s ({stats['total_duration']/60:.1f}分鐘)")

    # An ASRX file without segments must not crash the summary line.
    overall_pct = total_with_face / total_segments * 100 if total_segments else 0.0

    print(f"\n{'='*70}")
    print(f"總計:{total_segments} 片段,{total_with_face} 片段有人臉 ({overall_pct:.1f}%)")
    print(f"{'='*70}")

    # Save
    if args.output:
        output_path = Path(args.output)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        result = {
            'face_source': str(args.face_json),
            'asrx_source': str(args.asrx_json),
            'time_threshold': args.threshold,
            'integrated_segments': integrated,
            'speaker_stats': speaker_stats
        }

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        print(f"\n[Save] Results saved to: {output_path}")

    return integrated, speaker_stats


if __name__ == "__main__":
    main()
|
||||
@@ -1,268 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Self-implemented ASRX - 自實作說話人分離系統
|
||||
基於聲紋嵌入 + 譜聚類
|
||||
|
||||
技術架構:
|
||||
1. VAD (Silero VAD) - 語音活動檢測
|
||||
2. Speaker Encoder (ECAPA-TDNN) - 聲紋特徵提取
|
||||
3. Spectral Clustering - 譜聚類
|
||||
4. Post-processing - 後處理
|
||||
|
||||
流程:
|
||||
音頻 → VAD → 語音片段 → 聲紋嵌入 → 相似度矩陣 → 譜聚類 → 說話人 ID
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# 導入自定義模組
|
||||
from vad import load_vad_model, extract_speech_segments
|
||||
from speaker_encoder import (
|
||||
load_speaker_encoder,
|
||||
extract_speaker_embeddings_batch,
|
||||
compute_similarity_matrix,
|
||||
normalize_embeddings,
|
||||
)
|
||||
from speaker_cluster import spectral_clustering_speaker, smooth_speaker_labels
|
||||
|
||||
|
||||
class SelfASRX:
    """Speaker diarization built from VAD, speaker embeddings and spectral clustering."""

    def __init__(self):
        """Load the VAD model and the speaker encoder, logging progress to stdout."""
        print("[SelfASRX] Initializing models...")

        # VAD model
        print("[SelfASRX] Loading VAD model (Silero)...")
        vad_model, vad_utils = load_vad_model()
        self.vad_model = vad_model
        self.vad_utils = vad_utils

        # Speaker embedding model
        print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
        self.speaker_encoder = load_speaker_encoder()

        print("[SelfASRX] Models loaded successfully")

    def process(
        self,
        audio_path,
        output_path=None,
        min_speech_duration_ms=500,
        n_speakers=None,
        smooth_window=5,
    ):
        """Run speaker diarization on an audio file.

        Args:
            audio_path: Path of the audio file.
            output_path: Optional path of the JSON file to write.
            min_speech_duration_ms: Minimum speech duration passed to the VAD step.
            n_speakers: Number of speakers; ``None`` requests automatic estimation.
            smooth_window: Label smoothing window; smoothing runs only when > 1.

        Returns:
            The diarization result dict, or
            ``{"error": "No speech detected", "segments": []}`` when the VAD
            step yields no segments (nothing is written in that case).
        """
        run_started = time.time()
        print(f"\n[SelfASRX] Processing: {audio_path}")
        print("=" * 60)

        # Step 1: voice activity detection
        print("\n[Step 1] Voice Activity Detection...")
        vad_started = time.time()

        speech_segments, wav, sample_rate = extract_speech_segments(
            audio_path,
            self.vad_model,
            self.vad_utils,
            min_speech_duration_ms=min_speech_duration_ms,
        )

        vad_elapsed = time.time() - vad_started
        print(f" Speech segments: {len(speech_segments)}")
        print(f" Total duration: {len(wav) / sample_rate:.2f}s")
        print(f" VAD time: {vad_elapsed:.2f}s")

        if len(speech_segments) < 1:
            print("[SelfASRX] No speech detected!")
            return {"error": "No speech detected", "segments": []}

        # Step 2: speaker embeddings
        print("\n[Step 2] Speaker embedding extraction...")
        embed_started = time.time()

        # Cut the waveform into one slice per speech segment.
        audio_segments = [
            wav[int(begin_sec * sample_rate):int(stop_sec * sample_rate)]
            for begin_sec, stop_sec in speech_segments
        ]

        embeddings = normalize_embeddings(
            extract_speaker_embeddings_batch(
                self.speaker_encoder, audio_segments, sample_rate
            )
        )

        embed_elapsed = time.time() - embed_started
        print(f" Embedding shape: {embeddings.shape}")
        print(f" Embedding time: {embed_elapsed:.2f}s")

        # Step 3: similarity matrix
        print("\n[Step 3] Computing similarity matrix...")
        sim_started = time.time()

        similarity_matrix = compute_similarity_matrix(embeddings, method="cosine")

        sim_elapsed = time.time() - sim_started
        print(f" Similarity matrix shape: {similarity_matrix.shape}")
        print(f" Similarity time: {sim_elapsed:.2f}s")

        # Step 4: spectral clustering
        print("\n[Step 4] Spectral clustering...")
        cluster_started = time.time()

        auto_estimate = n_speakers is None
        speaker_labels, estimated_n_speakers = spectral_clustering_speaker(
            similarity_matrix, n_speakers=n_speakers, auto_estimate=auto_estimate
        )

        # Smooth the label sequence
        if smooth_window > 1:
            speaker_labels = smooth_speaker_labels(
                speaker_labels, window_size=smooth_window
            )

        cluster_elapsed = time.time() - cluster_started
        print(f" Estimated speakers: {estimated_n_speakers}")
        print(f" Clustering time: {cluster_elapsed:.2f}s")

        # Step 5: assemble the result
        print("\n[Step 5] Building output...")

        result = {
            "audio_path": str(audio_path),
            "total_duration": len(wav) / sample_rate,
            "n_speech_segments": len(speech_segments),
            "n_speakers": int(estimated_n_speakers),
            "segments": [],
        }

        segment_records = result["segments"]
        for position, ((begin, stop), label) in enumerate(
            zip(speech_segments, speaker_labels)
        ):
            segment_records.append(
                {
                    "index": position,
                    "start": round(begin, 3),
                    "end": round(stop, 3),
                    "duration": round(stop - begin, 3),
                    "speaker": f"SPEAKER_{int(label)}",
                }
            )

        # Total segment count and duration per speaker
        speaker_stats = {}
        for record in segment_records:
            tally = speaker_stats.setdefault(
                record["speaker"], {"count": 0, "duration": 0}
            )
            tally["count"] += 1
            tally["duration"] += record["duration"]

        result["speaker_stats"] = speaker_stats

        total_time = time.time() - run_started
        result["processing_time"] = round(total_time, 2)
        result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

        print("\n[SelfASRX] Processing completed!")
        print(f" Total time: {total_time:.2f}s")
        print(f" Realtime factor: {result['realtime_factor']:.2f}x")
        print(f" Detected speakers: {estimated_n_speakers}")

        # Save the result
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            with open(output_path, "w", encoding="utf-8") as sink:
                json.dump(result, sink, indent=2, ensure_ascii=False)

            print(f" Results saved to: {output_path}")

        print("=" * 60)

        return result
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the SelfASRX diarization pipeline.

    Parses the arguments, exits with status 1 when the audio file is missing,
    runs ``SelfASRX.process`` and prints a summary unless the result carries
    an ``error`` key.
    """
    import argparse

    cli = argparse.ArgumentParser(
        description="Self-implemented ASRX - Speaker Diarization"
    )
    cli.add_argument("audio_path", help="Path to audio file")
    cli.add_argument("-o", "--output", help="Output JSON path")

    # The three tuning knobs are all integer options; register them in order.
    int_options = (
        ("--min-speech-duration", 500, "Minimum speech duration in ms (default: 500)"),
        ("--n-speakers", None, "Number of speakers (default: auto-estimate)"),
        ("--smooth-window", 5, "Smoothing window size (default: 5)"),
    )
    for flag, default, help_text in int_options:
        cli.add_argument(flag, type=int, default=default, help=help_text)

    args = cli.parse_args()

    # Bail out early when the input file does not exist.
    if not Path(args.audio_path).exists():
        print(f"Error: Audio file not found: {args.audio_path}")
        sys.exit(1)

    # Build the pipeline and run it.
    pipeline = SelfASRX()
    result = pipeline.process(
        args.audio_path,
        args.output,
        min_speech_duration_ms=args.min_speech_duration,
        n_speakers=args.n_speakers,
        smooth_window=args.smooth_window,
    )

    if "error" in result:
        return

    # Result summary
    print("\n[Summary]")
    print(f" Audio duration: {result['total_duration']:.2f}s")
    print(f" Speech segments: {result['n_speech_segments']}")
    print(f" Detected speakers: {result['n_speakers']}")
    print(f" Processing time: {result['processing_time']:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")

    print("\n[Speaker Statistics]")
    for speaker, stats in result["speaker_stats"].items():
        pct = stats["duration"] / result["total_duration"] * 100
        line = (
            f" {speaker}: {stats['count']} segments, "
            f"{stats['duration']:.2f}s ({pct:.1f}%)"
        )
        print(line)


if __name__ == "__main__":
    main()
|
||||
@@ -1,308 +1,728 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Self-implemented ASRX - Fixed Version
|
||||
使用魯棒的聚類算法
|
||||
SelfASRXFixed - 7 步 Hybrid Speaker Diarization Pipeline
|
||||
|
||||
Pipeline:
|
||||
1. whisper.transcribe(full_audio) → rough segments + text + language
|
||||
2. VAD scan each rough segment → refined segments
|
||||
3. whisper per refined segment → {text, language, lang_prob}
|
||||
4. ECAPA-TDNN per refined segment → 192-dim embeddings
|
||||
5. AgglomerativeClustering → speaker_labels
|
||||
6. Store all embeddings in Qdrant (payload: file_uuid, speaker_id, text, ...)
|
||||
7. High-quality embeddings → gender classify + store reference in Qdrant
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from urllib.request import Request, urlopen
|
||||
from urllib.error import URLError
|
||||
|
||||
# 導入自定義模組
|
||||
from vad import load_vad_model, extract_speech_segments
|
||||
from speaker_encoder import (
|
||||
load_speaker_encoder,
|
||||
extract_speaker_embeddings_batch,
|
||||
normalize_embeddings
|
||||
)
|
||||
from speaker_cluster_fixed import robust_speaker_clustering
|
||||
|
||||
def _load_audio(path):
    """Read an audio file with soundfile and return ``(samples, sample_rate)``.

    Input with more than one dimension is reduced to a single channel by
    averaging along axis 1.
    """
    import soundfile as sf

    samples, rate = sf.read(path)
    is_multichannel = len(samples.shape) > 1
    if is_multichannel:
        samples = np.mean(samples, axis=1)
    return samples, rate
|
||||
|
||||
|
||||
def _load_whisper_model(size="small"):
    """Import ``whisper_local`` on demand and return ``load_model(size)``."""
    from whisper_local import load_model as load_whisper

    model = load_whisper(size)
    return model
|
||||
|
||||
|
||||
def _load_vad():
    """Import ``vad`` on demand and return whatever ``load_vad_model()`` produces."""
    from vad import load_vad_model as load_vad

    loaded = load_vad()
    return loaded
|
||||
|
||||
|
||||
def _load_speaker_encoder():
    """Import ``speaker_encoder`` on demand and return ``load_speaker_encoder()``."""
    from speaker_encoder import load_speaker_encoder as load_encoder

    encoder = load_encoder()
    return encoder
|
||||
|
||||
|
||||
def _load_gender_classifier():
    """Load the SpeechBrain gender classifier on CPU, or return ``None``.

    Any failure (missing package, model cannot be loaded, ...) is reported on
    stdout and turned into a ``None`` return so the caller can run without a
    gender classifier.
    """
    model = None
    try:
        from speechbrain.inference.classifiers import EncoderClassifier

        model = EncoderClassifier.from_hparams(
            source="speechbrain/gender-recognition-ecapa",
            run_opts={"device": "cpu"},
        )
        print("[Gender] Classifier loaded: speechbrain/gender-recognition-ecapa")
    except Exception as exc:
        print(f"[Gender] Classifier not available: {exc}")
        model = None
    return model
|
||||
|
||||
|
||||
def _ensure_speaker_collection(qdrant_url, api_key, collection):
    """Make sure the Qdrant speaker collection exists, creating it on a 404.

    A GET on ``{qdrant_url}/collections/{collection}`` is tried first. When
    the server answers 404 the collection is created with a PUT using
    192-dimensional vectors and cosine distance.

    Args:
        qdrant_url: Base URL of the Qdrant HTTP API.
        api_key: Sent as the ``api-key`` header when non-empty.
        collection: Collection name.

    Returns:
        True when the collection exists or was created; False on any error
        (the error is printed, never raised).
    """
    auth_headers = {"api-key": api_key} if api_key else {}
    try:
        url = f"{qdrant_url}/collections/{collection}"
        req = Request(url, method="GET", headers=auth_headers)
        try:
            # Close the response so the socket is released.
            with urlopen(req):
                return True
        except URLError as e:
            if getattr(e, "code", None) != 404:
                raise
            # A 404 HTTPError wraps the response; release it as well.
            close = getattr(e, "close", None)
            if callable(close):
                close()

        body = json.dumps({
            "vectors": {
                "size": 192,
                "distance": "Cosine"
            }
        }).encode()
        req = Request(url, data=body, method="PUT",
                      headers={"Content-Type": "application/json",
                               **auth_headers})
        with urlopen(req):
            pass
        print(f"[Qdrant] Created collection: {collection} (dim=192)")
        return True
    except Exception as e:
        # Best effort: report the problem and let the caller continue.
        print(f"[Qdrant] Cannot access Qdrant: {e}")
        return False
|
||||
|
||||
|
||||
def _qdrant_upsert(qdrant_url, api_key, collection, points):
    """Write a batch of points to a Qdrant collection with one PUT request.

    Args:
        qdrant_url: Base URL of the Qdrant HTTP API.
        api_key: Sent as the ``api-key`` header when non-empty.
        collection: Target collection name.
        points: JSON-serializable list sent as the ``points`` field.

    Returns:
        True when the request succeeded; False on any error (the error is
        printed, never raised).
    """
    try:
        url = f"{qdrant_url}/collections/{collection}/points?wait=true"
        body = json.dumps({"points": points}).encode()
        headers = {"Content-Type": "application/json"}
        if api_key:
            headers["api-key"] = api_key
        req = Request(url, data=body, headers=headers, method="PUT")
        # Close the response so the socket is released.
        with urlopen(req):
            pass
        return True
    except Exception as e:
        # Best effort: report the problem and let the caller continue.
        print(f"[Qdrant] Upsert failed: {e}")
        return False
|
||||
|
||||
|
||||
def _hash_point_id(file_uuid, label):
    """Return a point ID that is stable across processes for ``(file_uuid, label)``.

    The built-in ``hash()`` is salted per interpreter run for strings, so it
    cannot provide a reproducible ID. The ID is instead taken from the first
    eight bytes of the SHA-256 digest of ``"{file_uuid}_{label}"`` and masked
    to 63 bits, keeping it a non-negative integer below ``2**63``.

    Args:
        file_uuid: Identifier of the processed file.
        label: Per-file label (formatted with ``str``).

    Returns:
        Integer in ``[0, 2**63)``.
    """
    import hashlib

    key = f"{file_uuid}_{label}".encode("utf-8")
    digest = hashlib.sha256(key).digest()
    return int.from_bytes(digest[:8], "big") & 0x7FFFFFFFFFFFFFFF
|
||||
|
||||
|
||||
def _save_checkpoint(path: str, data: dict):
    """Write ``data`` as JSON to ``path`` via a sibling ``.tmp`` file.

    The JSON is first written to ``path + ".tmp"`` (creating the parent
    directory when needed) and then moved over ``path`` with ``os.replace``,
    so ``path`` is only ever swapped for a fully written file.
    """
    staging_path = path + ".tmp"
    staging_dir = Path(staging_path).parent
    staging_dir.mkdir(parents=True, exist_ok=True)

    with open(staging_path, "w", encoding="utf-8") as staging_file:
        json.dump(data, staging_file, indent=2, ensure_ascii=False)

    os.replace(staging_path, path)
|
||||
|
||||
|
||||
def compute_embedding_quality(embeddings, labels):
    """Cosine similarity of every embedding to the centroid of its own cluster.

    Args:
        embeddings: Array-like with one embedding per row.
        labels: Array-like cluster labels, one per embedding (a plain list is
            accepted as well as a NumPy array). Must have the same length as
            ``embeddings``.

    Returns:
        1-D NumPy array with one similarity per embedding, in input order.
        An embedding or centroid with zero norm yields a similarity of 0.
    """
    embeddings = np.asarray(embeddings)
    if not np.issubdtype(embeddings.dtype, np.floating):
        embeddings = embeddings.astype(float)
    # Converting here makes `labels == label` an element-wise mask even when
    # the caller passes a list.
    labels = np.asarray(labels)

    qualities = np.zeros(len(embeddings), dtype=embeddings.dtype)
    if len(embeddings) == 0:
        return qualities

    emb_norms = np.linalg.norm(embeddings, axis=1)

    # One vectorized pass per cluster instead of one call per embedding.
    for label in np.unique(labels):
        mask = labels == label
        members = embeddings[mask]
        centroid = members.mean(axis=0)

        dots = members @ centroid
        denom = emb_norms[mask] * np.linalg.norm(centroid)

        sims = np.zeros_like(dots)
        # Leave 0 where either vector has zero norm instead of dividing by 0.
        np.divide(dots, denom, out=sims, where=denom > 0)
        qualities[mask] = sims

    return qualities
|
||||
|
||||
|
||||
class SelfASRXFixed:
|
||||
"""自實作說話人分離系統(修復版)"""
|
||||
|
||||
"""7 步 Hybrid Speaker Diarization Pipeline"""
|
||||
|
||||
def __init__(self):
|
||||
print("[SelfASRX-Fixed] Initializing models...")
|
||||
|
||||
# 載入 VAD 模型
|
||||
print("[SelfASRX-Fixed] Loading VAD model (Silero)...")
|
||||
self.vad_model, self.vad_utils = load_vad_model()
|
||||
|
||||
# 載入聲紋模型
|
||||
print("[SelfASRX-Fixed] Loading speaker encoder (ECAPA-TDNN)...")
|
||||
self.speaker_encoder = load_speaker_encoder()
|
||||
|
||||
print("[SelfASRX-Fixed] Models loaded successfully")
|
||||
|
||||
def process(self, audio_path, output_path=None,
|
||||
min_speech_duration_ms=500,
|
||||
n_speakers=None,
|
||||
max_speakers=10):
|
||||
"""處理音頻文件"""
|
||||
start_time = time.time()
|
||||
print(f"\n[SelfASRX-Fixed] Processing: {audio_path}")
|
||||
print("=" * 60)
|
||||
|
||||
# 步驟 1: VAD
|
||||
print("\n[Step 1] Voice Activity Detection...")
|
||||
step1_start = time.time()
|
||||
|
||||
speech_segments, wav, sample_rate = extract_speech_segments(
|
||||
audio_path, self.vad_model, self.vad_utils,
|
||||
min_speech_duration_ms=min_speech_duration_ms
|
||||
)
|
||||
|
||||
step1_time = time.time() - step1_start
|
||||
print(f" Speech segments: {len(speech_segments)}")
|
||||
print(f" Total duration: {len(wav)/sample_rate:.2f}s")
|
||||
print(f" VAD time: {step1_time:.2f}s")
|
||||
|
||||
if len(speech_segments) == 0:
|
||||
print("[SelfASRX-Fixed] No speech detected!")
|
||||
return {"error": "No speech detected", "segments": []}
|
||||
|
||||
# 步驟 2: 聲紋特徵提取
|
||||
print("\n[Step 2] Speaker embedding extraction...")
|
||||
step2_start = time.time()
|
||||
|
||||
# 提取語音片段音頻
|
||||
audio_segments = []
|
||||
for start_sec, end_sec in speech_segments:
|
||||
start_sample = int(start_sec * sample_rate)
|
||||
end_sample = int(end_sec * sample_rate)
|
||||
audio_segments.append(wav[start_sample:end_sample])
|
||||
|
||||
# 批量提取嵌入
|
||||
embeddings = extract_speaker_embeddings_batch(
|
||||
self.speaker_encoder, audio_segments, sample_rate
|
||||
)
|
||||
|
||||
# 正規化
|
||||
embeddings = normalize_embeddings(embeddings)
|
||||
|
||||
step2_time = time.time() - step2_start
|
||||
print(f" Embedding shape: {embeddings.shape}")
|
||||
print(f" Embedding time: {step2_time:.2f}s")
|
||||
|
||||
# 步驟 3: 魯棒聚類
|
||||
print("\n[Step 3] Robust speaker clustering...")
|
||||
step3_start = time.time()
|
||||
|
||||
speaker_labels, estimated_n_speakers = robust_speaker_clustering(
|
||||
embeddings,
|
||||
n_speakers=n_speakers,
|
||||
max_speakers=max_speakers
|
||||
)
|
||||
|
||||
step3_time = time.time() - step3_start
|
||||
print(f" Clustering time: {step3_time:.2f}s")
|
||||
|
||||
# 步驟 4: 建立輸出
|
||||
print("\n[Step 4] Building output...")
|
||||
|
||||
result = {
|
||||
"audio_path": str(audio_path),
|
||||
"total_duration": len(wav) / sample_rate,
|
||||
"n_speech_segments": len(speech_segments),
|
||||
"n_speakers": int(estimated_n_speakers),
|
||||
"segments": []
|
||||
}
|
||||
|
||||
for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
|
||||
result["segments"].append({
|
||||
"index": i,
|
||||
"start": round(start, 3),
|
||||
"end": round(end, 3),
|
||||
"duration": round(end - start, 3),
|
||||
"speaker": f"SPEAKER_{int(label)}"
|
||||
})
|
||||
|
||||
# 統計每個說話人的總時長
|
||||
speaker_stats = {}
|
||||
for seg in result["segments"]:
|
||||
speaker = seg["speaker"]
|
||||
if speaker not in speaker_stats:
|
||||
speaker_stats[speaker] = {"count": 0, "duration": 0}
|
||||
speaker_stats[speaker]["count"] += 1
|
||||
speaker_stats[speaker]["duration"] += seg["duration"]
|
||||
|
||||
result["speaker_stats"] = speaker_stats
|
||||
|
||||
total_time = time.time() - start_time
|
||||
result["processing_time"] = round(total_time, 2)
|
||||
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
|
||||
|
||||
print("\n[SelfASRX-Fixed] Processing completed!")
|
||||
print(f" Total time: {total_time:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
print(f" Detected speakers: {estimated_n_speakers}")
|
||||
|
||||
# 保存結果
|
||||
if output_path:
|
||||
output_path = Path(output_path)
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f" Results saved to: {output_path}")
|
||||
|
||||
print("=" * 60)
|
||||
|
||||
return result
|
||||
print("[SelfASRX] Initializing models...")
|
||||
|
||||
print("[SelfASRX] Loading whisper model...")
|
||||
self.whisper = _load_whisper_model("small")
|
||||
|
||||
print("[SelfASRX] Loading VAD model (Silero)...")
|
||||
self.vad_model, self.vad_utils = _load_vad()
|
||||
|
||||
print("[SelfASRX] Loading speaker encoder (ECAPA-TDNN)...")
|
||||
self.speaker_encoder = _load_speaker_encoder()
|
||||
|
||||
print("[SelfASRX] Loading gender classifier...")
|
||||
self.gender_classifier = _load_gender_classifier()
|
||||
|
||||
# Qdrant 設定
|
||||
self.qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333")
|
||||
self.qdrant_api_key = os.environ.get("QDRANT_API_KEY", "")
|
||||
schema = os.environ.get("DATABASE_SCHEMA", "public")
|
||||
self.qdrant_collection = os.environ.get(
|
||||
"QDRANT_SPEAKER_COLLECTION",
|
||||
f"momentry_{schema}_speaker"
|
||||
)
|
||||
self._qdrant_ok = False
|
||||
|
||||
print("[SelfASRX] Models loaded successfully")
|
||||
|
||||
def process(self, audio_path, output_path=None, file_uuid=None,
|
||||
max_speakers=10, quality_threshold=0.85,
|
||||
checkpoint_path=None):
|
||||
"""7 步 speaker diarization pipeline
|
||||
|
||||
def process_with_segments(self, audio_path, asr_segments, output_path=None):
|
||||
"""
|
||||
使用 ASR segment 邊界進行 speaker diarization,取代 VAD 步驟。
|
||||
|
||||
Args:
|
||||
audio_path: 音頻文件路徑(WAV)
|
||||
asr_segments: ASR segment 列表,每個包含 start/end(秒)
|
||||
output_path: 輸出 JSON 路徑(可選)
|
||||
audio_path: 音頻文件路徑 (WAV 16kHz mono)
|
||||
output_path: 輸出 JSON 路徑 (可選)
|
||||
file_uuid: 檔案 UUID (用於 Qdrant 儲存)
|
||||
max_speakers: 最大說話人數
|
||||
quality_threshold: 高品質聲紋門檻 (0-1)
|
||||
checkpoint_path: Step 3 完成後儲存 checkpoint 路徑
|
||||
|
||||
Returns:
|
||||
dict: segments, speaker_stats, n_speakers, total_duration, references
|
||||
"""
|
||||
start_time = time.time()
|
||||
print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
|
||||
print(f"\n[SelfASRX] Processing: {audio_path}")
|
||||
print("=" * 60)
|
||||
|
||||
# 載入完整音頻
|
||||
import soundfile as sf
|
||||
wav, sample_rate = sf.read(audio_path)
|
||||
if len(wav.shape) > 1:
|
||||
wav = np.mean(wav, axis=1) # 轉 mono
|
||||
print(f" Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")
|
||||
# 載入音頻
|
||||
wav, sample_rate = _load_audio(audio_path)
|
||||
total_duration = len(wav) / sample_rate
|
||||
print(f" Audio: {total_duration:.2f}s, {sample_rate}Hz")
|
||||
|
||||
# 使用 ASR segments 取代 VAD (audio处理用time)
|
||||
speech_segments = [(s["start_time"], s["end_time"]) for s in asr_segments]
|
||||
print(f" Speech segments from ASR: {len(speech_segments)}")
|
||||
# ── Step 1: whisper 粗略定位 (faster-whisper) ──
|
||||
print("\n[Step 1] Initial whisper transcription...")
|
||||
t1 = time.time()
|
||||
seg_gen, info = self.whisper.transcribe(audio_path)
|
||||
rough_segments = []
|
||||
for seg in seg_gen:
|
||||
rough_segments.append({"start": seg.start, "end": seg.end, "text": seg.text})
|
||||
language = info.language if info else None
|
||||
print(f" Rough segments: {len(rough_segments)}")
|
||||
print(f" Language: {language}")
|
||||
print(f" Step 1 time: {time.time() - t1:.2f}s")
|
||||
|
||||
if len(speech_segments) == 0:
|
||||
print("[SelfASRX-Fixed] No ASR segments provided!")
|
||||
return {"error": "No ASR segments", "segments": []}
|
||||
if not rough_segments:
|
||||
print("[SelfASRX] No speech detected by whisper!")
|
||||
return {"error": "No speech detected", "segments": []}
|
||||
|
||||
# 提取語音片段
|
||||
audio_segments = []
|
||||
for start_sec, end_sec in speech_segments:
|
||||
start_sample = int(start_sec * sample_rate)
|
||||
end_sample = int(end_sec * sample_rate)
|
||||
if start_sample >= len(wav):
|
||||
# ── Step 2: VAD scan 每個 rough segment 細切 ──
|
||||
print("\n[Step 2] VAD scan for refined segmentation...")
|
||||
t2 = time.time()
|
||||
refined_segments = []
|
||||
for seg in rough_segments:
|
||||
s = seg["start"]
|
||||
e = seg["end"]
|
||||
sub = self._vad_scan_segment(wav, sample_rate, s, e)
|
||||
if sub:
|
||||
refined_segments.extend(sub)
|
||||
else:
|
||||
refined_segments.append((s, e))
|
||||
print(f" Refined segments: {len(refined_segments)}")
|
||||
print(f" Step 2 time: {time.time() - t2:.2f}s")
|
||||
|
||||
if not refined_segments:
|
||||
return {"error": "No segments after VAD scan", "segments": []}
|
||||
|
||||
# ── Step 3: whisper per refined segment ──
|
||||
print("\n[Step 3] Per-segment transcription...")
|
||||
t3 = time.time()
|
||||
CHECKPOINT_INTERVAL = 50
|
||||
|
||||
segment_texts = []
|
||||
resume_from = 0
|
||||
|
||||
# 載入既有 partial checkpoint(中斷續接)
|
||||
if checkpoint_path and os.path.exists(checkpoint_path):
|
||||
try:
|
||||
with open(checkpoint_path, "r") as f:
|
||||
cp = json.load(f)
|
||||
if cp.get("checkpoint_version") == 2 and not cp.get("step3_completed"):
|
||||
saved = cp.get("segment_texts", [])
|
||||
if saved:
|
||||
resume_from = len(saved)
|
||||
segment_texts = saved
|
||||
print(f"[Step 3] Resuming from #{resume_from}/{len(refined_segments)}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
for i, (start_sec, end_sec) in enumerate(refined_segments):
|
||||
if i < resume_from:
|
||||
continue
|
||||
audio_segments.append(wav[start_sample:min(end_sample, len(wav))])
|
||||
seg_text = self._transcribe_segment(wav, sample_rate, start_sec, end_sec)
|
||||
segment_texts.append(seg_text)
|
||||
|
||||
print(f" Audio segments extracted: {len(audio_segments)}")
|
||||
if checkpoint_path and (i + 1) % CHECKPOINT_INTERVAL == 0:
|
||||
_save_checkpoint(checkpoint_path, {
|
||||
"checkpoint_version": 2,
|
||||
"step3_completed": False,
|
||||
"step3_progress": i + 1,
|
||||
"language": language,
|
||||
"total_duration": total_duration,
|
||||
"refined_segments": [[s, e] for s, e in refined_segments],
|
||||
"segment_texts": [{
|
||||
"text": st["text"],
|
||||
"language": st["language"],
|
||||
"lang_prob": st["lang_prob"],
|
||||
} for st in segment_texts],
|
||||
"file_uuid": file_uuid,
|
||||
"max_speakers": max_speakers,
|
||||
"quality_threshold": quality_threshold,
|
||||
})
|
||||
print(f"[Checkpoint] Step 3: {i+1}/{len(refined_segments)}")
|
||||
|
||||
# 批量提取聲紋嵌入
|
||||
print("\n[Step 2] Speaker embedding extraction...")
|
||||
step2_start = time.time()
|
||||
print(f" Step 3 time: {time.time() - t3:.2f}s")
|
||||
|
||||
# ── Save final checkpoint after Step 3 ──
|
||||
if checkpoint_path:
|
||||
_save_checkpoint(checkpoint_path, {
|
||||
"checkpoint_version": 2,
|
||||
"step3_completed": True,
|
||||
"language": language,
|
||||
"total_duration": total_duration,
|
||||
"refined_segments": [[s, e] for s, e in refined_segments],
|
||||
"segment_texts": [{
|
||||
"text": st["text"],
|
||||
"language": st["language"],
|
||||
"lang_prob": st["lang_prob"],
|
||||
} for st in segment_texts],
|
||||
"file_uuid": file_uuid,
|
||||
"max_speakers": max_speakers,
|
||||
"quality_threshold": quality_threshold,
|
||||
})
|
||||
print(f"[Checkpoint] Step 3 complete, saved to {checkpoint_path}")
|
||||
|
||||
# ── Step 4: ECAPA-TDNN per refined segment ──
|
||||
print("\n[Step 4] Speaker embedding extraction...")
|
||||
t4 = time.time()
|
||||
audio_segments = []
|
||||
for start_sec, end_sec in refined_segments:
|
||||
s = int(start_sec * sample_rate)
|
||||
e = int(end_sec * sample_rate)
|
||||
audio_segments.append(wav[s:min(e, len(wav))])
|
||||
|
||||
from speaker_encoder import extract_speaker_embeddings_batch, normalize_embeddings
|
||||
embeddings = extract_speaker_embeddings_batch(
|
||||
self.speaker_encoder, audio_segments, sample_rate
|
||||
)
|
||||
embeddings = normalize_embeddings(embeddings)
|
||||
step2_time = time.time() - step2_start
|
||||
print(f" Embedding shape: {embeddings.shape}")
|
||||
print(f" Embedding time: {step2_time:.2f}s")
|
||||
print(f" Embeddings: {embeddings.shape}")
|
||||
print(f" Step 4 time: {time.time() - t4:.2f}s")
|
||||
|
||||
# 聚類
|
||||
print("\n[Step 3] Robust speaker clustering...")
|
||||
step3_start = time.time()
|
||||
# ── Step 5: AgglomerativeClustering ──
|
||||
print("\n[Step 5] Speaker clustering...")
|
||||
t5 = time.time()
|
||||
from speaker_cluster_fixed import robust_speaker_clustering
|
||||
speaker_labels, estimated_n_speakers = robust_speaker_clustering(
|
||||
embeddings, n_speakers=None, max_speakers=10
|
||||
embeddings, n_speakers=None, max_speakers=max_speakers
|
||||
)
|
||||
step3_time = time.time() - step3_start
|
||||
print(f" Clustering time: {step3_time:.2f}s")
|
||||
print(f" Speakers: {estimated_n_speakers}")
|
||||
print(f" Step 5 time: {time.time() - t5:.2f}s")
|
||||
|
||||
# 建立輸出
|
||||
result = {
|
||||
"audio_path": str(audio_path),
|
||||
"total_duration": len(wav) / sample_rate,
|
||||
"n_speech_segments": len(speech_segments),
|
||||
"n_speakers": int(estimated_n_speakers),
|
||||
"segments": []
|
||||
}
|
||||
# 品質計算
|
||||
qualities = compute_embedding_quality(embeddings, speaker_labels)
|
||||
|
||||
for i, ((start, end), label) in enumerate(zip(speech_segments, speaker_labels)):
|
||||
result["segments"].append({
|
||||
"index": i,
|
||||
"start": round(start, 3),
|
||||
"end": round(end, 3),
|
||||
"duration": round(end - start, 3),
|
||||
"speaker": f"SPEAKER_{int(label)}"
|
||||
})
|
||||
|
||||
# 加入 embeddings(每個 segment 對應的 192-D speaker embedding)
|
||||
result["embeddings"] = []
|
||||
for emb in embeddings:
|
||||
result["embeddings"].append(emb.tolist())
|
||||
# 建立輸出 segments
|
||||
segments = []
|
||||
for i, ((start_sec, end_sec), label) in enumerate(
|
||||
zip(refined_segments, speaker_labels)):
|
||||
seg = {
|
||||
"start": round(start_sec, 3),
|
||||
"end": round(end_sec, 3),
|
||||
"start_frame": int(start_sec * 30),
|
||||
"end_frame": int(end_sec * 30),
|
||||
"text": segment_texts[i]["text"],
|
||||
"language": segment_texts[i]["language"],
|
||||
"lang_prob": segment_texts[i]["lang_prob"],
|
||||
"speaker": f"SPEAKER_{int(label)}",
|
||||
"speaker_id": f"SPEAKER_{int(label)}",
|
||||
"quality": float(qualities[i]),
|
||||
}
|
||||
segments.append(seg)
|
||||
|
||||
# 統計
|
||||
speaker_stats = {}
|
||||
for seg in result["segments"]:
|
||||
speaker = seg["speaker"]
|
||||
if speaker not in speaker_stats:
|
||||
speaker_stats[speaker] = {"count": 0, "duration": 0}
|
||||
speaker_stats[speaker]["count"] += 1
|
||||
speaker_stats[speaker]["duration"] += seg["duration"]
|
||||
result["speaker_stats"] = speaker_stats
|
||||
for seg in segments:
|
||||
spk = seg["speaker_id"]
|
||||
dur = seg["end"] - seg["start"]
|
||||
if spk not in speaker_stats:
|
||||
speaker_stats[spk] = {"count": 0, "duration": 0}
|
||||
speaker_stats[spk]["count"] += 1
|
||||
speaker_stats[spk]["duration"] += dur
|
||||
|
||||
result = {
|
||||
"language": language or "",
|
||||
"segments": segments,
|
||||
"n_speakers": int(estimated_n_speakers),
|
||||
"speaker_stats": speaker_stats,
|
||||
"total_duration": total_duration,
|
||||
"n_segments": len(segments),
|
||||
}
|
||||
|
||||
# ── Step 6: Store embeddings in Qdrant ──
|
||||
if file_uuid:
|
||||
print("\n[Step 6] Storing embeddings in Qdrant...")
|
||||
t6 = time.time()
|
||||
self._store_speaker_embeddings(segments, embeddings, speaker_labels,
|
||||
file_uuid)
|
||||
print(f" Step 6 time: {time.time() - t6:.2f}s")
|
||||
|
||||
# ── Step 7: High-quality classification ──
|
||||
if file_uuid:
|
||||
print("\n[Step 7] Classifying high-quality embeddings...")
|
||||
t7 = time.time()
|
||||
references = self._classify_high_quality_speakers(
|
||||
segments, embeddings, speaker_labels, file_uuid,
|
||||
wav, sample_rate, quality_threshold
|
||||
)
|
||||
if references:
|
||||
result["references"] = references
|
||||
print(f" Step 7 time: {time.time() - t7:.2f}s")
|
||||
|
||||
total_time = time.time() - start_time
|
||||
result["processing_time"] = round(total_time, 2)
|
||||
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
|
||||
|
||||
print("\n[SelfASRX-Fixed] Processing completed!")
|
||||
print(f" Total time: {total_time:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
print(f" Detected speakers: {estimated_n_speakers}")
|
||||
if total_duration > 0:
|
||||
result["realtime_factor"] = round(total_duration / total_time, 2)
|
||||
|
||||
# 保存輸出
|
||||
if output_path:
|
||||
import json
|
||||
with open(output_path, 'w', encoding='utf-8') as f:
|
||||
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(output_path, "w", encoding="utf-8") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
print(f" Results saved to: {output_path}")
|
||||
print(f"\n[SelfASRX] Saved to: {output_path}")
|
||||
|
||||
print(f"\n[SelfASRX] Done! {len(segments)} segments, "
|
||||
f"{estimated_n_speakers} speakers, "
|
||||
f"{total_time:.2f}s")
|
||||
|
||||
print("=" * 60)
|
||||
return result
|
||||
|
||||
def resume_from_checkpoint(self, checkpoint_path, audio_path,
                           output_path=None):
    """Load the Step 1-3 results from a checkpoint and run Steps 4-7.

    Args:
        checkpoint_path: JSON checkpoint file. It must carry
            ``step3_completed`` plus ``refined_segments`` and
            ``segment_texts``.
        audio_path: Audio file to reload with ``_load_audio``; the
            waveform itself is not stored in the checkpoint.
        output_path: Optional path the result JSON is written to.

    Returns:
        dict: The diarization result (``language``, ``segments``,
        ``n_speakers``, ``speaker_stats``, ``total_duration``,
        ``n_segments``, ``processing_time`` and, when available,
        ``references`` / ``realtime_factor``). If the checkpoint is
        unusable, ``{"error": <message>, "segments": []}`` is returned.
    """
    print(f"\n[SelfASRX] Resuming from checkpoint: {checkpoint_path}")
    print("=" * 60)

    with open(checkpoint_path, "r", encoding="utf-8") as f:
        cp = json.load(f)

    if not cp.get("step3_completed"):
        error_msg = f"Checkpoint step3 not completed (progress: {cp.get('step3_progress', '?')})"
        print(f"[SelfASRX] {error_msg}")
        return {"error": error_msg, "segments": []}

    refined_segments = [tuple(s) for s in cp["refined_segments"]]
    segment_texts = cp["segment_texts"]

    # Every segment is later paired with segment_texts[i]; a short list
    # would only fail with IndexError after the expensive embedding and
    # clustering steps, so reject it before any heavy work is done.
    if len(segment_texts) < len(refined_segments):
        error_msg = (f"Checkpoint inconsistent: {len(segment_texts)} texts "
                     f"for {len(refined_segments)} segments")
        print(f"[SelfASRX] {error_msg}")
        return {"error": error_msg, "segments": []}

    wav, sample_rate = _load_audio(audio_path)
    language = cp.get("language", "")
    total_duration = cp.get("total_duration", 0)
    file_uuid = cp.get("file_uuid")
    max_speakers = cp.get("max_speakers", 10)
    quality_threshold = cp.get("quality_threshold", 0.85)

    print(f" Loaded checkpoint: {len(refined_segments)} segments, "
          f"language={language}, duration={total_duration:.2f}s")

    start_time = time.time()

    # ── Step 4: ECAPA-TDNN per refined segment ──
    print("\n[Step 4] Speaker embedding extraction...")
    t4 = time.time()
    audio_segments = []
    for start_sec, end_sec in refined_segments:
        s = int(start_sec * sample_rate)
        e = int(end_sec * sample_rate)
        # Clamp the end so a segment running past the audio is truncated.
        audio_segments.append(wav[s:min(e, len(wav))])

    from speaker_encoder import extract_speaker_embeddings_batch, normalize_embeddings
    embeddings = extract_speaker_embeddings_batch(
        self.speaker_encoder, audio_segments, sample_rate
    )
    embeddings = normalize_embeddings(embeddings)
    print(f" Embeddings: {embeddings.shape}")
    print(f" Step 4 time: {time.time() - t4:.2f}s")

    # ── Step 5: AgglomerativeClustering ──
    print("\n[Step 5] Speaker clustering...")
    t5 = time.time()
    from speaker_cluster_fixed import robust_speaker_clustering
    speaker_labels, estimated_n_speakers = robust_speaker_clustering(
        embeddings, n_speakers=None, max_speakers=max_speakers
    )
    print(f" Speakers: {estimated_n_speakers}")
    print(f" Step 5 time: {time.time() - t5:.2f}s")

    # Per-segment quality score.
    qualities = compute_embedding_quality(embeddings, speaker_labels)

    # Build the output segments.
    segments = []
    for i, ((start_sec, end_sec), label) in enumerate(
            zip(refined_segments, speaker_labels)):
        seg = {
            "start": round(start_sec, 3),
            "end": round(end_sec, 3),
            # Frame indices are derived with a fixed factor of 30.
            "start_frame": int(start_sec * 30),
            "end_frame": int(end_sec * 30),
            "text": segment_texts[i]["text"],
            "language": segment_texts[i]["language"],
            "lang_prob": segment_texts[i]["lang_prob"],
            "speaker": f"SPEAKER_{int(label)}",
            "speaker_id": f"SPEAKER_{int(label)}",
            "quality": float(qualities[i]),
        }
        segments.append(seg)

    # Per-speaker statistics.
    speaker_stats = {}
    for seg in segments:
        spk = seg["speaker_id"]
        dur = seg["end"] - seg["start"]
        if spk not in speaker_stats:
            speaker_stats[spk] = {"count": 0, "duration": 0}
        speaker_stats[spk]["count"] += 1
        speaker_stats[spk]["duration"] += dur

    result = {
        "language": language or "",
        "segments": segments,
        "n_speakers": int(estimated_n_speakers),
        "speaker_stats": speaker_stats,
        "total_duration": total_duration,
        "n_segments": len(segments),
    }

    # ── Step 6: Store embeddings in Qdrant ──
    if file_uuid:
        print("\n[Step 6] Storing embeddings in Qdrant...")
        t6 = time.time()
        self._store_speaker_embeddings(segments, embeddings, speaker_labels,
                                       file_uuid)
        print(f" Step 6 time: {time.time() - t6:.2f}s")

    # ── Step 7: High-quality classification ──
    if file_uuid:
        print("\n[Step 7] Classifying high-quality embeddings...")
        t7 = time.time()
        references = self._classify_high_quality_speakers(
            segments, embeddings, speaker_labels, file_uuid,
            wav, sample_rate, quality_threshold
        )
        if references:
            result["references"] = references
        print(f" Step 7 time: {time.time() - t7:.2f}s")

    total_time = time.time() - start_time
    result["processing_time"] = round(total_time, 2)
    # Skip the ratio when either term is zero to avoid dividing by zero.
    if total_duration > 0 and total_time > 0:
        result["realtime_factor"] = round(total_duration / total_time, 2)

    # Persist the result.
    if output_path:
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"\n[SelfASRX] Saved to: {output_path}")

    print(f"\n[SelfASRX] Done! {len(segments)} segments, "
          f"{estimated_n_speakers} speakers, "
          f"{total_time:.2f}s")

    return result
|
||||
|
||||
# ── Internal helpers ──
|
||||
|
||||
def _vad_scan_segment(self, wav, sample_rate, start_sec, end_sec):
    """Run the VAD scan over a single segment.

    Delegates to ``vad.scan_within_segment`` with this instance's
    ``vad_model`` and ``vad_utils`` and returns its result unchanged.
    """
    import vad

    return vad.scan_within_segment(
        wav,
        sample_rate,
        start_sec,
        end_sec,
        self.vad_model,
        self.vad_utils,
    )
|
||||
|
||||
def _transcribe_segment(self, wav, sample_rate, start_sec, end_sec):
    """Transcribe a single segment.

    Delegates to ``whisper_local.transcribe_segment`` with this
    instance's ``whisper`` object and returns its result unchanged.
    """
    import whisper_local

    return whisper_local.transcribe_segment(
        wav, sample_rate, start_sec, end_sec, self.whisper
    )
|
||||
|
||||
def _store_speaker_embeddings(self, segments, embeddings, labels, file_uuid):
|
||||
"""Step 6: 所有 embedding 存入 Qdrant"""
|
||||
if not self._ensure_qdrant():
|
||||
return
|
||||
|
||||
points = []
|
||||
for i, (seg, emb, label) in enumerate(
|
||||
zip(segments, embeddings, labels)):
|
||||
point_id = _hash_point_id(file_uuid, f"{i}")
|
||||
points.append({
|
||||
"id": point_id,
|
||||
"vector": emb.tolist(),
|
||||
"payload": {
|
||||
"type": "speaker_embedding",
|
||||
"file_uuid": file_uuid,
|
||||
"speaker_id": seg["speaker_id"],
|
||||
"text": seg["text"],
|
||||
"language": seg["language"],
|
||||
"start_time": seg["start"],
|
||||
"end_time": seg["end"],
|
||||
}
|
||||
})
|
||||
|
||||
ok = _qdrant_upsert(self.qdrant_url, self.qdrant_api_key,
|
||||
self.qdrant_collection, points)
|
||||
if ok:
|
||||
print(f" Stored {len(points)} speaker embeddings to Qdrant")
|
||||
return ok
|
||||
|
||||
def _classify_high_quality_speakers(self, segments, embeddings, labels,
                                    file_uuid, wav, sample_rate,
                                    threshold=0.85):
    """Step 7: build one high-quality reference per speaker and upsert it.

    For every cluster label that has at least one embedding whose quality
    (from ``compute_embedding_quality``) is ``>= threshold``, the
    normalized centroid of those embeddings is upserted to Qdrant as a
    ``speaker_reference`` point. If ``self.gender_classifier`` is set,
    the audio of the best-quality segment is classified for gender.

    Args:
        segments: Segment dicts (``start``, ``end``, ``text``, ``language``).
        embeddings: One embedding per segment.
        labels: One cluster label per segment.
        file_uuid: Stored in the payload and used to derive point ids.
        wav: Audio samples the segment times index into.
        sample_rate: Samples per second of ``wav``.
        threshold: Minimum quality for an embedding to count.

    Returns:
        list[dict]: One summary per reference (``speaker_id``,
        ``n_segments``, ``avg_quality``, ``gender``); empty when no
        embedding reaches the threshold.
    """
    # The comparisons and boolean indexing below need ndarrays. With a
    # plain list, ``labels == label`` is a scalar False and every speaker
    # would be skipped silently.
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)

    qualities = compute_embedding_quality(embeddings, labels)
    high_mask = qualities >= threshold

    if not np.any(high_mask):
        print(" No high-quality embeddings found")
        return []

    references = []
    # np.unique yields sorted labels, so the reference order is stable.
    for label in np.unique(labels):
        mask = (labels == label) & high_mask
        if not np.any(mask):
            continue
        high_indices = [i for i in range(len(segments)) if mask[i]]
        high_segs = [segments[i] for i in high_indices]

        # Index of the best-quality segment for this speaker.
        best_idx = high_indices[int(np.argmax(qualities[mask]))]
        best_seg = segments[best_idx]

        centroid = np.mean(embeddings[mask], axis=0)
        norm = np.linalg.norm(centroid)
        if norm > 0:
            centroid = centroid / norm

        avg_quality = float(np.mean(qualities[mask]))
        speaker_id = f"SPEAKER_{int(label)}"
        text_samples = [s["text"] for s in high_segs[:5] if s["text"]]
        total_dur = sum(s["end"] - s["start"] for s in high_segs)

        ref_id = _hash_point_id(file_uuid, f"ref_{label}")
        ref_payload = {
            "type": "speaker_reference",
            "file_uuid": file_uuid,
            "speaker_id": speaker_id,
            "n_segments": int(np.sum(mask)),
            "avg_quality": avg_quality,
            "total_duration": round(total_dur, 2),
            "language": best_seg.get("language", ""),
            "text_samples": text_samples,
        }

        # Gender classification on the audio of the best segment.
        if self.gender_classifier is not None:
            try:
                import torch
                first = int(best_seg["start"] * sample_rate)
                last = int(best_seg["end"] * sample_rate)
                seg_wav = wav[first:min(last, len(wav))]
                seg_tensor = torch.from_numpy(seg_wav).float().unsqueeze(0)
                out = self.gender_classifier.classify_batch(seg_tensor)
                probs = torch.softmax(out[0], dim=-1).squeeze().cpu().detach().numpy()
                if len(probs) >= 2:
                    idx = int(np.argmax(probs))
                    # Class index 0 is reported as male, anything else as female.
                    ref_payload["gender"] = "male" if idx == 0 else "female"
                    ref_payload["gender_conf"] = float(probs[idx])
                else:
                    ref_payload["gender"] = "unknown"
                    ref_payload["gender_conf"] = 0.0
            except Exception as exc:
                # Best effort: a classifier failure must not abort Step 7.
                print(f"[Gender] Classify error: {exc}")
                ref_payload["gender"] = "unknown"
                ref_payload["gender_conf"] = 0.0
        else:
            ref_payload["gender"] = "unknown"
            ref_payload["gender_conf"] = 0.0

        stored = _qdrant_upsert(self.qdrant_url, self.qdrant_api_key,
                                self.qdrant_collection, [{
                                    "id": ref_id,
                                    "vector": centroid.tolist(),
                                    "payload": ref_payload,
                                }])
        if not stored:
            # The reference is still reported to the caller, but make the
            # failed write visible instead of ignoring it.
            print(f" Warning: failed to store reference for {speaker_id}")

        references.append({
            "speaker_id": speaker_id,
            "n_segments": int(np.sum(mask)),
            "avg_quality": avg_quality,
            "gender": ref_payload["gender"],
        })

        print(f" Ref: {speaker_id}, gender={ref_payload['gender']}"
              f" ({ref_payload['gender_conf']:.2f}), q={avg_quality:.3f}")

    return references
|
||||
|
||||
def _ensure_qdrant(self):
|
||||
"""確保 Qdrant collection 可用"""
|
||||
if not self._qdrant_ok:
|
||||
ok = _ensure_speaker_collection(
|
||||
self.qdrant_url, self.qdrant_api_key, self.qdrant_collection
|
||||
)
|
||||
self._qdrant_ok = ok
|
||||
return self._qdrant_ok
|
||||
|
||||
|
||||
def main():
    """CLI entry point for the SelfASRX hybrid diarization pipeline.

    Runs the full pipeline on ``audio_path``, or, with ``--resume``,
    continues from a checkpoint. Exits with status 1 when the audio file
    or the checkpoint does not exist. Prints a short summary when the
    result carries no ``error`` key.
    """
    import argparse

    parser = argparse.ArgumentParser(description="SelfASRX - Hybrid Speaker Diarization")
    parser.add_argument("audio_path", help="Path to audio file (WAV)")
    parser.add_argument("-o", "--output", help="Output JSON path")
    parser.add_argument("--file-uuid", help="File UUID for Qdrant storage")
    parser.add_argument("--max-speakers", type=int, default=10)
    parser.add_argument("--quality-threshold", type=float, default=0.85)
    parser.add_argument("--resume", help="Checkpoint path to resume from")
    parser.add_argument("--checkpoint", help="Save checkpoint path after Step 3")
    args = parser.parse_args()

    # Validate inputs before constructing the pipeline so a bad path fails
    # fast. Resume mode reloads the audio too, so it needs the same check.
    if args.resume and not Path(args.resume).exists():
        print(f"Error: Checkpoint not found: {args.resume}")
        sys.exit(1)
    if not Path(args.audio_path).exists():
        print(f"Error: Audio file not found: {args.audio_path}")
        sys.exit(1)

    asrx = SelfASRXFixed()

    if args.resume:
        result = asrx.resume_from_checkpoint(
            args.resume, args.audio_path,
            output_path=args.output,
        )
    else:
        result = asrx.process(
            args.audio_path,
            output_path=args.output,
            file_uuid=args.file_uuid,
            max_speakers=args.max_speakers,
            quality_threshold=args.quality_threshold,
            checkpoint_path=args.checkpoint,
        )

    if "error" not in result:
        print("\n[Summary]")
        print(f" Duration: {result['total_duration']:.2f}s")
        print(f" Segments: {result['n_segments']}")
        print(f" Speakers: {result['n_speakers']}")
        if "references" in result:
            for ref in result["references"]:
                print(f" {ref['speaker_id']}: gender={ref['gender']}, "
                      f"quality={ref['avg_quality']:.3f}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -1,280 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Audio Player - 說話人語音播放器
|
||||
從 ASRX 結果中提取並播放每個說話人的語音片段
|
||||
"""
|
||||
|
||||
import json
|
||||
import argparse
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
def load_asrx_result(result_path: str) -> Dict:
    """Read an ASRX result JSON file (UTF-8) and return the parsed object."""
    with open(result_path, "r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
|
||||
|
||||
|
||||
def extract_audio_segment(
    audio_path: str, start_sec: float, end_sec: float, output_path: str
) -> bool:
    """Cut ``[start_sec, end_sec)`` out of an audio file with ffmpeg.

    The clip is written as 16 kHz mono 16-bit PCM, overwriting
    ``output_path`` if it exists.

    Args:
        audio_path: Source audio path.
        start_sec: Clip start in seconds.
        end_sec: Clip end in seconds.
        output_path: Destination path.

    Returns:
        bool: True when ffmpeg exits with status 0. False otherwise,
        including when the requested span is empty or negative, or when
        ffmpeg cannot be started.
    """
    duration = end_sec - start_sec
    if duration <= 0:
        # Nothing to extract; do not spawn ffmpeg for an empty span.
        return False

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        audio_path,
        "-ss",
        str(start_sec),
        "-t",
        str(duration),
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        output_path,
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError covers a missing ffmpeg binary (FileNotFoundError).
        print(f"Error extracting audio: {e}")
        return False
    return result.returncode == 0
|
||||
|
||||
|
||||
def play_audio(audio_path: str) -> bool:
    """Play an audio file with ``afplay`` (macOS) or ``aplay`` (Linux).

    The player is located on PATH first. The legacy ``/usr/bin/<name>``
    location is kept as a fallback.

    Returns:
        bool: True when the player exits successfully. False when no
        player is found or playback fails.
    """
    import shutil

    player = None
    for name in ("afplay", "aplay"):
        legacy = f"/usr/bin/{name}"
        player = shutil.which(name) or (legacy if os.path.exists(legacy) else None)
        if player:
            break

    if player is None:
        print(
            "No audio player found. Please install afplay (macOS) or aplay (Linux)"
        )
        return False

    try:
        subprocess.run([player, audio_path], check=True)
    except (OSError, subprocess.SubprocessError) as e:
        # CalledProcessError (a SubprocessError) is raised on a non-zero exit.
        print(f"Error playing audio: {e}")
        return False
    return True
|
||||
|
||||
|
||||
def group_segments_by_speaker(segments: List[Dict]) -> Dict[str, List[Dict]]:
    """Bucket segments by their ``speaker`` key.

    Speakers appear in the order they are first seen. Each speaker's
    segments are sorted by ``start``.
    """
    grouped = {}
    for item in segments:
        grouped.setdefault(item["speaker"], []).append(item)

    # Order each speaker's segments chronologically.
    for bucket in grouped.values():
        bucket.sort(key=lambda entry: entry["start"])

    return grouped
|
||||
|
||||
|
||||
def play_speaker_segments(
    audio_path: str,
    result_path: str,
    speaker_id: str = None,
    limit: int = None,
    temp_dir: str = None,
):
    """Extract and play the speech segments of one or all speakers.

    Args:
        audio_path: Original audio file the segments refer to.
        result_path: ASRX result JSON path.
        speaker_id: Speaker to play (None plays every speaker).
        limit: Maximum number of segments per speaker (None plays all).
        temp_dir: Directory for the extracted clips. A fresh temporary
            directory is created when omitted.
    """
    # Load the result.
    print(f"[Load] Loading ASRX result: {result_path}")
    result = load_asrx_result(result_path)

    segments = result.get("segments", [])
    total_duration = result.get("total_duration", 0)

    print(f"[Info] Total segments: {len(segments)}")
    print(f"[Info] Total duration: {total_duration / 60:.1f} minutes")

    # Group by speaker.
    speaker_segments = group_segments_by_speaker(segments)

    # Pick the speakers to play.
    if speaker_id:
        speakers_to_play = [speaker_id]
    else:
        speakers_to_play = sorted(speaker_segments.keys())

    # Prepare the clip directory.
    if temp_dir is None:
        temp_dir = tempfile.mkdtemp(prefix="speaker_audio_")
    else:
        # A caller-supplied directory may not exist yet; without it every
        # extraction would fail.
        os.makedirs(temp_dir, exist_ok=True)

    print(f"\n[Info] Temp directory: {temp_dir}")
    print(f"[Info] Speakers to play: {speakers_to_play}")
    print("=" * 60)

    # Play each speaker's segments.
    for speaker in speakers_to_play:
        if speaker not in speaker_segments:
            print(f"\n[Warning] Speaker {speaker} not found!")
            continue

        segs = speaker_segments[speaker]
        if limit:
            segs = segs[:limit]

        print(f"\n▶️ {speaker} ({len(segs)} segments)")
        print("-" * 60)

        for i, seg in enumerate(segs, 1):
            start = seg["start"]
            end = seg["end"]
            # Results that carry only start/end have no "duration" key;
            # derive it instead of raising KeyError.
            duration = seg.get("duration", end - start)

            # Extract the clip.
            temp_audio = os.path.join(temp_dir, f"{speaker}_{i:03d}.wav")

            print(
                f" [{i:3d}] {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s) ... ",
                end="",
                flush=True,
            )

            if extract_audio_segment(audio_path, start, end, temp_audio):
                print("✅", end="", flush=True)

                # Play it.
                if play_audio(temp_audio):
                    print(" ▶️ Played")
                else:
                    print(" ❌ Play failed")
            else:
                print(" ❌ Extract failed")

        print()
|
||||
|
||||
|
||||
def show_speaker_stats(result_path: str):
    """Print per-speaker segment counts and speaking time.

    Speakers are listed by total duration, longest first, with their
    share of the result's ``total_duration`` (0% when that is missing or
    zero).
    """
    result = load_asrx_result(result_path)

    segments = result.get("segments", [])
    speaker_segments = group_segments_by_speaker(segments)

    print("\n" + "=" * 60)
    print("說話人統計")
    print("=" * 60)

    # Collect (speaker, count, duration) and sort by duration.
    speaker_stats = []
    for speaker, segs in speaker_segments.items():
        # Results that carry only start/end have no "duration" key;
        # derive it instead of raising KeyError.
        total_duration = sum(
            seg.get("duration", seg["end"] - seg["start"]) for seg in segs
        )
        speaker_stats.append((speaker, len(segs), total_duration))

    speaker_stats.sort(key=lambda x: x[2], reverse=True)

    total_duration = result.get("total_duration", 0)

    for speaker, count, duration in speaker_stats:
        pct = duration / total_duration * 100 if total_duration > 0 else 0
        print(f"{speaker:12} {count:4} segments {duration:8.1f}s ({pct:5.1f}%)")

    print("=" * 60)
|
||||
|
||||
|
||||
def main():
    """CLI entry point of the speaker audio player.

    With ``--stats`` only the per-speaker statistics are printed.
    Otherwise both the audio file and the result JSON must exist, and the
    selected speakers' segments are extracted and played.
    """
    usage_examples = """
Examples:
# 顯示說話人統計
python3 speaker_audio_player.py --stats result.json

# 播放所有說話人的前 3 個片段
python3 speaker_audio_player.py audio.wav result.json --limit 3

# 播放特定說話人的所有片段
python3 speaker_audio_player.py audio.wav result.json --speaker SPEAKER_0

# 播放 SPEAKER_1 的前 5 個片段
python3 speaker_audio_player.py audio.wav result.json --speaker SPEAKER_1 --limit 5
"""
    parser = argparse.ArgumentParser(
        description="Speaker Audio Player - 播放說話人語音片段",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    parser.add_argument("audio_path", nargs="?", help="原始音頻文件路徑")
    parser.add_argument("result_path", help="ASRX 結果 JSON 路徑")
    parser.add_argument("--stats", action="store_true", help="只显示說話人統計")
    parser.add_argument("--speaker", type=str, help="指定說話人 ID(如 SPEAKER_0)")
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="每個說話人最多播放幾個片段(None=全部)",
    )
    parser.add_argument("--temp-dir", type=str, default=None, help="臨時目錄路徑")

    opts = parser.parse_args()

    # Statistics mode needs only the result file.
    if opts.stats:
        show_speaker_stats(opts.result_path)
        return

    if not opts.audio_path:
        print("Error: audio_path is required unless --stats is specified")
        parser.print_help()
        return

    # Both inputs must exist; the audio file is checked first.
    for kind, candidate in (("Audio", opts.audio_path), ("Result", opts.result_path)):
        if not Path(candidate).exists():
            print(f"Error: {kind} file not found: {candidate}")
            return

    play_speaker_segments(
        opts.audio_path,
        opts.result_path,
        speaker_id=opts.speaker,
        limit=opts.limit,
        temp_dir=opts.temp_dir,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
65
scripts/asrx_self/speaker_classifier.py
Normal file
65
scripts/asrx_self/speaker_classifier.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
Speaker Classifier - 聲紋品質評估與性別分類
|
||||
|
||||
提供品質計算與性別分類功能,作為 main_fixed.py 的輔助模組。
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def compute_embedding_quality(embeddings, labels):
    """Cosine similarity of each embedding to its own cluster centroid.

    Args:
        embeddings: Matrix with one embedding per row.
        labels: One cluster label per row (array or list).

    Returns:
        numpy.ndarray: One score per row. As a cosine similarity it lies
        in [-1, 1]. A zero-norm embedding or centroid scores 0. An empty
        input returns an empty array.
    """
    embeddings = np.asarray(embeddings)
    labels = np.asarray(labels)
    if len(embeddings) == 0:
        return np.array([])
    if not np.issubdtype(embeddings.dtype, np.floating):
        embeddings = embeddings.astype(np.float64)

    def _unit_rows(rows):
        norms = np.linalg.norm(rows, axis=1, keepdims=True)
        # Leave zero vectors at zero instead of dividing by zero.
        return rows / np.where(norms > 0, norms, 1.0)

    # cluster_of[i] is the index into unique_labels of row i's label.
    unique_labels, cluster_of = np.unique(labels, return_inverse=True)
    cluster_of = cluster_of.reshape(-1)
    centroids = np.stack([
        embeddings[cluster_of == k].mean(axis=0)
        for k in range(len(unique_labels))
    ])

    # Row-wise dot product of unit vectors: one vectorized pass instead of
    # one pairwise-similarity call per embedding.
    return np.einsum(
        "ij,ij->i", _unit_rows(embeddings), _unit_rows(centroids)[cluster_of]
    )
|
||||
|
||||
|
||||
def classify_gender(audio_wav, sample_rate, classifier):
    """Classify the speaker gender of one audio segment.

    Args:
        audio_wav: Audio samples as a numpy array.
        sample_rate: Sampling rate. Accepted for interface symmetry; it
            is not used by the classification itself.
        classifier: Object exposing ``classify_batch(tensor)``, or None.

    Returns:
        dict: ``{"gender": "male"|"female"|"unknown", "confidence": float}``.
        ``unknown`` with confidence 0.0 is returned when there is no
        classifier, no audio, fewer than two classes, or classification
        fails.
    """
    default = {"gender": "unknown", "confidence": 0.0}
    if classifier is None or len(audio_wav) == 0:
        return default
    try:
        import torch
        seg_tensor = torch.from_numpy(audio_wav).float().unsqueeze(0)
        out = classifier.classify_batch(seg_tensor)
        probs = torch.softmax(out[0], dim=-1).squeeze().cpu().detach().numpy()
        if len(probs) >= 2:
            idx = int(np.argmax(probs))
            # Class index 0 is reported as male, anything else as female.
            label = "male" if idx == 0 else "female"
            return {"gender": label, "confidence": float(probs[idx])}
    except Exception as e:
        # Best effort: fall back to "unknown", but log the failure rather
        # than swallowing it silently.
        print(f"[Gender] Classify error: {e}")
    return default
|
||||
@@ -1,310 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Clustering - 說話人聚類
|
||||
使用譜聚類算法將聲紋嵌入分組
|
||||
|
||||
技術來源:
|
||||
- 譜聚類:Shi & Malik (2000), IEEE TPAMI
|
||||
- 論文:https://ieeexplore.ieee.org/document/868688
|
||||
- 應用於說話人分離:Wooters & Huijbregts (2008), ICASSP
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from sklearn.cluster import SpectralClustering, AgglomerativeClustering
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
|
||||
def estimate_n_speakers_eigengap(similarity_matrix, max_speakers=10):
    """Estimate the speaker count from the largest eigenvalue gap.

    The eigenvalues of the symmetric similarity matrix are sorted in
    descending order and cut to the first ``max_speakers``. The position
    of the largest absolute gap between neighbours gives the estimate.

    Args:
        similarity_matrix: Square similarity matrix [n, n].
        max_speakers: Upper bound on the estimate.

    Returns:
        Estimated speaker count, capped at ``max_speakers`` and then
        raised to at least 2.
    """
    spectrum = np.linalg.eigvalsh(similarity_matrix)
    # Largest eigenvalues first, keeping only the leading max_speakers.
    leading = np.sort(spectrum)[::-1][:max_speakers]
    gaps = np.diff(leading)

    estimate = np.argmax(np.abs(gaps)) + 1 if len(gaps) > 0 else 1

    return max(2, min(estimate, max_speakers))
|
||||
|
||||
|
||||
def estimate_n_speakers_silhouette(embeddings, max_speakers=10):
    """Estimate the speaker count by maximizing the silhouette score.

    Agglomerative clustering is run for each candidate count from 2 up to
    ``max_speakers`` (and below the number of embeddings). The count with
    the highest silhouette score wins.

    Args:
        embeddings: Embedding matrix [n, d].
        max_speakers: Largest candidate count.

    Returns:
        The best-scoring count, or 2 when no candidate could be scored.
    """
    from sklearn.metrics import silhouette_score

    top_score, top_n = -1, 2
    upper = min(max_speakers + 1, len(embeddings))

    for candidate in range(2, upper):
        assignment = AgglomerativeClustering(n_clusters=candidate).fit_predict(embeddings)

        # The silhouette score needs at least two distinct clusters.
        if len(np.unique(assignment)) <= 1:
            continue

        current = silhouette_score(embeddings, assignment)
        if current > top_score:
            top_score, top_n = current, candidate

    return top_n
|
||||
|
||||
|
||||
def spectral_clustering_speaker(
    similarity_matrix, n_speakers=None, auto_estimate=True, max_speakers=10
):
    """Assign speaker labels with spectral clustering.

    Args:
        similarity_matrix: Similarity matrix [n, n].
        n_speakers: Number of speakers. When None it is estimated with
            the eigengap method (if ``auto_estimate``) or defaults to 2.
        auto_estimate: Whether to estimate a missing ``n_speakers``.
        max_speakers: Upper bound used by the estimate.

    Returns:
        (speaker_labels, n_speakers). If clustering raises, a fallback
        labels the first half of the segments 0 and the rest 1 and
        reports 2 speakers.
    """
    total = len(similarity_matrix)

    # Sanitize: replace NaN/inf, force a unit diagonal, clamp to [-1, 1].
    cleaned = np.nan_to_num(
        similarity_matrix, nan=0.5, posinf=1.0, neginf=-1.0
    )
    np.fill_diagonal(cleaned, 1.0)
    cleaned = np.clip(cleaned, -1.0, 1.0)

    if n_speakers is None:
        if auto_estimate:
            n_speakers = estimate_n_speakers_eigengap(
                cleaned, max_speakers=max_speakers
            )
            print(f"[Clustering] Estimated n_speakers: {n_speakers}")
        else:
            n_speakers = 2  # default

    # Never ask for more clusters than there are samples.
    n_speakers = min(n_speakers, total)

    print(f"[Clustering] Running spectral clustering with {n_speakers} clusters...")

    try:
        model = SpectralClustering(
            n_clusters=int(n_speakers),
            affinity="precomputed",
            assign_labels="kmeans",
            random_state=42,
            n_init=10,
        )
        assigned = model.fit_predict(cleaned)

        print("[Clustering] Spectral clustering completed")
        print(f"[Clustering] n_speakers: {n_speakers}")
        print(f"[Clustering] n_segments: {total}")

        return assigned, n_speakers

    except Exception as e:
        print(f"[Clustering] Spectral clustering failed: {e}")
        print("[Clustering] Using fallback: 2 speakers")
        # Crude split: the first half is speaker 0, the rest speaker 1.
        first_half = total // 2
        fallback = np.array([0] * first_half + [1] * (total - first_half))
        return fallback, 2
|
||||
|
||||
|
||||
def agglomerative_clustering_speaker(
    embeddings, n_speakers=None, threshold=0.5, max_speakers=10
):
    """
    Separate speakers with average-linkage agglomerative clustering.

    Args:
        embeddings: embedding matrix indexed as [n, d] (one row per segment).
        n_speakers: number of clusters to produce. When None, it is estimated
            from the mean nearest-neighbour cosine distance.
        threshold: distance scale of the estimate; the estimated count is
            int(mean nearest-neighbour distance / threshold), at least 2.
        max_speakers: upper bound applied to the estimated count.

    Returns:
        speaker_labels: cluster label per segment, shape [n,].
        n_speakers: number of clusters actually used.
    """
    n_segments = len(embeddings)

    # With fewer than two segments there is nothing to cluster, and the
    # sklearn estimator rejects such input, so answer directly:
    # zero segments -> no labels, one segment -> a single speaker 0.
    if n_segments < 2:
        return np.zeros(n_segments, dtype=int), n_segments

    if n_speakers is None:
        # Estimate the cluster count from nearest-neighbour distances.
        from sklearn.metrics.pairwise import cosine_distances

        # Only the first 100 segments are sampled, so only those rows of the
        # distance matrix are computed instead of the full n x n matrix.
        sample = embeddings[: min(100, n_segments)]
        distances = cosine_distances(sample, embeddings)

        # After sorting each row, column 0 is the segment's distance to
        # itself and column 1 is the distance to its nearest other segment.
        nearest = np.sort(distances, axis=1)[:, 1]
        avg_dist = np.mean(nearest)

        n_speakers = max(2, int(avg_dist / threshold))
        n_speakers = min(n_speakers, max_speakers)

    # Never ask for more clusters than there are segments.
    n_speakers = min(n_speakers, n_segments)

    clustering = AgglomerativeClustering(
        n_clusters=n_speakers, metric="cosine", linkage="average"
    )
    speaker_labels = clustering.fit_predict(embeddings)

    print("[Clustering] Agglomerative clustering completed")
    print(f"[Clustering] n_speakers: {n_speakers}")

    return speaker_labels, n_speakers
|
||||
|
||||
|
||||
def smooth_speaker_labels(speaker_labels, window_size=5):
    """
    Smooth a label sequence with a sliding majority vote.

    Each position is replaced by the mode of the labels inside a window
    centred on it; the window is truncated at both ends of the sequence.

    Args:
        speaker_labels: original speaker labels.
        window_size: smoothing window size; window_size // 2 neighbours are
            taken on each side of the current position.

    Returns:
        smoothed_labels: a new array with the smoothed labels (the input is
        not modified).
    """
    from scipy import stats

    total = len(speaker_labels)
    radius = window_size // 2
    result = np.copy(speaker_labels)

    for pos in range(total):
        lo = pos - radius if pos > radius else 0
        hi = min(total, pos + radius + 1)
        # Votes always come from the original labels, never from values
        # that were already smoothed.
        neighbourhood = speaker_labels[lo:hi]
        result[pos] = stats.mode(neighbourhood, keepdims=True).mode[0]

    return result
|
||||
|
||||
|
||||
def compute_diarization_purity(speaker_labels, ground_truth_labels=None):
    """
    Score predicted speaker labels against reference labels.

    Args:
        speaker_labels: predicted speaker labels.
        ground_truth_labels: reference speaker labels (optional).

    Returns:
        purity: with reference labels, the value of sklearn's
        adjusted_rand_score(ground_truth_labels, speaker_labels); without
        them, the constant 0.5.

    NOTE(review): despite the name, the value returned is an adjusted Rand
    index, not a cluster-purity ratio, so callers should not assume it lies
    in [0, 1] - confirm against the sklearn documentation.
    """
    if ground_truth_labels is not None:
        from sklearn.metrics import adjusted_rand_score

        return adjusted_rand_score(ground_truth_labels, speaker_labels)

    # No reference labels: nothing is measured, a fixed placeholder is returned.
    return 0.5
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test: cluster synthetic embeddings and print what was found.
    print("[Test] Testing speaker clustering algorithms")

    # Fixed seed so the synthetic data is reproducible.
    np.random.seed(42)
    true_speaker_count = 3
    segments_each = 20

    # One random centre per speaker, then noisy copies of that centre.
    samples = []
    for spk in range(true_speaker_count):
        centroid = np.random.randn(192) * 2 + spk * 3
        samples.extend(
            centroid + np.random.randn(192) * 0.5 for _ in range(segments_each)
        )

    embeddings = np.array(samples)
    print(
        f"[Test] Generated {len(embeddings)} embeddings for {true_speaker_count} speakers"
    )

    # Pairwise similarity matrix used by the spectral method.
    similarity = cosine_similarity(embeddings)
    print(f"[Test] Similarity matrix shape: {similarity.shape}")

    # Speaker-count estimates from both estimators.
    estimated_n = estimate_n_speakers_eigengap(similarity, max_speakers=10)
    print(f"[Test] Estimated n_speakers (eigengap): {estimated_n}")

    estimated_n_silhouette = estimate_n_speakers_silhouette(embeddings, max_speakers=10)
    print(f"[Test] Estimated n_speakers (silhouette): {estimated_n_silhouette}")

    # Spectral clustering with automatic speaker-count estimation.
    labels, n_clusters = spectral_clustering_speaker(
        similarity, n_speakers=None, auto_estimate=True
    )

    found_labels = np.unique(labels)
    print("\n[Test] Clustering results:")
    print(f" True n_speakers: {true_speaker_count}")
    print(f" Estimated n_speakers: {n_clusters}")
    print(f" Unique labels: {found_labels}")

    # Size of every cluster.
    for label in found_labels:
        count = np.sum(labels == label)
        print(f" Cluster {label}: {count} segments")
|
||||
@@ -1,431 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Player GUI - 說話人語音播放器(圖形界面)
|
||||
使用 tkinter 顯示播放進度和 Speaker ID
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tkinter as tk
|
||||
from tkinter import ttk, filedialog, messagebox
|
||||
|
||||
HAS_TKINTER = True
|
||||
except ImportError:
|
||||
HAS_TKINTER = False
|
||||
|
||||
|
||||
class SpeakerPlayerGUI:
    """Tkinter window for browsing a diarization result and playing segments.

    The user picks an audio file and a result JSON file. Segments from the
    JSON are grouped by their "speaker" value; a speaker's segments can then
    be played one at a time or all in sequence. Playback runs in a daemon
    worker thread that cuts each segment out with ffmpeg and plays it with
    afplay or aplay.
    """

    def __init__(self, root):
        """Store the Tk root, initialise the state and build the widgets."""
        self.root = root
        self.root.title("🎬 Speaker Audio Player - Face Integration")
        self.root.geometry("1100x800")

        # Selected files and their parsed contents.
        self.audio_path = None
        self.result_path = None
        self.face_path = None
        self.result_data = None
        self.face_data = None
        self.integrated_data = None

        # speaker -> list of segment dicts sorted by "start".
        self.speaker_segments = {}
        # Speakers in the same order as the rows of the speaker list box.
        self.speakers = []
        self.current_speaker_idx = 0

        # Playback state shared with the worker thread.
        self.is_playing = False
        self.stop_flag = False

        self.create_widgets()

    def create_widgets(self):
        """Build the file bar, the two list panes and the playback bar."""
        self._build_file_bar()
        self._build_lists()
        self._build_playback_bar()

    def _build_file_bar(self):
        """Create the top row with the audio and result file pickers."""
        top_frame = ttk.Frame(self.root, padding="10")
        top_frame.pack(fill=tk.X)

        ttk.Label(top_frame, text="📁 Audio:").pack(side=tk.LEFT)
        self.audio_label = ttk.Label(top_frame, text="未選擇", width=50)
        self.audio_label.pack(side=tk.LEFT, padx=5)
        ttk.Button(top_frame, text="選擇音頻", command=self.select_audio).pack(
            side=tk.LEFT, padx=5
        )

        ttk.Label(top_frame, text=" 📊 Result:").pack(side=tk.LEFT, padx=(20, 0))
        self.result_label = ttk.Label(top_frame, text="未選擇", width=50)
        self.result_label.pack(side=tk.LEFT, padx=5)
        ttk.Button(top_frame, text="選擇結果", command=self.select_result).pack(
            side=tk.LEFT, padx=5
        )

    def _build_lists(self):
        """Create the speaker list (left) and the segment list (right)."""
        mid_frame = ttk.Frame(self.root, padding="10")
        mid_frame.pack(fill=tk.BOTH, expand=True)

        left_frame = ttk.LabelFrame(mid_frame, text="📢 說話人列表", padding="10")
        left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=False)

        self.speaker_listbox = tk.Listbox(
            left_frame, width=35, height=20, font=("Arial", 11)
        )
        self.speaker_listbox.pack(fill=tk.BOTH, expand=True)
        self.speaker_listbox.bind("<<ListboxSelect>>", self.on_speaker_select)

        right_frame = ttk.LabelFrame(mid_frame, text="🎵 語音片段", padding="10")
        right_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=10)

        # Segment list with a vertical scrollbar.
        list_frame = ttk.Frame(right_frame)
        list_frame.pack(fill=tk.BOTH, expand=True)

        scrollbar = ttk.Scrollbar(list_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

        self.segment_listbox = tk.Listbox(
            list_frame,
            width=50,
            height=20,
            font=("Courier", 10),
            yscrollcommand=scrollbar.set,
        )
        self.segment_listbox.pack(fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.segment_listbox.yview)

        self.segment_listbox.bind("<Double-Button-1>", self.on_segment_double_click)

    def _build_playback_bar(self):
        """Create the play/stop buttons, the progress row and the status bar."""
        bottom_frame = ttk.Frame(self.root, padding="10")
        bottom_frame.pack(fill=tk.X)

        control_frame = ttk.Frame(bottom_frame)
        control_frame.pack(fill=tk.X)

        # The play buttons start disabled; check_ready() enables them once
        # both an audio file and a result file have been chosen.
        self.play_button = ttk.Button(
            control_frame, text="▶️ 播放所選", command=self.play_selected, width=15
        )
        self.play_button.pack(side=tk.LEFT, padx=5)
        self.play_button.config(state=tk.DISABLED)

        self.stop_button = ttk.Button(
            control_frame, text="⏹️ 停止", command=self.stop_playing, width=10
        )
        self.stop_button.pack(side=tk.LEFT, padx=5)
        self.stop_button.config(state=tk.DISABLED)

        self.play_all_button = ttk.Button(
            control_frame, text="▶️▶️ 播放全部", command=self.play_all, width=15
        )
        self.play_all_button.pack(side=tk.LEFT, padx=5)
        self.play_all_button.config(state=tk.DISABLED)

        progress_frame = ttk.Frame(bottom_frame)
        progress_frame.pack(fill=tk.X, pady=(10, 0))

        ttk.Label(progress_frame, text="⏱️ 進度:").pack(side=tk.LEFT)
        self.progress_bar = ttk.Progressbar(progress_frame, mode="determinate")
        self.progress_bar.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=10)

        self.progress_label = ttk.Label(progress_frame, text="0:00 / 0:00", width=20)
        self.progress_label.pack(side=tk.LEFT)

        self.status_label = ttk.Label(
            bottom_frame, text="就緒", relief=tk.SUNKEN, anchor=tk.W
        )
        self.status_label.pack(fill=tk.X, pady=(10, 0))

    def select_audio(self):
        """Ask for the audio file and remember the choice."""
        filename = filedialog.askopenfilename(
            title="選擇音頻文件",
            filetypes=[("WAV files", "*.wav"), ("All files", "*.*")],
        )
        if filename:
            self.audio_path = filename
            self.audio_label.config(text=Path(filename).name)
            self.check_ready()

    def select_result(self):
        """Ask for the result JSON file and load it."""
        filename = filedialog.askopenfilename(
            title="選擇 ASRX 結果文件",
            filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
        )
        if filename:
            self.result_path = filename
            self.result_label.config(text=Path(filename).name)
            self.load_result()
            self.check_ready()

    def load_result(self):
        """Parse the result JSON, group its segments by speaker and list them.

        Each entry of the top-level "segments" list must provide "speaker",
        "start" and "duration". Any failure is reported in a dialog and
        clears result_path.
        """
        try:
            with open(self.result_path, "r", encoding="utf-8") as f:
                self.result_data = json.load(f)

            segments = self.result_data.get("segments", [])

            # Group by speaker, each group ordered by start time.
            grouped = {}
            for seg in segments:
                grouped.setdefault(seg["speaker"], []).append(seg)
            for segs in grouped.values():
                segs.sort(key=lambda s: s["start"])
            self.speaker_segments = grouped

            # Speakers with the most total speech come first.
            totals = {
                speaker: sum(s["duration"] for s in segs)
                for speaker, segs in grouped.items()
            }
            self.speakers = sorted(grouped, key=totals.get, reverse=True)

            self.speaker_listbox.delete(0, tk.END)
            for speaker in self.speakers:
                segs = grouped[speaker]
                total_dur_min = totals[speaker] / 60
                self.speaker_listbox.insert(
                    tk.END,
                    f"🔊 {speaker:12} | {len(segs):4d}段 | {total_dur_min:5.1f}分鐘",
                )

            self.status_label.config(
                text=f"載入成功:{len(self.speakers)} 個說話人,{len(segments)} 個片段"
            )

        except Exception as e:
            # UI boundary: whatever went wrong is shown to the user.
            messagebox.showerror("錯誤", f"載入結果文件失敗:{e}")
            self.result_path = None
            self.result_label.config(text="載入失敗")

    def check_ready(self):
        """Enable the play buttons only when audio and result are both set."""
        ready = bool(self.audio_path and self.result_path)
        if ready:
            self.status_label.config(text="✅ 就緒 - 請選擇說話人並播放")
        else:
            self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
        state = tk.NORMAL if ready else tk.DISABLED
        self.play_button.config(state=state)
        self.play_all_button.config(state=state)

    def on_speaker_select(self, event):
        """Show the segments of the speaker picked in the speaker list."""
        selection = self.speaker_listbox.curselection()
        if not selection:
            return

        self.current_speaker_idx = selection[0]
        speaker = self.speakers[self.current_speaker_idx]
        segs = self.speaker_segments[speaker]

        self.segment_listbox.delete(0, tk.END)
        for i, seg in enumerate(segs, 1):
            start = seg["start"]
            end = seg["end"]
            duration = seg["duration"]
            self.segment_listbox.insert(
                tk.END,
                f"[{i:4d}] {speaker:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s)",
            )

        self.status_label.config(text=f"選擇:{speaker} - {len(segs)} 個片段")

    def on_segment_double_click(self, event):
        """Play the segment that was double-clicked."""
        self.play_selected()

    def extract_and_play(self, start_sec: float, end_sec: float) -> bool:
        """Cut [start_sec, end_sec] out of the audio file and play it.

        Blocks until the player exits. Returns True when the clip was
        extracted and handed to a player; False when ffmpeg fails, when a
        required program cannot be started, or when neither /usr/bin/afplay
        nor /usr/bin/aplay exists. The temporary WAV file is always removed.
        """
        duration = end_sec - start_sec
        temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        temp_path = temp_file.name
        temp_file.close()

        try:
            cmd = [
                "ffmpeg",
                "-y",
                "-loglevel",
                "quiet",
                "-i",
                self.audio_path,
                "-ss",
                str(start_sec),
                "-t",
                str(duration),
                "-acodec",
                "pcm_s16le",
                "-ar",
                "16000",
                "-ac",
                "1",
                temp_path,
            ]
            if subprocess.run(cmd, capture_output=True).returncode != 0:
                return False

            for player in ("afplay", "aplay"):
                if os.path.exists(f"/usr/bin/{player}"):
                    subprocess.run([player, temp_path], capture_output=True)
                    return True
            return False
        except OSError:
            # ffmpeg or the player could not be launched; report a failed
            # playback instead of killing the worker thread.
            return False
        finally:
            if os.path.exists(temp_path):
                os.unlink(temp_path)

    def play_segment(self, speaker: str, seg: dict, seg_idx: int, total: int):
        """Play one segment and post its progress to the UI.

        Called from the worker thread; widget updates are scheduled with
        root.after. Returns False without playing when a stop was requested,
        True otherwise (a failed playback only raises a warning dialog).
        """
        if self.stop_flag:
            return False

        start = seg["start"]
        end = seg["end"]

        status = f"▶️ {speaker} [{seg_idx}/{total}] {start:.2f}s - {end:.2f}s"
        progress = (seg_idx / total) * 100
        counter = f"{seg_idx}:{total}"

        def show_progress():
            self.status_label.config(text=status)
            self.progress_bar.config(value=progress)
            self.progress_label.config(text=counter)

        self.root.after(0, show_progress)

        if not self.extract_and_play(start, end):
            self.root.after(
                0,
                lambda: messagebox.showwarning(
                    "警告", f"播放失敗:{speaker} [{seg_idx}]"
                ),
            )
        return True

    def _begin_playback(self):
        """Mark playback as running and lock the play buttons."""
        self.is_playing = True
        self.stop_flag = False
        self.play_button.config(state=tk.DISABLED)
        self.play_all_button.config(state=tk.DISABLED)
        self.stop_button.config(state=tk.NORMAL)

    def play_selected(self):
        """Play the selected segment, or the whole speaker when none is selected."""
        selection = self.segment_listbox.curselection()
        if not selection:
            # Nothing selected: fall back to playing the current speaker.
            if self.speakers:
                speaker = self.speakers[self.current_speaker_idx]
                if self.speaker_segments[speaker]:
                    self.play_all()
            return

        seg_idx = selection[0]
        speaker = self.speakers[self.current_speaker_idx]
        segs = self.speaker_segments[speaker]
        seg = segs[seg_idx]

        self._begin_playback()

        def play_thread():
            try:
                # Report the position within the speaker's list so the
                # progress bar stays within 0-100 %.
                self.play_segment(speaker, seg, seg_idx + 1, len(segs))
            finally:
                self.root.after(0, self.on_play_done)

        threading.Thread(target=play_thread, daemon=True).start()

    def play_all(self):
        """Play every segment of the current speaker in order."""
        if not self.speakers:
            return

        speaker = self.speakers[self.current_speaker_idx]
        segs = self.speaker_segments[speaker]
        if not segs:
            return

        self._begin_playback()

        def play_thread():
            try:
                for i, seg in enumerate(segs, 1):
                    if self.stop_flag:
                        break
                    self.play_segment(speaker, seg, i, len(segs))
                    time.sleep(0.3)  # short gap between segments
            finally:
                self.root.after(0, self.on_play_done)

        threading.Thread(target=play_thread, daemon=True).start()

    def stop_playing(self):
        """Request that playback stop after the segment currently playing."""
        self.stop_flag = True
        if not self.is_playing:
            self.on_play_done()
            return
        # The worker thread sees the flag, exits, and calls on_play_done()
        # itself. Resetting the UI here would clear the flag before the
        # worker could read it.
        self.stop_button.config(state=tk.DISABLED)

    def on_play_done(self):
        """Restore the idle UI and report whether playback finished or was stopped."""
        # Read the flag before clearing it so the stopped status is reachable.
        was_stopped = self.stop_flag
        self.is_playing = False
        self.stop_flag = False

        self.play_button.config(state=tk.NORMAL)
        self.play_all_button.config(state=tk.NORMAL)
        self.stop_button.config(state=tk.DISABLED)
        self.progress_bar.config(value=0)
        self.progress_label.config(text="0:00 / 0:00")

        if was_stopped:
            self.status_label.config(text="⏹️ 已停止")
        else:
            self.status_label.config(text="✅ 播放完成")
|
||||
|
||||
|
||||
def main():
    """Open the speaker player window, or print install help without tkinter."""
    if not HAS_TKINTER:
        print("❌ tkinter 未安裝")
        print("請使用以下命令安裝:")
        # The hint names the Homebrew formula for the interpreter in this
        # file's shebang (python3.11).
        print(" brew install python-tk@3.11")
        return

    root = tk.Tk()
    # The widgets keep the player alive through their command callbacks.
    SpeakerPlayerGUI(root)
    root.mainloop()


if __name__ == "__main__":
    main()
|
||||
@@ -1,522 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Speaker Player GUI - 說話人語音播放器(Face 整合版)
|
||||
使用 tkinter 顯示播放進度、Speaker ID 和人臉信息
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
try:
|
||||
import tkinter as tk
|
||||
from tkinter import ttk, filedialog, messagebox
|
||||
|
||||
HAS_TKINTER = True
|
||||
except ImportError:
|
||||
HAS_TKINTER = False
|
||||
|
||||
|
||||
class SpeakerPlayerGUI:
|
||||
"""說話人語音播放器 GUI(Face 整合版)"""
|
||||
|
||||
def __init__(self, root):
|
||||
self.root = root
|
||||
self.root.title("🎬 Speaker Player - Face Integration")
|
||||
self.root.geometry("1200x800")
|
||||
|
||||
# 數據
|
||||
self.audio_path = None
|
||||
self.result_path = None
|
||||
self.face_path = None
|
||||
self.result_data = None
|
||||
self.face_data = None
|
||||
self.integrated_data = None
|
||||
self.speaker_segments = {}
|
||||
self.speakers = []
|
||||
self.current_speaker_idx = 0
|
||||
self.is_playing = False
|
||||
self.stop_flag = False
|
||||
|
||||
# 創建界面
|
||||
self.create_widgets()
|
||||
|
||||
def create_widgets(self):
|
||||
"""創建界面組件"""
|
||||
# 頂部:文件選擇
|
||||
top_frame = ttk.Frame(self.root, padding="10")
|
||||
top_frame.pack(fill=tk.X)
|
||||
|
||||
# 第一行:音頻和 ASRX 結果
|
||||
row1_frame = ttk.Frame(top_frame)
|
||||
row1_frame.pack(fill=tk.X)
|
||||
|
||||
ttk.Label(row1_frame, text="📁 Audio:").pack(side=tk.LEFT)
|
||||
self.audio_label = ttk.Label(row1_frame, text="未選擇", width=50)
|
||||
self.audio_label.pack(side=tk.LEFT, padx=5)
|
||||
ttk.Button(row1_frame, text="選擇音頻", command=self.select_audio).pack(
|
||||
side=tk.LEFT, padx=5
|
||||
)
|
||||
|
||||
ttk.Label(row1_frame, text=" 📊 ASRX:").pack(side=tk.LEFT, padx=(20, 0))
|
||||
self.result_label = ttk.Label(row1_frame, text="未選擇", width=50)
|
||||
self.result_label.pack(side=tk.LEFT, padx=5)
|
||||
ttk.Button(row1_frame, text="選擇結果", command=self.select_result).pack(
|
||||
side=tk.LEFT, padx=5
|
||||
)
|
||||
|
||||
# 第二行:Face 結果
|
||||
row2_frame = ttk.Frame(top_frame)
|
||||
row2_frame.pack(fill=tk.X, pady=(5, 0))
|
||||
|
||||
ttk.Label(row2_frame, text="👤 Face:").pack(side=tk.LEFT)
|
||||
self.face_label = ttk.Label(row2_frame, text="未選擇 (可選)", width=50)
|
||||
self.face_label.pack(side=tk.LEFT, padx=5)
|
||||
ttk.Button(row2_frame, text="選擇 Face", command=self.select_face).pack(
|
||||
side=tk.LEFT, padx=5
|
||||
)
|
||||
self.integrate_button = ttk.Button(
|
||||
row2_frame,
|
||||
text="🔗 整合 Face",
|
||||
command=self.integrate_face,
|
||||
state=tk.DISABLED,
|
||||
)
|
||||
self.integrate_button.pack(side=tk.LEFT, padx=5)
|
||||
|
||||
# 中間:說話人列表和片段列表
|
||||
mid_frame = ttk.Frame(self.root, padding="10")
|
||||
mid_frame.pack(fill=tk.BOTH, expand=True)
|
||||
|
||||
# 左側:說話人列表(帶 Face 統計)
|
||||
left_frame = ttk.LabelFrame(mid_frame, text="📢 說話人列表", padding="10")
|
||||
left_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=False)
|
||||
|
||||
self.speaker_listbox = tk.Listbox(
|
||||
left_frame, width=45, height=20, font=("Arial", 11)
|
||||
)
|
||||
self.speaker_listbox.pack(fill=tk.BOTH, expand=True)
|
||||
self.speaker_listbox.bind("<<ListboxSelect>>", self.on_speaker_select)
|
||||
|
||||
# 右側:片段列表(帶 Face 信息)
|
||||
right_frame = ttk.LabelFrame(
|
||||
mid_frame, text="🎵 語音片段 + 👥 人臉", padding="10"
|
||||
)
|
||||
right_frame.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=10)
|
||||
|
||||
# 片段列表(带滚动条)
|
||||
list_frame = ttk.Frame(right_frame)
|
||||
list_frame.pack(fill=tk.BOTH, expand=True)
|
||||
|
||||
scrollbar = ttk.Scrollbar(list_frame)
|
||||
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
|
||||
|
||||
self.segment_listbox = tk.Listbox(
|
||||
list_frame,
|
||||
width=65,
|
||||
height=20,
|
||||
font=("Courier", 9),
|
||||
yscrollcommand=scrollbar.set,
|
||||
)
|
||||
self.segment_listbox.pack(fill=tk.BOTH, expand=True)
|
||||
scrollbar.config(command=self.segment_listbox.yview)
|
||||
|
||||
self.segment_listbox.bind("<Double-Button-1>", self.on_segment_double_click)
|
||||
|
||||
# 底部:播放控制和進度
|
||||
bottom_frame = ttk.Frame(self.root, padding="10")
|
||||
bottom_frame.pack(fill=tk.X)
|
||||
|
||||
# 播放控制
|
||||
control_frame = ttk.Frame(bottom_frame)
|
||||
control_frame.pack(fill=tk.X)
|
||||
|
||||
self.play_button = ttk.Button(
|
||||
control_frame, text="▶️ 播放所選", command=self.play_selected, width=15
|
||||
)
|
||||
self.play_button.pack(side=tk.LEFT, padx=5)
|
||||
self.play_button.config(state=tk.DISABLED)
|
||||
|
||||
self.stop_button = ttk.Button(
|
||||
control_frame, text="⏹️ 停止", command=self.stop_playing, width=10
|
||||
)
|
||||
self.stop_button.pack(side=tk.LEFT, padx=5)
|
||||
self.stop_button.config(state=tk.DISABLED)
|
||||
|
||||
self.play_all_button = ttk.Button(
|
||||
control_frame, text="▶️▶️ 播放全部", command=self.play_all, width=15
|
||||
)
|
||||
self.play_all_button.pack(side=tk.LEFT, padx=5)
|
||||
self.play_all_button.config(state=tk.DISABLED)
|
||||
|
||||
# 進度條
|
||||
progress_frame = ttk.Frame(bottom_frame)
|
||||
progress_frame.pack(fill=tk.X, pady=(10, 0))
|
||||
|
||||
ttk.Label(progress_frame, text="⏱️ 進度:").pack(side=tk.LEFT)
|
||||
self.progress_bar = ttk.Progressbar(progress_frame, mode="determinate")
|
||||
self.progress_bar.pack(side=tk.LEFT, fill=tk.X, expand=True, padx=10)
|
||||
|
||||
self.progress_label = ttk.Label(progress_frame, text="0:00 / 0:00", width=20)
|
||||
self.progress_label.pack(side=tk.LEFT)
|
||||
|
||||
# 狀態欄
|
||||
self.status_label = ttk.Label(
|
||||
bottom_frame, text="就緒", relief=tk.SUNKEN, anchor=tk.W
|
||||
)
|
||||
self.status_label.pack(fill=tk.X, pady=(10, 0))
|
||||
|
||||
def select_audio(self):
|
||||
"""選擇音頻文件"""
|
||||
filename = filedialog.askopenfilename(
|
||||
title="選擇音頻文件",
|
||||
filetypes=[("WAV files", "*.wav"), ("All files", "*.*")],
|
||||
)
|
||||
if filename:
|
||||
self.audio_path = filename
|
||||
self.audio_label.config(text=Path(filename).name)
|
||||
self.check_ready()
|
||||
|
||||
def select_result(self):
|
||||
"""選擇 ASRX 結果文件"""
|
||||
filename = filedialog.askopenfilename(
|
||||
title="選擇 ASRX 結果文件",
|
||||
filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
|
||||
)
|
||||
if filename:
|
||||
self.result_path = filename
|
||||
self.result_label.config(text=Path(filename).name)
|
||||
self.load_result()
|
||||
self.check_ready()
|
||||
|
||||
def select_face(self):
|
||||
"""選擇 Face 結果文件"""
|
||||
filename = filedialog.askopenfilename(
|
||||
title="選擇 Face 檢測結果",
|
||||
filetypes=[("JSON files", "*.json"), ("All files", "*.*")],
|
||||
)
|
||||
if filename:
|
||||
self.face_path = filename
|
||||
self.face_label.config(text=Path(filename).name)
|
||||
self.integrate_button.config(state=tk.NORMAL)
|
||||
self.status_label.config(text="✅ Face 已選擇 - 請點擊整合")
|
||||
|
||||
def integrate_face(self):
|
||||
"""整合 Face 與 ASRX"""
|
||||
if not self.face_path or not self.result_path:
|
||||
messagebox.showwarning("警告", "請先選擇 Face 和 ASRX 文件")
|
||||
return
|
||||
|
||||
self.status_label.config(text="🔄 整合中...")
|
||||
self.root.update()
|
||||
|
||||
try:
|
||||
# 載入 Face 數據
|
||||
with open(self.face_path, "r", encoding="utf-8") as f:
|
||||
self.face_data = json.load(f)
|
||||
|
||||
# 重新載入 ASRX 數據並整合
|
||||
self.load_result(integrate_with_face=True)
|
||||
|
||||
self.status_label.config(text="✅ Face 整合完成")
|
||||
self.integrate_button.config(state=tk.DISABLED)
|
||||
|
||||
except Exception as e:
|
||||
messagebox.showerror("錯誤", f"整合失敗:{e}")
|
||||
self.status_label.config(text="❌ 整合失敗")
|
||||
|
||||
def load_result(self, integrate_with_face=False):
|
||||
"""載入 ASRX 結果"""
|
||||
try:
|
||||
with open(self.result_path, "r", encoding="utf-8") as f:
|
||||
self.result_data = json.load(f)
|
||||
|
||||
# 分組
|
||||
self.speaker_segments = {}
|
||||
for seg in self.result_data.get("segments", []):
|
||||
speaker = seg["speaker"]
|
||||
if speaker not in self.speaker_segments:
|
||||
self.speaker_segments[speaker] = []
|
||||
self.speaker_segments[speaker].append(seg)
|
||||
|
||||
# 排序
|
||||
for speaker in self.speaker_segments:
|
||||
self.speaker_segments[speaker].sort(key=lambda x: x["start"])
|
||||
|
||||
# 說話人列表(按時長排序)
|
||||
self.speakers = sorted(
|
||||
self.speaker_segments.keys(),
|
||||
key=lambda s: sum(seg["duration"] for seg in self.speaker_segments[s]),
|
||||
reverse=True,
|
||||
)
|
||||
|
||||
# 更新列表框
|
||||
self.speaker_listbox.delete(0, tk.END)
|
||||
for speaker in self.speakers:
|
||||
segs = self.speaker_segments[speaker]
|
||||
total_dur = sum(seg["duration"] for seg in segs)
|
||||
total_dur_min = total_dur / 60
|
||||
|
||||
# 如果有 Face 數據,計算有人臉的片段數
|
||||
face_info = ""
|
||||
if integrate_with_face and self.integrated_data:
|
||||
speaker_integrated = [
|
||||
item
|
||||
for item in self.integrated_data
|
||||
if item["speaker"] == speaker
|
||||
]
|
||||
with_face = sum(
|
||||
1 for item in speaker_integrated if item.get("has_face", False)
|
||||
)
|
||||
face_info = f" | 👥 {with_face}/{len(segs)}"
|
||||
|
||||
self.speaker_listbox.insert(
|
||||
tk.END,
|
||||
f"🔊 {speaker:12} | {len(segs):4d}段 | {total_dur_min:5.1f}分鐘{face_info}",
|
||||
)
|
||||
|
||||
total_segments = len(self.result_data.get("segments", []))
|
||||
self.status_label.config(
|
||||
text=f"載入成功:{len(self.speakers)} 個說話人,{total_segments} 個片段"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
messagebox.showerror("錯誤", f"載入結果文件失敗:{e}")
|
||||
self.result_path = None
|
||||
self.result_label.config(text="載入失敗")
|
||||
|
||||
def check_ready(self):
|
||||
"""檢查是否就緒"""
|
||||
if self.audio_path and self.result_path:
|
||||
self.status_label.config(text="✅ 就緒 - 請選擇說話人並播放")
|
||||
self.play_button.config(state=tk.NORMAL)
|
||||
self.play_all_button.config(state=tk.NORMAL)
|
||||
else:
|
||||
self.status_label.config(text="⚠️ 請選擇音頻和結果文件")
|
||||
self.play_button.config(state=tk.DISABLED)
|
||||
self.play_all_button.config(state=tk.DISABLED)
|
||||
|
||||
def on_speaker_select(self, event):
|
||||
"""說話人選擇事件"""
|
||||
selection = self.speaker_listbox.curselection()
|
||||
if not selection:
|
||||
return
|
||||
|
||||
self.current_speaker_idx = selection[0]
|
||||
speaker = self.speakers[self.current_speaker_idx]
|
||||
|
||||
# 更新片段列表
|
||||
self.segment_listbox.delete(0, tk.END)
|
||||
for i, seg in enumerate(self.speaker_segments[speaker], 1):
|
||||
start = seg["start"]
|
||||
end = seg["end"]
|
||||
duration = seg["duration"]
|
||||
|
||||
# 如果有整合 Face 數據
|
||||
face_info = ""
|
||||
if self.integrated_data:
|
||||
matching = [
|
||||
item
|
||||
for item in self.integrated_data
|
||||
if abs(item["start"] - start) < 0.1 and item["speaker"] == speaker
|
||||
]
|
||||
if matching and matching[0].get("has_face", False):
|
||||
face_info = " 👥✅"
|
||||
elif matching:
|
||||
face_info = " 👥❌"
|
||||
|
||||
self.segment_listbox.insert(
|
||||
tk.END,
|
||||
f"[{i:4d}] {speaker:12} | {start:7.2f}s - {end:7.2f}s ({duration:5.2f}s){face_info}",
|
||||
)
|
||||
|
||||
self.status_label.config(
|
||||
text=f"選擇:{speaker} - {len(self.speaker_segments[speaker])} 個片段"
|
||||
)
|
||||
|
||||
def on_segment_double_click(self, event):
|
||||
"""片段雙擊事件"""
|
||||
self.play_selected()
|
||||
|
||||
def extract_and_play(self, start_sec: float, end_sec: float) -> bool:
|
||||
"""提取並播放音頻"""
|
||||
duration = end_sec - start_sec
|
||||
temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
||||
temp_path = temp_file.name
|
||||
temp_file.close()
|
||||
|
||||
try:
|
||||
# 提取
|
||||
cmd = [
|
||||
"ffmpeg",
|
||||
"-y",
|
||||
"-loglevel",
|
||||
"quiet",
|
||||
"-i",
|
||||
self.audio_path,
|
||||
"-ss",
|
||||
str(start_sec),
|
||||
"-t",
|
||||
str(duration),
|
||||
"-acodec",
|
||||
"pcm_s16le",
|
||||
"-ar",
|
||||
"16000",
|
||||
"-ac",
|
||||
"1",
|
||||
temp_path,
|
||||
]
|
||||
|
||||
result = subprocess.run(cmd, capture_output=True)
|
||||
if result.returncode != 0:
|
||||
return False
|
||||
|
||||
# 播放
|
||||
if os.path.exists("/usr/bin/afplay"):
|
||||
subprocess.run(["afplay", temp_path], capture_output=True)
|
||||
elif os.path.exists("/usr/bin/aplay"):
|
||||
subprocess.run(["aplay", temp_path], capture_output=True)
|
||||
else:
|
||||
return False
|
||||
|
||||
return True
|
||||
finally:
|
||||
if os.path.exists(temp_path):
|
||||
os.unlink(temp_path)
|
||||
|
||||
def play_segment(self, speaker: str, seg: dict, seg_idx: int, total: int):
|
||||
"""播放單個片段"""
|
||||
if self.stop_flag:
|
||||
return False
|
||||
|
||||
start = seg["start"]
|
||||
end = seg["end"]
|
||||
duration = seg["duration"]
|
||||
|
||||
# 更新 UI
|
||||
self.root.after(
|
||||
0,
|
||||
lambda: self.status_label.config(
|
||||
text=f"▶️ {speaker} [{seg_idx}/{total}] {start:.2f}s - {end:.2f}s"
|
||||
),
|
||||
)
|
||||
|
||||
# 更新進度
|
||||
progress = (seg_idx / total) * 100
|
||||
self.root.after(0, lambda: self.progress_bar.config(value=progress))
|
||||
self.root.after(
|
||||
0, lambda: self.progress_label.config(text=f"{seg_idx}:{total}")
|
||||
)
|
||||
|
||||
# 播放
|
||||
if self.extract_and_play(start, end):
|
||||
return True
|
||||
else:
|
||||
self.root.after(
|
||||
0,
|
||||
lambda: messagebox.showwarning(
|
||||
"警告", f"播放失敗:{speaker} [{seg_idx}]"
|
||||
),
|
||||
)
|
||||
return True
|
||||
|
||||
def play_selected(self):
|
||||
"""播放所選片段"""
|
||||
selection = self.segment_listbox.curselection()
|
||||
if not selection:
|
||||
# 如果沒選擇,播放第一個
|
||||
if self.speakers:
|
||||
speaker = self.speakers[self.current_speaker_idx]
|
||||
segs = self.speaker_segments[speaker]
|
||||
if segs:
|
||||
self.play_all()
|
||||
return
|
||||
|
||||
# 播放所選
|
||||
seg_idx = selection[0]
|
||||
speaker = self.speakers[self.current_speaker_idx]
|
||||
seg = self.speaker_segments[speaker][seg_idx]
|
||||
|
||||
self.is_playing = True
|
||||
self.stop_flag = False
|
||||
self.play_button.config(state=tk.DISABLED)
|
||||
self.stop_button.config(state=tk.NORMAL)
|
||||
|
||||
# 在後台線程播放
|
||||
def play_thread():
|
||||
success = self.play_segment(speaker, seg, seg_idx + 1, 1)
|
||||
self.root.after(0, lambda: self.on_play_done())
|
||||
|
||||
thread = threading.Thread(target=play_thread, daemon=True)
|
||||
thread.start()
|
||||
|
||||
def play_all(self):
    """Play every segment of the currently selected speaker in order.

    Does nothing when there are no speakers or the current speaker has no
    segments. Otherwise marks playback as active, disables both play
    buttons, enables the stop button, and starts a daemon thread that plays
    the segments one after another until the list ends or ``stop_flag`` is
    set.
    """
    if not self.speakers:
        return
    speaker = self.speakers[self.current_speaker_idx]
    queue = self.speaker_segments[speaker]
    if not queue:
        return

    self.is_playing = True
    self.stop_flag = False
    for button in (self.play_button, self.play_all_button):
        button.config(state=tk.DISABLED)
    self.stop_button.config(state=tk.NORMAL)

    def play_thread():
        position = 0
        for seg in queue:
            position += 1
            if self.stop_flag:
                break
            self.play_segment(speaker, seg, position, len(queue))
            time.sleep(0.3)  # short gap between segments
        self.root.after(0, lambda: self.on_play_done())

    worker = threading.Thread(target=play_thread, daemon=True)
    worker.start()
|
||||
|
||||
def stop_playing(self):
    """Stop playback: clear ``is_playing``, raise ``stop_flag``, then run
    ``on_play_done()`` to reset the controls."""
    self.is_playing = False
    self.stop_flag = True
    self.on_play_done()
|
||||
|
||||
def on_play_done(self):
    """Reset the playback controls and report how playback ended.

    Re-enables both play buttons, disables the stop button, zeroes the
    progress widgets and sets the status line to "stopped" or "finished"
    depending on ``stop_flag``.

    ``stop_flag`` is read but deliberately not cleared here: play_selected()
    and play_all() clear it when a new playback starts, and clearing it in
    this method would hide a stop request from the loop in play_all() when
    this method is reached through stop_playing().
    """
    stopped = self.stop_flag
    self.is_playing = False

    self.play_button.config(state=tk.NORMAL)
    self.play_all_button.config(state=tk.NORMAL)
    self.stop_button.config(state=tk.DISABLED)
    self.progress_bar.config(value=0)
    self.progress_label.config(text="0:00 / 0:00")

    self.status_label.config(text="⏹️ 已停止" if stopped else "✅ 播放完成")
|
||||
|
||||
|
||||
def main():
    """Start the speaker player GUI, or print install hints when tkinter is
    unavailable (``HAS_TKINTER`` is false)."""
    if HAS_TKINTER:
        root = tk.Tk()
        app = SpeakerPlayerGUI(root)  # name kept so the GUI object stays referenced
        root.mainloop()
        return

    for line in (
        "❌ tkinter 未安裝",
        "請使用以下命令安裝:",
        " brew install python-tk@3.9",
    ):
        print(line)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,267 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Interactive Speaker Audio Player - 交互式說話人語音播放器
|
||||
可以選擇播放哪個說話人的哪些片段
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
def load_asrx_result(result_path: str) -> Dict:
    """Read the UTF-8 JSON file at *result_path* and return the decoded object."""
    raw = Path(result_path).read_text(encoding="utf-8")
    return json.loads(raw)
|
||||
|
||||
|
||||
def extract_and_play(audio_path: str, start_sec: float, end_sec: float) -> bool:
    """Cut ``[start_sec, end_sec)`` out of *audio_path* and play it.

    The slice is written by ffmpeg to a temporary 16 kHz mono 16-bit PCM WAV
    file, played with ``afplay`` or ``aplay`` (whichever is found on PATH
    first), and the temporary file is removed afterwards.

    Returns:
        True when extraction succeeded and a player was run; False when the
        range is empty or inverted, ffmpeg exited non-zero, or no player is
        available. The player's own exit status is not inspected.
    """
    import shutil  # local import: not part of this file's visible import block

    duration = end_sec - start_sec
    if duration <= 0:
        # Nothing to cut; skip spawning ffmpeg for an empty range.
        return False

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        cmd = [
            "ffmpeg",
            "-y",
            "-loglevel",
            "quiet",
            "-i",
            audio_path,
            "-ss",
            str(start_sec),
            "-t",
            str(duration),
            "-acodec",
            "pcm_s16le",
            "-ar",
            "16000",
            "-ac",
            "1",
            temp_path,
        ]
        if subprocess.run(cmd, capture_output=True).returncode != 0:
            return False

        # Resolve the player through PATH rather than assuming /usr/bin,
        # so the existence check and the executed binary always agree.
        player = shutil.which("afplay") or shutil.which("aplay")
        if player is None:
            print(" ⚠️ No audio player found")
            return False

        subprocess.run([player, temp_path], capture_output=True)
        return True
    finally:
        if os.path.exists(temp_path):
            os.unlink(temp_path)
|
||||
|
||||
|
||||
def show_menu(speaker_segments: Dict[str, List[Dict]], speaker_id: str):
    """Print the summary, the first 20 segments and the command help for
    *speaker_id*."""
    rule = "=" * 70
    segs = speaker_segments[speaker_id]

    total_duration = 0
    for seg in segs:
        total_duration += seg["duration"]

    print(f"\n{rule}")
    print(f"🔊 {speaker_id}")
    print(rule)
    print(f" Segments: {len(segs)}")
    print(
        f" Total duration: {total_duration / 60:.1f} minutes ({total_duration:.1f}s)"
    )
    print(rule)

    # Only the first 20 segments are listed; the rest are summarized.
    for number, seg in enumerate(segs[:20], 1):
        print(
            " [{:3d}] {:12} | {:7.2f}s - {:7.2f}s ({:5.2f}s)".format(
                number, speaker_id, seg["start"], seg["end"], seg["duration"]
            )
        )

    hidden = len(segs) - 20
    if hidden > 0:
        print(f" ... and {hidden} more segments")

    help_lines = (
        f"\n{rule}",
        "Commands:",
        f" [1-{min(20, len(segs))}] Play specific segment",
        " all Play all segments (may take a while)",
        " first N Play first N segments",
        " next Next speaker",
        " prev Previous speaker",
        " list List all speakers",
        " quit Exit",
        rule,
    )
    for line in help_lines:
        print(line)
|
||||
|
||||
|
||||
def _play_segment_run(audio_path, speaker, segs, shown_total):
    """Play *segs* in order, printing one progress line and a result mark per segment."""
    print("=" * 70)
    for i, seg in enumerate(segs, 1):
        print(
            f" [{i:3d}/{shown_total}] {speaker} | "
            + f"{seg['start']:7.2f}s - {seg['end']:7.2f}s ({seg['duration']:5.2f}s)",
            end="",
            flush=True,
        )
        if extract_and_play(audio_path, seg["start"], seg["end"]):
            print(" ✅")
        else:
            print(" ❌")
    print("=" * 70)


def interactive_player(audio_path: str, result_path: str):
    """Run the command-line loop for auditioning speaker segments.

    Loads the ASRX result at *result_path*, groups its segments by the
    ``"speaker"`` key (each group sorted by ``"start"``), orders speakers
    by total ``"duration"`` descending, then reads commands until the user
    quits or input ends.

    Commands: a segment number, ``all``, ``first N``, ``next``, ``prev``,
    ``list``, and ``quit`` / ``exit`` / ``q``.

    Args:
        audio_path: Audio file the segments are cut from.
        result_path: Path of the ASRX result JSON.
    """
    result = load_asrx_result(result_path)
    total_duration = result.get("total_duration", 0)

    # Group by speaker, then order each group chronologically.
    speaker_segments = {}
    for seg in result.get("segments", []):
        speaker_segments.setdefault(seg["speaker"], []).append(seg)
    for segs in speaker_segments.values():
        segs.sort(key=lambda s: s["start"])

    # Speaker with the most speech time comes first.
    speakers = sorted(
        speaker_segments,
        key=lambda s: sum(seg["duration"] for seg in speaker_segments[s]),
        reverse=True,
    )

    print("\n🎬 Speaker Audio Player")
    print(f"📁 Audio: {audio_path}")
    print(f"📊 Speakers: {len(speakers)}")
    print(f"{'=' * 70}")

    if not speakers:
        # Without this guard the loop below would index an empty list.
        print(" No speaker segments found in result")
        return

    current_speaker_idx = 0

    while True:
        current_speaker = speakers[current_speaker_idx]
        current_segs = speaker_segments[current_speaker]
        show_menu(speaker_segments, current_speaker)

        try:
            cmd = input(f"\n▶️ {current_speaker} > ").strip().lower()
        except (EOFError, KeyboardInterrupt):
            print("\n\nExiting...")
            break

        if not cmd:
            continue

        if cmd.isdigit():
            idx = int(cmd) - 1
            if 0 <= idx < len(current_segs):
                seg = current_segs[idx]
                print(f"\n 🔊 {current_speaker} - Segment {idx + 1}")
                print(
                    f" ⏱️ {seg['start']:.2f}s - {seg['end']:.2f}s ({seg['duration']:.2f}s)"
                )
                print(" ▶️ Playing...", end="", flush=True)
                if extract_and_play(audio_path, seg["start"], seg["end"]):
                    print(" ✅ Done")
                else:
                    print(" ❌ Failed")
            else:
                print(f" Invalid segment number (1-{len(current_segs)})")

        elif cmd == "all":
            print(
                f"\n 🔊 {current_speaker} - Playing all {len(current_segs)} segments..."
            )
            _play_segment_run(
                audio_path, current_speaker, current_segs, len(current_segs)
            )

        elif cmd.startswith("first "):
            # Only the parsing is guarded, so an error raised during
            # playback is not misreported as a usage mistake.
            try:
                n = int(cmd.split()[1])
            except (IndexError, ValueError):
                print(" Usage: first N")
                continue
            if n < 1:
                # A negative N would slice from the end of the list.
                print(" Usage: first N")
                continue
            chosen = current_segs[:n]
            # Report the number actually played when fewer than N exist.
            print(
                f"\n 🔊 {current_speaker} - Playing first {len(chosen)} segments..."
            )
            _play_segment_run(audio_path, current_speaker, chosen, len(chosen))

        elif cmd == "next":
            current_speaker_idx = (current_speaker_idx + 1) % len(speakers)

        elif cmd == "prev":
            current_speaker_idx = (current_speaker_idx - 1) % len(speakers)

        elif cmd == "list":
            print(f"\n{'=' * 70}")
            print("📢 All speakers:")
            print(f"{'=' * 70}")
            for i, speaker in enumerate(speakers, 1):
                segs = speaker_segments[speaker]
                total_dur = sum(seg["duration"] for seg in segs)
                pct = total_dur / total_duration * 100 if total_duration > 0 else 0
                print(
                    f" {i:2d}. 🔊 {speaker:12} | {len(segs):4d} segments, "
                    + f"{total_dur:7.1f}s ({pct:5.1f}%)"
                )
            print(f"{'=' * 70}")
            print(f" Current: 🔊 {speakers[current_speaker_idx]}")
            print(f"{'=' * 70}")

        elif cmd in ("quit", "exit", "q"):
            print("\nExiting...")
            break

        else:
            print(f" Unknown command: {cmd}")
|
||||
|
||||
|
||||
def main():
    """Parse the two positional paths, check that both exist, and start the
    interactive player."""
    import argparse

    parser = argparse.ArgumentParser(description="Interactive Speaker Audio Player")
    parser.add_argument("audio_path", help="原始音頻文件路徑")
    parser.add_argument("result_path", help="ASRX 結果 JSON 路徑")
    args = parser.parse_args()

    # Check in order: audio first, then result; stop at the first missing one.
    required = (("Audio", args.audio_path), ("Result", args.result_path))
    for label, location in required:
        if not Path(location).exists():
            print(f"Error: {label} file not found: {location}")
            return

    interactive_player(args.audio_path, args.result_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,164 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
GUI Face Player 自動化測試腳本
|
||||
測試所有功能並生成測試報告
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def check_file_exists(path, description):
    """Print a status line with the file size in MB and return whether *path* exists."""
    target = Path(path)
    exists = target.exists()
    if exists:
        status = "✅"
        size = target.stat().st_size / 1024 / 1024
    else:
        status = "❌"
        size = 0
    print(f"{status} {description}: {path} ({size:.1f} MB)")
    return exists
|
||||
|
||||
|
||||
def check_process_running(pattern):
    """Run ``pgrep -f pattern``, print the outcome, and return True when it
    exits with status 0."""
    proc = subprocess.run(['pgrep', '-f', pattern], capture_output=True, text=True)
    running = proc.returncode == 0
    if running:
        status, state = "✅", '運行中'
    else:
        status, state = "❌", '未運行'
    print(f"{status} 進程:{pattern} ({state})")
    return running
|
||||
|
||||
|
||||
def test_json_structure(path, required_keys, description):
    """Check that the JSON document at *path* contains every key in
    *required_keys*.

    Prints one result line and returns True when nothing is missing. Any
    exception raised while reading, decoding or probing the document is
    printed and turned into False.
    """
    try:
        with open(path, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)
        absent = []
        for key in required_keys:
            if key not in payload:
                absent.append(key)
    except Exception as e:
        print(f"❌ {description}: {e}")
        return False

    if absent:
        print(f"❌ {description}: 缺少鍵 {absent}")
        return False
    print(f"✅ {description}: 結構正確")
    return True
|
||||
|
||||
|
||||
def test_integration_script():
    """Run the face/ASRX integration script on the fixed /tmp fixtures and
    pass when its stdout contains the literal ``99.8%``."""
    rule = "=" * 70
    print("\n" + rule)
    print("測試整合腳本")
    print(rule)

    script_args = ['/tmp/face_long.json', '/tmp/asrx_charade_optimized.json']
    options = ['--threshold', '3.0', '--stats']
    cmd = ['python3', 'integrate_face_asrx_speaker.py'] + script_args + options

    completed = subprocess.run(cmd, capture_output=True, text=True, timeout=120)

    # The expected match rate is matched as a plain substring of stdout.
    if '99.8%' not in completed.stdout:
        print("❌ 整合腳本:匹配率異常")
        print(completed.stdout)
        return False

    print("✅ 整合腳本:匹配率正確 (99.8%)")
    return True
|
||||
|
||||
|
||||
def test_gui_startup():
    """Pass when a process matching ``speaker_player_gui_face`` is running."""
    rule = "=" * 70
    print("\n" + rule)
    print("測試 GUI 啟動")
    print(rule)

    running = check_process_running('speaker_player_gui_face')

    if not running:
        print("❌ GUI 進程:未運行")
        return False
    print("✅ GUI 進程:正常運行")
    return True
|
||||
|
||||
|
||||
def main():
    """Run every check in sequence, print a summary, and return True only
    when all of them passed."""
    rule = "=" * 70

    def section(title):
        # Banner preceded by a blank line, used for every section but the first.
        print("\n" + rule)
        print(title)
        print(rule)

    print(rule)
    print("GUI Face Player 自動化測試")
    print(rule)

    # File presence: every check runs even after an earlier failure.
    section("測試文件")
    file_results = [
        check_file_exists('/tmp/charade_audio.wav', '音頻文件'),
        check_file_exists('/tmp/asrx_charade_optimized.json', 'ASRX 結果'),
        check_file_exists('/tmp/face_long.json', 'Face 結果'),
        check_file_exists('/tmp/charade_integrated.json', '整合結果'),
    ]
    files_ok = all(file_results)

    # JSON structure: likewise evaluated eagerly.
    section("測試 JSON 結構")
    json_results = [
        test_json_structure(
            '/tmp/asrx_charade_optimized.json',
            ['segments', 'n_speakers'],
            'ASRX 結果'
        ),
        test_json_structure(
            '/tmp/face_long.json',
            ['frames', 'frame_count'],
            'Face 結果'
        ),
        test_json_structure(
            '/tmp/charade_integrated.json',
            ['integrated_segments', 'speaker_stats'],
            '整合結果'
        ),
    ]
    json_ok = all(json_results)

    integration_ok = test_integration_script()
    gui_ok = test_gui_startup()

    section("測試總結")
    all_ok = files_ok and json_ok and integration_ok and gui_ok

    if all_ok:
        print("✅ 所有測試通過!")
    else:
        print("❌ 部分測試失敗")
        failures = (
            (files_ok, " - 文件測試失敗"),
            (json_ok, " - JSON 結構測試失敗"),
            (integration_ok, " - 整合腳本測試失敗"),
            (gui_ok, " - GUI 啟動測試失敗"),
        )
        for passed, message in failures:
            if not passed:
                print(message)

    print("\n" + rule)

    return all_ok
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
success = main()
|
||||
exit(0 if success else 1)
|
||||
@@ -1,240 +0,0 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
長影片(Charade 1963,114 分鐘)完整測試腳本
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def print_header(title):
    """Print *title* between two 70-character ``=`` rules, preceded by a blank line."""
    rule = "=" * 70
    print("\n" + rule, f" {title}", rule, sep="\n")
|
||||
|
||||
|
||||
def test_data_files():
    """Report the size of each fixed /tmp data file and return True only
    when all of them exist."""
    print_header("1. 數據文件測試")

    files = {
        '音頻文件': '/tmp/charade_audio.wav',
        'ASRX 結果': '/tmp/asrx_charade_optimized.json',
        'Face 結果': '/tmp/face_long.json',
        '整合結果': '/tmp/charade_integrated.json'
    }

    all_ok = True
    for name in files:
        target = Path(files[name])
        exists = target.exists()
        if exists:
            size, status = target.stat().st_size / 1024 / 1024, "✅"
        else:
            size, status = 0, "❌"
            all_ok = False
        print(f"{status} {name}: {size:.1f} MB")

    return all_ok
|
||||
|
||||
|
||||
def test_asrx_results():
    """Summarize the ASRX result JSON and apply a basic sanity threshold.

    Reads ``/tmp/asrx_charade_optimized.json``, prints the totals and a
    per-speaker breakdown ordered by speaking time (longest first).

    Returns:
        True when the file reports at least 2 speakers and more than 100
        speech segments.
    """
    print_header("2. ASRX 結果測試")

    with open('/tmp/asrx_charade_optimized.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    total_duration = data.get('total_duration', 0)
    n_speakers = data.get('n_speakers', 0)
    n_segments = data.get('n_speech_segments', 0)

    print(f"📊 影片時長:{total_duration/60:.1f} 分鐘 ({total_duration:.1f}秒)")
    print(f" 說話人數量:{n_speakers}")
    print(f"📊 語音片段:{n_segments}")

    print("\n📢 說話人分佈:")
    speaker_stats = data.get('speaker_stats', {})
    # The sort key uses .get like the loop body does, so an entry without
    # 'duration' is ranked as 0 instead of raising KeyError.
    ranked = sorted(
        speaker_stats.items(),
        key=lambda item: item[1].get('duration', 0),
        reverse=True,
    )
    for speaker, stats in ranked:
        duration = stats.get('duration', 0)
        count = stats.get('count', 0)
        pct = duration / total_duration * 100 if total_duration > 0 else 0
        print(f" {speaker}: {count} 片段,{duration/60:.1f}分鐘 ({pct:.1f}%)")

    return n_speakers >= 2 and n_segments > 100
|
||||
|
||||
|
||||
def test_face_results():
    """Summarize the face-detection result JSON.

    Reads ``/tmp/face_long.json`` and prints the ``frame_count`` value, the
    number of entries under ``frames``, the ``fps`` value and the ratio of
    the two counts as a percentage.

    Returns:
        True when the ``frames`` list is non-empty.
    """
    print_header("3. Face 結果測試")

    with open('/tmp/face_long.json', 'r', encoding='utf-8') as f:
        data = json.load(f)

    total_frames = data.get('frame_count', 0)
    detected_frames = data.get('frames', [])
    fps = data.get('fps', 0)

    # Guard the ratio the same way the sibling tests guard theirs: a missing
    # or zero frame_count must not abort the whole report.
    detection_rate = (
        len(detected_frames) / total_frames * 100 if total_frames > 0 else 0
    )

    print(f"📊 總數:{total_frames:,}")
    print(f"📊 檢測到人臉:{len(detected_frames):,}")
    print(f"📊 FPS: {fps:.2f}")
    print(f"📊 檢測率:{detection_rate:.2f}%")

    return len(detected_frames) > 0
|
||||
|
||||
|
||||
def test_integration():
    """Report how many integrated segments carry a face and pass when the
    overall match rate is at least 95 %."""
    print_header("4. Face + ASRX 整合測試")

    with open('/tmp/charade_integrated.json', 'r', encoding='utf-8') as fh:
        data = json.load(fh)

    segments = data.get('integrated_segments', [])
    total = len(segments)
    with_face = len([seg for seg in segments if seg.get('has_face', False)])
    if total > 0:
        match_rate = with_face / total * 100
    else:
        match_rate = 0

    print(f"📊 總片段:{total}")
    print(f"📊 有人臉:{with_face}")
    print(f"📊 匹配率:{match_rate:.2f}%")

    # Per-speaker breakdown, in speaker-name order.
    print("\n📢 說話人匹配詳情:")
    speaker_stats = data.get('speaker_stats', {})
    for speaker in sorted(speaker_stats):
        stats = speaker_stats[speaker]
        total_seg = stats.get('total_segments', 0)
        with_face_seg = stats.get('with_face', 0)
        rate = with_face_seg / total_seg * 100 if total_seg > 0 else 0
        if rate >= 99:
            status = "✅"
        elif rate >= 50:
            status = "⚠️"
        else:
            status = "❌"
        print(f" {status} {speaker}: {with_face_seg}/{total_seg} ({rate:.1f}%)")

    return match_rate >= 95
|
||||
|
||||
|
||||
def test_gui_process():
    """Check via ``pgrep`` whether the GUI process is running and, if so,
    print its CPU and memory columns from ``ps aux``."""
    print_header("5. GUI 進程測試")

    found = subprocess.run(['pgrep', '-f', 'speaker_player_gui_face'],
                           capture_output=True, text=True)
    running = found.returncode == 0

    if not running:
        print("❌ GUI 進程未運行")
        return running

    pid = found.stdout.strip()
    print(f"✅ GUI 進程運行中 (PID: {pid})")

    # Resource usage: columns 3 and 4 of the matching `ps aux` rows.
    listing = subprocess.run(['ps', 'aux'], capture_output=True, text=True)
    for row in listing.stdout.split('\n'):
        if 'speaker_player_gui_face' not in row or 'grep' in row:
            continue
        fields = row.split()
        if len(fields) < 8:
            continue
        cpu, mem = fields[2], fields[3]
        print(f" CPU: {cpu}%, 記憶體:{mem}%")

    return running
|
||||
|
||||
|
||||
def test_playback():
    """Check that ffmpeg and afplay are on PATH and that the first ASRX
    segment can be extracted from the test audio with ffmpeg."""
    print_header("6. 播放功能測試")

    def tool_available(tool):
        # `which` exits 0 when the tool is found on PATH.
        found = subprocess.run(['which', tool], capture_output=True, text=True)
        ok = found.returncode == 0
        mark = '✅' if ok else '❌'
        word = '可用' if ok else '不可用'
        print(f"{mark} {tool}: {word}")
        return ok

    ffmpeg_ok = tool_available('ffmpeg')
    afplay_ok = tool_available('afplay')

    with open('/tmp/asrx_charade_optimized.json', 'r', encoding='utf-8') as fh:
        asrx_data = json.load(fh)

    first_seg = asrx_data['segments'][0]
    start, end = first_seg['start'], first_seg['end']
    duration = end - start

    print("\n🎵 測試提取第一個片段:")
    print(f" 時間:{start:.2f}s - {end:.2f}s ({duration:.2f}s)")

    # Extract the segment for real, to a fixed scratch path.
    temp_file = '/tmp/test_segment.wav'
    cmd = ['ffmpeg', '-y', '-loglevel', 'quiet']
    cmd += ['-i', '/tmp/charade_audio.wav']
    cmd += ['-ss', str(start), '-t', str(duration)]
    cmd.append(temp_file)

    completed = subprocess.run(cmd, capture_output=True)
    extract_ok = completed.returncode == 0 and Path(temp_file).exists()

    mark = '✅' if extract_ok else '❌'
    word = '成功' if extract_ok else '失敗'
    print(f"{mark} 音頻提取: {word}")

    if extract_ok:
        size = Path(temp_file).stat().st_size / 1024
        print(f" 文件大小:{size:.1f} KB")
        Path(temp_file).unlink()  # remove the scratch file

    return ffmpeg_ok and afplay_ok and extract_ok
|
||||
|
||||
|
||||
def generate_report():
    """Run all six checks in order, print a summary, write a Markdown report
    to /tmp, and return True when every check passed."""
    print_header("測試報告")

    plan = (
        ("數據文件", test_data_files),
        ("ASRX 結果", test_asrx_results),
        ("Face 結果", test_face_results),
        ("整合結果", test_integration),
        ("GUI 進程", test_gui_process),
        ("播放功能", test_playback),
    )
    # Each check runs exactly once, in the listed order.
    tests = [(name, check()) for name, check in plan]

    passed = len([outcome for _, outcome in tests if outcome])
    total = len(tests)

    rule = "=" * 70
    print("\n" + rule)
    print(f" 測試總結:{passed}/{total} 通過")
    print(rule)

    for name, outcome in tests:
        mark = "✅" if outcome else "❌"
        print(f"{mark} {name}")

    if passed == total:
        print("\n🎉 所有測試通過!")
    else:
        print(f"\n⚠️ {total - passed} 個測試失敗")

    # Persist the same summary as Markdown.
    report_path = '/tmp/long_movie_test_report.md'
    lines = [
        "# 長影片測試報告\n\n",
        f"**測試時間**: {datetime.now().isoformat()}\n",
        "**測試影片**: Charade 1963 (114.7 分鐘)\n\n",
        "## 結果\n\n",
        f"**通過**: {passed}/{total}\n\n",
    ]
    for name, outcome in tests:
        mark = "✅" if outcome else "❌"
        lines.append(f"- {mark} {name}\n")
    with open(report_path, 'w', encoding='utf-8') as fh:
        fh.writelines(lines)

    print(f"\n📄 報告已保存:{report_path}")

    return passed == total
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
success = generate_report()
|
||||
exit(0 if success else 1)
|
||||
@@ -126,6 +126,52 @@ def extract_speech_audio(audio_path, model, utils, output_dir=None):
|
||||
return speech_audios, speech_segments
|
||||
|
||||
|
||||
def scan_within_segment(wav, sample_rate, start_sec, end_sec, model, utils,
                        min_speech_duration_ms=500, min_silence_duration_ms=300):
    """Run VAD inside one time range and return the speech sub-segments.

    Intended for splitting a coarse time span (e.g. one given by whisper)
    at the pauses between sentences.

    Args:
        wav: Full audio waveform; only slicing and ``len()`` are used here.
        sample_rate: Samples per second of *wav*.
        start_sec: Start of the range in seconds; negative values are
            clamped to 0.
        end_sec: End of the range in seconds.
        model: VAD model, passed through to the timestamp function.
        utils: Sequence whose first item is the ``get_speech_timestamps``
            callable.
        min_speech_duration_ms: Forwarded to the timestamp function.
        min_silence_duration_ms: Forwarded to the timestamp function.

    Returns:
        ``[(start_sec, end_sec), ...]`` on the original time axis; an empty
        list when the range selects no samples.
    """
    get_speech_timestamps = utils[0]

    # A negative start would turn into a negative slice index and select
    # samples counted from the end of the waveform.
    start_sec = max(0, start_sec)
    start_sample = int(start_sec * sample_rate)
    end_sample = int(end_sec * sample_rate)
    segment_wav = wav[start_sample:end_sample]

    if len(segment_wav) == 0:
        # Range is empty, inverted, or entirely past the end of the audio.
        return []

    speech_ts = get_speech_timestamps(
        segment_wav,
        model,
        sampling_rate=sample_rate,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        return_seconds=True,
    )

    # Timestamps are relative to the slice; shift them back to the full timeline.
    return [
        (ts["start"] + start_sec, ts["end"] + start_sec)
        for ts in speech_ts
    ]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# 測試 VAD
|
||||
import sys
|
||||
|
||||
35
scripts/asrx_self/whisper_local.py
Normal file
35
scripts/asrx_self/whisper_local.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""
|
||||
Whisper Local - uses faster-whisper for per-segment transcription
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def load_model(size="small"):
    """Create a faster-whisper ``WhisperModel`` of the given *size*, on CPU
    with ``int8`` compute type.

    The import is done at call time rather than at module import.
    """
    import faster_whisper

    return faster_whisper.WhisperModel(size, device="cpu", compute_type="int8")
|
||||
|
||||
|
||||
def transcribe_segment(wav, sample_rate, start_sec, end_sec, model):
    """Transcribe the ``[start_sec, end_sec)`` slice of *wav*.

    Args:
        wav: Full audio waveform; only slicing and ``len()`` are used here.
        sample_rate: Samples per second of *wav*.
        start_sec: Start of the slice in seconds; negative values are
            clamped to the beginning of the audio.
        end_sec: End of the slice in seconds; clamped to the audio length.
        model: Object whose ``transcribe(audio, language=None)`` returns
            ``(segments_iterable, info)``.

    Returns:
        Dict with ``text`` (segment texts joined by spaces, stripped),
        ``language``, ``lang_prob`` and ``segments`` (the list of segment
        objects yielded by the model). All fields are empty when the slice
        selects no samples; the model is not called in that case.
    """
    start_sample = max(0, int(start_sec * sample_rate))
    end_sample = min(int(end_sec * sample_rate), len(wav))
    if start_sample >= end_sample:
        # Covers a start past the end of the audio as well as a
        # zero-length or inverted range.
        return {"text": "", "language": "", "lang_prob": 0.0, "segments": []}

    segment_wav = wav[start_sample:end_sample]

    # language=None leaves language selection to the model.
    segments_generator, info = model.transcribe(segment_wav, language=None)

    # Materialize once: the segments are both joined and returned.
    segs = list(segments_generator)

    return {
        "text": " ".join(seg.text for seg in segs).strip(),
        "language": info.language if info else "",
        "lang_prob": info.language_probability if info else 0.0,
        "segments": segs,
    }
|
||||
@@ -1,346 +1,293 @@
|
||||
2bfe6a1c1263f35916d4a28981814515fc40cb473f7bbc801f84842904c888f6 ./add_yolo_to_chunks.py
|
||||
f61f7126698018b346c8bafc45501708c17e3b45d9db54be5f0109afeee63176 ./age_benchmark.py
|
||||
8efb13239db2a25a728abbdebd92affe685b69402a277cceb0d76e62ed9451ac ./analyze_asr_lip.py
|
||||
432b3e3b30578e71ef973aca758bd1964102cbbb19530620df8ac02df00eefb8 ./analyze_video_faces.py
|
||||
732609ef1882e14dc7ed60488697f6ae7e2607ec90b240a86ea9e585f052b9be ./apply_asr_corrections.py
|
||||
790bd25424e93ca5a0743ea1a740a9a70f6ae6f8a9ca411012eb1e9b03907eb4 ./asr_benchmark_runner.py
|
||||
18744dc3bebdce0d89ea7076b5e43febd35ad3c84064bb52adde4d128d50bc9f ./asr_face_stats.py
|
||||
1577d055328a73561f9ccfaf0c54727532e3dddcd1bf0f33e3c38081415cced8 ./asr_model_benchmark.py
|
||||
fcbb81639f53e9e08bee436853c84d918c0eeac09d985b34634d5ddc00055b61 ./asr_processor_base.py
|
||||
25948a204e45ce844d43606b7e45c9532321d48df44887d261fc886748276b10 ./asr_processor_contract_v1.py
|
||||
e9209cf028a11bdc45514124826374e58458ee06b054cfedffe8013d751735ea ./asr_processor_contract_v2.py
|
||||
407dd0ec772027e0df27af0b66ea8130cb390595ccdeca4350e7bdc210acee6c ./asr_processor_debug.py
|
||||
dcee1b80071b47c974bcffe3d27ec2f2269f4b8de7e7409ceaec7e6f271d31aa ./asr_processor_legacy_v2.py
|
||||
10728a05a6ff2d56a70bb831abb51e05b03309e45bc5fa068c5a0702a4c73769 ./asr_processor_legacy.py
|
||||
9106bfe07de9cfc920f4f4d2f821dc024df612f4c2a8f5f75d35f012d26440f0 ./asr_processor_simplified.py
|
||||
7eabdcf7320302ee65c67e801f3ac7ca5801abc76165faa182348d30a8113e9f ./asr_processor_small_multilingual.py
|
||||
2714f7be88f286635ea8465daf8fa969e6b27d2b2d1f73ac5e98f5e496139cad ./asr_processor_small.py
|
||||
1089ff10b9b0a9f528cac79580aec25e33f8eeea485ac44b6aaf8c7c0cab5b42 ./asr_processor_v2.py
|
||||
e9e622d737990bea8ecc139fa310a7cb4b0ca0309d6783f8105e74f864dfb850 ./asr_processor.py
|
||||
5431b57d4369a841d51a6d6c5e1fb5e6c2932cb97cb4601f5e1b41ffe9f7ecaf ./asr_side_by_side_comparison.py
|
||||
6c11efc3d40e559bfbeadcbf4f51eb353b744cc4f765bd8abc472a701e3f33cb ./asrx_processor_contract_v1.py
|
||||
93501463af84d6541405057da3783d40492aec5e536b4210dcaffe460cdb5503 ./asrx_processor_custom.py
|
||||
6adfbee842d134b9d180e2d1104694ed5cdc1fa4febcd0c502801b8f87b3ce66 ./asrx_processor_simplified.py
|
||||
60fc3465f9c461583f8d0b888e85b3a6e04e1f252a1e1c21d036b52e1ce4b43c ./asrx_processor_v2_noalign.py
|
||||
82d65b71bd86874e484870c40214d3fbd9343c39d5d635896fb4d257d13a410f ./asrx_processor_v2_transcribe.py
|
||||
5a0c9905a2e10c847aa74f108e4054de4704bbafb2004589db15bf33833ea3c7 ./asrx_processor_v2.py
|
||||
b16b00cf9e5de96abc512022af9bb81196405b10988f5a39dfd3a9b6471f1155 ./asrx_processor.py
|
||||
d570fbe89bf84c50f180e8f3ec26c30092e07e3fa4883fb83a644670c13b8588 ./asrx_self/__init__.py
|
||||
3b7a788e5fe2fa1a7518bf2a639ccd09b304b264b952c88a3e6612aba30faef5 ./asrx_self/integrate_face_asrx_speaker.py
|
||||
1fe4b9ac1d04c2f2ef5361d8325cf9333e434b126be6a53a4c0d40a04f32a34d ./asrx_self/main_fixed.py
|
||||
e4a2894bd4207f6d034c86e1d232001e2e0f9e65856c89d84d8a038473a5e50b ./asrx_self/main.py
|
||||
46f61075b403729e4ff9bf0b05367b5319acf5d8c696a0517033699dcba36276 ./asrx_self/speaker_audio_player.py
|
||||
2a072521662906e5ca84ec54cb1963930a1c795f8d64906b66e889c0f442198b ./asrx_self/speaker_cluster_fixed.py
|
||||
db4ddc98d563bf4a8c34fcd1fe40edd34fab63fa8c293644a8a40ae87be521dd ./asrx_self/speaker_cluster.py
|
||||
a50d0ae549b733532f940332e4656a4dcf0623703240eb74832524eedf54f888 ./asrx_self/speaker_encoder.py
|
||||
42f325168e1f6edd514eb00321f18ce581f7b61d18c50798271c3da8410cb248 ./asrx_self/speaker_player_gui_face.py
|
||||
54a847a8862e2f7400c4d8425f4bebaeb230fd50932933734819fbb6729bb560 ./asrx_self/speaker_player_gui.py
|
||||
43508b714f2f1aa8bacdb9c4f52152f3fd14f6c2e2529460e5b24b29846c8c37 ./asrx_self/speaker_player_interactive.py
|
||||
e25e789552fef129bd6f536140ec4deead8e242091ab60ab679b544ff9d43307 ./asrx_self/test_gui_face_player.py
|
||||
788014df1faf7cfa09fbce16781f8bf9da1acef75e8891592b3b4d51b91e93f2 ./asrx_self/test_long_movie.py
|
||||
8bac63ea24cd06b9d398c2650ac396e10db64e33f0686a01bd460e17286e7574 ./asrx_self/vad.py
|
||||
f11b67ada6167540d2f95cb2af93d0e3a0de55bce659745baa37c4aa4805212e ./audio_taxonomy_processor_v2.py
|
||||
ded810b81cda24e31e82de14ba9846770ee2b18d84d52b9d570de5877e9e2513 ./audio_taxonomy_processor.py
|
||||
f7c53be5a031a8bff15c3165543586529932d81c4312521654d132b1f0ed6bc3 ./auto_identify_persons.py
|
||||
5497a6f1f7ae267c796a398a9f020ea485aa45f980f2eca932b904ad61ce9b40 ./backfill_demographics.py
|
||||
39a479ca4f8986f3255b0bcd0d9162a1f2ae339bb4dcf081f931ff9b304797a1 ./backfill_frame_data.py
|
||||
308c8e3f3d45ee273504f9f415eaf6c025f06aaf1cca33156a66431ed6e64f43 ./build_semantic_index_poc.py
|
||||
4eb37768edd252d94f0d751f219c317e905bc093f414b2a6350efb8294131138 ./build_semantic_index.py
|
||||
debbd058957d09c2397f3f4c028edaa0a658002921dcca95eae2a20070ba95fb ./caption_processor_contract_v1.py
|
||||
7236cdb5deaeada266cc246ee11380248bb9f2255888c25a152b2f6ab1f981cc ./caption_processor.py
|
||||
e73cbb688dade5c5b6fc4276f0c78b377903ff83f3830b63d8bcdacd8da8aecf ./check_all_stamps.py
|
||||
7ecdbd4b1f94be8ebab9935ea210a868330e7030b6e19c73229c579c1189fd5c ./check_architecture_all.py
|
||||
7179ed1a87241904af29542f9018398f8afd9b9dd89af7bb11909310ab7b49e0 ./check_architecture_docs.py
|
||||
7e6bd7d14582e494baf8b28354bbded3f79b43f0bd271ab33874da55b9086311 ./check_code_document_consistency.py
|
||||
5ffca7c55edafad755e84499981553fcb48ce6056ca7b04130acafb9e6a9b1c3 ./check_frame_112_36.py
|
||||
f49c7b0cfa53b657f69b2ad97a6e18393741cc2151b32c9d7dde2e078b75953f ./check_frame_91_59.py
|
||||
d2cb7475262ee711a4b06e53559f0927242be4a924a56e7fe212225f318f4193 ./chinese_vector_test.py
|
||||
ecde3d3df773916f62de4e34f8d8693feaedf112a3ef9955e22417c8421722bd ./chunk_statistics.py
|
||||
2588ecf27c13020d894e46ba70a76de89f09556b475f555dae59db36da0b90a0 ./clean_sentence_text.py
|
||||
98ab1129032f42fddc020f9b3492d1fc133851d1af33ddeb57e2385d88425af4 ./clip_logo_integration.py
|
||||
bf6f74c09b8f8c7f25c5fffb9c36f16a8afb483a7b65903cfc75e2ea641bdf49 ./compare_asr_content.py
|
||||
1f2caadcded724aa04a929018a35ace53dd79d172f5ee2720308fbd4581b0c6c ./compare_asr_models.py
|
||||
1ed8a9530f40e304b556ff76c7cac40468c86a0cd32ff2a8bc7bf2a69669121d ./compare_models_gun_test.py
|
||||
6bf790fe75a7a2a5220052ca14c31e90a97eabc4558cd5e9059280913862a81e ./compare_search.py
|
||||
875e7a598982c8ad7222a51b7b147e91cd5e1a930f41214b3942107cb932fc5c ./compare_segmentation.py
|
||||
e432b6f2364d5a9aaf207a1de0dca3fb14ab8d118c53ee34306abfe6fd211ba8 ./comprehensive_search_test.py
|
||||
43df85cf860ac28e083de35b511bb2a7b91ed48f596757f52f19487768987500 ./coreml_embed_server.py
|
||||
9149ccc8de5adfec69c6f3f2ec502ae7d5e7844518a228ba587af2e08cb38805 ./crop_opencv_stamp.py
|
||||
fc36ecbb1455d959456945266e193b601a29c4210b4938a3f0d4a9aaf44b5cee ./crop_real_stamps.py
|
||||
34a694624ce94d916b06a847bc4d41e7665985b85e55a626a4bc3a4370c21acf ./crop_stamp_112_36.py
|
||||
27099dc9c8ee52a6949ce18c505089afef1720fe70858b90d0801972c3b43fff ./crop_stamp_closeup.py
|
||||
01b5a3b091ebcffc0c1e2637b7af8192ba597239fa80d152738e3b8cfdf8174d ./crop_stamp.py
|
||||
71b2a362b5395c6e4d70e62766820db92d94eaf140d98eecb2880bcd98d55be9 ./crop_top_candidates.py
|
||||
60f18c5fa03ffbc80c209337cd1c8b6acd0b8471e600119340aa8cdfeef14f5b ./cut_benchmark_runner.py
|
||||
deba86a1645ca5b1acf413dd9edfad77b93ff213897d739a32de1ba629bfce52 ./cut_processor_contract_v1.py
|
||||
01024f947f0326c124293a30e4f2cdb859f21cfb2d4c07f9c1030e2934f7bc44 ./cut_processor.py
|
||||
ff092ad2373b57321f87d1dd123fff8a99c8207057591e8526e56cb1424d47c6 ./dashboard.py
|
||||
f184bf3e546db0253ffb71895e8d42aeb06588c71c4914c2fe656f42ef463c9a ./debug_face_registration.py
|
||||
a9acce1ebd6ea821a8dc5009b8fc40586a98d31c23e93c97fd844bdadbda4ed2 ./deep_analysis_112_36.py
|
||||
7767ee7455a956d14d286ad558c4c312c2ad3ccee1c73adc1bc8f761c96ad72a ./demo_dashboard.py
|
||||
425290c12161c5cfcb0c505a737ba3951656b39e425e792919d4812e15b9b8e3 ./demo_face_learning.py
|
||||
d7e3e27e6a65b1fa62530ee954c227dbb4f97593c5a5dcc48b39e5ebae4656e5 ./dense_scan_traces.py
|
||||
df79b7fc7a03a8e754de5123a23bb33b1d5c23d832adc1886fb846ca517dd24d ./detect_language.py
|
||||
f6f8047e24ebbec81ef27dd38f4242e63385f8ebe5be471cae156b8aa5fc4477 ./detect_objects_keyframes.py
|
||||
e61d2ef5043bda3674a0050d83ba3bc6a70c47f54e456124a736b4328f0c0638 ./detect_stamp_shapes.py
|
||||
f23a382113e9c7de2ec3b24e95160daef48f9336ae6d4ec9ee7a18f4bf529f6d ./download_places365_classes.py
|
||||
a747e5e17960b972549714786bb9e28ea578e10e6c80788e298a0149c970bcc5 ./embed_faces.py
|
||||
f1a2b3820e1a763eba6d8d905a5bb87f5a9b4a2f005e709e313bb7505ba7ddaa ./embeddinggemma_server.py
|
||||
43c540c02c1be992e7d44ab4fc76a759815db3ed5f25bcbb594328b50ed7c73b ./export_file_package.py
|
||||
19d23e4604d5532928412afe4d5d39ff49194ab4a046825286ae1be154326a1f ./export_file.py
|
||||
5f10bab1dcb0b5fad233a74069f9e2f89043e7c848c9c38ae7e2806e6940c75d ./export_identities.py
|
||||
2a1d0a1b853fd2c28f9a404871d33912f93521358576833be0999271bae02bcb ./export_person_thumbnails.py
|
||||
a81bf1d6af78c052e638f5d5677b4edb512d0de5441025d86fd970d3e7993922 ./export_sqlite.py
|
||||
8b5cc0ff437fb4dd0df28b7b20a78469cdca3621e2eeb4b6d46ad2391acb0596 ./extract_female_faces.py
|
||||
bdecbaf0496bf536dce2ef4897f7090749820d15dcca03492d4d736ab0f8c6c5 ./face_benchmark_runner.py
|
||||
22319a38bd684fb235fec681ddc60f45821e4bb2181f2b31fdf945f7ad9a1b85 ./face_clustering_processor.py
|
||||
5adce4e444743331fa592e13d71e52f26554eadb9744d350a7654a449a8fb8a3 ./face_count_comparison.py
|
||||
3574454c74eaf11021f9052f77d93044cca4ae0285d0f2630b4016c2ec0df783 ./face_cross_validate.py
|
||||
4f09b3b66b14a5eefb14fcf915a1ad1e9147010f6ae7671731566679b1cae461 ./face_embedding_extractor.py
|
||||
87f1b69affbac03fbd87331a99cd7c4faba6c72d359ffcfebb62d6ad8f70445b ./face_landmark_qc.py
|
||||
28776dfcc6ac40e9481c25467438745fed60fecdfd4fc19f9f4c7396397591a7 ./face_mediapipe_test.py
|
||||
f4d1b4334a49357b74b80e390ad5a3d16263e51cbe5cab661af92bd2e9721f02 ./face_processor_contract_v1.py
|
||||
802015c73dfce0866f2a0bc94c645aa35ba30a6de78244af23090bb1f1828c6e ./face_processor_mps.py
|
||||
96ffdbde3f4d87e9942f9e1f4c93cbd999dc404b43e00d4cdcbb22de3c0f16b7 ./face_processor_optimized.py
|
||||
17e7d0bd142bddfead94b1dd959c1f41c0dad7063ffc677dff1a99d62aab6cf8 ./face_processor_v1.py
|
||||
d6ddad29a5e53b43b887554072d7965f0535e47fb62dad1a8b87e44fa1be6015 ./face_processor.py
|
||||
8edab61189ad1a8fa60c203077e814e82d46c5bae67054fa2ab1958e199c05f9 ./face_recognition_processor.py
|
||||
9ea19f357b3fcec6c8b3875c538e53cb46e407ab188cd544963e0123e535fa03 ./face_registration.py
|
||||
72648816de611fd9b84d2b98c177b8b4f24374024b69184e8151c06cf44d633b ./face_statistics_report.py
|
||||
499f197a06f50839ebd5350af380fa56506ce08f073ba40c0e863b8e02b34133 ./fast_face_clustering_processor.py
|
||||
0191781635b98d0675969fb87733af19525d7b5c148723346c5378c08a00fe33 ./fast_stamp_search.py
|
||||
00e7e8ed06f6a0f2c46c84a47d7e7f5d366acee941d546a52c4b1b7885c71e08 ./filter_stamp_colors.py
|
||||
5341fd648cffafc77568070313b06417636943d50ff3b4380a61381260acaafa ./final_face_validation.py
|
||||
213793ab719f4ef42ec9b22f351dd86d4739211c17be486a46b76ba7e64fd8f1 ./find_blue_stamp_opencv.py
|
||||
e1490317c0f56b895f73cfbb6f57c8e3ea5c65304bfdd7663f103f6b564e148c ./find_kids_pose.py
|
||||
08d4cba0650f6a22fc134d07fd15fe8784c8472c3ba687b587e31e0b980e2b1c ./find_kids_refined.py
|
||||
aecec0784ce5d0e98176c15798f05d4f67ab6a686f9ffafba71fbd82157027f8 ./find_magnifying_glass.py
|
||||
620db08dd84f00af0c6d744dac54c68360548dd5b2cc26b12ddcefd936239b2e ./find_pink_stamp.py
|
||||
1f4555b3578f4dc6bc08aa37e34eda1d91ea25d8134439771678d1a57bfdaeb9 ./find_realistic_stamp_opencv.py
|
||||
277aa3b48eec2e739de3bb95ef501ffbd24104aa2a1bdef28c844ef44fd75013 ./find_small_stamp_opencv.py
|
||||
fc73bbc9605938db495bd33ea74955e454e9384130531a16d42f25dbd9b515d8 ./find_stamp_in_hands.py
|
||||
c6ed0f12e78c12df977ddca5d699f58edb174b47199f584e7a24dbdc3b7d02b1 ./find_stamp_in_magnifier_scene.py
|
||||
ecf12e346619c27a985452e9f84ee262c2da25de9df0ff6e0b293279ccba559b ./find_stamp_opencv.py
|
||||
4ff93cbcc781a5cff023f78006f1aebbe2d954405ae7d00a473fef6b41b2ebee ./fix_asr_text.py
|
||||
4090cb892115843a909aa41426c0f39c5a53d8d88a5db69499ec8bafcb780d77 ./florence2_scan_stamps.py
|
||||
e90e4447db3328b64a2062ca13ed41f6a045220d8fb640542dff5b790d3c4d3b ./gdino_comparison_test.py
|
||||
7071a9999057c347e2275381f1f0c58e19aa8581d70a572d3170ed14a295a48d ./gdino_frame_api.py
|
||||
891410310b415ff68a0f7ee0aa39e84eef7f2c75887487bdb88b8f4718d40e94 ./generate_asr1.py
|
||||
24efe7db016387b40bd9caae449f0445a3d47eb878c00399803bb6e78e6dd5fc ./generate_benchmark_summary.py
|
||||
dc956a78a3ed26686f45dd6d6d9cb42c023751fcd9b8789585450b6df63670a1 ./generate_chunk_summaries.py
|
||||
8a0922d75fdc7c5994ebfb31881d765db4b105cbcddfcaa4b4c49d11950b8df4 ./generate_chunk_visual_stats.py
|
||||
4860bfd00cc6c1c842c2f8e17e725eebca191d81067af3cb5a28661b45d74bd3 ./generate_parent_chunks_gemma4.py
|
||||
e9fca223a8329ff6bdcb8552fecedb2d8b4607c6516c373c3023f29edfd42e06 ./generate_sentence_summaries.py
|
||||
cbae7c3e85457274e8c284005196c39dc97f9d9200ed6b0e4ea266e48a381d3a ./generate_synonyms_llamacpp.py
|
||||
57512cd7a5ec2f52813717fd3d81dec1aaa69dc9c91a9edbca847e7012b1c86f ./generate_synonyms_ollama.py
|
||||
dc495cb8127858fa03a5f8b8bb4a772c5934ada1abecf97459bf71de80417672 ./gun_detector_scan.py
|
||||
1a7cfb72723b3b94e3f4fe368477ba693ac3d20ac7af7351962bc548c700b451 ./head_shoulder_bench.py
|
||||
b2fe8e4d8d7d1057ba928fc5e190f4a06cb60e83e2a02c5d7c423791596c11b8 ./head_shoulder_quick.py
|
||||
ba5e67a97cb465e6a1a942c2f7342406031759ffcea2b897ae963bee4bc551c4 ./hybrid_stamp_search.py
|
||||
f5847b6c8ed4c7c51290df9032d5a192317b5f03b5ff418ead1181a6e1b655f2 ./identity_agent.py
|
||||
61bea1980af5861a02d6e9b47ac5ad0bd04a4fd633af477d2179b7361ae58c01 ./identity_bind.py
|
||||
046aa90eb4a4b830910912362a9865d1e6170f5bc176fae42be630f967f9d3ff ./import_file_package.py
|
||||
7cc260d4411ab13559803686f8b645afa07738d652d9459830aecac268597fa7 ./import_file.py
|
||||
071e3a5141d04cb9e6bd31489a835c778608785896b18ea7fa65e8db9f1547e5 ./insert_chunks.py
|
||||
d3d53f44daa7f1526488677b141e90fbf4aa5625369b96a3ca275b802414802f ./integrate_face_asrx.py
|
||||
4cb6a93ef8006cb69e8bdb1bc72899ee9bab1bf7eceaafe9896923bb7023bbd5 ./integrate_rule3_markers.py
|
||||
75aa3e4bffc9f9cb8b9254db19095c93c3efb43d465fb5dcca8c7b9b730f5c59 ./integrated_body_action_decoder.py
|
||||
f4dd2e21fb6b668bdf0c51cc56e214188b46937b96a2b4a10d13783e171d0472 ./language_router.py
|
||||
bef426641645fcf7dcc68c87e3325a6edf3f70925febaf1df84f7c6ff87681e5 ./lip_analyzer.py
|
||||
7f98b0cc8379b3759cc7e805dd56f736cc518093e83f43b2e5ecf559a19b95f0 ./lip_processor_cv.py
|
||||
a1473eeba17fce25e4678234fe4e8793a132514e0566b03b36a0bec04eb93acb ./lip_processor_media.py
|
||||
0df61396756ee22d35356776c189b354458661916c8baf85bcef97c9f8b62ec8 ./lip_processor_mp.py
|
||||
3202aeca29e651ef1a54f47681c6b3b2d0680555fe3c6d318a932bb12b49e58c ./lip_processor_simple.py
|
||||
fed15bafb5e09715cc03962f465b2ff618bf05ebeafdf932643690c9635c9840 ./lip_processor.py
|
||||
1773054e8d563b493865880d0d8bda105e3eb6fb536a25817517237b3bb76afe ./magnifying_glass_analyze.py
|
||||
7d4d048c452bf273f4a6d96da13eb7bab6aa60ca9dd51de5ca0fb0a01e587b13 ./magnifying_glass_extract.py
|
||||
8528bbf89d2770fa5a23f461274038898be251fb6e48c5d3adece5aab3bf976d ./magnifying_glass_owl.py
|
||||
cb645f5e29ee5a36b2f97812039abfdaed7328386bcd25ad7b742af6a6b16399 ./map_speakers_v2.py
|
||||
a90bd3fb729a05010c29a213134c60cc0bdd17769e27a7d3f1250919b7bf1613 ./match_face_identity.py
|
||||
2d864dc831c2fd0142b19b8ad2cda169c2a05facd9662d31861d29bb710c4979 ./match_face_with_pose_filtering.py
|
||||
889d4853707896885ed96ab945d4266acb213f4b122e2ba7c4563eb0e3e9e865 ./match_identities_to_tmdb.py
|
||||
b34ec373bcf65139e08e41967f58a2fc8ebb67a59c361074d3590cd16541415a ./match_speakers_to_chunks.py
|
||||
fe6260a94d01d8b43d0d3b59eb820cfd7b4711c907343a1261c69f9010ae990d ./mediapipe_holistic_processor.py
|
||||
bb36844b4d13bba8edc1b7f0703f02081b62bea795535b8cd8dcbfdb4281f402 ./migrate_asr_to_children.py
|
||||
819312cbfce6e68a0d8d731e02d283946f79de6044f207991ddf9a28ac853d79 ./migrate_face_results.py
|
||||
c418f6e50054fa7eae1d0d879e28997b98f57437acec48b53ecb09f332728867 ./migrate_to_4188.py
|
||||
6f60aa899e06f05e575cb5b461ea517481119cc32644566245d74c96eccde722 ./multi_stage_stamp_search.py
|
||||
b24e2289c00f803c8339f59c34d44ed6c53a3c19dafc13e72c4b260d6bb312a6 ./music_segmentation_processor.py
|
||||
da2546f84d0dbd711c8800ae4e32e59d9c38de9e62e1b423c4518fa1fda1dbea ./natural_language_top10.py
|
||||
78c3d1a9302dbfacdf9b3655dab07348957fd9dbb4af94aae83eefecd5343a33 ./natural_language_vector_detailed.py
|
||||
e924f04d68c9a8211ad373da811aa6671d2c5654281c1634dbf8b1e5e5b51533 ./natural_language_vector_test.py
|
||||
df6ac92367b1afb50c0af958e362d87555fe569f608a8d213e0a593e2a43cde8 ./object_search_agent.py
|
||||
fd39b779a0337f521940f3f7b159931f1f207f200eefd610183781fdcf3dfafd ./object_search.py
|
||||
42d2952fc78b57302b0d12bc3d45790a2c2c46d4ffa3c713a82686134bd63f13 ./ocr_benchmark_runner.py
|
||||
7b3ccb5c4ddd4c62c5ad04d0e3aafaecc2c1441012b6a98613cdcf055e2e50e8 ./ocr_processor_contract_v1.py
|
||||
271023eec42d6be4a1ce6ae2ce3f29e825210a57e6bb37554a6f7fdf54616f9a ./ocr_processor_mps.py
|
||||
2e73c41285e52ef013594fcd4d20df9f5781bfc26bcf62e54dd2c04ec44200c3 ./ocr_processor.py
|
||||
62196108cb3337b5f9a873d70d2981ac8f49152369afbcc8a12b3a13de579e80 ./opencv_stamp_search.py
|
||||
b2e8d552c272fd173c77693e9453a85fe16dfc12f7c2cd304d299c6188c14077 ./paligemma_vs_gdino.py
|
||||
1534d5b7617dbae77f7a37a2c33a89b90f965247a6828f00b73ea6b720f6f4fc ./parent_chunk_5w1h.py
|
||||
5208c738d4b615282813d351daf09872ce516121bb604caa64968ef5e52c53d3 ./pipeline_checklist.py
|
||||
8f80c3a2be5c330e2d1853d9250a171c75db84598dbf3304280c42237ed4fb1f ./pipeline_status.py
|
||||
94db44c0f49115a677d117d4901a1b7991c1517905300eaa495dd62b8ac1c79c ./pose_processor_contract_v1.py
|
||||
167dee5e42c6bd46674bcffcfd92f368fc0b48a1f42c459c806853b281bc6482 ./pose_processor_mps.py
|
||||
a6ef3a785ef5c6dc47fa38dbed80d76bc7d4bf48cbaf0f7edb3d26df98d7262c ./pose_processor.py
|
||||
45e6798dc5900f2f7c8776a2d260c122aae5068a075256b8a5c02e8d0be6c131 ./probe_file.py
|
||||
139a68b5915680ec697d4bb5420adbd20b89637de2c16a15d68aca4fc22da02b ./qa/executor.py
|
||||
4a59b36c29e1ee6e2b169db3b0201d2f7088c6ccbfdf642a3b522aeb182bbeea ./qa/judges/facenet.py
|
||||
0dcea0258ae3309cdec93dc4dd534d1a42511c327d528a117c8e3085f5b30386 ./qa/judges/gdino.py
|
||||
7c9392436477662bc1b49d719f0c78f96e8e7e180fd281d4c59c36fd241a3e6a ./qa/judges/gemma4.py
|
||||
84c6f793538981bdafdc08bb9bd5f12401b442441fae54936f610a758d18e972 ./qa/judges/maskformer.py
|
||||
2f9b5dd3373fdec77a84f117ab620230e208f96d015c960275ab60a0656575b6 ./qa/judges/paligemma.py
|
||||
52dedc276f6f9806710f1ef510aabd88032afe4abad364f5963fd2bd5b6cf14d ./qa/judges/yolo.py
|
||||
c4e4424aad1847d822e9cf7dc98a1b2e903735a61e8ec056c6a9be75f79486bd ./qa/pipeline.py
|
||||
96f5ab509622118db307641082a19daff6b9a36bcc66451c35ed2abee4fe4249 ./qa/query_generator.py
|
||||
00b1716423a184856bbe44d4132fd6d84ca13f3ae018964caa6f3389c1ab98a5 ./qa/scorer.py
|
||||
01c7b3c30c1531224f9605f0ee633285fe8489ab2d0a3c9c6a41f2b2b60d6626 ./quick_stamp_search.py
|
||||
e3143673a2bff6139e05c82446fd8770c4b7e59a854a42c3b29662f5ac75efe2 ./rebuild_parents.py
|
||||
4aa98981632d4f8a11039c510e86aa296ae1cd4b399fc871ed664ac11e445bd9 ./rebuild_story_content.py
|
||||
205cfc47b603b5ab94d97dae8c25486b342b7c2858afe6d6dae27615ca0b2aeb ./redis_publisher.py
|
||||
750f778946b56bc57c47d9d2295332bb0f8cec2c1aa03c6b882d39ef4432673d ./refine_search.py
|
||||
0f8a6a6866a5797e964d3b17e2b7ef146fe7a798f09fcea982fcda6f629b4d06 ./regenerate_parent_5w1h.py
|
||||
3ee192b623f290136b36bd63abd018aad6e6639a9543970c3415734628b33bd6 ./register_sample_faces.py
|
||||
334782f0f66d0ad3818a51adf6343186a2de65467378ab68a81ade806e496af9 ./release_manager.py
|
||||
9a44cdd155953778b52ac0cfb118504c56eb6b1141984365ffbb717e28f3e65b ./release_pack.py
|
||||
3906b48f3a7764d19605def2bf8ef84a54a6afe64c9291a7cc0881a91472a826 ./render_face_heatmap.py
|
||||
44e432c31a35211a37dd26695772b7e250487ac42ba4f16a56f843277c2fabbf ./render_offline_report.py
|
||||
3fac1e6a4125042185a2ce82771f695c562b3137c7aa58a912bada00ad8ecf78 ./rescan_single_frame_traces.py
|
||||
9c3212cb455c2a6230be918448560fee00c153a8956ffd04fcb62974d5e1abff ./resume_framework.py
|
||||
7c95ec08daf4f980bd53233503b7a4fa01afc08660e8fe8cd031ea3613ead8f7 ./save_events_to_db.py
|
||||
24795e1531fe05e33d515104e4fb2f9567b46d802ef1b5a38f11268cf105be76 ./scan_charade_stamps.py
|
||||
cad2da5073577f851c5cb2abdbd7cab05b39caa0d1179ccc89c378a7df2736c8 ./scan_full_video_stamps.py
|
||||
03ae71470331fe5b7f8e394f7f789eee08cad4ed5ec9196b46ab2c9dbefa7fec ./scan_handheld_objects.py
|
||||
d3935ba498786cf260d9d5370ca60d3af7bc4fd438f6be33ce23cfd0b7bab593 ./scan_keyframes_opencv.py
|
||||
12c9b35212f587f5adb37584bf3c3844804d2bc642ebfc5d82b86b44f46d2472 ./scan_keyframes.py
|
||||
f386130ac203308c904ba7efea09ce0ca0d640d36762b113bf0cfedc24d7f885 ./scene_classifier.py
|
||||
482edae04e5467a68c77729760db53d3653e8d7654fa49e5ec9a36f1f8f22616 ./search_blue_stamp.py
|
||||
e3786422932138272d1096ad4c800594e62c9640952a286a9158372a1e5443e3 ./search_envelope.py
|
||||
2df1e259c2e52d10d79b20856cb94ffff5a9bfdbe47cee587b1148b2f1c16101 ./search_objects_in_hands.py
|
||||
9fd49be8ab16f94fd82efc5ae035c029372a7ddeb7fd779b557f1917cdc14592 ./search_vase.py
|
||||
7a6d8e7c435368f6218db972c04a7be16d7d6680d8d4374f82c05b7162716b9d ./select_face_reference_vectors_v2.py
|
||||
2bcf7c1b3c407b51a134a5ee4982713f0ea387cfd6df01ed75554c94603971a6 ./select_face_reference_vectors_v3.py
|
||||
d52098fcf1f9f7ba14f31a9a90bc5b3bc933e1a5e5697e3d09eff389c153cb18 ./select_face_reference_vectors.py
|
||||
a02cb37639275d86ae0b4504d21f50963b45aaf94630c59472ba30d07722e50c ./simple_api_test.py
|
||||
02516ab1616c1756c4f8041f48ff12811cc5d672c53b34850b84ce682fefdff1 ./simple_face_stats.py
|
||||
b024d9bfe244d0d058daae0acd314b9344d6f0912e4f3b02dbc618f9fe3e4949 ./simple_test.py
|
||||
af8703506769f3cdb89ff7849b071c2421307717850596dd86d2fe0b053e7809 ./smart_stamp_v2.py
|
||||
5e5f86d47ea2b75bcaa8662689f73af1963645149c0da688dc43482616aa4e76 ./sound_event_detector.py
|
||||
bab7697e4b4b05e93babc116e0c5b13cbaf1f4d419a65acd5dc1de5bdfc510dc ./speaker_assign.py
|
||||
381ff240ce806ead7d6463ee40c5b830035eb6252180b4b0901b3c8313fa4bbd ./speaker_bind_lip.py
|
||||
5eede29fa0966974c1943792d7fcca2dd9179d4f23570cf1a3964dc97bc9ac1e ./specific_stamp_search.py
|
||||
d5363d832272bdb3c1d6f6d93eee7b7894893b9164a3f5ad5fa08a4a0eaeeb47 ./split_asr_segments.py
|
||||
8e1269f173f2c72de78857c2d83d3111b62ec89bd79f4fb00c3f57390986ae4f ./step3_asr_fine.py
|
||||
7592df8be5dc58376b33960bfa7fc0003c51114b70ebc01f1589f39ee9568d3b ./store_traced_faces.py
|
||||
7ac32c1e2146a19e6654ab3e4bbbfd42e1a6540fb8717d40d55c61e9f5d1bf71 ./story_embed.py
|
||||
74cc24b328a075f48b1f44a465611157f44eadc8f5dabf6d95cd5cc5f80dd9dc ./story_pipeline_full.py
|
||||
97628f0f1270825dabafdf0a69f10ef12c4ffe2be4ac12941315f06bfb084e7c ./story_processor_contract_v1.py
|
||||
1b1f42fc4bbff26551f26f4ac1e8a995dfe3ff98b940a29c9e130410965d0fa0 ./story_processor.py
|
||||
cdbc7ef88551e2b3a3771eac5be5e0360989e71fa009ac28c97e548507e08a5e ./sync_face_speaker_to_chunks.py
|
||||
8b08e9a33f5917aad10e070d6aa48805f5e7c23f905ba8fff3b8697b2109d962 ./sync_to_mongodb.py
|
||||
f64cc6dcb72f54d3e97aa981b40591aef4804ca769e1f14628d901b98bc6aeac ./terminology_manager.py
|
||||
455546b9bb3a2c2c877c7720229b254e75b28eea33b3715d1731c02ca85294ae ./test_api_correct_usage.py
|
||||
b03dc1bbb091672e7da2b131850b17badac896b4fbba92fe9bce76c232c99be4 ./test_api_with_key_id.py
|
||||
7d295c77d5bcd4c72c5673370af48cc89bbccf9292c3b82aad3a230d242547a9 ./test_args.py
|
||||
f474ec88e6634decbf178da497443fa709096b174bb4a4320a07256f516b1044 ./test_asr_large_model.py
|
||||
aa952524dd86f346740ffe555075b74adf2e60bb822bb04a943a51b1fd262445 ./test_birth_uuid.py
|
||||
db87badad7948527325a528400d67a4eeef76abf8d13f5c4254c812e944e4e0c ./test_end_to_end.py
|
||||
e191c98a82f7e089f7dccfc4c536244da2bf14339f982a3afef05d33332c3755 ./test_face_api_final.py
|
||||
1b97c9aae2e1744aa7aefb192eaef86c64e6134efc8f08ffa9a274bff16a58d3 ./test_face_api_with_correct_key.py
|
||||
f7e4078f31b1ca8494c18878219cf2f90c301f19fc851b9e7084657b71a5e150 ./test_face_api.py
|
||||
9eafc49f8fa42b4cd58109e9b725b3aec3b06943ec426919b1788838ccf1ed92 ./test_face_db_fix.py
|
||||
38bce82b167e0c97b257cc6b955fdc2e9ded581ce2d39eb0fd2c60249275394b ./test_face_direct.py
|
||||
24e82bf0af82407e6c04361e9a671770cbfb0b05d92df589bd0d5a0118bb5a98 ./test_face_learning.py
|
||||
8dcdb144c4253fbb466f220359b42c2a9579193865e320a56e682e384c2ae176 ./test_face_recognition_integration.py
|
||||
b921e3256fdea176d4391116d1ead472c4f3ca8aac6999140367818818c35ec3 ./test_face_registration_api.py
|
||||
9af6c6ff0c766b3de92185c3602f2b8b62b815bf88dcb0e3251c2676e61e0a48 ./test_face_tracker.py
|
||||
4f70eadb6a8b80eb8febe32b17b77e58d1a4823cc5d598e5ea45555342d2d4cb ./test_florence2_direct.py
|
||||
0588be0acea540950d737943073f71e769b6301374eaa4ff7fdb96a80145c4e0 ./test_florence2_pipeline.py
|
||||
694c15193616157ddae4bdb0a45feada2a8f8490f01d290a28aa77a4b24eabb2 ./test_florence2_stamps.py
|
||||
2c281f698616a83e9eeccd610555d9f9ab657b2deac65ae9e3dbfba0b450d9b0 ./test_identity_db.py
|
||||
7a73e8314ea7e91ca9dad3867a83b9c1101fdab09bdc0fdac0f798d0a7a204f3 ./test_llm_capabilities.py
|
||||
68300f87b96a474f06a3071a833e6b3ae48d1db5fb8a7e5a3ec1834fd878d808 ./test_multilingual.py
|
||||
c17cdd0f4ffb7a151a634add08d13cc576ba7a848bb20f54fb97d0c1d9d81cc0 ./test_object_search.py
|
||||
d07bd363a2878259fbf4ffcba40e367f7f1bf4171b5a5dfdda97f7a53b450d0e ./test_ollama_feasibility.py
|
||||
8421003b1f66cbd21c6fe5d3aff0a526897753e959b23905ca8f502f644f66a5 ./test_owl_vit_debug.py
|
||||
6f9e8b7947229ea4aa0a62b59bda5fcec05bd74f6c00dc4a7b06d932bd1b730f ./test_owl_vit_stamps.py
|
||||
da91a7c97466ce7f03cde13aa9bf6e691b3e482d2cac74519a2e1a61a2abb05a ./test_parent_chunk_generation.py
|
||||
19d9f2492d3b04b7dafa008f106767d3107dd36b0c8e4601765dca30131027cd ./test_places365_scene.py
|
||||
de44553023067362e8b2223f03e1bff55fcbd2f11ddf3d01060dc02c4675a744 ./test_probe_file.py
|
||||
c0e987ba06a61cc0426ffbca8af1eb51a97bd79acab59b70453cfbb18eaee093 ./test_processor_performance.py
|
||||
7b4b55e23dff35ba107b3da5b0560d03b1b41dfdea1d3a59eac777b4be4d4033 ./test_pyannote_audio.py
|
||||
5cb8b42033ffba41f25e7ef74ef04cf352c0c277a9971e9eaef53fd673902712 ./test_pyannote_multilingual.py
|
||||
8580e689ae148754e03d958419e108241040a012584ba49e8a90db114a9f8c13 ./test_scene_api.py
|
||||
1194d450070b1f42e045d98e532f41205bb3e52fc48ba26e7c9b72a188fe1b2c ./test_segment_count.py
|
||||
147bfffeac9561cfa407207b04a825862ac623ba97deecf5ed7c6257432dc62c ./test_speechbrain.py
|
||||
22e4b865bc769329c1146c2f914395044a9bc84cd2a13acf68fb374a57fe1e3e ./test_v2_detailed.py
|
||||
a616570a2a080b5b19f4bf783877147e714a014103b274143dd37984a946ca08 ./test_v2_model.py
|
||||
7b83611f6b3028500c91c62197f774c0769e299136eca8dc4b612a7b5743e3d6 ./test_v2_with_text.py
|
||||
1dd983c78074a61ceec26d7e3623d40772ca55fd6ee63ba368afe756c66ae091 ./test_with_real_image.py
|
||||
1b738cc0d69d33e967cbb775def0a7f58dc02f1911404af56a5825bd60a5b75b ./text_semantic_analysis.py
|
||||
a4221417ae00add76881c6c715ee4257c263e2dfd0a846a8887738682dfe8cda ./thumbnail_extractor.py
|
||||
0d188a738a0df79ead10065d9f17c366fe159c862bd4bafa2860d0e6ba2640c3 ./tkg_builder.py
|
||||
8b97f0fdfc0899460bf23d420dba0a51a34737c74ebad0519856909d198662bf ./tmdb_cast_fetcher.py
|
||||
4858909a0beaf8397becf4103be17fcc350841217afcdc1d917c48c512a9041b ./tmdb_embed_extractor.py
|
||||
54d8321dfe0f8caa669e4a9d1b48dc772a5b25817eab95b552944140c91f457d ./tmdb_identity_integration.py
|
||||
2a84aa2dcfb83ac385d2c394f884926f306c81798e4277a26dbd1f3c5506be46 ./trace_face_aggregator.py
|
||||
61d3b4b362722ce24326a204f1b72cc7b1dcc20cf3264a4f526d4ea343a8d33d ./transcribe.py
|
||||
ede9a184fd51ef4c87eb3e2541f09b91739a49986cb588591a7c6fbb33433020 ./unified_synonym_processor.py
|
||||
a408f294c3a71eb6a0eea80b9b586f73dedcefe286c62233f713a7428a9979be ./update_all_demographics.py
|
||||
e6520bb10ae6835ceade487ceb5e3fa549ca6f06de35b2c785d649921ef443f4 ./update_fine_speakers.py
|
||||
a2191daff2ad228725b6a66f0e472ec659a6b4fa8f2cbbd74d1bf9c35cca63eb ./update_person_demographics.py
|
||||
60060753cfd2a6d1241e55bf40a0c74f1df15739656d0349e22e8543036b2424 ./update_speaker_assignments.py
|
||||
fdc61009c351263e0018801b32ad90ffd8919af611a2a0580546be7fd62c99c4 ./update_terminology.py
|
||||
0d337c821b36eb7761c0e439b63b8192ff54a542c539d1279efa6854f0b0cdc2 ./utils/body_action_decoder.py
|
||||
3b384a8d88f6147d1953b14bd6b55672f4f161885e29bc241a466d4cfbd50e99 ./utils/face_trace_visualizer.py
|
||||
52a7b79ade15f213841c70416565d3c5e46c145c9a72724ce545143c6e0bdea8 ./utils/face_tracker.py
|
||||
ecd902a4a6f1084d8396af0b4d88079105c84fa6170e3a394720a6452ff3aa3b ./utils/pose_action_decoder.py
|
||||
29dd3e0f802c0347cd9d9465123915b4604c990d7250048b7ae388af03cf5f36 ./utils/pose_analyzer.py
|
||||
bc6184153096e5cd8d89d02fa3279c6587f60a49c6b3366b4d82cee722bbf352 ./utils/pose_transition_analyzer.py
|
||||
d0ec8f4a67c1a1eb1356ad6e9b2f466575691bd336621cdbbfd31dd10159f2dc ./utils/test_mediapipe.py
|
||||
4840c11964a59eabad26b97fe01033ccaf7903e2d24edd5e1035f6dd5fc995ea ./vectorize_4188.py
|
||||
078979114c5f248d2bfd43aa8df55235fa03ab812f26998b984cd485a3d2cda8 ./vectorize_chunk_summaries.py
|
||||
ff98864f1b11795cc3bb64f30ccb6f8609771ddc7a5df2c003ba7c2233d16fc2 ./vectorize_chunks.py
|
||||
5880c128400e6e36c8eb7dffd009dbbc99dd13f8575b0037bdc854e25ddc41fb ./video_comparison_statistics.py
|
||||
0a1501ffdc027236cdf88706b3d61229e2998ab268fd57fb60e399ccb734b6a1 ./vision_agent.py
|
||||
eac8f90fbbb655614abcefc4b887e346bf94db5f015d33d37bc9514fb030489d ./visual_chunk_processor.py
|
||||
c165dfc5fc981dc731b25ef414184ee58e56b73b148d41a32fdce985c701efd5 ./visualize_stamp.py
|
||||
6c65a82fdd1d585e20bee4fcb2d1bdec2e6220bda71d6ef9cd00d6a3cf74c4d7 ./voice_embedding_extractor.py
|
||||
2b3a7b357db4ddd07ca30bf200c6600724e33441d8def0a4d9a39673e2cfb1c0 ./weather_sound_detector.py
|
||||
206b61ebf3c91d7ce3f1488247b52aca6e955042d8aa979c59723e3ff10dd36a ./yolo_benchmark_runner.py
|
||||
e8cb0963c90fbd1c2aa91141f80340edd3c9560d69780dd825d107c6ed14fa64 ./yolo_count_comparison.py
|
||||
dad775ecdca0144bd14b7abaa7ec8fb213e8b9428e39906abce541e93db496b6 ./yolo_processor_contract_v1.py
|
||||
74ff880e664ec514223a4f220b682fbc87089f8c0851c93ac68c97269b8a59b6 ./yolo_processor_mps.py
|
||||
8af0a6db683b6626e07820b302135ac5960d38e3d4b3d187c640b23ce8a14f72 ./yolo_processor.py
|
||||
e13cf22b9aeae96c7e28b4512dd2137743a25eb59027da446966c1aaaaf4ce71 ./zero_shot_combined_test.py
|
||||
f4aaf017ff588999f06cd9ba1787517e06c6d6e6228a15a54d8aa4f54fde5eb3 ./zero_shot_gun_test.py
|
||||
0a285b8ec33d7999e9d4ae8d43ce768c9f06ee1929e13a6809e98bdabe6357ce ./zero_shot_objects_test.py
|
||||
5711c6d18acba76511a3f088d4d0f095b47c978a6c6ae3e086e2b7cbee7b9e55 ./backup_all.sh
|
||||
c8860e3d55b99745265998abaae63efe28c83d7c1bfd91b30dfba54d146793d3 ./check_config.sh
|
||||
6321793085bfb33b751b2848dddc41f13d9ead9763f6e581f9dcfceea9090f8b ./demo_identity_full_cycle.sh
|
||||
77382d8671059ff99fd5ca3db42590de47ecf4e1555eea950bd3a7016b1547b0 ./deploy_package.sh
|
||||
09bda12152917b969259ff3ca0bcda63f615bdf4873dbb8bb7f7ce5eec742a9f ./final_validation.sh
|
||||
491e609bb43526b0c41d3dd060a3813bbeb3defc70fc88fe36f9fbbd2280e720 ./install_mongodb.sh
|
||||
09e21960f0d7fdd00ff1d30334b753a8216ad17fc3644c9dbb129b4446ecc12c ./package_delivery.sh
|
||||
0c2fe9288f9b51ad34aadf87093c1e1a423483ad7a972861ace811250e30204c ./package_file.sh
|
||||
c233bb7b854dfd68e75808640fdea379af6952095a93cc8884d7e8b7ecbb4539 ./package_release.sh
|
||||
02e85ba83e8d3da68bf9320ff25506714ce460736b8824309027a5ec375ea86b ./package_system.sh
|
||||
7557f1999bde53ef397b78208713e8df8817171dfbc053d6bed130b57a229517 ./release_preflight_check.sh
|
||||
091087dad7f38e8a0d98458b64fdeb0ac5770534f7dfebdbdf3b80d945ff39df ./security_check.sh
|
||||
25711049adabfd179d4e19c2a4c4bd675ed9da8e8913ed1bdaac7519f6cde7ac ./setup_fresh_mac.sh
|
||||
f6dae232edd5d2d111468be125609feb0dbd8db1895846f3d1c48f0e411e3a16 ./setup/01_postgresql.sh
|
||||
8a405e2372ddb5958f7bfac15d330a2f189ffe2583ae37bba4c953ac45412c80 ./setup/check_momentry.sh
|
||||
72dc22172a201a060a20f21b89af38c80ecb6399f594ecca81cafa8a918c764f ./setup/install_momentry.sh
|
||||
5eccd14e8e4b3c91159b17756892dd03a7d26cb7bbc1961d783188ed10411770 ./setup/upgrade_momentry.sh
|
||||
e48ab4673f71370dc7d4ce5c32d159bf9438e9e1dd7c9edd9c6053156fbdaa99 ./start_momentry.sh
|
||||
ffe7e91a24fbfa826eb816f66cdb315097fe841a7b67a476865aec1ad7a4dda0 ./swift_processors/.build/checkouts/swift-argument-parser/Scripts/environment.sh
|
||||
b2ee4f8a445a7e83f7b99ae5d4139fd525d9e3e58a360bfef054d441aa21d901 ./swift_processors/.build/checkouts/swift-argument-parser/Scripts/format.sh
|
||||
9461213a77531fb3a5742fda0c9024304abe47988bb33852da55e96ae01a382a ./test_api_validation.sh
|
||||
7cb98fb67007abe03bb57ef58a5e7499ae389693b33a14e015c9ef6061d6b0f5 ./test_face_recognition.sh
|
||||
46bf67f794dbcd2c191f1933f1c05a1eef0ad3f5bb2e1d64e11e5f23a44ffc10 ./test_identity_agent.sh
|
||||
7763d5bfbd83ede94e31eb8e44dd0d422fe2d1221b9e112d73fc637f29fdb7ea ./test_multilingual.sh
|
||||
8a730fedf9252b7ed352b8447773c9c256f064fd64ca20efa05f9021766b09e5 ./test_search_modes_v2.sh
|
||||
fbca5ba0783153c4e21c174b0cbf75b582514f6ef0f92750a82d3178bc170f48 ./test_search_modes.sh
|
||||
f8c1647cdb4db8adef1829e41fbecd97f6b3b2e62927f195cd8e68127876069d ./troubleshoot.sh
|
||||
992296b5218f3ef97ce53325be12f71848f3c3aeb3ee81d764bfe4bd61e1de05 ./verify_package.sh
|
||||
b6f95fa070cc0258bc5d005f10d13025ba8b08d3ee1598bcdad405ff1d3332ed ./tmdb_agent.py
|
||||
2bfe6a1c1263f35916d4a28981814515fc40cb473f7bbc801f84842904c888f6 add_yolo_to_chunks.py
|
||||
f61f7126698018b346c8bafc45501708c17e3b45d9db54be5f0109afeee63176 age_benchmark.py
|
||||
8efb13239db2a25a728abbdebd92affe685b69402a277cceb0d76e62ed9451ac analyze_asr_lip.py
|
||||
432b3e3b30578e71ef973aca758bd1964102cbbb19530620df8ac02df00eefb8 analyze_video_faces.py
|
||||
732609ef1882e14dc7ed60488697f6ae7e2607ec90b240a86ea9e585f052b9be apply_asr_corrections.py
|
||||
790bd25424e93ca5a0743ea1a740a9a70f6ae6f8a9ca411012eb1e9b03907eb4 asr_benchmark_runner.py
|
||||
18744dc3bebdce0d89ea7076b5e43febd35ad3c84064bb52adde4d128d50bc9f asr_face_stats.py
|
||||
1577d055328a73561f9ccfaf0c54727532e3dddcd1bf0f33e3c38081415cced8 asr_model_benchmark.py
|
||||
fcbb81639f53e9e08bee436853c84d918c0eeac09d985b34634d5ddc00055b61 asr_processor_base.py
|
||||
25948a204e45ce844d43606b7e45c9532321d48df44887d261fc886748276b10 asr_processor_contract_v1.py
|
||||
e9209cf028a11bdc45514124826374e58458ee06b054cfedffe8013d751735ea asr_processor_contract_v2.py
|
||||
407dd0ec772027e0df27af0b66ea8130cb390595ccdeca4350e7bdc210acee6c asr_processor_debug.py
|
||||
dcee1b80071b47c974bcffe3d27ec2f2269f4b8de7e7409ceaec7e6f271d31aa asr_processor_legacy_v2.py
|
||||
10728a05a6ff2d56a70bb831abb51e05b03309e45bc5fa068c5a0702a4c73769 asr_processor_legacy.py
|
||||
9106bfe07de9cfc920f4f4d2f821dc024df612f4c2a8f5f75d35f012d26440f0 asr_processor_simplified.py
|
||||
7eabdcf7320302ee65c67e801f3ac7ca5801abc76165faa182348d30a8113e9f asr_processor_small_multilingual.py
|
||||
2714f7be88f286635ea8465daf8fa969e6b27d2b2d1f73ac5e98f5e496139cad asr_processor_small.py
|
||||
1089ff10b9b0a9f528cac79580aec25e33f8eeea485ac44b6aaf8c7c0cab5b42 asr_processor_v2.py
|
||||
b9e826f23f080ae67f5961ad750ec2a6834cd18335955c3b3175b8cd06ebd6d3 asr_processor.py
|
||||
5431b57d4369a841d51a6d6c5e1fb5e6c2932cb97cb4601f5e1b41ffe9f7ecaf asr_side_by_side_comparison.py
|
||||
6c11efc3d40e559bfbeadcbf4f51eb353b744cc4f765bd8abc472a701e3f33cb asrx_processor_contract_v1.py
|
||||
93501463af84d6541405057da3783d40492aec5e536b4210dcaffe460cdb5503 asrx_processor_custom.py
|
||||
6adfbee842d134b9d180e2d1104694ed5cdc1fa4febcd0c502801b8f87b3ce66 asrx_processor_simplified.py
|
||||
60fc3465f9c461583f8d0b888e85b3a6e04e1f252a1e1c21d036b52e1ce4b43c asrx_processor_v2_noalign.py
|
||||
82d65b71bd86874e484870c40214d3fbd9343c39d5d635896fb4d257d13a410f asrx_processor_v2_transcribe.py
|
||||
5a0c9905a2e10c847aa74f108e4054de4704bbafb2004589db15bf33833ea3c7 asrx_processor_v2.py
|
||||
b16b00cf9e5de96abc512022af9bb81196405b10988f5a39dfd3a9b6471f1155 asrx_processor.py
|
||||
f11b67ada6167540d2f95cb2af93d0e3a0de55bce659745baa37c4aa4805212e audio_taxonomy_processor_v2.py
|
||||
ded810b81cda24e31e82de14ba9846770ee2b18d84d52b9d570de5877e9e2513 audio_taxonomy_processor.py
|
||||
f7c53be5a031a8bff15c3165543586529932d81c4312521654d132b1f0ed6bc3 auto_identify_persons.py
|
||||
5497a6f1f7ae267c796a398a9f020ea485aa45f980f2eca932b904ad61ce9b40 backfill_demographics.py
|
||||
39a479ca4f8986f3255b0bcd0d9162a1f2ae339bb4dcf081f931ff9b304797a1 backfill_frame_data.py
|
||||
77a98d9b7cb97eceae4c0fcf2c353933e0fb36ee7406b57d59b1e216b1a44601 build_docs.py
|
||||
308c8e3f3d45ee273504f9f415eaf6c025f06aaf1cca33156a66431ed6e64f43 build_semantic_index_poc.py
|
||||
4eb37768edd252d94f0d751f219c317e905bc093f414b2a6350efb8294131138 build_semantic_index.py
|
||||
debbd058957d09c2397f3f4c028edaa0a658002921dcca95eae2a20070ba95fb caption_processor_contract_v1.py
|
||||
7236cdb5deaeada266cc246ee11380248bb9f2255888c25a152b2f6ab1f981cc caption_processor.py
|
||||
e73cbb688dade5c5b6fc4276f0c78b377903ff83f3830b63d8bcdacd8da8aecf check_all_stamps.py
|
||||
7ecdbd4b1f94be8ebab9935ea210a868330e7030b6e19c73229c579c1189fd5c check_architecture_all.py
|
||||
7179ed1a87241904af29542f9018398f8afd9b9dd89af7bb11909310ab7b49e0 check_architecture_docs.py
|
||||
7e6bd7d14582e494baf8b28354bbded3f79b43f0bd271ab33874da55b9086311 check_code_document_consistency.py
|
||||
5ffca7c55edafad755e84499981553fcb48ce6056ca7b04130acafb9e6a9b1c3 check_frame_112_36.py
|
||||
f49c7b0cfa53b657f69b2ad97a6e18393741cc2151b32c9d7dde2e078b75953f check_frame_91_59.py
|
||||
d2cb7475262ee711a4b06e53559f0927242be4a924a56e7fe212225f318f4193 chinese_vector_test.py
|
||||
ecde3d3df773916f62de4e34f8d8693feaedf112a3ef9955e22417c8421722bd chunk_statistics.py
|
||||
2588ecf27c13020d894e46ba70a76de89f09556b475f555dae59db36da0b90a0 clean_sentence_text.py
|
||||
98ab1129032f42fddc020f9b3492d1fc133851d1af33ddeb57e2385d88425af4 clip_logo_integration.py
|
||||
bf6f74c09b8f8c7f25c5fffb9c36f16a8afb483a7b65903cfc75e2ea641bdf49 compare_asr_content.py
|
||||
1f2caadcded724aa04a929018a35ace53dd79d172f5ee2720308fbd4581b0c6c compare_asr_models.py
|
||||
1ed8a9530f40e304b556ff76c7cac40468c86a0cd32ff2a8bc7bf2a69669121d compare_models_gun_test.py
|
||||
6bf790fe75a7a2a5220052ca14c31e90a97eabc4558cd5e9059280913862a81e compare_search.py
|
||||
875e7a598982c8ad7222a51b7b147e91cd5e1a930f41214b3942107cb932fc5c compare_segmentation.py
|
||||
e432b6f2364d5a9aaf207a1de0dca3fb14ab8d118c53ee34306abfe6fd211ba8 comprehensive_search_test.py
|
||||
43df85cf860ac28e083de35b511bb2a7b91ed48f596757f52f19487768987500 coreml_embed_server.py
|
||||
9149ccc8de5adfec69c6f3f2ec502ae7d5e7844518a228ba587af2e08cb38805 crop_opencv_stamp.py
|
||||
fc36ecbb1455d959456945266e193b601a29c4210b4938a3f0d4a9aaf44b5cee crop_real_stamps.py
|
||||
34a694624ce94d916b06a847bc4d41e7665985b85e55a626a4bc3a4370c21acf crop_stamp_112_36.py
|
||||
27099dc9c8ee52a6949ce18c505089afef1720fe70858b90d0801972c3b43fff crop_stamp_closeup.py
|
||||
01b5a3b091ebcffc0c1e2637b7af8192ba597239fa80d152738e3b8cfdf8174d crop_stamp.py
|
||||
71b2a362b5395c6e4d70e62766820db92d94eaf140d98eecb2880bcd98d55be9 crop_top_candidates.py
|
||||
60f18c5fa03ffbc80c209337cd1c8b6acd0b8471e600119340aa8cdfeef14f5b cut_benchmark_runner.py
|
||||
deba86a1645ca5b1acf413dd9edfad77b93ff213897d739a32de1ba629bfce52 cut_processor_contract_v1.py
|
||||
01024f947f0326c124293a30e4f2cdb859f21cfb2d4c07f9c1030e2934f7bc44 cut_processor.py
|
||||
ff092ad2373b57321f87d1dd123fff8a99c8207057591e8526e56cb1424d47c6 dashboard.py
|
||||
f184bf3e546db0253ffb71895e8d42aeb06588c71c4914c2fe656f42ef463c9a debug_face_registration.py
|
||||
a9acce1ebd6ea821a8dc5009b8fc40586a98d31c23e93c97fd844bdadbda4ed2 deep_analysis_112_36.py
|
||||
7767ee7455a956d14d286ad558c4c312c2ad3ccee1c73adc1bc8f761c96ad72a demo_dashboard.py
|
||||
425290c12161c5cfcb0c505a737ba3951656b39e425e792919d4812e15b9b8e3 demo_face_learning.py
|
||||
d7e3e27e6a65b1fa62530ee954c227dbb4f97593c5a5dcc48b39e5ebae4656e5 dense_scan_traces.py
|
||||
df79b7fc7a03a8e754de5123a23bb33b1d5c23d832adc1886fb846ca517dd24d detect_language.py
|
||||
f6f8047e24ebbec81ef27dd38f4242e63385f8ebe5be471cae156b8aa5fc4477 detect_objects_keyframes.py
|
||||
e61d2ef5043bda3674a0050d83ba3bc6a70c47f54e456124a736b4328f0c0638 detect_stamp_shapes.py
|
||||
f23a382113e9c7de2ec3b24e95160daef48f9336ae6d4ec9ee7a18f4bf529f6d download_places365_classes.py
|
||||
a747e5e17960b972549714786bb9e28ea578e10e6c80788e298a0149c970bcc5 embed_faces.py
|
||||
f1a2b3820e1a763eba6d8d905a5bb87f5a9b4a2f005e709e313bb7505ba7ddaa embeddinggemma_server.py
|
||||
43c540c02c1be992e7d44ab4fc76a759815db3ed5f25bcbb594328b50ed7c73b export_file_package.py
|
||||
19d23e4604d5532928412afe4d5d39ff49194ab4a046825286ae1be154326a1f export_file.py
|
||||
5f10bab1dcb0b5fad233a74069f9e2f89043e7c848c9c38ae7e2806e6940c75d export_identities.py
|
||||
2a1d0a1b853fd2c28f9a404871d33912f93521358576833be0999271bae02bcb export_person_thumbnails.py
|
||||
a81bf1d6af78c052e638f5d5677b4edb512d0de5441025d86fd970d3e7993922 export_sqlite.py
|
||||
2fe8c0131dde21382cae1483825d489fd467c2491a0cb91d5c1881df2e402e9f extract_face_embedding.py
|
||||
8b5cc0ff437fb4dd0df28b7b20a78469cdca3621e2eeb4b6d46ad2391acb0596 extract_female_faces.py
|
||||
bdecbaf0496bf536dce2ef4897f7090749820d15dcca03492d4d736ab0f8c6c5 face_benchmark_runner.py
|
||||
22319a38bd684fb235fec681ddc60f45821e4bb2181f2b31fdf945f7ad9a1b85 face_clustering_processor.py
|
||||
5adce4e444743331fa592e13d71e52f26554eadb9744d350a7654a449a8fb8a3 face_count_comparison.py
|
||||
3574454c74eaf11021f9052f77d93044cca4ae0285d0f2630b4016c2ec0df783 face_cross_validate.py
|
||||
4f09b3b66b14a5eefb14fcf915a1ad1e9147010f6ae7671731566679b1cae461 face_embedding_extractor.py
|
||||
d05c65221cbe787e4e29a4de1966edb9e89fed47e9e89c9d065e1d5cb46cf178 face_landmark_qc.py
|
||||
28776dfcc6ac40e9481c25467438745fed60fecdfd4fc19f9f4c7396397591a7 face_mediapipe_test.py
|
||||
f4d1b4334a49357b74b80e390ad5a3d16263e51cbe5cab661af92bd2e9721f02 face_processor_contract_v1.py
|
||||
802015c73dfce0866f2a0bc94c645aa35ba30a6de78244af23090bb1f1828c6e face_processor_mps.py
|
||||
96ffdbde3f4d87e9942f9e1f4c93cbd999dc404b43e00d4cdcbb22de3c0f16b7 face_processor_optimized.py
|
||||
4c3915a7465f524e706940c9813614ec4920cd6f8647602ef32e88fdbbaf8fc0 face_processor_v1.py
|
||||
d6ddad29a5e53b43b887554072d7965f0535e47fb62dad1a8b87e44fa1be6015 face_processor.py
|
||||
8edab61189ad1a8fa60c203077e814e82d46c5bae67054fa2ab1958e199c05f9 face_recognition_processor.py
|
||||
9ea19f357b3fcec6c8b3875c538e53cb46e407ab188cd544963e0123e535fa03 face_registration.py
|
||||
72648816de611fd9b84d2b98c177b8b4f24374024b69184e8151c06cf44d633b face_statistics_report.py
|
||||
499f197a06f50839ebd5350af380fa56506ce08f073ba40c0e863b8e02b34133 fast_face_clustering_processor.py
|
||||
0191781635b98d0675969fb87733af19525d7b5c148723346c5378c08a00fe33 fast_stamp_search.py
|
||||
00e7e8ed06f6a0f2c46c84a47d7e7f5d366acee941d546a52c4b1b7885c71e08 filter_stamp_colors.py
|
||||
5341fd648cffafc77568070313b06417636943d50ff3b4380a61381260acaafa final_face_validation.py
|
||||
213793ab719f4ef42ec9b22f351dd86d4739211c17be486a46b76ba7e64fd8f1 find_blue_stamp_opencv.py
|
||||
e1490317c0f56b895f73cfbb6f57c8e3ea5c65304bfdd7663f103f6b564e148c find_kids_pose.py
|
||||
08d4cba0650f6a22fc134d07fd15fe8784c8472c3ba687b587e31e0b980e2b1c find_kids_refined.py
|
||||
aecec0784ce5d0e98176c15798f05d4f67ab6a686f9ffafba71fbd82157027f8 find_magnifying_glass.py
|
||||
620db08dd84f00af0c6d744dac54c68360548dd5b2cc26b12ddcefd936239b2e find_pink_stamp.py
|
||||
1f4555b3578f4dc6bc08aa37e34eda1d91ea25d8134439771678d1a57bfdaeb9 find_realistic_stamp_opencv.py
|
||||
277aa3b48eec2e739de3bb95ef501ffbd24104aa2a1bdef28c844ef44fd75013 find_small_stamp_opencv.py
|
||||
fc73bbc9605938db495bd33ea74955e454e9384130531a16d42f25dbd9b515d8 find_stamp_in_hands.py
|
||||
c6ed0f12e78c12df977ddca5d699f58edb174b47199f584e7a24dbdc3b7d02b1 find_stamp_in_magnifier_scene.py
|
||||
ecf12e346619c27a985452e9f84ee262c2da25de9df0ff6e0b293279ccba559b find_stamp_opencv.py
|
||||
4ff93cbcc781a5cff023f78006f1aebbe2d954405ae7d00a473fef6b41b2ebee fix_asr_text.py
|
||||
4090cb892115843a909aa41426c0f39c5a53d8d88a5db69499ec8bafcb780d77 florence2_scan_stamps.py
|
||||
e90e4447db3328b64a2062ca13ed41f6a045220d8fb640542dff5b790d3c4d3b gdino_comparison_test.py
|
||||
7071a9999057c347e2275381f1f0c58e19aa8581d70a572d3170ed14a295a48d gdino_frame_api.py
|
||||
891410310b415ff68a0f7ee0aa39e84eef7f2c75887487bdb88b8f4718d40e94 generate_asr1.py
|
||||
24efe7db016387b40bd9caae449f0445a3d47eb878c00399803bb6e78e6dd5fc generate_benchmark_summary.py
|
||||
dc956a78a3ed26686f45dd6d6d9cb42c023751fcd9b8789585450b6df63670a1 generate_chunk_summaries.py
|
||||
8a0922d75fdc7c5994ebfb31881d765db4b105cbcddfcaa4b4c49d11950b8df4 generate_chunk_visual_stats.py
|
||||
4860bfd00cc6c1c842c2f8e17e725eebca191d81067af3cb5a28661b45d74bd3 generate_parent_chunks_gemma4.py
|
||||
e9fca223a8329ff6bdcb8552fecedb2d8b4607c6516c373c3023f29edfd42e06 generate_sentence_summaries.py
|
||||
cbae7c3e85457274e8c284005196c39dc97f9d9200ed6b0e4ea266e48a381d3a generate_synonyms_llamacpp.py
|
||||
57512cd7a5ec2f52813717fd3d81dec1aaa69dc9c91a9edbca847e7012b1c86f generate_synonyms_ollama.py
|
||||
dc495cb8127858fa03a5f8b8bb4a772c5934ada1abecf97459bf71de80417672 gun_detector_scan.py
|
||||
1a7cfb72723b3b94e3f4fe368477ba693ac3d20ac7af7351962bc548c700b451 head_shoulder_bench.py
|
||||
b2fe8e4d8d7d1057ba928fc5e190f4a06cb60e83e2a02c5d7c423791596c11b8 head_shoulder_quick.py
|
||||
ba5e67a97cb465e6a1a942c2f7342406031759ffcea2b897ae963bee4bc551c4 hybrid_stamp_search.py
|
||||
f5847b6c8ed4c7c51290df9032d5a192317b5f03b5ff418ead1181a6e1b655f2 identity_agent.py
|
||||
12237fa6cc5f0d2dcdd05f26fd50c0a7bfd541d1c922a1640d131fa0c4d6f4fc identity_bind.py
|
||||
046aa90eb4a4b830910912362a9865d1e6170f5bc176fae42be630f967f9d3ff import_file_package.py
|
||||
7cc260d4411ab13559803686f8b645afa07738d652d9459830aecac268597fa7 import_file.py
|
||||
071e3a5141d04cb9e6bd31489a835c778608785896b18ea7fa65e8db9f1547e5 insert_chunks.py
|
||||
d3d53f44daa7f1526488677b141e90fbf4aa5625369b96a3ca275b802414802f integrate_face_asrx.py
|
||||
4cb6a93ef8006cb69e8bdb1bc72899ee9bab1bf7eceaafe9896923bb7023bbd5 integrate_rule3_markers.py
|
||||
75aa3e4bffc9f9cb8b9254db19095c93c3efb43d465fb5dcca8c7b9b730f5c59 integrated_body_action_decoder.py
|
||||
f4dd2e21fb6b668bdf0c51cc56e214188b46937b96a2b4a10d13783e171d0472 language_router.py
|
||||
bef426641645fcf7dcc68c87e3325a6edf3f70925febaf1df84f7c6ff87681e5 lip_analyzer.py
|
||||
7f98b0cc8379b3759cc7e805dd56f736cc518093e83f43b2e5ecf559a19b95f0 lip_processor_cv.py
|
||||
a1473eeba17fce25e4678234fe4e8793a132514e0566b03b36a0bec04eb93acb lip_processor_media.py
|
||||
0df61396756ee22d35356776c189b354458661916c8baf85bcef97c9f8b62ec8 lip_processor_mp.py
|
||||
3202aeca29e651ef1a54f47681c6b3b2d0680555fe3c6d318a932bb12b49e58c lip_processor_simple.py
|
||||
fed15bafb5e09715cc03962f465b2ff618bf05ebeafdf932643690c9635c9840 lip_processor.py
|
||||
b9532949bd145c0411876bdf3a8cbf1540b4233f7585465ce6389928e1bfd908 llm_metadata_enhancer.py
|
||||
1773054e8d563b493865880d0d8bda105e3eb6fb536a25817517237b3bb76afe magnifying_glass_analyze.py
|
||||
7d4d048c452bf273f4a6d96da13eb7bab6aa60ca9dd51de5ca0fb0a01e587b13 magnifying_glass_extract.py
|
||||
8528bbf89d2770fa5a23f461274038898be251fb6e48c5d3adece5aab3bf976d magnifying_glass_owl.py
|
||||
cb645f5e29ee5a36b2f97812039abfdaed7328386bcd25ad7b742af6a6b16399 map_speakers_v2.py
|
||||
a90bd3fb729a05010c29a213134c60cc0bdd17769e27a7d3f1250919b7bf1613 match_face_identity.py
|
||||
2d864dc831c2fd0142b19b8ad2cda169c2a05facd9662d31861d29bb710c4979 match_face_with_pose_filtering.py
|
||||
889d4853707896885ed96ab945d4266acb213f4b122e2ba7c4563eb0e3e9e865 match_identities_to_tmdb.py
|
||||
b34ec373bcf65139e08e41967f58a2fc8ebb67a59c361074d3590cd16541415a match_speakers_to_chunks.py
|
||||
fe6260a94d01d8b43d0d3b59eb820cfd7b4711c907343a1261c69f9010ae990d mediapipe_holistic_processor.py
|
||||
bb36844b4d13bba8edc1b7f0703f02081b62bea795535b8cd8dcbfdb4281f402 migrate_asr_to_children.py
|
||||
819312cbfce6e68a0d8d731e02d283946f79de6044f207991ddf9a28ac853d79 migrate_face_results.py
|
||||
c3d062aab67b5177ac7bf2c3ad2f0e578e12c9893e377f68339a17cc2783316c migrate_identity_files.py
|
||||
c418f6e50054fa7eae1d0d879e28997b98f57437acec48b53ecb09f332728867 migrate_to_4188.py
|
||||
6f60aa899e06f05e575cb5b461ea517481119cc32644566245d74c96eccde722 multi_stage_stamp_search.py
|
||||
b24e2289c00f803c8339f59c34d44ed6c53a3c19dafc13e72c4b260d6bb312a6 music_segmentation_processor.py
|
||||
da2546f84d0dbd711c8800ae4e32e59d9c38de9e62e1b423c4518fa1fda1dbea natural_language_top10.py
|
||||
78c3d1a9302dbfacdf9b3655dab07348957fd9dbb4af94aae83eefecd5343a33 natural_language_vector_detailed.py
|
||||
e924f04d68c9a8211ad373da811aa6671d2c5654281c1634dbf8b1e5e5b51533 natural_language_vector_test.py
|
||||
df6ac92367b1afb50c0af958e362d87555fe569f608a8d213e0a593e2a43cde8 object_search_agent.py
|
||||
fd39b779a0337f521940f3f7b159931f1f207f200eefd610183781fdcf3dfafd object_search.py
|
||||
42d2952fc78b57302b0d12bc3d45790a2c2c46d4ffa3c713a82686134bd63f13 ocr_benchmark_runner.py
|
||||
7b3ccb5c4ddd4c62c5ad04d0e3aafaecc2c1441012b6a98613cdcf055e2e50e8 ocr_processor_contract_v1.py
|
||||
271023eec42d6be4a1ce6ae2ce3f29e825210a57e6bb37554a6f7fdf54616f9a ocr_processor_mps.py
|
||||
2e73c41285e52ef013594fcd4d20df9f5781bfc26bcf62e54dd2c04ec44200c3 ocr_processor.py
|
||||
62196108cb3337b5f9a873d70d2981ac8f49152369afbcc8a12b3a13de579e80 opencv_stamp_search.py
|
||||
b2e8d552c272fd173c77693e9453a85fe16dfc12f7c2cd304d299c6188c14077 paligemma_vs_gdino.py
|
||||
1534d5b7617dbae77f7a37a2c33a89b90f965247a6828f00b73ea6b720f6f4fc parent_chunk_5w1h.py
|
||||
5208c738d4b615282813d351daf09872ce516121bb604caa64968ef5e52c53d3 pipeline_checklist.py
|
||||
8f80c3a2be5c330e2d1853d9250a171c75db84598dbf3304280c42237ed4fb1f pipeline_status.py
|
||||
94db44c0f49115a677d117d4901a1b7991c1517905300eaa495dd62b8ac1c79c pose_processor_contract_v1.py
|
||||
167dee5e42c6bd46674bcffcfd92f368fc0b48a1f42c459c806853b281bc6482 pose_processor_mps.py
|
||||
a6ef3a785ef5c6dc47fa38dbed80d76bc7d4bf48cbaf0f7edb3d26df98d7262c pose_processor.py
|
||||
45e6798dc5900f2f7c8776a2d260c122aae5068a075256b8a5c02e8d0be6c131 probe_file.py
|
||||
01c7b3c30c1531224f9605f0ee633285fe8489ab2d0a3c9c6a41f2b2b60d6626 quick_stamp_search.py
|
||||
e3143673a2bff6139e05c82446fd8770c4b7e59a854a42c3b29662f5ac75efe2 rebuild_parents.py
|
||||
4aa98981632d4f8a11039c510e86aa296ae1cd4b399fc871ed664ac11e445bd9 rebuild_story_content.py
|
||||
090137a5872edfed1b89c97b537d13ad8aafda9a705ebb4c54f30352503e5e3a redis_publisher.py
|
||||
750f778946b56bc57c47d9d2295332bb0f8cec2c1aa03c6b882d39ef4432673d refine_search.py
|
||||
0f8a6a6866a5797e964d3b17e2b7ef146fe7a798f09fcea982fcda6f629b4d06 regenerate_parent_5w1h.py
|
||||
3ee192b623f290136b36bd63abd018aad6e6639a9543970c3415734628b33bd6 register_sample_faces.py
|
||||
334782f0f66d0ad3818a51adf6343186a2de65467378ab68a81ade806e496af9 release_manager.py
|
||||
9a44cdd155953778b52ac0cfb118504c56eb6b1141984365ffbb717e28f3e65b release_pack.py
|
||||
3906b48f3a7764d19605def2bf8ef84a54a6afe64c9291a7cc0881a91472a826 render_face_heatmap.py
|
||||
44e432c31a35211a37dd26695772b7e250487ac42ba4f16a56f843277c2fabbf render_offline_report.py
|
||||
3fac1e6a4125042185a2ce82771f695c562b3137c7aa58a912bada00ad8ecf78 rescan_single_frame_traces.py
|
||||
9c3212cb455c2a6230be918448560fee00c153a8956ffd04fcb62974d5e1abff resume_framework.py
|
||||
7c95ec08daf4f980bd53233503b7a4fa01afc08660e8fe8cd031ea3613ead8f7 save_events_to_db.py
|
||||
24795e1531fe05e33d515104e4fb2f9567b46d802ef1b5a38f11268cf105be76 scan_charade_stamps.py
|
||||
cad2da5073577f851c5cb2abdbd7cab05b39caa0d1179ccc89c378a7df2736c8 scan_full_video_stamps.py
|
||||
03ae71470331fe5b7f8e394f7f789eee08cad4ed5ec9196b46ab2c9dbefa7fec scan_handheld_objects.py
|
||||
d3935ba498786cf260d9d5370ca60d3af7bc4fd438f6be33ce23cfd0b7bab593 scan_keyframes_opencv.py
|
||||
12c9b35212f587f5adb37584bf3c3844804d2bc642ebfc5d82b86b44f46d2472 scan_keyframes.py
|
||||
f386130ac203308c904ba7efea09ce0ca0d640d36762b113bf0cfedc24d7f885 scene_classifier.py
|
||||
482edae04e5467a68c77729760db53d3653e8d7654fa49e5ec9a36f1f8f22616 search_blue_stamp.py
|
||||
e3786422932138272d1096ad4c800594e62c9640952a286a9158372a1e5443e3 search_envelope.py
|
||||
2df1e259c2e52d10d79b20856cb94ffff5a9bfdbe47cee587b1148b2f1c16101 search_objects_in_hands.py
|
||||
9fd49be8ab16f94fd82efc5ae035c029372a7ddeb7fd779b557f1917cdc14592 search_vase.py
|
||||
7a6d8e7c435368f6218db972c04a7be16d7d6680d8d4374f82c05b7162716b9d select_face_reference_vectors_v2.py
|
||||
2bcf7c1b3c407b51a134a5ee4982713f0ea387cfd6df01ed75554c94603971a6 select_face_reference_vectors_v3.py
|
||||
d52098fcf1f9f7ba14f31a9a90bc5b3bc933e1a5e5697e3d09eff389c153cb18 select_face_reference_vectors.py
|
||||
a02cb37639275d86ae0b4504d21f50963b45aaf94630c59472ba30d07722e50c simple_api_test.py
|
||||
02516ab1616c1756c4f8041f48ff12811cc5d672c53b34850b84ce682fefdff1 simple_face_stats.py
|
||||
b024d9bfe244d0d058daae0acd314b9344d6f0912e4f3b02dbc618f9fe3e4949 simple_test.py
|
||||
af8703506769f3cdb89ff7849b071c2421307717850596dd86d2fe0b053e7809 smart_stamp_v2.py
|
||||
5e5f86d47ea2b75bcaa8662689f73af1963645149c0da688dc43482616aa4e76 sound_event_detector.py
|
||||
bab7697e4b4b05e93babc116e0c5b13cbaf1f4d419a65acd5dc1de5bdfc510dc speaker_assign.py
|
||||
381ff240ce806ead7d6463ee40c5b830035eb6252180b4b0901b3c8313fa4bbd speaker_bind_lip.py
|
||||
5eede29fa0966974c1943792d7fcca2dd9179d4f23570cf1a3964dc97bc9ac1e specific_stamp_search.py
|
||||
d5363d832272bdb3c1d6f6d93eee7b7894893b9164a3f5ad5fa08a4a0eaeeb47 split_asr_segments.py
|
||||
8e1269f173f2c72de78857c2d83d3111b62ec89bd79f4fb00c3f57390986ae4f step3_asr_fine.py
|
||||
7592df8be5dc58376b33960bfa7fc0003c51114b70ebc01f1589f39ee9568d3b store_traced_faces.py
|
||||
7ac32c1e2146a19e6654ab3e4bbbfd42e1a6540fb8717d40d55c61e9f5d1bf71 story_embed.py
|
||||
74cc24b328a075f48b1f44a465611157f44eadc8f5dabf6d95cd5cc5f80dd9dc story_pipeline_full.py
|
||||
97628f0f1270825dabafdf0a69f10ef12c4ffe2be4ac12941315f06bfb084e7c story_processor_contract_v1.py
|
||||
1b1f42fc4bbff26551f26f4ac1e8a995dfe3ff98b940a29c9e130410965d0fa0 story_processor.py
|
||||
cdbc7ef88551e2b3a3771eac5be5e0360989e71fa009ac28c97e548507e08a5e sync_face_speaker_to_chunks.py
|
||||
8b08e9a33f5917aad10e070d6aa48805f5e7c23f905ba8fff3b8697b2109d962 sync_to_mongodb.py
|
||||
869b6c56fe16cbf8973826782a17503f02b5cd757ec025b944da693d38bdb4cb sync_users_from_sftpgo.py
|
||||
f64cc6dcb72f54d3e97aa981b40591aef4804ca769e1f14628d901b98bc6aeac terminology_manager.py
|
||||
455546b9bb3a2c2c877c7720229b254e75b28eea33b3715d1731c02ca85294ae test_api_correct_usage.py
|
||||
b03dc1bbb091672e7da2b131850b17badac896b4fbba92fe9bce76c232c99be4 test_api_with_key_id.py
|
||||
7d295c77d5bcd4c72c5673370af48cc89bbccf9292c3b82aad3a230d242547a9 test_args.py
|
||||
f474ec88e6634decbf178da497443fa709096b174bb4a4320a07256f516b1044 test_asr_large_model.py
|
||||
aa952524dd86f346740ffe555075b74adf2e60bb822bb04a943a51b1fd262445 test_birth_uuid.py
|
||||
db87badad7948527325a528400d67a4eeef76abf8d13f5c4254c812e944e4e0c test_end_to_end.py
|
||||
e191c98a82f7e089f7dccfc4c536244da2bf14339f982a3afef05d33332c3755 test_face_api_final.py
|
||||
1b97c9aae2e1744aa7aefb192eaef86c64e6134efc8f08ffa9a274bff16a58d3 test_face_api_with_correct_key.py
|
||||
f7e4078f31b1ca8494c18878219cf2f90c301f19fc851b9e7084657b71a5e150 test_face_api.py
|
||||
9eafc49f8fa42b4cd58109e9b725b3aec3b06943ec426919b1788838ccf1ed92 test_face_db_fix.py
|
||||
38bce82b167e0c97b257cc6b955fdc2e9ded581ce2d39eb0fd2c60249275394b test_face_direct.py
|
||||
24e82bf0af82407e6c04361e9a671770cbfb0b05d92df589bd0d5a0118bb5a98 test_face_learning.py
|
||||
8dcdb144c4253fbb466f220359b42c2a9579193865e320a56e682e384c2ae176 test_face_recognition_integration.py
|
||||
b921e3256fdea176d4391116d1ead472c4f3ca8aac6999140367818818c35ec3 test_face_registration_api.py
|
||||
9af6c6ff0c766b3de92185c3602f2b8b62b815bf88dcb0e3251c2676e61e0a48 test_face_tracker.py
|
||||
4f70eadb6a8b80eb8febe32b17b77e58d1a4823cc5d598e5ea45555342d2d4cb test_florence2_direct.py
|
||||
0588be0acea540950d737943073f71e769b6301374eaa4ff7fdb96a80145c4e0 test_florence2_pipeline.py
|
||||
694c15193616157ddae4bdb0a45feada2a8f8490f01d290a28aa77a4b24eabb2 test_florence2_stamps.py
|
||||
2c281f698616a83e9eeccd610555d9f9ab657b2deac65ae9e3dbfba0b450d9b0 test_identity_db.py
|
||||
7a73e8314ea7e91ca9dad3867a83b9c1101fdab09bdc0fdac0f798d0a7a204f3 test_llm_capabilities.py
|
||||
68300f87b96a474f06a3071a833e6b3ae48d1db5fb8a7e5a3ec1834fd878d808 test_multilingual.py
|
||||
c17cdd0f4ffb7a151a634add08d13cc576ba7a848bb20f54fb97d0c1d9d81cc0 test_object_search.py
|
||||
d07bd363a2878259fbf4ffcba40e367f7f1bf4171b5a5dfdda97f7a53b450d0e test_ollama_feasibility.py
|
||||
8421003b1f66cbd21c6fe5d3aff0a526897753e959b23905ca8f502f644f66a5 test_owl_vit_debug.py
|
||||
6f9e8b7947229ea4aa0a62b59bda5fcec05bd74f6c00dc4a7b06d932bd1b730f test_owl_vit_stamps.py
|
||||
da91a7c97466ce7f03cde13aa9bf6e691b3e482d2cac74519a2e1a61a2abb05a test_parent_chunk_generation.py
|
||||
19d9f2492d3b04b7dafa008f106767d3107dd36b0c8e4601765dca30131027cd test_places365_scene.py
|
||||
de44553023067362e8b2223f03e1bff55fcbd2f11ddf3d01060dc02c4675a744 test_probe_file.py
|
||||
c0e987ba06a61cc0426ffbca8af1eb51a97bd79acab59b70453cfbb18eaee093 test_processor_performance.py
|
||||
7b4b55e23dff35ba107b3da5b0560d03b1b41dfdea1d3a59eac777b4be4d4033 test_pyannote_audio.py
|
||||
5cb8b42033ffba41f25e7ef74ef04cf352c0c277a9971e9eaef53fd673902712 test_pyannote_multilingual.py
|
||||
8580e689ae148754e03d958419e108241040a012584ba49e8a90db114a9f8c13 test_scene_api.py
|
||||
1194d450070b1f42e045d98e532f41205bb3e52fc48ba26e7c9b72a188fe1b2c test_segment_count.py
|
||||
147bfffeac9561cfa407207b04a825862ac623ba97deecf5ed7c6257432dc62c test_speechbrain.py
|
||||
22e4b865bc769329c1146c2f914395044a9bc84cd2a13acf68fb374a57fe1e3e test_v2_detailed.py
|
||||
a616570a2a080b5b19f4bf783877147e714a014103b274143dd37984a946ca08 test_v2_model.py
|
||||
7b83611f6b3028500c91c62197f774c0769e299136eca8dc4b612a7b5743e3d6 test_v2_with_text.py
|
||||
1dd983c78074a61ceec26d7e3623d40772ca55fd6ee63ba368afe756c66ae091 test_with_real_image.py
|
||||
1b738cc0d69d33e967cbb775def0a7f58dc02f1911404af56a5825bd60a5b75b text_semantic_analysis.py
|
||||
a4221417ae00add76881c6c715ee4257c263e2dfd0a846a8887738682dfe8cda thumbnail_extractor.py
|
||||
0d188a738a0df79ead10065d9f17c366fe159c862bd4bafa2860d0e6ba2640c3 tkg_builder.py
|
||||
a084d3b5840e920d552515febffa22b34943b9efa8b73adab9cd193372e71592 tmdb_agent.py
|
||||
8b97f0fdfc0899460bf23d420dba0a51a34737c74ebad0519856909d198662bf tmdb_cast_fetcher.py
|
||||
4858909a0beaf8397becf4103be17fcc350841217afcdc1d917c48c512a9041b tmdb_embed_extractor.py
|
||||
54d8321dfe0f8caa669e4a9d1b48dc772a5b25817eab95b552944140c91f457d tmdb_identity_integration.py
|
||||
2a84aa2dcfb83ac385d2c394f884926f306c81798e4277a26dbd1f3c5506be46 trace_face_aggregator.py
|
||||
61d3b4b362722ce24326a204f1b72cc7b1dcc20cf3264a4f526d4ea343a8d33d transcribe.py
|
||||
ede9a184fd51ef4c87eb3e2541f09b91739a49986cb588591a7c6fbb33433020 unified_synonym_processor.py
|
||||
a408f294c3a71eb6a0eea80b9b586f73dedcefe286c62233f713a7428a9979be update_all_demographics.py
|
||||
e6520bb10ae6835ceade487ceb5e3fa549ca6f06de35b2c785d649921ef443f4 update_fine_speakers.py
|
||||
a2191daff2ad228725b6a66f0e472ec659a6b4fa8f2cbbd74d1bf9c35cca63eb update_person_demographics.py
|
||||
1a7dddd1db467990ee1c685d61b971babfa30c3ae3a754b5df8f3b4c320f3ed1 update_qdrant_uuid.py
|
||||
60060753cfd2a6d1241e55bf40a0c74f1df15739656d0349e22e8543036b2424 update_speaker_assignments.py
|
||||
fdc61009c351263e0018801b32ad90ffd8919af611a2a0580546be7fd62c99c4 update_terminology.py
|
||||
4840c11964a59eabad26b97fe01033ccaf7903e2d24edd5e1035f6dd5fc995ea vectorize_4188.py
|
||||
078979114c5f248d2bfd43aa8df55235fa03ab812f26998b984cd485a3d2cda8 vectorize_chunk_summaries.py
|
||||
ff98864f1b11795cc3bb64f30ccb6f8609771ddc7a5df2c003ba7c2233d16fc2 vectorize_chunks.py
|
||||
5880c128400e6e36c8eb7dffd009dbbc99dd13f8575b0037bdc854e25ddc41fb video_comparison_statistics.py
|
||||
0a1501ffdc027236cdf88706b3d61229e2998ab268fd57fb60e399ccb734b6a1 vision_agent.py
|
||||
eac8f90fbbb655614abcefc4b887e346bf94db5f015d33d37bc9514fb030489d visual_chunk_processor.py
|
||||
c165dfc5fc981dc731b25ef414184ee58e56b73b148d41a32fdce985c701efd5 visualize_stamp.py
|
||||
6c65a82fdd1d585e20bee4fcb2d1bdec2e6220bda71d6ef9cd00d6a3cf74c4d7 voice_embedding_extractor.py
|
||||
2b3a7b357db4ddd07ca30bf200c6600724e33441d8def0a4d9a39673e2cfb1c0 weather_sound_detector.py
|
||||
206b61ebf3c91d7ce3f1488247b52aca6e955042d8aa979c59723e3ff10dd36a yolo_benchmark_runner.py
|
||||
e8cb0963c90fbd1c2aa91141f80340edd3c9560d69780dd825d107c6ed14fa64 yolo_count_comparison.py
|
||||
dad775ecdca0144bd14b7abaa7ec8fb213e8b9428e39906abce541e93db496b6 yolo_processor_contract_v1.py
|
||||
74ff880e664ec514223a4f220b682fbc87089f8c0851c93ac68c97269b8a59b6 yolo_processor_mps.py
|
||||
8af0a6db683b6626e07820b302135ac5960d38e3d4b3d187c640b23ce8a14f72 yolo_processor.py
|
||||
e13cf22b9aeae96c7e28b4512dd2137743a25eb59027da446966c1aaaaf4ce71 zero_shot_combined_test.py
|
||||
f4aaf017ff588999f06cd9ba1787517e06c6d6e6228a15a54d8aa4f54fde5eb3 zero_shot_gun_test.py
|
||||
0a285b8ec33d7999e9d4ae8d43ce768c9f06ee1929e13a6809e98bdabe6357ce zero_shot_objects_test.py
|
||||
|
||||
136
scripts/embed_faces_only.py
Normal file
136
scripts/embed_faces_only.py
Normal file
@@ -0,0 +1,136 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""Embed faces from existing detections JSON using CoreML FaceNet."""
|
||||
import json, os, sys, time
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
import coremltools as ct
|
||||
|
||||
# Location of the FaceNet Core ML package, resolved relative to this script's
# own directory: <script dir>/../models/facenet512.mlpackage
FACENET_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "models", "facenet512.mlpackage")
|
||||
|
||||
def classify_pose(
    roll: float,
    yaw: float,
    *,
    frontal_limit: float = 15,
    profile_limit: float = 30,
) -> str:
    """Bucket a head pose into a coarse orientation label.

    Args:
        roll: Head roll angle. Only its magnitude is used.
        yaw: Head yaw angle. Its sign selects left vs. right profile.
        frontal_limit: Both ``abs(yaw)`` and ``abs(roll)`` must be strictly
            below this value for the pose to count as frontal.
        profile_limit: ``abs(yaw)`` must be strictly above this value for the
            pose to count as a profile.

    Returns:
        ``"frontal"``, ``"profile_right"`` (positive yaw), ``"profile_left"``
        (zero or negative yaw), or ``"three_quarter"`` for everything else.

    Note:
        The limits are in the same units as ``roll``/``yaw`` — presumably
        degrees; confirm against the detector that produces these angles.
    """
    abs_yaw, abs_roll = abs(yaw), abs(roll)

    if abs_yaw < frontal_limit and abs_roll < frontal_limit:
        return "frontal"
    if abs_yaw > profile_limit:
        return "profile_right" if yaw > 0 else "profile_left"
    # Either a moderate yaw, or a small yaw combined with a large roll.
    return "three_quarter"
|
||||
|
||||
def extract_embedding(coreml_model, face_img):
    """Run a face crop through the Core ML model and return its embedding.

    Args:
        coreml_model: Object exposing ``predict(dict) -> dict``; called with a
            single ``"input"`` entry.
        face_img: Face crop as an image array accepted by ``cv2.resize``
            (assumed height x width x channels — TODO confirm with caller).

    Returns:
        The first model output whose name starts with ``"var_"``, flattened
        into a plain Python list of floats.

    Raises:
        IndexError: If the model returns no output named ``var_*``.
    """
    resized = cv2.resize(face_img, (160, 160))

    # Map pixel values to roughly [-1, 1]; assumes an 8-bit (0..255) input.
    normalized = (resized.astype(np.float32) / 127.5) - 1.0

    # Move the last axis to the front, then add a leading batch axis of 1.
    normalized = np.transpose(normalized, (2, 0, 1))
    input_array = np.expand_dims(normalized, axis=0)

    # NOTE(review): the channel order is passed through unchanged. Frames read
    # with OpenCV are BGR — confirm the exported model expects that order.
    result = coreml_model.predict({"input": input_array})

    # The output name is auto-generated ("var_<n>"), so look it up by prefix.
    emb_key = next((k for k in result if k.startswith("var_")), None)
    if emb_key is None:
        # Same exception type the old ``[...][0]`` lookup produced, but with a
        # message that says what was actually returned.
        raise IndexError(
            f"model returned no 'var_*' output; available outputs: {sorted(result)}"
        )
    return result[emb_key].flatten().tolist()
|
||||
|
||||
def main():
    """CLI entry point: embed every detected face and write a face JSON file.

    Positional arguments are the detections JSON (a ``frames`` list of
    ``{"frame": int, "faces": [...]}`` entries) and the output path;
    ``--video`` is the source video. For each face whose bounding box is
    larger than 10x10, the face is cropped from its frame, embedded with
    ``extract_embedding`` and stored under ``frames[str(frame_num)]`` in the
    output, together with its pose classification and landmarks.

    Exits with status 1 when the video cannot be opened.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Embed faces only")
    parser.add_argument("detections_json")
    parser.add_argument("output_json")
    parser.add_argument("--video", required=True)
    args = parser.parse_args()

    print(f"[EMBED] Loading detections: {args.detections_json}")
    with open(args.detections_json) as f:
        detection_data = json.load(f)

    print(f"[EMBED] Loading CoreML FaceNet: {FACENET_PATH}")
    coreml_model = ct.models.MLModel(FACENET_PATH)

    print(f"[EMBED] Opening video: {args.video}")
    video = cv2.VideoCapture(args.video)
    if not video.isOpened():
        # Without this check every read fails silently and an empty result
        # would still be written with status "completed".
        video.release()
        print(f"[EMBED] Cannot open video: {args.video}")
        sys.exit(1)

    frames = detection_data.get("frames", [])
    t0 = time.time()
    embed_count, total_face_count = 0, 0
    batch_size = max(1, len(frames) // 20)

    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))

        face_data = {
            "metadata": {
                "video_path": os.path.abspath(args.video),
                "fps": fps, "width": width, "height": height,
                "sample_interval": detection_data.get("sample_interval", 3),
                "detection_method": "apple_vision",
                "embedding_method": "coreml_facenet",
                "total_frames": total_frames,
            },
            "frames": {}
        }

        for idx, frame_info in enumerate(frames):
            frame_num = frame_info["frame"]
            faces = []

            # The frame is decoded lazily and at most once, instead of being
            # re-seeked and re-decoded for every face it contains.
            frame = None
            read_failed = False

            for face in frame_info.get("faces", []):
                total_face_count += 1
                bb = face.get("bbox", face)
                x, y, w, h = bb["x"], bb["y"], bb["width"], bb["height"]
                if w <= 10 or h <= 10:
                    continue

                if frame is None and not read_failed:
                    video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
                    ret, frame = video.read()
                    if not ret:
                        frame, read_failed = None, True
                if frame is None:
                    continue

                # Clamp to the frame and force ints: numpy slicing rejects
                # float bounds.
                x1, y1 = max(0, int(x)), max(0, int(y))
                x2, y2 = min(width, int(x + w)), min(height, int(y + h))
                if x2 <= x1 or y2 <= y1:
                    continue
                face_img = frame[y1:y2, x1:x2]
                if face_img.size == 0:
                    continue

                emb = extract_embedding(coreml_model, face_img)
                if emb is not None:
                    embed_count += 1

                pose_info = face.get("pose", {})
                pose_angle = classify_pose(
                    pose_info.get("roll", 0),
                    pose_info.get("yaw", 0)
                )

                faces.append({
                    "x": x, "y": y, "width": w, "height": h,
                    "confidence": face.get("confidence", 0.5),
                    "embedding": emb,
                    "pose_angle": {
                        "angle": pose_angle,
                        "roll": pose_info.get("roll", 0),
                        "yaw": pose_info.get("yaw", 0),
                        "pitch": pose_info.get("pitch", 0),
                    },
                    "landmarks": face.get("landmarks", []),
                })

            face_data["frames"][str(frame_num)] = faces

            # Report progress roughly every 5% of the frame list.
            if (idx + 1) % batch_size == 0:
                pct = (idx + 1) / len(frames) * 100
                elapsed = time.time() - t0
                eta = (elapsed / (idx + 1)) * (len(frames) - idx - 1) if idx > 0 else 0
                print(f"[EMBED] {pct:.0f}% | {idx+1}/{len(frames)} frames | "
                      f"{embed_count} embeddings | {elapsed:.0f}s elapsed | "
                      f"{eta:.0f}s ETA", flush=True)
    finally:
        # Release the capture even if embedding or decoding raised.
        video.release()

    face_data["metadata"]["status"] = "completed"

    print(f"[EMBED] Writing output: {args.output_json}")
    with open(args.output_json, "w") as f:
        json.dump(face_data, f, indent=2)

    elapsed = time.time() - t0
    print(f"[EMBED] Done: {len(frames)} frames, {embed_count}/{total_face_count} embeddings, {elapsed:.0f}s")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
121
scripts/extract_embeddings_from_face_json.py
Normal file
121
scripts/extract_embeddings_from_face_json.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Extract embeddings from existing face.json using CoreML FaceNet.
|
||||
|
||||
Usage: python3 scripts/extract_embeddings_from_face_json.py <face_json_path> <video_path> <output_path>
|
||||
"""
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import cv2
|
||||
import numpy as np
|
||||
import coremltools as ct
|
||||
|
||||
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
FACENET_PATH = os.path.join(SCRIPT_DIR, "..", "models", "facenet512.mlpackage")
|
||||
|
||||
|
||||
def extract_embeddings(face_json_path: str, video_path: str, output_path: str):
    """Add a CoreML FaceNet embedding to every face listed in a face.json.

    Args:
        face_json_path: input JSON with a ``frames`` list of
            ``{"frame": int, "faces": [{"x", "y", "width", "height"}, ...]}``.
        video_path: video the frame numbers refer to.
        output_path: where the updated JSON (faces gain an ``embedding`` key,
            ``metadata.total_embeddings`` is set) is written.

    Returns early, after printing a message, when there are no frames, the
    model file is missing or the video cannot be opened. A failure to embed
    one face is logged and does not stop the run.
    """
    # Load face.json
    with open(face_json_path, 'r') as f:
        face_data = json.load(f)

    frames = face_data.get('frames', [])
    if not frames:
        print("No frames in face.json")
        return

    # Load CoreML FaceNet
    facenet = os.path.normpath(FACENET_PATH)
    if not os.path.exists(facenet):
        print(f"FaceNet model not found: {facenet}")
        return

    coreml_model = ct.models.MLModel(facenet)
    print(f"[EMB] CoreML FaceNet loaded: {facenet}")

    # Open video
    video = cv2.VideoCapture(video_path)
    if not video.isOpened():
        print(f"Cannot open video: {video_path}")
        return

    embed_count = 0
    processed_frames = 0

    try:
        fps = video.get(cv2.CAP_PROP_FPS)
        total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"[EMB] Video: {fps} fps, {total_frames} frames")

        for frame_entry in frames:
            frame_num = frame_entry.get('frame', 0)
            faces = frame_entry.get('faces', [])

            # Seek to frame
            video.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, img = video.read()
            if not ret:
                continue

            processed_frames += 1
            img_h, img_w = img.shape[:2]

            for face in faces:
                x = int(face.get('x', 0))
                y = int(face.get('y', 0))
                w = int(face.get('width', 0))
                h = int(face.get('height', 0))

                if w <= 0 or h <= 0:
                    continue

                # Clamp the box to the image: a negative x/y would otherwise
                # be read by numpy as an index from the far edge and crop
                # the wrong region.
                x1, y1 = max(0, x), max(0, y)
                x2, y2 = min(img_w, x + w), min(img_h, y + h)
                if x2 <= x1 or y2 <= y1:
                    continue

                crop = img[y1:y2, x1:x2]
                if crop.size == 0:
                    continue

                # Resize to 160x160 (FaceNet input size), then BGR -> RGB
                crop_resized = cv2.resize(crop, (160, 160))
                crop_rgb = cv2.cvtColor(crop_resized, cv2.COLOR_BGR2RGB)

                # CoreML embedding; best-effort per face.
                try:
                    output = coreml_model.predict({'image': crop_rgb})
                    emb = output.get('output', output.get('embeddings', None))

                    if emb is not None:
                        if isinstance(emb, dict):
                            # Unwrap a single nested output.
                            emb = list(emb.values())[0]
                        if isinstance(emb, np.ndarray):
                            emb = emb.flatten().tolist()

                        face['embedding'] = emb
                        embed_count += 1
                except Exception as e:
                    print(f"[EMB] Frame {frame_num} embedding failed: {e}")

            if processed_frames % 1000 == 0:
                print(f"[EMB] Processed {processed_frames} frames, {embed_count} embeddings")
    finally:
        # Release the capture even when decoding raised.
        video.release()

    # Save updated face.json; create the metadata section if the input had none.
    face_data.setdefault('metadata', {})['total_embeddings'] = embed_count

    with open(output_path, 'w') as f:
        json.dump(face_data, f)

    print(f"[EMB] Done: {processed_frames} frames, {embed_count} embeddings")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The first three command-line arguments are used; extras are ignored.
    cli_args = sys.argv[1:4]
    if len(cli_args) < 3:
        print("Usage: python3 extract_embeddings_from_face_json.py <face_json> <video> <output>")
        sys.exit(1)

    face_json_arg, video_arg, output_arg = cli_args
    extract_embeddings(face_json_arg, video_arg, output_arg)
|
||||
397
scripts/extract_face_crops.py
Normal file
397
scripts/extract_face_crops.py
Normal file
@@ -0,0 +1,397 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
extract_face_crops.py - 批量提取 face crops
|
||||
|
||||
Usage:
|
||||
python3 scripts/extract_face_crops.py --uuid <file_uuid>
|
||||
python3 scripts/extract_face_crops.py --uuid <file_uuid> --video <video_path>
|
||||
|
||||
儲存位置: {OUTPUT_DIR}/.faces/{file_uuid}/{trace_id}/{frame}.jpg
|
||||
|
||||
條件:
|
||||
- trace_id != None and trace_id != 0
|
||||
- landmarks.left_eye or landmarks.right_eye
|
||||
|
||||
品檢:
|
||||
- file_size > 500 bytes
|
||||
- mean_brightness > 5
|
||||
- std_deviation > 10
|
||||
|
||||
Retry: 最多 3 次
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple, Set
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import threading
|
||||
|
||||
# Constants
|
||||
MAX_RETRIES = 3
|
||||
MIN_FILE_SIZE = 500
|
||||
MIN_BRIGHTNESS = 5
|
||||
MIN_STD_DEV = 10
|
||||
FFMPEG_TIMEOUT = 30
|
||||
MAX_WORKERS = 8 # Parallel threads for ffmpeg
|
||||
|
||||
|
||||
class FaceCropExtractor:
    """Cut per-face JPEG crops out of a video with ffmpeg and quality-check them.

    Crops are written to ``{output_dir}/.faces/{uuid}/{trace_id}/{frame}.jpg``.
    Faces without a trace id go to ``.../unbound/``.
    """

    # A face is cropped only if its confidence and both bbox sides reach these.
    MIN_CONFIDENCE = 0.6
    MIN_SIZE = 20
    # Frame rate used when face.json carries no usable ``metadata.fps``.
    DEFAULT_FPS = 24.0

    def __init__(self, output_dir: str):
        """Create ``{output_dir}/.faces`` and reset the run statistics."""
        self.output_dir = Path(output_dir)
        self.faces_dir = self.output_dir / ".faces"
        self.faces_dir.mkdir(parents=True, exist_ok=True)
        self.stats = {"total_faces": 0, "qualified": 0, "successful": 0, "failed": 0, "skipped": 0, "low_confidence": 0, "too_small": 0}
        self.stats_lock = threading.Lock()

    @staticmethod
    def _faces_of(frame_entry) -> list:
        """Return the face list of a frame entry (bare list or ``{"faces": [...]}``)."""
        if isinstance(frame_entry, list):
            return frame_entry
        if isinstance(frame_entry, dict):
            return frame_entry.get("faces") or []
        return []

    def _resolve_fps(self, face_data: dict) -> float:
        """Return ``metadata.fps`` from face.json when positive, else DEFAULT_FPS."""
        metadata = face_data.get("metadata")
        fps = metadata.get("fps") if isinstance(metadata, dict) else None
        if isinstance(fps, (int, float)) and fps > 0:
            return float(fps)
        return self.DEFAULT_FPS

    def _build_trace_lookup(self, traced_data: dict) -> Dict[Tuple[int, int, int], int]:
        """Map ``(frame, x, y)`` to trace id for every traced face with a non-zero id."""
        lookup: Dict[Tuple[int, int, int], int] = {}
        frames = traced_data.get("frames", {})
        if not isinstance(frames, dict):
            return lookup
        for fnum, frm in frames.items():
            for face in self._faces_of(frm):
                if face is None:
                    continue
                trace_id = face.get("trace_id")
                if trace_id:
                    lookup[(int(fnum), face.get("x", 0), face.get("y", 0))] = trace_id
        return lookup

    def _collect_tasks(self, face_data: dict, trace_lookup: dict, uuid_dir: Path) -> List[dict]:
        """Filter the faces in face.json and return one crop task per kept face."""
        frames = face_data.get("frames", {})
        if isinstance(frames, dict):
            frame_items = frames.items()
        elif isinstance(frames, list):
            frame_items = [(frm.get("frame"), frm) for frm in frames]
        else:
            frame_items = []

        tasks: List[dict] = []
        processed: Set[tuple] = set()   # dedup keys of faces already queued
        used_paths: Set[Path] = set()   # output files already assigned

        for fnum, frm in frame_items:
            if fnum is None:
                continue

            for face in self._faces_of(frm):
                if face is None:
                    continue

                self.stats["total_faces"] += 1

                bb = face.get("bbox", face)
                is_box = isinstance(bb, dict)
                x = bb.get("x", 0) if is_box else 0
                y = bb.get("y", 0) if is_box else 0
                w = bb.get("width", 0) if is_box else 0
                h = bb.get("height", 0) if is_box else 0
                confidence = face.get("confidence") or 0.0

                # Quality filtering: confidence + size
                if confidence < self.MIN_CONFIDENCE:
                    self.stats["low_confidence"] += 1
                    continue
                if w < self.MIN_SIZE or h < self.MIN_SIZE:
                    self.stats["too_small"] += 1
                    continue

                # Require eye landmarks: a dict with either eye, or a list
                # with at least two entries.
                lm = face.get("landmarks")
                has_eyes = False
                if lm:
                    if isinstance(lm, dict):
                        has_eyes = lm.get("left_eye") or lm.get("right_eye")
                    elif isinstance(lm, list) and len(lm) >= 2:
                        has_eyes = True
                if not has_eyes:
                    self.stats["skipped"] += 1
                    continue

                self.stats["qualified"] += 1

                frame_no = int(fnum)
                trace_id = trace_lookup.get((frame_no, x, y)) or face.get("trace_id")

                if trace_id and trace_id != 0:
                    output_dir = uuid_dir / str(trace_id)
                    crop_key = (trace_id, frame_no)
                else:
                    # No trace_id -> unbound directory, keyed per face
                    output_dir = uuid_dir / "unbound"
                    crop_key = ("unbound", frame_no, x, y)

                if crop_key in processed:
                    continue
                processed.add(crop_key)

                output_dir.mkdir(parents=True, exist_ok=True)
                output_path = output_dir / f"{fnum}.jpg"
                if output_path in used_paths:
                    # Several unbound faces can share a frame. The first keeps
                    # the plain name; later ones get a positional suffix so
                    # parallel ffmpeg runs never write the same file.
                    output_path = output_dir / f"{fnum}_{x}_{y}.jpg"
                used_paths.add(output_path)

                tasks.append({
                    "trace_id": trace_id or "unbound",
                    "frame": frame_no,
                    "x": x, "y": y, "w": w, "h": h,
                    "output_path": output_path
                })

        return tasks

    def process_video(self, uuid: str, video_path: str) -> dict:
        """Extract crops for every qualifying face of one video.

        Reads ``{uuid}.face.json`` (required) and ``{uuid}.face_traced.json``
        (optional, supplies trace ids) from the output directory, runs the
        crops in a thread pool and writes ``_summary.json``.

        Returns:
            ``{"successful": [...], "failed": [...]}``, or ``{"error": msg}``
            when face.json or the video file is missing.
        """
        face_json = self.output_dir / f"{uuid}.face.json"
        traced_json = self.output_dir / f"{uuid}.face_traced.json"

        if not face_json.exists():
            print(f"[ERROR] face.json not found: {uuid}")
            return {"error": "face.json not found"}

        if not os.path.exists(video_path):
            print(f"[ERROR] Video not found: {video_path}")
            return {"error": "video not found"}

        # Load face.json (landmarks)
        print(f"[LOAD] Reading {face_json}")
        with open(face_json) as f:
            face_data = json.load(f)

        # Load face_traced.json if exists (trace_id)
        traced_data = {}
        if traced_json.exists():
            print(f"[LOAD] Reading {traced_json}")
            with open(traced_json) as f:
                traced_data = json.load(f)

        trace_lookup = self._build_trace_lookup(traced_data)
        # Seek timestamps are frame / fps, so the real frame rate matters.
        fps = self._resolve_fps(face_data)

        uuid_dir = self.faces_dir / uuid
        uuid_dir.mkdir(parents=True, exist_ok=True)

        tasks = self._collect_tasks(face_data, trace_lookup, uuid_dir)

        results = {"successful": [], "failed": []}
        trace_counts: Dict[int, int] = {}  # trace_id -> count

        # Parallel extraction
        print(f"[EXTRACT] Processing {len(tasks)} faces with {MAX_WORKERS} threads...")

        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = {
                executor.submit(
                    self.extract_face, video_path, t["frame"],
                    t["x"], t["y"], t["w"], t["h"], t["output_path"], fps
                ): t for t in tasks
            }

            # Stats are only touched here, in the submitting thread.
            for i, future in enumerate(as_completed(futures)):
                t = futures[future]
                result = future.result()

                if result["success"]:
                    self.stats["successful"] += 1
                    results["successful"].append({
                        "trace_id": t["trace_id"],
                        "frame": t["frame"],
                        "path": str(t["output_path"])
                    })
                    trace_counts[t["trace_id"]] = trace_counts.get(t["trace_id"], 0) + 1
                else:
                    self.stats["failed"] += 1
                    results["failed"].append({
                        "trace_id": t["trace_id"],
                        "frame": t["frame"],
                        "bbox": {"x": t["x"], "y": t["y"], "w": t["w"], "h": t["h"]},
                        "reason": result.get("reason", "unknown")
                    })

                # Progress every 1000
                if (i + 1) % 1000 == 0:
                    print(f"  Progress: {i+1}/{len(tasks)} ({self.stats['successful']} OK, {self.stats['failed']} fail)")

        # Write summary
        self.write_summary(uuid, trace_counts, results)

        return results

    def extract_face(self, video_path: str, frame: int, x: int, y: int,
                     w: int, h: int, output_path: Path, fps: float = 24.0) -> dict:
        """Crop one face with ffmpeg (input-side ``-ss`` seek), retrying on failure.

        Args:
            frame: frame number; the seek time is ``frame / fps``.
            x, y, w, h: crop rectangle in pixels.
            output_path: JPEG file to write.
            fps: frame rate of the video (defaults to 24.0).

        Returns:
            ``{"success": True, "path": str}`` or
            ``{"success": False, "reason": str}``. A crop that fails the
            quality check is deleted, so no rejected file is left behind.
        """
        reason = "max_retries"
        try:
            ts = frame / fps
            cmd = [
                "ffmpeg", "-y", "-ss", f"{ts:.3f}",
                "-i", video_path,
                "-vf", f"crop={w}:{h}:{x}:{y}",
                "-frames:v", "1",
                "-q:v", "2",  # high-quality JPEG
                str(output_path)
            ]

            for _attempt in range(MAX_RETRIES):
                try:
                    proc = subprocess.run(
                        cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                        timeout=FFMPEG_TIMEOUT
                    )
                except subprocess.TimeoutExpired:
                    reason = "timeout"
                    continue

                if proc.returncode != 0:
                    reason = "ffmpeg_failed"
                    continue

                quality = self.check_quality(output_path)
                if quality["ok"]:
                    return {"success": True, "path": str(output_path)}

                # Rejected crop: remove it, then retry if attempts remain.
                reason = quality.get("reason", "quality_failed")
                if output_path.exists():
                    output_path.unlink()
        except Exception as e:
            return {"success": False, "reason": str(e)}

        return {"success": False, "reason": reason}

    def check_quality(self, path: Path) -> dict:
        """Quality-check a crop: file size, mean brightness and pixel std-dev.

        Returns ``{"ok": True}`` or ``{"ok": False, "reason": str}``. When
        PIL is not installed only the file-size check is applied.
        """
        if not path.exists():
            return {"ok": False, "reason": "file_not_exist"}

        file_size = path.stat().st_size
        if file_size < MIN_FILE_SIZE:
            return {"ok": False, "reason": f"empty_file ({file_size}B)"}

        try:
            from PIL import Image
            import numpy as np

            # Context manager closes the file handle promptly; thousands of
            # crops are checked per run.
            with Image.open(path) as img:
                arr = np.array(img.convert('RGB'))

            mean_brightness = arr.mean()
            if mean_brightness < MIN_BRIGHTNESS:
                return {"ok": False, "reason": f"black_frame (mean={mean_brightness:.1f})"}

            std_dev = arr.std()
            if std_dev < MIN_STD_DEV:
                return {"ok": False, "reason": f"low_contrast (std={std_dev:.1f})"}

            return {"ok": True}

        except ImportError:
            # PIL not available, skip advanced quality check
            return {"ok": True}
        except Exception as e:
            return {"ok": False, "reason": str(e)}

    def write_summary(self, uuid: str, trace_counts: Dict[int, int], results: dict):
        """Write ``_summary.json`` (stats, per-trace counts, failures) for *uuid*."""
        summary_path = self.faces_dir / uuid / "_summary.json"

        summary = {
            "file_uuid": uuid,
            "timestamp": datetime.now().isoformat(),
            "stats": self.stats,
            "trace_counts": trace_counts,
            "total_traces": len(trace_counts),
            "failed_count": len(results["failed"]),
            "failed_faces": results["failed"] if results["failed"] else None
        }

        with open(summary_path, "w") as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

        print(f"\n[SUMMARY] Written to {summary_path}")

    def print_stats(self):
        """Print the run statistics."""
        print(f"\n=== Statistics ===")
        print(f"Total faces scanned: {self.stats['total_faces']}")
        print(f"Filtered (low confidence < 0.6): {self.stats['low_confidence']}")
        print(f"Filtered (too small < 20px): {self.stats['too_small']}")
        print(f"Qualified (trace_id + eyes): {self.stats['qualified']}")
        print(f"Successfully extracted: {self.stats['successful']}")
        print(f"Failed: {self.stats['failed']}")
        print(f"Skipped (no trace/eyes): {self.stats['skipped']}")
|
||||
|
||||
|
||||
def main():
    """CLI entry point: extract face crops for one file UUID.

    The video path comes from ``--video`` or, when omitted, from the
    database lookup. Exits with status 1 when no video path can be found or
    when ``process_video`` reports an error, so callers can detect failure
    from the exit code.
    """
    parser = argparse.ArgumentParser(
        description="Extract face crops from videos",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    parser.add_argument("--uuid", type=str, required=True,
                        help="File UUID to process")
    parser.add_argument("--video", type=str,
                        help="Video file path (optional, will check DB if not provided)")
    parser.add_argument("--output-dir", type=str,
                        default="/Users/accusys/momentry/output_dev",
                        help="Output directory (default: output_dev)")

    args = parser.parse_args()

    # Get video path: explicit argument first, database as fallback.
    video_path = args.video or query_video_path_from_db(args.uuid)
    if not video_path:
        print(f"[ERROR] Video path not found for UUID: {args.uuid}")
        sys.exit(1)

    print("=== Face Crop Extraction ===")
    print(f"UUID: {args.uuid}")
    print(f"Video: {video_path}")
    print(f"Output: {args.output_dir}/.faces/{args.uuid}/")
    print()

    extractor = FaceCropExtractor(args.output_dir)
    results = extractor.process_video(args.uuid, video_path)
    extractor.print_stats()

    # process_video signals a missing face.json / video with an "error" key.
    if "error" in results:
        sys.exit(1)
|
||||
|
||||
|
||||
def query_video_path_from_db(uuid: str) -> Optional[str]:
    """Look up a video's file path in PostgreSQL by shelling out to ``psql``.

    Args:
        uuid: value matched against ``public.videos.file_uuid``.

    Returns:
        The ``file_path`` of the first matching row, or ``None`` when psql
        cannot be found, the query fails or times out, or no row matches.
    """
    import shutil

    psql_path = "/opt/homebrew/Cellar/libpq/18.3/bin/psql"
    if not os.path.exists(psql_path):
        # The Cellar path is pinned to one libpq version; fall back to
        # whatever psql is on PATH before giving up.
        psql_path = shutil.which("psql")
        if psql_path is None:
            return None

    # The value comes from the command line. Double embedded single quotes so
    # it cannot close the SQL string literal (backslashes are not escapes
    # under PostgreSQL's default standard_conforming_strings=on).
    safe_uuid = uuid.replace("'", "''")

    cmd = [
        psql_path, "-U", "accusys", "-d", "momentry", "-t", "-A",
        "-c", f"SELECT file_path FROM public.videos WHERE file_uuid = '{safe_uuid}' LIMIT 1"
    ]

    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        return None

    if proc.returncode != 0:
        return None

    path = proc.stdout.strip()
    return path or None
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
174
scripts/extract_video_embeddings.py
Normal file
174
scripts/extract_video_embeddings.py
Normal file
@@ -0,0 +1,174 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Extract face embeddings for a video file using InsightFace + CoreML FaceNet.
|
||||
Updates face_detections.embedding in PostgreSQL.
|
||||
|
||||
Usage: python3 scripts/extract_video_embeddings.py --file-uuid <uuid> --video-path <path>
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import io
|
||||
import warnings
|
||||
import cv2
|
||||
import numpy as np
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_values
|
||||
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
DATABASE_URL = os.getenv("DATABASE_URL", "postgres://accusys@localhost:5432/momentry")
|
||||
MODELS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "models")
|
||||
FACENET_PATH = os.path.join(MODELS_DIR, "facenet512.mlpackage")
|
||||
|
||||
|
||||
def get_schema():
    """Return the database schema to use: ``"dev"`` or ``"public"``.

    ``"dev"`` is selected when ``DATABASE_URL`` contains ``search_path=dev``
    or when the ``DATABASE_SCHEMA`` environment variable equals ``dev``;
    otherwise ``"public"``.
    """
    db_url = os.getenv("DATABASE_URL", "")
    # Compare the variable's value. `"DATABASE_SCHEMA=dev" in os.environ`
    # would look for a variable literally named "DATABASE_SCHEMA=dev".
    if "search_path=dev" in db_url or os.environ.get("DATABASE_SCHEMA") == "dev":
        return "dev"
    return "public"
|
||||
|
||||
|
||||
def extract_video_embeddings(file_uuid: str, video_path: str, schema: str = "dev"):
    """Fill in missing face embeddings for one video in ``face_detections``.

    Rows of ``{schema}.face_detections`` for *file_uuid* whose ``embedding``
    is NULL are loaded. Each referenced frame is decoded and run through
    InsightFace, and every stored box is matched to the detected face with
    the highest IoU. When that IoU exceeds 0.3, the detected face's
    embedding is written back to the row.

    Args:
        file_uuid: value of ``face_detections.file_uuid`` to process.
        video_path: video the stored frame numbers refer to.
        schema: database schema holding ``face_detections``.
    """
    import contextlib
    from psycopg2 import sql

    # Suppress InsightFace verbose output while the detector loads.
    with contextlib.redirect_stdout(io.StringIO()):
        from insightface.app import FaceAnalysis

        app = FaceAnalysis(name="buffalo_l", providers=["CPUExecutionProvider"])
        app.prepare(ctx_id=0, det_thresh=0.5)

    # Quote the schema as an identifier instead of pasting it into the SQL.
    table = sql.SQL("{}.face_detections").format(sql.Identifier(schema))

    cap = cv2.VideoCapture(video_path)
    conn = None
    cur = None
    try:
        if not cap.isOpened():
            print(f"[EMBED] Cannot open video: {video_path}")
            return

        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        print(f"[EMBED] Video: {total_frames} frames, {fps} fps")

        # Get face detections from DB (without embeddings)
        conn = psycopg2.connect(DATABASE_URL)
        cur = conn.cursor()

        cur.execute(sql.SQL("""
            SELECT id, frame_number, x, y, width, height
            FROM {}
            WHERE file_uuid = %s AND embedding IS NULL
            ORDER BY frame_number
        """).format(table), (file_uuid,))

        face_records = cur.fetchall()
        print(f"[EMBED] Faces without embedding: {len(face_records)}")

        if len(face_records) == 0:
            print("[EMBED] All faces have embeddings")
            return

        # Build frame -> faces mapping
        frame_faces = {}
        for face_id, frame_num, x, y, w, h in face_records:
            frame_faces.setdefault(frame_num, []).append((face_id, x, y, w, h))

        # Extract embeddings
        batch_updates = []
        processed_frames = 0

        for frame_num in sorted(frame_faces):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num)
            ret, frame = cap.read()
            if not ret:
                continue

            # Detect faces in this frame
            detected = app.get(frame)

            for face_id, x, y, w, h in frame_faces[frame_num]:
                # Find the detected face that overlaps the stored box most.
                best_face = None
                best_iou = 0

                for det_face in detected:
                    fx1, fy1, fx2, fy2 = det_face.bbox
                    fw, fh = fx2 - fx1, fy2 - fy1

                    # Intersection over union of stored box vs detection
                    xi1, yi1 = max(x, fx1), max(y, fy1)
                    xi2, yi2 = min(x + w, fx2), min(y + h, fy2)
                    inter = max(0, xi2 - xi1) * max(0, yi2 - yi1)
                    union = w * h + fw * fh - inter
                    iou = inter / union if union > 0 else 0

                    if iou > best_iou:
                        best_iou = iou
                        best_face = det_face

                if best_face is not None and best_iou > 0.3:
                    embedding = best_face.embedding
                    if embedding is not None and len(embedding) > 0:
                        batch_updates.append((embedding.tolist(), face_id))

            processed_frames += 1
            if processed_frames % 100 == 0:
                print(f"[EMBED] Progress: {processed_frames} frames, {len(batch_updates)} embeddings")

        # Update embeddings in DB
        if batch_updates:
            print(f"[EMBED] Updating {len(batch_updates)} embeddings...")

            update_stmt = sql.SQL("""
                UPDATE {}
                SET embedding = %s
                WHERE id = %s
            """).format(table)
            for emb, face_id in batch_updates:
                cur.execute(update_stmt, (emb, face_id))

            conn.commit()

        # Verify
        cur.execute(sql.SQL("""
            SELECT COUNT(embedding) FROM {}
            WHERE file_uuid = %s
        """).format(table), (file_uuid,))
        embed_count = cur.fetchone()[0]

        print(f"[EMBED] Done: {embed_count} faces with embeddings")
    finally:
        # Release the capture and the DB handles on every exit path.
        cap.release()
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
def main():
    """Parse the command line and run the embedding extraction for one video."""
    cli = argparse.ArgumentParser(description="Extract face embeddings from video")

    # Both identifying options are mandatory.
    for flag, help_text in (
        ("--file-uuid", "Video file UUID"),
        ("--video-path", "Video file path"),
    ):
        cli.add_argument(flag, required=True, help=help_text)
    cli.add_argument("--schema", default=get_schema(), help="Database schema")

    opts = cli.parse_args()
    extract_video_embeddings(opts.file_uuid, opts.video_path, opts.schema)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
64
scripts/insert_face_detections.py
Normal file
64
scripts/insert_face_detections.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""Insert face detections from traced JSON into DB."""
|
||||
import json, os, sys
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
|
||||
DB_URL = os.environ.get("DATABASE_URL", "postgresql://accusys@localhost:5432/momentry")
|
||||
|
||||
def insert_faces(file_uuid, traced_json_path, schema):
    """Insert the faces of a traced JSON file into ``{schema}.face_detections``.

    Args:
        file_uuid: value stored in the ``file_uuid`` column of every row.
        traced_json_path: JSON with ``frames`` keyed by frame number and an
            optional ``metadata.fps`` (24.0 is used when missing or zero).
        schema: database schema holding ``face_detections``.

    Each row is inserted inside its own savepoint, so a failing row is
    logged and skipped without discarding the rows already inserted. The
    printed total counts rows actually inserted (conflicts are not counted).
    """
    from psycopg2 import sql

    with open(traced_json_path) as f:
        data = json.load(f)

    frames = data.get("frames", {})
    metadata = data.get("metadata") or {}
    # Guard against fps of 0/null, which would divide by zero below.
    fps = metadata.get("fps") or 24.0

    # Quote the schema as an identifier instead of pasting it into the SQL.
    insert_stmt = sql.SQL("""
        INSERT INTO {}.face_detections
        (file_uuid, frame_number, timestamp_secs, x, y, width, height, confidence, trace_id, embedding)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT DO NOTHING
    """).format(sql.Identifier(schema))

    conn = psycopg2.connect(DB_URL)
    total = 0
    try:
        cur = conn.cursor()

        for frame_num_str, frame_data in sorted(frames.items(), key=lambda x: int(x[0])):
            frame_num = int(frame_num_str)
            ts = frame_num / fps
            # A frame entry is either {"faces": [...]} or a bare face list.
            if isinstance(frame_data, list):
                faces = frame_data
            else:
                faces = frame_data.get("faces", [])

            for face in faces:
                row = (
                    file_uuid, frame_num, ts,
                    int(face.get("x", 0)),
                    int(face.get("y", 0)),
                    int(face.get("width", 0)),
                    int(face.get("height", 0)),
                    face.get("confidence", 0.0),
                    face.get("trace_id"),
                    face.get("embedding"),
                )

                # A plain rollback would undo every earlier insert of this
                # transaction; the savepoint limits it to the failing row.
                cur.execute("SAVEPOINT face_row")
                try:
                    cur.execute(insert_stmt, row)
                except psycopg2.Error as e:
                    print(f"[INSERT] Error at frame {frame_num}: {e}")
                    cur.execute("ROLLBACK TO SAVEPOINT face_row")
                else:
                    if cur.rowcount > 0:
                        total += 1
                    cur.execute("RELEASE SAVEPOINT face_row")

        conn.commit()
        cur.close()
    finally:
        conn.close()

    print(f"[INSERT] Inserted {total} face detections into {schema}.face_detections")
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # --file-uuid and --face-json are mandatory; --schema defaults to "public".
    cli = argparse.ArgumentParser(description="Insert face detections")
    for required_flag in ("--file-uuid", "--face-json"):
        cli.add_argument(required_flag, required=True)
    cli.add_argument("--schema", default="public")
    opts = cli.parse_args()

    insert_faces(opts.file_uuid, opts.face_json, opts.schema)
|
||||
201
scripts/match_faces_to_tmdb.py
Normal file
201
scripts/match_faces_to_tmdb.py
Normal file
@@ -0,0 +1,201 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Match face_detections against TMDb identities via face embedding similarity.
|
||||
Port of match_faces_against_tmdb from src/core/tmdb/face_agent.rs
|
||||
|
||||
Usage: python3 scripts/match_faces_to_tmdb.py <file_uuid> [--schema dev]
|
||||
"""
|
||||
|
||||
import sys
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import numpy as np
|
||||
from collections import defaultdict
|
||||
|
||||
DATABASE_URL = "postgres://accusys@localhost:5432/momentry"
|
||||
THRESHOLD = 0.50
|
||||
QC_MIN_FACES = 4 # Minimum faces per trace for QC
|
||||
|
||||
|
||||
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors, computed in float64.

    Either argument may be any sequence numpy can convert. When either
    vector has zero norm the result is ``0.0`` instead of a division by
    zero.
    """
    vec_a = np.asarray(a, dtype=np.float64)
    vec_b = np.asarray(b, dtype=np.float64)

    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if not (norm_a != 0 and norm_b != 0):
        return 0.0

    denominator = norm_a * norm_b
    return np.dot(vec_a, vec_b) / denominator
|
||||
|
||||
|
||||
def match_faces_to_tmdb(file_uuid: str, schema: str = "dev"):
    """Bind the face traces of one file to TMDb identities by embedding similarity.

    Steps:
      1. Load identities with ``source = 'tmdb'`` that have a face embedding.
      2. Load the file's face detections that have both a trace_id and an
         embedding, group them by trace, and drop near-identical embeddings
         (similarity > 0.99 against the previously kept face, after sorting
         by the first embedding component).
      3. Compare every remaining face with every TMDb embedding; per trace,
         the identity with the highest mean similarity over the pairs that
         reached THRESHOLD wins.
      4. Discard matches for traces with fewer than QC_MIN_FACES faces.
      5. When one identity is claimed more than once in the same frame, keep
         only the trace whose detection has the highest confidence.
      6. Write identity_id onto the detections of the surviving traces that
         do not have an identity yet.

    Args:
        file_uuid: UUID of the video file whose detections are matched.
        schema: Database schema containing ``identities`` and
            ``face_detections``. It is interpolated into the SQL text, so it
            must come from a trusted source.

    Returns:
        Number of face_detections rows updated; 0 when there are no TMDb
        identities or no usable detections.
    """
    conn = psycopg2.connect(DATABASE_URL)
    cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
    try:
        # Step 1: Load TMDb identities with face embeddings
        cur.execute(f"""
            SELECT id, name, tmdb_id, face_embedding::real[] as embedding
            FROM {schema}.identities
            WHERE source = 'tmdb' AND face_embedding IS NOT NULL
        """)
        tmdb_identities = [
            {
                "id": row["id"],
                "name": row["name"],
                "tmdb_id": row["tmdb_id"],
                "embedding": row["embedding"],
            }
            for row in cur.fetchall()
            if row["embedding"]
        ]

        print(f"[TMDB-MATCH] Loaded {len(tmdb_identities)} TMDb identities")

        if not tmdb_identities:
            print("[TMDB-MATCH] No TMDb identities with embeddings")
            return 0

        # Step 2: Load face_detections with trace_id and embedding
        cur.execute(f"""
            SELECT id, trace_id, frame_number, embedding::real[] as embedding, confidence
            FROM {schema}.face_detections
            WHERE file_uuid = %s AND trace_id IS NOT NULL AND embedding IS NOT NULL
            ORDER BY trace_id, frame_number
        """, (file_uuid,))

        fd_rows = cur.fetchall()
        if not fd_rows:
            print(f"[TMDB-MATCH] No face detections for {file_uuid}")
            return 0

        # Group by trace_id
        trace_faces = defaultdict(list)
        for row in fd_rows:
            if row["embedding"]:
                trace_faces[row["trace_id"]].append({
                    "id": row["id"],
                    "embedding": row["embedding"],
                    "frame": row["frame_number"],
                    "confidence": row["confidence"],
                })

        # Dedup near-identical embeddings within trace (sim > 0.99).
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over items().
        for tid, faces in trace_faces.items():
            faces.sort(key=lambda face: face["embedding"][0])
            unique = []
            for face in faces:
                if not unique or cosine_similarity(face["embedding"], unique[-1]["embedding"]) <= 0.99:
                    unique.append(face)
            trace_faces[tid] = unique

        total_traces = len(trace_faces)
        total_faces = len(fd_rows)  # counted before de-duplication
        print(f"[TMDB-MATCH] {total_traces} traces with {total_faces} faces")

        # Step 3: Single-pass matching (one round only for performance)
        matched = {}  # trace_id -> (identity_id, name, mean similarity)

        # Build reference pool from TMDb seeds only
        reference_pool = [
            {
                "embedding": tmdb["embedding"],
                "identity_id": tmdb["id"],
                "name": tmdb["name"],
            }
            for tmdb in tmdb_identities
        ]

        print(f"[TMDB-MATCH] Matching {total_traces} traces against {len(reference_pool)} TMDb identities (threshold={THRESHOLD})")

        # Match each trace against TMDb seeds
        for tid, faces in trace_faces.items():
            trace_scores = defaultdict(list)
            for face in faces:
                for ref in reference_pool:
                    sim = cosine_similarity(face["embedding"], ref["embedding"])
                    if sim >= THRESHOLD:
                        trace_scores[ref["identity_id"]].append((sim, ref["name"]))

            if not trace_scores:
                continue

            # Select the identity with the highest mean similarity
            best_identity = None
            best_score = 0
            best_name = None
            for identity_id, scores in trace_scores.items():
                avg_sim = np.mean([s[0] for s in scores])
                if avg_sim > best_score:
                    best_score = avg_sim
                    best_identity = identity_id
                    best_name = scores[0][1]

            # `is not None`: an identity whose id is 0 is still a valid match.
            if best_identity is not None:
                matched[tid] = (best_identity, best_name, best_score)

        # Step 4: Quality Control - minimum faces per trace
        qc_removed = 0
        for tid, faces in trace_faces.items():
            if tid in matched and len(faces) < QC_MIN_FACES:
                del matched[tid]
                qc_removed += 1

        # Step 5: Temporal collision check.
        # Index every (frame, identity) claim once, instead of rescanning all
        # traces and faces for each conflicting frame.
        frame_claims = defaultdict(lambda: defaultdict(list))
        for tid, faces in trace_faces.items():
            if tid in matched:
                identity_id = matched[tid][0]
                for face in faces:
                    frame_claims[face["frame"]][identity_id].append((tid, face["confidence"]))

        for identity_claims in frame_claims.values():
            for claims in identity_claims.values():
                if len(claims) <= 1:
                    continue
                # Traces removed by an earlier conflict no longer compete.
                conflicting = [claim for claim in claims if claim[0] in matched]
                conflicting.sort(key=lambda claim: claim[1], reverse=True)
                for tid, _ in conflicting[1:]:
                    if tid in matched:
                        del matched[tid]
                        qc_removed += 1

        if qc_removed > 0:
            print(f"[TMDB-MATCH] QC removed {qc_removed} traces")

        # Step 6: Update face_detections.identity_id
        bindings_created = 0
        for tid, (identity_id, name, score) in matched.items():
            for face in trace_faces[tid]:
                cur.execute(f"""
                    UPDATE {schema}.face_detections
                    SET identity_id = %s
                    WHERE id = %s AND identity_id IS NULL
                """, (identity_id, face["id"]))
                bindings_created += cur.rowcount

        conn.commit()
    finally:
        # Always release the cursor and connection, including on early return
        # or when a query raises.
        cur.close()
        conn.close()

    print(f"[TMDB-MATCH] {bindings_created} bindings created, {len(matched)} traces matched")
    return bindings_created
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: one positional file UUID plus an optional
    # --schema, forwarded to match_faces_to_tmdb().
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("file_uuid", help="Video file UUID")
    cli.add_argument("--schema", default="dev", help="Database schema")
    opts = cli.parse_args()

    match_faces_to_tmdb(opts.file_uuid, opts.schema)
|
||||
@@ -84,18 +84,22 @@ def process_ocr(
|
||||
|
||||
|
||||
def _fallback(video_path, output_path, uuid, sample_interval):
|
||||
"""Fallback to original PaddleOCR implementation"""
|
||||
"""Fallback to MPS OCR implementation"""
|
||||
import importlib
|
||||
spec = importlib.util.spec_from_file_location(
|
||||
"paddle_ocr",
|
||||
os.path.join(os.path.dirname(__file__), "ocr_paddle.py")
|
||||
"ocr_mps",
|
||||
os.path.join(os.path.dirname(__file__), "ocr_processor_mps.py")
|
||||
)
|
||||
if spec is None:
|
||||
print("[OCR] No fallback available, returning empty result", file=sys.stderr)
|
||||
return {"frame_count": 0, "fps": 0, "frames": []}
|
||||
paddle = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(paddle)
|
||||
return paddle.process_ocr(video_path, output_path, uuid, sample_interval=sample_interval)
|
||||
ocr_mps = importlib.util.module_from_spec(spec)
|
||||
spec.loader.exec_module(ocr_mps)
|
||||
return ocr_mps.process_video_ocr(
|
||||
video_path=video_path,
|
||||
output_path=output_path,
|
||||
sample_interval=sample_interval
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
77
scripts/qdrant_sync_collection.py
Normal file
77
scripts/qdrant_sync_collection.py
Normal file
@@ -0,0 +1,77 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Sync all vectors from one Qdrant collection to another on the same instance."""
|
||||
import json
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
API_KEY = "Test3200Test3200Test3200"
|
||||
SOURCE = "momentry_dev_rule1_v2"
|
||||
TARGET = "momentry_rule1"
|
||||
BATCH_SIZE = 500
|
||||
SLEEP = 0.05
|
||||
|
||||
|
||||
def qdrant(method, path, body=None):
    """Send one JSON request to the Qdrant REST API and return the decoded reply.

    Args:
        method: HTTP method, e.g. "POST" or "PUT".
        path: Request path appended to QDRANT_URL (starts with "/").
        body: Optional JSON-serialisable payload. ``None`` sends no body; any
            other value, including an empty dict or list, is serialised and
            sent.

    Returns:
        The response body parsed from JSON.

    Raises:
        urllib.error.HTTPError: Re-raised after printing the status code and
            the response body.
    """
    url = f"{QDRANT_URL}{path}"
    # `is not None` rather than truthiness, so an explicit empty payload is
    # still transmitted.
    data = json.dumps(body).encode() if body is not None else None
    req = urllib.request.Request(url, data=data, method=method)
    req.add_header("Content-Type", "application/json")
    req.add_header("Api-Key", API_KEY)
    try:
        with urllib.request.urlopen(req) as resp:
            return json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error body must not mask the HTTP error.
        print(f" HTTP {e.code}: {e.read().decode(errors='replace')}")
        raise
|
||||
|
||||
|
||||
def scroll_batch(offset=None):
    """Fetch one page of points, with payloads and vectors, from SOURCE.

    Args:
        offset: Scroll offset returned by the previous page, or None to start
            from the beginning.

    Returns:
        A ``(points, next_offset)`` tuple; ``points`` defaults to an empty
        list and ``next_offset`` to None when the reply lacks them.
    """
    request = {"limit": BATCH_SIZE, "with_payload": True, "with_vector": True}
    if offset is not None:
        request["offset"] = offset
    response = qdrant("POST", f"/collections/{SOURCE}/points/scroll", request)
    page = response.get("result", {})
    return page.get("points", []), page.get("next_page_offset")
|
||||
|
||||
|
||||
def upsert_batch(points):
    """Upsert ``points`` into TARGET and report whether Qdrant answered "ok".

    The status is accepted either at the top level of the reply or nested
    under its "result" object.
    """
    response = qdrant("PUT", f"/collections/{TARGET}/points", {"points": points})
    if response.get("status") == "ok":
        return True
    return response.get("result", {}).get("status") == "ok"
|
||||
|
||||
|
||||
def main():
    """Copy every point from SOURCE to TARGET page by page, printing progress.

    Stops when a page comes back empty, when an upsert is not acknowledged,
    or when the scroll reports no further offset. Sleeps SLEEP seconds
    between pages.
    """
    started = time.time()
    cursor = None
    synced = 0
    batches_done = 0

    while True:
        page, cursor = scroll_batch(cursor)
        if not page:
            break

        if not upsert_batch(page):
            print(f" FAILED batch {batches_done} ({len(page)} pts)")
            break

        synced += len(page)
        batches_done += 1
        # Progress line every tenth successful batch.
        if batches_done % 10 == 0:
            print(f" Synced {synced} points ({time.time() - started:.1f}s)")

        if cursor is None:
            break
        time.sleep(SLEEP)

    print(f"Done: {synced} points synced in {time.time() - started:.1f}s")
|
||||
|
||||
|
||||
# Run the sync only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -4,6 +4,24 @@ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
|
||||
ENV_FILE="${PROJECT_DIR}/.env.development"
|
||||
|
||||
# Load env vars (silently)
|
||||
source "$ENV_FILE" 2>/dev/null || true
|
||||
|
||||
# Path defaults (can be overridden by env vars above)
|
||||
LOG_DIR="${MOMENTRY_LOG_DIR:-/Users/accusys/momentry/logs}"
|
||||
PG_BIN_DIR="${MOMENTRY_PG_BIN_DIR:-/Users/accusys/pgsql/18.3/bin}"
|
||||
PG_DATA_DIR="${MOMENTRY_PG_DATA_DIR:-/Users/accusys/pgsql/data}"
|
||||
QDRANT_BIN="${MOMENTRY_QDRANT_BIN:-/Users/accusys/.cargo/bin/qdrant}"
|
||||
QDRANT_STORAGE_DIR="${MOMENTRY_QDRANT_STORAGE_DIR:-/Users/accusys/momentry/qdrant_storage}"
|
||||
LLAMACPP_BIN="${MOMENTRY_LLAMACPP_BIN:-/Users/accusys/llama/bin/llama-server}"
|
||||
A4B_MODEL="${MOMENTRY_LLM_A4B_MODEL_PATH:-/Users/accusys/models/google_gemma-4-26B-A4B-it-Q5_K_M.gguf}"
|
||||
A4B_MMPROJ="${MOMENTRY_LLM_A4B_MMPROJ_PATH:-/Users/accusys/models/gemma-4-26B-A4B-it.mmproj-f16.gguf}"
|
||||
E4B_MODEL="${MOMENTRY_LLM_E4B_MODEL_PATH:-/Users/accusys/models/gemma-4-E4B-it-Q4_K_M.gguf}"
|
||||
E4B_MMPROJ="${MOMENTRY_LLM_E4B_MMPROJ_PATH:-/Users/accusys/models/mmproj-gemma-4-E4B-it-BF16.gguf}"
|
||||
OLLAMA_BIN="${MOMENTRY_OLLAMA_BIN:-/Users/accusys/bin/ollama}"
|
||||
PLAYGROUND_BIN="${MOMENTRY_PLAYGROUND_BIN:-target/debug/momentry_playground}"
|
||||
API_KEY="${MOMENTRY_API_KEY:-muser_68600856036340bcafc01930eb4bd839_1774418104_97221b69}"
|
||||
|
||||
# Colors
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
@@ -17,27 +35,23 @@ check() {
|
||||
if [ $? -eq 0 ]; then echo -e " ${GREEN}✅${NC} $1"; else echo -e " ${RED}❌${NC} $1"; FAILURES+=("$1"); fi
|
||||
}
|
||||
|
||||
echo -e "${CYAN}====================================${NC}"
|
||||
echo -e "${CYAN}========================================${NC}"
|
||||
echo -e "${CYAN} Momentry Core - Startup Sequence${NC}"
|
||||
echo -e "${CYAN}====================================${NC}"
|
||||
echo -e "${CYAN}========================================${NC}"
|
||||
echo ""
|
||||
|
||||
LOG_DIR="/Users/accusys/momentry/logs"
|
||||
|
||||
# ── 1. PostgreSQL ──
|
||||
echo -e "${YELLOW}[1/8] PostgreSQL${NC}"
|
||||
PG_DATA="/Users/accusys/pgsql/data"
|
||||
PG_BIN="/Users/accusys/pgsql/18.3/bin"
|
||||
if $PG_BIN/pg_isready -q 2>/dev/null; then
|
||||
echo -e "${YELLOW}[1/10] PostgreSQL${NC}"
|
||||
if "$PG_BIN_DIR/pg_isready" -q 2>/dev/null; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
$PG_BIN/pg_ctl -D "$PG_DATA" -l "$LOG_DIR/pg.log" start 2>/dev/null
|
||||
"$PG_BIN_DIR/pg_ctl" -D "$PG_DATA_DIR" -l "$LOG_DIR/pg.log" start 2>/dev/null
|
||||
sleep 2
|
||||
$PG_BIN/pg_isready -q 2>/dev/null; check "started"
|
||||
"$PG_BIN_DIR/pg_isready" -q 2>/dev/null; check "started"
|
||||
fi
|
||||
|
||||
# ── 2. Redis ──
|
||||
echo -e "${YELLOW}[2/8] Redis${NC}"
|
||||
echo -e "${YELLOW}[2/10] Redis${NC}"
|
||||
if redis-cli ping 2>/dev/null | grep -q PONG; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
@@ -46,14 +60,22 @@ else
|
||||
redis-cli ping 2>/dev/null | grep -q PONG; check "started"
|
||||
fi
|
||||
|
||||
# ── 3. Qdrant ──
|
||||
echo -e "${YELLOW}[3/8] Qdrant${NC}"
|
||||
QDRANT_BIN="/Users/accusys/momentry_resources/bin/qdrant"
|
||||
QDRANT_STORAGE="/Users/accusys/momentry/qdrant_storage"
|
||||
# ── 3. MongoDB ──
|
||||
echo -e "${YELLOW}[3/10] MongoDB${NC}"
|
||||
if pgrep -q mongod 2>/dev/null; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
brew services start mongodb-community 2>/dev/null || mongod --dbpath /opt/homebrew/var/mongodb --logpath "$LOG_DIR/mongodb.log" --fork 2>/dev/null
|
||||
sleep 2
|
||||
pgrep -q mongod 2>/dev/null; check "started"
|
||||
fi
|
||||
|
||||
# ── 4. Qdrant ──
|
||||
echo -e "${YELLOW}[4/10] Qdrant${NC}"
|
||||
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 http://localhost:6333/healthz 2>/dev/null | grep -q 200; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
mkdir -p "$QDRANT_STORAGE"
|
||||
mkdir -p "$QDRANT_STORAGE_DIR"
|
||||
"$QDRANT_BIN" > "$LOG_DIR/qdrant.log" 2>&1 &
|
||||
for i in $(seq 1 15); do
|
||||
sleep 2
|
||||
@@ -64,9 +86,8 @@ else
|
||||
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 http://localhost:6333/healthz 2>/dev/null | grep -q 200; check "started"
|
||||
fi
|
||||
|
||||
# ── 4. Qdrant Collection ──
|
||||
echo -e "${YELLOW}[4/8] Qdrant Collection${NC}"
|
||||
source "$ENV_FILE" 2>/dev/null || true
|
||||
# ── 5. Qdrant Collection ──
|
||||
echo -e "${YELLOW}[5/10] Qdrant Collection${NC}"
|
||||
COLLECTION="${QDRANT_COLLECTION:-momentry_dev_rule1_v2}"
|
||||
EXISTS=$(curl -s "http://localhost:6333/collections/$COLLECTION" 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); print(d.get('result',{}).get('status','not_found'))" 2>/dev/null)
|
||||
if [ "$EXISTS" = "not_found" ]; then
|
||||
@@ -78,15 +99,19 @@ fi
|
||||
curl -s "http://localhost:6333/collections/$COLLECTION" 2>/dev/null | python3 -c "import sys,json; d=json.load(sys.stdin); s=d.get('result',{}).get('status','not_found'); assert s in ('green','ok'), f'unexpected status: {s}'" 2>/dev/null
|
||||
check "collection '$COLLECTION' ready"
|
||||
|
||||
# ── 5. LLM (Gemma4 / llama.cpp) ──
|
||||
echo -e "${YELLOW}[5/8] LLM Server (Gemma4)${NC}"
|
||||
# ── 6a. LLM Chat (A4B, port 8082) ──
|
||||
echo -e "${YELLOW}[6a/10] LLM Chat - A4B (port 8082)${NC}"
|
||||
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:8082/health 2>/dev/null | grep -q 200; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
LLM_BIN="/Users/accusys/momentry_resources/llama/bin/llama-server"
|
||||
LLM_MODEL="/Users/accusys/momentry/models/llm/google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
|
||||
"$LLM_BIN" -m "$LLM_MODEL" --host 0.0.0.0 --port 8082 -ngl 99 -c 16384 --temp 0.1 --mlock --reasoning off > "$LOG_DIR/llama_server.log" 2>&1 &
|
||||
echo -e " ${YELLOW}⏳ loading model (~30s)...${NC}"
|
||||
LLAMA_ARGS_A4B=(
|
||||
-m "$A4B_MODEL"
|
||||
--mmproj "$A4B_MMPROJ"
|
||||
--host 0.0.0.0 --port 8082
|
||||
-ngl 99 -c 16384 --temp 0.1 --mlock --reasoning off
|
||||
)
|
||||
"$LLAMACPP_BIN" "${LLAMA_ARGS_A4B[@]}" > "$LOG_DIR/llama_a4b.log" 2>&1 &
|
||||
echo -e " ${YELLOW}⏳ loading A4B model (~30s)...${NC}"
|
||||
for i in $(seq 1 30); do
|
||||
sleep 2
|
||||
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://localhost:8082/health 2>/dev/null | grep -q 200; then
|
||||
@@ -96,8 +121,30 @@ else
|
||||
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 http://localhost:8082/health 2>/dev/null | grep -q 200; check "started"
|
||||
fi
|
||||
|
||||
# ── 6. Embedding Server ──
|
||||
echo -e "${YELLOW}[6/8] EmbeddingGemma${NC}"
|
||||
# ── 6b. LLM Vision (E4B, port 8083) ──
|
||||
echo -e "${YELLOW}[6b/10] LLM Vision - E4B (port 8083)${NC}"
|
||||
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:8083/health 2>/dev/null | grep -q 200; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
LLAMA_ARGS_E4B=(
|
||||
-m "$E4B_MODEL"
|
||||
--mmproj "$E4B_MMPROJ"
|
||||
--host 0.0.0.0 --port 8083
|
||||
-ngl 99 -c 16384 --temp 0.1 --mlock
|
||||
)
|
||||
"$LLAMACPP_BIN" "${LLAMA_ARGS_E4B[@]}" > "$LOG_DIR/llama_e4b.log" 2>&1 &
|
||||
echo -e " ${YELLOW}⏳ loading E4B model (~30s)...${NC}"
|
||||
for i in $(seq 1 30); do
|
||||
sleep 2
|
||||
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 2 http://localhost:8083/health 2>/dev/null | grep -q 200; then
|
||||
break
|
||||
fi
|
||||
done
|
||||
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 3 http://localhost:8083/health 2>/dev/null | grep -q 200; check "started"
|
||||
fi
|
||||
|
||||
# ── 7. Embedding Server ──
|
||||
echo -e "${YELLOW}[7/10] EmbeddingGemma${NC}"
|
||||
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:11436/health 2>/dev/null | grep -q 200; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
@@ -112,23 +159,22 @@ else
|
||||
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:11436/health 2>/dev/null | grep -q 200; check "started"
|
||||
fi
|
||||
|
||||
# ── 7. Playground Server ──
|
||||
echo -e "${YELLOW}[7/8] Playground API Server${NC}"
|
||||
if curl -s -o /dev/null -w "%{http_code}" -H "X-API-Key: muser_68600856036340bcafc01930eb4bd839_1774418104_97221b69" --connect-timeout 5 http://127.0.0.1:3003/api/v1/agents/5w1h/status 2>/dev/null | grep -q 200; then
|
||||
# ── 8. Playground Server ──
|
||||
echo -e "${YELLOW}[8/10] Playground API Server${NC}"
|
||||
if curl -s -o /dev/null -w "%{http_code}" -H "X-API-Key: $API_KEY" --connect-timeout 5 http://127.0.0.1:3003/api/v1/agents/5w1h/status 2>/dev/null | grep -q 200; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
cd "$PROJECT_DIR"
|
||||
target/debug/momentry_playground server > "$LOG_DIR/playground.log" 2>&1 &
|
||||
$PLAYGROUND_BIN server > "$LOG_DIR/playground.log" 2>&1 &
|
||||
sleep 4
|
||||
curl -s -o /dev/null -w "%{http_code}" -H "X-API-Key: muser_68600856036340bcafc01930eb4bd839_1774418104_97221b69" --connect-timeout 5 http://127.0.0.1:3003/api/v1/agents/5w1h/status 2>/dev/null | grep -q 200; check "started"
|
||||
curl -s -o /dev/null -w "%{http_code}" -H "X-API-Key: $API_KEY" --connect-timeout 5 http://127.0.0.1:3003/api/v1/agents/5w1h/status 2>/dev/null | grep -q 200; check "started"
|
||||
fi
|
||||
|
||||
# ── 8. Ollama (Gemma4 E4B) ──
|
||||
echo -e "${YELLOW}[8/8] Ollama (Gemma4 E4B)${NC}"
|
||||
# ── 9. Ollama ──
|
||||
echo -e "${YELLOW}[9/10] Ollama${NC}"
|
||||
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:11434/api/tags 2>/dev/null | grep -q 200; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
OLLAMA_BIN="/Users/accusys/momentry_resources/bin/ollama"
|
||||
if [ ! -f "$OLLAMA_BIN" ]; then
|
||||
echo -e " ${YELLOW}⚠ ollama binary not found, skipping${NC}"
|
||||
else
|
||||
@@ -138,6 +184,16 @@ else
|
||||
fi
|
||||
fi
|
||||
|
||||
# ── 10. SFTPGo ──
|
||||
echo -e "${YELLOW}[10/10] SFTPGo${NC}"
|
||||
if curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:8080/api/v1/version 2>/dev/null | grep -q 200; then
|
||||
echo -e " ${GREEN}✅${NC} already running"
|
||||
else
|
||||
/Users/accusys/bin/sftpgo serve -c /Users/accusys/momentry/etc/sftpgo > "$LOG_DIR/sftpgo.log" 2>&1 &
|
||||
sleep 3
|
||||
curl -s -o /dev/null -w "%{http_code}" --connect-timeout 5 http://localhost:8080/api/v1/version 2>/dev/null | grep -q 200; check "started"
|
||||
fi
|
||||
|
||||
echo ""
|
||||
if [ ${#FAILURES[@]} -eq 0 ]; then
|
||||
echo -e "${GREEN}====================================${NC}"
|
||||
@@ -151,10 +207,13 @@ else
|
||||
fi
|
||||
echo ""
|
||||
echo " Playground: http://127.0.0.1:3003"
|
||||
echo " LLM: http://127.0.0.1:8082"
|
||||
echo " LLM Chat: http://127.0.0.1:8082"
|
||||
echo " LLM Vision: http://127.0.0.1:8083"
|
||||
echo " Embedding: http://127.0.0.1:11436"
|
||||
echo " Ollama: http://localhost:11434"
|
||||
echo " Qdrant: http://localhost:6333"
|
||||
echo " PostgreSQL: localhost:5432"
|
||||
echo " Redis: localhost:6379"
|
||||
echo " MongoDB: localhost:27017"
|
||||
echo " SFTPGo: http://localhost:8080 (SFTP: port 2022)"
|
||||
echo ""
|
||||
|
||||
@@ -319,12 +319,13 @@ def store_traced_faces(file_uuid: str, traced_json_path: str, schema: str = SCHE
|
||||
cur.execute(
|
||||
f"""
|
||||
UPDATE {schema}.face_detections
|
||||
SET trace_id = %s
|
||||
SET trace_id = %s, embedding = %s
|
||||
WHERE file_uuid = %s AND frame_number = %s
|
||||
AND x = %s AND y = %s AND width = %s AND height = %s
|
||||
""",
|
||||
(
|
||||
trace_id,
|
||||
embed_vec,
|
||||
file_uuid, frame_num, x, y, w, h,
|
||||
),
|
||||
)
|
||||
|
||||
@@ -126,12 +126,24 @@ struct SwiftFace: ParsableCommand {
|
||||
let imgH = CGFloat(cgImage.height)
|
||||
|
||||
// Process landmark observations FIRST (each has bbox + landmarks, self-consistent)
|
||||
// Quality filtering
|
||||
let MIN_CONFIDENCE = 0.6
|
||||
let MIN_SIZE = 20
|
||||
|
||||
for lmObs in landmarkObservations {
|
||||
// Confidence filter
|
||||
let lmConf = Double(lmObs.confidence)
|
||||
if lmConf < MIN_CONFIDENCE { continue }
|
||||
|
||||
let bb = lmObs.boundingBox
|
||||
let faceX = Int(bb.origin.x * imgW)
|
||||
let faceY = Int((1.0 - bb.origin.y - bb.size.height) * imgH)
|
||||
let faceW = Int(bb.size.width * imgW)
|
||||
let faceH = Int(bb.size.height * imgH)
|
||||
|
||||
// Size filter
|
||||
if faceW < MIN_SIZE || faceH < MIN_SIZE { continue }
|
||||
|
||||
let faceX = Int(bb.origin.x * imgW)
|
||||
let faceY = Int((1.0 - bb.origin.y - bb.size.height) * imgH)
|
||||
|
||||
var faceData: [String: Any] = [
|
||||
"bbox": ["x": max(0, faceX), "y": max(0, faceY),
|
||||
@@ -203,11 +215,21 @@ struct SwiftFace: ParsableCommand {
|
||||
}
|
||||
}
|
||||
if matched { continue }
|
||||
|
||||
// Quality filtering for unmatched face rects
|
||||
let MIN_CONFIDENCE = 0.6
|
||||
let MIN_SIZE = 20
|
||||
|
||||
let faceConf = Double(faceObs.faceCaptureQuality ?? faceObs.confidence)
|
||||
if faceConf < MIN_CONFIDENCE { continue }
|
||||
|
||||
let faceW = Int(fBB.size.width * imgW)
|
||||
let faceH = Int(fBB.size.height * imgH)
|
||||
if faceW < MIN_SIZE || faceH < MIN_SIZE { continue }
|
||||
|
||||
// Unmatched face rect: output without landmarks
|
||||
let faceX = Int(fBB.origin.x * imgW)
|
||||
let faceY = Int((1.0 - fBB.origin.y - fBB.size.height) * imgH)
|
||||
let faceW = Int(fBB.size.width * imgW)
|
||||
let faceH = Int(fBB.size.height * imgH)
|
||||
|
||||
var faceData: [String: Any] = [
|
||||
"bbox": ["x": max(0, faceX), "y": max(0, faceY),
|
||||
|
||||
107
scripts/update_embeddings.py
Normal file
107
scripts/update_embeddings.py
Normal file
@@ -0,0 +1,107 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""Update face_detections embeddings from face_traced.json"""
|
||||
|
||||
import json
|
||||
import psycopg2
|
||||
import sys
|
||||
import os
|
||||
|
||||
DATABASE_URL = os.getenv("DATABASE_URL", "postgres://accusys@localhost:5432/momentry")
|
||||
|
||||
def _apply_embedding_batch(conn, cur, schema, batch):
    """Run the queued UPDATEs in ``batch``, commit, and return (updated, failed)."""
    update_sql = f"""
        UPDATE {schema}.face_detections
        SET embedding = %s::real[], trace_id = %s
        WHERE file_uuid = %s AND frame_number = %s
        AND x = %s AND y = %s AND width = %s AND height = %s
        AND embedding IS NULL
    """
    updated = 0
    failed = 0
    for params in batch:
        # A failed statement aborts the whole PostgreSQL transaction; the
        # savepoint confines the damage to this one row so the rest of the
        # batch still applies.
        cur.execute("SAVEPOINT embedding_update")
        try:
            cur.execute(update_sql, params)
        except psycopg2.Error as e:
            cur.execute("ROLLBACK TO SAVEPOINT embedding_update")
            failed += 1
            print(f"[UPDATE] Failed at frame {params[3]}: {e}", file=sys.stderr)
        else:
            updated += cur.rowcount
        cur.execute("RELEASE SAVEPOINT embedding_update")
    conn.commit()
    return updated, failed


def update_embeddings(file_uuid: str, traced_json_path: str, schema: str = "dev"):
    """Update embeddings in face_detections from face_traced.json.

    For every face in the JSON that carries a non-empty embedding, the row
    with the same file_uuid, frame number and bounding box (x, y, width,
    height) whose embedding is still NULL receives the embedding and the
    face's trace_id. Work is committed in batches of 1000 faces.

    Args:
        file_uuid: UUID of the video file the detections belong to.
        traced_json_path: Path to the face_traced.json file; its "frames"
            mapping is keyed by frame number, each entry holding a "faces"
            list.
        schema: Database schema containing ``face_detections``. It is
            interpolated into the SQL text, so it must come from a trusted
            source.

    Returns:
        Total number of rows updated.
    """
    with open(traced_json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    frames = data.get('frames', {})

    conn = psycopg2.connect(DATABASE_URL)
    cur = conn.cursor()

    updated = 0
    failed = 0
    batch = []

    try:
        for frame_key, frame_data in frames.items():
            frame_num = int(frame_key)

            for face in frame_data.get('faces', []):
                embedding = face.get('embedding')
                if not embedding:
                    continue

                # Pass the embedding as a Python list: psycopg2 adapts it to
                # an ARRAY[...] expression. A '[v1,v2,...]' string is not
                # valid PostgreSQL array-literal syntax for the ::real[] cast.
                batch.append((
                    [float(v) for v in embedding],
                    face.get('trace_id'),
                    file_uuid,
                    frame_num,
                    face.get('x', 0),
                    face.get('y', 0),
                    face.get('width', 0),
                    face.get('height', 0),
                ))

                # Flush and commit every 1000 queued faces
                if len(batch) >= 1000:
                    done, errors = _apply_embedding_batch(conn, cur, schema, batch)
                    updated += done
                    failed += errors
                    batch = []
                    print(f"[UPDATE] Processed {updated} so far...", file=sys.stderr)

        # Final partial batch
        if batch:
            done, errors = _apply_embedding_batch(conn, cur, schema, batch)
            updated += done
            failed += errors
    finally:
        cur.close()
        conn.close()

    if failed:
        print(f"[UPDATE] {failed} updates failed", file=sys.stderr)
    print(f"[UPDATE] Total updated: {updated}")
    return updated
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point. When --traced-json is omitted the path is
    # derived from the file UUID; a missing file ends the run with status 1.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--file-uuid", required=True)
    cli.add_argument("--traced-json", help="Path to face_traced.json")
    cli.add_argument("--schema", default="dev")
    opts = cli.parse_args()

    if opts.traced_json:
        json_path = opts.traced_json
    else:
        json_path = f"/Users/accusys/momentry/output_dev/{opts.file_uuid}.face_traced.json"

    if os.path.exists(json_path):
        update_embeddings(opts.file_uuid, json_path, opts.schema)
    else:
        print(f"File not found: {json_path}", file=sys.stderr)
        sys.exit(1)
|
||||
170
scripts/verify_charade_pipeline.py
Normal file
170
scripts/verify_charade_pipeline.py
Normal file
@@ -0,0 +1,170 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Verify Charade pipeline completion.
|
||||
Usage: python3 scripts/verify_charade_pipeline.py <file_uuid>
|
||||
"""
|
||||
|
||||
import sys
|
||||
import psycopg2
|
||||
import subprocess
|
||||
import json
|
||||
|
||||
DATABASE_URL = "postgres://accusys@localhost:5432/momentry"
|
||||
OUTPUT_DIR = "/Users/accusys/momentry/output_dev"
|
||||
|
||||
|
||||
def check_file_outputs(file_uuid):
    """Check all expected output files exist.

    Looks for ``<OUTPUT_DIR>/<file_uuid>.<name>`` for each expected pipeline
    artefact.

    Args:
        file_uuid: UUID used as the file-name prefix of every output.

    Returns:
        One dict per expected file with keys "file" (the suffix), "exists"
        (bool) and "size" (bytes; 0 when the file cannot be read).
    """
    # Local import: this script does not import os at module level.
    import os

    expected_files = [
        "cut.json",
        "yolo.json",
        "face.json",
        "face_traced.json",
        "pose.json",
        "asrx.json",
        "visual_chunk.json",
        "scene.json",
        "scene_meta.json",
        "story_llm.json",
        "story_story.json",
        "tmdb.json",
    ]

    results = []
    for ext in expected_files:
        path = f"{OUTPUT_DIR}/{file_uuid}.{ext}"
        try:
            # Ask the filesystem directly instead of spawning `stat -f%z`,
            # which only has that meaning on BSD/macOS.
            size = os.path.getsize(path)
        except OSError:
            results.append({"file": ext, "exists": False, "size": 0})
        else:
            results.append({"file": ext, "exists": True, "size": size})

    return results
|
||||
|
||||
|
||||
def check_db_records(file_uuid, schema="dev"):
    """Check database records.

    Runs one query per check and reports the first column of the first row
    (so only ``status`` is reported for monitor_jobs).

    Args:
        file_uuid: UUID of the file to look up; passed as a bound parameter.
        schema: Database schema to query. It is interpolated into the SQL
            text, so it must come from a trusted source.

    Returns:
        One dict per check: {"check": name, "value": ...} on success (value
        is None when the query returned no row) or {"check": name,
        "error": message} when the query failed.
    """
    conn = psycopg2.connect(DATABASE_URL)
    # These are read-only checks. Autocommit runs each in its own transaction,
    # so one failing query (e.g. a missing table) cannot leave the connection
    # in an aborted state that makes every later check fail too.
    conn.autocommit = True
    cur = conn.cursor()

    checks = [
        ("videos", f"SELECT status FROM {schema}.videos WHERE file_uuid = %s"),
        ("monitor_jobs", f"SELECT status, completed_processors FROM {schema}.monitor_jobs WHERE uuid = %s"),
        ("pre_chunks", f"SELECT COUNT(*) FROM {schema}.pre_chunks WHERE file_uuid = %s"),
        ("face_detections_total", f"SELECT COUNT(*) FROM {schema}.face_detections WHERE file_uuid = %s"),
        ("face_detections_embedding", f"SELECT COUNT(embedding) FROM {schema}.face_detections WHERE file_uuid = %s"),
        ("face_detections_trace", f"SELECT COUNT(trace_id) FROM {schema}.face_detections WHERE file_uuid = %s"),
        ("face_detections_identity", f"SELECT COUNT(identity_id) FROM {schema}.face_detections WHERE file_uuid = %s"),
        ("chunks_total", f"SELECT COUNT(*) FROM {schema}.chunk WHERE file_uuid = %s"),
        ("chunks_embedding", f"SELECT COUNT(embedding) FROM {schema}.chunk WHERE file_uuid = %s"),
    ]

    results = []
    try:
        for name, query in checks:
            try:
                # file_uuid is bound as a parameter, not spliced into the SQL.
                cur.execute(query, (file_uuid,))
                result = cur.fetchone()
                results.append({"check": name, "value": result[0] if result else None})
            except psycopg2.Error as e:
                results.append({"check": name, "error": str(e)})
    finally:
        cur.close()
        conn.close()
    return results
|
||||
|
||||
|
||||
def check_identity_bindings(file_uuid, schema="dev"):
    """Check identity bindings.

    Counts the detections bound to 'Audrey Hepburn' and to 'Cary Grant', and
    lists the ten identities with the most bound detections for the file.

    Args:
        file_uuid: UUID of the file to look up; passed as a bound parameter.
        schema: Database schema to query. It is interpolated into the SQL
            text, so it must come from a trusted source.

    Returns:
        One dict per check. "top_identities" carries a list of
        (name, count) rows; the other checks carry a single count (0 when no
        row came back). A failed query yields {"check": name, "error": msg}.
    """
    conn = psycopg2.connect(DATABASE_URL)
    # Read-only checks: autocommit keeps one failing query from aborting the
    # transaction and breaking the checks that follow it.
    conn.autocommit = True
    cur = conn.cursor()

    checks = [
        ("audrey_faces", f"""
            SELECT COUNT(*) FROM {schema}.face_detections fd
            JOIN {schema}.identities i ON fd.identity_id = i.id
            WHERE fd.file_uuid = %s AND i.name = 'Audrey Hepburn'
        """),
        ("cary_faces", f"""
            SELECT COUNT(*) FROM {schema}.face_detections fd
            JOIN {schema}.identities i ON fd.identity_id = i.id
            WHERE fd.file_uuid = %s AND i.name = 'Cary Grant'
        """),
        ("top_identities", f"""
            SELECT i.name, COUNT(*) as count
            FROM {schema}.face_detections fd
            JOIN {schema}.identities i ON fd.identity_id = i.id
            WHERE fd.file_uuid = %s AND fd.identity_id IS NOT NULL
            GROUP BY i.name
            ORDER BY count DESC
            LIMIT 10
        """),
    ]

    results = []
    try:
        for name, query in checks:
            try:
                # file_uuid is bound as a parameter, not spliced into the SQL.
                cur.execute(query, (file_uuid,))
                if name == "top_identities":
                    rows = cur.fetchall()
                    results.append({"check": name, "value": rows})
                else:
                    result = cur.fetchone()
                    results.append({"check": name, "value": result[0] if result else 0})
            except psycopg2.Error as e:
                results.append({"check": name, "error": str(e)})
    finally:
        cur.close()
        conn.close()
    return results
|
||||
|
||||
|
||||
def print_report(file_uuid, file_outputs, db_records, identity_bindings):
    """Write the verification report to standard output.

    Args:
        file_uuid: UUID shown in the report header.
        file_outputs: Dicts with "file", "exists" and "size" (bytes) keys.
        db_records: Dicts with "check" plus either "value" or "error".
        identity_bindings: Same shape as ``db_records``; a list value is
            printed as one "name: count faces" line per row.
    """
    rule = "=" * 60

    print(f"\n{rule}")
    print("Charade Pipeline Verification Report")
    print(f"File UUID: {file_uuid}")
    print(f"{rule}\n")

    print("## File Outputs")
    for entry in file_outputs:
        # A file only counts as present when it exists and is non-empty.
        present = entry["exists"] and entry["size"] > 0
        mark = "✅" if present else "❌"
        print(f" {mark} {entry['file']}: {entry['size'] / 1024:.1f} KB")

    print("\n## Database Records")
    for record in db_records:
        shown = record.get("value", record.get("error", "N/A"))
        if isinstance(shown, (list, tuple)):
            shown = ", ".join(map(str, shown))
        print(f" {record['check']}: {shown}")

    print("\n## Identity Bindings")
    for record in identity_bindings:
        shown = record.get("value", record.get("error", "N/A"))
        if not isinstance(shown, list):
            print(f" {record['check']}: {shown}")
            continue
        print(f" {record['check']}:")
        for row in shown:
            print(f" - {row[0]}: {row[1]} faces")

    print(f"\n{rule}\n")
|
||||
|
||||
|
||||
def main():
    """Run every verification check for one file UUID and print the report.

    The UUID is read from the first command-line argument; when none is
    given, a built-in sample UUID is used instead.
    """
    if len(sys.argv) >= 2:
        file_uuid = sys.argv[1]
    else:
        file_uuid = "c3c635e3641da80dde10cc555ffcdda5"

    print("Verifying pipeline...")

    # Checks run in the same order in which the report prints their sections.
    report_sections = (
        check_file_outputs(file_uuid),
        check_db_records(file_uuid),
        check_identity_bindings(file_uuid),
    )

    print_report(file_uuid, *report_sections)


if __name__ == "__main__":
    main()
|
||||
@@ -1,445 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
視覺分片處理器 (Phase 2.2)
|
||||
|
||||
從 YOLO 結果生成視覺分片,支持多種分片策略:
|
||||
1. 固定幀數分片
|
||||
2. 基於物件相似度分片
|
||||
3. 基於場景變化分片
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Any, Optional
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
|
||||
# 添加父目錄到路徑以導入其他模組
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from scripts.yolo_processor_contract_v1 import YOLOProcessor
|
||||
|
||||
|
||||
class VisualChunkProcessor:
    """Group per-frame YOLO detections into visual chunks.

    The detection data is read as a dict with a ``"frames"`` mapping of
    frame number -> ``{"time_seconds": ..., "detections": [...]}``. Each
    detection may carry ``class_name``, ``class_id``, ``confidence``, ``x1``,
    ``y1``, ``width`` and ``height``; missing keys fall back to defaults.

    Two strategies are available: fixed-size chunks and chunks split where
    the set of detected classes changes between consecutive frames.
    """

    def __init__(self, video_path: str, yolo_result_path: Optional[str] = None):
        """Remember the input locations; detections are loaded lazily.

        Args:
            video_path: Path of the video the detections belong to.
            yolo_result_path: Optional path of a JSON file with YOLO results.
        """
        self.video_path = video_path
        self.yolo_result_path = yolo_result_path
        self.yolo_result = None

    def load_yolo_result(self):
        """Populate ``self.yolo_result``.

        Reads the JSON file at ``yolo_result_path`` when that path is set and
        exists; otherwise runs ``YOLOProcessor`` on the video and stores the
        output of its ``to_json_dict()``.
        """
        if self.yolo_result_path and os.path.exists(self.yolo_result_path):
            with open(self.yolo_result_path, "r", encoding="utf-8") as f:
                self.yolo_result = json.load(f)
        else:
            # No stored result available: run detection now.
            print(f"[VisualChunk] Running YOLO detection for: {self.video_path}")
            yolo_processor = YOLOProcessor(self.video_path)
            # The return value of process() is not needed; the data is read
            # back through to_json_dict().
            yolo_processor.process()
            self.yolo_result = yolo_processor.to_json_dict()

    def _sorted_frame_list(self) -> List[Dict[str, Any]]:
        """Return the loaded frames as a list ordered by frame number.

        Loads the YOLO result first if it has not been loaded yet. Returns an
        empty list when there are no frames.
        """
        if not self.yolo_result:
            self.load_yolo_result()

        frames = self.yolo_result.get("frames", {})
        if not frames:
            return []

        frame_list = [
            {
                "frame_number": int(frame_key),
                "timestamp": frame_data.get("time_seconds", 0),
                "objects": frame_data.get("detections", []),
            }
            for frame_key, frame_data in frames.items()
        ]
        frame_list.sort(key=lambda item: item["frame_number"])
        return frame_list

    def create_fixed_frame_chunks(
        self, frames_per_chunk: int = 30
    ) -> List[Dict[str, Any]]:
        """Create chunks holding a fixed number of frames each.

        Args:
            frames_per_chunk: Number of frames per chunk; the last chunk may
                be shorter.

        Returns:
            List of chunk dicts, in frame order.
        """
        frame_list = self._sorted_frame_list()
        if not frame_list:
            return []

        chunks = []
        for start_idx in range(0, len(frame_list), frames_per_chunk):
            chunk_frames = frame_list[start_idx:start_idx + frames_per_chunk]
            if not chunk_frames:
                continue

            chunk_stats = self._calculate_chunk_stats(chunk_frames)
            chunks.append(
                {
                    "start_frame": chunk_frames[0]["frame_number"],
                    "end_frame": chunk_frames[-1]["frame_number"] + 1,  # exclusive
                    "frame_count": len(chunk_frames),
                    "keyframe_objects": self._extract_keyframe_objects(chunk_frames),
                    "dominant_objects": chunk_stats["dominant_objects"],
                    "metadata": {
                        "object_count": chunk_stats["total_objects"],
                        "unique_classes": chunk_stats["unique_classes"],
                        "max_confidence": chunk_stats["max_confidence"],
                        "avg_confidence": chunk_stats["avg_confidence"],
                        "spatial_density": chunk_stats["spatial_density"],
                    },
                }
            )

        return chunks

    def create_similarity_based_chunks(
        self, similarity_threshold: float = 0.5, min_frames_per_chunk: int = 10
    ) -> List[Dict[str, Any]]:
        """Create chunks by grouping consecutive frames with similar classes.

        A frame joins the current chunk while its similarity to the chunk's
        last frame is at least ``similarity_threshold``; otherwise the
        current chunk is closed and a new one starts at that frame.

        Args:
            similarity_threshold: Similarity threshold (0-1).
            min_frames_per_chunk: Minimum frame count; shorter runs are
                dropped rather than emitted.

        Returns:
            List of chunk dicts, in frame order.
        """
        frame_list = self._sorted_frame_list()
        if not frame_list:
            return []

        chunks = []
        current_chunk_frames = []

        for frame in frame_list:
            if current_chunk_frames:
                similarity = self._calculate_frame_similarity(
                    current_chunk_frames[-1], frame
                )
                if similarity >= similarity_threshold:
                    # Similar enough: extend the current chunk.
                    current_chunk_frames.append(frame)
                    continue

                # Dissimilar: close the current run if it is long enough.
                if len(current_chunk_frames) >= min_frames_per_chunk:
                    chunks.append(
                        self._create_chunk_from_frames(
                            current_chunk_frames,
                            current_chunk_frames[0]["frame_number"],
                            current_chunk_frames[-1]["frame_number"] + 1,
                        )
                    )

            # Start a new chunk at this frame.
            current_chunk_frames = [frame]

        # Emit the trailing chunk.
        if len(current_chunk_frames) >= min_frames_per_chunk:
            chunks.append(
                self._create_chunk_from_frames(
                    current_chunk_frames,
                    current_chunk_frames[0]["frame_number"],
                    current_chunk_frames[-1]["frame_number"] + 1,
                )
            )

        return chunks

    def _calculate_frame_similarity(self, frame1: Dict, frame2: Dict) -> float:
        """Return the Jaccard similarity of the two frames' class-name sets.

        Two frames without objects are fully similar (1.0); if only one of
        them has objects the similarity is 0.0.
        """
        objects1 = frame1.get("objects", [])
        objects2 = frame2.get("objects", [])

        if not objects1 and not objects2:
            return 1.0

        if not objects1 or not objects2:
            return 0.0

        classes1 = {
            obj.get("class_name", "") for obj in objects1 if obj.get("class_name")
        }
        classes2 = {
            obj.get("class_name", "") for obj in objects2 if obj.get("class_name")
        }

        union = classes1 | classes2
        if not union:
            # Objects exist but none of them has a class name.
            return 0.0

        return len(classes1 & classes2) / len(union)

    def _calculate_chunk_stats(self, frames: List[Dict]) -> Dict[str, Any]:
        """Compute aggregate statistics for the frames of one chunk.

        Returns:
            Dict with ``total_objects``, ``unique_classes`` (sorted),
            ``max_confidence``, ``avg_confidence``, ``spatial_density``
            (objects per frame) and ``dominant_objects`` (sorted class names
            present in more than half of the frames).
        """
        all_objects = []
        for frame in frames:
            all_objects.extend(frame.get("objects", []))

        total_objects = len(all_objects)

        # Sorted so the serialized output does not depend on set iteration
        # order, which varies between runs for strings.
        unique_classes = sorted(
            {
                obj.get("class_name", "")
                for obj in all_objects
                if obj.get("class_name")
            }
        )

        confidences = [obj.get("confidence", 0) for obj in all_objects]
        max_confidence = max(confidences) if confidences else 0
        avg_confidence = np.mean(confidences) if confidences else 0

        # Average number of objects per frame.
        spatial_density = total_objects / len(frames) if frames else 0

        # Count, per class, the number of frames in which it appears.
        object_counts = {}
        for frame in frames:
            frame_classes = {
                obj.get("class_name", "")
                for obj in frame.get("objects", [])
                if obj.get("class_name")
            }
            for class_name in frame_classes:
                object_counts[class_name] = object_counts.get(class_name, 0) + 1

        dominant_objects = sorted(
            class_name
            for class_name, count in object_counts.items()
            if count / len(frames) > 0.5
        )

        return {
            "total_objects": total_objects,
            "unique_classes": unique_classes,
            "max_confidence": float(max_confidence),
            "avg_confidence": float(avg_confidence),
            "spatial_density": float(spatial_density),
            "dominant_objects": dominant_objects,
        }

    def _extract_keyframe_objects(self, frames: List[Dict]) -> List[Dict[str, Any]]:
        """Return the objects of every fifth frame, starting with the first.

        ``bbox`` is built from ``x1``/``y1``/``width``/``height`` when the
        detection has an ``x1`` key, and is ``None`` otherwise.
        """
        keyframe_objects = []

        # Simplification: take one keyframe every 5 frames.
        for frame in frames[::5]:
            objects = []
            for obj in frame.get("objects", []):
                if "x1" in obj:
                    bbox = {
                        "x": obj.get("x1", 0),
                        "y": obj.get("y1", 0),
                        "width": obj.get("width", 0),
                        "height": obj.get("height", 0),
                    }
                else:
                    bbox = None

                objects.append(
                    {
                        "class_name": obj.get("class_name", ""),
                        "class_id": obj.get("class_id", 0),
                        "confidence": float(obj.get("confidence", 0)),
                        "bbox": bbox,
                        "occurrence": 1,
                    }
                )

            keyframe_objects.append(
                {
                    "timestamp": float(frame.get("timestamp", 0)),
                    "frame_number": frame.get("frame_number", 0),
                    "objects": objects,
                }
            )

        return keyframe_objects

    def _create_chunk_from_frames(
        self, frames: List[Dict], start_frame: int, end_frame: int
    ) -> Dict[str, Any]:
        """Build one chunk dict from a frame list and explicit frame bounds.

        Args:
            frames: Frames belonging to the chunk.
            start_frame: First frame number of the chunk.
            end_frame: Frame number one past the last frame (exclusive).
        """
        chunk_stats = self._calculate_chunk_stats(frames)

        return {
            "start_frame": start_frame,
            "end_frame": end_frame,  # exclusive
            "frame_count": len(frames),
            "keyframe_objects": self._extract_keyframe_objects(frames),
            "dominant_objects": chunk_stats["dominant_objects"],
            "object_relationships": [],  # optional: relationship detection may be added later
            "scene_description": None,  # optional: LLM-generated description may be added later
            "metadata": {
                "object_count": chunk_stats["total_objects"],
                "unique_classes": chunk_stats["unique_classes"],
                "max_confidence": chunk_stats["max_confidence"],
                "avg_confidence": chunk_stats["avg_confidence"],
                "spatial_density": chunk_stats["spatial_density"],
            },
        }

    def process(self, strategy: str = "fixed", **kwargs) -> Dict[str, Any]:
        """Generate visual chunks and wrap them in a summary result.

        Args:
            strategy: Chunking strategy, ``"fixed"`` or ``"similarity"``.
            **kwargs: Strategy parameters: ``frames_per_chunk`` for
                ``"fixed"``; ``similarity_threshold`` and
                ``min_frames_per_chunk`` for ``"similarity"``.

        Returns:
            Dict with run ``metadata``, ``chunk_count``, ``total_frames``,
            ``total_objects``, ``unique_classes`` (a count) and ``chunks``.

        Raises:
            ValueError: If ``strategy`` is not one of the known names.
        """
        # Load before starting the clock so loading is not counted as
        # processing time.
        if not self.yolo_result:
            self.load_yolo_result()

        start_time = datetime.now()

        if strategy == "fixed":
            frames_per_chunk = kwargs.get("frames_per_chunk", 30)
            chunks = self.create_fixed_frame_chunks(frames_per_chunk)
        elif strategy == "similarity":
            similarity_threshold = kwargs.get("similarity_threshold", 0.5)
            min_frames = kwargs.get("min_frames_per_chunk", 10)
            chunks = self.create_similarity_based_chunks(
                similarity_threshold, min_frames
            )
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

        # Totals across all chunks.
        total_frames = sum(chunk["frame_count"] for chunk in chunks)
        total_objects = sum(chunk["metadata"]["object_count"] for chunk in chunks)

        all_unique_classes = set()
        for chunk in chunks:
            all_unique_classes.update(chunk["metadata"]["unique_classes"])

        processing_time = (datetime.now() - start_time).total_seconds()

        return {
            "metadata": {
                "video_path": self.video_path,
                "processing_time": processing_time,
                "strategy": strategy,
                "parameters": kwargs,
                "processed_at": datetime.now().isoformat(),
            },
            "chunk_count": len(chunks),
            "total_frames": total_frames,
            "total_objects": total_objects,
            "unique_classes": len(all_unique_classes),
            "chunks": chunks,
        }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for visual chunk generation.

    Parses the arguments, runs ``VisualChunkProcessor`` with the selected
    strategy and writes the result as JSON to ``output_path``. If anything
    fails, the error is printed to stderr and an empty result is written to
    the same path instead.
    """
    cli = argparse.ArgumentParser(description="視覺分片處理器")
    cli.add_argument("video_path", help="視頻文件路徑")
    cli.add_argument("output_path", help="輸出文件路徑")
    cli.add_argument("--yolo-result", help="YOLO 結果文件路徑(可選)")
    cli.add_argument("--uuid", help="檔案 UUID(由 executor 傳入)")
    cli.add_argument(
        "--strategy", choices=["fixed", "similarity"], default="fixed", help="分片策略"
    )
    cli.add_argument(
        "--frames-per-chunk", type=int, default=30, help="固定幀數策略:每個分片的幀數"
    )
    cli.add_argument(
        "--similarity-threshold", type=float, default=0.5, help="相似度策略:相似度閾值"
    )
    cli.add_argument(
        "--min-frames-per-chunk", type=int, default=10, help="相似度策略:最小幀數"
    )
    opts = cli.parse_args()

    def write_json(payload):
        # Shared by the success path and the fallback path.
        with open(opts.output_path, "w", encoding="utf-8") as handle:
            json.dump(payload, handle, ensure_ascii=False, indent=2)

    print(f"[VisualChunk] Starting processing: {opts.video_path}")
    print(f"[VisualChunk] Strategy: {opts.strategy}")

    try:
        chunker = VisualChunkProcessor(opts.video_path, opts.yolo_result)

        # Only the parameters relevant to the chosen strategy are forwarded.
        if opts.strategy == "fixed":
            strategy_params = {"frames_per_chunk": opts.frames_per_chunk}
        else:
            strategy_params = {
                "similarity_threshold": opts.similarity_threshold,
                "min_frames_per_chunk": opts.min_frames_per_chunk,
            }
        summary = chunker.process(strategy=opts.strategy, **strategy_params)

        # Save the result.
        write_json(summary)

        print("[VisualChunk] Processing completed")
        print(f"[VisualChunk] Generated {summary['chunk_count']} visual chunks")
        print(f"[VisualChunk] Total frames: {summary['total_frames']}")
        print(f"[VisualChunk] Total objects: {summary['total_objects']}")
        print(f"[VisualChunk] Unique classes: {summary['unique_classes']}")
        print(f"[VisualChunk] Result saved to: {opts.output_path}")
    except Exception as e:
        print(f"[VisualChunk] Error: {e}", file=sys.stderr)
        # Best effort: still leave a well-formed, empty result on disk.
        write_json(
            {
                "chunk_count": 0,
                "total_frames": 0,
                "total_objects": 0,
                "unique_classes": 0,
                "chunks": [],
            }
        )
        print(f"[VisualChunk] Fallback: empty result saved to {opts.output_path}")


if __name__ == "__main__":
    main()
|
||||
21
scripts/wrapper_embedding.sh
Executable file
21
scripts/wrapper_embedding.sh
Executable file
@@ -0,0 +1,21 @@
|
||||
#!/bin/bash
# Launcher for scripts/embeddinggemma_server.py: loads the project's env
# files, pins PATH, picks a Python interpreter and execs the server with
# --port 11436.
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

# Load the environment files; errors are silenced and never abort the script.
for env_file in "$PROJECT_DIR/.env" "$PROJECT_DIR/.env.development"; do
    source "$env_file" 2>/dev/null || true
done

# Put the known tool directories in front of the inherited PATH.
PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"
export PATH

# Interpreter: MOMENTRY_PYTHON_PATH if set, else the project venv; fall back
# to the Homebrew python3.11 when that file does not exist.
PYTHON="${MOMENTRY_PYTHON_PATH:-$PROJECT_DIR/venv/bin/python}"
[ -f "$PYTHON" ] || PYTHON="/opt/homebrew/bin/python3.11"

EMBED_SCRIPT="$PROJECT_DIR/scripts/embeddinggemma_server.py"
exec "$PYTHON" "$EMBED_SCRIPT" --port 11436
|
||||
14
scripts/wrapper_playground.sh
Executable file
14
scripts/wrapper_playground.sh
Executable file
@@ -0,0 +1,14 @@
|
||||
#!/bin/bash
# Launcher for the debug build of momentry_playground: loads the project's
# env files, pins PATH and execs the binary with `server --host 0.0.0.0`.
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

# Load the environment files; errors are silenced and never abort the script.
for env_file in "$PROJECT_DIR/.env" "$PROJECT_DIR/.env.development"; do
    source "$env_file" 2>/dev/null || true
done

# Put the known tool directories in front of the inherited PATH.
PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"
export PATH

exec "$PROJECT_DIR/target/debug/momentry_playground" server --host 0.0.0.0
|
||||
13
scripts/wrapper_production.sh
Executable file
13
scripts/wrapper_production.sh
Executable file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Launcher for the release build of momentry: loads the project's .env,
# pins PATH and execs the binary with `server --host 0.0.0.0`.
set -e

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

# Load the environment file; errors are silenced and never abort the script.
source "$PROJECT_DIR/.env" 2>/dev/null || true

# Put the known tool directories in front of the inherited PATH.
PATH="/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:$PATH"
export PATH

exec "$PROJECT_DIR/target/release/momentry" server --host 0.0.0.0
|
||||
Reference in New Issue
Block a user