cleanup: remove dead code and duplicate docs
- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
This commit is contained in:
Binary file not shown.
@@ -4,14 +4,12 @@
|
||||
"""
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
|
||||
# 導入人臉識別處理器
|
||||
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
|
||||
@@ -275,7 +273,7 @@ class VideoFaceAnalyzer:
|
||||
with open(result_file, "w", encoding="utf-8") as f:
|
||||
json.dump(result, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"\n分析完成:")
|
||||
print("\n分析完成:")
|
||||
print(f" - 處理幀數: {len(frames)}")
|
||||
print(f" - 檢測到人臉: {len(detections)}")
|
||||
print(f" - 分析時間: {result['analysis_time']:.1f}秒")
|
||||
@@ -454,14 +452,14 @@ def main():
|
||||
total_faces = sum(r["faces_detected"] for r in video_results)
|
||||
total_time = sum(r["analysis_time"] for r in video_results)
|
||||
|
||||
print(f"\n📈 分析摘要:")
|
||||
print("\n📈 分析摘要:")
|
||||
print(f" - 總處理視頻: {len(video_results)}")
|
||||
print(f" - 總處理幀數: {total_frames}")
|
||||
print(f" - 總檢測人臉: {total_faces}")
|
||||
print(f" - 總分析時間: {total_time:.1f}秒")
|
||||
|
||||
# 列出生成的文件
|
||||
print(f"\n📄 生成的文件:")
|
||||
print("\n📄 生成的文件:")
|
||||
for filename in sorted(os.listdir(analyzer.output_dir)):
|
||||
filepath = os.path.join(analyzer.output_dir, filename)
|
||||
if os.path.isfile(filepath):
|
||||
|
||||
@@ -23,7 +23,7 @@ import signal
|
||||
import platform
|
||||
import psutil
|
||||
from datetime import datetime, timezone
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
from typing import Dict, Any, List
|
||||
from pathlib import Path
|
||||
import traceback
|
||||
|
||||
@@ -606,7 +606,7 @@ class ASRBenchmarkRunner:
|
||||
metrics = result.get('metrics', {})
|
||||
real_time = result.get('real_time', {})
|
||||
|
||||
lines.append(f"- **Status**: Success")
|
||||
lines.append("- **Status**: Success")
|
||||
lines.append(f"- **Start**: {real_time.get('test_start', 'N/A')}")
|
||||
lines.append(f"- **End**: {real_time.get('test_end', 'N/A')}")
|
||||
lines.append(f"- **Duration**: {metrics.get('processing_time_seconds', 0):.3f}s")
|
||||
@@ -615,7 +615,7 @@ class ASRBenchmarkRunner:
|
||||
lines.append(f"- **Memory Peak**: {metrics.get('peak_memory_mb', 0):.1f}MB")
|
||||
lines.append(f"- **Language**: {metrics.get('language_detected', 'N/A')} ({metrics.get('language_probability', 0):.2f})")
|
||||
else:
|
||||
lines.append(f"- **Status**: Failed")
|
||||
lines.append("- **Status**: Failed")
|
||||
lines.append(f"- **Error**: {result.get('error', 'Unknown error')}")
|
||||
|
||||
lines.append("")
|
||||
@@ -680,7 +680,7 @@ def main():
|
||||
runner.generate_results_json()
|
||||
runner.generate_markdown_report()
|
||||
|
||||
print(f"\nBenchmark completed!")
|
||||
print("\nBenchmark completed!")
|
||||
print(f"Results: {output_dir / 'asr_benchmark_results.json'}")
|
||||
print(f"Report: {output_dir / 'asr_benchmark_report.md'}")
|
||||
|
||||
|
||||
@@ -96,7 +96,7 @@ def print_stats(dist, total_segments):
|
||||
avg_faces = total_faces_sum / total_segments if total_segments > 0 else 0
|
||||
max_faces = max(dist.keys()) if dist else 0
|
||||
|
||||
print(f"\n📊 Summary:")
|
||||
print("\n📊 Summary:")
|
||||
print(f" Average faces per segment: {avg_faces:.1f}")
|
||||
print(f" Max faces in a segment: {max_faces}")
|
||||
print(
|
||||
@@ -110,20 +110,20 @@ def print_stats(dist, total_segments):
|
||||
)
|
||||
|
||||
# Show some example segments
|
||||
print(f"\n🔍 Example Segments:")
|
||||
print(f" 0 faces:")
|
||||
print("\n🔍 Example Segments:")
|
||||
print(" 0 faces:")
|
||||
examples = [s for s in segment_details if s["face_count"] == 0][:3]
|
||||
for ex in examples:
|
||||
print(f" [{ex['start']:.0f}s-{ex['end']:.0f}s] {ex['text']}...")
|
||||
|
||||
print(f" 1 face:")
|
||||
print(" 1 face:")
|
||||
examples = [s for s in segment_details if s["face_count"] == 1][:3]
|
||||
for ex in examples:
|
||||
print(
|
||||
f" [{ex['start']:.0f}s-{ex['end']:.0f}s] {ex['person_ids'][0]}: {ex['text']}..."
|
||||
)
|
||||
|
||||
print(f" 3 faces:")
|
||||
print(" 3 faces:")
|
||||
examples = [s for s in segment_details if s["face_count"] == 3][:3]
|
||||
for ex in examples:
|
||||
pids = ", ".join(ex["person_ids"])
|
||||
|
||||
@@ -18,12 +18,10 @@ Configuration:
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import argparse
|
||||
import signal
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
PROCESSOR_VERSION = "2.1"
|
||||
@@ -164,44 +162,127 @@ def run_asr(video_path, output_path, uuid: str = ""):
|
||||
sys.stderr.flush()
|
||||
sys.exit(0)
|
||||
|
||||
# 嘗試以 CUT 場景分段處理(降低長片記憶體使用)
|
||||
cut_scenes = []
|
||||
cut_path = output_path.replace(".asr.json", ".cut.json")
|
||||
if os.path.exists(cut_path):
|
||||
try:
|
||||
with open(cut_path) as f:
|
||||
cut_data = json.load(f)
|
||||
scenes = cut_data.get("scenes", [])
|
||||
if scenes:
|
||||
cut_scenes = [(s["start_time"], s["end_time"]) for s in scenes]
|
||||
print(f"[ASR] Loaded {len(cut_scenes)} cut scenes for segmented transcription", file=sys.stderr)
|
||||
except Exception as e:
|
||||
print(f"[ASR] Failed to load cut scenes: {e}", file=sys.stderr)
|
||||
|
||||
if publisher:
|
||||
publisher.info("asr", "Loading Whisper model...")
|
||||
|
||||
# Use small model with CPU (MPS not supported by faster_whisper)
|
||||
# small 模型在準確率和速度間取得最佳平衡
|
||||
model = WhisperModel("small", device="cpu", compute_type="int8")
|
||||
model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
|
||||
|
||||
if publisher:
|
||||
publisher.info("asr", f"Transcribing: {video_path}")
|
||||
|
||||
# Transcribe with VAD filter for better accuracy, with PyAV fallback
|
||||
segments, info = transcribe_with_fallback(model, video_path, publisher)
|
||||
|
||||
if publisher:
|
||||
publisher.info("asr", f"ASR_LANGUAGE:{info.language}")
|
||||
|
||||
results = []
|
||||
total_segments = 0
|
||||
|
||||
for segment in segments:
|
||||
results.append(
|
||||
{"start": segment.start, "end": segment.end, "text": segment.text.strip()}
|
||||
)
|
||||
total_segments += 1
|
||||
if total_segments % 100 == 0:
|
||||
if publisher:
|
||||
publisher.progress(
|
||||
"asr", total_segments, 0, f"Segment {total_segments}"
|
||||
if cut_scenes:
|
||||
# 分段處理:對每個場景萃取音訊並轉錄
|
||||
import subprocess
|
||||
import tempfile
|
||||
import json
|
||||
temp_dir = tempfile.mkdtemp(prefix="asr_cut_")
|
||||
transcript_language = None
|
||||
|
||||
# 建立 scene lookup: 給定時間點,找是哪個 scene
|
||||
import bisect
|
||||
scene_starts = [s[0] for s in cut_scenes]
|
||||
def find_scene_idx(t):
|
||||
i = bisect.bisect_right(scene_starts, t) - 1
|
||||
return max(0, i)
|
||||
|
||||
# 逐段處理,每段結果即時寫入 .asr.tmp
|
||||
tmp_path = output_path + ".tmp"
|
||||
all_segments = []
|
||||
|
||||
for idx, (start_t, end_t) in enumerate(cut_scenes):
|
||||
seg_wav = os.path.join(temp_dir, f"seg_{idx:04d}.wav")
|
||||
# 用 ffmpeg 萃取出該段音訊
|
||||
cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
|
||||
"-ss", str(start_t), "-to", str(end_t),
|
||||
"-ar", "16000", "-ac", "1", seg_wav]
|
||||
subprocess.run(cmd, check=False, capture_output=True)
|
||||
|
||||
if not os.path.exists(seg_wav) or os.path.getsize(seg_wav) < 100:
|
||||
continue # 跳過空音訊
|
||||
|
||||
try:
|
||||
seg_result, seg_info = model.transcribe(
|
||||
seg_wav, beam_size=5,
|
||||
vad_filter=True,
|
||||
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
|
||||
)
|
||||
if transcript_language is None:
|
||||
transcript_language = seg_info.language
|
||||
|
||||
output = {
|
||||
"language": info.language,
|
||||
"language_probability": info.language_probability,
|
||||
"segments": results,
|
||||
}
|
||||
scene_segments = []
|
||||
for segment in seg_result:
|
||||
seg_start = start_t + segment.start
|
||||
seg_end = start_t + segment.end
|
||||
scene_idx = find_scene_idx((seg_start + seg_end) / 2)
|
||||
scene_segments.append({
|
||||
"start": seg_start,
|
||||
"end": seg_end,
|
||||
"text": segment.text.strip(),
|
||||
"scene_number": scene_idx + 1,
|
||||
})
|
||||
total_segments += 1
|
||||
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(output, f, indent=2)
|
||||
# 當前 scene 結果寫入 .asr.tmp
|
||||
all_segments.extend(scene_segments)
|
||||
with open(tmp_path, "w") as f:
|
||||
json.dump({"language": transcript_language or "", "segments": all_segments}, f)
|
||||
|
||||
if total_segments % 100 == 0:
|
||||
if publisher:
|
||||
publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
|
||||
except Exception as e:
|
||||
print(f"[ASR] Segment {idx} failed: {e}", file=sys.stderr)
|
||||
|
||||
# 清理暫存 WAV
|
||||
try: os.remove(seg_wav)
|
||||
except: pass
|
||||
|
||||
try: os.rmdir(temp_dir)
|
||||
except: pass
|
||||
|
||||
info_language = transcript_language or "unknown"
|
||||
print(f"[ASR] Segmented transcription complete: {total_segments} segments", file=sys.stderr)
|
||||
else:
|
||||
# 無 CUT 資料,直接轉錄(原有流程)
|
||||
segments, info = transcribe_with_fallback(model, video_path, publisher)
|
||||
info_language = info.language
|
||||
|
||||
tmp_path = output_path + ".tmp"
|
||||
all_segments = []
|
||||
for segment in segments:
|
||||
all_segments.append({
|
||||
"start": segment.start, "end": segment.end,
|
||||
"text": segment.text.strip(),
|
||||
})
|
||||
total_segments += 1
|
||||
if total_segments % 100 == 0:
|
||||
if publisher:
|
||||
publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
|
||||
with open(tmp_path, "w") as f:
|
||||
json.dump({"language": info_language, "segments": all_segments}, f)
|
||||
|
||||
if publisher:
|
||||
publisher.info("asr", f"ASR_LANGUAGE:{info_language}")
|
||||
|
||||
# rename .tmp → .json
|
||||
os.rename(tmp_path, output_path)
|
||||
|
||||
if publisher:
|
||||
publisher.complete("asr", f"{len(results)} segments")
|
||||
|
||||
@@ -2,12 +2,19 @@
|
||||
"""
|
||||
ASRX Processor - Custom Implementation Wrapper
|
||||
Uses SpeechBrain ECAPA-TDNN (no HuggingFace token required)
|
||||
|
||||
Pipeline:
|
||||
1. Preprocess: ffprobe audio tracks → select best track → extract WAV
|
||||
2. Process: VAD (Silero) → Speaker embedding (ECAPA-TDNN) → Spectral clustering
|
||||
3. Output: segments with speaker_id
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
@@ -18,6 +25,78 @@ sys.path.insert(
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def probe_audio_tracks(video_path: str) -> list:
    """List the audio streams of a media file using ffprobe.

    Runs ffprobe restricted to audio streams with JSON output and reduces
    every reported stream to the fields used for track selection.

    Args:
        video_path: Path of the media file to inspect.

    Returns:
        A list of dicts with the keys ``index``, ``codec``, ``language``
        (``"und"`` when the stream has no language tag), ``channels``
        (``0`` when not reported) and ``sample_rate`` (``"0"`` when not
        reported). An empty list is returned when ffprobe cannot be run,
        exceeds the 30 second timeout, or its output is not valid JSON.
    """
    cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-select_streams", "a", video_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        data = json.loads(result.stdout)
        return [
            {
                "index": stream.get("index"),
                "codec": stream.get("codec_name"),
                "language": stream.get("tags", {}).get("language", "und"),
                "channels": stream.get("channels", 0),
                "sample_rate": stream.get("sample_rate", "0"),
            }
            for stream in data.get("streams", [])
        ]
    except Exception as e:
        # Probing is best-effort: the caller treats an empty list as
        # "no track information". The diagnostic goes to stderr, like the
        # rest of the ASRX pipeline logging, so stdout stays clean.
        print(f"[ASRX] ffprobe failed: {e}", file=sys.stderr)
        return []
|
||||
|
||||
|
||||
def select_best_track(tracks: list) -> int:
    """Choose which probed audio track to use.

    Preference order:
      1. The first track whose language tag is English (``eng`` / ``en``,
         compared case-insensitively).
      2. Otherwise the track with the most channels; the earliest track
         wins ties.
      3. ``0`` when ``tracks`` is empty.

    Args:
        tracks: Track dicts as produced by ``probe_audio_tracks`` (keys
            ``index``, ``language``, ``channels``). Missing or ``None``
            values are tolerated.

    Returns:
        The position of the chosen track inside ``tracks`` (NOT the
        container stream index; callers look that up via
        ``tracks[pos]["index"]``).
    """
    if not tracks:
        return 0

    # Priority 1: English track
    for pos, track in enumerate(tracks):
        language = str(track.get("language") or "").strip().lower()
        if language in ("eng", "en"):
            print(
                f"[ASRX] Selected English track (index {track.get('index')})",
                file=sys.stderr,
            )
            return pos

    # Priority 2: track with the most channels. max() returns the first
    # maximal element, which keeps the "earliest wins ties" behaviour.
    def channel_count(pos):
        return tracks[pos].get("channels") or 0

    best = max(range(len(tracks)), key=channel_count)

    print(
        f"[ASRX] Selected track {best} "
        f"(lang={tracks[best].get('language')}, ch={tracks[best].get('channels')})",
        file=sys.stderr,
    )
    return best
|
||||
|
||||
|
||||
def extract_audio_to_wav(video_path: str, track_index: int, output_wav: str) -> bool:
    """Extract one stream of a media file to a 16 kHz mono 16-bit WAV.

    Args:
        video_path: Path of the input media file.
        track_index: Stream index within the input, used as ``-map 0:<index>``.
        output_wav: Destination path; an existing file is overwritten (``-y``).

    Returns:
        True when ffmpeg exits successfully, False when it cannot be
        started, exits non-zero, or exceeds the 300 second timeout.
    """
    cmd = [
        "ffmpeg", "-y", "-v", "quiet",
        "-i", video_path,
        "-map", f"0:{track_index}",
        "-ar", "16000",
        "-ac", "1",
        "-sample_fmt", "s16",
        output_wav,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True, timeout=300)
    except Exception as e:
        # Best-effort: the caller falls back to the original file when
        # extraction fails. Log to stderr like the rest of the pipeline
        # so stdout is not polluted with diagnostics.
        print(f"[ASRX] ffmpeg extraction failed: {e}", file=sys.stderr)
        return False
    return True
|
||||
|
||||
|
||||
def _cleanup(tmp_dir):
|
||||
"""Clean up temporary directory."""
|
||||
if tmp_dir and os.path.exists(tmp_dir):
|
||||
import shutil
|
||||
shutil.rmtree(tmp_dir, ignore_errors=True)
|
||||
|
||||
|
||||
def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
|
||||
"""Process video for speaker diarization using custom implementation"""
|
||||
|
||||
@@ -25,25 +104,102 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
|
||||
if publisher:
|
||||
publisher.info("asrx", "ASRX_START")
|
||||
|
||||
tmp_dir = None
|
||||
|
||||
try:
|
||||
# Ensure working directory is the scripts dir for model loading
|
||||
script_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
os.chdir(script_dir)
|
||||
|
||||
# Debug: check ffmpeg availability
|
||||
import shutil
|
||||
ffmpeg_path = shutil.which("ffmpeg")
|
||||
print(f"[ASRX] ffmpeg: {ffmpeg_path}", file=sys.stderr)
|
||||
print(f"[ASRX] CWD: {os.getcwd()}", file=sys.stderr)
|
||||
|
||||
# ---- Stage 1: Audio Track Preprocessing ----
|
||||
print("\n[ASRX] ===== Stage 1: Audio Track Analysis =====", file=sys.stderr)
|
||||
print(f"[ASRX] Input: {video_path}", file=sys.stderr)
|
||||
|
||||
tracks = probe_audio_tracks(video_path)
|
||||
if tracks:
|
||||
print(f"[ASRX] Found {len(tracks)} audio track(s):", file=sys.stderr)
|
||||
for t in tracks:
|
||||
print(f" Track {t['index']}: {t['codec']} {t['channels']}ch {t['sample_rate']}Hz lang={t['language']}", file=sys.stderr)
|
||||
else:
|
||||
print("[ASRX] No audio tracks found via ffprobe, using raw file", file=sys.stderr)
|
||||
|
||||
# Select best track
|
||||
track_idx = select_best_track(tracks) if tracks else 0
|
||||
actual_track_index = tracks[track_idx]["index"] if tracks else track_idx
|
||||
|
||||
# Extract audio to WAV
|
||||
tmp_dir = tempfile.mkdtemp(prefix="asrx_")
|
||||
wav_path = os.path.join(tmp_dir, "audio.wav")
|
||||
|
||||
if extract_audio_to_wav(video_path, actual_track_index, wav_path):
|
||||
wav_size = os.path.getsize(wav_path)
|
||||
print(f"[ASRX] Audio extracted: {wav_path} ({wav_size / 1024 / 1024:.1f}MB)", file=sys.stderr)
|
||||
audio_input = wav_path
|
||||
else:
|
||||
print("[ASRX] Audio extraction failed, falling back to original file", file=sys.stderr)
|
||||
audio_input = video_path
|
||||
|
||||
# ---- Stage 2: Load ASR segments for time alignment ----
|
||||
# Try multiple paths to find ASR JSON
|
||||
asr_segments = []
|
||||
asr_fallback_reason = ""
|
||||
asr_candidates = [
|
||||
output_path.replace(".asrx.json", ".asr.json") if output_path else "",
|
||||
os.path.join(os.path.dirname(output_path) if output_path else ".", os.path.basename(video_path).rsplit(".", 1)[0] + ".asr.json"),
|
||||
os.path.join(os.path.dirname(output_path) if output_path else ".", "dd61fda85fee441fdd00ab5528213ff7.asr.json"),
|
||||
]
|
||||
asr_path = ""
|
||||
for candidate in asr_candidates:
|
||||
if candidate and os.path.exists(candidate):
|
||||
asr_path = candidate
|
||||
break
|
||||
if asr_path:
|
||||
try:
|
||||
with open(asr_path) as f:
|
||||
asr_data = json.load(f)
|
||||
asr_segments = asr_data.get("segments", [])
|
||||
print(f"[ASRX] Loaded {len(asr_segments)} ASR segments from {asr_path}", file=sys.stderr)
|
||||
asr_fallback_reason = f"loaded_{len(asr_segments)}_segments"
|
||||
except Exception as e:
|
||||
asr_fallback_reason = f"load_error_{e}"
|
||||
print(f"[ASRX] Failed to load ASR segments: {e}", file=sys.stderr)
|
||||
else:
|
||||
asr_fallback_reason = f"asr_json_not_found_tried_{len(asr_candidates)}_paths"
|
||||
print(f"[ASRX] ASR output not found, tried {len(asr_candidates)} paths. First candidate: {asr_candidates[0]}", file=sys.stderr)
|
||||
|
||||
# ---- Stage 3: ASRX Processing ----
|
||||
from asrx_self.main_fixed import SelfASRXFixed
|
||||
|
||||
if publisher:
|
||||
publisher.info("asrx", "ASRX_LOADING_MODEL")
|
||||
|
||||
# Initialize custom ASRX processor
|
||||
asrx = SelfASRXFixed()
|
||||
|
||||
if publisher:
|
||||
publisher.info("asrx", "ASRX_TRANSCRIBING")
|
||||
|
||||
# Process video/audio
|
||||
result = asrx.process(
|
||||
video_path,
|
||||
output_path=None, # We'll save our own format
|
||||
min_speech_duration_ms=500,
|
||||
max_speakers=10,
|
||||
)
|
||||
if asr_segments:
|
||||
# Use ASR segment boundaries for speaker embedding extraction
|
||||
print(f"[ASRX] Using {len(asr_segments)} ASR segments for diarization", file=sys.stderr)
|
||||
result = asrx.process_with_segments(
|
||||
audio_input,
|
||||
asr_segments,
|
||||
output_path=None,
|
||||
)
|
||||
else:
|
||||
# Fallback: VAD-based diarization
|
||||
result = asrx.process(
|
||||
audio_input,
|
||||
output_path=None,
|
||||
min_speech_duration_ms=500,
|
||||
max_speakers=10,
|
||||
)
|
||||
|
||||
if "error" in result:
|
||||
if publisher:
|
||||
@@ -58,21 +214,47 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
|
||||
if publisher:
|
||||
publisher.complete("asrx", "0 segments")
|
||||
|
||||
_cleanup(tmp_dir)
|
||||
return output_result
|
||||
|
||||
# Convert to Rust-expected format
|
||||
# Convert to Rust-expected format (start_frame/end_frame/speaker)
|
||||
# Read fps from probe json ({file_uuid}.probe.json)
|
||||
_debug = {"asr_fallback": asr_fallback_reason, "asr_path": asr_path}
|
||||
fps = 30.0
|
||||
output_dir = os.path.dirname(output_path) if output_path else "."
|
||||
base_name = os.path.basename(output_path) if output_path else ""
|
||||
# Extract uuid from {uuid}.{type}.json format
|
||||
uuid_part = base_name.split(".")[0] if base_name else ""
|
||||
probe_candidates = [
|
||||
os.path.join(output_dir, f"{uuid_part}.probe.json"),
|
||||
]
|
||||
for p in probe_candidates:
|
||||
if os.path.exists(p):
|
||||
try:
|
||||
with open(p) as pf:
|
||||
probe_data = json.load(pf)
|
||||
if "fps" in probe_data:
|
||||
fps = float(probe_data["fps"])
|
||||
print(f"[ASRX] FPS from probe: {fps}", file=sys.stderr)
|
||||
break
|
||||
except:
|
||||
pass
|
||||
output_result = {
|
||||
"language": None, # Custom implementation doesn't detect language
|
||||
"language": None,
|
||||
"segments": [],
|
||||
}
|
||||
|
||||
# Convert segments
|
||||
for seg in result["segments"]:
|
||||
start_sec = seg["start"]
|
||||
end_sec = seg["end"]
|
||||
output_result["segments"].append(
|
||||
{
|
||||
"start": seg["start"],
|
||||
"end": seg["end"],
|
||||
"text": "", # Will be filled by matching with ASR later
|
||||
"start_time": start_sec,
|
||||
"end_time": end_sec,
|
||||
"start_frame": int(start_sec * fps),
|
||||
"end_frame": int(end_sec * fps),
|
||||
"text": "",
|
||||
"speaker_id": seg["speaker"],
|
||||
}
|
||||
)
|
||||
@@ -81,20 +263,24 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
|
||||
if "speaker_stats" in result:
|
||||
output_result["speaker_stats"] = result["speaker_stats"]
|
||||
|
||||
# 傳遞 embeddings(每個 segment 對應的 192-D speaker embedding)
|
||||
if "embeddings" in result:
|
||||
output_result["embeddings"] = result["embeddings"]
|
||||
|
||||
if publisher:
|
||||
publisher.info("asrx", f"ASRX_COMPLETE:{len(output_result['segments'])}")
|
||||
|
||||
# Save output
|
||||
output_result["_debug"] = _debug
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(output_result, f, indent=2)
|
||||
|
||||
if publisher:
|
||||
publisher.complete("asrx", f"{len(output_result['segments'])} segments")
|
||||
|
||||
print(
|
||||
f"[ASRX-Custom] Saved {len(output_result['segments'])} segments to {output_path}"
|
||||
)
|
||||
print(f"[ASRX-Custom] Saved {len(output_result['segments'])} segments to {output_path}", file=sys.stderr)
|
||||
|
||||
_cleanup(tmp_dir)
|
||||
return output_result
|
||||
|
||||
except Exception as e:
|
||||
@@ -114,6 +300,7 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
|
||||
if publisher:
|
||||
publisher.complete("asrx", "0 segments")
|
||||
|
||||
_cleanup(tmp_dir)
|
||||
return output_result
|
||||
|
||||
|
||||
@@ -133,7 +320,7 @@ if __name__ == "__main__":
|
||||
|
||||
result = process_asrx_custom(args.video_path, args.output_path, args.uuid)
|
||||
|
||||
print(f"\n[Summary]")
|
||||
print("\n[Summary]")
|
||||
print(f" Total segments: {len(result['segments'])}")
|
||||
if "speaker_stats" in result:
|
||||
print(f" Detected speakers: {len(result['speaker_stats'])}")
|
||||
|
||||
@@ -130,12 +130,12 @@ def main():
|
||||
integrated = match_face_with_speaker_v3(face_data, asrx_data, args.threshold)
|
||||
|
||||
# 分析
|
||||
print(f"\n[Analyze] Analyzing speaker-face correspondence...")
|
||||
print("\n[Analyze] Analyzing speaker-face correspondence...")
|
||||
speaker_stats = analyze_speaker_face(integrated)
|
||||
|
||||
# 顯示統計
|
||||
print(f"\n{'='*70}")
|
||||
print(f"說話人 - 人臉對應統計")
|
||||
print("說話人 - 人臉對應統計")
|
||||
print(f"{'='*70}")
|
||||
|
||||
total_segments = len(integrated)
|
||||
|
||||
@@ -16,7 +16,6 @@ Self-implemented ASRX - 自實作說話人分離系統
|
||||
import sys
|
||||
import json
|
||||
import time
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# 導入自定義模組
|
||||
@@ -182,7 +181,7 @@ class SelfASRX:
|
||||
result["processing_time"] = round(total_time, 2)
|
||||
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
|
||||
|
||||
print(f"\n[SelfASRX] Processing completed!")
|
||||
print("\n[SelfASRX] Processing completed!")
|
||||
print(f" Total time: {total_time:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
print(f" Detected speakers: {estimated_n_speakers}")
|
||||
@@ -249,14 +248,14 @@ def main():
|
||||
|
||||
# 顯示結果摘要
|
||||
if "error" not in result:
|
||||
print(f"\n[Summary]")
|
||||
print("\n[Summary]")
|
||||
print(f" Audio duration: {result['total_duration']:.2f}s")
|
||||
print(f" Speech segments: {result['n_speech_segments']}")
|
||||
print(f" Detected speakers: {result['n_speakers']}")
|
||||
print(f" Processing time: {result['processing_time']:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
|
||||
print(f"\n[Speaker Statistics]")
|
||||
print("\n[Speaker Statistics]")
|
||||
for speaker, stats in result["speaker_stats"].items():
|
||||
pct = stats["duration"] / result["total_duration"] * 100
|
||||
print(
|
||||
|
||||
@@ -134,7 +134,7 @@ class SelfASRXFixed:
|
||||
result["processing_time"] = round(total_time, 2)
|
||||
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
|
||||
|
||||
print(f"\n[SelfASRX-Fixed] Processing completed!")
|
||||
print("\n[SelfASRX-Fixed] Processing completed!")
|
||||
print(f" Total time: {total_time:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
print(f" Detected speakers: {estimated_n_speakers}")
|
||||
@@ -154,6 +154,117 @@ class SelfASRXFixed:
|
||||
return result
|
||||
|
||||
|
||||
def process_with_segments(self, audio_path, asr_segments, output_path=None):
    """
    Run speaker diarization on ASR segment boundaries instead of VAD.

    Each ASR segment is cut out of the audio, embedded with
    ``self.speaker_encoder``, and the embeddings are clustered into
    speakers.

    Args:
        audio_path: Path of the audio file (WAV), read with ``soundfile``.
        asr_segments: List of ASR segments; each item must provide
            ``"start"`` and ``"end"`` in seconds.
        output_path: Optional path; when given, the result is also
            written there as JSON.

    Returns:
        A dict with ``audio_path``, ``total_duration``,
        ``n_speech_segments``, ``n_speakers``, ``segments``,
        ``embeddings`` (one vector per entry of ``segments``),
        ``speaker_stats``, ``processing_time`` and ``realtime_factor``.
        Segments that start at or beyond the end of the audio are left
        out of ``segments`` and of ``n_speech_segments``, so segments,
        labels and embeddings always stay aligned.
        When there is nothing to diarize, a dict of the form
        ``{"error": <message>, "segments": []}`` is returned instead.
    """
    start_time = time.time()
    print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
    print("=" * 60)

    # Use the ASR segments in place of a VAD pass. Checked before loading
    # the audio so that empty input costs no file I/O.
    speech_segments = [(s["start"], s["end"]) for s in asr_segments]
    if not speech_segments:
        print("[SelfASRX-Fixed] No ASR segments provided!")
        return {"error": "No ASR segments", "segments": []}

    # Load the complete audio file.
    import soundfile as sf
    wav, sample_rate = sf.read(audio_path)
    if len(wav.shape) > 1:
        wav = np.mean(wav, axis=1)  # down-mix to mono
    print(f" Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")
    print(f" Speech segments from ASR: {len(speech_segments)}")

    # Cut out the audio of every segment. The time range is recorded
    # together with its audio so that a skipped segment cannot shift the
    # pairing between time ranges, speaker labels and embeddings.
    kept_segments = []
    audio_segments = []
    for start_sec, end_sec in speech_segments:
        start_sample = int(start_sec * sample_rate)
        end_sample = int(end_sec * sample_rate)
        if start_sample >= len(wav):
            continue  # segment lies entirely after the end of the audio
        kept_segments.append((start_sec, end_sec))
        audio_segments.append(wav[start_sample:min(end_sample, len(wav))])

    print(f" Audio segments extracted: {len(audio_segments)}")

    if not audio_segments:
        # Nothing to embed or cluster; report it the same way as empty input.
        print("[SelfASRX-Fixed] No ASR segments fall within the audio!")
        return {"error": "No ASR segments within audio duration", "segments": []}

    # Batch-extract the speaker embeddings.
    print("\n[Step 2] Speaker embedding extraction...")
    step2_start = time.time()
    embeddings = extract_speaker_embeddings_batch(
        self.speaker_encoder, audio_segments, sample_rate
    )
    embeddings = normalize_embeddings(embeddings)
    step2_time = time.time() - step2_start
    print(f" Embedding shape: {embeddings.shape}")
    print(f" Embedding time: {step2_time:.2f}s")

    # Cluster the embeddings into speakers.
    print("\n[Step 3] Robust speaker clustering...")
    step3_start = time.time()
    speaker_labels, estimated_n_speakers = robust_speaker_clustering(
        embeddings, n_speakers=None, max_speakers=10
    )
    step3_time = time.time() - step3_start
    print(f" Clustering time: {step3_time:.2f}s")

    # Assemble the output.
    result = {
        "audio_path": str(audio_path),
        "total_duration": len(wav) / sample_rate,
        "n_speech_segments": len(kept_segments),
        "n_speakers": int(estimated_n_speakers),
        "segments": [
            {
                "index": i,
                "start": round(start, 3),
                "end": round(end, 3),
                "duration": round(end - start, 3),
                "speaker": f"SPEAKER_{int(label)}",
            }
            for i, ((start, end), label) in enumerate(zip(kept_segments, speaker_labels))
        ],
    }

    # Attach the speaker embedding belonging to each output segment.
    result["embeddings"] = [emb.tolist() for emb in embeddings]

    # Per-speaker statistics: number of segments and total duration.
    speaker_stats = {}
    for seg in result["segments"]:
        stats = speaker_stats.setdefault(seg["speaker"], {"count": 0, "duration": 0})
        stats["count"] += 1
        stats["duration"] += seg["duration"]
    result["speaker_stats"] = speaker_stats

    total_time = time.time() - start_time
    result["processing_time"] = round(total_time, 2)
    # Guard against a zero elapsed time on coarse clocks.
    result["realtime_factor"] = (
        round(result["total_duration"] / total_time, 2) if total_time > 0 else 0.0
    )

    print("\n[SelfASRX-Fixed] Processing completed!")
    print(f" Total time: {total_time:.2f}s")
    print(f" Realtime factor: {result['realtime_factor']:.2f}x")
    print(f" Detected speakers: {estimated_n_speakers}")

    if output_path:
        import json
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f" Results saved to: {output_path}")

    print("=" * 60)
    return result
|
||||
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
|
||||
@@ -180,14 +291,14 @@ def main():
|
||||
)
|
||||
|
||||
if "error" not in result:
|
||||
print(f"\n[Summary]")
|
||||
print("\n[Summary]")
|
||||
print(f" Audio duration: {result['total_duration']:.2f}s")
|
||||
print(f" Speech segments: {result['n_speech_segments']}")
|
||||
print(f" Detected speakers: {result['n_speakers']}")
|
||||
print(f" Processing time: {result['processing_time']:.2f}s")
|
||||
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
|
||||
|
||||
print(f"\n[Speaker Statistics]")
|
||||
print("\n[Speaker Statistics]")
|
||||
for speaker, stats in result['speaker_stats'].items():
|
||||
pct = stats['duration'] / result['total_duration'] * 100
|
||||
print(f" {speaker}: {stats['count']} segments, " +
|
||||
|
||||
@@ -138,7 +138,7 @@ def spectral_clustering_speaker(
|
||||
|
||||
speaker_labels = clustering.fit_predict(similarity_matrix)
|
||||
|
||||
print(f"[Clustering] Spectral clustering completed")
|
||||
print("[Clustering] Spectral clustering completed")
|
||||
print(f"[Clustering] n_speakers: {n_speakers}")
|
||||
print(f"[Clustering] n_segments: {n_segments}")
|
||||
|
||||
@@ -146,7 +146,7 @@ def spectral_clustering_speaker(
|
||||
|
||||
except Exception as e:
|
||||
print(f"[Clustering] Spectral clustering failed: {e}")
|
||||
print(f"[Clustering] Using fallback: 2 speakers")
|
||||
print("[Clustering] Using fallback: 2 speakers")
|
||||
# 簡單分配:前一半是 SPEAKER_0,後一半是 SPEAKER_1
|
||||
speaker_labels = np.array(
|
||||
[0] * (n_segments // 2) + [1] * (n_segments - n_segments // 2)
|
||||
@@ -203,7 +203,7 @@ def agglomerative_clustering_speaker(
|
||||
|
||||
speaker_labels = clustering.fit_predict(embeddings)
|
||||
|
||||
print(f"[Clustering] Agglomerative clustering completed")
|
||||
print("[Clustering] Agglomerative clustering completed")
|
||||
print(f"[Clustering] n_speakers: {n_speakers}")
|
||||
|
||||
return speaker_labels, n_speakers
|
||||
@@ -249,7 +249,6 @@ def compute_diarization_purity(speaker_labels, ground_truth_labels=None):
|
||||
"""
|
||||
if ground_truth_labels is None:
|
||||
# 沒有 ground truth,使用聚類純度近似
|
||||
from sklearn.metrics import silhouette_score
|
||||
|
||||
# 使用餘弦相似度作為距離
|
||||
purity = 0.5 # 預設值
|
||||
@@ -300,7 +299,7 @@ if __name__ == "__main__":
|
||||
similarity, n_speakers=None, auto_estimate=True
|
||||
)
|
||||
|
||||
print(f"\n[Test] Clustering results:")
|
||||
print("\n[Test] Clustering results:")
|
||||
print(f" True n_speakers: {n_speakers}")
|
||||
print(f" Estimated n_speakers: {n_clusters}")
|
||||
print(f" Unique labels: {np.unique(labels)}")
|
||||
|
||||
@@ -6,7 +6,6 @@ Speaker Clustering - Fixed Version
|
||||
|
||||
import numpy as np
|
||||
from sklearn.cluster import AgglomerativeClustering
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
|
||||
def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
|
||||
@@ -57,7 +56,7 @@ def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
|
||||
|
||||
# 統計每個聚類的大小
|
||||
unique, counts = np.unique(speaker_labels, return_counts=True)
|
||||
print(f"[Clustering] Cluster sizes:")
|
||||
print("[Clustering] Cluster sizes:")
|
||||
for label, count in zip(unique, counts):
|
||||
print(f" SPEAKER_{label}: {count} segments ({count/n_segments*100:.1f}%)")
|
||||
|
||||
@@ -148,6 +147,6 @@ if __name__ == "__main__":
|
||||
# 測試聚類
|
||||
labels, n_clusters = robust_speaker_clustering(embeddings)
|
||||
|
||||
print(f"\nResult:")
|
||||
print("\nResult:")
|
||||
print(f" True n_speakers: {n_speakers}")
|
||||
print(f" Estimated n_speakers: {n_clusters}")
|
||||
|
||||
@@ -33,8 +33,8 @@ def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb"):
|
||||
)
|
||||
|
||||
# 獲取模型資訊
|
||||
print(f"[SpeakerEncoder] Model loaded successfully")
|
||||
print(f"[SpeakerEncoder] Embedding dimension: 192")
|
||||
print("[SpeakerEncoder] Model loaded successfully")
|
||||
print("[SpeakerEncoder] Embedding dimension: 192")
|
||||
|
||||
return classifier
|
||||
|
||||
@@ -187,5 +187,5 @@ if __name__ == "__main__":
|
||||
print(f"[Test] Embedding std: {embedding.std():.4f}")
|
||||
|
||||
# 顯示部分嵌入值
|
||||
print(f"\n[Test] First 10 embedding values:")
|
||||
print("\n[Test] First 10 embedding values:")
|
||||
print(f" {embedding[:10]}")
|
||||
|
||||
@@ -11,7 +11,6 @@ import os
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
try:
|
||||
import tkinter as tk
|
||||
|
||||
@@ -11,7 +11,6 @@ import os
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
try:
|
||||
import tkinter as tk
|
||||
@@ -203,7 +202,7 @@ class SpeakerPlayerGUI:
|
||||
self.face_path = filename
|
||||
self.face_label.config(text=Path(filename).name)
|
||||
self.integrate_button.config(state=tk.NORMAL)
|
||||
self.status_label.config(text=f"✅ Face 已選擇 - 請點擊整合")
|
||||
self.status_label.config(text="✅ Face 已選擇 - 請點擊整合")
|
||||
|
||||
def integrate_face(self):
|
||||
"""整合 Face 與 ASRX"""
|
||||
|
||||
@@ -93,14 +93,14 @@ def show_menu(speaker_segments: Dict[str, List[Dict]], speaker_id: str):
|
||||
print(f" ... and {len(segs) - 20} more segments")
|
||||
|
||||
print(f"\n{'=' * 70}")
|
||||
print(f"Commands:")
|
||||
print("Commands:")
|
||||
print(f" [1-{min(20, len(segs))}] Play specific segment")
|
||||
print(f" all Play all segments (may take a while)")
|
||||
print(f" first N Play first N segments")
|
||||
print(f" next Next speaker")
|
||||
print(f" prev Previous speaker")
|
||||
print(f" list List all speakers")
|
||||
print(f" quit Exit")
|
||||
print(" all Play all segments (may take a while)")
|
||||
print(" first N Play first N segments")
|
||||
print(" next Next speaker")
|
||||
print(" prev Previous speaker")
|
||||
print(" list List all speakers")
|
||||
print(" quit Exit")
|
||||
print(f"{'=' * 70}")
|
||||
|
||||
|
||||
@@ -132,7 +132,7 @@ def interactive_player(audio_path: str, result_path: str):
|
||||
|
||||
current_speaker_idx = 0
|
||||
|
||||
print(f"\n🎬 Speaker Audio Player")
|
||||
print("\n🎬 Speaker Audio Player")
|
||||
print(f"📁 Audio: {audio_path}")
|
||||
print(f"📊 Speakers: {len(speakers)}")
|
||||
print(f"{'=' * 70}")
|
||||
@@ -159,7 +159,7 @@ def interactive_player(audio_path: str, result_path: str):
|
||||
print(
|
||||
f" ⏱️ {seg['start']:.2f}s - {seg['end']:.2f}s ({seg['duration']:.2f}s)"
|
||||
)
|
||||
print(f" ▶️ Playing...", end="", flush=True)
|
||||
print(" ▶️ Playing...", end="", flush=True)
|
||||
if extract_and_play(audio_path, seg["start"], seg["end"]):
|
||||
print(" ✅ Done")
|
||||
else:
|
||||
@@ -220,7 +220,7 @@ def interactive_player(audio_path: str, result_path: str):
|
||||
# 列出所有說話人
|
||||
elif cmd == "list":
|
||||
print(f"\n{'=' * 70}")
|
||||
print(f"📢 All speakers:")
|
||||
print("📢 All speakers:")
|
||||
print(f"{'=' * 70}")
|
||||
for i, speaker in enumerate(speakers, 1):
|
||||
segs = speaker_segments[speaker]
|
||||
|
||||
@@ -6,8 +6,6 @@ GUI Face Player 自動化測試腳本
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
@@ -55,7 +54,7 @@ def test_asrx_results():
|
||||
print(f"📊 語音片段:{n_segments}")
|
||||
|
||||
# 說話人統計
|
||||
print(f"\n📢 說話人分佈:")
|
||||
print("\n📢 說話人分佈:")
|
||||
speaker_stats = data.get('speaker_stats', {})
|
||||
for speaker, stats in sorted(speaker_stats.items(), key=lambda x: x[1]['duration'], reverse=True):
|
||||
duration = stats.get('duration', 0)
|
||||
@@ -102,7 +101,7 @@ def test_integration():
|
||||
print(f"📊 匹配率:{match_rate:.2f}%")
|
||||
|
||||
# 說話人匹配統計
|
||||
print(f"\n📢 說話人匹配詳情:")
|
||||
print("\n📢 說話人匹配詳情:")
|
||||
speaker_stats = data.get('speaker_stats', {})
|
||||
for speaker, stats in sorted(speaker_stats.items()):
|
||||
total_seg = stats.get('total_segments', 0)
|
||||
@@ -164,7 +163,7 @@ def test_playback():
|
||||
end = first_seg['end']
|
||||
duration = end - start
|
||||
|
||||
print(f"\n🎵 測試提取第一個片段:")
|
||||
print("\n🎵 測試提取第一個片段:")
|
||||
print(f" 時間:{start:.2f}s - {end:.2f}s ({duration:.2f}s)")
|
||||
|
||||
# 實際提取測試
|
||||
@@ -222,10 +221,10 @@ def generate_report():
|
||||
# 保存報告
|
||||
report_path = '/tmp/long_movie_test_report.md'
|
||||
with open(report_path, 'w', encoding='utf-8') as f:
|
||||
f.write(f"# 長影片測試報告\n\n")
|
||||
f.write("# 長影片測試報告\n\n")
|
||||
f.write(f"**測試時間**: {datetime.now().isoformat()}\n")
|
||||
f.write(f"**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
|
||||
f.write(f"## 結果\n\n")
|
||||
f.write("**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
|
||||
f.write("## 結果\n\n")
|
||||
f.write(f"**通過**: {passed}/{total}\n\n")
|
||||
for name, result in tests:
|
||||
status = "✅" if result else "❌"
|
||||
|
||||
@@ -9,7 +9,6 @@ VAD (Voice Activity Detection) - 語音活動檢測
|
||||
"""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
|
||||
def load_vad_model():
|
||||
@@ -143,7 +142,7 @@ if __name__ == "__main__":
|
||||
print(f"[VAD] Processing: {audio_path}")
|
||||
segments, wav, sr = extract_speech_segments(audio_path, model, utils)
|
||||
|
||||
print(f"\n[VAD] Results:")
|
||||
print("\n[VAD] Results:")
|
||||
print(f" Sample rate: {sr} Hz")
|
||||
print(f" Speech segments: {len(segments)}")
|
||||
print(f" Total duration: {len(wav) / sr:.2f}s")
|
||||
@@ -153,7 +152,7 @@ if __name__ == "__main__":
|
||||
f" Total speech: {total_speech:.2f}s ({total_speech / (len(wav) / sr) * 100:.1f}%)"
|
||||
)
|
||||
|
||||
print(f"\n[VAD] Segments:")
|
||||
print("\n[VAD] Segments:")
|
||||
for i, (start, end) in enumerate(segments[:10]):
|
||||
print(f" {i + 1:3d}. {start:6.2f}s - {end:6.2f}s ({end - start:5.2f}s)")
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ Audio Taxonomy Processor (Hugging Face Transformers)
|
||||
職責:使用 AST 模型進行高精度音頻分類,並映射到業務分類。
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
@@ -75,7 +74,7 @@ def map_to_taxonomy(predictions):
|
||||
|
||||
def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
|
||||
"""執行分類"""
|
||||
print(f"🔍 Loading AST model (MIT) from Hugging Face...")
|
||||
print("🔍 Loading AST model (MIT) from Hugging Face...")
|
||||
# 使用 Audio Spectrogram Transformer,準確率高且支援 MPS/CPU
|
||||
classifier = pipeline(
|
||||
"audio-classification",
|
||||
@@ -103,7 +102,7 @@ def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
|
||||
|
||||
if taxonomy:
|
||||
results.append({"timestamp": round(current, 1), "categories": taxonomy})
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass # 跳過錯誤片段
|
||||
|
||||
current += hop_sec
|
||||
@@ -132,6 +131,6 @@ if __name__ == "__main__":
|
||||
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
|
||||
json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"\n🎉 Classification Complete!")
|
||||
print("\n🎉 Classification Complete!")
|
||||
print(f"✅ Found {len(events)} tagged audio segments.")
|
||||
print(f"💾 Saved to {OUTPUT_JSON}")
|
||||
|
||||
@@ -99,7 +99,7 @@ def map_to_taxonomy(logits, model):
|
||||
|
||||
def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
|
||||
"""執行分類"""
|
||||
print(f"🔍 Loading AST model (MIT)...")
|
||||
print("🔍 Loading AST model (MIT)...")
|
||||
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
|
||||
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
|
||||
@@ -167,6 +167,6 @@ if __name__ == "__main__":
|
||||
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
|
||||
json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"\n🎉 Classification Complete!")
|
||||
print("\n🎉 Classification Complete!")
|
||||
print(f"✅ Found {len(events)} tagged audio segments.")
|
||||
print(f"💾 Saved to {OUTPUT_JSON}")
|
||||
|
||||
@@ -105,7 +105,7 @@ def main():
|
||||
|
||||
# 6. Generate report
|
||||
print(f"\n{'=' * 60}")
|
||||
print(f"📊 Person Identification Results")
|
||||
print("📊 Person Identification Results")
|
||||
print(f"{'=' * 60}")
|
||||
|
||||
# Sort by frame count
|
||||
@@ -177,7 +177,7 @@ def main():
|
||||
print(f"✅ Executed {executed} SQL statements")
|
||||
|
||||
# 9. Generate SQL INSERT statements for person_identities
|
||||
print(f"\n--- SQL INSERT statements for person_identities ---")
|
||||
print("\n--- SQL INSERT statements for person_identities ---")
|
||||
for p in output["persons"][:10]:
|
||||
speaker_val = f"'{p['speaker_id']}'" if p["speaker_id"] else "NULL"
|
||||
print(
|
||||
|
||||
@@ -4,11 +4,9 @@ Backfill missing Age & Gender for persons.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import cv2
|
||||
import psycopg2
|
||||
import insightface
|
||||
import numpy as np
|
||||
|
||||
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
|
||||
BASE_VIDEO_DIR = "output"
|
||||
@@ -94,7 +92,7 @@ def main():
|
||||
else:
|
||||
print(f" -> Detection incomplete (Age:{age}, Gender:{gender})")
|
||||
else:
|
||||
print(f" -> No face found in frame.")
|
||||
print(" -> No face found in frame.")
|
||||
|
||||
print("=== Done ===")
|
||||
conn.close()
|
||||
|
||||
@@ -10,8 +10,8 @@ from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
UUID = "384b0ff44aaaa1f1"
|
||||
OUTPUT_DIR = f"output/{UUID}/florence2_results"
|
||||
INPUT_IMG = os.path.join(OUTPUT_DIR, f"raw_6846.jpg")
|
||||
OUTPUT_IMG = os.path.join(OUTPUT_DIR, f"all_stamps_detected.jpg")
|
||||
INPUT_IMG = os.path.join(OUTPUT_DIR, "raw_6846.jpg")
|
||||
OUTPUT_IMG = os.path.join(OUTPUT_DIR, "all_stamps_detected.jpg")
|
||||
|
||||
# Patch for compatibility (Same as before)
|
||||
import types
|
||||
|
||||
@@ -67,10 +67,10 @@ def main():
|
||||
|
||||
all_passed = doc_check_success and code_doc_check_success
|
||||
if all_passed:
|
||||
print(f"\n🎉 所有檢查通過!")
|
||||
print("\n🎉 所有檢查通過!")
|
||||
print("架構文檔符合 Phase 1 標準化要求。")
|
||||
else:
|
||||
print(f"\n⚠️ 發現問題,請參考檢查結果進行修復。")
|
||||
print("\n⚠️ 發現問題,請參考檢查結果進行修復。")
|
||||
print("提示:")
|
||||
print(" 1. 使用 TERMINOLOGY_MAPPING.md 作為術語標準參考")
|
||||
print(" 2. 確保設計與實現差異在 DESIGN_IMPLEMENTATION_GAP.md 中記錄")
|
||||
|
||||
@@ -12,14 +12,13 @@
|
||||
python3 scripts/check_architecture_docs.py [--report] [--verbose]
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import glob
|
||||
import json
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Set, Tuple, Optional
|
||||
from typing import Dict, List, Set, Optional
|
||||
from collections import defaultdict
|
||||
|
||||
# 配置
|
||||
@@ -410,15 +409,15 @@ class ArchitectureDocChecker:
|
||||
print(f"{'=' * 60}")
|
||||
print(f"📁 檢查文件數: {total_files}")
|
||||
print(f"⚠️ 發現問題數: {total_issues}")
|
||||
print(f"\n問題分類:")
|
||||
print("\n問題分類:")
|
||||
for issue_type, count in report["summary"]["issues_by_type"].items():
|
||||
print(f" - {issue_type}: {count}")
|
||||
print(f"\n嚴重程度:")
|
||||
print("\n嚴重程度:")
|
||||
for severity, count in report["summary"]["issues_by_severity"].items():
|
||||
print(f" - {severity}: {count}")
|
||||
|
||||
if total_issues > 0:
|
||||
print(f"\n🔍 詳細問題:")
|
||||
print("\n🔍 詳細問題:")
|
||||
for file_report in report["files"]:
|
||||
if file_report["issues"]:
|
||||
print(f"\n文件: {file_report['file']}")
|
||||
@@ -474,7 +473,7 @@ def main():
|
||||
print(f"\n❌ 發現 {report['summary']['total_issues']} 個問題,請修復")
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(f"\n✅ 所有檢查通過!")
|
||||
print("\n✅ 所有檢查通過!")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
|
||||
@@ -6,9 +6,7 @@
|
||||
核心原則:當設計與實現出現矛盾時,以實際的 Rust 代碼實現為最高權威
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
@@ -177,7 +175,7 @@ def main():
|
||||
issues = check_terminology_consistency(implemented_variants)
|
||||
|
||||
# 3. 顯示結果
|
||||
print(f"\n📊 檢查完成:")
|
||||
print("\n📊 檢查完成:")
|
||||
print(f" 發現問題數: {len(issues)}")
|
||||
|
||||
if issues:
|
||||
|
||||
@@ -5,7 +5,6 @@ Analyze Frame at 112:36 (6756s) for Stamps
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import types
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
@@ -5,7 +5,6 @@ Analyze Frame at 91:59 (5519s) for Stamps
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import types
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
@@ -6,7 +6,6 @@ Generates a comprehensive report of each chunk's content.
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
UUID = "384b0ff44aaaa1f1"
|
||||
BASE_DIR = f"output/{UUID}"
|
||||
@@ -107,7 +106,7 @@ def print_summary(chunks):
|
||||
1 for c in chunks if not c["has_speech"] and not c["has_faces"]
|
||||
)
|
||||
|
||||
print(f"\n📊 Overview:")
|
||||
print("\n📊 Overview:")
|
||||
print(f" Total chunks: {len(chunks)}")
|
||||
print(
|
||||
f" Chunks with speech: {total_speech_chunks} ({total_speech_chunks / len(chunks) * 100:.0f}%)"
|
||||
@@ -125,7 +124,7 @@ def print_summary(chunks):
|
||||
print(f" Total face frames: {total_faces}")
|
||||
|
||||
# Combination breakdown
|
||||
print(f"\n🎯 ASR/Face Combination Breakdown:")
|
||||
print("\n🎯 ASR/Face Combination Breakdown:")
|
||||
|
||||
combos = {}
|
||||
for c in chunks:
|
||||
@@ -148,7 +147,7 @@ def print_summary(chunks):
|
||||
)
|
||||
|
||||
# Top chunks by activity
|
||||
print(f"\n🔥 Top 10 Most Active Chunks (by ASR+Faces):")
|
||||
print("\n🔥 Top 10 Most Active Chunks (by ASR+Faces):")
|
||||
scored_chunks = []
|
||||
for c in chunks:
|
||||
score = c["asr_count"] + c["face_count"]
|
||||
@@ -164,7 +163,7 @@ def print_summary(chunks):
|
||||
)
|
||||
|
||||
# Stamp scene chunk
|
||||
print(f"\n🔍 Special Interest Chunks:")
|
||||
print("\n🔍 Special Interest Chunks:")
|
||||
for c in chunks:
|
||||
# Stamp scene around 5730s
|
||||
if c["start"] <= 5730 <= c["end"]:
|
||||
|
||||
@@ -256,7 +256,7 @@ def test_similarity_search(
|
||||
result = cur.fetchone()
|
||||
|
||||
if not result or not result[0]:
|
||||
print(f"⚠️ Identity embedding not found")
|
||||
print("⚠️ Identity embedding not found")
|
||||
return []
|
||||
|
||||
stored_embedding_raw = result[0]
|
||||
@@ -323,7 +323,7 @@ def main():
|
||||
logo_path = TEMP_DIR / f"{name.replace(' ', '_')}.png"
|
||||
|
||||
if not logo_path.exists():
|
||||
print(f"\n🔧 Downloading logo...")
|
||||
print("\n🔧 Downloading logo...")
|
||||
if not download_image(logo_url, logo_path):
|
||||
sys.exit(1)
|
||||
|
||||
@@ -334,18 +334,18 @@ def main():
|
||||
if args.performance:
|
||||
perf_result = test_mps_performance(model, processor, device, logo_path, iterations=10)
|
||||
if perf_result:
|
||||
print(f"\n📊 Performance Summary:")
|
||||
print("\n📊 Performance Summary:")
|
||||
print(f" MPS: {perf_result['mps_time']:.4f}s/img")
|
||||
print(f" CPU: {perf_result['cpu_time']:.4f}s/img")
|
||||
print(f" Speedup: {perf_result['speedup']:.2f}x")
|
||||
|
||||
print(f"\n🔧 Extracting CLIP embedding...")
|
||||
print("\n🔧 Extracting CLIP embedding...")
|
||||
embedding = extract_clip_embedding(model, processor, device, logo_path)
|
||||
|
||||
if not embedding:
|
||||
sys.exit(1)
|
||||
|
||||
print(f"\n🔧 Registering to database...")
|
||||
print("\n🔧 Registering to database...")
|
||||
uuid = register_logo_identity_to_db(
|
||||
name=name,
|
||||
logo_url=logo_url,
|
||||
@@ -354,13 +354,13 @@ def main():
|
||||
)
|
||||
|
||||
if uuid:
|
||||
print(f"\n🎉 Integration completed!")
|
||||
print("\n🎉 Integration completed!")
|
||||
print(f" Identity: {name}")
|
||||
print(f" UUID: {uuid}")
|
||||
print(f" Embedding: {len(embedding)}-dim")
|
||||
print(f" URL: {logo_url}")
|
||||
|
||||
print(f"\n🔧 Testing similarity search...")
|
||||
print("\n🔧 Testing similarity search...")
|
||||
test_embeddings = [
|
||||
embedding,
|
||||
[0.1] * 768,
|
||||
@@ -369,9 +369,9 @@ def main():
|
||||
matches = test_similarity_search(uuid, test_embeddings, threshold=0.85, schema=args.schema)
|
||||
|
||||
if matches:
|
||||
print(f"\n✅ Similarity search test passed")
|
||||
print("\n✅ Similarity search test passed")
|
||||
else:
|
||||
print(f"\n❌ Integration failed")
|
||||
print("\n❌ Integration failed")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ ASR方案内容对比分析
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from difflib import unified_diff, SequenceMatcher
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
def load_segments(json_path):
|
||||
"""加载JSON文件中的segments"""
|
||||
@@ -25,7 +25,7 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
|
||||
print(f"{'='*60}")
|
||||
|
||||
# 统计
|
||||
print(f"\n【数量对比】")
|
||||
print("\n【数量对比】")
|
||||
print(f" {name_a}: {len(seg_a)} segments")
|
||||
print(f" {name_b}: {len(seg_b)} segments")
|
||||
print(f" 差异: {len(seg_a) - len(seg_b)} segments")
|
||||
@@ -34,7 +34,7 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
|
||||
total_time_a = sum(s['end'] - s['start'] for s in seg_a)
|
||||
total_time_b = sum(s['end'] - s['start'] for s in seg_b)
|
||||
|
||||
print(f"\n【时间覆盖】")
|
||||
print("\n【时间覆盖】")
|
||||
print(f" {name_a}: {total_time_a:.2f}秒")
|
||||
print(f" {name_b}: {total_time_b:.2f}秒")
|
||||
print(f" 差异: {total_time_a - total_time_b:.2f}秒")
|
||||
@@ -48,11 +48,11 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
|
||||
text_b_full = ' '.join(texts_b)
|
||||
similarity = SequenceMatcher(None, text_a_full, text_b_full).ratio()
|
||||
|
||||
print(f"\n【文本相似度】")
|
||||
print("\n【文本相似度】")
|
||||
print(f" 相似度: {similarity*100:.1f}%")
|
||||
|
||||
# 差异分析
|
||||
print(f"\n【详细差异】")
|
||||
print("\n【详细差异】")
|
||||
|
||||
# 按时间对齐对比
|
||||
matched_diffs = []
|
||||
@@ -98,7 +98,7 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
|
||||
if len(matched_diffs) > 10:
|
||||
print(f"\n ... 还有 {len(matched_diffs) - 10} 处差异")
|
||||
else:
|
||||
print(f" ✓ 无显著文本差异")
|
||||
print(" ✓ 无显著文本差异")
|
||||
|
||||
return {
|
||||
'segments_diff': len(seg_a) - len(seg_b),
|
||||
@@ -122,10 +122,10 @@ def main():
|
||||
|
||||
# 方案基本信息
|
||||
print("【测试方案】")
|
||||
print(f" 方案A: faster-whisper small CPU")
|
||||
print(f" 方案B: OpenAI whisper small CPU")
|
||||
print(f" 方案D: OpenAI whisper medium CPU")
|
||||
print(f" 方案C/E: MPS失败(不支持)")
|
||||
print(" 方案A: faster-whisper small CPU")
|
||||
print(" 方案B: OpenAI whisper small CPU")
|
||||
print(" 方案D: OpenAI whisper medium CPU")
|
||||
print(" 方案C/E: MPS失败(不支持)")
|
||||
print()
|
||||
|
||||
# 三组对比
|
||||
@@ -142,16 +142,16 @@ def main():
|
||||
print("="*60)
|
||||
|
||||
print("\n【Segments数量】")
|
||||
print(f" 方案A: 77 segments (最多)")
|
||||
print(f" 方案B: 74 segments")
|
||||
print(f" 方案D: 74 segments")
|
||||
print(f" 结论: faster-whisper分割更细(+3 segments)")
|
||||
print(" 方案A: 77 segments (最多)")
|
||||
print(" 方案B: 74 segments")
|
||||
print(" 方案D: 74 segments")
|
||||
print(" 结论: faster-whisper分割更细(+3 segments)")
|
||||
|
||||
print("\n【文本相似度】")
|
||||
print(f" A vs B: {results['A_vs_B']['similarity']*100:.1f}%")
|
||||
print(f" A vs D: {results['A_vs_D']['similarity']*100:.1f}%")
|
||||
print(f" B vs D: {results['B_vs_D']['similarity']*100:.1f}%")
|
||||
print(f" 结论: 三个方案文本高度相似")
|
||||
print(" 结论: 三个方案文本高度相似")
|
||||
|
||||
print("\n【文本差异统计】")
|
||||
print(f" A vs B: {results['A_vs_B']['text_diffs']}处差异")
|
||||
@@ -159,9 +159,9 @@ def main():
|
||||
print(f" B vs D: {results['B_vs_D']['text_diffs']}处差异")
|
||||
|
||||
print("\n【方案D(medium)vs 方案B(small)】")
|
||||
print(f" Segments数量相同: 74条")
|
||||
print(" Segments数量相同: 74条")
|
||||
print(f" 文本相似度: {results['B_vs_D']['similarity']*100:.1f}%")
|
||||
print(f" 结论: medium模型无明显提升")
|
||||
print(" 结论: medium模型无明显提升")
|
||||
|
||||
print()
|
||||
print("="*60)
|
||||
|
||||
131
scripts/compare_segmentation.py
Normal file
131
scripts/compare_segmentation.py
Normal file
@@ -0,0 +1,131 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
POC: Compare silence-based segmentation vs CUT-based segmentation for ASR.
|
||||
|
||||
Tests a short video segment and reports:
|
||||
1. Number of segments from each method
|
||||
2. Segment boundaries
|
||||
3. ASR quality comparison (WER estimate)
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
VIDEO_PATH = sys.argv[1] if len(sys.argv) > 1 else "/Users/accusys/test_video/Old_Time_Movie_Show_-_Charade_1963.HD.mov"
|
||||
DURATION = 300 # Test first 5 minutes only
|
||||
|
||||
model = WhisperModel("small", device="cpu", compute_type="int8")
|
||||
|
||||
def extract_audio_segment(start, end, out_wav):
    """Extract part of VIDEO_PATH's audio track to a 16 kHz mono WAV file.

    Args:
        start: Segment start time in seconds.
        end: Segment end time in seconds.
        out_wav: Path of the WAV file to (over)write.

    Returns:
        True if ffmpeg left an output file larger than 100 bytes,
        False otherwise (including when no file was produced at all).
    """
    cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", VIDEO_PATH,
           "-ss", str(start), "-to", str(end),
           "-ar", "16000", "-ac", "1", out_wav]
    # check=False: a failed extraction is reported through the return value,
    # so callers can simply skip the segment.
    subprocess.run(cmd, check=False, capture_output=True)
    try:
        return os.path.getsize(out_wav) > 100
    except OSError:
        # ffmpeg exited without creating the file (bad input, empty range, ...).
        return False
|
||||
|
||||
def transcribe(wav_path):
    """Transcribe one WAV file with the module-level Whisper model.

    Returns:
        A ``(segments, info)`` tuple; the segments returned by the model
        are materialised into a list before being handed back.
    """
    vad_options = {"min_silence_duration_ms": 500, "speech_pad_ms": 200}
    segment_iter, info = model.transcribe(
        wav_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters=vad_options,
    )
    return list(segment_iter), info
|
||||
|
||||
# === Method 1: CUT-based segmentation ===
# Transcribe each scene listed in the pre-computed .cut.json file separately.
print("=" * 60)
print("METHOD 1: CUT-based segmentation")
print("=" * 60)
cut_path = "/Users/accusys/momentry/output_dev/417a7e93860d70c87aee6c4c1b715d70.cut.json"
cut_scenes = []
if os.path.exists(cut_path):
    with open(cut_path, encoding="utf-8") as f:
        data = json.load(f)
    # Keep only scenes that start inside the tested window.
    cut_scenes = [
        (s["start_time"], s["end_time"])
        for s in data.get("scenes", [])
        if s["start_time"] < DURATION
    ]
else:
    # Say so explicitly instead of silently reporting zero scenes below.
    print(f" CUT file not found: {cut_path}")
print(f" Scenes in first {DURATION}s: {len(cut_scenes)}")

# Scratch directory shared with the later sections of the script.
tmpdir = tempfile.mkdtemp(prefix="seg_compare_")
t1 = time.time()
cut_segments = []
total_chars = 0
for idx, (st, et) in enumerate(cut_scenes):
    wav = os.path.join(tmpdir, f"cut_{idx:04d}.wav")
    if not extract_audio_segment(st, et, wav):
        continue
    segs, _ = transcribe(wav)
    for s in segs:
        # Segment times are relative to the extracted clip; shift them back
        # onto the timeline of the full video.
        cut_segments.append({"start": st + s.start, "end": st + s.end, "text": s.text})
        total_chars += len(s.text)
cut_time = time.time() - t1
print(f" Segments: {len(cut_segments)}, Total chars: {total_chars}, Time: {cut_time:.1f}s")
print(f" Avg segment duration: {DURATION/len(cut_segments):.1f}s" if cut_segments else "")
|
||||
|
||||
# === Method 2: Silence-based segmentation (ffmpeg silencedetect) ===
print()
print("=" * 60)
print("METHOD 2: Silence-based segmentation (ffmpeg silencedetect)")
print("=" * 60)

# Extract the audio of the whole tested window once.
full_wav = os.path.join(tmpdir, "full_audio.wav")
extract_audio_segment(0, DURATION, full_wav)

# Use ffmpeg silencedetect to find speech segments
t2 = time.time()
detect_cmd = ["ffmpeg", "-i", full_wav, "-af", "silencedetect=noise=-30dB:d=0.5", "-f", "null", "-"]
result = subprocess.run(detect_cmd, capture_output=True, text=True)
stderr = result.stderr  # the silence_start / silence_end lines are parsed from stderr

# Parse silencedetect output
silence_starts = []
silence_ends = []
for line in stderr.split("\n"):
    if "silence_start:" in line:
        silence_starts.append(float(line.split("silence_start:")[1].strip()))
    elif "silence_end:" in line:
        # The end line carries "| silence_duration: ..." after the timestamp.
        silence_ends.append(float(line.split("silence_end:")[1].split("|")[0].strip()))

# A silence still open when the audio ends may have no matching silence_end
# line. Treat it as lasting until DURATION so the trailing silence is not
# turned into a speech segment.
missing_ends = len(silence_starts) - len(silence_ends)
paired_ends = silence_ends + [float(DURATION)] * missing_ends

# Build speech segments: gaps between silence periods
speech_segments = []
last_end = 0.0
for ss, se in zip(silence_starts, paired_ends):
    # Ignore gaps of half a second or less between silences.
    if ss > last_end + 0.5:
        speech_segments.append((last_end, ss))
    last_end = se
if last_end < DURATION:
    speech_segments.append((last_end, DURATION))

print(f" Silence periods detected: {len(silence_starts)}")
print(f" Speech segments: {len(speech_segments)}")

# Transcribe each speech segment
silence_segments = []
total_chars2 = 0
for idx, (st, et) in enumerate(speech_segments):
    wav = os.path.join(tmpdir, f"sil_{idx:04d}.wav")
    if not extract_audio_segment(st, et, wav):
        continue
    segs, _ = transcribe(wav)
    for s in segs:
        # Shift clip-relative times back onto the full-video timeline.
        silence_segments.append({"start": st + s.start, "end": st + s.end, "text": s.text})
        total_chars2 += len(s.text)
silence_time = time.time() - t2
print(f" Segments: {len(silence_segments)}, Total chars: {total_chars2}, Time: {silence_time:.1f}s")
|
||||
|
||||
# === Comparison ===
# Print a side-by-side summary of both methods, then remove the scratch files.
import shutil

rule = "=" * 60
print()
print(rule)
print("COMPARISON")
print(rule)
print(f"{'Metric':<30} {'CUT-based':<15} {'Silence-based':<15}")
print("-" * 60)

# (label, CUT-based value, silence-based value, column format spec)
summary_rows = [
    ("Number of audio segments", len(cut_scenes), len(speech_segments), "<15"),
    ("Number of ASR segments", len(cut_segments), len(silence_segments), "<15"),
    ("Total chars recognized", total_chars, total_chars2, "<15"),
    ("Processing time (s)", cut_time, silence_time, "<15.1f"),
]
for label, cut_value, silence_value, spec in summary_rows:
    print(f"{label:<30} {cut_value:{spec}} {silence_value:{spec}}")

# Cleanup
shutil.rmtree(tmpdir, ignore_errors=True)
print()
print("Done.")
|
||||
@@ -13,7 +13,6 @@ OUTPUT_DIR = f"output/{UUID}/florence2_results"
|
||||
# These are placeholders - I need to re-run to get the exact boxes if they weren't printed.
|
||||
# Since I saw the logs, I know it found them.
|
||||
# But I need the exact coordinates. Let's run a detection script that crops them immediately.
|
||||
import torch
|
||||
import types
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
@@ -6,7 +6,6 @@ Crop the detected stamp from the 112:36 frame (with Patch).
|
||||
from PIL import Image
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import types
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
|
||||
@@ -140,7 +140,7 @@ def main():
|
||||
|
||||
video_stream = next((s for s in video_info["streams"] if s["codec_type"] == "video"), None)
|
||||
|
||||
print(f"\n测试视频:")
|
||||
print("\n测试视频:")
|
||||
print(f" 文件: {int(video_info['format'].get('size', 0)) / 1024 / 1024:.1f} MB")
|
||||
print(f" 时长: {float(video_info['format'].get('duration', 0)):.1f} 秒")
|
||||
print(f" 分辨率: {video_stream.get('width', 0)}x{video_stream.get('height', 0)}")
|
||||
@@ -188,7 +188,7 @@ def main():
|
||||
"file_size_kb": result["file_size_kb"],
|
||||
})
|
||||
|
||||
print(f"\n✅ 处理完成:")
|
||||
print("\n✅ 处理完成:")
|
||||
print(f" 时间: {result['elapsed_time']:.2f}秒")
|
||||
print(f" 内存峰值: {result['peak_memory_mb']:.1f} MB")
|
||||
print(f" 检测场景数: {result['total_scenes']}")
|
||||
@@ -223,7 +223,7 @@ def main():
|
||||
print(f"{'=' * 80}")
|
||||
|
||||
print("\n【对比总结】")
|
||||
print(f"\n| 方案 | 脚本 | 时间(秒) | 内存(MB) | 场景数 | 平均时长(秒) |")
|
||||
print("\n| 方案 | 脚本 | 时间(秒) | 内存(MB) | 场景数 | 平均时长(秒) |")
|
||||
print("|------|------|---------|---------|--------|-------------|")
|
||||
|
||||
for r in results:
|
||||
|
||||
@@ -4,7 +4,6 @@ Debug script to test face registration with same arguments Rust uses
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Simulate what Rust would call
|
||||
|
||||
@@ -7,7 +7,6 @@ Deep Analysis of 112:36 Frame
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import types
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
@@ -149,7 +148,7 @@ try:
|
||||
2,
|
||||
)
|
||||
else:
|
||||
print(f" ❌ Not found.")
|
||||
print(" ❌ Not found.")
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Error: {e}")
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ Momentry Core Visual Demo Dashboard
|
||||
職責:提供處理器模組的視覺化預覽,支持時間軸檢查與多模組疊加顯示。
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import json
|
||||
import cv2
|
||||
|
||||
@@ -6,7 +6,6 @@ Demonstrate face learning capability
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
# Add script directory to path
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, Tuple
|
||||
import re
|
||||
|
||||
# 簡單的語言檢測規則(可擴展)
|
||||
|
||||
@@ -5,7 +5,6 @@ Detect and Crop Envelopes/Objects in Keyframes
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import types
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
@@ -7,7 +7,6 @@ Export Person Thumbnails
|
||||
import cv2
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
# 設定
|
||||
OUTPUT_DIR = "output/quick_preview"
|
||||
|
||||
@@ -4,8 +4,6 @@
|
||||
"""
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
@@ -247,7 +245,7 @@ def create_female_faces_report(female_frames_info, output_dir="/tmp/female_faces
|
||||
f"- `{os.path.basename(info['thumbnail'])}` - 縮略圖(800px寬)\n"
|
||||
)
|
||||
|
||||
f.write(f"- `female_faces_report.md` - 本報告文件\n\n")
|
||||
f.write("- `female_faces_report.md` - 本報告文件\n\n")
|
||||
|
||||
f.write("## 🔍 分析說明\n\n")
|
||||
f.write("1. **邊界框顏色**: 粉色 (RGB: 255,105,180) 表示女性人臉\n")
|
||||
@@ -332,20 +330,20 @@ def main():
|
||||
info for info in female_frames_info if info["female_count"] == max_females
|
||||
][0]
|
||||
|
||||
print(f"📊 統計摘要:")
|
||||
print("📊 統計摘要:")
|
||||
print(f" - 總分析畫面: {len(female_frames_info)}")
|
||||
print(f" - 女性最多畫面: 幀 {max_frame_info['frame_number']}")
|
||||
print(f" - 女性數量: {max_females} 人")
|
||||
print(f" - 時間位置: {max_frame_info['timestamp_formatted']}")
|
||||
print()
|
||||
|
||||
print(f"📁 生成文件:")
|
||||
print("📁 生成文件:")
|
||||
print(f" - 標記圖像: {output_dir}/female_faces_frame_*.jpg")
|
||||
print(f" - 縮略圖: {output_dir}/female_faces_frame_*_thumbnail.jpg")
|
||||
print(f" - 分析報告: {report_path}")
|
||||
print()
|
||||
|
||||
print(f"🔍 查看結果:")
|
||||
print("🔍 查看結果:")
|
||||
print(f" ls -la {output_dir}/")
|
||||
print(f" open {output_dir}/female_faces_report.md")
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ import sys
|
||||
import json
|
||||
import time
|
||||
import subprocess
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
@@ -230,7 +229,7 @@ def main():
|
||||
sys.exit(1)
|
||||
|
||||
video_info = get_video_info(video_path)
|
||||
print(f"\n测试视频:")
|
||||
print("\n测试视频:")
|
||||
print(f" UUID: {video_uuid}")
|
||||
print(f" 文件: {video_info.get('size_mb', 0):.1f} MB")
|
||||
print(f" 时长: {video_info.get('duration', 0):.1f} 秒")
|
||||
@@ -286,7 +285,7 @@ def main():
|
||||
"has_landmarks": result["has_landmarks"]
|
||||
})
|
||||
|
||||
print(f"\n✅ 处理完成:")
|
||||
print("\n✅ 处理完成:")
|
||||
print(f" 时间: {result['elapsed_time']:.2f}秒")
|
||||
print(f" 速度: {speed:.2f}x 实时倍速")
|
||||
print(f" 内存峰值: {result['peak_memory_mb']:.1f} MB")
|
||||
@@ -324,7 +323,7 @@ def main():
|
||||
print(f"{'=' * 80}")
|
||||
|
||||
print("\n【对比总结】")
|
||||
print(f"\n| 方案 | 脚本 | 时间(秒) | 速度 | 内存(MB) | 人脸数 | Embedding |")
|
||||
print("\n| 方案 | 脚本 | 时间(秒) | 速度 | 内存(MB) | 人脸数 | Embedding |")
|
||||
print("|------|------|---------|------|---------|--------|-----------|")
|
||||
|
||||
for r in results:
|
||||
|
||||
@@ -5,9 +5,7 @@ Face Detection Count Comparison
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
def load_results(filepath):
|
||||
"""加载检测结果"""
|
||||
@@ -172,7 +170,7 @@ def main():
|
||||
|
||||
stats = analyze_detection_distribution(results_a, results_b, results_c)
|
||||
|
||||
print(f"| 版本 | 总人脸数 | 检测帧数 | 有人脸帧 | 无人脸帧 | 平均每帧 | 最多人脸 |")
|
||||
print("| 版本 | 总人脸数 | 检测帧数 | 有人脸帧 | 无人脸帧 | 平均每帧 | 最多人脸 |")
|
||||
print("|------|---------|---------|---------|---------|---------|---------|")
|
||||
|
||||
for name, s in stats.items():
|
||||
@@ -187,14 +185,14 @@ def main():
|
||||
print(f"共有 {len(comparison)} 帧检测数量不同")
|
||||
print()
|
||||
|
||||
print(f"| 帧号 | 时间(秒) | InsightFace | MediaPipe | OpenCV | 最大差异 |")
|
||||
print("| 帧号 | 时间(秒) | InsightFace | MediaPipe | OpenCV | 最大差异 |")
|
||||
print("|------|---------|------------|----------|--------|---------|")
|
||||
|
||||
for item in comparison[:30]: # 只显示前30帧
|
||||
print(f"| {item['frame']} | {item['timestamp']:.2f} | {item['insightface']} | {item['mediapipe']} | {item['opencv']} | {item['diff']} |")
|
||||
|
||||
if len(comparison) > 30:
|
||||
print(f"| ... | ... | ... | ... | ... | ... |")
|
||||
print("| ... | ... | ... | ... | ... | ... |")
|
||||
print(f"| 共 {len(comparison)} 帧有差异 |")
|
||||
|
||||
print()
|
||||
@@ -212,7 +210,7 @@ def main():
|
||||
|
||||
if mediapipe_missed:
|
||||
print("MediaPipe漏检详情(前10帧):")
|
||||
print(f"| 帧号 | InsightFace检测 | OpenCV检测 |")
|
||||
print("| 帧号 | InsightFace检测 | OpenCV检测 |")
|
||||
print("|------|----------------|-----------|")
|
||||
for m in mediapipe_missed[:10]:
|
||||
print(f"| {m['frame']} | {m.get('insightface_count', m.get('others_count', '?'))} | {m.get('opencv_count', '?')} |")
|
||||
@@ -225,7 +223,7 @@ def main():
|
||||
|
||||
print(f"以InsightFace为基准({baseline}张人脸):")
|
||||
print()
|
||||
print(f"| 版本 | 检测数 | 检测率 | 漏检数 |")
|
||||
print("| 版本 | 检测数 | 检测率 | 漏检数 |")
|
||||
print("|------|--------|--------|--------|")
|
||||
|
||||
for name, s in stats.items():
|
||||
|
||||
@@ -38,7 +38,7 @@ def extract_face_embeddings(uuid: str, video_path: str):
|
||||
return {}
|
||||
|
||||
# 1. 加載 Face JSON 數據
|
||||
face_path = os.path.join(OUTPUT_DIR, "quick_preview", f"preview.face.json")
|
||||
face_path = os.path.join(OUTPUT_DIR, "quick_preview", "preview.face.json")
|
||||
if not os.path.exists(face_path):
|
||||
print(f" [Skip] No Face data for {uuid}")
|
||||
return {}
|
||||
@@ -119,7 +119,7 @@ def extract_face_embeddings(uuid: str, video_path: str):
|
||||
)
|
||||
if result:
|
||||
embeddings.append(np.array(result[0]["embedding"]))
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
# 忽略無法識別的臉部
|
||||
pass
|
||||
|
||||
|
||||
@@ -21,7 +21,6 @@ import os
|
||||
import time
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
from resume_framework import ResumeFramework, format_time, print_progress
|
||||
from utils.pose_analyzer import calculate_pose_angle_v2
|
||||
|
||||
@@ -141,7 +140,7 @@ def process_face(
|
||||
print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
|
||||
print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
|
||||
print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
|
||||
print(f"Detection method: InsightFace (REQUIRED)")
|
||||
print("Detection method: InsightFace (REQUIRED)")
|
||||
print()
|
||||
|
||||
while True:
|
||||
@@ -199,7 +198,7 @@ def process_face(
|
||||
"pitch": pose_result.get("pitch", "neutral"),
|
||||
"features": pose_result.get("features", {}),
|
||||
}
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
face_list.append(
|
||||
@@ -255,6 +254,45 @@ def process_face(
|
||||
return face_data
|
||||
|
||||
|
||||
def _convert_to_face_result(face_data: dict) -> dict:
    """Convert ResumeFramework output to FaceResult format expected by Rust.

    Args:
        face_data: Mapping with an optional ``"metadata"`` dict (only ``"fps"``
            is read) and an optional ``"frames"`` dict whose keys are
            integer-like and whose values carry ``"frame_number"``,
            ``"time_seconds"`` and an optional ``"faces"`` list.

    Returns:
        A dict with ``"frame_count"``, ``"fps"`` (30.0 when metadata has no
        ``"fps"``) and ``"frames"``: a list ordered by numeric frame key, each
        entry holding ``"frame"``, ``"timestamp"`` and ``"faces"``.

    Raises:
        KeyError: if a frame lacks ``"frame_number"``/``"time_seconds"`` or a
            face lacks any of ``"x"``, ``"y"``, ``"width"``, ``"height"``.
        ValueError: if a frame key cannot be converted with ``int()``.
    """
    # `or {}` also covers keys that are present but explicitly None (e.g. a
    # JSON null), which `.get(key, {})` alone would let through and crash on.
    metadata = face_data.get("metadata") or {}
    raw_frames = face_data.get("frames") or {}
    fps = metadata.get("fps", 30.0)

    frames = []
    # Keys are sorted numerically so that "10" comes after "2".
    for frame_key in sorted(raw_frames, key=int):
        raw_frame = raw_frames[frame_key]
        faces = []
        for raw_face in raw_frame.get("faces") or []:
            attributes = raw_face.get("attributes") or {}
            faces.append(
                {
                    # Always emitted as None by this converter.
                    "face_id": None,
                    "x": raw_face["x"],
                    "y": raw_face["y"],
                    "width": raw_face["width"],
                    "height": raw_face["height"],
                    "confidence": raw_face.get("confidence", 0.0),
                    "embedding": raw_face.get("embedding"),
                    "landmarks": raw_face.get("landmarks"),
                    "attributes": {
                        "age": attributes.get("age"),
                        "gender": attributes.get("gender"),
                    },
                }
            )
        frames.append(
            {
                "frame": raw_frame["frame_number"],
                "timestamp": raw_frame["time_seconds"],
                "faces": faces,
            }
        )

    return {
        "frame_count": len(frames),
        "fps": fps,
        "frames": frames,
    }
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Face Detection & Demographics with Resume Support")
|
||||
parser.add_argument("video_path", help="Path to video file")
|
||||
@@ -285,11 +323,11 @@ if __name__ == "__main__":
|
||||
"-s",
|
||||
help="Frame sample interval",
|
||||
type=int,
|
||||
default=30,
|
||||
default=5,
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
process_face(
|
||||
result = process_face(
|
||||
args.video_path,
|
||||
args.output_path,
|
||||
args.uuid,
|
||||
@@ -297,4 +335,7 @@ if __name__ == "__main__":
|
||||
args.auto_save_frames,
|
||||
args.force_restart,
|
||||
args.sample_interval,
|
||||
)
|
||||
)
|
||||
face_result = _convert_to_face_result(result)
|
||||
with open(args.output_path, "w") as f:
|
||||
json.dump(face_result, f, indent=2)
|
||||
@@ -18,7 +18,7 @@ import os
|
||||
import signal
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
@@ -108,7 +108,7 @@ class MediaPipeFaceDetector:
|
||||
print(f"[Face] Using fallback model: {alt_path}")
|
||||
return alt_path
|
||||
|
||||
raise RuntimeError(f"Could not download MediaPipe model from any source")
|
||||
raise RuntimeError("Could not download MediaPipe model from any source")
|
||||
|
||||
return model_path
|
||||
|
||||
|
||||
@@ -9,10 +9,8 @@ import sys
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
import numpy as np
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
import uuid
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
@@ -8,7 +8,6 @@ import sys
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
import numpy as np
|
||||
import time
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
@@ -176,7 +175,7 @@ class FaceRegistration:
|
||||
}
|
||||
|
||||
if len(faces) > 1:
|
||||
print(f"[WARNING] Multiple faces detected, using the first one")
|
||||
print("[WARNING] Multiple faces detected, using the first one")
|
||||
|
||||
# Use the first face
|
||||
face = faces[0]
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
"""
|
||||
|
||||
import psycopg2
|
||||
import json
|
||||
from datetime import datetime
|
||||
import sys
|
||||
|
||||
@@ -235,7 +234,7 @@ def main():
|
||||
with open("/tmp/face_statistics_report.txt", "w") as f:
|
||||
f.write(report)
|
||||
|
||||
print(f"\n報告已保存到: /tmp/face_statistics_report.txt")
|
||||
print("\n報告已保存到: /tmp/face_statistics_report.txt")
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ 生成報告時出錯: {e}")
|
||||
|
||||
@@ -74,7 +74,7 @@ def main():
|
||||
|
||||
total_faces = sum(len(faces) for faces in faces_map.values())
|
||||
print(f"✅ Indexed {len(faces_map)} frames, containing {total_faces} faces.")
|
||||
print(f"🚀 Starting Linear Video Scan...")
|
||||
print("🚀 Starting Linear Video Scan...")
|
||||
|
||||
# 2. 線性掃描
|
||||
video_path = VIDEO_PATH # 使用區域變數避免 global 問題
|
||||
@@ -138,7 +138,7 @@ def main():
|
||||
face_refs.append(
|
||||
{"frame_idx": current_frame, "face_idx": face_idx}
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
processed_frames += 1
|
||||
|
||||
@@ -220,7 +220,7 @@ for sec in range(0, total_sec, FRAME_INTERVAL):
|
||||
print(
|
||||
f" 🎯 {sec}s | {term} | {s:.2f} | {int(orig_w)}x{int(orig_h)}px"
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Save annotated frame if stamps found
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
import sys
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
|
||||
def run_test(script_name, description):
|
||||
@@ -50,7 +49,7 @@ def check_server_status():
|
||||
|
||||
response = requests.get("http://localhost:3002/health", timeout=5)
|
||||
if response.status_code == 200:
|
||||
print(f"✅ 生產服務器運行正常 (端口 3002)")
|
||||
print("✅ 生產服務器運行正常 (端口 3002)")
|
||||
return True
|
||||
else:
|
||||
print(f"❌ 生產服務器異常: {response.status_code}")
|
||||
@@ -63,7 +62,7 @@ def check_server_status():
|
||||
|
||||
response = requests.get("http://localhost:3003/health", timeout=5)
|
||||
if response.status_code == 200:
|
||||
print(f"✅ 開發服務器運行正常 (端口 3003)")
|
||||
print("✅ 開發服務器運行正常 (端口 3003)")
|
||||
return True
|
||||
else:
|
||||
print(f"❌ 開發服務器異常: {response.status_code}")
|
||||
@@ -100,7 +99,7 @@ def check_database():
|
||||
""")
|
||||
|
||||
tables = cursor.fetchall()
|
||||
print(f"✅ 數據庫連接正常")
|
||||
print("✅ 數據庫連接正常")
|
||||
print(f"✅ 找到 {len(tables)} 個人臉相關表:")
|
||||
for table in tables:
|
||||
print(f" - {table[0]}")
|
||||
|
||||
@@ -6,7 +6,6 @@ Heuristic: Kids have a larger head relative to their body height (approx 1:5 or
|
||||
|
||||
import json
|
||||
import math
|
||||
import sys
|
||||
|
||||
# Configuration
|
||||
POSE_JSON_PATH = "output/384b0ff44aaaa1f1/384b0ff44aaaa1f1.pose.json"
|
||||
@@ -161,7 +160,7 @@ def find_kids():
|
||||
# Sort by timestamp
|
||||
sorted_kids = sorted(unique_kids.values(), key=lambda x: x['timestamp'])
|
||||
|
||||
print(f"\nUnique potential kid detections (timestamps):")
|
||||
print("\nUnique potential kid detections (timestamps):")
|
||||
for k in sorted_kids:
|
||||
print(f" -> Timestamp: {k['timestamp']:.2f}s | Ratio: {k['ratio']}")
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ Filters:
|
||||
|
||||
import json
|
||||
import math
|
||||
import sys
|
||||
import os
|
||||
|
||||
POSE_JSON_PATH = "output/384b0ff44aaaa1f1/384b0ff44aaaa1f1.pose.json"
|
||||
@@ -133,7 +132,7 @@ def find_kids():
|
||||
|
||||
sorted_kids = sorted(unique_kids.values(), key=lambda x: x["timestamp"])
|
||||
|
||||
print(f"\nRefined Timestamps:")
|
||||
print("\nRefined Timestamps:")
|
||||
for k in sorted_kids:
|
||||
print(
|
||||
f" ⏱️ {k['timestamp']:.2f}s | Ratio: {k['ratio']} | Width: {k['shoulder_width']}px | Conf: {k['confidence']}"
|
||||
|
||||
@@ -5,7 +5,6 @@ Search for magnifying glass in key stamp scenes using OWL-ViT
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import json
|
||||
from PIL import Image
|
||||
import torch
|
||||
from transformers import OwlViTProcessor, OwlViTForObjectDetection
|
||||
|
||||
@@ -17,7 +17,7 @@ os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
# Scan frames at 5-minute intervals throughout the 2-hour video
|
||||
TIMESTAMPS = list(range(0, 6879, 300)) # Every 5 minutes
|
||||
|
||||
print(f"📽️ Loading Florence-2 model...")
|
||||
print("📽️ Loading Florence-2 model...")
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
"microsoft/Florence-2-base", trust_remote_code=True
|
||||
)
|
||||
|
||||
@@ -148,7 +148,7 @@ def generate_summary_report():
|
||||
fastest_scheme = fastest.get('file_info', {}).get('scheme_id', 'unknown')
|
||||
fastest_time = fastest.get('metrics', {}).get('processing_time_seconds', 0)
|
||||
|
||||
lines.append(f"### Performance Comparison")
|
||||
lines.append("### Performance Comparison")
|
||||
lines.append("")
|
||||
lines.append(f"- **Fastest Scheme**: {fastest_scheme} ({fastest_time:.1f}s)")
|
||||
|
||||
@@ -169,7 +169,7 @@ def generate_summary_report():
|
||||
lines.append("")
|
||||
|
||||
if failed_tests:
|
||||
lines.append(f"### Failed Tests")
|
||||
lines.append("### Failed Tests")
|
||||
lines.append("")
|
||||
for result in failed_tests:
|
||||
scheme_id = result.get('file_info', {}).get('scheme_id', 'unknown')
|
||||
@@ -178,8 +178,8 @@ def generate_summary_report():
|
||||
|
||||
if 'MPS' in error_msg:
|
||||
lines.append(f"- **{scheme_id} ({scheme_name})**: MPS backend compatibility issue")
|
||||
lines.append(f" - PyTorch SparseMPS backend does not support `_sparse_coo_tensor_with_dims_and_tensors`")
|
||||
lines.append(f" - OpenAI whisper requires this operation for MPS device")
|
||||
lines.append(" - PyTorch SparseMPS backend does not support `_sparse_coo_tensor_with_dims_and_tensors`")
|
||||
lines.append(" - OpenAI whisper requires this operation for MPS device")
|
||||
|
||||
lines.append("")
|
||||
|
||||
|
||||
@@ -252,7 +252,6 @@ Summary: [2-3 sentence detailed summary connecting to scene]"""
|
||||
|
||||
def parse_5w1h_summary(result_text):
|
||||
"""Parse 5W1H and summary from LLM response"""
|
||||
import re
|
||||
|
||||
data = {
|
||||
"who": "",
|
||||
@@ -314,7 +313,6 @@ def update_chunk_summary(
|
||||
uuid=None,
|
||||
):
|
||||
"""Update chunk summary, 5W1H, identity, and visual in database"""
|
||||
import json
|
||||
|
||||
conn = psycopg2.connect(**DB_CONFIG)
|
||||
cur = conn.cursor()
|
||||
|
||||
@@ -203,7 +203,7 @@ def main():
|
||||
)
|
||||
|
||||
# Step 3: Generate summaries and insert
|
||||
print(f"\n🤖 Generating summaries with gemma4...")
|
||||
print("\n🤖 Generating summaries with gemma4...")
|
||||
inserted = insert_parent_chunks(scenes)
|
||||
|
||||
print(f"\n{'=' * 70}")
|
||||
|
||||
@@ -100,7 +100,7 @@ def check_server_health(api_url: str) -> bool:
|
||||
except requests.exceptions.ConnectionError:
|
||||
print(f"❌ Cannot connect to llama.cpp server at {api_url}")
|
||||
except requests.exceptions.Timeout:
|
||||
print(f"❌ Connection to llama.cpp server timed out")
|
||||
print("❌ Connection to llama.cpp server timed out")
|
||||
return False
|
||||
|
||||
|
||||
@@ -282,7 +282,7 @@ def main():
|
||||
# Check server health
|
||||
if not check_server_health(args.url):
|
||||
print("\n💡 Start llama.cpp server with:")
|
||||
print(f" llama-server --model <gemma4.gguf> --port 8081")
|
||||
print(" llama-server --model <gemma4.gguf> --port 8081")
|
||||
sys.exit(1)
|
||||
|
||||
# Prepare seed words
|
||||
|
||||
@@ -172,7 +172,7 @@ for idx, (sec, frame) in enumerate(candidate_frames):
|
||||
)
|
||||
|
||||
print(f" 🎯 {sec}s | {term} | {s:.2f} | {bw}x{bh}px")
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if found:
|
||||
|
||||
@@ -20,7 +20,6 @@ import argparse
|
||||
import os
|
||||
import numpy as np
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from datetime import datetime
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
@@ -7,7 +7,6 @@ Face + ASRX 整合處理器
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
@@ -194,7 +193,7 @@ def integrate_face_asrx(face_path, asrx_path, output_path, time_threshold=1.0):
|
||||
f" With face: {speaker['with_face']} ({speaker['with_face'] / speaker['segment_count'] * 100:.0f}%)"
|
||||
)
|
||||
|
||||
print(f"\n[Face-ASRX] Integration complete!")
|
||||
print("\n[Face-ASRX] Integration complete!")
|
||||
|
||||
|
||||
def main():
|
||||
|
||||
@@ -15,13 +15,10 @@ Output:
|
||||
- Integrated action data with all body parts
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
from typing import Dict, List
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class IntegratedBodyActionDecoder:
|
||||
|
||||
@@ -297,10 +297,10 @@ def main():
|
||||
print(f" 檔案路徑: {result['file_path']}")
|
||||
print(f" 檔案存在: {result['file_exists']}")
|
||||
if result.get("fallback_used"):
|
||||
print(f" 使用了回退: 是")
|
||||
print(" 使用了回退: 是")
|
||||
print(f" 回退原因: {result.get('fallback_reason', '未知')}")
|
||||
else:
|
||||
print(f" 使用了回退: 否")
|
||||
print(" 使用了回退: 否")
|
||||
print(f" 可用語言: {', '.join(result['available_languages'])}")
|
||||
else:
|
||||
if result["file_exists"]:
|
||||
|
||||
@@ -235,7 +235,7 @@ def process_lip(
|
||||
)
|
||||
else:
|
||||
landmarks = None
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
landmarks = None
|
||||
|
||||
if landmarks is not None and len(landmarks) >= 468:
|
||||
|
||||
@@ -10,7 +10,6 @@ import argparse
|
||||
import os
|
||||
import signal
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
@@ -10,7 +10,6 @@ import argparse
|
||||
import os
|
||||
import signal
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
@@ -6,7 +6,6 @@ Extracts frames at 1fps around key dialogue moments for thorough analysis.
|
||||
|
||||
import cv2
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
UUID = "384b0ff44aaaa1f1"
|
||||
VIDEO_PATH = f"output/{UUID}/{UUID}.mp4"
|
||||
|
||||
@@ -14,7 +14,6 @@ Usage:
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
import os
|
||||
|
||||
@@ -313,7 +312,7 @@ def analyze_match_results(results):
|
||||
print(f" Is Match: {r['is_match']}")
|
||||
|
||||
if r['strategy'] == 'combined':
|
||||
print(f" Details:")
|
||||
print(" Details:")
|
||||
print(f" Best Match: {r['best_match']:.4f}")
|
||||
print(f" Vote Ratio: {r['vote_ratio']:.2%}")
|
||||
print(f" Weighted Sim: {r['weighted_sim']:.4f}")
|
||||
@@ -408,7 +407,7 @@ def main():
|
||||
print("❌ No embedding in first face")
|
||||
return
|
||||
|
||||
print(f"\n🔧 Matching first face...")
|
||||
print("\n🔧 Matching first face...")
|
||||
match_result = match_face_to_identity(
|
||||
detected_embedding=embedding,
|
||||
identity_uuid=identity_uuid,
|
||||
@@ -419,7 +418,7 @@ def main():
|
||||
)
|
||||
|
||||
if match_result:
|
||||
print(f"\n✅ Match Result:")
|
||||
print("\n✅ Match Result:")
|
||||
print(f" Identity: {match_result['identity_name']}")
|
||||
print(f" Strategy: {match_result['strategy']}")
|
||||
print(f" Is Match: {match_result['is_match']}")
|
||||
|
||||
@@ -19,7 +19,6 @@ Usage:
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
import os
|
||||
import sys
|
||||
@@ -424,7 +423,7 @@ def analyze_pose_match_results(results):
|
||||
for angle, threshold in adaptive_thresholds_used.items():
|
||||
print(f"{angle}: {threshold:.2f}")
|
||||
|
||||
print(f"\n=== Angle Match Types ===")
|
||||
print("\n=== Angle Match Types ===")
|
||||
print(f"{angle_match_types}")
|
||||
|
||||
# Top 5 details
|
||||
@@ -528,7 +527,7 @@ def main():
|
||||
pose_features = match_result.get("pose_features", {})
|
||||
ratio_str = f"{pose_ratio:.3f}" if pose_ratio else f"{pose_features.get('nose_to_eye_ratio', 'N/A')}"
|
||||
|
||||
print(f"\n✅ Result:")
|
||||
print("\n✅ Result:")
|
||||
print(f" Pose: {match_result['pose_angle']} (ratio: {ratio_str})")
|
||||
print(f" Similarity: {match_result['best_similarity']:.4f}")
|
||||
print(f" Match: {match_result['is_match']}")
|
||||
|
||||
@@ -43,15 +43,12 @@ Output structure:
|
||||
}
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
import cv2
|
||||
import numpy as np
|
||||
import mediapipe as mp
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
from collections import defaultdict
|
||||
from typing import Dict
|
||||
|
||||
|
||||
class MediaPipeHolisticProcessor:
|
||||
|
||||
@@ -150,7 +150,7 @@ def migrate_results():
|
||||
migrated_count += 1
|
||||
print(f" ✅ Migrated {total_faces} faces")
|
||||
else:
|
||||
print(f" ⚠️ Already exists, skipping")
|
||||
print(" ⚠️ Already exists, skipping")
|
||||
|
||||
# Commit changes
|
||||
conn.commit()
|
||||
@@ -193,7 +193,7 @@ def test_api_after_migration():
|
||||
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
print(f"✅ Success!")
|
||||
print("✅ Success!")
|
||||
print(f"Video UUID: {data.get('video_uuid')}")
|
||||
print(f"Total faces: {data.get('total_faces')}")
|
||||
print(f"Processing time: {data.get('processing_time_secs')}s")
|
||||
@@ -203,7 +203,7 @@ def test_api_after_migration():
|
||||
if isinstance(result_data, str):
|
||||
result_data = json.loads(result_data)
|
||||
|
||||
print(f"\n📊 Detailed results:")
|
||||
print("\n📊 Detailed results:")
|
||||
print(f" Frames with faces: {result_data.get('frames_with_faces')}")
|
||||
|
||||
gender_dist = result_data.get("gender_distribution", {})
|
||||
|
||||
@@ -9,9 +9,7 @@ Stage 3: Filter and rank results
|
||||
import os
|
||||
import cv2
|
||||
import json
|
||||
import glob
|
||||
import time
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import torch
|
||||
from transformers import OwlViTProcessor, OwlViTForObjectDetection
|
||||
@@ -123,7 +121,7 @@ for idx, (sec, frame_path) in enumerate(frames_to_process):
|
||||
],
|
||||
}
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not containers:
|
||||
@@ -226,7 +224,7 @@ for idx, (sec, frame_path) in enumerate(frames_to_process):
|
||||
print(
|
||||
f" 🎯 {sec}s | {stamp_term} | {s:.2f} | {int(orig_w)}x{int(orig_h)}px"
|
||||
)
|
||||
except Exception as e:
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# ─── Stage 3: Filter and rank ───
|
||||
|
||||
@@ -8,7 +8,6 @@ import librosa
|
||||
import numpy as np
|
||||
import os
|
||||
import json
|
||||
import matplotlib.pyplot as plt # Only for debug if needed, but we stick to console for now
|
||||
|
||||
# 設定
|
||||
UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
|
||||
@@ -29,7 +28,7 @@ def analyze_music_segmentation(audio_path):
|
||||
hop_length = int(1.0 * sr)
|
||||
chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
|
||||
|
||||
print(f"📊 Analyzing transitions...")
|
||||
print("📊 Analyzing transitions...")
|
||||
|
||||
# 2. 計算自我相似度矩陣 (Self-Similarity Matrix) - 優化版
|
||||
# 這裡我們簡化為計算相鄰片段的餘弦距離 (Cosine Distance)
|
||||
@@ -45,7 +44,6 @@ def analyze_music_segmentation(audio_path):
|
||||
|
||||
# 使用 librosa 的 onset_strength 的變體,但針對 Chroma
|
||||
# 這裡手動計算 Cosine Distance 以確保準確度
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
# 為了效能,我們不逐一計算,而是使用向量化的方法
|
||||
# 計算 frame[t] 和 frame[t+lag] 的差異
|
||||
@@ -127,12 +125,12 @@ if __name__ == "__main__":
|
||||
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
|
||||
json.dump({"music_segments": segments}, f, indent=2, ensure_ascii=False)
|
||||
|
||||
print(f"\n🎉 Analysis Complete!")
|
||||
print("\n🎉 Analysis Complete!")
|
||||
print(f"✅ Identified {len(segments)} music-based scenes.")
|
||||
print(f"💾 Saved to {OUTPUT_JSON}")
|
||||
|
||||
# 顯示結果
|
||||
print(f"\n🎶 Top Music Segments:")
|
||||
print("\n🎶 Top Music Segments:")
|
||||
for i, seg in enumerate(segments[:20]):
|
||||
m_s, s_s = divmod(seg["start_time"], 60)
|
||||
print(f" {i + 1:02d}. [{int(m_s):02d}:{s_s:05.2f}] - {seg['duration']}s")
|
||||
|
||||
@@ -173,7 +173,7 @@ def main():
|
||||
|
||||
video_stream = next((s for s in video_info["streams"] if s["codec_type"] == "video"), None)
|
||||
|
||||
print(f"\n测试视频:")
|
||||
print("\n测试视频:")
|
||||
print(f" 文件: {float(video_info['format'].get('size', 0)) / 1024 / 1024:.1f} MB")
|
||||
print(f" 时长: {float(video_info['format'].get('duration', 0)):.1f} 秒")
|
||||
print(f" 分辨率: {video_stream.get('width', 0)}x{video_stream.get('height', 0)}")
|
||||
@@ -229,7 +229,7 @@ def main():
|
||||
"file_size_kb": result["file_size_kb"],
|
||||
})
|
||||
|
||||
print(f"\n✅ 处理完成:")
|
||||
print("\n✅ 处理完成:")
|
||||
print(f" 时间: {result['elapsed_time']:.2f}秒")
|
||||
print(f" 内存峰值: {result['peak_memory_mb']:.1f} MB")
|
||||
print(f" 检测帧数: {result['total_frames']}")
|
||||
@@ -266,7 +266,7 @@ def main():
|
||||
print(f"{'=' * 80}")
|
||||
|
||||
print("\n【对比总结】")
|
||||
print(f"\n| 方案 | 脚本 | 语言 | 时间(秒) | 内存(MB) | 帧数 | 文字数 | 置信度 | 空帧率 |")
|
||||
print("\n| 方案 | 脚本 | 语言 | 时间(秒) | 内存(MB) | 帧数 | 文字数 | 置信度 | 空帧率 |")
|
||||
print("|------|------|------|---------|---------|------|--------|--------|--------|")
|
||||
|
||||
for r in results:
|
||||
|
||||
@@ -13,12 +13,9 @@ import sys
|
||||
import json
|
||||
import argparse
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
from resume_framework import ResumeFramework, format_time, print_progress
|
||||
|
||||
|
||||
|
||||
@@ -18,7 +18,7 @@ import os
|
||||
import signal
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict, List
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
@@ -17,10 +17,8 @@ import json
|
||||
import argparse
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
from resume_framework import ResumeFramework, format_time, print_progress
|
||||
|
||||
|
||||
|
||||
@@ -17,10 +17,9 @@ import os
|
||||
import signal
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Dict
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
from ultralytics import YOLO
|
||||
|
||||
|
||||
@@ -5,14 +5,13 @@ Refined Search for "Postage Stamp" in the Image
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import types
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
UUID = "384b0ff44aaaa1f1"
|
||||
OUTPUT_DIR = f"output/{UUID}/florence2_results"
|
||||
INPUT_IMG = os.path.join(OUTPUT_DIR, f"raw_6846.jpg")
|
||||
INPUT_IMG = os.path.join(OUTPUT_DIR, "raw_6846.jpg")
|
||||
|
||||
|
||||
# Patch for compatibility (Required for this environment)
|
||||
|
||||
@@ -185,7 +185,7 @@ def main():
|
||||
if update_parent_chunk(parent, analysis):
|
||||
success_count += 1
|
||||
else:
|
||||
print(f" ❌ Failed to generate analysis")
|
||||
print(" ❌ Failed to generate analysis")
|
||||
|
||||
print(f"\n{'=' * 70}")
|
||||
print(
|
||||
|
||||
@@ -4,8 +4,6 @@ Register sample faces to test the face recognition system
|
||||
"""
|
||||
|
||||
import requests
|
||||
import json
|
||||
import base64
|
||||
import os
|
||||
|
||||
# API configuration
|
||||
|
||||
@@ -41,7 +41,7 @@ import json
|
||||
import signal
|
||||
import time
|
||||
from datetime import datetime
|
||||
from typing import Dict, Optional, Tuple, Any, Callable
|
||||
from typing import Dict, Optional, Tuple, Callable
|
||||
|
||||
|
||||
class ResumeFramework:
|
||||
|
||||
@@ -5,7 +5,6 @@ Scan Multiple Frames for Stamps
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import types
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
@@ -6,7 +6,6 @@ Batch Scan Keyframes for SMALL red stamps
|
||||
import cv2
|
||||
import numpy as np
|
||||
import os
|
||||
import json
|
||||
|
||||
UUID = "384b0ff44aaaa1f1"
|
||||
BASE_DIR = f"output/{UUID}/florence2_results"
|
||||
@@ -93,4 +92,4 @@ for frame_name in FRAMES:
|
||||
res_name = f"result_opencv_{frame_name}"
|
||||
cv2.imwrite(os.path.join(BASE_DIR, res_name), img)
|
||||
else:
|
||||
print(f" ❌ No small stamps found.")
|
||||
print(" ❌ No small stamps found.")
|
||||
|
||||
@@ -230,7 +230,7 @@ class SceneClassifier:
|
||||
print("[SCENE] Places365 model loaded successfully (365 classes)")
|
||||
else:
|
||||
print(
|
||||
f"[SCENE] Places365 model not found, using ImageNet pretrained"
|
||||
"[SCENE] Places365 model not found, using ImageNet pretrained"
|
||||
)
|
||||
self.model = models.resnet18(pretrained=True)
|
||||
self.model_type = "imagenet"
|
||||
|
||||
@@ -85,7 +85,7 @@ for frame_name in FRAMES:
|
||||
cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 3)
|
||||
cv2.putText(
|
||||
img,
|
||||
f"BLUE STAMP?",
|
||||
"BLUE STAMP?",
|
||||
(x, y - 10),
|
||||
cv2.FONT_HERSHEY_SIMPLEX,
|
||||
0.6,
|
||||
|
||||
@@ -5,7 +5,6 @@ Search for Envelope/Stamp in Keyframes
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import types
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
@@ -5,7 +5,6 @@ Search for "vase" in the video using OWL-ViT on a subset of frames.
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import json
|
||||
import glob
|
||||
from PIL import Image
|
||||
import torch
|
||||
|
||||
@@ -15,7 +15,6 @@ Usage:
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
import os
|
||||
@@ -313,10 +312,10 @@ def main():
|
||||
)
|
||||
|
||||
if uuid:
|
||||
print(f"\n🎉 Registration completed!")
|
||||
print("\n🎉 Registration completed!")
|
||||
else:
|
||||
print(f"\n📊 Analysis only (no registration)")
|
||||
print(f" To register, run with --register flag")
|
||||
print("\n📊 Analysis only (no registration)")
|
||||
print(" To register, run with --register flag")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -443,7 +443,7 @@ def main():
|
||||
print_selection_report(angle_groups, selected, coverage_report)
|
||||
|
||||
if not args.report_only and args.register and args.identity_name:
|
||||
print(f"\n🔧 Step 5: Registering Identity...")
|
||||
print("\n🔧 Step 5: Registering Identity...")
|
||||
|
||||
reference_data = build_reference_data_structure(selected, args.video_uuid)
|
||||
|
||||
@@ -454,14 +454,14 @@ def main():
|
||||
)
|
||||
|
||||
if uuid:
|
||||
print(f"\n✅ Registration completed!")
|
||||
print("\n✅ Registration completed!")
|
||||
print(f" UUID: {uuid}")
|
||||
print(f" Name: {args.identity_name}")
|
||||
print(f" Angles: {coverage_report['angles_covered']}")
|
||||
print(f" Total vectors: {coverage_report['total_references']}")
|
||||
print(f" Quality avg: {coverage_report['quality_avg']:.2f}")
|
||||
elif args.report_only:
|
||||
print(f"\n📊 Report only (no registration)")
|
||||
print("\n📊 Report only (no registration)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -329,7 +329,7 @@ def main():
|
||||
print("Please run face_tracker.py first")
|
||||
return
|
||||
|
||||
print(f"\n=== Available Traces ===")
|
||||
print("\n=== Available Traces ===")
|
||||
for trace_id_str, trace in sorted(traces.items(), key=lambda x: int(x[0])):
|
||||
print(f"Trace {trace_id_str}:")
|
||||
print(f" Frames: {trace['start_frame']}-{trace['end_frame']} ({trace['duration_frames']} frames)")
|
||||
@@ -364,7 +364,7 @@ def main():
|
||||
# Filter faces by trace
|
||||
filtered_face_data = filter_faces_by_trace(face_data, trace_id_filter)
|
||||
|
||||
print(f"\n=== Filtering Faces ===")
|
||||
print("\n=== Filtering Faces ===")
|
||||
print(f"Original frames: {len(face_data.get('frames', {}))}")
|
||||
print(f"Filtered frames: {len(filtered_face_data.get('frames', {}))}")
|
||||
|
||||
@@ -379,7 +379,7 @@ def main():
|
||||
print("❌ No reference vectors selected")
|
||||
return
|
||||
|
||||
print(f"\n=== Selected Reference Vectors ===")
|
||||
print("\n=== Selected Reference Vectors ===")
|
||||
print(f"Total: {len(selected_vectors)}")
|
||||
|
||||
angle_distribution = defaultdict(int)
|
||||
@@ -390,7 +390,7 @@ def main():
|
||||
print(f"Distribution: {dict(angle_distribution)}")
|
||||
print(f"Quality avg: {np.mean([v['quality_score'] for v in selected_vectors]):.3f}")
|
||||
|
||||
print(f"\n=== Vector Details ===")
|
||||
print("\n=== Vector Details ===")
|
||||
for i, v in enumerate(selected_vectors[:10]):
|
||||
print(f"Vector {i+1}:")
|
||||
print(f" Angle: {v['pose_angle']} (confidence: {v['pose_confidence']:.2f})")
|
||||
@@ -404,7 +404,7 @@ def main():
|
||||
return
|
||||
|
||||
if args.register and args.identity_name:
|
||||
print(f"\n=== Registering Identity ===")
|
||||
print("\n=== Registering Identity ===")
|
||||
|
||||
identity_uuid = register_identity_with_trace(
|
||||
identity_name=args.identity_name,
|
||||
@@ -416,7 +416,7 @@ def main():
|
||||
)
|
||||
|
||||
if identity_uuid:
|
||||
print(f"\n✅ Registration completed!")
|
||||
print("\n✅ Registration completed!")
|
||||
print(f" UUID: {identity_uuid}")
|
||||
print(f" Name: {args.identity_name}")
|
||||
print(f" Trace ID: {trace_id_filter}")
|
||||
|
||||
@@ -34,7 +34,7 @@ def test_endpoint(endpoint, method="GET", data=None):
|
||||
print(f"Headers: {dict(response.headers)}")
|
||||
|
||||
if response.status_code == 200:
|
||||
print(f"✅ Success!")
|
||||
print("✅ Success!")
|
||||
if response.text:
|
||||
print(f"Response (first 500 chars): {response.text[:500]}")
|
||||
return True
|
||||
|
||||
@@ -93,14 +93,14 @@ def main():
|
||||
# 直接回答問題
|
||||
print("📝 問題回答:")
|
||||
print("-" * 40)
|
||||
print(f"Q: 這兩個影片內有幾個人?")
|
||||
print("Q: 這兩個影片內有幾個人?")
|
||||
print(f"A: 總共檢測到 {total_faces} 個人臉")
|
||||
print()
|
||||
print(f"Q: 幾男幾女?")
|
||||
print("Q: 幾男幾女?")
|
||||
print(f"A: 男性 {male_count} 人 ({male_count / total_faces * 100:.1f}%)")
|
||||
print(f" 女性 {female_count} 人 ({female_count / total_faces * 100:.1f}%)")
|
||||
print()
|
||||
print(f"Q: 平均年齡?")
|
||||
print("Q: 平均年齡?")
|
||||
print(f"A: 平均 {avg_age} 歲 (範圍: {min_age}-{max_age}歲)")
|
||||
print()
|
||||
print("=" * 60)
|
||||
|
||||
@@ -26,7 +26,7 @@ def detect_impulse_sounds(audio_path, threshold_multiplier=1.5):
|
||||
# 載入音頻 (Mono, 22050Hz)
|
||||
y, sr = librosa.load(audio_path, sr=22050)
|
||||
|
||||
print(f"📊 Analyzing energy envelope...")
|
||||
print("📊 Analyzing energy envelope...")
|
||||
# 1. 計算 RMS 能量 (以 0.05秒 為一幀)
|
||||
frame_length = int(0.05 * sr)
|
||||
hop_length = int(0.02 * sr)
|
||||
|
||||
@@ -5,14 +5,13 @@ Search for Specific Stamps in the Image (Avoiding Watermark)
|
||||
|
||||
import os
|
||||
import cv2
|
||||
import torch
|
||||
import types
|
||||
from PIL import Image
|
||||
from transformers import AutoProcessor, AutoModelForCausalLM
|
||||
|
||||
UUID = "384b0ff44aaaa1f1"
|
||||
OUTPUT_DIR = f"output/{UUID}/florence2_results"
|
||||
INPUT_IMG = os.path.join(OUTPUT_DIR, f"raw_6846.jpg")
|
||||
INPUT_IMG = os.path.join(OUTPUT_DIR, "raw_6846.jpg")
|
||||
|
||||
|
||||
# Patch for compatibility
|
||||
|
||||
1
scripts/swift_processors/.build/.lock
Normal file
1
scripts/swift_processors/.build/.lock
Normal file
@@ -0,0 +1 @@
|
||||
7861
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user