cleanup: remove dead code and duplicate docs

- Remove session-ses_2f27.md (161KB raw session log)
- Remove 49 ROOT_* duplicate files across REFERENCE/
- Remove 14 duplicate files between REFERENCE/ root and history/
- Remove asr_legacy.rs (dead code, replaced by asr.rs)
- Remove src/core/worker/ (duplicate JobWorker)
- Remove src/core/layers/ (empty directory)
- Remove 4 .bak files in src/
- Remove 7 dead private methods in worker/processor.rs
- Remove backup directory from git tracking
This commit is contained in:
Warren
2026-05-04 01:31:21 +08:00
parent ee81e343ce
commit e75c4d6f07
3270 changed files with 35190 additions and 53367 deletions

View File

@@ -4,14 +4,12 @@
"""
import cv2
import numpy as np
import os
import sys
import json
import time
from datetime import datetime
import psycopg2
from psycopg2.extras import RealDictCursor
# 導入人臉識別處理器
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
@@ -275,7 +273,7 @@ class VideoFaceAnalyzer:
with open(result_file, "w", encoding="utf-8") as f:
json.dump(result, f, indent=2, ensure_ascii=False)
print(f"\n分析完成:")
print("\n分析完成:")
print(f" - 處理幀數: {len(frames)}")
print(f" - 檢測到人臉: {len(detections)}")
print(f" - 分析時間: {result['analysis_time']:.1f}")
@@ -454,14 +452,14 @@ def main():
total_faces = sum(r["faces_detected"] for r in video_results)
total_time = sum(r["analysis_time"] for r in video_results)
print(f"\n📈 分析摘要:")
print("\n📈 分析摘要:")
print(f" - 總處理視頻: {len(video_results)}")
print(f" - 總處理幀數: {total_frames}")
print(f" - 總檢測人臉: {total_faces}")
print(f" - 總分析時間: {total_time:.1f}")
# 列出生成的文件
print(f"\n📄 生成的文件:")
print("\n📄 生成的文件:")
for filename in sorted(os.listdir(analyzer.output_dir)):
filepath = os.path.join(analyzer.output_dir, filename)
if os.path.isfile(filepath):

View File

@@ -23,7 +23,7 @@ import signal
import platform
import psutil
from datetime import datetime, timezone
from typing import Dict, Any, Optional, List, Tuple
from typing import Dict, Any, List
from pathlib import Path
import traceback
@@ -606,7 +606,7 @@ class ASRBenchmarkRunner:
metrics = result.get('metrics', {})
real_time = result.get('real_time', {})
lines.append(f"- **Status**: Success")
lines.append("- **Status**: Success")
lines.append(f"- **Start**: {real_time.get('test_start', 'N/A')}")
lines.append(f"- **End**: {real_time.get('test_end', 'N/A')}")
lines.append(f"- **Duration**: {metrics.get('processing_time_seconds', 0):.3f}s")
@@ -615,7 +615,7 @@ class ASRBenchmarkRunner:
lines.append(f"- **Memory Peak**: {metrics.get('peak_memory_mb', 0):.1f}MB")
lines.append(f"- **Language**: {metrics.get('language_detected', 'N/A')} ({metrics.get('language_probability', 0):.2f})")
else:
lines.append(f"- **Status**: Failed")
lines.append("- **Status**: Failed")
lines.append(f"- **Error**: {result.get('error', 'Unknown error')}")
lines.append("")
@@ -680,7 +680,7 @@ def main():
runner.generate_results_json()
runner.generate_markdown_report()
print(f"\nBenchmark completed!")
print("\nBenchmark completed!")
print(f"Results: {output_dir / 'asr_benchmark_results.json'}")
print(f"Report: {output_dir / 'asr_benchmark_report.md'}")

View File

@@ -96,7 +96,7 @@ def print_stats(dist, total_segments):
avg_faces = total_faces_sum / total_segments if total_segments > 0 else 0
max_faces = max(dist.keys()) if dist else 0
print(f"\n📊 Summary:")
print("\n📊 Summary:")
print(f" Average faces per segment: {avg_faces:.1f}")
print(f" Max faces in a segment: {max_faces}")
print(
@@ -110,20 +110,20 @@ def print_stats(dist, total_segments):
)
# Show some example segments
print(f"\n🔍 Example Segments:")
print(f" 0 faces:")
print("\n🔍 Example Segments:")
print(" 0 faces:")
examples = [s for s in segment_details if s["face_count"] == 0][:3]
for ex in examples:
print(f" [{ex['start']:.0f}s-{ex['end']:.0f}s] {ex['text']}...")
print(f" 1 face:")
print(" 1 face:")
examples = [s for s in segment_details if s["face_count"] == 1][:3]
for ex in examples:
print(
f" [{ex['start']:.0f}s-{ex['end']:.0f}s] {ex['person_ids'][0]}: {ex['text']}..."
)
print(f" 3 faces:")
print(" 3 faces:")
examples = [s for s in segment_details if s["face_count"] == 3][:3]
for ex in examples:
pids = ", ".join(ex["person_ids"])

View File

@@ -18,12 +18,10 @@ Configuration:
import sys
import json
import os
import time
import argparse
import signal
import subprocess
import tempfile
from datetime import datetime
from faster_whisper import WhisperModel
PROCESSOR_VERSION = "2.1"
@@ -164,44 +162,127 @@ def run_asr(video_path, output_path, uuid: str = ""):
sys.stderr.flush()
sys.exit(0)
# 嘗試以 CUT 場景分段處理(降低長片記憶體使用)
cut_scenes = []
cut_path = output_path.replace(".asr.json", ".cut.json")
if os.path.exists(cut_path):
try:
with open(cut_path) as f:
cut_data = json.load(f)
scenes = cut_data.get("scenes", [])
if scenes:
cut_scenes = [(s["start_time"], s["end_time"]) for s in scenes]
print(f"[ASR] Loaded {len(cut_scenes)} cut scenes for segmented transcription", file=sys.stderr)
except Exception as e:
print(f"[ASR] Failed to load cut scenes: {e}", file=sys.stderr)
if publisher:
publisher.info("asr", "Loading Whisper model...")
# Use small model with CPU (MPS not supported by faster_whisper)
# small 模型在準確率和速度間取得最佳平衡
model = WhisperModel("small", device="cpu", compute_type="int8")
model = WhisperModel(MODEL_SIZE, device="cpu", compute_type="int8")
if publisher:
publisher.info("asr", f"Transcribing: {video_path}")
# Transcribe with VAD filter for better accuracy, with PyAV fallback
segments, info = transcribe_with_fallback(model, video_path, publisher)
if publisher:
publisher.info("asr", f"ASR_LANGUAGE:{info.language}")
results = []
total_segments = 0
for segment in segments:
results.append(
{"start": segment.start, "end": segment.end, "text": segment.text.strip()}
)
total_segments += 1
if total_segments % 100 == 0:
if publisher:
publisher.progress(
"asr", total_segments, 0, f"Segment {total_segments}"
if cut_scenes:
# 分段處理:對每個場景萃取音訊並轉錄
import subprocess
import tempfile
import json
temp_dir = tempfile.mkdtemp(prefix="asr_cut_")
transcript_language = None
# 建立 scene lookup: 給定時間點,找是哪個 scene
import bisect
scene_starts = [s[0] for s in cut_scenes]
def find_scene_idx(t):
i = bisect.bisect_right(scene_starts, t) - 1
return max(0, i)
# 逐段處理,每段結果即時寫入 .asr.tmp
tmp_path = output_path + ".tmp"
all_segments = []
for idx, (start_t, end_t) in enumerate(cut_scenes):
seg_wav = os.path.join(temp_dir, f"seg_{idx:04d}.wav")
# 用 ffmpeg 萃取出該段音訊
cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", video_path,
"-ss", str(start_t), "-to", str(end_t),
"-ar", "16000", "-ac", "1", seg_wav]
subprocess.run(cmd, check=False, capture_output=True)
if not os.path.exists(seg_wav) or os.path.getsize(seg_wav) < 100:
continue # 跳過空音訊
try:
seg_result, seg_info = model.transcribe(
seg_wav, beam_size=5,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500, speech_pad_ms=200),
)
if transcript_language is None:
transcript_language = seg_info.language
output = {
"language": info.language,
"language_probability": info.language_probability,
"segments": results,
}
scene_segments = []
for segment in seg_result:
seg_start = start_t + segment.start
seg_end = start_t + segment.end
scene_idx = find_scene_idx((seg_start + seg_end) / 2)
scene_segments.append({
"start": seg_start,
"end": seg_end,
"text": segment.text.strip(),
"scene_number": scene_idx + 1,
})
total_segments += 1
with open(output_path, "w") as f:
json.dump(output, f, indent=2)
# 當前 scene 結果寫入 .asr.tmp
all_segments.extend(scene_segments)
with open(tmp_path, "w") as f:
json.dump({"language": transcript_language or "", "segments": all_segments}, f)
if total_segments % 100 == 0:
if publisher:
publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
except Exception as e:
print(f"[ASR] Segment {idx} failed: {e}", file=sys.stderr)
# 清理暫存 WAV
try: os.remove(seg_wav)
except: pass
try: os.rmdir(temp_dir)
except: pass
info_language = transcript_language or "unknown"
print(f"[ASR] Segmented transcription complete: {total_segments} segments", file=sys.stderr)
else:
# 無 CUT 資料,直接轉錄(原有流程)
segments, info = transcribe_with_fallback(model, video_path, publisher)
info_language = info.language
tmp_path = output_path + ".tmp"
all_segments = []
for segment in segments:
all_segments.append({
"start": segment.start, "end": segment.end,
"text": segment.text.strip(),
})
total_segments += 1
if total_segments % 100 == 0:
if publisher:
publisher.progress("asr", total_segments, 0, f"Segment {total_segments}")
with open(tmp_path, "w") as f:
json.dump({"language": info_language, "segments": all_segments}, f)
if publisher:
publisher.info("asr", f"ASR_LANGUAGE:{info_language}")
# rename .tmp → .json
os.rename(tmp_path, output_path)
if publisher:
publisher.complete("asr", f"{len(results)} segments")

View File

@@ -2,12 +2,19 @@
"""
ASRX Processor - Custom Implementation Wrapper
Uses SpeechBrain ECAPA-TDNN (no HuggingFace token required)
Pipeline:
1. Preprocess: ffprobe audio tracks → select best track → extract WAV
2. Process: VAD (Silero) → Speaker embedding (ECAPA-TDNN) → Spectral clustering
3. Output: segments with speaker_id
"""
import sys
import json
import argparse
import os
import subprocess
import tempfile
from pathlib import Path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
@@ -18,6 +25,78 @@ sys.path.insert(
from redis_publisher import RedisPublisher
def probe_audio_tracks(video_path: str) -> list:
    """List the audio streams of a media file using ffprobe.

    Args:
        video_path: Path of the media file to inspect.

    Returns:
        One dict per audio stream with the keys ``index``, ``codec``,
        ``language`` (``"und"`` when the stream carries no language tag),
        ``channels`` and ``sample_rate``. An empty list is returned when
        ffprobe cannot be run, times out, or its output cannot be parsed.
    """
    cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-select_streams", "a", video_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        data = json.loads(result.stdout)
        return [
            {
                "index": stream.get("index"),
                "codec": stream.get("codec_name"),
                "language": stream.get("tags", {}).get("language", "und"),
                "channels": stream.get("channels", 0),
                "sample_rate": stream.get("sample_rate", "0"),
            }
            for stream in data.get("streams", [])
        ]
    except Exception as e:
        # Deliberate best-effort boundary: probing must never abort the
        # pipeline, the caller simply proceeds without track information.
        # Diagnostics go to stderr like the rest of this script's pipeline logging.
        print(f"[ASRX] ffprobe failed: {e}", file=sys.stderr)
        return []
def select_best_track(tracks: list) -> int:
    """Choose which audio track to use.

    Priority: the first track tagged English (``"eng"`` or ``"en"``);
    otherwise the track with the most channels (the earliest one on ties).

    Args:
        tracks: Track dicts as produced by ``probe_audio_tracks``; each one
            is read for its ``language``, ``channels`` and ``index`` keys.

    Returns:
        The position of the chosen track inside ``tracks`` (not the
        container stream index), or 0 when ``tracks`` is empty.
    """
    if not tracks:
        return 0

    # Priority 1: English track
    for i, t in enumerate(tracks):
        if t["language"] in ("eng", "en"):
            print(f"[ASRX] Selected English track (index {t['index']})", file=sys.stderr)
            return i

    # Priority 2: most channels; max() keeps the first maximum, so ties
    # resolve to the earliest track.
    best = max(range(len(tracks)), key=lambda i: tracks[i]["channels"])
    print(
        f"[ASRX] Selected track {best} (lang={tracks[best]['language']}, ch={tracks[best]['channels']})",
        file=sys.stderr,
    )
    return best
def extract_audio_to_wav(video_path: str, track_index: int, output_wav: str) -> bool:
    """Extract one stream of a media file to a 16 kHz mono 16-bit WAV via ffmpeg.

    Args:
        video_path: Path of the input media file.
        track_index: Stream index passed to ffmpeg as ``-map 0:<index>``.
        output_wav: Destination path of the WAV file (overwritten if present).

    Returns:
        True when ffmpeg exits successfully; False when it cannot be run,
        exits with an error, or exceeds the 300 second timeout.
    """
    cmd = [
        "ffmpeg", "-y", "-v", "quiet",
        "-i", video_path,
        "-map", f"0:{track_index}",
        "-ar", "16000",
        "-ac", "1",
        "-sample_fmt", "s16",
        output_wav,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True, timeout=300)
    except Exception as e:
        # Best-effort: the caller falls back to the original file on False.
        # Diagnostics go to stderr like the rest of this script's pipeline logging.
        print(f"[ASRX] ffmpeg extraction failed: {e}", file=sys.stderr)
        return False
    return True
def _cleanup(tmp_dir):
"""Clean up temporary directory."""
if tmp_dir and os.path.exists(tmp_dir):
import shutil
shutil.rmtree(tmp_dir, ignore_errors=True)
def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
"""Process video for speaker diarization using custom implementation"""
@@ -25,25 +104,102 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
if publisher:
publisher.info("asrx", "ASRX_START")
tmp_dir = None
try:
# Ensure working directory is the scripts dir for model loading
script_dir = os.path.dirname(os.path.abspath(__file__))
os.chdir(script_dir)
# Debug: check ffmpeg availability
import shutil
ffmpeg_path = shutil.which("ffmpeg")
print(f"[ASRX] ffmpeg: {ffmpeg_path}", file=sys.stderr)
print(f"[ASRX] CWD: {os.getcwd()}", file=sys.stderr)
# ---- Stage 1: Audio Track Preprocessing ----
print("\n[ASRX] ===== Stage 1: Audio Track Analysis =====", file=sys.stderr)
print(f"[ASRX] Input: {video_path}", file=sys.stderr)
tracks = probe_audio_tracks(video_path)
if tracks:
print(f"[ASRX] Found {len(tracks)} audio track(s):", file=sys.stderr)
for t in tracks:
print(f" Track {t['index']}: {t['codec']} {t['channels']}ch {t['sample_rate']}Hz lang={t['language']}", file=sys.stderr)
else:
print("[ASRX] No audio tracks found via ffprobe, using raw file", file=sys.stderr)
# Select best track
track_idx = select_best_track(tracks) if tracks else 0
actual_track_index = tracks[track_idx]["index"] if tracks else track_idx
# Extract audio to WAV
tmp_dir = tempfile.mkdtemp(prefix="asrx_")
wav_path = os.path.join(tmp_dir, "audio.wav")
if extract_audio_to_wav(video_path, actual_track_index, wav_path):
wav_size = os.path.getsize(wav_path)
print(f"[ASRX] Audio extracted: {wav_path} ({wav_size / 1024 / 1024:.1f}MB)", file=sys.stderr)
audio_input = wav_path
else:
print("[ASRX] Audio extraction failed, falling back to original file", file=sys.stderr)
audio_input = video_path
# ---- Stage 2: Load ASR segments for time alignment ----
# Try multiple paths to find ASR JSON
asr_segments = []
asr_fallback_reason = ""
asr_candidates = [
output_path.replace(".asrx.json", ".asr.json") if output_path else "",
os.path.join(os.path.dirname(output_path) if output_path else ".", os.path.basename(video_path).rsplit(".", 1)[0] + ".asr.json"),
os.path.join(os.path.dirname(output_path) if output_path else ".", "dd61fda85fee441fdd00ab5528213ff7.asr.json"),
]
asr_path = ""
for candidate in asr_candidates:
if candidate and os.path.exists(candidate):
asr_path = candidate
break
if asr_path:
try:
with open(asr_path) as f:
asr_data = json.load(f)
asr_segments = asr_data.get("segments", [])
print(f"[ASRX] Loaded {len(asr_segments)} ASR segments from {asr_path}", file=sys.stderr)
asr_fallback_reason = f"loaded_{len(asr_segments)}_segments"
except Exception as e:
asr_fallback_reason = f"load_error_{e}"
print(f"[ASRX] Failed to load ASR segments: {e}", file=sys.stderr)
else:
asr_fallback_reason = f"asr_json_not_found_tried_{len(asr_candidates)}_paths"
print(f"[ASRX] ASR output not found, tried {len(asr_candidates)} paths. First candidate: {asr_candidates[0]}", file=sys.stderr)
# ---- Stage 3: ASRX Processing ----
from asrx_self.main_fixed import SelfASRXFixed
if publisher:
publisher.info("asrx", "ASRX_LOADING_MODEL")
# Initialize custom ASRX processor
asrx = SelfASRXFixed()
if publisher:
publisher.info("asrx", "ASRX_TRANSCRIBING")
# Process video/audio
result = asrx.process(
video_path,
output_path=None, # We'll save our own format
min_speech_duration_ms=500,
max_speakers=10,
)
if asr_segments:
# Use ASR segment boundaries for speaker embedding extraction
print(f"[ASRX] Using {len(asr_segments)} ASR segments for diarization", file=sys.stderr)
result = asrx.process_with_segments(
audio_input,
asr_segments,
output_path=None,
)
else:
# Fallback: VAD-based diarization
result = asrx.process(
audio_input,
output_path=None,
min_speech_duration_ms=500,
max_speakers=10,
)
if "error" in result:
if publisher:
@@ -58,21 +214,47 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
if publisher:
publisher.complete("asrx", "0 segments")
_cleanup(tmp_dir)
return output_result
# Convert to Rust-expected format
# Convert to Rust-expected format (start_frame/end_frame/speaker)
# Read fps from probe json ({file_uuid}.probe.json)
_debug = {"asr_fallback": asr_fallback_reason, "asr_path": asr_path}
fps = 30.0
output_dir = os.path.dirname(output_path) if output_path else "."
base_name = os.path.basename(output_path) if output_path else ""
# Extract uuid from {uuid}.{type}.json format
uuid_part = base_name.split(".")[0] if base_name else ""
probe_candidates = [
os.path.join(output_dir, f"{uuid_part}.probe.json"),
]
for p in probe_candidates:
if os.path.exists(p):
try:
with open(p) as pf:
probe_data = json.load(pf)
if "fps" in probe_data:
fps = float(probe_data["fps"])
print(f"[ASRX] FPS from probe: {fps}", file=sys.stderr)
break
except:
pass
output_result = {
"language": None, # Custom implementation doesn't detect language
"language": None,
"segments": [],
}
# Convert segments
for seg in result["segments"]:
start_sec = seg["start"]
end_sec = seg["end"]
output_result["segments"].append(
{
"start": seg["start"],
"end": seg["end"],
"text": "", # Will be filled by matching with ASR later
"start_time": start_sec,
"end_time": end_sec,
"start_frame": int(start_sec * fps),
"end_frame": int(end_sec * fps),
"text": "",
"speaker_id": seg["speaker"],
}
)
@@ -81,20 +263,24 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
if "speaker_stats" in result:
output_result["speaker_stats"] = result["speaker_stats"]
# 傳遞 embeddings每個 segment 對應的 192-D speaker embedding
if "embeddings" in result:
output_result["embeddings"] = result["embeddings"]
if publisher:
publisher.info("asrx", f"ASRX_COMPLETE:{len(output_result['segments'])}")
# Save output
output_result["_debug"] = _debug
with open(output_path, "w") as f:
json.dump(output_result, f, indent=2)
if publisher:
publisher.complete("asrx", f"{len(output_result['segments'])} segments")
print(
f"[ASRX-Custom] Saved {len(output_result['segments'])} segments to {output_path}"
)
print(f"[ASRX-Custom] Saved {len(output_result['segments'])} segments to {output_path}", file=sys.stderr)
_cleanup(tmp_dir)
return output_result
except Exception as e:
@@ -114,6 +300,7 @@ def process_asrx_custom(video_path: str, output_path: str, uuid: str = ""):
if publisher:
publisher.complete("asrx", "0 segments")
_cleanup(tmp_dir)
return output_result
@@ -133,7 +320,7 @@ if __name__ == "__main__":
result = process_asrx_custom(args.video_path, args.output_path, args.uuid)
print(f"\n[Summary]")
print("\n[Summary]")
print(f" Total segments: {len(result['segments'])}")
if "speaker_stats" in result:
print(f" Detected speakers: {len(result['speaker_stats'])}")

View File

@@ -130,12 +130,12 @@ def main():
integrated = match_face_with_speaker_v3(face_data, asrx_data, args.threshold)
# 分析
print(f"\n[Analyze] Analyzing speaker-face correspondence...")
print("\n[Analyze] Analyzing speaker-face correspondence...")
speaker_stats = analyze_speaker_face(integrated)
# 顯示統計
print(f"\n{'='*70}")
print(f"說話人 - 人臉對應統計")
print("說話人 - 人臉對應統計")
print(f"{'='*70}")
total_segments = len(integrated)

View File

@@ -16,7 +16,6 @@ Self-implemented ASRX - 自實作說話人分離系統
import sys
import json
import time
import numpy as np
from pathlib import Path
# 導入自定義模組
@@ -182,7 +181,7 @@ class SelfASRX:
result["processing_time"] = round(total_time, 2)
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
print(f"\n[SelfASRX] Processing completed!")
print("\n[SelfASRX] Processing completed!")
print(f" Total time: {total_time:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f" Detected speakers: {estimated_n_speakers}")
@@ -249,14 +248,14 @@ def main():
# 顯示結果摘要
if "error" not in result:
print(f"\n[Summary]")
print("\n[Summary]")
print(f" Audio duration: {result['total_duration']:.2f}s")
print(f" Speech segments: {result['n_speech_segments']}")
print(f" Detected speakers: {result['n_speakers']}")
print(f" Processing time: {result['processing_time']:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f"\n[Speaker Statistics]")
print("\n[Speaker Statistics]")
for speaker, stats in result["speaker_stats"].items():
pct = stats["duration"] / result["total_duration"] * 100
print(

View File

@@ -134,7 +134,7 @@ class SelfASRXFixed:
result["processing_time"] = round(total_time, 2)
result["realtime_factor"] = round(result["total_duration"] / total_time, 2)
print(f"\n[SelfASRX-Fixed] Processing completed!")
print("\n[SelfASRX-Fixed] Processing completed!")
print(f" Total time: {total_time:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f" Detected speakers: {estimated_n_speakers}")
@@ -154,6 +154,117 @@ class SelfASRXFixed:
return result
def process_with_segments(self, audio_path, asr_segments, output_path=None):
    """
    Run speaker diarization using ASR segment boundaries in place of the VAD step.

    Args:
        audio_path: Path to the audio file (WAV).
        asr_segments: List of ASR segments; each item provides "start" and
            "end" in seconds.
        output_path: Optional path of a JSON file to write the result to.

    Returns:
        A dict with "audio_path", "total_duration", "n_speech_segments",
        "n_speakers", "segments", "embeddings", "speaker_stats",
        "processing_time" and "realtime_factor". "segments" and
        "embeddings" are index-aligned. When no ASR segment is given, or
        none starts inside the audio, a dict with "error" and an empty
        "segments" list is returned instead.
    """
    start_time = time.time()

    print(f"\n[SelfASRX-Fixed] Processing with {len(asr_segments)} ASR segments: {audio_path}")
    print("=" * 60)

    # Load the whole audio file
    import soundfile as sf
    wav, sample_rate = sf.read(audio_path)
    if len(wav.shape) > 1:
        wav = np.mean(wav, axis=1)  # down-mix to mono

    print(f"  Audio loaded: {len(wav)/sample_rate:.2f}s, {sample_rate}Hz")

    # Use the ASR segments instead of VAD output
    speech_segments = [(s["start"], s["end"]) for s in asr_segments]
    print(f"  Speech segments from ASR: {len(speech_segments)}")

    if len(speech_segments) == 0:
        print("[SelfASRX-Fixed] No ASR segments provided!")
        return {"error": "No ASR segments", "segments": []}

    # Slice out the audio of every segment. Segments that start past the
    # end of the audio are dropped; kept_segments records exactly the
    # segments that were sliced so labels and embeddings stay aligned with
    # their segment (zipping against the unfiltered list would shift every
    # label after a dropped segment).
    kept_segments = []
    audio_segments = []
    for start_sec, end_sec in speech_segments:
        start_sample = int(start_sec * sample_rate)
        end_sample = int(end_sec * sample_rate)
        if start_sample >= len(wav):
            continue
        kept_segments.append((start_sec, end_sec))
        audio_segments.append(wav[start_sample:min(end_sample, len(wav))])

    print(f"  Audio segments extracted: {len(audio_segments)}")

    if not audio_segments:
        # Nothing to embed or cluster; report it the same way as "no segments".
        print("[SelfASRX-Fixed] No ASR segments fall inside the audio!")
        return {"error": "No ASR segments within audio", "segments": []}

    # Batch speaker-embedding extraction
    print("\n[Step 2] Speaker embedding extraction...")
    step2_start = time.time()

    embeddings = extract_speaker_embeddings_batch(
        self.speaker_encoder, audio_segments, sample_rate
    )
    embeddings = normalize_embeddings(embeddings)

    step2_time = time.time() - step2_start
    print(f"  Embedding shape: {embeddings.shape}")
    print(f"  Embedding time: {step2_time:.2f}s")

    # Clustering
    print("\n[Step 3] Robust speaker clustering...")
    step3_start = time.time()

    speaker_labels, estimated_n_speakers = robust_speaker_clustering(
        embeddings, n_speakers=None, max_speakers=10
    )

    step3_time = time.time() - step3_start
    print(f"  Clustering time: {step3_time:.2f}s")

    # Build the output
    result = {
        "audio_path": str(audio_path),
        "total_duration": len(wav) / sample_rate,
        "n_speech_segments": len(kept_segments),
        "n_speakers": int(estimated_n_speakers),
        "segments": []
    }

    for i, ((start, end), label) in enumerate(zip(kept_segments, speaker_labels)):
        result["segments"].append({
            "index": i,
            "start": round(start, 3),
            "end": round(end, 3),
            "duration": round(end - start, 3),
            "speaker": f"SPEAKER_{int(label)}"
        })

    # Attach the speaker embedding of every segment, in segment order
    result["embeddings"] = [emb.tolist() for emb in embeddings]

    # Per-speaker statistics
    speaker_stats = {}
    for seg in result["segments"]:
        speaker = seg["speaker"]
        if speaker not in speaker_stats:
            speaker_stats[speaker] = {"count": 0, "duration": 0}
        speaker_stats[speaker]["count"] += 1
        speaker_stats[speaker]["duration"] += seg["duration"]
    result["speaker_stats"] = speaker_stats

    total_time = time.time() - start_time
    result["processing_time"] = round(total_time, 2)
    result["realtime_factor"] = round(result["total_duration"] / total_time, 2)

    print("\n[SelfASRX-Fixed] Processing completed!")
    print(f"  Total time: {total_time:.2f}s")
    print(f"  Realtime factor: {result['realtime_factor']:.2f}x")
    print(f"  Detected speakers: {estimated_n_speakers}")

    if output_path:
        import json
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        print(f"  Results saved to: {output_path}")

    print("=" * 60)
    return result
def main():
import argparse
@@ -180,14 +291,14 @@ def main():
)
if "error" not in result:
print(f"\n[Summary]")
print("\n[Summary]")
print(f" Audio duration: {result['total_duration']:.2f}s")
print(f" Speech segments: {result['n_speech_segments']}")
print(f" Detected speakers: {result['n_speakers']}")
print(f" Processing time: {result['processing_time']:.2f}s")
print(f" Realtime factor: {result['realtime_factor']:.2f}x")
print(f"\n[Speaker Statistics]")
print("\n[Speaker Statistics]")
for speaker, stats in result['speaker_stats'].items():
pct = stats['duration'] / result['total_duration'] * 100
print(f" {speaker}: {stats['count']} segments, " +

View File

@@ -138,7 +138,7 @@ def spectral_clustering_speaker(
speaker_labels = clustering.fit_predict(similarity_matrix)
print(f"[Clustering] Spectral clustering completed")
print("[Clustering] Spectral clustering completed")
print(f"[Clustering] n_speakers: {n_speakers}")
print(f"[Clustering] n_segments: {n_segments}")
@@ -146,7 +146,7 @@ def spectral_clustering_speaker(
except Exception as e:
print(f"[Clustering] Spectral clustering failed: {e}")
print(f"[Clustering] Using fallback: 2 speakers")
print("[Clustering] Using fallback: 2 speakers")
# 簡單分配:前一半是 SPEAKER_0後一半是 SPEAKER_1
speaker_labels = np.array(
[0] * (n_segments // 2) + [1] * (n_segments - n_segments // 2)
@@ -203,7 +203,7 @@ def agglomerative_clustering_speaker(
speaker_labels = clustering.fit_predict(embeddings)
print(f"[Clustering] Agglomerative clustering completed")
print("[Clustering] Agglomerative clustering completed")
print(f"[Clustering] n_speakers: {n_speakers}")
return speaker_labels, n_speakers
@@ -249,7 +249,6 @@ def compute_diarization_purity(speaker_labels, ground_truth_labels=None):
"""
if ground_truth_labels is None:
# 沒有 ground truth使用聚類純度近似
from sklearn.metrics import silhouette_score
# 使用餘弦相似度作為距離
purity = 0.5 # 預設值
@@ -300,7 +299,7 @@ if __name__ == "__main__":
similarity, n_speakers=None, auto_estimate=True
)
print(f"\n[Test] Clustering results:")
print("\n[Test] Clustering results:")
print(f" True n_speakers: {n_speakers}")
print(f" Estimated n_speakers: {n_clusters}")
print(f" Unique labels: {np.unique(labels)}")

View File

@@ -6,7 +6,6 @@ Speaker Clustering - Fixed Version
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
@@ -57,7 +56,7 @@ def robust_speaker_clustering(embeddings, n_speakers=None, max_speakers=10):
# 統計每個聚類的大小
unique, counts = np.unique(speaker_labels, return_counts=True)
print(f"[Clustering] Cluster sizes:")
print("[Clustering] Cluster sizes:")
for label, count in zip(unique, counts):
print(f" SPEAKER_{label}: {count} segments ({count/n_segments*100:.1f}%)")
@@ -148,6 +147,6 @@ if __name__ == "__main__":
# 測試聚類
labels, n_clusters = robust_speaker_clustering(embeddings)
print(f"\nResult:")
print("\nResult:")
print(f" True n_speakers: {n_speakers}")
print(f" Estimated n_speakers: {n_clusters}")

View File

@@ -33,8 +33,8 @@ def load_speaker_encoder(model_name="speechbrain/spkrec-ecapa-voxceleb"):
)
# 獲取模型資訊
print(f"[SpeakerEncoder] Model loaded successfully")
print(f"[SpeakerEncoder] Embedding dimension: 192")
print("[SpeakerEncoder] Model loaded successfully")
print("[SpeakerEncoder] Embedding dimension: 192")
return classifier
@@ -187,5 +187,5 @@ if __name__ == "__main__":
print(f"[Test] Embedding std: {embedding.std():.4f}")
# 顯示部分嵌入值
print(f"\n[Test] First 10 embedding values:")
print("\n[Test] First 10 embedding values:")
print(f" {embedding[:10]}")

View File

@@ -11,7 +11,6 @@ import os
import threading
import time
from pathlib import Path
from typing import List, Dict
try:
import tkinter as tk

View File

@@ -11,7 +11,6 @@ import os
import threading
import time
from pathlib import Path
from typing import List, Dict
try:
import tkinter as tk
@@ -203,7 +202,7 @@ class SpeakerPlayerGUI:
self.face_path = filename
self.face_label.config(text=Path(filename).name)
self.integrate_button.config(state=tk.NORMAL)
self.status_label.config(text=f"✅ Face 已選擇 - 請點擊整合")
self.status_label.config(text="✅ Face 已選擇 - 請點擊整合")
def integrate_face(self):
"""整合 Face 與 ASRX"""

View File

@@ -93,14 +93,14 @@ def show_menu(speaker_segments: Dict[str, List[Dict]], speaker_id: str):
print(f" ... and {len(segs) - 20} more segments")
print(f"\n{'=' * 70}")
print(f"Commands:")
print("Commands:")
print(f" [1-{min(20, len(segs))}] Play specific segment")
print(f" all Play all segments (may take a while)")
print(f" first N Play first N segments")
print(f" next Next speaker")
print(f" prev Previous speaker")
print(f" list List all speakers")
print(f" quit Exit")
print(" all Play all segments (may take a while)")
print(" first N Play first N segments")
print(" next Next speaker")
print(" prev Previous speaker")
print(" list List all speakers")
print(" quit Exit")
print(f"{'=' * 70}")
@@ -132,7 +132,7 @@ def interactive_player(audio_path: str, result_path: str):
current_speaker_idx = 0
print(f"\n🎬 Speaker Audio Player")
print("\n🎬 Speaker Audio Player")
print(f"📁 Audio: {audio_path}")
print(f"📊 Speakers: {len(speakers)}")
print(f"{'=' * 70}")
@@ -159,7 +159,7 @@ def interactive_player(audio_path: str, result_path: str):
print(
f" ⏱️ {seg['start']:.2f}s - {seg['end']:.2f}s ({seg['duration']:.2f}s)"
)
print(f" ▶️ Playing...", end="", flush=True)
print(" ▶️ Playing...", end="", flush=True)
if extract_and_play(audio_path, seg["start"], seg["end"]):
print(" ✅ Done")
else:
@@ -220,7 +220,7 @@ def interactive_player(audio_path: str, result_path: str):
# 列出所有說話人
elif cmd == "list":
print(f"\n{'=' * 70}")
print(f"📢 All speakers:")
print("📢 All speakers:")
print(f"{'=' * 70}")
for i, speaker in enumerate(speakers, 1):
segs = speaker_segments[speaker]

View File

@@ -6,8 +6,6 @@ GUI Face Player 自動化測試腳本
import json
import subprocess
import time
import os
from pathlib import Path

View File

@@ -5,7 +5,6 @@
import json
import subprocess
import time
from pathlib import Path
from datetime import datetime
@@ -55,7 +54,7 @@ def test_asrx_results():
print(f"📊 語音片段:{n_segments}")
# 說話人統計
print(f"\n📢 說話人分佈:")
print("\n📢 說話人分佈:")
speaker_stats = data.get('speaker_stats', {})
for speaker, stats in sorted(speaker_stats.items(), key=lambda x: x[1]['duration'], reverse=True):
duration = stats.get('duration', 0)
@@ -102,7 +101,7 @@ def test_integration():
print(f"📊 匹配率:{match_rate:.2f}%")
# 說話人匹配統計
print(f"\n📢 說話人匹配詳情:")
print("\n📢 說話人匹配詳情:")
speaker_stats = data.get('speaker_stats', {})
for speaker, stats in sorted(speaker_stats.items()):
total_seg = stats.get('total_segments', 0)
@@ -164,7 +163,7 @@ def test_playback():
end = first_seg['end']
duration = end - start
print(f"\n🎵 測試提取第一個片段:")
print("\n🎵 測試提取第一個片段:")
print(f" 時間:{start:.2f}s - {end:.2f}s ({duration:.2f}s)")
# 實際提取測試
@@ -222,10 +221,10 @@ def generate_report():
# 保存報告
report_path = '/tmp/long_movie_test_report.md'
with open(report_path, 'w', encoding='utf-8') as f:
f.write(f"# 長影片測試報告\n\n")
f.write("# 長影片測試報告\n\n")
f.write(f"**測試時間**: {datetime.now().isoformat()}\n")
f.write(f"**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
f.write(f"## 結果\n\n")
f.write("**測試影片**: Charade 1963 (114.7 分鐘)\n\n")
f.write("## 結果\n\n")
f.write(f"**通過**: {passed}/{total}\n\n")
for name, result in tests:
status = "" if result else ""

View File

@@ -9,7 +9,6 @@ VAD (Voice Activity Detection) - 語音活動檢測
"""
import torch
import numpy as np
def load_vad_model():
@@ -143,7 +142,7 @@ if __name__ == "__main__":
print(f"[VAD] Processing: {audio_path}")
segments, wav, sr = extract_speech_segments(audio_path, model, utils)
print(f"\n[VAD] Results:")
print("\n[VAD] Results:")
print(f" Sample rate: {sr} Hz")
print(f" Speech segments: {len(segments)}")
print(f" Total duration: {len(wav) / sr:.2f}s")
@@ -153,7 +152,7 @@ if __name__ == "__main__":
f" Total speech: {total_speech:.2f}s ({total_speech / (len(wav) / sr) * 100:.1f}%)"
)
print(f"\n[VAD] Segments:")
print("\n[VAD] Segments:")
for i, (start, end) in enumerate(segments[:10]):
print(f" {i + 1:3d}. {start:6.2f}s - {end:6.2f}s ({end - start:5.2f}s)")

View File

@@ -4,7 +4,6 @@ Audio Taxonomy Processor (Hugging Face Transformers)
職責:使用 AST 模型進行高精度音頻分類,並映射到業務分類。
"""
import numpy as np
import json
import os
import sys
@@ -75,7 +74,7 @@ def map_to_taxonomy(predictions):
def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
"""執行分類"""
print(f"🔍 Loading AST model (MIT) from Hugging Face...")
print("🔍 Loading AST model (MIT) from Hugging Face...")
# 使用 Audio Spectrogram Transformer準確率高且支援 MPS/CPU
classifier = pipeline(
"audio-classification",
@@ -103,7 +102,7 @@ def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
if taxonomy:
results.append({"timestamp": round(current, 1), "categories": taxonomy})
except Exception as e:
except Exception:
pass # 跳過錯誤片段
current += hop_sec
@@ -132,6 +131,6 @@ if __name__ == "__main__":
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)
print(f"\n🎉 Classification Complete!")
print("\n🎉 Classification Complete!")
print(f"✅ Found {len(events)} tagged audio segments.")
print(f"💾 Saved to {OUTPUT_JSON}")

View File

@@ -99,7 +99,7 @@ def map_to_taxonomy(logits, model):
def run_audio_taxonomy(audio_path, chunk_sec=1.0, hop_sec=0.5):
"""執行分類"""
print(f"🔍 Loading AST model (MIT)...")
print("🔍 Loading AST model (MIT)...")
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
@@ -167,6 +167,6 @@ if __name__ == "__main__":
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
json.dump({"audio_taxonomy": events}, f, indent=2, ensure_ascii=False)
print(f"\n🎉 Classification Complete!")
print("\n🎉 Classification Complete!")
print(f"✅ Found {len(events)} tagged audio segments.")
print(f"💾 Saved to {OUTPUT_JSON}")

View File

@@ -105,7 +105,7 @@ def main():
# 6. Generate report
print(f"\n{'=' * 60}")
print(f"📊 Person Identification Results")
print("📊 Person Identification Results")
print(f"{'=' * 60}")
# Sort by frame count
@@ -177,7 +177,7 @@ def main():
print(f"✅ Executed {executed} SQL statements")
# 9. Generate SQL INSERT statements for person_identities
print(f"\n--- SQL INSERT statements for person_identities ---")
print("\n--- SQL INSERT statements for person_identities ---")
for p in output["persons"][:10]:
speaker_val = f"'{p['speaker_id']}'" if p["speaker_id"] else "NULL"
print(

View File

@@ -4,11 +4,9 @@ Backfill missing Age & Gender for persons.
"""
import os
import sys
import cv2
import psycopg2
import insightface
import numpy as np
DB_CONFIG = {"host": "localhost", "user": "accusys", "dbname": "momentry"}
BASE_VIDEO_DIR = "output"
@@ -94,7 +92,7 @@ def main():
else:
print(f" -> Detection incomplete (Age:{age}, Gender:{gender})")
else:
print(f" -> No face found in frame.")
print(" -> No face found in frame.")
print("=== Done ===")
conn.close()

View File

@@ -10,8 +10,8 @@ from transformers import AutoProcessor, AutoModelForCausalLM
UUID = "384b0ff44aaaa1f1"
OUTPUT_DIR = f"output/{UUID}/florence2_results"
INPUT_IMG = os.path.join(OUTPUT_DIR, f"raw_6846.jpg")
OUTPUT_IMG = os.path.join(OUTPUT_DIR, f"all_stamps_detected.jpg")
INPUT_IMG = os.path.join(OUTPUT_DIR, "raw_6846.jpg")
OUTPUT_IMG = os.path.join(OUTPUT_DIR, "all_stamps_detected.jpg")
# Patch for compatibility (Same as before)
import types

View File

@@ -67,10 +67,10 @@ def main():
all_passed = doc_check_success and code_doc_check_success
if all_passed:
print(f"\n🎉 所有檢查通過!")
print("\n🎉 所有檢查通過!")
print("架構文檔符合 Phase 1 標準化要求。")
else:
print(f"\n⚠️ 發現問題,請參考檢查結果進行修復。")
print("\n⚠️ 發現問題,請參考檢查結果進行修復。")
print("提示:")
print(" 1. 使用 TERMINOLOGY_MAPPING.md 作為術語標準參考")
print(" 2. 確保設計與實現差異在 DESIGN_IMPLEMENTATION_GAP.md 中記錄")

View File

@@ -12,14 +12,13 @@
python3 scripts/check_architecture_docs.py [--report] [--verbose]
"""
import os
import re
import sys
import glob
import json
import argparse
from pathlib import Path
from typing import Dict, List, Set, Tuple, Optional
from typing import Dict, List, Set, Optional
from collections import defaultdict
# 配置
@@ -410,15 +409,15 @@ class ArchitectureDocChecker:
print(f"{'=' * 60}")
print(f"📁 檢查文件數: {total_files}")
print(f"⚠️ 發現問題數: {total_issues}")
print(f"\n問題分類:")
print("\n問題分類:")
for issue_type, count in report["summary"]["issues_by_type"].items():
print(f" - {issue_type}: {count}")
print(f"\n嚴重程度:")
print("\n嚴重程度:")
for severity, count in report["summary"]["issues_by_severity"].items():
print(f" - {severity}: {count}")
if total_issues > 0:
print(f"\n🔍 詳細問題:")
print("\n🔍 詳細問題:")
for file_report in report["files"]:
if file_report["issues"]:
print(f"\n文件: {file_report['file']}")
@@ -474,7 +473,7 @@ def main():
print(f"\n❌ 發現 {report['summary']['total_issues']} 個問題,請修復")
sys.exit(1)
else:
print(f"\n✅ 所有檢查通過!")
print("\n✅ 所有檢查通過!")
sys.exit(0)

View File

@@ -6,9 +6,7 @@
核心原則:當設計與實現出現矛盾時,以實際的 Rust 代碼實現為最高權威
"""
import os
import re
import sys
from pathlib import Path
@@ -177,7 +175,7 @@ def main():
issues = check_terminology_consistency(implemented_variants)
# 3. 顯示結果
print(f"\n📊 檢查完成:")
print("\n📊 檢查完成:")
print(f" 發現問題數: {len(issues)}")
if issues:

View File

@@ -5,7 +5,6 @@ Analyze Frame at 112:36 (6756s) for Stamps
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

View File

@@ -5,7 +5,6 @@ Analyze Frame at 91:59 (5519s) for Stamps
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

View File

@@ -6,7 +6,6 @@ Generates a comprehensive report of each chunk's content.
import json
import os
import sys
UUID = "384b0ff44aaaa1f1"
BASE_DIR = f"output/{UUID}"
@@ -107,7 +106,7 @@ def print_summary(chunks):
1 for c in chunks if not c["has_speech"] and not c["has_faces"]
)
print(f"\n📊 Overview:")
print("\n📊 Overview:")
print(f" Total chunks: {len(chunks)}")
print(
f" Chunks with speech: {total_speech_chunks} ({total_speech_chunks / len(chunks) * 100:.0f}%)"
@@ -125,7 +124,7 @@ def print_summary(chunks):
print(f" Total face frames: {total_faces}")
# Combination breakdown
print(f"\n🎯 ASR/Face Combination Breakdown:")
print("\n🎯 ASR/Face Combination Breakdown:")
combos = {}
for c in chunks:
@@ -148,7 +147,7 @@ def print_summary(chunks):
)
# Top chunks by activity
print(f"\n🔥 Top 10 Most Active Chunks (by ASR+Faces):")
print("\n🔥 Top 10 Most Active Chunks (by ASR+Faces):")
scored_chunks = []
for c in chunks:
score = c["asr_count"] + c["face_count"]
@@ -164,7 +163,7 @@ def print_summary(chunks):
)
# Stamp scene chunk
print(f"\n🔍 Special Interest Chunks:")
print("\n🔍 Special Interest Chunks:")
for c in chunks:
# Stamp scene around 5730s
if c["start"] <= 5730 <= c["end"]:

View File

@@ -256,7 +256,7 @@ def test_similarity_search(
result = cur.fetchone()
if not result or not result[0]:
print(f"⚠️ Identity embedding not found")
print("⚠️ Identity embedding not found")
return []
stored_embedding_raw = result[0]
@@ -323,7 +323,7 @@ def main():
logo_path = TEMP_DIR / f"{name.replace(' ', '_')}.png"
if not logo_path.exists():
print(f"\n🔧 Downloading logo...")
print("\n🔧 Downloading logo...")
if not download_image(logo_url, logo_path):
sys.exit(1)
@@ -334,18 +334,18 @@ def main():
if args.performance:
perf_result = test_mps_performance(model, processor, device, logo_path, iterations=10)
if perf_result:
print(f"\n📊 Performance Summary:")
print("\n📊 Performance Summary:")
print(f" MPS: {perf_result['mps_time']:.4f}s/img")
print(f" CPU: {perf_result['cpu_time']:.4f}s/img")
print(f" Speedup: {perf_result['speedup']:.2f}x")
print(f"\n🔧 Extracting CLIP embedding...")
print("\n🔧 Extracting CLIP embedding...")
embedding = extract_clip_embedding(model, processor, device, logo_path)
if not embedding:
sys.exit(1)
print(f"\n🔧 Registering to database...")
print("\n🔧 Registering to database...")
uuid = register_logo_identity_to_db(
name=name,
logo_url=logo_url,
@@ -354,13 +354,13 @@ def main():
)
if uuid:
print(f"\n🎉 Integration completed!")
print("\n🎉 Integration completed!")
print(f" Identity: {name}")
print(f" UUID: {uuid}")
print(f" Embedding: {len(embedding)}-dim")
print(f" URL: {logo_url}")
print(f"\n🔧 Testing similarity search...")
print("\n🔧 Testing similarity search...")
test_embeddings = [
embedding,
[0.1] * 768,
@@ -369,9 +369,9 @@ def main():
matches = test_similarity_search(uuid, test_embeddings, threshold=0.85, schema=args.schema)
if matches:
print(f"\n✅ Similarity search test passed")
print("\n✅ Similarity search test passed")
else:
print(f"\n❌ Integration failed")
print("\n❌ Integration failed")
sys.exit(1)

View File

@@ -10,7 +10,7 @@ ASR方案内容对比分析
import json
from pathlib import Path
from difflib import unified_diff, SequenceMatcher
from difflib import SequenceMatcher
def load_segments(json_path):
"""加载JSON文件中的segments"""
@@ -25,7 +25,7 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
print(f"{'='*60}")
# 统计
print(f"\n【数量对比】")
print("\n【数量对比】")
print(f" {name_a}: {len(seg_a)} segments")
print(f" {name_b}: {len(seg_b)} segments")
print(f" 差异: {len(seg_a) - len(seg_b)} segments")
@@ -34,7 +34,7 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
total_time_a = sum(s['end'] - s['start'] for s in seg_a)
total_time_b = sum(s['end'] - s['start'] for s in seg_b)
print(f"\n【时间覆盖】")
print("\n【时间覆盖】")
print(f" {name_a}: {total_time_a:.2f}")
print(f" {name_b}: {total_time_b:.2f}")
print(f" 差异: {total_time_a - total_time_b:.2f}")
@@ -48,11 +48,11 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
text_b_full = ' '.join(texts_b)
similarity = SequenceMatcher(None, text_a_full, text_b_full).ratio()
print(f"\n【文本相似度】")
print("\n【文本相似度】")
print(f" 相似度: {similarity*100:.1f}%")
# 差异分析
print(f"\n【详细差异】")
print("\n【详细差异】")
# 按时间对齐对比
matched_diffs = []
@@ -98,7 +98,7 @@ def compare_segments(seg_a, seg_b, name_a, name_b):
if len(matched_diffs) > 10:
print(f"\n ... 还有 {len(matched_diffs) - 10} 处差异")
else:
print(f" ✓ 无显著文本差异")
print(" ✓ 无显著文本差异")
return {
'segments_diff': len(seg_a) - len(seg_b),
@@ -122,10 +122,10 @@ def main():
# 方案基本信息
print("【测试方案】")
print(f" 方案A: faster-whisper small CPU")
print(f" 方案B: OpenAI whisper small CPU")
print(f" 方案D: OpenAI whisper medium CPU")
print(f" 方案C/E: MPS失败不支持")
print(" 方案A: faster-whisper small CPU")
print(" 方案B: OpenAI whisper small CPU")
print(" 方案D: OpenAI whisper medium CPU")
print(" 方案C/E: MPS失败不支持")
print()
# 三组对比
@@ -142,16 +142,16 @@ def main():
print("="*60)
print("\n【Segments数量】")
print(f" 方案A: 77 segments (最多)")
print(f" 方案B: 74 segments")
print(f" 方案D: 74 segments")
print(f" 结论: faster-whisper分割更细+3 segments")
print(" 方案A: 77 segments (最多)")
print(" 方案B: 74 segments")
print(" 方案D: 74 segments")
print(" 结论: faster-whisper分割更细+3 segments")
print("\n【文本相似度】")
print(f" A vs B: {results['A_vs_B']['similarity']*100:.1f}%")
print(f" A vs D: {results['A_vs_D']['similarity']*100:.1f}%")
print(f" B vs D: {results['B_vs_D']['similarity']*100:.1f}%")
print(f" 结论: 三个方案文本高度相似")
print(" 结论: 三个方案文本高度相似")
print("\n【文本差异统计】")
print(f" A vs B: {results['A_vs_B']['text_diffs']}处差异")
@@ -159,9 +159,9 @@ def main():
print(f" B vs D: {results['B_vs_D']['text_diffs']}处差异")
print("\n【方案Dmediumvs 方案Bsmall")
print(f" Segments数量相同: 74条")
print(" Segments数量相同: 74条")
print(f" 文本相似度: {results['B_vs_D']['similarity']*100:.1f}%")
print(f" 结论: medium模型无明显提升")
print(" 结论: medium模型无明显提升")
print()
print("="*60)

View File

@@ -0,0 +1,131 @@
#!/opt/homebrew/bin/python3.11
"""
POC: Compare silence-based segmentation vs CUT-based segmentation for ASR.
Tests a short video segment and reports:
1. Number of segments from each method
2. Segment boundaries
3. ASR quality comparison (WER estimate)
"""
import json
import os
import sys
import subprocess
import tempfile
import time
from faster_whisper import WhisperModel
# Input video: first command-line argument when given, otherwise a hard-coded local test file.
VIDEO_PATH = sys.argv[1] if len(sys.argv) > 1 else "/Users/accusys/test_video/Old_Time_Movie_Show_-_Charade_1963.HD.mov"
DURATION = 300 # Test first 5 minutes only
# Whisper "small" model on CPU with int8 compute; created once at module level and used by transcribe().
model = WhisperModel("small", device="cpu", compute_type="int8")
def extract_audio_segment(start, end, out_wav):
    """Extract the audio between *start* and *end* seconds of VIDEO_PATH into *out_wav*.

    The audio is resampled to 16 kHz mono by ffmpeg. Any existing file at
    *out_wav* is overwritten (``-y``).

    Args:
        start: Segment start, in seconds.
        end: Segment end, in seconds.
        out_wav: Path of the WAV file to write.

    Returns:
        True when ffmpeg left a file larger than 100 bytes at *out_wav*;
        False when the file is missing or too small to hold usable audio.
    """
    cmd = ["ffmpeg", "-y", "-v", "quiet", "-i", VIDEO_PATH,
           "-ss", str(start), "-to", str(end),
           "-ar", "16000", "-ac", "1", out_wav]
    # check=False: success is judged from the output file, not the exit code.
    subprocess.run(cmd, check=False, capture_output=True)
    try:
        return os.path.getsize(out_wav) > 100
    except OSError:
        # ffmpeg failed before creating the file; report failure instead of
        # crashing so callers can skip this segment.
        return False
def transcribe(wav_path):
    """Transcribe one WAV file with the module-level Whisper model.

    VAD filtering is enabled with a 500 ms minimum silence and 200 ms of
    speech padding; decoding uses a beam size of 5.

    Returns:
        A ``(segments, info)`` tuple, where ``segments`` is the model's
        segment output materialised into a list.
    """
    vad_options = {"min_silence_duration_ms": 500, "speech_pad_ms": 200}
    segment_iter, info = model.transcribe(
        wav_path,
        beam_size=5,
        vad_filter=True,
        vad_parameters=vad_options,
    )
    return list(segment_iter), info
# === Method 1: CUT-based segmentation ===
# Transcribe each scene listed in the pre-computed cut JSON and collect the
# resulting ASR segments with timestamps relative to the whole video.
print("=" * 60)
print("METHOD 1: CUT-based segmentation")
print("=" * 60)

cut_path = "/Users/accusys/momentry/output_dev/417a7e93860d70c87aee6c4c1b715d70.cut.json"
cut_scenes = []
if os.path.exists(cut_path):
    with open(cut_path, encoding="utf-8") as f:
        data = json.load(f)
    # Keep scenes that start inside the test window and clamp their end to
    # DURATION, so this method covers the same audio as the silence-based one.
    cut_scenes = [
        (s["start_time"], min(s["end_time"], DURATION))
        for s in data.get("scenes", [])
        if s["start_time"] < DURATION
    ]
# Printed even when the cut file is missing, so an empty run is visible.
print(f" Scenes in first {DURATION}s: {len(cut_scenes)}")

# Scratch directory shared with the silence-based method; removed at the end of the script.
tmpdir = tempfile.mkdtemp(prefix="seg_compare_")
t1 = time.time()
cut_segments = []
total_chars = 0
for idx, (st, et) in enumerate(cut_scenes):
    wav = os.path.join(tmpdir, f"cut_{idx:04d}.wav")
    if not extract_audio_segment(st, et, wav):
        continue  # extraction failed or produced no usable audio
    segs, _ = transcribe(wav)
    for s in segs:
        # Segment times are relative to the extracted clip; shift by the scene start.
        cut_segments.append({"start": st + s.start, "end": st + s.end, "text": s.text})
        total_chars += len(s.text)
cut_time = time.time() - t1
print(f" Segments: {len(cut_segments)}, Total chars: {total_chars}, Time: {cut_time:.1f}s")
if cut_segments:
    # Mean length of the recognised segments themselves (not DURATION / count).
    avg_duration = sum(seg["end"] - seg["start"] for seg in cut_segments) / len(cut_segments)
    print(f" Avg segment duration: {avg_duration:.1f}s")
# === Method 2: Silence-based segmentation (ffmpeg silencedetect) ===
# Detect silent stretches in the first DURATION seconds, treat the gaps
# between them as speech, and transcribe each gap separately.
print()
print("=" * 60)
print("METHOD 2: Silence-based segmentation (ffmpeg silencedetect)")
print("=" * 60)

# Extract full 5min audio
full_wav = os.path.join(tmpdir, "full_audio.wav")
extract_audio_segment(0, DURATION, full_wav)

# Use ffmpeg silencedetect to find speech segments
t2 = time.time()
detect_cmd = ["ffmpeg", "-i", full_wav, "-af", "silencedetect=noise=-30dB:d=0.5", "-f", "null", "-"]
# errors="replace": ffmpeg's log may contain bytes that are not valid text in
# the locale encoding; they are irrelevant to the markers parsed below.
result = subprocess.run(detect_cmd, capture_output=True, text=True, errors="replace")
stderr = result.stderr

# Parse silencedetect output (it is written to stderr)
silence_starts = []
silence_ends = []
for line in stderr.split("\n"):
    if "silence_start:" in line:
        silence_starts.append(float(line.split("silence_start:")[1].strip()))
    elif "silence_end:" in line:
        # The end marker is followed by "| silence_duration: ..." on the same line.
        silence_ends.append(float(line.split("silence_end:")[1].split("|")[0].strip()))

# Build speech segments: gaps between silence periods
speech_segments = []
last_end = 0.0
for idx, ss in enumerate(silence_starts):
    # Ignore gaps of 0.5 s or less between consecutive silences.
    if ss > last_end + 0.5:
        speech_segments.append((last_end, ss))
    if idx < len(silence_ends):
        last_end = silence_ends[idx]
    else:
        # A silence_start without a matching silence_end means the silence was
        # still open when the stream ended; it must not be counted as speech.
        last_end = DURATION
if last_end < DURATION:
    speech_segments.append((last_end, DURATION))

print(f" Silence periods detected: {len(silence_starts)}")
print(f" Speech segments: {len(speech_segments)}")

# Transcribe each speech segment
silence_segments = []
total_chars2 = 0
for idx, (st, et) in enumerate(speech_segments):
    wav = os.path.join(tmpdir, f"sil_{idx:04d}.wav")
    if not extract_audio_segment(st, et, wav):
        continue  # extraction failed or produced no usable audio
    segs, _ = transcribe(wav)
    for s in segs:
        # Segment times are relative to the extracted clip; shift by the gap start.
        silence_segments.append({"start": st + s.start, "end": st + s.end, "text": s.text})
        total_chars2 += len(s.text)
silence_time = time.time() - t2
print(f" Segments: {len(silence_segments)}, Total chars: {total_chars2}, Time: {silence_time:.1f}s")
# === Comparison ===
# Side-by-side table of both methods, followed by scratch-directory cleanup.
heavy_rule = "=" * 60
print()
print(heavy_rule)
print("COMPARISON")
print(heavy_rule)
print("{:<30} {:<15} {:<15}".format("Metric", "CUT-based", "Silence-based"))
print("-" * 60)
# Integer-valued rows share one format; the timing row needs one decimal place.
count_rows = [
    ("Number of audio segments", len(cut_scenes), len(speech_segments)),
    ("Number of ASR segments", len(cut_segments), len(silence_segments)),
    ("Total chars recognized", total_chars, total_chars2),
]
for label, cut_value, silence_value in count_rows:
    print(f"{label:<30} {cut_value:<15} {silence_value:<15}")
print(f"{'Processing time (s)':<30} {cut_time:<15.1f} {silence_time:<15.1f}")

# Cleanup
import shutil
shutil.rmtree(tmpdir, ignore_errors=True)

print()
print("Done.")

View File

@@ -13,7 +13,6 @@ OUTPUT_DIR = f"output/{UUID}/florence2_results"
# These are placeholders - I need to re-run to get the exact boxes if they weren't printed.
# Since I saw the logs, I know it found them.
# But I need the exact coordinates. Let's run a detection script that crops them immediately.
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

View File

@@ -6,7 +6,6 @@ Crop the detected stamp from the 112:36 frame (with Patch).
from PIL import Image
import os
import cv2
import torch
import types
from transformers import AutoProcessor, AutoModelForCausalLM

View File

@@ -140,7 +140,7 @@ def main():
video_stream = next((s for s in video_info["streams"] if s["codec_type"] == "video"), None)
print(f"\n测试视频:")
print("\n测试视频:")
print(f" 文件: {int(video_info['format'].get('size', 0)) / 1024 / 1024:.1f} MB")
print(f" 时长: {float(video_info['format'].get('duration', 0)):.1f}")
print(f" 分辨率: {video_stream.get('width', 0)}x{video_stream.get('height', 0)}")
@@ -188,7 +188,7 @@ def main():
"file_size_kb": result["file_size_kb"],
})
print(f"\n✅ 处理完成:")
print("\n✅ 处理完成:")
print(f" 时间: {result['elapsed_time']:.2f}")
print(f" 内存峰值: {result['peak_memory_mb']:.1f} MB")
print(f" 检测场景数: {result['total_scenes']}")
@@ -223,7 +223,7 @@ def main():
print(f"{'=' * 80}")
print("\n【对比总结】")
print(f"\n| 方案 | 脚本 | 时间(秒) | 内存(MB) | 场景数 | 平均时长(秒) |")
print("\n| 方案 | 脚本 | 时间(秒) | 内存(MB) | 场景数 | 平均时长(秒) |")
print("|------|------|---------|---------|--------|-------------|")
for r in results:

View File

@@ -4,7 +4,6 @@ Debug script to test face registration with same arguments Rust uses
"""
import subprocess
import sys
import os
# Simulate what Rust would call

View File

@@ -7,7 +7,6 @@ Deep Analysis of 112:36 Frame
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
@@ -149,7 +148,7 @@ try:
2,
)
else:
print(f" ❌ Not found.")
print(" ❌ Not found.")
except Exception as e:
print(f" ⚠️ Error: {e}")

View File

@@ -4,7 +4,6 @@ Momentry Core Visual Demo Dashboard
職責:提供處理器模組的視覺化預覽,支持時間軸檢查與多模組疊加顯示。
"""
import sys
import os
import json
import cv2

View File

@@ -6,7 +6,6 @@ Demonstrate face learning capability
import json
import os
import sys
import numpy as np
from pathlib import Path
# Add script directory to path

View File

@@ -8,7 +8,7 @@
import sys
import json
import argparse
from typing import Dict, List, Optional, Tuple
from typing import Dict, Tuple
import re
# 簡單的語言檢測規則(可擴展)

View File

@@ -5,7 +5,6 @@ Detect and Crop Envelopes/Objects in Keyframes
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

View File

@@ -7,7 +7,6 @@ Export Person Thumbnails
import cv2
import json
import os
import sys
# 設定
OUTPUT_DIR = "output/quick_preview"

View File

@@ -4,8 +4,6 @@
"""
import cv2
import numpy as np
import json
import os
from datetime import datetime
@@ -247,7 +245,7 @@ def create_female_faces_report(female_frames_info, output_dir="/tmp/female_faces
f"- `{os.path.basename(info['thumbnail'])}` - 縮略圖800px寬\n"
)
f.write(f"- `female_faces_report.md` - 本報告文件\n\n")
f.write("- `female_faces_report.md` - 本報告文件\n\n")
f.write("## 🔍 分析說明\n\n")
f.write("1. **邊界框顏色**: 粉色 (RGB: 255,105,180) 表示女性人臉\n")
@@ -332,20 +330,20 @@ def main():
info for info in female_frames_info if info["female_count"] == max_females
][0]
print(f"📊 統計摘要:")
print("📊 統計摘要:")
print(f" - 總分析畫面: {len(female_frames_info)}")
print(f" - 女性最多畫面: 幀 {max_frame_info['frame_number']}")
print(f" - 女性數量: {max_females}")
print(f" - 時間位置: {max_frame_info['timestamp_formatted']}")
print()
print(f"📁 生成文件:")
print("📁 生成文件:")
print(f" - 標記圖像: {output_dir}/female_faces_frame_*.jpg")
print(f" - 縮略圖: {output_dir}/female_faces_frame_*_thumbnail.jpg")
print(f" - 分析報告: {report_path}")
print()
print(f"🔍 查看結果:")
print("🔍 查看結果:")
print(f" ls -la {output_dir}/")
print(f" open {output_dir}/female_faces_report.md")

View File

@@ -23,7 +23,6 @@ import sys
import json
import time
import subprocess
import shutil
from pathlib import Path
from datetime import datetime
@@ -230,7 +229,7 @@ def main():
sys.exit(1)
video_info = get_video_info(video_path)
print(f"\n测试视频:")
print("\n测试视频:")
print(f" UUID: {video_uuid}")
print(f" 文件: {video_info.get('size_mb', 0):.1f} MB")
print(f" 时长: {video_info.get('duration', 0):.1f}")
@@ -286,7 +285,7 @@ def main():
"has_landmarks": result["has_landmarks"]
})
print(f"\n✅ 处理完成:")
print("\n✅ 处理完成:")
print(f" 时间: {result['elapsed_time']:.2f}")
print(f" 速度: {speed:.2f}x 实时倍速")
print(f" 内存峰值: {result['peak_memory_mb']:.1f} MB")
@@ -324,7 +323,7 @@ def main():
print(f"{'=' * 80}")
print("\n【对比总结】")
print(f"\n| 方案 | 脚本 | 时间(秒) | 速度 | 内存(MB) | 人脸数 | Embedding |")
print("\n| 方案 | 脚本 | 时间(秒) | 速度 | 内存(MB) | 人脸数 | Embedding |")
print("|------|------|---------|------|---------|--------|-----------|")
for r in results:

View File

@@ -5,9 +5,7 @@ Face Detection Count Comparison
"""
import json
import sys
from pathlib import Path
from collections import defaultdict
def load_results(filepath):
"""加载检测结果"""
@@ -172,7 +170,7 @@ def main():
stats = analyze_detection_distribution(results_a, results_b, results_c)
print(f"| 版本 | 总人脸数 | 检测帧数 | 有人脸帧 | 无人脸帧 | 平均每帧 | 最多人脸 |")
print("| 版本 | 总人脸数 | 检测帧数 | 有人脸帧 | 无人脸帧 | 平均每帧 | 最多人脸 |")
print("|------|---------|---------|---------|---------|---------|---------|")
for name, s in stats.items():
@@ -187,14 +185,14 @@ def main():
print(f"共有 {len(comparison)} 帧检测数量不同")
print()
print(f"| 帧号 | 时间(秒) | InsightFace | MediaPipe | OpenCV | 最大差异 |")
print("| 帧号 | 时间(秒) | InsightFace | MediaPipe | OpenCV | 最大差异 |")
print("|------|---------|------------|----------|--------|---------|")
for item in comparison[:30]: # 只显示前30帧
print(f"| {item['frame']} | {item['timestamp']:.2f} | {item['insightface']} | {item['mediapipe']} | {item['opencv']} | {item['diff']} |")
if len(comparison) > 30:
print(f"| ... | ... | ... | ... | ... | ... |")
print("| ... | ... | ... | ... | ... | ... |")
print(f"| 共 {len(comparison)} 帧有差异 |")
print()
@@ -212,7 +210,7 @@ def main():
if mediapipe_missed:
print("MediaPipe漏检详情前10帧:")
print(f"| 帧号 | InsightFace检测 | OpenCV检测 |")
print("| 帧号 | InsightFace检测 | OpenCV检测 |")
print("|------|----------------|-----------|")
for m in mediapipe_missed[:10]:
print(f"| {m['frame']} | {m.get('insightface_count', m.get('others_count', '?'))} | {m.get('opencv_count', '?')} |")
@@ -225,7 +223,7 @@ def main():
print(f"以InsightFace为基准{baseline}张人脸):")
print()
print(f"| 版本 | 检测数 | 检测率 | 漏检数 |")
print("| 版本 | 检测数 | 检测率 | 漏检数 |")
print("|------|--------|--------|--------|")
for name, s in stats.items():

View File

@@ -38,7 +38,7 @@ def extract_face_embeddings(uuid: str, video_path: str):
return {}
# 1. 加載 Face JSON 數據
face_path = os.path.join(OUTPUT_DIR, "quick_preview", f"preview.face.json")
face_path = os.path.join(OUTPUT_DIR, "quick_preview", "preview.face.json")
if not os.path.exists(face_path):
print(f" [Skip] No Face data for {uuid}")
return {}
@@ -119,7 +119,7 @@ def extract_face_embeddings(uuid: str, video_path: str):
)
if result:
embeddings.append(np.array(result[0]["embedding"]))
except Exception as e:
except Exception:
# 忽略無法識別的臉部
pass

View File

@@ -21,7 +21,6 @@ import os
import time
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
from resume_framework import ResumeFramework, format_time, print_progress
from utils.pose_analyzer import calculate_pose_angle_v2
@@ -141,7 +140,7 @@ def process_face(
print(f"\nProcessing video: {total_frames} frames @ {fps:.2f} fps")
print(f"Auto-save every {auto_save_interval}s or {auto_save_frames} frames")
print(f"Resume from frame {frame_count + 1 if resume_mode else 1}")
print(f"Detection method: InsightFace (REQUIRED)")
print("Detection method: InsightFace (REQUIRED)")
print()
while True:
@@ -199,7 +198,7 @@ def process_face(
"pitch": pose_result.get("pitch", "neutral"),
"features": pose_result.get("features", {}),
}
except Exception as e:
except Exception:
pass
face_list.append(
@@ -255,6 +254,45 @@ def process_face(
return face_data
def _convert_to_face_result(face_data: dict) -> dict:
"""Convert ResumeFramework output to FaceResult format expected by Rust."""
metadata = face_data.get("metadata", {})
raw_frames = face_data.get("frames", {})
fps = metadata.get("fps", 30.0)
frames = []
for frame_key in sorted(raw_frames.keys(), key=lambda k: int(k)):
f = raw_frames[frame_key]
faces = []
for raw_face in f.get("faces", []):
pose = raw_face.get("pose_angle")
attributes = raw_face.get("attributes", {})
face = {
"face_id": None,
"x": raw_face["x"],
"y": raw_face["y"],
"width": raw_face["width"],
"height": raw_face["height"],
"confidence": raw_face.get("confidence", 0.0),
"embedding": raw_face.get("embedding"),
"landmarks": raw_face.get("landmarks"),
"attributes": {
"age": attributes.get("age") if attributes else None,
"gender": attributes.get("gender") if attributes else None,
},
}
faces.append(face)
frames.append({
"frame": f["frame_number"],
"timestamp": f["time_seconds"],
"faces": faces,
})
return {
"frame_count": len(frames),
"fps": fps,
"frames": frames,
}
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Face Detection & Demographics with Resume Support")
parser.add_argument("video_path", help="Path to video file")
@@ -285,11 +323,11 @@ if __name__ == "__main__":
"-s",
help="Frame sample interval",
type=int,
default=30,
default=5,
)
args = parser.parse_args()
process_face(
result = process_face(
args.video_path,
args.output_path,
args.uuid,
@@ -297,4 +335,7 @@ if __name__ == "__main__":
args.auto_save_frames,
args.force_restart,
args.sample_interval,
)
)
face_result = _convert_to_face_result(result)
with open(args.output_path, "w") as f:
json.dump(face_result, f, indent=2)

View File

@@ -18,7 +18,7 @@ import os
import signal
import time
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from typing import Dict, List
import cv2
import numpy as np
@@ -108,7 +108,7 @@ class MediaPipeFaceDetector:
print(f"[Face] Using fallback model: {alt_path}")
return alt_path
raise RuntimeError(f"Could not download MediaPipe model from any source")
raise RuntimeError("Could not download MediaPipe model from any source")
return model_path

View File

@@ -9,10 +9,8 @@ import sys
import json
import argparse
import os
import time
import numpy as np
from typing import List, Dict, Any, Optional, Tuple
import uuid
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher

View File

@@ -8,7 +8,6 @@ import sys
import json
import argparse
import os
import numpy as np
import time
from typing import Dict, Any, Optional
@@ -176,7 +175,7 @@ class FaceRegistration:
}
if len(faces) > 1:
print(f"[WARNING] Multiple faces detected, using the first one")
print("[WARNING] Multiple faces detected, using the first one")
# Use the first face
face = faces[0]

View File

@@ -4,7 +4,6 @@
"""
import psycopg2
import json
from datetime import datetime
import sys
@@ -235,7 +234,7 @@ def main():
with open("/tmp/face_statistics_report.txt", "w") as f:
f.write(report)
print(f"\n報告已保存到: /tmp/face_statistics_report.txt")
print("\n報告已保存到: /tmp/face_statistics_report.txt")
except Exception as e:
print(f"❌ 生成報告時出錯: {e}")

View File

@@ -74,7 +74,7 @@ def main():
total_faces = sum(len(faces) for faces in faces_map.values())
print(f"✅ Indexed {len(faces_map)} frames, containing {total_faces} faces.")
print(f"🚀 Starting Linear Video Scan...")
print("🚀 Starting Linear Video Scan...")
# 2. 線性掃描
video_path = VIDEO_PATH # 使用區域變數避免 global 問題
@@ -138,7 +138,7 @@ def main():
face_refs.append(
{"frame_idx": current_frame, "face_idx": face_idx}
)
except Exception as e:
except Exception:
pass
processed_frames += 1

View File

@@ -220,7 +220,7 @@ for sec in range(0, total_sec, FRAME_INTERVAL):
print(
f" 🎯 {sec}s | {term} | {s:.2f} | {int(orig_w)}x{int(orig_h)}px"
)
except Exception as e:
except Exception:
pass
# Save annotated frame if stamps found

View File

@@ -7,7 +7,6 @@
import sys
import os
import subprocess
import time
def run_test(script_name, description):
@@ -50,7 +49,7 @@ def check_server_status():
response = requests.get("http://localhost:3002/health", timeout=5)
if response.status_code == 200:
print(f"✅ 生產服務器運行正常 (端口 3002)")
print("✅ 生產服務器運行正常 (端口 3002)")
return True
else:
print(f"❌ 生產服務器異常: {response.status_code}")
@@ -63,7 +62,7 @@ def check_server_status():
response = requests.get("http://localhost:3003/health", timeout=5)
if response.status_code == 200:
print(f"✅ 開發服務器運行正常 (端口 3003)")
print("✅ 開發服務器運行正常 (端口 3003)")
return True
else:
print(f"❌ 開發服務器異常: {response.status_code}")
@@ -100,7 +99,7 @@ def check_database():
""")
tables = cursor.fetchall()
print(f"✅ 數據庫連接正常")
print("✅ 數據庫連接正常")
print(f"✅ 找到 {len(tables)} 個人臉相關表:")
for table in tables:
print(f" - {table[0]}")

View File

@@ -6,7 +6,6 @@ Heuristic: Kids have a larger head relative to their body height (approx 1:5 or
import json
import math
import sys
# Configuration
POSE_JSON_PATH = "output/384b0ff44aaaa1f1/384b0ff44aaaa1f1.pose.json"
@@ -161,7 +160,7 @@ def find_kids():
# Sort by timestamp
sorted_kids = sorted(unique_kids.values(), key=lambda x: x['timestamp'])
print(f"\nUnique potential kid detections (timestamps):")
print("\nUnique potential kid detections (timestamps):")
for k in sorted_kids:
print(f" -> Timestamp: {k['timestamp']:.2f}s | Ratio: {k['ratio']}")

View File

@@ -8,7 +8,6 @@ Filters:
import json
import math
import sys
import os
POSE_JSON_PATH = "output/384b0ff44aaaa1f1/384b0ff44aaaa1f1.pose.json"
@@ -133,7 +132,7 @@ def find_kids():
sorted_kids = sorted(unique_kids.values(), key=lambda x: x["timestamp"])
print(f"\nRefined Timestamps:")
print("\nRefined Timestamps:")
for k in sorted_kids:
print(
f" ⏱️ {k['timestamp']:.2f}s | Ratio: {k['ratio']} | Width: {k['shoulder_width']}px | Conf: {k['confidence']}"

View File

@@ -5,7 +5,6 @@ Search for magnifying glass in key stamp scenes using OWL-ViT
import os
import cv2
import json
from PIL import Image
import torch
from transformers import OwlViTProcessor, OwlViTForObjectDetection

View File

@@ -17,7 +17,7 @@ os.makedirs(OUTPUT_DIR, exist_ok=True)
# Scan frames at 5-minute intervals throughout the 2-hour video
TIMESTAMPS = list(range(0, 6879, 300)) # Every 5 minutes
print(f"📽️ Loading Florence-2 model...")
print("📽️ Loading Florence-2 model...")
processor = AutoProcessor.from_pretrained(
"microsoft/Florence-2-base", trust_remote_code=True
)

View File

@@ -148,7 +148,7 @@ def generate_summary_report():
fastest_scheme = fastest.get('file_info', {}).get('scheme_id', 'unknown')
fastest_time = fastest.get('metrics', {}).get('processing_time_seconds', 0)
lines.append(f"### Performance Comparison")
lines.append("### Performance Comparison")
lines.append("")
lines.append(f"- **Fastest Scheme**: {fastest_scheme} ({fastest_time:.1f}s)")
@@ -169,7 +169,7 @@ def generate_summary_report():
lines.append("")
if failed_tests:
lines.append(f"### Failed Tests")
lines.append("### Failed Tests")
lines.append("")
for result in failed_tests:
scheme_id = result.get('file_info', {}).get('scheme_id', 'unknown')
@@ -178,8 +178,8 @@ def generate_summary_report():
if 'MPS' in error_msg:
lines.append(f"- **{scheme_id} ({scheme_name})**: MPS backend compatibility issue")
lines.append(f" - PyTorch SparseMPS backend does not support `_sparse_coo_tensor_with_dims_and_tensors`")
lines.append(f" - OpenAI whisper requires this operation for MPS device")
lines.append(" - PyTorch SparseMPS backend does not support `_sparse_coo_tensor_with_dims_and_tensors`")
lines.append(" - OpenAI whisper requires this operation for MPS device")
lines.append("")

View File

@@ -252,7 +252,6 @@ Summary: [2-3 sentence detailed summary connecting to scene]"""
def parse_5w1h_summary(result_text):
"""Parse 5W1H and summary from LLM response"""
import re
data = {
"who": "",
@@ -314,7 +313,6 @@ def update_chunk_summary(
uuid=None,
):
"""Update chunk summary, 5W1H, identity, and visual in database"""
import json
conn = psycopg2.connect(**DB_CONFIG)
cur = conn.cursor()

View File

@@ -203,7 +203,7 @@ def main():
)
# Step 3: Generate summaries and insert
print(f"\n🤖 Generating summaries with gemma4...")
print("\n🤖 Generating summaries with gemma4...")
inserted = insert_parent_chunks(scenes)
print(f"\n{'=' * 70}")

View File

@@ -100,7 +100,7 @@ def check_server_health(api_url: str) -> bool:
except requests.exceptions.ConnectionError:
print(f"❌ Cannot connect to llama.cpp server at {api_url}")
except requests.exceptions.Timeout:
print(f"❌ Connection to llama.cpp server timed out")
print("❌ Connection to llama.cpp server timed out")
return False
@@ -282,7 +282,7 @@ def main():
# Check server health
if not check_server_health(args.url):
print("\n💡 Start llama.cpp server with:")
print(f" llama-server --model <gemma4.gguf> --port 8081")
print(" llama-server --model <gemma4.gguf> --port 8081")
sys.exit(1)
# Prepare seed words

View File

@@ -172,7 +172,7 @@ for idx, (sec, frame) in enumerate(candidate_frames):
)
print(f" 🎯 {sec}s | {term} | {s:.2f} | {bw}x{bh}px")
except Exception as e:
except Exception:
pass
if found:

View File

@@ -20,7 +20,6 @@ import argparse
import os
import numpy as np
from typing import Dict, List, Optional, Tuple
from datetime import datetime
from sklearn.metrics.pairwise import cosine_similarity
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

View File

@@ -7,7 +7,6 @@ Face + ASRX 整合處理器
import sys
import json
import argparse
import os
from pathlib import Path
from datetime import datetime
@@ -194,7 +193,7 @@ def integrate_face_asrx(face_path, asrx_path, output_path, time_threshold=1.0):
f" With face: {speaker['with_face']} ({speaker['with_face'] / speaker['segment_count'] * 100:.0f}%)"
)
print(f"\n[Face-ASRX] Integration complete!")
print("\n[Face-ASRX] Integration complete!")
def main():

View File

@@ -15,13 +15,10 @@ Output:
- Integrated action data with all body parts
"""
import sys
import json
import argparse
import numpy as np
from typing import Dict, List
from collections import defaultdict
from pathlib import Path
class IntegratedBodyActionDecoder:

View File

@@ -297,10 +297,10 @@ def main():
print(f" 檔案路徑: {result['file_path']}")
print(f" 檔案存在: {result['file_exists']}")
if result.get("fallback_used"):
print(f" 使用了回退: 是")
print(" 使用了回退: 是")
print(f" 回退原因: {result.get('fallback_reason', '未知')}")
else:
print(f" 使用了回退: 否")
print(" 使用了回退: 否")
print(f" 可用語言: {', '.join(result['available_languages'])}")
else:
if result["file_exists"]:

View File

@@ -235,7 +235,7 @@ def process_lip(
)
else:
landmarks = None
except Exception as e:
except Exception:
landmarks = None
if landmarks is not None and len(landmarks) >= 468:

View File

@@ -10,7 +10,6 @@ import argparse
import os
import signal
import cv2
import numpy as np
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher

View File

@@ -10,7 +10,6 @@ import argparse
import os
import signal
import cv2
import numpy as np
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher

View File

@@ -6,7 +6,6 @@ Extracts frames at 1fps around key dialogue moments for thorough analysis.
import cv2
import os
import subprocess
UUID = "384b0ff44aaaa1f1"
VIDEO_PATH = f"output/{UUID}/{UUID}.mp4"

View File

@@ -14,7 +14,6 @@ Usage:
import json
import argparse
import numpy as np
from datetime import datetime
import psycopg2
import os
@@ -313,7 +312,7 @@ def analyze_match_results(results):
print(f" Is Match: {r['is_match']}")
if r['strategy'] == 'combined':
print(f" Details:")
print(" Details:")
print(f" Best Match: {r['best_match']:.4f}")
print(f" Vote Ratio: {r['vote_ratio']:.2%}")
print(f" Weighted Sim: {r['weighted_sim']:.4f}")
@@ -408,7 +407,7 @@ def main():
print("❌ No embedding in first face")
return
print(f"\n🔧 Matching first face...")
print("\n🔧 Matching first face...")
match_result = match_face_to_identity(
detected_embedding=embedding,
identity_uuid=identity_uuid,
@@ -419,7 +418,7 @@ def main():
)
if match_result:
print(f"\n✅ Match Result:")
print("\n✅ Match Result:")
print(f" Identity: {match_result['identity_name']}")
print(f" Strategy: {match_result['strategy']}")
print(f" Is Match: {match_result['is_match']}")

View File

@@ -19,7 +19,6 @@ Usage:
import json
import argparse
import numpy as np
from datetime import datetime
import psycopg2
import os
import sys
@@ -424,7 +423,7 @@ def analyze_pose_match_results(results):
for angle, threshold in adaptive_thresholds_used.items():
print(f"{angle}: {threshold:.2f}")
print(f"\n=== Angle Match Types ===")
print("\n=== Angle Match Types ===")
print(f"{angle_match_types}")
# Top 5 details
@@ -528,7 +527,7 @@ def main():
pose_features = match_result.get("pose_features", {})
ratio_str = f"{pose_ratio:.3f}" if pose_ratio else f"{pose_features.get('nose_to_eye_ratio', 'N/A')}"
print(f"\n✅ Result:")
print("\n✅ Result:")
print(f" Pose: {match_result['pose_angle']} (ratio: {ratio_str})")
print(f" Similarity: {match_result['best_similarity']:.4f}")
print(f" Match: {match_result['is_match']}")

View File

@@ -43,15 +43,12 @@ Output structure:
}
"""
import sys
import json
import argparse
import cv2
import numpy as np
import mediapipe as mp
from pathlib import Path
from typing import Dict, List, Optional
from collections import defaultdict
from typing import Dict
class MediaPipeHolisticProcessor:

View File

@@ -150,7 +150,7 @@ def migrate_results():
migrated_count += 1
print(f" ✅ Migrated {total_faces} faces")
else:
print(f" ⚠️ Already exists, skipping")
print(" ⚠️ Already exists, skipping")
# Commit changes
conn.commit()
@@ -193,7 +193,7 @@ def test_api_after_migration():
if response.status_code == 200:
data = response.json()
print(f"✅ Success!")
print("✅ Success!")
print(f"Video UUID: {data.get('video_uuid')}")
print(f"Total faces: {data.get('total_faces')}")
print(f"Processing time: {data.get('processing_time_secs')}s")
@@ -203,7 +203,7 @@ def test_api_after_migration():
if isinstance(result_data, str):
result_data = json.loads(result_data)
print(f"\n📊 Detailed results:")
print("\n📊 Detailed results:")
print(f" Frames with faces: {result_data.get('frames_with_faces')}")
gender_dist = result_data.get("gender_distribution", {})

View File

@@ -9,9 +9,7 @@ Stage 3: Filter and rank results
import os
import cv2
import json
import glob
import time
import numpy as np
from PIL import Image
import torch
from transformers import OwlViTProcessor, OwlViTForObjectDetection
@@ -123,7 +121,7 @@ for idx, (sec, frame_path) in enumerate(frames_to_process):
],
}
)
except Exception as e:
except Exception:
pass
if not containers:
@@ -226,7 +224,7 @@ for idx, (sec, frame_path) in enumerate(frames_to_process):
print(
f" 🎯 {sec}s | {stamp_term} | {s:.2f} | {int(orig_w)}x{int(orig_h)}px"
)
except Exception as e:
except Exception:
pass
# ─── Stage 3: Filter and rank ───

View File

@@ -8,7 +8,6 @@ import librosa
import numpy as np
import os
import json
import matplotlib.pyplot as plt # Only for debug if needed, but we stick to console for now
# 設定
UUID = os.getenv("UUID", "384b0ff44aaaa1f1")
@@ -29,7 +28,7 @@ def analyze_music_segmentation(audio_path):
hop_length = int(1.0 * sr)
chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
print(f"📊 Analyzing transitions...")
print("📊 Analyzing transitions...")
# 2. 計算自我相似度矩陣 (Self-Similarity Matrix) - 優化版
# 這裡我們簡化為計算相鄰片段的餘弦距離 (Cosine Distance)
@@ -45,7 +44,6 @@ def analyze_music_segmentation(audio_path):
# 使用 librosa 的 onset_strength 的變體,但針對 Chroma
# 這裡手動計算 Cosine Distance 以確保準確度
from sklearn.metrics.pairwise import cosine_similarity
# 為了效能,我們不逐一計算,而是使用向量化的方法
# 計算 frame[t] 和 frame[t+lag] 的差異
@@ -127,12 +125,12 @@ if __name__ == "__main__":
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
json.dump({"music_segments": segments}, f, indent=2, ensure_ascii=False)
print(f"\n🎉 Analysis Complete!")
print("\n🎉 Analysis Complete!")
print(f"✅ Identified {len(segments)} music-based scenes.")
print(f"💾 Saved to {OUTPUT_JSON}")
# 顯示結果
print(f"\n🎶 Top Music Segments:")
print("\n🎶 Top Music Segments:")
for i, seg in enumerate(segments[:20]):
m_s, s_s = divmod(seg["start_time"], 60)
print(f" {i + 1:02d}. [{int(m_s):02d}:{s_s:05.2f}] - {seg['duration']}s")

View File

@@ -173,7 +173,7 @@ def main():
video_stream = next((s for s in video_info["streams"] if s["codec_type"] == "video"), None)
print(f"\n测试视频:")
print("\n测试视频:")
print(f" 文件: {float(video_info['format'].get('size', 0)) / 1024 / 1024:.1f} MB")
print(f" 时长: {float(video_info['format'].get('duration', 0)):.1f}")
print(f" 分辨率: {video_stream.get('width', 0)}x{video_stream.get('height', 0)}")
@@ -229,7 +229,7 @@ def main():
"file_size_kb": result["file_size_kb"],
})
print(f"\n✅ 处理完成:")
print("\n✅ 处理完成:")
print(f" 时间: {result['elapsed_time']:.2f}")
print(f" 内存峰值: {result['peak_memory_mb']:.1f} MB")
print(f" 检测帧数: {result['total_frames']}")
@@ -266,7 +266,7 @@ def main():
print(f"{'=' * 80}")
print("\n【对比总结】")
print(f"\n| 方案 | 脚本 | 语言 | 时间(秒) | 内存(MB) | 帧数 | 文字数 | 置信度 | 空帧率 |")
print("\n| 方案 | 脚本 | 语言 | 时间(秒) | 内存(MB) | 帧数 | 文字数 | 置信度 | 空帧率 |")
print("|------|------|------|---------|---------|------|--------|--------|--------|")
for r in results:

View File

@@ -13,12 +13,9 @@ import sys
import json
import argparse
import os
import signal
import time
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
from resume_framework import ResumeFramework, format_time, print_progress

View File

@@ -18,7 +18,7 @@ import os
import signal
import time
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from typing import Dict, List
import cv2
import numpy as np

View File

@@ -17,10 +17,8 @@ import json
import argparse
import os
import time
from datetime import datetime
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
from resume_framework import ResumeFramework, format_time, print_progress

View File

@@ -17,10 +17,9 @@ import os
import signal
import time
from datetime import datetime
from typing import Dict, List, Optional, Tuple
from typing import Dict
import cv2
import numpy as np
import torch
from ultralytics import YOLO

View File

@@ -5,14 +5,13 @@ Refined Search for "Postage Stamp" in the Image
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
UUID = "384b0ff44aaaa1f1"
OUTPUT_DIR = f"output/{UUID}/florence2_results"
INPUT_IMG = os.path.join(OUTPUT_DIR, f"raw_6846.jpg")
INPUT_IMG = os.path.join(OUTPUT_DIR, "raw_6846.jpg")
# Patch for compatibility (Required for this environment)

View File

@@ -185,7 +185,7 @@ def main():
if update_parent_chunk(parent, analysis):
success_count += 1
else:
print(f" ❌ Failed to generate analysis")
print(" ❌ Failed to generate analysis")
print(f"\n{'=' * 70}")
print(

View File

@@ -4,8 +4,6 @@ Register sample faces to test the face recognition system
"""
import requests
import json
import base64
import os
# API configuration

View File

@@ -41,7 +41,7 @@ import json
import signal
import time
from datetime import datetime
from typing import Dict, Optional, Tuple, Any, Callable
from typing import Dict, Optional, Tuple, Callable
class ResumeFramework:

View File

@@ -5,7 +5,6 @@ Scan Multiple Frames for Stamps
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

View File

@@ -6,7 +6,6 @@ Batch Scan Keyframes for SMALL red stamps
import cv2
import numpy as np
import os
import json
UUID = "384b0ff44aaaa1f1"
BASE_DIR = f"output/{UUID}/florence2_results"
@@ -93,4 +92,4 @@ for frame_name in FRAMES:
res_name = f"result_opencv_{frame_name}"
cv2.imwrite(os.path.join(BASE_DIR, res_name), img)
else:
print(f" ❌ No small stamps found.")
print(" ❌ No small stamps found.")

View File

@@ -230,7 +230,7 @@ class SceneClassifier:
print("[SCENE] Places365 model loaded successfully (365 classes)")
else:
print(
f"[SCENE] Places365 model not found, using ImageNet pretrained"
"[SCENE] Places365 model not found, using ImageNet pretrained"
)
self.model = models.resnet18(pretrained=True)
self.model_type = "imagenet"

View File

@@ -85,7 +85,7 @@ for frame_name in FRAMES:
cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 3)
cv2.putText(
img,
f"BLUE STAMP?",
"BLUE STAMP?",
(x, y - 10),
cv2.FONT_HERSHEY_SIMPLEX,
0.6,

View File

@@ -5,7 +5,6 @@ Search for Envelope/Stamp in Keyframes
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

View File

@@ -5,7 +5,6 @@ Search for "vase" in the video using OWL-ViT on a subset of frames.
import os
import cv2
import json
import glob
from PIL import Image
import torch

View File

@@ -15,7 +15,6 @@ Usage:
import json
import argparse
import numpy as np
from pathlib import Path
from datetime import datetime
import psycopg2
import os
@@ -313,10 +312,10 @@ def main():
)
if uuid:
print(f"\n🎉 Registration completed!")
print("\n🎉 Registration completed!")
else:
print(f"\n📊 Analysis only (no registration)")
print(f" To register, run with --register flag")
print("\n📊 Analysis only (no registration)")
print(" To register, run with --register flag")
if __name__ == "__main__":

View File

@@ -443,7 +443,7 @@ def main():
print_selection_report(angle_groups, selected, coverage_report)
if not args.report_only and args.register and args.identity_name:
print(f"\n🔧 Step 5: Registering Identity...")
print("\n🔧 Step 5: Registering Identity...")
reference_data = build_reference_data_structure(selected, args.video_uuid)
@@ -454,14 +454,14 @@ def main():
)
if uuid:
print(f"\n✅ Registration completed!")
print("\n✅ Registration completed!")
print(f" UUID: {uuid}")
print(f" Name: {args.identity_name}")
print(f" Angles: {coverage_report['angles_covered']}")
print(f" Total vectors: {coverage_report['total_references']}")
print(f" Quality avg: {coverage_report['quality_avg']:.2f}")
elif args.report_only:
print(f"\n📊 Report only (no registration)")
print("\n📊 Report only (no registration)")
if __name__ == "__main__":

View File

@@ -329,7 +329,7 @@ def main():
print("Please run face_tracker.py first")
return
print(f"\n=== Available Traces ===")
print("\n=== Available Traces ===")
for trace_id_str, trace in sorted(traces.items(), key=lambda x: int(x[0])):
print(f"Trace {trace_id_str}:")
print(f" Frames: {trace['start_frame']}-{trace['end_frame']} ({trace['duration_frames']} frames)")
@@ -364,7 +364,7 @@ def main():
# Filter faces by trace
filtered_face_data = filter_faces_by_trace(face_data, trace_id_filter)
print(f"\n=== Filtering Faces ===")
print("\n=== Filtering Faces ===")
print(f"Original frames: {len(face_data.get('frames', {}))}")
print(f"Filtered frames: {len(filtered_face_data.get('frames', {}))}")
@@ -379,7 +379,7 @@ def main():
print("❌ No reference vectors selected")
return
print(f"\n=== Selected Reference Vectors ===")
print("\n=== Selected Reference Vectors ===")
print(f"Total: {len(selected_vectors)}")
angle_distribution = defaultdict(int)
@@ -390,7 +390,7 @@ def main():
print(f"Distribution: {dict(angle_distribution)}")
print(f"Quality avg: {np.mean([v['quality_score'] for v in selected_vectors]):.3f}")
print(f"\n=== Vector Details ===")
print("\n=== Vector Details ===")
for i, v in enumerate(selected_vectors[:10]):
print(f"Vector {i+1}:")
print(f" Angle: {v['pose_angle']} (confidence: {v['pose_confidence']:.2f})")
@@ -404,7 +404,7 @@ def main():
return
if args.register and args.identity_name:
print(f"\n=== Registering Identity ===")
print("\n=== Registering Identity ===")
identity_uuid = register_identity_with_trace(
identity_name=args.identity_name,
@@ -416,7 +416,7 @@ def main():
)
if identity_uuid:
print(f"\n✅ Registration completed!")
print("\n✅ Registration completed!")
print(f" UUID: {identity_uuid}")
print(f" Name: {args.identity_name}")
print(f" Trace ID: {trace_id_filter}")

View File

@@ -34,7 +34,7 @@ def test_endpoint(endpoint, method="GET", data=None):
print(f"Headers: {dict(response.headers)}")
if response.status_code == 200:
print(f"✅ Success!")
print("✅ Success!")
if response.text:
print(f"Response (first 500 chars): {response.text[:500]}")
return True

View File

@@ -93,14 +93,14 @@ def main():
# 直接回答問題
print("📝 問題回答:")
print("-" * 40)
print(f"Q: 這兩個影片內有幾個人?")
print("Q: 這兩個影片內有幾個人?")
print(f"A: 總共檢測到 {total_faces} 個人臉")
print()
print(f"Q: 幾男幾女?")
print("Q: 幾男幾女?")
print(f"A: 男性 {male_count} 人 ({male_count / total_faces * 100:.1f}%)")
print(f" 女性 {female_count} 人 ({female_count / total_faces * 100:.1f}%)")
print()
print(f"Q: 平均年齡?")
print("Q: 平均年齡?")
print(f"A: 平均 {avg_age} 歲 (範圍: {min_age}-{max_age}歲)")
print()
print("=" * 60)

View File

@@ -26,7 +26,7 @@ def detect_impulse_sounds(audio_path, threshold_multiplier=1.5):
# 載入音頻 (Mono, 22050Hz)
y, sr = librosa.load(audio_path, sr=22050)
print(f"📊 Analyzing energy envelope...")
print("📊 Analyzing energy envelope...")
# 1. 計算 RMS 能量 (以 0.05秒 為一幀)
frame_length = int(0.05 * sr)
hop_length = int(0.02 * sr)

View File

@@ -5,14 +5,13 @@ Search for Specific Stamps in the Image (Avoiding Watermark)
import os
import cv2
import torch
import types
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM
UUID = "384b0ff44aaaa1f1"
OUTPUT_DIR = f"output/{UUID}/florence2_results"
INPUT_IMG = os.path.join(OUTPUT_DIR, f"raw_6846.jpg")
INPUT_IMG = os.path.join(OUTPUT_DIR, "raw_6846.jpg")
# Patch for compatibility

View File

@@ -0,0 +1 @@
7861

Some files were not shown because too many files have changed in this diff Show More