fix: restore identity_id after face_dedup, rebuild package v20260512

- Re-ran identity_bind.py to restore identity_id on face_detections
- Dedup cleanup had removed rows with identity_id, kept NULL rows
- 70691 face_detections now have identity_id, 428 identities
- Full package rebuild: 169MB sqlite, 1358MB tar.gz
- identities.json: 428 identities + 5483 bindings + 5483 trace maps
- TMDB matching complete: Audrey Hepburn 843 traces, Cary Grant 482
This commit is contained in:
Accusys
2026-05-13 04:30:18 +08:00
parent fff2af8ad1
commit 48c3b13c37
837 changed files with 33273 additions and 5473 deletions

511
scripts/asr_processor.py.backup Executable file
View File

@@ -0,0 +1,511 @@
#!/opt/homebrew/bin/python3.11
"""
ASR Processor with chunked transcription for large files and resource monitoring.
Maintains backward compatibility with existing API.
"""
import sys
import json
import os
import argparse
import signal
import subprocess
import tempfile
import time
import shutil
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
# Try to import psutil for resource monitoring
PSUTIL_AVAILABLE = False
psutil = None
try:
import psutil
PSUTIL_AVAILABLE = True
except ImportError:
sys.stderr.write("WARNING: psutil not available, resource monitoring disabled\n")
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher
def signal_handler(signum, frame):
    """Report the received signal on stderr and stop the process with status 1.

    The (signum, frame) signature is the one ``signal.signal`` handlers are
    called with; ``frame`` is not used.
    """
    notice = f"ASR: Received signal {signum}, exiting...\n"
    sys.stderr.write(notice)
    raise SystemExit(1)
def has_audio_stream(video_path: str) -> bool:
    """Tell whether ffprobe lists at least one audio stream in the file.

    Returns False when ffprobe exits with a non-zero status. When the ffprobe
    executable cannot be found, a warning is written to stderr and True is
    returned so that processing is still attempted.
    """
    probe_args = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "a",
        "-show_entries",
        "stream=codec_type",
        "-of",
        "csv=p=0",
        video_path,
    ]
    try:
        completed = subprocess.run(
            probe_args, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        sys.stderr.write("WARNING: ffprobe not found, assuming audio exists\n")
        return True
    # Any non-blank output means at least one audio stream was selected.
    return len(completed.stdout.strip()) > 0
def get_media_duration(media_path: str) -> float:
    """Return the container duration in seconds as printed by ffprobe.

    Yields 0.0 when ffprobe's stdout cannot be parsed as a float (for example
    when it printed nothing). The ffprobe exit status is not inspected.
    """
    probe_args = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "csv=p=0",
        media_path,
    ]
    completed = subprocess.run(probe_args, capture_output=True, text=True)
    try:
        seconds = float(completed.stdout.strip())
    except (ValueError, AttributeError):
        seconds = 0.0
    return seconds
def extract_audio(video_path: str, audio_path: str) -> bool:
    """Extract the audio of a video into a 16 kHz mono 16-bit PCM WAV file.

    Args:
        video_path: Input media file handed to ffmpeg via ``-i``.
        audio_path: Destination WAV path; overwritten if it exists (``-y``).

    Returns:
        True when ffmpeg exits with status 0 and the output file exists.
        False otherwise, including when ffmpeg cannot be launched at all.
    """
    cmd = [
        "ffmpeg",
        "-i",
        video_path,
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        audio_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError as exc:
        # A missing or non-executable ffmpeg is an extraction failure for the
        # caller to report, not a reason to crash with a traceback.
        sys.stderr.write(f"WARNING: could not run ffmpeg: {exc}\n")
        return False
    return result.returncode == 0 and os.path.exists(audio_path)
def extract_chunk(
    audio_path: str, start: float, duration: float, output_path: str
) -> bool:
    """Cut one time window out of an audio file into a 16 kHz mono PCM WAV.

    Args:
        audio_path: Source audio file.
        start: Window start offset in seconds (ffmpeg ``-ss``).
        duration: Window length in seconds (ffmpeg ``-t``).
        output_path: Destination WAV path; overwritten if it exists.

    Returns:
        True only when ffmpeg exits with status 0 and produced a non-empty
        file. False otherwise, including when ffmpeg cannot be launched.
    """
    cmd = [
        "ffmpeg",
        "-i",
        audio_path,
        "-ss",
        str(start),
        "-t",
        str(duration),
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        output_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError as exc:
        sys.stderr.write(f"WARNING: could not run ffmpeg: {exc}\n")
        return False
    # A failed ffmpeg run can still leave a partial file behind, so the exit
    # status has to be checked in addition to the file's presence and size.
    return (
        result.returncode == 0
        and os.path.exists(output_path)
        and os.path.getsize(output_path) > 0
    )
def monitor_resources(pid: int, interval: float = 0.1) -> Dict[str, Any]:
    """Sample CPU percentage and resident memory of a process through psutil.

    The result always carries ``cpu_percent``, ``memory_mb`` and
    ``available``; ``pid`` is added on success. When psutil is not importable
    or the process cannot be inspected, the numeric fields are 0.0 and
    ``available`` is False.
    """
    unavailable = {"cpu_percent": 0.0, "memory_mb": 0.0, "available": False}
    if psutil is None or not PSUTIL_AVAILABLE:
        return unavailable
    try:
        target = psutil.Process(pid)
        cpu = target.cpu_percent(interval=interval)
        rss_bytes = target.memory_info().rss
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return unavailable
    return {
        "cpu_percent": cpu,
        "memory_mb": rss_bytes / (1024 * 1024),
        "available": True,
        "pid": pid,
    }
def transcribe_direct(
    model, audio_path: str, publisher: Optional[RedisPublisher] = None
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe a whole audio file with a single ``model.transcribe`` call.

    Returns a pair: the list of segment dicts (``start``, ``end`` and
    whitespace-stripped ``text``) and the info object handed back by the
    model. With a publisher, progress is reported every 100 segments.
    """
    if publisher:
        publisher.info("asr", "Transcribing audio directly...")
    began = time.time()
    segment_iter, info = model.transcribe(audio_path, beam_size=5)
    results = []
    for count, seg in enumerate(segment_iter, start=1):
        results.append({"start": seg.start, "end": seg.end, "text": seg.text.strip()})
        if count % 100 == 0 and publisher:
            publisher.progress("asr", count, 0, f"Segment {count}")
    elapsed = time.time() - began
    if publisher:
        publisher.info(
            "asr", f"Direct transcription: {len(results)} segments in {elapsed:.1f}s"
        )
    return results, info
def transcribe_chunk(
    model,
    chunk_path: str,
    chunk_start: float,
    chunk_idx: int,
    total_chunks: int,
    publisher: Optional[RedisPublisher] = None,
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe one chunk file and shift its timestamps by ``chunk_start``.

    Segment times reported by the model are relative to the chunk, so the
    chunk's start offset is added to both ``start`` and ``end``. Returns the
    segment dicts together with the model's info object.
    """
    if publisher:
        publisher.info("asr", f"Transcribing chunk {chunk_idx + 1}/{total_chunks}")
    began = time.time()
    segment_iter, info = model.transcribe(chunk_path, beam_size=5)
    results = [
        {
            "start": seg.start + chunk_start,
            "end": seg.end + chunk_start,
            "text": seg.text.strip(),
        }
        for seg in segment_iter
    ]
    elapsed = time.time() - began
    if publisher:
        publisher.info(
            "asr",
            f"Chunk {chunk_idx + 1}/{total_chunks}: {len(results)} segments in {elapsed:.1f}s",
        )
    return results, info
def _plan_chunks(total_duration: float, chunk_duration: float) -> List[Dict[str, Any]]:
    """Split ``[0, total_duration)`` into consecutive windows of ``chunk_duration`` seconds.

    The last window is shortened so that it ends at ``total_duration``.

    Raises:
        ValueError: If ``chunk_duration`` is not positive, because the split
            would never advance.
    """
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")
    chunks: List[Dict[str, Any]] = []
    start = 0.0
    while start < total_duration:
        chunk_end = min(start + chunk_duration, total_duration)
        chunks.append(
            {
                "start": start,
                "end": chunk_end,
                "duration": chunk_end - start,
                "idx": len(chunks),
            }
        )
        start = chunk_end
    return chunks


def run_asr(
    video_path: str,
    output_path: str,
    uuid: str = "",
    chunk_duration: int = 600,  # 10 minutes default
    max_direct_duration: int = 1800,  # 30 minutes: use direct transcription for shorter files
    model_size: str = "tiny",
    compute_type: str = "int8",
    monitor_interval: int = 60,
) -> None:
    """Transcribe a video's audio track and write the result as JSON.

    Audio is extracted to a temporary WAV file. Files no longer than
    ``max_direct_duration`` seconds are transcribed in one pass; longer files,
    or files whose direct transcription failed, are processed in
    ``chunk_duration``-second chunks. A chunk that cannot be extracted or
    transcribed is skipped.

    This function always terminates the process through ``sys.exit``: status 0
    after the output was written (including the "no audio stream" case) and
    status 1 on extraction, duration, model-loading or configuration failure.
    The temporary directory is removed on every exit path.

    Args:
        video_path: Input media file.
        output_path: Destination of the JSON result.
        uuid: When non-empty, progress is published through ``RedisPublisher``.
        chunk_duration: Chunk length in seconds for chunked mode.
        max_direct_duration: Longest duration in seconds handled in one pass.
        model_size: Model size name passed to ``WhisperModel``.
        compute_type: Compute type passed to ``WhisperModel``.
        monitor_interval: Minimum seconds between resource-usage reports.
    """
    # Set up signal handlers
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asr", "ASR_START")

    # Nothing to transcribe: still write a well-formed, empty result.
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asr", "No audio stream detected, skipping transcription")
        output = {"language": "", "language_probability": 0.0, "segments": []}
        with open(output_path, "w") as f:
            json.dump(output, f, indent=2)
        if publisher:
            publisher.complete("asr", "0 segments (no audio)")
        sys.stderr.write("ASR: No audio stream, skipping transcription\n")
        sys.stderr.flush()
        sys.exit(0)

    all_segments = []
    language = None
    language_prob = None
    chunks = []

    temp_dir = tempfile.mkdtemp(prefix="asr_")
    try:
        audio_path = os.path.join(temp_dir, "audio.wav")
        if publisher:
            publisher.info("asr", "Extracting audio from video...")

        if not extract_audio(video_path, audio_path):
            if publisher:
                publisher.error("asr", "Failed to extract audio")
            sys.stderr.write("ASR: Failed to extract audio\n")
            sys.stderr.flush()
            sys.exit(1)

        try:
            total_duration = get_media_duration(audio_path)
        except Exception as e:
            if publisher:
                publisher.error("asr", f"Failed to get audio duration: {e}")
            sys.stderr.write(f"ASR: Failed to get audio duration: {e}\n")
            sys.stderr.flush()
            sys.exit(1)
        if publisher:
            publisher.info(
                "asr",
                f"Audio duration: {total_duration:.1f}s ({total_duration / 3600:.1f} hrs)",
            )

        # Load Whisper model
        if publisher:
            publisher.info(
                "asr", f"Loading Whisper model ({model_size}, {compute_type})..."
            )
        try:
            from faster_whisper import WhisperModel

            model = WhisperModel(model_size, device="cpu", compute_type=compute_type)
        except Exception as e:
            if publisher:
                publisher.error("asr", f"Failed to load Whisper model: {e}")
            sys.stderr.write(f"ASR: Failed to load Whisper model: {e}\n")
            sys.stderr.flush()
            sys.exit(1)
        if publisher:
            publisher.info("asr", "Whisper model loaded successfully")

        # Decide whether to use chunked or direct transcription
        use_chunked = total_duration > max_direct_duration

        if not use_chunked:
            if publisher:
                publisher.info(
                    "asr",
                    f"Using direct transcription (duration ≤ {max_direct_duration}s)",
                )
            try:
                segments, info = transcribe_direct(model, audio_path, publisher)
                all_segments.extend(segments)
                language = info.language
                language_prob = info.language_probability
            except Exception as e:
                if publisher:
                    publisher.error("asr", f"Direct transcription failed: {e}")
                sys.stderr.write(f"ASR: Direct transcription failed: {e}\n")
                sys.stderr.flush()
                # Fall back to chunked approach
                use_chunked = True
                if publisher:
                    publisher.info("asr", "Falling back to chunked transcription")

        if use_chunked:
            if publisher:
                publisher.info(
                    "asr", f"Using chunked transcription ({chunk_duration}s chunks)"
                )
            try:
                chunks = _plan_chunks(total_duration, chunk_duration)
            except ValueError as e:
                # A non-positive chunk length used to loop forever here.
                if publisher:
                    publisher.error("asr", f"Invalid chunk duration: {e}")
                sys.stderr.write(f"ASR: Invalid chunk duration: {e}\n")
                sys.stderr.flush()
                sys.exit(1)
            if publisher:
                publisher.info("asr", f"Split into {len(chunks)} chunks")

            chunk_temp_dir = os.path.join(temp_dir, "chunks")
            os.makedirs(chunk_temp_dir, exist_ok=True)
            last_resource_report = time.time()

            for i, chunk in enumerate(chunks):
                chunk_path = os.path.join(chunk_temp_dir, f"chunk_{i:04d}.wav")
                if publisher:
                    publisher.progress(
                        "asr", i, len(chunks), f"Processing chunk {i + 1}/{len(chunks)}"
                    )

                if not extract_chunk(
                    audio_path, chunk["start"], chunk["duration"], chunk_path
                ):
                    if publisher:
                        publisher.warning(
                            "asr", f"Failed to extract chunk {i}, skipping"
                        )
                    continue

                # Resource monitoring (sample every monitor_interval seconds)
                current_time = time.time()
                if (
                    PSUTIL_AVAILABLE
                    and publisher
                    and (current_time - last_resource_report) >= monitor_interval
                ):
                    resources = monitor_resources(os.getpid())
                    if resources["available"]:
                        publisher.info(
                            "asr",
                            f"Resource usage: CPU {resources['cpu_percent']:.1f}%, "
                            f"Memory {resources['memory_mb']:.1f}MB",
                        )
                    last_resource_report = current_time

                try:
                    segments, info = transcribe_chunk(
                        model, chunk_path, chunk["start"], i, len(chunks), publisher
                    )
                    all_segments.extend(segments)
                    if language is None:
                        language = info.language
                        language_prob = info.language_probability
                        if publisher:
                            publisher.info(
                                "asr",
                                f"Detected language: {language} (prob {language_prob:.2f})",
                            )
                except Exception as e:
                    # One bad chunk must not abort the whole file.
                    if publisher:
                        publisher.error("asr", f"Error transcribing chunk {i}: {e}")
                    sys.stderr.write(f"ASR: Error transcribing chunk {i}: {e}\n")
                    sys.stderr.flush()

                # Free disk space early; the directory is removed below anyway.
                try:
                    os.unlink(chunk_path)
                except OSError:
                    pass
    finally:
        # Runs on success, on sys.exit() and on the SystemExit raised by the
        # signal handler, so the extracted audio never outlives the process.
        shutil.rmtree(temp_dir, ignore_errors=True)

    # Sort segments by start time
    all_segments.sort(key=lambda x: x["start"])

    # Prepare output (maintain same format as original)
    output = {
        "language": language or "",
        "language_probability": language_prob or 0.0,
        "segments": all_segments,
    }
    # Add metadata for chunked processing (optional)
    if use_chunked:
        output["processing_mode"] = "chunked"
        output["chunk_count"] = len(chunks)
        output["chunk_duration"] = chunk_duration
    else:
        output["processing_mode"] = "direct"

    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)

    if publisher:
        publisher.complete(
            "asr",
            f"{len(all_segments)} segments ({'chunked' if use_chunked else 'direct'} mode)",
        )
    sys.stderr.write(
        f"ASR: Transcription complete, {len(all_segments)} segments written to {output_path}\n"
    )
    sys.stderr.flush()
    sys.exit(0)
if __name__ == "__main__":
    # Command-line entry point: parse arguments, apply environment overrides
    # and hand everything to run_asr(), which exits the process itself.
    parser = argparse.ArgumentParser(
        description="ASR Transcription with chunked processing"
    )
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    parser.add_argument("--version", action="version", version="2.0.0")
    # Hidden tuning arguments; the first four can also be set through
    # MOMENTRY_ASR_* environment variables, which take precedence.
    parser.add_argument(
        "--chunk-duration", type=int, default=600, help=argparse.SUPPRESS
    )  # 10 minutes default
    parser.add_argument(
        "--max-direct-duration", type=int, default=1800, help=argparse.SUPPRESS
    )  # 30 minutes
    parser.add_argument("--model-size", default="tiny", help=argparse.SUPPRESS)
    parser.add_argument("--compute-type", default="int8", help=argparse.SUPPRESS)
    parser.add_argument(
        "--monitor-interval", type=int, default=60, help=argparse.SUPPRESS
    )
    args = parser.parse_args()

    # Allow environment variable overrides
    chunk_duration_str = os.environ.get("MOMENTRY_ASR_CHUNK_DURATION")
    if chunk_duration_str is not None:
        chunk_duration = int(chunk_duration_str)
    else:
        chunk_duration = args.chunk_duration

    max_direct_duration_str = os.environ.get("MOMENTRY_ASR_MAX_DIRECT_DURATION")
    if max_direct_duration_str is not None:
        max_direct_duration = int(max_direct_duration_str)
    else:
        max_direct_duration = args.max_direct_duration

    model_size = os.environ.get("MOMENTRY_ASR_MODEL_SIZE")
    if model_size is None:
        model_size = args.model_size

    compute_type = os.environ.get("MOMENTRY_ASR_COMPUTE_TYPE")
    if compute_type is None:
        compute_type = args.compute_type

    run_asr(
        args.video_path,
        args.output_path,
        args.uuid,
        chunk_duration,
        max_direct_duration,
        model_size,
        compute_type,
        # Previously parsed but never forwarded, so the flag had no effect.
        args.monitor_interval,
    )

672
scripts/asr_processor.py.bak Executable file
View File

@@ -0,0 +1,672 @@
#!/opt/homebrew/bin/python3.11
"""
ASR Processor with chunked transcription for large files and resource monitoring.
Maintains backward compatibility with existing API.
"""
import sys
import json
import os
import argparse
import signal
import subprocess
import tempfile
import time
import shutil
from typing import List, Dict, Any, Optional, Tuple
# Try to import psutil for resource monitoring
PSUTIL_AVAILABLE = False
psutil = None
try:
import psutil
PSUTIL_AVAILABLE = True
except ImportError:
sys.stderr.write("WARNING: psutil not available, resource monitoring disabled\n")
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from redis_publisher import RedisPublisher # noqa: E402
def save_checkpoint(
    checkpoint_path: str,
    segments: List[Dict[str, Any]],
    language: Optional[str],
    language_prob: Optional[float],
    processed_chunks: List[int],
    total_chunks: int,
) -> None:
    """Persist transcription progress as JSON so a run can be resumed later.

    The data is written to ``<checkpoint_path>.tmp`` and then moved into place
    with ``os.replace``, so an interrupted write never leaves a truncated
    checkpoint behind. Saving is best-effort: any failure is reported on
    stderr and otherwise ignored.

    Args:
        checkpoint_path: Destination JSON file.
        segments: Segment dicts collected so far.
        language: Detected language; stored as "" when None.
        language_prob: Language probability; stored as 0.0 when None.
        processed_chunks: Indices of the chunks already handled.
        total_chunks: Total number of chunks in the run.
    """
    checkpoint_data = {
        "segments": segments,
        "language": language or "",
        "language_probability": language_prob or 0.0,
        "processed_chunks": processed_chunks,
        "total_chunks": total_chunks,
        "timestamp": time.time(),
    }
    tmp_path = f"{checkpoint_path}.tmp"
    try:
        with open(tmp_path, "w") as f:
            # default=str keeps the save from failing on values json cannot
            # encode natively; such values are stored as their str() form.
            json.dump(checkpoint_data, f, indent=2, default=str)
        os.replace(tmp_path, checkpoint_path)
    except Exception as e:
        sys.stderr.write(f"ASR: Failed to save checkpoint: {e}\n")
        # Do not leave a half-written temporary file next to the checkpoint.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
def load_checkpoint(checkpoint_path: str) -> Optional[Dict[str, Any]]:
    """Read a previously saved checkpoint, or return None if that is not possible.

    Any failure (missing file, unreadable file, invalid JSON) yields None.
    """
    try:
        handle = open(checkpoint_path, "r")
        with handle:
            restored = json.load(handle)
    except Exception:
        return None
    return restored
def _tool_health(tool: str) -> Dict[str, Any]:
    """Run ``<tool> -version`` and summarise availability and version.

    The version is the third space-separated token of the first output line,
    or "unknown" when the banner has fewer tokens or there is no output.
    """
    try:
        result = subprocess.run([tool, "-version"], capture_output=True, text=True)
    except Exception as e:
        return {"available": False, "error": str(e)}
    banner_tokens = result.stdout.split("\n")[0].split(" ") if result.stdout else []
    # An unexpected banner must not turn a working tool into "unavailable".
    version = banner_tokens[2] if len(banner_tokens) > 2 else "unknown"
    return {"available": result.returncode == 0, "version": version}


def check_health() -> Dict[str, Any]:
    """Check the dependencies of the ASR processor.

    Returns:
        A dict with ``status`` ("healthy" or "unhealthy"), ``checks`` (one
        entry each for ffmpeg, ffprobe, faster_whisper and psutil) and
        ``timestamp``. The status is "unhealthy" when ffmpeg, ffprobe or
        faster_whisper is unavailable; a missing psutil only disables
        resource monitoring and does not affect the status.
    """
    health = {
        "status": "healthy",
        "checks": {},
        "timestamp": time.time(),
    }
    checks = health["checks"]

    for tool in ("ffmpeg", "ffprobe"):
        checks[tool] = _tool_health(tool)

    # Check faster_whisper import
    try:
        import faster_whisper

        checks["faster_whisper"] = {
            "available": True,
            "version": getattr(faster_whisper, "__version__", "unknown"),
        }
    except ImportError as e:
        checks["faster_whisper"] = {"available": False, "error": str(e)}
        health["status"] = "unhealthy"

    # Check psutil import
    try:
        import psutil

        checks["psutil"] = {
            "available": True,
            "version": getattr(psutil, "__version__", "unknown"),
        }
    except ImportError:
        checks["psutil"] = {
            "available": False,
            "warning": "resource monitoring disabled",
        }

    # Both command-line tools are required for processing.
    if not (checks["ffmpeg"]["available"] and checks["ffprobe"]["available"]):
        health["status"] = "unhealthy"
    return health
def signal_handler(signum, frame):
    """Report the received signal on stderr and stop the process with status 1.

    The (signum, frame) signature is the one ``signal.signal`` handlers are
    called with; ``frame`` is not used.
    """
    notice = f"ASR: Received signal {signum}, exiting...\n"
    sys.stderr.write(notice)
    raise SystemExit(1)
def has_audio_stream(video_path: str) -> bool:
    """Tell whether ffprobe lists at least one audio stream in the file.

    Returns False when ffprobe exits with a non-zero status. When the ffprobe
    executable cannot be found, a warning is written to stderr and True is
    returned so that processing is still attempted.
    """
    probe_args = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "a",
        "-show_entries",
        "stream=codec_type",
        "-of",
        "csv=p=0",
        video_path,
    ]
    try:
        completed = subprocess.run(
            probe_args, capture_output=True, text=True, check=True
        )
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        sys.stderr.write("WARNING: ffprobe not found, assuming audio exists\n")
        return True
    # Any non-blank output means at least one audio stream was selected.
    return len(completed.stdout.strip()) > 0
def get_media_duration(media_path: str) -> float:
    """Return the container duration in seconds as printed by ffprobe.

    Yields 0.0 when ffprobe's stdout cannot be parsed as a float (for example
    when it printed nothing). The ffprobe exit status is not inspected.
    """
    probe_args = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "csv=p=0",
        media_path,
    ]
    completed = subprocess.run(probe_args, capture_output=True, text=True)
    try:
        seconds = float(completed.stdout.strip())
    except (ValueError, AttributeError):
        seconds = 0.0
    return seconds
def extract_audio(video_path: str, audio_path: str) -> bool:
    """Extract the audio of a video into a 16 kHz mono 16-bit PCM WAV file.

    Args:
        video_path: Input media file handed to ffmpeg via ``-i``.
        audio_path: Destination WAV path; overwritten if it exists (``-y``).

    Returns:
        True when ffmpeg exits with status 0 and the output file exists.
        False otherwise, including when ffmpeg cannot be launched at all.
    """
    cmd = [
        "ffmpeg",
        "-i",
        video_path,
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        audio_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError as exc:
        # A missing or non-executable ffmpeg is an extraction failure for the
        # caller to report, not a reason to crash with a traceback.
        sys.stderr.write(f"WARNING: could not run ffmpeg: {exc}\n")
        return False
    return result.returncode == 0 and os.path.exists(audio_path)
def extract_chunk(
    audio_path: str, start: float, duration: float, output_path: str
) -> bool:
    """Cut one time window out of an audio file into a 16 kHz mono PCM WAV.

    Args:
        audio_path: Source audio file.
        start: Window start offset in seconds (ffmpeg ``-ss``).
        duration: Window length in seconds (ffmpeg ``-t``).
        output_path: Destination WAV path; overwritten if it exists.

    Returns:
        True only when ffmpeg exits with status 0 and produced a non-empty
        file. False otherwise, including when ffmpeg cannot be launched.
    """
    cmd = [
        "ffmpeg",
        "-i",
        audio_path,
        "-ss",
        str(start),
        "-t",
        str(duration),
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        output_path,
    ]
    try:
        result = subprocess.run(cmd, capture_output=True)
    except OSError as exc:
        # The caller skips chunks that fail to extract; an unlaunchable
        # ffmpeg should be handled the same way instead of raising.
        sys.stderr.write(f"WARNING: could not run ffmpeg: {exc}\n")
        return False
    return (
        result.returncode == 0
        and os.path.exists(output_path)
        and os.path.getsize(output_path) > 0
    )
def monitor_resources(pid: int, interval: float = 0.1) -> Dict[str, Any]:
    """Sample CPU percentage and resident memory of a process through psutil.

    The result always carries ``cpu_percent``, ``memory_mb`` and
    ``available``; ``pid`` is added on success. When psutil is not importable
    or the process cannot be inspected, the numeric fields are 0.0 and
    ``available`` is False.
    """
    unavailable = {"cpu_percent": 0.0, "memory_mb": 0.0, "available": False}
    if psutil is None or not PSUTIL_AVAILABLE:
        return unavailable
    try:
        target = psutil.Process(pid)
        cpu = target.cpu_percent(interval=interval)
        rss_bytes = target.memory_info().rss
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return unavailable
    return {
        "cpu_percent": cpu,
        "memory_mb": rss_bytes / (1024 * 1024),
        "available": True,
        "pid": pid,
    }
def transcribe_direct(
    model, audio_path: str, publisher: Optional[RedisPublisher] = None
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe a whole audio file with a single ``model.transcribe`` call.

    Returns a pair: the list of segment dicts (``start``, ``end`` and
    whitespace-stripped ``text``) and the info object handed back by the
    model. With a publisher, progress is reported every 100 segments.
    """
    if publisher:
        publisher.info("asr", "Transcribing audio directly...")
    began = time.time()
    segment_iter, info = model.transcribe(audio_path, beam_size=5)
    results = []
    for count, seg in enumerate(segment_iter, start=1):
        results.append({"start": seg.start, "end": seg.end, "text": seg.text.strip()})
        if count % 100 == 0 and publisher:
            publisher.progress("asr", count, 0, f"Segment {count}")
    elapsed = time.time() - began
    if publisher:
        publisher.info(
            "asr", f"Direct transcription: {len(results)} segments in {elapsed:.1f}s"
        )
    return results, info
def transcribe_chunk(
    model,
    chunk_path: str,
    chunk_start: float,
    chunk_idx: int,
    total_chunks: int,
    publisher: Optional[RedisPublisher] = None,
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe one chunk file and shift its timestamps by ``chunk_start``.

    Segment times reported by the model are relative to the chunk, so the
    chunk's start offset is added to both ``start`` and ``end``. Returns the
    segment dicts together with the model's info object.
    """
    if publisher:
        publisher.info("asr", f"Transcribing chunk {chunk_idx + 1}/{total_chunks}")
    began = time.time()
    segment_iter, info = model.transcribe(chunk_path, beam_size=5)
    results = [
        {
            "start": seg.start + chunk_start,
            "end": seg.end + chunk_start,
            "text": seg.text.strip(),
        }
        for seg in segment_iter
    ]
    elapsed = time.time() - began
    if publisher:
        publisher.info(
            "asr",
            f"Chunk {chunk_idx + 1}/{total_chunks}: {len(results)} segments in {elapsed:.1f}s",
        )
    return results, info
def _plan_chunks(total_duration: float, chunk_duration: float) -> List[Dict[str, Any]]:
    """Split ``[0, total_duration)`` into consecutive windows of ``chunk_duration`` seconds.

    The last window is shortened so that it ends at ``total_duration``.

    Raises:
        ValueError: If ``chunk_duration`` is not positive, because the split
            would never advance.
    """
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")
    chunks: List[Dict[str, Any]] = []
    start = 0.0
    while start < total_duration:
        chunk_end = min(start + chunk_duration, total_duration)
        chunks.append(
            {
                "start": start,
                "end": chunk_end,
                "duration": chunk_end - start,
                "idx": len(chunks),
            }
        )
        start = chunk_end
    return chunks


def run_asr(
    video_path: str,
    output_path: str,
    uuid: str = "",
    chunk_duration: int = 600,  # 10 minutes default
    max_direct_duration: int = 1200,  # 20 minutes: use direct transcription for shorter files (safe limit)
    model_size: str = "tiny",
    compute_type: str = "int8",
    monitor_interval: int = 60,
) -> None:
    """Transcribe a video's audio track and write the result as JSON.

    Audio is extracted to a temporary WAV file. Files no longer than
    ``max_direct_duration`` seconds are transcribed in one pass; longer files,
    or files whose direct transcription failed, are processed in
    ``chunk_duration``-second chunks. Each chunk is attempted up to three
    times with exponential backoff (1 s, then 2 s) and skipped if every
    attempt fails.

    This function always terminates the process through ``sys.exit``: status 0
    after the output was written (including the "no audio stream" case) and
    status 1 on extraction, duration, model-loading or configuration failure.
    The temporary directory is removed on every exit path.

    Args:
        video_path: Input media file.
        output_path: Destination of the JSON result.
        uuid: When non-empty, progress is published through ``RedisPublisher``.
        chunk_duration: Chunk length in seconds for chunked mode.
        max_direct_duration: Longest duration in seconds handled in one pass.
        model_size: Model size name passed to ``WhisperModel``.
        compute_type: Compute type passed to ``WhisperModel``.
        monitor_interval: Minimum seconds between resource-usage reports.
    """
    # Set up signal handlers
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asr", "ASR_START")

    # Nothing to transcribe: still write a well-formed, empty result.
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asr", "No audio stream detected, skipping transcription")
        output = {
            "processor_name": "asr",
            "processor_version": "2.0.0",
            "contract_version": "1.0",
            "language": None,
            "language_probability": None,
            "segments": [],
        }
        with open(output_path, "w") as f:
            json.dump(output, f, indent=2)
        if publisher:
            publisher.complete("asr", "0 segments (no audio)")
        sys.stderr.write("ASR: No audio stream, skipping transcription\n")
        sys.stderr.flush()
        sys.exit(0)

    all_segments = []
    language = None
    language_prob = None
    chunks = []

    temp_dir = tempfile.mkdtemp(prefix="asr_")
    try:
        audio_path = os.path.join(temp_dir, "audio.wav")
        if publisher:
            publisher.info("asr", "Extracting audio from video...")

        if not extract_audio(video_path, audio_path):
            if publisher:
                publisher.error("asr", "Failed to extract audio")
            sys.stderr.write("ASR: Failed to extract audio\n")
            sys.stderr.flush()
            sys.exit(1)

        try:
            total_duration = get_media_duration(audio_path)
        except Exception as e:
            if publisher:
                publisher.error("asr", f"Failed to get audio duration: {e}")
            sys.stderr.write(f"ASR: Failed to get audio duration: {e}\n")
            sys.stderr.flush()
            sys.exit(1)
        if publisher:
            publisher.info(
                "asr",
                f"Audio duration: {total_duration:.1f}s ({total_duration / 3600:.1f} hrs)",
            )

        # Load Whisper model
        if publisher:
            publisher.info(
                "asr", f"Loading Whisper model ({model_size}, {compute_type})..."
            )
        try:
            from faster_whisper import WhisperModel

            model = WhisperModel(model_size, device="cpu", compute_type=compute_type)
        except Exception as e:
            if publisher:
                publisher.error("asr", f"Failed to load Whisper model: {e}")
            sys.stderr.write(f"ASR: Failed to load Whisper model: {e}\n")
            sys.stderr.flush()
            sys.exit(1)
        if publisher:
            publisher.info("asr", "Whisper model loaded successfully")

        # Decide whether to use chunked or direct transcription
        use_chunked = total_duration > max_direct_duration

        if not use_chunked:
            if publisher:
                publisher.info(
                    "asr",
                    f"Using direct transcription (duration ≤ {max_direct_duration}s)",
                )
            try:
                segments, info = transcribe_direct(model, audio_path, publisher)
                all_segments.extend(segments)
                language = info.language
                language_prob = info.language_probability
            except Exception as e:
                if publisher:
                    publisher.error("asr", f"Direct transcription failed: {e}")
                sys.stderr.write(f"ASR: Direct transcription failed: {e}\n")
                sys.stderr.flush()
                # Fall back to chunked approach
                use_chunked = True
                if publisher:
                    publisher.info("asr", "Falling back to chunked transcription")

        if use_chunked:
            if publisher:
                publisher.info(
                    "asr", f"Using chunked transcription ({chunk_duration}s chunks)"
                )
            try:
                chunks = _plan_chunks(total_duration, chunk_duration)
            except ValueError as e:
                # A non-positive chunk length used to loop forever here.
                if publisher:
                    publisher.error("asr", f"Invalid chunk duration: {e}")
                sys.stderr.write(f"ASR: Invalid chunk duration: {e}\n")
                sys.stderr.flush()
                sys.exit(1)
            if publisher:
                publisher.info("asr", f"Split into {len(chunks)} chunks")

            chunk_temp_dir = os.path.join(temp_dir, "chunks")
            os.makedirs(chunk_temp_dir, exist_ok=True)
            last_resource_report = time.time()
            max_retries = 3

            for i, chunk in enumerate(chunks):
                chunk_path = os.path.join(chunk_temp_dir, f"chunk_{i:04d}.wav")
                # Per-chunk progress can be silenced with MOMENTRY_DISABLE_REDIS=1.
                if publisher and os.environ.get("MOMENTRY_DISABLE_REDIS") != "1":
                    publisher.progress(
                        "asr", i, len(chunks), f"Processing chunk {i + 1}/{len(chunks)}"
                    )

                if not extract_chunk(
                    audio_path, chunk["start"], chunk["duration"], chunk_path
                ):
                    if publisher:
                        publisher.warning(
                            "asr", f"Failed to extract chunk {i}, skipping"
                        )
                    continue

                # Resource monitoring (sample every monitor_interval seconds)
                current_time = time.time()
                if (
                    PSUTIL_AVAILABLE
                    and publisher
                    and (current_time - last_resource_report) >= monitor_interval
                ):
                    resources = monitor_resources(os.getpid())
                    if resources["available"]:
                        publisher.info(
                            "asr",
                            f"Resource usage: CPU {resources['cpu_percent']:.1f}%, "
                            f"Memory {resources['memory_mb']:.1f}MB",
                        )
                    last_resource_report = current_time

                # Transcribe chunk with retry logic
                last_error = None
                for retry in range(max_retries):
                    try:
                        segments, info = transcribe_chunk(
                            model, chunk_path, chunk["start"], i, len(chunks), publisher
                        )
                        all_segments.extend(segments)
                        if language is None:
                            language = info.language
                            language_prob = info.language_probability
                            if publisher:
                                publisher.info(
                                    "asr",
                                    f"Detected language: {language} (prob {language_prob:.2f})",
                                )
                        break  # Success, exit retry loop
                    except Exception as e:
                        last_error = e
                        if publisher:
                            publisher.warning(
                                "asr",
                                f"Error transcribing chunk {i} (attempt {retry + 1}/{max_retries}): {e}",
                            )
                        sys.stderr.write(
                            f"ASR: Error transcribing chunk {i} (attempt {retry + 1}/{max_retries}): {e}\n"
                        )
                        sys.stderr.flush()
                        if retry < max_retries - 1:
                            # Exponential backoff: 1 s, then 2 s.
                            wait_time = 2**retry
                            if publisher:
                                publisher.info("asr", f"Retrying in {wait_time}s...")
                            time.sleep(wait_time)
                        else:
                            # Final attempt failed; the chunk is skipped.
                            if publisher:
                                publisher.error(
                                    "asr",
                                    f"Failed to transcribe chunk {i} after {max_retries} attempts: {last_error}",
                                )
                            sys.stderr.write(
                                f"ASR: Failed to transcribe chunk {i} after {max_retries} attempts: {last_error}\n"
                            )
                            sys.stderr.flush()

                # Free disk space early; the directory is removed below anyway.
                try:
                    os.unlink(chunk_path)
                except OSError:
                    pass
    finally:
        # Runs on success, on sys.exit() and on the SystemExit raised by the
        # signal handler, so the extracted audio never outlives the process.
        shutil.rmtree(temp_dir, ignore_errors=True)

    # Sort segments by start time
    all_segments.sort(key=lambda x: x["start"])

    # Prepare output (maintain same format as original)
    output = {
        "processor_name": "asr",
        "processor_version": "2.0.0",
        "contract_version": "1.0",
        "language": language,
        "language_probability": language_prob,
        "segments": all_segments,
    }
    # Add metadata for chunked processing (optional)
    if use_chunked:
        output["processing_mode"] = "chunked"
        output["chunk_count"] = len(chunks)
        output["chunk_duration"] = chunk_duration
    else:
        output["processing_mode"] = "direct"

    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)

    if publisher:
        publisher.complete(
            "asr",
            f"{len(all_segments)} segments ({'chunked' if use_chunked else 'direct'} mode)",
        )
    sys.stderr.write(
        f"ASR: Transcription complete, {len(all_segments)} segments written to {output_path}\n"
    )
    sys.stderr.flush()
    sys.exit(0)
if __name__ == "__main__":
    # Command-line entry point: either run the dependency health check or
    # parse the transcription arguments, apply environment overrides and hand
    # everything to run_asr(), which exits the process itself.
    parser = argparse.ArgumentParser(
        description="ASR Transcription with chunked processing"
    )
    # Positionals are optional at the parser level so --check-health can be
    # used alone; they are validated manually below.
    parser.add_argument("video_path", nargs="?", help="Path to video file")
    parser.add_argument("output_path", nargs="?", help="Output JSON path")
    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    parser.add_argument("--version", action="version", version="2.0.0")
    parser.add_argument(
        "--check-health", action="store_true", help="Check dependencies and exit"
    )
    # Hidden tuning arguments; the first four can also be set through
    # MOMENTRY_ASR_* environment variables, which take precedence.
    parser.add_argument(
        "--chunk-duration", type=int, default=600, help=argparse.SUPPRESS
    )  # 10 minutes default
    parser.add_argument(
        "--max-direct-duration", type=int, default=1200, help=argparse.SUPPRESS
    )  # 20 minutes (safe limit based on testing)
    parser.add_argument("--model-size", default="tiny", help=argparse.SUPPRESS)
    parser.add_argument("--compute-type", default="int8", help=argparse.SUPPRESS)
    parser.add_argument(
        "--monitor-interval", type=int, default=60, help=argparse.SUPPRESS
    )
    args = parser.parse_args()

    # Handle health check
    if args.check_health:
        health = check_health()
        print(json.dumps(health, indent=2))
        sys.exit(0 if health["status"] == "healthy" else 1)

    # Validate required arguments when not doing health check
    if args.video_path is None or args.output_path is None:
        parser.error(
            "video_path and output_path are required when not using --check-health"
        )

    # Allow environment variable overrides
    chunk_duration_str = os.environ.get("MOMENTRY_ASR_CHUNK_DURATION")
    if chunk_duration_str is not None:
        chunk_duration = int(chunk_duration_str)
    else:
        chunk_duration = args.chunk_duration

    max_direct_duration_str = os.environ.get("MOMENTRY_ASR_MAX_DIRECT_DURATION")
    if max_direct_duration_str is not None:
        max_direct_duration = int(max_direct_duration_str)
    else:
        max_direct_duration = args.max_direct_duration

    model_size = os.environ.get("MOMENTRY_ASR_MODEL_SIZE")
    if model_size is None:
        model_size = args.model_size

    compute_type = os.environ.get("MOMENTRY_ASR_COMPUTE_TYPE")
    if compute_type is None:
        compute_type = args.compute_type

    run_asr(
        args.video_path,
        args.output_path,
        args.uuid,
        chunk_duration,
        max_direct_duration,
        model_size,
        compute_type,
        # Previously parsed but never forwarded, so the flag had no effect.
        args.monitor_interval,
    )

View File

@@ -0,0 +1,113 @@
#!/opt/homebrew/bin/python3.11
"""
Import a File Content Package into the DB.
Usage: python3 import_file_package.py --uuid <file_uuid> --package <path>
"""
import json, os, sys, subprocess, argparse, csv
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
DB_USER = "accusys"
DB_NAME = "momentry"
def psql(sql):
    """Run one SQL command string through the psql CLI and print the outcome.

    The command is executed against DB_NAME as DB_USER with ``psql -c``.

    Args:
        sql: SQL text passed verbatim to ``-c``.

    Returns:
        True when psql exits with status 0. False otherwise, including when
        psql cannot be launched or does not finish within 30 seconds; in
        every failure case an ERROR line (truncated to 200 characters) is
        printed.
    """
    cmd = [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME, "-c", sql]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
    except (OSError, subprocess.TimeoutExpired) as e:
        # Report launch failures and timeouts through the same bool contract
        # as SQL errors instead of aborting with a traceback.
        print(f" ERROR: {str(e)[:200]}")
        return False
    if r.returncode != 0:
        print(f" ERROR: {r.stderr[:200]}")
        return False
    print(f" OK: {r.stdout.strip()[:100]}")
    return True
def _sql_literal(value):
    """Quote a value as a SQL string literal, doubling embedded single quotes."""
    return "'" + str(value).replace("'", "''") + "'"


def _copy_ignoring_conflicts(table, columns, csv_path, conflict_target):
    """Load a CSV into ``table``, skipping rows that hit ``conflict_target``.

    COPY has no ON CONFLICT clause, so the rows are first copied into a
    temporary staging table and then moved with INSERT ... ON CONFLICT DO
    NOTHING. All statements go through one ``psql -c`` call so they share the
    session in which the temporary table exists.

    Returns:
        The bool result of ``psql``.
    """
    cols = ", ".join(columns)
    return psql(
        f"CREATE TEMP TABLE import_stage AS SELECT {cols} FROM {table} WITH NO DATA; "
        f"COPY import_stage ({cols}) FROM {_sql_literal(csv_path)} WITH CSV HEADER; "
        f"INSERT INTO {table} ({cols}) SELECT {cols} FROM import_stage "
        f"ON CONFLICT ({conflict_target}) DO NOTHING"
    )


def _scalar(sql):
    """Return the stripped output of a single-value query, or "?" on failure."""
    try:
        r = subprocess.run(
            [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME, "-t", "-A", "-c", sql],
            capture_output=True, text=True, timeout=10)
    except (OSError, subprocess.TimeoutExpired):
        return "?"
    return r.stdout.strip()


def main():
    """Import a File Content Package directory into the ``dev`` schema.

    Steps: update the video row from metadata.json, then load
    identities.csv, face_detections.csv, chunks.csv and chunk_vectors.csv
    when present, and finally print row counts for verification. Files are
    read by the PostgreSQL server (server-side COPY), so paths are made
    absolute. The process exits with status 1 when the package directory is
    missing or when any import step failed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--uuid", required=True)
    parser.add_argument("--package", required=True)
    args = parser.parse_args()
    uuid = args.uuid
    pkg = args.package.rstrip("/")
    if not os.path.exists(pkg):
        print(f"Package not found: {pkg}")
        sys.exit(1)
    print(f"Importing package {uuid} from {pkg}")
    print()

    # Server-side COPY resolves relative paths against the server's working
    # directory, not ours.
    pkg_abs = os.path.abspath(pkg)
    uuid_sql = _sql_literal(uuid)
    failed = []

    # 1. Update video registration (mark as processed)
    print("[1/5] Update video registration...")
    meta_path = os.path.join(pkg_abs, "metadata.json")
    if os.path.exists(meta_path):
        with open(meta_path) as f:
            meta = json.load(f)
        try:
            # Coerce to float so only numbers can reach the SQL text.
            fps = float(meta.get("fps", 24.0))
            dur = float(meta.get("duration", 0))
        except (TypeError, ValueError) as e:
            print(f" ERROR: invalid fps/duration in metadata.json: {e}")
            failed.append("video registration")
        else:
            if not psql(
                f"UPDATE dev.videos SET status='ready', duration={dur}, fps={fps} "
                f"WHERE file_uuid={uuid_sql}"
            ):
                failed.append("video registration")

    # 2. Import identities
    print("[2/5] Import identities...")
    id_path = os.path.join(pkg_abs, "identities.csv")
    if os.path.exists(id_path):
        with open(id_path) as f:
            count = sum(1 for _ in csv.DictReader(f))
        if count > 0:
            if not _copy_ignoring_conflicts(
                "dev.identities",
                ["uuid", "name", "identity_type", "source", "status", "metadata"],
                id_path,
                "name",
            ):
                failed.append("identities")

    # 3. Import face detections
    print("[3/5] Import face detections...")
    fd_path = os.path.join(pkg_abs, "face_detections.csv")
    if os.path.exists(fd_path):
        if not _copy_ignoring_conflicts(
            "dev.face_detections",
            [
                "id", "file_uuid", "frame_number", "timestamp_secs",
                "face_id", "x", "y", "width", "height", "confidence",
                "trace_id", "identity_id",
            ],
            fd_path,
            "id",
        ):
            failed.append("face detections")

    # 4. Import chunks
    print("[4/5] Import chunks...")
    ch_path = os.path.join(pkg_abs, "chunks.csv")
    if os.path.exists(ch_path):
        # NOTE(review): the conflict target includes file_uuid but the column
        # list (kept from the original) does not - confirm against the CSV
        # layout and the dev.chunk schema.
        if not _copy_ignoring_conflicts(
            "dev.chunk",
            [
                "chunk_id", "chunk_type", "start_frame", "end_frame",
                "start_time", "end_time", "fps", "text_content",
            ],
            ch_path,
            "file_uuid, chunk_id",
        ):
            failed.append("chunks")

    # 5. Import vectors
    print("[5/5] Import chunk_vectors...")
    vec_path = os.path.join(pkg_abs, "chunk_vectors.csv")
    if os.path.exists(vec_path):
        if not psql(
            f"COPY dev.chunk_vectors (chunk_id, embedding) "
            f"FROM {_sql_literal(vec_path)} WITH CSV HEADER"
        ):
            failed.append("chunk_vectors")

    # Verify
    print()
    print("=== Verification ===")
    chunk_count = _scalar(f"SELECT count(*) FROM dev.chunk WHERE file_uuid={uuid_sql}")
    print(f" Chunks: {chunk_count}")
    vector_count = _scalar(
        f"SELECT count(*) FROM dev.chunk_vectors WHERE uuid={uuid_sql}"
    )
    print(f" Vectors: {vector_count}")
    print()
    print("=== Done ===")
    if failed:
        # Make partial imports visible to callers and scripts.
        sys.stderr.write(f"Import finished with errors in: {', '.join(failed)}\n")
        sys.exit(1)


if __name__ == "__main__":
    main()

225
scripts/package_delivery.sh Normal file
View File

@@ -0,0 +1,225 @@
#!/bin/bash
# Package Delivery — three-tier packaging per the M4 spec.
#
# Usage:
#   bash package_delivery.sh dev  <version>
#   bash package_delivery.sh prod <version>
#   bash package_delivery.sh file <file_uuid> [version]   # version defaults to v1.0
#
# Modes:
#   dev   migration SQL + debug binary + source archive + UPGRADE.md, tarred
#         to release/dev_upgrade_<version>.tar.gz
#   prod  migration SQL template + release binary + UPGRADE.md, tarred to
#         release/prod_upgrade_<version>.tar.gz
#   file  per-file data export (file_info.json, data.sql, checksums, README),
#         tarred to release/file_<uuid>_<version>.tar.gz; the staging directory
#         is removed afterwards.
set -euo pipefail

CMD="${1:?Usage: $0 <dev|prod|file> <arg> [version]}"
PROJECT="/Users/accusys/momentry_core_0.1"
PG_BIN="/Users/accusys/pgsql/18.3/bin"
T0=$(date +%s)

case "$CMD" in
  dev)
    VERSION="${2:?Usage: $0 dev <version>}"
    OUTPUT="$PROJECT/release/dev_upgrade_${VERSION}"
    mkdir -p "$OUTPUT/schema" "$OUTPUT/bin" "$OUTPUT/src"
    echo "=== Dev Upgrade Package ${VERSION} ==="

    # Migration SQL (auto-generated from current schema)
    echo "[1/4] Migration SQL..."
    {
      echo "-- Migration: dev schema ${VERSION}"
      echo "-- Date: $(date +%Y-%m-%d)"
      echo "-- Auto-generated from current DB schema"
      echo ""
      echo "SET search_path TO dev;"
      echo ""
      # face_detections: timestamp_secs was added later
      "$PG_BIN/psql" -U accusys -d momentry -t -A -c "
        SELECT 'ALTER TABLE dev.face_detections ADD COLUMN IF NOT EXISTS ' || column_name || ' ' || udt_name || ';'
        FROM information_schema.columns
        WHERE table_schema='dev' AND table_name='face_detections'
        AND column_name = 'timestamp_secs';
      " 2>/dev/null
      # chunk: chunk_id was changed to short format
      echo ""
      echo "-- Chunk index removals (already applied)"
      echo "-- chunk_vectors: standard columns"
    } > "$OUTPUT/schema/migration_${VERSION}.sql"
    echo " migration_${VERSION}.sql"

    # Binary
    echo "[2/4] Binary..."
    BIN_SRC="$PROJECT/target/debug/momentry_playground"
    if [ -f "$BIN_SRC" ]; then
      cp "$BIN_SRC" "$OUTPUT/bin/"
      echo " momentry_playground ($(ls -lh "$OUTPUT/bin/momentry_playground" | awk '{print $5}'))"
    else
      echo " WARN: binary not found at $BIN_SRC (build first: cargo build --bin momentry_playground)"
    fi

    # Source (optional)
    echo "[3/4] Source code..."
    cd "$PROJECT"
    git archive --format=tar.gz -o "$OUTPUT/src/momentry_core_${VERSION}_source.tar.gz" HEAD 2>/dev/null
    echo " source.tar.gz ($(ls -lh "$OUTPUT/src/momentry_core_${VERSION}_source.tar.gz" | awk '{print $5}'))"

    # UPGRADE.md
    echo "[4/4] UPGRADE.md..."
    cat > "$OUTPUT/UPGRADE.md" << UPEOF
# Dev Upgrade ${VERSION}
## Steps
\`\`\`bash
# 1. Apply migration
psql -U accusys -d momentry < schema/migration_${VERSION}.sql
# 2. Replace binary
cp bin/momentry_playground ${PROJECT}/target/debug/
# 3. Restart
pkill momentry_playground
DATABASE_SCHEMA=dev ${PROJECT}/target/debug/momentry_playground server --port 3003
\`\`\`
UPEOF

    # Package
    cd "$PROJECT/release"
    tar czf "dev_upgrade_${VERSION}.tar.gz" -C "$(dirname "$OUTPUT")" "$(basename "$OUTPUT")"
    echo ""
    echo "=== Package: release/dev_upgrade_${VERSION}.tar.gz ($(ls -lh "dev_upgrade_${VERSION}.tar.gz" | awk '{print $5}')) ==="
    ;;

  prod)
    VERSION="${2:?Usage: $0 prod <version>}"
    OUTPUT="$PROJECT/release/prod_upgrade_${VERSION}"
    mkdir -p "$OUTPUT/schema" "$OUTPUT/bin"
    echo "=== Prod Upgrade Package ${VERSION} ==="

    echo "[1/3] Migration SQL..."
    cat > "$OUTPUT/schema/migration_${VERSION}.sql" << SQLEOF
-- Migration: public schema ${VERSION}
-- Date: $(date +%Y-%m-%d)
SET search_path TO public;
-- ALTER TABLE public.videos ADD COLUMN IF NOT EXISTS ...;
SQLEOF
    echo " migration_${VERSION}.sql"

    echo "[2/3] Binary..."
    BIN_SRC="$PROJECT/target/release/momentry"
    if [ -f "$BIN_SRC" ]; then
      cp "$BIN_SRC" "$OUTPUT/bin/"
      echo " momentry ($(ls -lh "$OUTPUT/bin/momentry" | awk '{print $5}'))"
    else
      echo " WARN: release binary not found (build: cargo build --release --bin momentry)"
    fi

    echo "[3/3] UPGRADE.md..."
    cat > "$OUTPUT/UPGRADE.md" << UPEOF
# Prod Upgrade ${VERSION}
## Pre-flight
\`\`\`bash
pg_dump -U accusys -d momentry --schema=public > backup_${VERSION}_pre.sql
\`\`\`
## Steps
\`\`\`bash
# 1. Migration
psql -U accusys -d momentry < schema/migration_${VERSION}.sql
# 2. Replace binary
cp bin/momentry ${PROJECT}/target/release/
# 3. Restart
sudo launchctl stop com.momentry.api
sudo launchctl start com.momentry.api
\`\`\`
## Rollback
\`\`\`bash
psql -U accusys -d momentry < backup_${VERSION}_pre.sql
\`\`\`
UPEOF

    cd "$PROJECT/release"
    tar czf "prod_upgrade_${VERSION}.tar.gz" -C "$(dirname "$OUTPUT")" "$(basename "$OUTPUT")"
    echo ""
    echo "=== Package: release/prod_upgrade_${VERSION}.tar.gz ($(ls -lh "prod_upgrade_${VERSION}.tar.gz" | awk '{print $5}')) ==="
    ;;

  file)
    UUID="${2:?Usage: $0 file <uuid> [version]}"
    VERSION="${3:-v1.0}"

    # The UUID is spliced into SQL text and into a path that is later removed
    # with rm -rf, so refuse anything that is not a plain identifier.
    case "$UUID" in
      *[!A-Za-z0-9_-]*)
        echo "ERROR: invalid file_uuid '$UUID' (allowed: letters, digits, '_' and '-')" >&2
        exit 1
        ;;
    esac

    OUTPUT="$PROJECT/release/file_${UUID}_${VERSION}"
    echo "=== File Package ${UUID} ${VERSION} ==="

    # file_info.json
    echo "[1/5] file_info.json..."
    # psql stderr is left visible so a connection failure is diagnosable
    # instead of the script dying silently under `set -e`.
    FILE_INFO=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "
      SELECT json_build_object(
        'file_uuid', file_uuid,
        'file_name', file_name,
        'duration', duration,
        'fps', fps,
        'width', width,
        'height', height,
        'total_frames', total_frames,
        'status', status
      ) FROM dev.videos WHERE file_uuid='$UUID';
    ")
    # No matching row yields empty output; report that directly rather than
    # letting json.load() fail with a traceback.
    if [ -z "$FILE_INFO" ]; then
      echo " ERROR: no row in dev.videos for file_uuid=$UUID" >&2
      exit 1
    fi
    mkdir -p "$OUTPUT"
    # The destination path is passed as argv, not embedded in the Python source.
    printf '%s\n' "$FILE_INFO" \
      | python3 -c "import json,sys;json.dump(json.load(sys.stdin),open(sys.argv[1],'w'),indent=2)" "$OUTPUT/file_info.json"

    # data.sql (using Python export script)
    echo "[2/5] data.sql..."
    python3 "$PROJECT/scripts/export_file_package.py" "$UUID" "$OUTPUT" 2>&1

    # checksums.md5 — GNU md5sum first, BSD md5 as the fallback
    echo "[3/5] checksums.md5..."
    cd "$OUTPUT"
    md5sum data.sql file_info.json 2>/dev/null > checksums.md5 || md5 data.sql file_info.json > checksums.md5 2>/dev/null
    echo " checksums.md5"

    # README.md
    echo "[4/5] README.md..."
    cat > "$OUTPUT/README.md" << READMEEOF
# File Package: ${UUID}
## Import
\`\`\`bash
export UUID=${UUID}
# 1. Clean existing data for this file
psql -U accusys -d momentry << SQL
DELETE FROM dev.chunk_vectors WHERE uuid = '\${UUID}'::text;
DELETE FROM dev.chunk WHERE file_uuid = '\${UUID}';
DELETE FROM dev.face_detections WHERE file_uuid = '\${UUID}';
DELETE FROM dev.videos WHERE file_uuid = '\${UUID}';
SQL
# 2. Restore
psql -U accusys -d momentry < data.sql
# 3. Verify
psql -U accusys -d momentry -c "SELECT file_uuid, status FROM dev.videos WHERE file_uuid = '\${UUID}'"
\`\`\`
## Verification
- \`GET /api/v1/file/{uuid}\` → 200, status=ready
- \`GET /api/v1/file/{uuid}/chunk/0-01\` → 200
- \`POST /api/v1/search/universal\` → results
READMEEOF

    # Package
    echo "[5/5] Packaging tar.gz..."
    cd "$PROJECT/release"
    tar czf "file_${UUID}_${VERSION}.tar.gz" -C "$(dirname "$OUTPUT")" "$(basename "$OUTPUT")" 2>/dev/null
    echo " release/file_${UUID}_${VERSION}.tar.gz ($(ls -lh "file_${UUID}_${VERSION}.tar.gz" | awk '{print $5}'))"
    # Staging directory is no longer needed once the tarball exists.
    rm -rf "$OUTPUT"
    ;;

  *)
    echo "Usage: $0 <dev|prod|file> <arg> [version]"
    exit 1
    ;;
esac

ELAPSED=$(($(date +%s) - T0))
echo " Elapsed: ${ELAPSED}s"

119
scripts/package_file.sh Normal file
View File

@@ -0,0 +1,119 @@
#!/bin/bash
# Package File Content — single video's complete data
#
# Usage: bash package_file.sh <file_uuid> [version]     (version defaults to v1.0.0)
#
# Writes metadata.json, processor outputs, CSV exports (identities, face
# detections, chunks, chunk vectors, TKG nodes/edges) and RELEASE_INFO.txt to
# release/files/<uuid>/<version>, then points release/files/<uuid>/latest at it.
#
# The CSV exports use server-side COPY ... TO, so the PostgreSQL server process
# must be able to write to the output directory.
#
# Every export step is best-effort: a failed step prints a WARN and the script
# continues, so RELEASE_INFO.txt and the `latest` symlink are always produced.
set -euo pipefail

UUID="${1:?Usage: $0 <file_uuid> [version]}"
VERSION="${2:-v1.0.0}"

# The UUID is spliced into SQL text and filesystem paths below; refuse anything
# that is not a plain identifier.
case "$UUID" in
  *[!A-Za-z0-9_-]*)
    echo "ERROR: invalid file_uuid '$UUID' (allowed: letters, digits, '_' and '-')" >&2
    exit 1
    ;;
esac

PROJECT="/Users/accusys/momentry_core_0.1"
OUTPUT="$PROJECT/release/files/$UUID/$VERSION"
OUTPUT_DEV="/Users/accusys/momentry/output_dev"
PG_BIN="/Users/accusys/pgsql/18.3/bin"
T0=$(date +%s)

# copy_to_csv <csv_path> <select_sql>
# Runs a server-side COPY of the query result to <csv_path> (CSV with header).
# Returns psql's exit status; psql's own error output is left visible.
copy_to_csv() {
  local csv_path="$1" select_sql="$2"
  "$PG_BIN/psql" -U accusys -d momentry -c "
    COPY (
      $select_sql
    ) TO '$csv_path' WITH CSV HEADER;
  "
}

# export_step <csv_path> <select_sql>
# Exports one CSV and reports its line count, or warns if the export failed.
export_step() {
  local csv_path="$1" select_sql="$2"
  if copy_to_csv "$csv_path" "$select_sql"; then
    echo " $(wc -l < "$csv_path") rows"
  else
    echo " WARN: export failed: $(basename "$csv_path")" >&2
  fi
}

mkdir -p "$OUTPUT/processors"
echo "=== File Package ${UUID} ${VERSION} ==="

# 1. metadata
echo "[1/8] metadata.json..."
META=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "
  SELECT json_build_object(
    'file_uuid', file_uuid,
    'file_name', file_name,
    'file_path', file_path,
    'file_type', file_type,
    'duration', duration,
    'width', width,
    'height', height,
    'fps', fps,
    'status', status,
    'total_frames', total_frames,
    'registration_time', registration_time::text
  ) FROM dev.videos WHERE file_uuid='$UUID';
" 2>/dev/null) || META=""
# Only report the file size when metadata.json was actually written.
if [ -n "$META" ] && printf '%s\n' "$META" \
    | python3 -c "import json,sys;d=json.load(sys.stdin);json.dump(d,open(sys.argv[1],'w'),indent=2)" "$OUTPUT/metadata.json" 2>/dev/null; then
  echo " $(ls -lh "$OUTPUT/metadata.json" | awk '{print $5}')"
else
  echo " WARN: no metadata"
fi

# 2. Processor outputs
echo "[2/8] Processor outputs..."
for type in asr asrx asr-1 yolo face pose ocr cut scene; do
  src="$OUTPUT_DEV/${UUID}.${type}.json"
  if [ -f "$src" ]; then
    cp "$src" "$OUTPUT/processors/"
    echo " ${type}.json"
  fi
done

# 3. Identities (related to this file)
echo "[3/8] Identities..."
export_step "$OUTPUT/identities.csv" "
  SELECT DISTINCT i.uuid, i.name, i.identity_type, i.source, i.status, i.metadata
  FROM dev.identities i
  JOIN dev.identity_bindings ib ON ib.identity_id = i.id
  WHERE ib.file_uuid = '$UUID'"

# 4. Face detections
echo "[4/8] Face detections..."
export_step "$OUTPUT/face_detections.csv" "
  SELECT id, file_uuid, frame_number, timestamp_secs, face_id, x, y, width, height, confidence, trace_id, identity_id
  FROM dev.face_detections WHERE file_uuid = '$UUID'
  ORDER BY frame_number"

# 5. Chunks
echo "[5/8] Chunks..."
export_step "$OUTPUT/chunks.csv" "
  SELECT chunk_id, chunk_type, start_frame, end_frame, start_time, end_time, fps, text_content
  FROM dev.chunk WHERE file_uuid = '$UUID'
  ORDER BY id"

# 6. Vectors
echo "[6/8] chunk_vectors..."
export_step "$OUTPUT/chunk_vectors.csv" "
  SELECT cv.chunk_id, cv.embedding::text
  FROM dev.chunk_vectors cv
  JOIN dev.chunk c ON c.file_uuid=cv.uuid AND c.chunk_id=cv.chunk_id
  WHERE cv.uuid = '$UUID'"

# 7. TKG
echo "[7/8] TKG..."
TKG_OK=1
copy_to_csv "$OUTPUT/tkg_nodes.csv" "SELECT * FROM dev.tkg_nodes WHERE file_uuid='$UUID'" || TKG_OK=0
copy_to_csv "$OUTPUT/tkg_edges.csv" "SELECT * FROM dev.tkg_edges WHERE file_uuid='$UUID'" || TKG_OK=0
if [ "$TKG_OK" -eq 1 ]; then
  echo " nodes+edges exported"
else
  echo " WARN: TKG export incomplete" >&2
fi

# 8. RELEASE_INFO
echo "[8/8] RELEASE_INFO..."
# A failed count is recorded as "unknown" instead of aborting the script.
SENTENCE=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "SELECT count(*) FROM dev.chunk WHERE file_uuid='$UUID' AND chunk_type='sentence';" 2>/dev/null) || SENTENCE="unknown"
VECTORS=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "SELECT count(*) FROM dev.chunk_vectors cv JOIN dev.chunk c ON c.file_uuid=cv.uuid AND c.chunk_id=cv.chunk_id WHERE cv.uuid='$UUID';" 2>/dev/null) || VECTORS="unknown"
cat > "$OUTPUT/RELEASE_INFO.txt" << EOF
Release: ${VERSION}
Type: file
UUID: ${UUID}
Date: $(date +%Y-%m-%d)
Chunks: sentence=${SENTENCE}
Vectors: ${VECTORS}
Processors: $(ls "$OUTPUT/processors/" 2>/dev/null | wc -l | tr -d ' ')
EOF

# Symlink latest
ln -sfn "$OUTPUT" "$PROJECT/release/files/$UUID/latest"

ELAPSED=$(($(date +%s) - T0))
echo ""
echo "=== File Package done (${ELAPSED}s) ==="
echo " $OUTPUT"
du -sh "$OUTPUT"

100
scripts/package_release.sh Normal file
View File

@@ -0,0 +1,100 @@
#!/bin/bash
# ===========================================================
# Package Release — Standard Deliverable for M4
# Usage: bash scripts/package_release.sh <version> [file_uuid]
#   version:   e.g. v1.0.2
#   file_uuid: default aeed71342a899fe4b4c57b7d41bcb692
#
# Steps: dump dev.chunk / dev.chunk_vectors, verify that sentence-chunk,
# vector and matched counts agree, archive the source tree, copy the
# asr-1.json correction record, then REPLACE the contents of the
# M4_HANDOVER directory with the new deliverable.
#
# All files that are copied into M4_HANDOVER are checked up front, so the
# directory is never wiped when the deliverable cannot be completed.
# ===========================================================
set -euo pipefail

VERSION="${1:?Usage: $0 <version> [file_uuid]}"
UUID="${2:-aeed71342a899fe4b4c57b7d41bcb692}"

# The UUID is spliced into SQL text and file names below; refuse anything that
# is not a plain identifier.
case "$UUID" in
  *[!A-Za-z0-9_-]*)
    echo "ERROR: invalid file_uuid '$UUID' (allowed: letters, digits, '_' and '-')" >&2
    exit 1
    ;;
esac

PG_BIN="/Users/accusys/pgsql/18.3/bin"
PROJECT="/Users/accusys/momentry_core_0.1"
OUTPUT="$PROJECT/release/phase1/${VERSION}"
M4_DIR="$PROJECT/docs_v1.0/M4_HANDOVER"
HANDOVER_DOC="$PROJECT/docs_v1.0/API_V1.0.0/RELEASE/PHASE1_HANDOVER_V1.0.0.md"
API_TEST_SRC="/tmp/test_api.sh"
HANDOVER_SCRIPTS=(generate_asr1.py apply_asr_corrections.py clean_sentence_text.py pipeline_status.py)
T0=$(date +%s)

echo "=========================================="
echo " Package Release ${VERSION}"
echo " UUID: ${UUID}"
echo "=========================================="
echo ""

# ── Pre-flight: everything step 5 copies must exist before anything is wiped ──
MISSING=0
for s in "${HANDOVER_SCRIPTS[@]}"; do
  if [ ! -f "$PROJECT/scripts/$s" ]; then
    echo "ERROR: required script missing: $PROJECT/scripts/$s" >&2
    MISSING=1
  fi
done
for f in "$HANDOVER_DOC" "$API_TEST_SRC"; do
  if [ ! -f "$f" ]; then
    echo "ERROR: required file missing: $f" >&2
    MISSING=1
  fi
done
if [ "$MISSING" -ne 0 ]; then
  echo "Aborting before any changes were made." >&2
  exit 1
fi

mkdir -p "$OUTPUT"

# ── Step 1: DB backup (post-correction) ──
echo "[1/5] DB backup (post-correction)..."
"$PG_BIN/pg_dump" -U accusys -d momentry \
  --schema=dev \
  --table=dev.chunk \
  --table=dev.chunk_vectors \
  --data-only --column-inserts \
  > "$OUTPUT/dev_backup_post_correction.sql" 2>/dev/null
echo " $(ls -lh "$OUTPUT/dev_backup_post_correction.sql" | awk '{print $5}')"

# ── Step 2: Verify counts ──
echo "[2/5] Verify counts..."
SENTENCE=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "SELECT count(*) FROM dev.chunk WHERE file_uuid='$UUID' AND chunk_type='sentence';" 2>/dev/null)
VECTORS=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "SELECT count(*) FROM dev.chunk_vectors WHERE uuid='$UUID';" 2>/dev/null)
MATCHED=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "
  SELECT count(*) FROM dev.chunk_vectors cv
  JOIN dev.chunk c ON c.file_uuid=cv.uuid AND c.chunk_id=cv.chunk_id
  WHERE cv.uuid='$UUID';" 2>/dev/null)
echo " sentence chunks: $SENTENCE"
echo " vectors: $VECTORS"
echo " matched: $MATCHED"
# Every sentence chunk must have exactly one vector that joins back to it.
if [ "$SENTENCE" != "$VECTORS" ] || [ "$VECTORS" != "$MATCHED" ]; then
  echo " ❌ MISMATCH — aborting"
  exit 1
fi
echo " ✅ All counts consistent"

# ── Step 3: Source code archive ──
echo "[3/5] Source code (git archive)..."
cd "$PROJECT"
git archive --format=tar.gz -o "$OUTPUT/momentry_core_${VERSION}_source.tar.gz" HEAD 2>/dev/null
echo " $(ls -lh "$OUTPUT/momentry_core_${VERSION}_source.tar.gz" | awk '{print $5}')"

# ── Step 4: Correction record ──
echo "[4/5] Correction record (asr-1.json)..."
ASR1="/Users/accusys/momentry/output_dev/${UUID}.asr-1.json"
if [ -f "$ASR1" ]; then
  cp "$ASR1" "$OUTPUT/"
  echo " $(ls -lh "$OUTPUT/${UUID}.asr-1.json" | awk '{print $5}')"
else
  echo " ⚠️ No asr-1.json found for $UUID"
fi

# ── Step 5: Copy to M4_HANDOVER ──
echo "[5/5] Deploy to M4_HANDOVER..."
# Create the directory on first use, then clear any previous deliverable.
mkdir -p "$M4_DIR"
rm -rf "${M4_DIR:?}"/*
cp "$OUTPUT/dev_backup_post_correction.sql" "$M4_DIR/"
cp "$OUTPUT/momentry_core_${VERSION}_source.tar.gz" "$M4_DIR/"
if [ -f "$OUTPUT/${UUID}.asr-1.json" ]; then
  cp "$OUTPUT/${UUID}.asr-1.json" "$M4_DIR/"
fi
# Scripts
for s in "${HANDOVER_SCRIPTS[@]}"; do
  cp "$PROJECT/scripts/$s" "$M4_DIR/"
done
# Handover doc + test script
cp "$HANDOVER_DOC" "$M4_DIR/HANDOVER_${VERSION}.md"
cp "$API_TEST_SRC" "$M4_DIR/api_test.sh"

# Create RELEASE_INFO
# The pipeline summary is informational only; a failing status script or a
# missing TOTAL line leaves the field empty.
PIPELINE=$(python3 "$PROJECT/scripts/pipeline_status.py" 2>/dev/null | grep "TOTAL" | tr -d ' ' || true)
cat > "$M4_DIR/RELEASE_INFO.txt" << EOF
Release: ${VERSION}
Date: $(date +%Y-%m-%d)
UUID: ${UUID}
Pipeline: ${PIPELINE}
EOF

ELAPSED=$(($(date +%s) - T0))
echo ""
echo "=========================================="
echo " Package ${VERSION} complete (${ELAPSED}s)"
echo " Output: $OUTPUT"
echo " M4: $M4_DIR"
echo "=========================================="

100
scripts/package_system.sh Normal file
View File

@@ -0,0 +1,100 @@
#!/bin/bash
# Package System Upgrade — dev or prod
# Usage: bash package_system.sh dev <version>
#        bash package_system.sh prod <version>
#
# Collects a source archive, a schema-only dump of the target schema
# (dev -> "dev", prod -> "public"), helper scripts, the API test script, the
# portal build (dev only) and the environment file into
# release/system/<target>/<version>, then points .../<target>/latest at it.
#
# Optional inputs (tool scripts, api_test.sh, portal/dist, env file) are
# skipped with a message when absent, and RELEASE_INFO.txt lists only what
# was actually packaged.
set -euo pipefail

TARGET="${1:?Usage: $0 <dev|prod> <version>}"
VERSION="${2:?Usage: $0 <dev|prod> <version>}"
PROJECT="/Users/accusys/momentry_core_0.1"
OUTPUT="$PROJECT/release/system/$TARGET/$VERSION"
PG_BIN="/Users/accusys/pgsql/18.3/bin"
T0=$(date +%s)

case "$TARGET" in
  dev)
    SCHEMA="dev"
    ENV_FILE=".env.development"
    ;;
  prod)
    SCHEMA="public"
    ENV_FILE=".env"
    ;;
  *)
    echo "Target must be 'dev' or 'prod'"
    exit 1
    ;;
esac

mkdir -p "$OUTPUT/scripts" "$OUTPUT/test" "$OUTPUT/migration" "$OUTPUT/portal"
echo "=== System Package (${TARGET}) ${VERSION} ==="

# 1. Source code
echo "[1/6] Source code (git archive)..."
cd "$PROJECT"
git archive --format=tar.gz -o "$OUTPUT/source.tar.gz" HEAD 2>/dev/null
echo " $(ls -lh "$OUTPUT/source.tar.gz" | awk '{print $5}')"

# 2. Schema
echo "[2/6] Schema DDL (${SCHEMA})..."
"$PG_BIN/pg_dump" -U accusys -d momentry --schema="$SCHEMA" --schema-only > "$OUTPUT/schema.sql" 2>/dev/null
echo " $(ls -lh "$OUTPUT/schema.sql" | awk '{print $5}')"

# 3. Scripts (tools)
echo "[3/6] Tools & scripts..."
for s in pipeline_status.py generate_asr1.py apply_asr_corrections.py clean_sentence_text.py import_file_package.py; do
  if [ -f "$PROJECT/scripts/$s" ]; then
    cp "$PROJECT/scripts/$s" "$OUTPUT/scripts/"
    echo " $s"
  fi
done
# api_test.sh is optional like the tool scripts above: warn instead of aborting.
API_TEST="$PROJECT/docs_v1.0/M4_HANDOVER/api_test.sh"
HAVE_API_TEST=0
if [ -f "$API_TEST" ]; then
  cp "$API_TEST" "$OUTPUT/test/"
  echo " api_test.sh"
  HAVE_API_TEST=1
else
  echo " WARN: api_test.sh not found at $API_TEST"
fi

# 4. Portal frontend (dev only)
if [ "$TARGET" = "dev" ] && [ -d "$PROJECT/portal/dist" ]; then
  echo "[4/6] Portal frontend (dist)..."
  cp -r "$PROJECT/portal/dist" "$OUTPUT/portal/"
  echo " dist/ ($(du -sh "$PROJECT/portal/dist" | awk '{print $1}'))"
elif [ "$TARGET" = "dev" ]; then
  echo "[4/6] Portal frontend — SKIP (no dist, run 'npm run build' first)"
else
  echo "[4/6] Portal frontend — SKIP (dev only)"
fi

# 5. Env config
echo "[5/6] Environment config..."
HAVE_ENV=0
if [ -f "$PROJECT/$ENV_FILE" ]; then
  cp "$PROJECT/$ENV_FILE" "$OUTPUT/"
  echo " $ENV_FILE"
  HAVE_ENV=1
else
  echo " WARN: $ENV_FILE not found"
fi

# 6. RELEASE_INFO
echo "[6/6] RELEASE_INFO..."
cat > "$OUTPUT/RELEASE_INFO.txt" << EOF
Release: ${VERSION}
Target: ${TARGET}
Schema: ${SCHEMA}
Date: $(date +%Y-%m-%d)
Contents:
- source.tar.gz
- schema.sql (${SCHEMA})
- scripts/* (tools)
EOF
# Optional items are listed only when they were actually copied.
if [ "$HAVE_API_TEST" -eq 1 ]; then
  echo "- test/api_test.sh" >> "$OUTPUT/RELEASE_INFO.txt"
fi
if [ "$HAVE_ENV" -eq 1 ]; then
  echo "- ${ENV_FILE}" >> "$OUTPUT/RELEASE_INFO.txt"
fi
if [ "$TARGET" = "dev" ] && [ -d "$OUTPUT/portal/dist" ]; then
  echo " - portal/dist/" >> "$OUTPUT/RELEASE_INFO.txt"
fi

# Symlink latest
mkdir -p "$PROJECT/release/system/$TARGET"
ln -sfn "$OUTPUT" "$PROJECT/release/system/$TARGET/latest"

ELAPSED=$(($(date +%s) - T0))
echo ""
echo "=== System Package (${TARGET}) done (${ELAPSED}s) ==="
echo " $OUTPUT"
du -sh "$OUTPUT"

View File

@@ -0,0 +1,47 @@
#!/bin/bash
# PostgreSQL 18.3 - build from source
# Usage: bash scripts/setup/01_postgresql.sh
#
# Installs build dependencies with Homebrew, downloads and builds PostgreSQL
# into $HOME/pgsql/<version>, and initializes a data directory at
# $HOME/pgsql/data. Safe to re-run: an already initialized data directory is
# left untouched.
set -euo pipefail

PG_VERSION="18.3"
PG_SOURCE_URL="https://ftp.postgresql.org/pub/source/v${PG_VERSION}/postgresql-${PG_VERSION}.tar.gz"
PG_PREFIX="$HOME/pgsql/${PG_VERSION}"
PG_DATA="$HOME/pgsql/data"
BUILD_DIR="$HOME/momentry_core_0.1/services/postgresql"

echo "=== PostgreSQL ${PG_VERSION} Source Build ==="

# Step 1: Install build deps (via Homebrew)
echo "[1/5] Installing build dependencies..."
brew install readline zlib icu4c openssl e2fsprogs pkg-config

# Step 2: Download source
echo "[2/5] Downloading PostgreSQL ${PG_VERSION} source..."
mkdir -p "$BUILD_DIR"
cd "$BUILD_DIR"
# -f makes curl fail on an HTTP error instead of saving the error page as the
# tarball; -S keeps the error message visible despite -s.
curl -fsSL "$PG_SOURCE_URL" -o "postgresql-${PG_VERSION}.tar.gz"
tar xzf "postgresql-${PG_VERSION}.tar.gz"
cd "postgresql-${PG_VERSION}"

# Step 3: Configure
echo "[3/5] Configuring..."
export PKG_CONFIG_PATH="/opt/homebrew/opt/zlib/lib/pkgconfig:/opt/homebrew/opt/readline/lib/pkgconfig:/opt/homebrew/opt/icu4c/lib/pkgconfig:/opt/homebrew/lib/pkgconfig"
export LDFLAGS="-L/opt/homebrew/opt/openssl/lib"
export CPPFLAGS="-I/opt/homebrew/opt/openssl/include"
./configure --prefix="$PG_PREFIX" --with-uuid=e2fs --with-icu --with-openssl

# Step 4: Build
echo "[4/5] Building (parallel)..."
CORES=$(sysctl -n hw.ncpu)
make -j"$CORES"
make install

# Step 5: Initialize data directory
echo "[5/5] Initializing data directory..."
# initdb refuses a non-empty directory; PG_VERSION marks an existing cluster,
# so skip initialization on re-runs rather than aborting.
if [ -f "$PG_DATA/PG_VERSION" ]; then
  echo " data directory already initialized at $PG_DATA — skipping initdb"
else
  mkdir -p "$PG_DATA"
  "$PG_PREFIX/bin/initdb" -D "$PG_DATA"
fi

# Record checksum
echo "PostgreSQL SHA256: $(shasum -a 256 "$PG_PREFIX/bin/postgres" | cut -d' ' -f1)"
echo "=== PostgreSQL ${PG_VERSION} build complete ==="
# The install prefix is not on PATH by default, so print the full pg_ctl path.
echo "Start: $PG_PREFIX/bin/pg_ctl -D $PG_DATA -l $HOME/pgsql/pg.log start"

View File

@@ -1 +1 @@
16433
55053

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff Show More