fix: restore identity_id after face_dedup, rebuild package v20260512
- Re-ran identity_bind.py to restore identity_id on face_detections
- Dedup cleanup had removed the rows with identity_id and kept the NULL rows
- 70691 face_detections now have identity_id, 428 identities
- Full package rebuild: 169MB sqlite, 1358MB tar.gz
- identities.json: 428 identities + 5483 bindings + 5483 trace maps
- TMDB matching complete: Audrey Hepburn 843 traces, Cary Grant 482
This commit is contained in:
Binary file not shown.
511
scripts/asr_processor.py.backup
Executable file
511
scripts/asr_processor.py.backup
Executable file
@@ -0,0 +1,511 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
ASR Processor with chunked transcription for large files and resource monitoring.
|
||||
Maintains backward compatibility with existing API.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
import signal
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
# Try to import psutil for resource monitoring
|
||||
PSUTIL_AVAILABLE = False
|
||||
psutil = None
|
||||
try:
|
||||
import psutil
|
||||
|
||||
PSUTIL_AVAILABLE = True
|
||||
except ImportError:
|
||||
sys.stderr.write("WARNING: psutil not available, resource monitoring disabled\n")
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
    """Report the received signal on stderr and terminate with exit status 1.

    Takes the ``(signum, frame)`` pair that handlers registered through
    ``signal.signal`` receive; ``frame`` is not used.
    """
    notice = f"ASR: Received signal {signum}, exiting...\n"
    sys.stderr.write(notice)
    raise SystemExit(1)
|
||||
|
||||
|
||||
def has_audio_stream(video_path: str) -> bool:
    """Tell whether ffprobe lists at least one audio stream in ``video_path``.

    Returns:
        True when ffprobe prints any audio stream entry; False when it prints
        nothing or exits with a non-zero status. When the ffprobe executable
        cannot be found, a warning is written to stderr and True is returned
        so that processing is attempted anyway.
    """
    probe_args = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "a",
        "-show_entries",
        "stream=codec_type",
        "-of",
        "csv=p=0",
        video_path,
    ]
    try:
        probe = subprocess.run(probe_args, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        sys.stderr.write("WARNING: ffprobe not found, assuming audio exists\n")
        return True
    # Any non-blank output means at least one audio stream entry was printed.
    return probe.stdout.strip() != ""
|
||||
|
||||
|
||||
def get_media_duration(media_path: str) -> float:
    """Return the duration of ``media_path`` in seconds as reported by ffprobe.

    ffprobe's exit status is not checked; when its output cannot be parsed as
    a float (for example because it is empty), 0.0 is returned instead.
    A missing ffprobe executable is not handled here and propagates as the
    exception raised by ``subprocess.run``.
    """
    probe = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "csv=p=0",
            media_path,
        ],
        capture_output=True,
        text=True,
    )
    try:
        seconds = float(probe.stdout.strip())
    except (ValueError, AttributeError):
        seconds = 0.0
    return seconds
|
||||
|
||||
|
||||
def extract_audio(video_path: str, audio_path: str) -> bool:
    """Run ffmpeg to write the audio of ``video_path`` to ``audio_path`` as WAV.

    The output is requested as 16-bit PCM (``pcm_s16le``), 16000 Hz, one
    channel.

    Returns:
        True only when ffmpeg exits with status 0 and ``audio_path`` exists.
    """
    ffmpeg_args = ["ffmpeg", "-i", video_path]
    ffmpeg_args += ["-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    ffmpeg_args += ["-y", audio_path]

    completed = subprocess.run(ffmpeg_args, capture_output=True)
    if completed.returncode != 0:
        return False
    return os.path.exists(audio_path)
|
||||
|
||||
|
||||
def extract_chunk(
    audio_path: str, start: float, duration: float, output_path: str
) -> bool:
    """Cut ``duration`` seconds starting at ``start`` out of ``audio_path`` with ffmpeg.

    The chunk is written to ``output_path`` as 16-bit PCM (``pcm_s16le``),
    16000 Hz, one channel.

    Returns:
        True only when ffmpeg exits with status 0 and ``output_path`` exists
        and is non-empty.
    """
    cmd = [
        "ffmpeg",
        "-i",
        audio_path,
        "-ss",
        str(start),
        "-t",
        str(duration),
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-y",
        output_path,
    ]
    result = subprocess.run(cmd, capture_output=True)
    # The exit status must be checked: a stale or partially written file at
    # output_path would otherwise make a failed extraction look successful.
    if result.returncode != 0:
        return False
    return os.path.exists(output_path) and os.path.getsize(output_path) > 0
|
||||
|
||||
|
||||
def monitor_resources(pid: int, interval: float = 0.1) -> Dict[str, Any]:
    """Sample CPU and resident-memory usage of process ``pid`` through psutil.

    Args:
        pid: Process id to inspect.
        interval: Passed to ``psutil.Process.cpu_percent`` as its ``interval``.

    Returns:
        A dict with ``cpu_percent``, ``memory_mb`` (RSS in MiB), ``available``
        and ``pid``. When psutil is not importable, or the process cannot be
        inspected, ``available`` is False, both readings are 0.0 and ``pid``
        is omitted.
    """
    unavailable = {"cpu_percent": 0.0, "memory_mb": 0.0, "available": False}
    if psutil is None or not PSUTIL_AVAILABLE:
        return unavailable

    try:
        proc = psutil.Process(pid)
        cpu = proc.cpu_percent(interval=interval)
        rss_bytes = proc.memory_info().rss
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return unavailable

    return {
        "cpu_percent": cpu,
        "memory_mb": rss_bytes / (1024 * 1024),
        "available": True,
        "pid": pid,
    }
|
||||
|
||||
|
||||
def transcribe_direct(
    model, audio_path: str, publisher: Optional[RedisPublisher] = None
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe the whole of ``audio_path`` in a single ``model.transcribe`` call.

    Args:
        model: Object whose ``transcribe(path, beam_size=...)`` returns a
            ``(segments, info)`` pair (presumably a faster_whisper
            ``WhisperModel`` - confirm against the caller).
        audio_path: Audio file to transcribe.
        publisher: Optional progress reporter; a progress message is sent
            every 100 segments and a summary when finished.

    Returns:
        ``(segments, info)`` where ``segments`` is a list of dicts with
        ``start``, ``end`` and stripped ``text``, and ``info`` is passed
        through unchanged from the model.
    """
    if publisher:
        publisher.info("asr", "Transcribing audio directly...")

    began = time.time()
    segment_iter, info = model.transcribe(audio_path, beam_size=5)

    transcript = []
    for count, seg in enumerate(segment_iter, start=1):
        transcript.append(
            {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
        )
        if count % 100 == 0 and publisher:
            publisher.progress("asr", count, 0, f"Segment {count}")

    took = time.time() - began
    if publisher:
        publisher.info(
            "asr", f"Direct transcription: {len(transcript)} segments in {took:.1f}s"
        )

    return transcript, info
|
||||
|
||||
|
||||
def transcribe_chunk(
    model,
    chunk_path: str,
    chunk_start: float,
    chunk_idx: int,
    total_chunks: int,
    publisher: Optional[RedisPublisher] = None,
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe one chunk file and shift its timestamps by ``chunk_start``.

    Args:
        model: Object whose ``transcribe(path, beam_size=...)`` returns a
            ``(segments, info)`` pair.
        chunk_path: Audio file holding this chunk.
        chunk_start: Offset in seconds added to every segment's start and end.
        chunk_idx: Zero-based chunk index, used only in progress messages.
        total_chunks: Total number of chunks, used only in progress messages.
        publisher: Optional progress reporter.

    Returns:
        ``(segments, info)`` where ``segments`` is a list of dicts with the
        shifted ``start`` / ``end`` and stripped ``text``.
    """
    if publisher:
        publisher.info("asr", f"Transcribing chunk {chunk_idx + 1}/{total_chunks}")

    began = time.time()
    segment_iter, info = model.transcribe(chunk_path, beam_size=5)

    shifted = [
        {
            "start": seg.start + chunk_start,
            "end": seg.end + chunk_start,
            "text": seg.text.strip(),
        }
        for seg in segment_iter
    ]

    took = time.time() - began
    if publisher:
        publisher.info(
            "asr",
            f"Chunk {chunk_idx + 1}/{total_chunks}: {len(shifted)} segments in {took:.1f}s",
        )

    return shifted, info
|
||||
|
||||
|
||||
def _plan_chunks(total_duration: float, chunk_duration: float) -> List[Dict[str, Any]]:
    """Split ``[0, total_duration)`` into consecutive windows of at most ``chunk_duration`` seconds.

    Raises:
        ValueError: if ``chunk_duration`` is not positive (the loop below
            would otherwise never advance).
    """
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")

    plan = []
    start = 0.0
    while start < total_duration:
        end = min(start + chunk_duration, total_duration)
        plan.append(
            {"start": start, "end": end, "duration": end - start, "idx": len(plan)}
        )
        start = end
    return plan


def run_asr(
    video_path: str,
    output_path: str,
    uuid: str = "",
    chunk_duration: int = 600,  # 10 minutes default
    max_direct_duration: int = 1800,  # 30 minutes: use direct transcription for shorter files
    model_size: str = "tiny",
    compute_type: str = "int8",
    monitor_interval: int = 60,
) -> None:
    """Transcribe ``video_path`` and write the result as JSON to ``output_path``.

    Steps: install SIGTERM/SIGINT handlers, check for an audio stream,
    extract the audio to a temporary WAV, load the Whisper model, transcribe
    either directly or in chunks, then write the JSON document.

    This function always ends the process through ``sys.exit``: status 0 on
    success (including the "no audio stream" case, which writes an empty
    result) and status 1 on failure.

    Args:
        video_path: Input media file.
        output_path: Destination of the JSON result.
        uuid: When non-empty, progress is reported through ``RedisPublisher(uuid)``.
        chunk_duration: Chunk length in seconds for chunked mode; must be positive.
        max_direct_duration: Audio longer than this many seconds is chunked.
        model_size: First argument passed to ``WhisperModel``.
        compute_type: ``compute_type`` passed to ``WhisperModel``.
        monitor_interval: Minimum seconds between resource-usage reports.
    """
    # Set up signal handlers
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    publisher = RedisPublisher(uuid) if uuid else None
    if publisher:
        publisher.info("asr", "ASR_START")

    def fail(message: str) -> None:
        """Report ``message`` to the publisher and stderr, then exit with status 1."""
        if publisher:
            publisher.error("asr", message)
        sys.stderr.write(f"ASR: {message}\n")
        sys.stderr.flush()
        sys.exit(1)

    # Check for audio stream
    if not has_audio_stream(video_path):
        if publisher:
            publisher.info("asr", "No audio stream detected, skipping transcription")
        output = {"language": "", "language_probability": 0.0, "segments": []}
        with open(output_path, "w") as f:
            json.dump(output, f, indent=2)
        if publisher:
            publisher.complete("asr", "0 segments (no audio)")
        sys.stderr.write("ASR: No audio stream, skipping transcription\n")
        sys.stderr.flush()
        sys.exit(0)

    # Create temporary directory
    temp_dir = tempfile.mkdtemp(prefix="asr_")
    try:
        audio_path = os.path.join(temp_dir, "audio.wav")

        if publisher:
            publisher.info("asr", "Extracting audio from video...")

        # Extract audio
        if not extract_audio(video_path, audio_path):
            fail("Failed to extract audio")

        # Get audio duration
        try:
            total_duration = get_media_duration(audio_path)
        except Exception as e:
            fail(f"Failed to get audio duration: {e}")

        if publisher:
            publisher.info(
                "asr",
                f"Audio duration: {total_duration:.1f}s ({total_duration / 3600:.1f} hrs)",
            )

        # Load Whisper model
        if publisher:
            publisher.info(
                "asr", f"Loading Whisper model ({model_size}, {compute_type})..."
            )

        try:
            from faster_whisper import WhisperModel

            model = WhisperModel(model_size, device="cpu", compute_type=compute_type)
        except Exception as e:
            fail(f"Failed to load Whisper model: {e}")

        if publisher:
            publisher.info("asr", "Whisper model loaded successfully")

        # Decide whether to use chunked or direct transcription
        use_chunked = total_duration > max_direct_duration

        all_segments = []
        language = None
        language_prob = None
        chunks = []

        if not use_chunked:
            # Direct transcription for shorter audio
            if publisher:
                publisher.info(
                    "asr", f"Using direct transcription (duration ≤ {max_direct_duration}s)"
                )

            try:
                segments, info = transcribe_direct(model, audio_path, publisher)
                all_segments.extend(segments)
                language = info.language
                language_prob = info.language_probability
            except Exception as e:
                if publisher:
                    publisher.error("asr", f"Direct transcription failed: {e}")
                sys.stderr.write(f"ASR: Direct transcription failed: {e}\n")
                sys.stderr.flush()
                # Fall back to chunked approach
                use_chunked = True
                if publisher:
                    publisher.info("asr", "Falling back to chunked transcription")

        if use_chunked:
            # A non-positive chunk length would make chunk planning loop forever.
            if chunk_duration <= 0:
                fail(f"Invalid chunk duration: {chunk_duration}s (must be positive)")

            # Chunked transcription for long audio
            if publisher:
                publisher.info(
                    "asr", f"Using chunked transcription ({chunk_duration}s chunks)"
                )

            chunks = _plan_chunks(total_duration, chunk_duration)

            if publisher:
                publisher.info("asr", f"Split into {len(chunks)} chunks")

            chunk_temp_dir = os.path.join(temp_dir, "chunks")
            os.makedirs(chunk_temp_dir, exist_ok=True)

            last_resource_report = time.time()

            for i, chunk in enumerate(chunks):
                chunk_path = os.path.join(chunk_temp_dir, f"chunk_{i:04d}.wav")

                if publisher:
                    publisher.progress(
                        "asr", i, len(chunks), f"Processing chunk {i + 1}/{len(chunks)}"
                    )

                # Extract chunk
                if not extract_chunk(
                    audio_path, chunk["start"], chunk["duration"], chunk_path
                ):
                    if publisher:
                        publisher.warning("asr", f"Failed to extract chunk {i}, skipping")
                    continue

                # Resource monitoring (sample every monitor_interval seconds)
                current_time = time.time()
                if (
                    PSUTIL_AVAILABLE
                    and publisher
                    and (current_time - last_resource_report) >= monitor_interval
                ):
                    resources = monitor_resources(os.getpid())
                    if resources["available"]:
                        publisher.info(
                            "asr",
                            f"Resource usage: CPU {resources['cpu_percent']:.1f}%, "
                            f"Memory {resources['memory_mb']:.1f}MB",
                        )
                    last_resource_report = current_time

                # Transcribe chunk; a failing chunk is reported and skipped
                try:
                    segments, info = transcribe_chunk(
                        model, chunk_path, chunk["start"], i, len(chunks), publisher
                    )
                    all_segments.extend(segments)

                    if language is None:
                        language = info.language
                        language_prob = info.language_probability
                        if publisher:
                            publisher.info(
                                "asr",
                                f"Detected language: {language} (prob {language_prob:.2f})",
                            )
                except Exception as e:
                    if publisher:
                        publisher.error("asr", f"Error transcribing chunk {i}: {e}")
                    sys.stderr.write(f"ASR: Error transcribing chunk {i}: {e}\n")
                    sys.stderr.flush()

                # Remove the chunk file early to bound disk usage; the whole
                # temp directory is removed in the finally clause anyway.
                try:
                    os.unlink(chunk_path)
                except OSError:
                    pass
    finally:
        # Runs on every exit path, including SystemExit raised by fail() or
        # by the signal handler, so the extracted WAV is never left behind.
        shutil.rmtree(temp_dir, ignore_errors=True)

    # Sort segments by start time
    all_segments.sort(key=lambda x: x["start"])

    # Prepare output (maintain same format as original)
    output = {
        "language": language or "",
        "language_probability": language_prob or 0.0,
        "segments": all_segments,
    }

    # Add metadata for chunked processing (optional)
    if use_chunked:
        output["processing_mode"] = "chunked"
        output["chunk_count"] = len(chunks)
        output["chunk_duration"] = chunk_duration
    else:
        output["processing_mode"] = "direct"

    # Write output
    with open(output_path, "w") as f:
        json.dump(output, f, indent=2)

    if publisher:
        publisher.complete(
            "asr",
            f"{len(all_segments)} segments ({'chunked' if use_chunked else 'direct'} mode)",
        )

    sys.stderr.write(
        f"ASR: Transcription complete, {len(all_segments)} segments written to {output_path}\n"
    )
    sys.stderr.flush()
    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse arguments, apply environment-variable
    # overrides, then hand over to run_asr (which exits the process itself).
    parser = argparse.ArgumentParser(
        description="ASR Transcription with chunked processing"
    )
    parser.add_argument("video_path", help="Path to video file")
    parser.add_argument("output_path", help="Output JSON path")
    parser.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    parser.add_argument("--version", action="version", version="2.0.0")

    # Hidden arguments for configuration (can be set via environment variables)
    parser.add_argument(
        "--chunk-duration", type=int, default=600, help=argparse.SUPPRESS
    )  # 10 minutes default
    parser.add_argument(
        "--max-direct-duration", type=int, default=1800, help=argparse.SUPPRESS
    )  # 30 minutes
    parser.add_argument("--model-size", default="tiny", help=argparse.SUPPRESS)
    parser.add_argument("--compute-type", default="int8", help=argparse.SUPPRESS)
    parser.add_argument(
        "--monitor-interval", type=int, default=60, help=argparse.SUPPRESS
    )

    args = parser.parse_args()

    # Environment variables, when set, take precedence over the command line.
    chunk_duration_str = os.environ.get("MOMENTRY_ASR_CHUNK_DURATION")
    if chunk_duration_str is not None:
        chunk_duration = int(chunk_duration_str)
    else:
        chunk_duration = args.chunk_duration

    max_direct_duration_str = os.environ.get("MOMENTRY_ASR_MAX_DIRECT_DURATION")
    if max_direct_duration_str is not None:
        max_direct_duration = int(max_direct_duration_str)
    else:
        max_direct_duration = args.max_direct_duration

    model_size = os.environ.get("MOMENTRY_ASR_MODEL_SIZE")
    if model_size is None:
        model_size = args.model_size

    compute_type = os.environ.get("MOMENTRY_ASR_COMPUTE_TYPE")
    if compute_type is None:
        compute_type = args.compute_type

    run_asr(
        args.video_path,
        args.output_path,
        args.uuid,
        chunk_duration,
        max_direct_duration,
        model_size,
        compute_type,
        # Previously parsed but never forwarded, so --monitor-interval had no effect.
        args.monitor_interval,
    )
|
||||
672
scripts/asr_processor.py.bak
Executable file
672
scripts/asr_processor.py.bak
Executable file
@@ -0,0 +1,672 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
ASR Processor with chunked transcription for large files and resource monitoring.
|
||||
Maintains backward compatibility with existing API.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
import argparse
|
||||
import signal
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
import shutil
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
# Try to import psutil for resource monitoring
|
||||
PSUTIL_AVAILABLE = False
|
||||
psutil = None
|
||||
try:
|
||||
import psutil
|
||||
|
||||
PSUTIL_AVAILABLE = True
|
||||
except ImportError:
|
||||
sys.stderr.write("WARNING: psutil not available, resource monitoring disabled\n")
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
||||
from redis_publisher import RedisPublisher # noqa: E402
|
||||
|
||||
|
||||
def save_checkpoint(
    checkpoint_path: str,
    segments: List[Dict[str, Any]],
    language: Optional[str],
    language_prob: Optional[float],
    processed_chunks: List[int],
    total_chunks: int,
) -> None:
    """Save a transcription checkpoint as JSON so a run can be resumed later.

    The data is written to ``<checkpoint_path>.tmp`` and then moved into place
    with ``os.replace``, so a failure while writing never destroys an existing
    checkpoint. Saving is best-effort: any error is reported on stderr and
    otherwise ignored.

    Args:
        checkpoint_path: Destination file.
        segments: Segments transcribed so far.
        language: Detected language; stored as "" when None.
        language_prob: Detection probability; stored as 0.0 when None.
        processed_chunks: Indices of the chunks already processed.
        total_chunks: Total number of chunks in the run.
    """
    checkpoint_data = {
        "segments": segments,
        "language": language or "",
        "language_probability": language_prob or 0.0,
        "processed_chunks": processed_chunks,
        "total_chunks": total_chunks,
        "timestamp": time.time(),
    }
    tmp_path = f"{checkpoint_path}.tmp"
    try:
        with open(tmp_path, "w") as f:
            json.dump(checkpoint_data, f, indent=2, default=str)
        os.replace(tmp_path, checkpoint_path)
    except Exception as e:
        sys.stderr.write(f"ASR: Failed to save checkpoint: {e}\n")
        # Do not leave a half-written temporary file behind.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
|
||||
|
||||
|
||||
def load_checkpoint(checkpoint_path: str) -> Optional[Dict[str, Any]]:
    """Load a transcription checkpoint from ``checkpoint_path``.

    Returns:
        The decoded JSON object, or None when the file does not exist, cannot
        be read or parsed, or does not contain a JSON object. A missing file
        is silent (the normal "nothing to resume" case); every other failure
        is reported on stderr.
    """
    try:
        with open(checkpoint_path, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        return None
    except Exception as e:
        sys.stderr.write(f"ASR: Ignoring unreadable checkpoint: {e}\n")
        return None

    # Valid JSON that is not an object (e.g. a list) is not a usable checkpoint.
    if not isinstance(data, dict):
        sys.stderr.write("ASR: Ignoring checkpoint that is not a JSON object\n")
        return None
    return data
|
||||
|
||||
|
||||
def _probe_tool(tool: str) -> Dict[str, Any]:
    """Run ``<tool> -version`` and summarise availability and reported version."""
    try:
        result = subprocess.run([tool, "-version"], capture_output=True, text=True)
    except (OSError, ValueError) as e:
        return {"available": False, "error": str(e)}

    # The version is taken from the third space-separated field of the first
    # output line; an unexpected banner yields "unknown" rather than an error,
    # so an installed tool is never reported as unavailable.
    fields = result.stdout.split("\n")[0].split(" ") if result.stdout else []
    version = fields[2] if len(fields) > 2 else "unknown"
    return {"available": result.returncode == 0, "version": version}


def check_health() -> Dict[str, Any]:
    """Check the dependencies of the ASR processor.

    Returns:
        A dict with ``status`` ("healthy" or "unhealthy"), ``checks`` (one
        entry each for ffmpeg, ffprobe, faster_whisper and psutil) and
        ``timestamp``. The status is "unhealthy" when ffmpeg, ffprobe or
        faster_whisper is unavailable; a missing psutil only produces a
        warning entry.
    """
    health = {
        "status": "healthy",
        "checks": {},
        "timestamp": time.time(),
    }
    checks = health["checks"]

    # External command-line tools
    for tool in ("ffmpeg", "ffprobe"):
        checks[tool] = _probe_tool(tool)

    # Check faster_whisper import
    try:
        import faster_whisper

        checks["faster_whisper"] = {
            "available": True,
            "version": getattr(faster_whisper, "__version__", "unknown"),
        }
    except ImportError as e:
        checks["faster_whisper"] = {"available": False, "error": str(e)}
        health["status"] = "unhealthy"

    # Check psutil import (optional: only resource monitoring depends on it)
    try:
        import psutil

        checks["psutil"] = {
            "available": True,
            "version": getattr(psutil, "__version__", "unknown"),
        }
    except ImportError:
        checks["psutil"] = {
            "available": False,
            "warning": "resource monitoring disabled",
        }

    # Determine overall status
    if not all(checks[tool].get("available", False) for tool in ("ffmpeg", "ffprobe")):
        health["status"] = "unhealthy"

    return health
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
    """Report the received signal on stderr and terminate with exit status 1.

    Takes the ``(signum, frame)`` pair that handlers registered through
    ``signal.signal`` receive; ``frame`` is not used.
    """
    notice = f"ASR: Received signal {signum}, exiting...\n"
    sys.stderr.write(notice)
    raise SystemExit(1)
|
||||
|
||||
|
||||
def has_audio_stream(video_path: str) -> bool:
    """Tell whether ffprobe lists at least one audio stream in ``video_path``.

    Returns:
        True when ffprobe prints any audio stream entry; False when it prints
        nothing or exits with a non-zero status. When the ffprobe executable
        cannot be found, a warning is written to stderr and True is returned
        so that processing is attempted anyway.
    """
    probe_args = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "a",
        "-show_entries",
        "stream=codec_type",
        "-of",
        "csv=p=0",
        video_path,
    ]
    try:
        probe = subprocess.run(probe_args, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError:
        return False
    except FileNotFoundError:
        sys.stderr.write("WARNING: ffprobe not found, assuming audio exists\n")
        return True
    # Any non-blank output means at least one audio stream entry was printed.
    return probe.stdout.strip() != ""
|
||||
|
||||
|
||||
def get_media_duration(media_path: str) -> float:
    """Return the duration of ``media_path`` in seconds as reported by ffprobe.

    ffprobe's exit status is not checked; when its output cannot be parsed as
    a float (for example because it is empty), 0.0 is returned instead.
    A missing ffprobe executable is not handled here and propagates as the
    exception raised by ``subprocess.run``.
    """
    probe = subprocess.run(
        [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "csv=p=0",
            media_path,
        ],
        capture_output=True,
        text=True,
    )
    try:
        seconds = float(probe.stdout.strip())
    except (ValueError, AttributeError):
        seconds = 0.0
    return seconds
|
||||
|
||||
|
||||
def extract_audio(video_path: str, audio_path: str) -> bool:
    """Run ffmpeg to write the audio of ``video_path`` to ``audio_path`` as WAV.

    The output is requested as 16-bit PCM (``pcm_s16le``), 16000 Hz, one
    channel.

    Returns:
        True only when ffmpeg exits with status 0 and ``audio_path`` exists.
    """
    ffmpeg_args = ["ffmpeg", "-i", video_path]
    ffmpeg_args += ["-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    ffmpeg_args += ["-y", audio_path]

    completed = subprocess.run(ffmpeg_args, capture_output=True)
    if completed.returncode != 0:
        return False
    return os.path.exists(audio_path)
|
||||
|
||||
|
||||
def extract_chunk(
    audio_path: str, start: float, duration: float, output_path: str
) -> bool:
    """Cut ``duration`` seconds starting at ``start`` out of ``audio_path`` with ffmpeg.

    The chunk is written to ``output_path`` as 16-bit PCM (``pcm_s16le``),
    16000 Hz, one channel.

    Returns:
        True only when ffmpeg exits with status 0 and ``output_path`` exists
        and is non-empty.
    """
    ffmpeg_args = ["ffmpeg", "-i", audio_path]
    ffmpeg_args += ["-ss", str(start), "-t", str(duration)]
    ffmpeg_args += ["-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1"]
    ffmpeg_args += ["-y", output_path]

    completed = subprocess.run(ffmpeg_args, capture_output=True)
    if completed.returncode != 0:
        return False
    if not os.path.exists(output_path):
        return False
    return os.path.getsize(output_path) > 0
|
||||
|
||||
|
||||
def monitor_resources(pid: int, interval: float = 0.1) -> Dict[str, Any]:
    """Sample CPU and resident-memory usage of process ``pid`` through psutil.

    Args:
        pid: Process id to inspect.
        interval: Passed to ``psutil.Process.cpu_percent`` as its ``interval``.

    Returns:
        A dict with ``cpu_percent``, ``memory_mb`` (RSS in MiB), ``available``
        and ``pid``. When psutil is not importable, or the process cannot be
        inspected, ``available`` is False, both readings are 0.0 and ``pid``
        is omitted.
    """
    unavailable = {"cpu_percent": 0.0, "memory_mb": 0.0, "available": False}
    if psutil is None or not PSUTIL_AVAILABLE:
        return unavailable

    try:
        proc = psutil.Process(pid)
        cpu = proc.cpu_percent(interval=interval)
        rss_bytes = proc.memory_info().rss
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return unavailable

    return {
        "cpu_percent": cpu,
        "memory_mb": rss_bytes / (1024 * 1024),
        "available": True,
        "pid": pid,
    }
|
||||
|
||||
|
||||
def transcribe_direct(
    model, audio_path: str, publisher: Optional[RedisPublisher] = None
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe the whole of ``audio_path`` in a single ``model.transcribe`` call.

    Args:
        model: Object whose ``transcribe(path, beam_size=...)`` returns a
            ``(segments, info)`` pair (presumably a faster_whisper
            ``WhisperModel`` - confirm against the caller).
        audio_path: Audio file to transcribe.
        publisher: Optional progress reporter; a progress message is sent
            every 100 segments and a summary when finished.

    Returns:
        ``(segments, info)`` where ``segments`` is a list of dicts with
        ``start``, ``end`` and stripped ``text``, and ``info`` is passed
        through unchanged from the model.
    """
    if publisher:
        publisher.info("asr", "Transcribing audio directly...")

    began = time.time()
    segment_iter, info = model.transcribe(audio_path, beam_size=5)

    transcript = []
    for count, seg in enumerate(segment_iter, start=1):
        transcript.append(
            {"start": seg.start, "end": seg.end, "text": seg.text.strip()}
        )
        if count % 100 == 0 and publisher:
            publisher.progress("asr", count, 0, f"Segment {count}")

    took = time.time() - began
    if publisher:
        publisher.info(
            "asr", f"Direct transcription: {len(transcript)} segments in {took:.1f}s"
        )

    return transcript, info
|
||||
|
||||
|
||||
def transcribe_chunk(
    model,
    chunk_path: str,
    chunk_start: float,
    chunk_idx: int,
    total_chunks: int,
    publisher: Optional[RedisPublisher] = None,
) -> Tuple[List[Dict[str, Any]], Any]:
    """Transcribe one chunk file and shift its timestamps by ``chunk_start``.

    Args:
        model: Object whose ``transcribe(path, beam_size=...)`` returns a
            ``(segments, info)`` pair.
        chunk_path: Audio file holding this chunk.
        chunk_start: Offset in seconds added to every segment's start and end.
        chunk_idx: Zero-based chunk index, used only in progress messages.
        total_chunks: Total number of chunks, used only in progress messages.
        publisher: Optional progress reporter.

    Returns:
        ``(segments, info)`` where ``segments`` is a list of dicts with the
        shifted ``start`` / ``end`` and stripped ``text``.
    """
    if publisher:
        publisher.info("asr", f"Transcribing chunk {chunk_idx + 1}/{total_chunks}")

    began = time.time()
    segment_iter, info = model.transcribe(chunk_path, beam_size=5)

    shifted = [
        {
            "start": seg.start + chunk_start,
            "end": seg.end + chunk_start,
            "text": seg.text.strip(),
        }
        for seg in segment_iter
    ]

    took = time.time() - began
    if publisher:
        publisher.info(
            "asr",
            f"Chunk {chunk_idx + 1}/{total_chunks}: {len(shifted)} segments in {took:.1f}s",
        )

    return shifted, info
|
||||
|
||||
|
||||
def run_asr(
|
||||
video_path: str,
|
||||
output_path: str,
|
||||
uuid: str = "",
|
||||
chunk_duration: int = 600, # 10 minutes default
|
||||
max_direct_duration: int = 1200, # 20 minutes: use direct transcription for shorter files (safe limit)
|
||||
model_size: str = "tiny",
|
||||
compute_type: str = "int8",
|
||||
monitor_interval: int = 60,
|
||||
) -> None:
|
||||
# Set up signal handlers
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
|
||||
publisher = RedisPublisher(uuid) if uuid else None
|
||||
if publisher:
|
||||
publisher.info("asr", "ASR_START")
|
||||
|
||||
# Check for audio stream
|
||||
if not has_audio_stream(video_path):
|
||||
if publisher:
|
||||
publisher.info("asr", "No audio stream detected, skipping transcription")
|
||||
output = {
|
||||
"processor_name": "asr",
|
||||
"processor_version": "2.0.0",
|
||||
"contract_version": "1.0",
|
||||
"language": None,
|
||||
"language_probability": None,
|
||||
"segments": [],
|
||||
}
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(output, f, indent=2)
|
||||
if publisher:
|
||||
publisher.complete("asr", "0 segments (no audio)")
|
||||
sys.stderr.write("ASR: No audio stream, skipping transcription\n")
|
||||
sys.stderr.flush()
|
||||
sys.exit(0)
|
||||
|
||||
# Create temporary directory
|
||||
temp_dir = tempfile.mkdtemp(prefix="asr_")
|
||||
audio_path = os.path.join(temp_dir, "audio.wav")
|
||||
|
||||
if publisher:
|
||||
publisher.info("asr", "Extracting audio from video...")
|
||||
|
||||
# Extract audio
|
||||
if not extract_audio(video_path, audio_path):
|
||||
if publisher:
|
||||
publisher.error("asr", "Failed to extract audio")
|
||||
sys.stderr.write("ASR: Failed to extract audio\n")
|
||||
sys.stderr.flush()
|
||||
# Clean up
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
sys.exit(1)
|
||||
|
||||
# Get audio duration
|
||||
try:
|
||||
total_duration = get_media_duration(audio_path)
|
||||
except Exception as e:
|
||||
if publisher:
|
||||
publisher.error("asr", f"Failed to get audio duration: {e}")
|
||||
sys.stderr.write(f"ASR: Failed to get audio duration: {e}\n")
|
||||
sys.stderr.flush()
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
sys.exit(1)
|
||||
|
||||
if publisher:
|
||||
publisher.info(
|
||||
"asr",
|
||||
f"Audio duration: {total_duration:.1f}s ({total_duration / 3600:.1f} hrs)",
|
||||
)
|
||||
|
||||
# Load Whisper model
|
||||
if publisher:
|
||||
publisher.info(
|
||||
"asr", f"Loading Whisper model ({model_size}, {compute_type})..."
|
||||
)
|
||||
|
||||
try:
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
model = WhisperModel(model_size, device="cpu", compute_type=compute_type)
|
||||
except Exception as e:
|
||||
if publisher:
|
||||
publisher.error("asr", f"Failed to load Whisper model: {e}")
|
||||
sys.stderr.write(f"ASR: Failed to load Whisper model: {e}\n")
|
||||
sys.stderr.flush()
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
sys.exit(1)
|
||||
|
||||
if publisher:
|
||||
publisher.info("asr", "Whisper model loaded successfully")
|
||||
|
||||
# Decide whether to use chunked or direct transcription
|
||||
use_chunked = total_duration > max_direct_duration
|
||||
|
||||
all_segments = []
|
||||
language = None
|
||||
language_prob = None
|
||||
chunks = [] # Initialize chunks variable
|
||||
|
||||
if not use_chunked:
|
||||
# Direct transcription for shorter audio
|
||||
if publisher:
|
||||
publisher.info(
|
||||
"asr", f"Using direct transcription (duration ≤ {max_direct_duration}s)"
|
||||
)
|
||||
|
||||
try:
|
||||
segments, info = transcribe_direct(model, audio_path, publisher)
|
||||
all_segments.extend(segments)
|
||||
language = info.language
|
||||
language_prob = info.language_probability
|
||||
except Exception as e:
|
||||
if publisher:
|
||||
publisher.error("asr", f"Direct transcription failed: {e}")
|
||||
sys.stderr.write(f"ASR: Direct transcription failed: {e}\n")
|
||||
sys.stderr.flush()
|
||||
# Fall back to chunked approach
|
||||
use_chunked = True
|
||||
if publisher:
|
||||
publisher.info("asr", "Falling back to chunked transcription")
|
||||
|
||||
if use_chunked:
|
||||
# Chunked transcription for long audio
|
||||
if publisher:
|
||||
publisher.info(
|
||||
"asr", f"Using chunked transcription ({chunk_duration}s chunks)"
|
||||
)
|
||||
|
||||
# Calculate chunks
|
||||
chunks = []
|
||||
start = 0.0
|
||||
chunk_idx = 0
|
||||
while start < total_duration:
|
||||
chunk_end = min(start + chunk_duration, total_duration)
|
||||
chunks.append(
|
||||
{
|
||||
"start": start,
|
||||
"end": chunk_end,
|
||||
"duration": chunk_end - start,
|
||||
"idx": chunk_idx,
|
||||
}
|
||||
)
|
||||
start = chunk_end
|
||||
chunk_idx += 1
|
||||
|
||||
if publisher:
|
||||
publisher.info("asr", f"Split into {len(chunks)} chunks")
|
||||
|
||||
chunk_temp_dir = os.path.join(temp_dir, "chunks")
|
||||
os.makedirs(chunk_temp_dir, exist_ok=True)
|
||||
|
||||
last_resource_report = time.time()
|
||||
|
||||
for i, chunk in enumerate(chunks):
|
||||
chunk_path = os.path.join(chunk_temp_dir, f"chunk_{i:04d}.wav")
|
||||
|
||||
if publisher and os.environ.get("MOMENTRY_DISABLE_REDIS") != "1":
|
||||
publisher.progress(
|
||||
"asr", i, len(chunks), f"Processing chunk {i + 1}/{len(chunks)}"
|
||||
)
|
||||
|
||||
# Extract chunk
|
||||
if not extract_chunk(
|
||||
audio_path, chunk["start"], chunk["duration"], chunk_path
|
||||
):
|
||||
if publisher:
|
||||
publisher.warning("asr", f"Failed to extract chunk {i}, skipping")
|
||||
continue
|
||||
|
||||
# Resource monitoring (sample every monitor_interval seconds)
|
||||
current_time = time.time()
|
||||
if (
|
||||
PSUTIL_AVAILABLE
|
||||
and publisher
|
||||
and (current_time - last_resource_report) >= monitor_interval
|
||||
):
|
||||
resources = monitor_resources(os.getpid())
|
||||
if resources["available"]:
|
||||
publisher.info(
|
||||
"asr",
|
||||
f"Resource usage: CPU {resources['cpu_percent']:.1f}%, "
|
||||
f"Memory {resources['memory_mb']:.1f}MB",
|
||||
)
|
||||
last_resource_report = current_time
|
||||
|
||||
# Transcribe chunk with retry logic
|
||||
max_retries = 3
|
||||
transcribed = False
|
||||
last_error = None
|
||||
|
||||
for retry in range(max_retries):
|
||||
try:
|
||||
segments, info = transcribe_chunk(
|
||||
model, chunk_path, chunk["start"], i, len(chunks), publisher
|
||||
)
|
||||
all_segments.extend(segments)
|
||||
|
||||
if language is None:
|
||||
language = info.language
|
||||
language_prob = info.language_probability
|
||||
if publisher:
|
||||
publisher.info(
|
||||
"asr",
|
||||
f"Detected language: {language} (prob {language_prob:.2f})",
|
||||
)
|
||||
|
||||
transcribed = True
|
||||
break # Success, exit retry loop
|
||||
|
||||
except Exception as e:
|
||||
last_error = e
|
||||
if publisher:
|
||||
publisher.warning(
|
||||
"asr",
|
||||
f"Error transcribing chunk {i} (attempt {retry + 1}/{max_retries}): {e}",
|
||||
)
|
||||
sys.stderr.write(
|
||||
f"ASR: Error transcribing chunk {i} (attempt {retry + 1}/{max_retries}): {e}\n"
|
||||
)
|
||||
sys.stderr.flush()
|
||||
|
||||
if retry < max_retries - 1:
|
||||
# Wait before retry (exponential backoff)
|
||||
wait_time = 2**retry # 1, 2, 4 seconds
|
||||
if publisher:
|
||||
publisher.info("asr", f"Retrying in {wait_time}s...")
|
||||
time.sleep(wait_time)
|
||||
else:
|
||||
# Final attempt failed
|
||||
if publisher:
|
||||
publisher.error(
|
||||
"asr",
|
||||
f"Failed to transcribe chunk {i} after {max_retries} attempts: {last_error}",
|
||||
)
|
||||
sys.stderr.write(
|
||||
f"ASR: Failed to transcribe chunk {i} after {max_retries} attempts: {last_error}\n"
|
||||
)
|
||||
sys.stderr.flush()
|
||||
# Continue with next chunk (skip this one)
|
||||
|
||||
# Clean up chunk file
|
||||
try:
|
||||
os.unlink(chunk_path)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Clean up temporary directory
|
||||
try:
|
||||
shutil.rmtree(temp_dir, ignore_errors=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Sort segments by start time
|
||||
all_segments.sort(key=lambda x: x["start"])
|
||||
|
||||
# Prepare output (maintain same format as original)
|
||||
output = {
|
||||
"processor_name": "asr",
|
||||
"processor_version": "2.0.0",
|
||||
"contract_version": "1.0",
|
||||
"language": language if language is not None else None,
|
||||
"language_probability": language_prob if language_prob is not None else None,
|
||||
"segments": all_segments,
|
||||
}
|
||||
|
||||
# Add metadata for chunked processing (optional)
|
||||
if use_chunked:
|
||||
output["processing_mode"] = "chunked"
|
||||
output["chunk_count"] = len(chunks) if "chunks" in locals() else 0
|
||||
output["chunk_duration"] = chunk_duration
|
||||
else:
|
||||
output["processing_mode"] = "direct"
|
||||
|
||||
# Write output
|
||||
with open(output_path, "w") as f:
|
||||
json.dump(output, f, indent=2)
|
||||
|
||||
if publisher:
|
||||
publisher.complete(
|
||||
"asr",
|
||||
f"{len(all_segments)} segments ({'chunked' if use_chunked else 'direct'} mode)",
|
||||
)
|
||||
|
||||
sys.stderr.write(
|
||||
f"ASR: Transcription complete, {len(all_segments)} segments written to {output_path}\n"
|
||||
)
|
||||
sys.stderr.flush()
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse arguments, optionally run the health
    # check, apply environment overrides, then hand off to run_asr().
    cli = argparse.ArgumentParser(
        description="ASR Transcription with chunked processing"
    )
    cli.add_argument("video_path", nargs="?", help="Path to video file")
    cli.add_argument("output_path", nargs="?", help="Output JSON path")
    cli.add_argument("--uuid", "-u", help="UUID for Redis progress", default="")
    cli.add_argument("--version", action="version", version="2.0.0")
    cli.add_argument(
        "--check-health", action="store_true", help="Check dependencies and exit"
    )

    # Tuning options, hidden from --help. Four of them can also be supplied
    # through MOMENTRY_ASR_* environment variables (resolved further down).
    hidden_options = (
        ("--chunk-duration", {"type": int, "default": 600}),  # 10 minutes default
        # 20 minutes (safe limit based on testing)
        ("--max-direct-duration", {"type": int, "default": 1200}),
        ("--model-size", {"default": "tiny"}),
        ("--compute-type", {"default": "int8"}),
        ("--monitor-interval", {"type": int, "default": 60}),
    )
    for flag, spec in hidden_options:
        cli.add_argument(flag, help=argparse.SUPPRESS, **spec)

    args = cli.parse_args()

    # Health-check mode needs no positional arguments and exits immediately.
    if args.check_health:
        health = check_health()
        print(json.dumps(health, indent=2))
        sys.exit(0 if health["status"] == "healthy" else 1)

    # The positionals are declared optional only so --check-health can run
    # without them; a real transcription run requires both.
    if args.video_path is None or args.output_path is None:
        cli.error(
            "video_path and output_path are required when not using --check-health"
        )

    # An environment variable, when set, takes precedence over the CLI value.
    env = os.environ
    chunk_duration = int(
        env.get("MOMENTRY_ASR_CHUNK_DURATION", args.chunk_duration)
    )
    max_direct_duration = int(
        env.get("MOMENTRY_ASR_MAX_DIRECT_DURATION", args.max_direct_duration)
    )
    model_size = env.get("MOMENTRY_ASR_MODEL_SIZE", args.model_size)
    compute_type = env.get("MOMENTRY_ASR_COMPUTE_TYPE", args.compute_type)

    # NOTE(review): args.monitor_interval is parsed above but is not passed
    # on in this call — confirm whether run_asr() should receive it.
    run_asr(
        args.video_path,
        args.output_path,
        args.uuid,
        chunk_duration,
        max_direct_duration,
        model_size,
        compute_type,
    )
|
||||
113
scripts/import_file_package.py
Normal file
113
scripts/import_file_package.py
Normal file
@@ -0,0 +1,113 @@
|
||||
#!/opt/homebrew/bin/python3.11
|
||||
"""
|
||||
Import a File Content Package into the DB.
|
||||
Usage: python3 import_file_package.py --uuid <file_uuid> --package <path>
|
||||
"""
|
||||
import json, os, sys, subprocess, argparse, csv
|
||||
|
||||
PG_BIN = "/Users/accusys/pgsql/18.3/bin"
|
||||
DB_USER = "accusys"
|
||||
DB_NAME = "momentry"
|
||||
|
||||
def psql(sql, timeout=30):
    """Run one SQL command string through the psql CLI and report the outcome.

    Args:
        sql: Text handed to ``psql -c``.
        timeout: Seconds to wait for psql before giving up. Defaults to 30,
            the value that used to be hard-coded.

    Returns:
        True when psql exits with status 0; False otherwise, including when
        psql cannot be started or does not finish within ``timeout``.
    """
    cmd = [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME, "-c", sql]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        # Report a slow statement like any other failure instead of
        # aborting the caller with a traceback.
        print(f" ERROR: psql timed out after {timeout}s")
        return False
    except OSError as e:
        # Raised when the psql binary is missing or not executable.
        print(f" ERROR: cannot run psql: {e}")
        return False
    if r.returncode != 0:
        print(f" ERROR: {r.stderr[:200]}")
        return False
    print(f" OK: {r.stdout.strip()[:100]}")
    return True
|
||||
|
||||
def _sql_literal(value):
    """Return *value* as a single-quoted SQL string literal (quotes doubled)."""
    return "'" + str(value).replace("'", "''") + "'"


def _staged_import_sql(table, columns, csv_path, conflict=None, fixed=None):
    """Build SQL that loads a CSV through a temp table and inserts into *table*.

    PostgreSQL's COPY has no ON CONFLICT clause, so the CSV is copied into a
    temporary staging table first and then moved with INSERT ... SELECT,
    which does accept ON CONFLICT.

    Args:
        table: Qualified target table name.
        columns: Column names, in the order they appear in the CSV.
        csv_path: Path of the CSV file as seen by the database server.
        conflict: Optional conflict target; rows hitting it are skipped.
        fixed: Optional mapping of column name to a constant value inserted
            for every row (for key columns the CSV does not carry).

    Returns:
        One string with three ';'-separated statements, meant for a single
        ``psql -c`` call so the temp table lives for the whole sequence.
    """
    fixed = fixed or {}
    cols = ", ".join(columns)
    insert_cols = ", ".join([*fixed, *columns])
    select_exprs = ", ".join(
        [*(_sql_literal(v) for v in fixed.values()), *columns]
    )
    sql = (
        f"CREATE TEMP TABLE _pkg_stage AS SELECT {cols} FROM {table} WITH NO DATA; "
        f"COPY _pkg_stage ({cols}) FROM {_sql_literal(csv_path)} WITH CSV HEADER; "
        f"INSERT INTO {table} ({insert_cols}) SELECT {select_exprs} FROM _pkg_stage"
    )
    if conflict:
        sql += f" ON CONFLICT ({conflict}) DO NOTHING"
    return sql


def main():
    """Import one exported file package (metadata.json + CSVs) into schema dev.

    Command line: ``--uuid <file_uuid> --package <dir>``. Each step is skipped
    when its input file is absent. Exits with status 1 when the arguments are
    unusable or when any executed step fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--uuid", required=True)
    parser.add_argument("--package", required=True)
    args = parser.parse_args()

    uuid = args.uuid
    pkg = args.package.rstrip("/")

    # The uuid is spliced into SQL text below, so reject anything that is
    # not plain ASCII letters, digits, '-' or '_'.
    if not (uuid.isascii() and uuid.replace("-", "").replace("_", "").isalnum()):
        print(f"Invalid uuid: {uuid!r}")
        sys.exit(1)

    if not os.path.exists(pkg):
        print(f"Package not found: {pkg}")
        sys.exit(1)

    # COPY ... FROM '<file>' is read by the server process, which resolves a
    # relative path against its own working directory — always pass absolute.
    pkg_abs = os.path.abspath(pkg)
    failed = []

    def run(step, sql):
        """Execute *sql* via psql() and remember the step name on failure."""
        if not psql(sql):
            failed.append(step)

    def count(sql):
        """Return the stripped stdout of a tuples-only psql query."""
        r = subprocess.run(
            [f"{PG_BIN}/psql", "-U", DB_USER, "-d", DB_NAME, "-t", "-A", "-c", sql],
            capture_output=True, text=True, timeout=10)
        return r.stdout.strip()

    print(f"Importing package {uuid} from {pkg}")
    print()

    # 1. Update video registration (mark as processed)
    print("[1/5] Update video registration...")
    meta_path = os.path.join(pkg_abs, "metadata.json")
    if os.path.exists(meta_path):
        with open(meta_path) as f:
            meta = json.load(f)
        try:
            # Coerce to float so only numbers can reach the SQL text; a JSON
            # null falls back to the same defaults as a missing key.
            fps = float(meta.get("fps") or 24.0)
            dur = float(meta.get("duration") or 0)
        except (TypeError, ValueError) as e:
            print(f" ERROR: bad fps/duration in metadata.json: {e}")
            failed.append("video registration")
        else:
            run(
                "video registration",
                f"UPDATE dev.videos SET status='ready', duration={dur}, fps={fps} "
                f"WHERE file_uuid={_sql_literal(uuid)}",
            )

    # 2. Import identities
    print("[2/5] Import identities...")
    id_path = os.path.join(pkg_abs, "identities.csv")
    if os.path.exists(id_path):
        with open(id_path, newline="") as f:
            row_count = sum(1 for _ in csv.DictReader(f))
        if row_count > 0:
            run("identities", _staged_import_sql(
                "dev.identities",
                ["uuid", "name", "identity_type", "source", "status", "metadata"],
                id_path,
                conflict="name",
            ))

    # 3. Import face detections
    print("[3/5] Import face detections...")
    fd_path = os.path.join(pkg_abs, "face_detections.csv")
    if os.path.exists(fd_path):
        run("face detections", _staged_import_sql(
            "dev.face_detections",
            ["id", "file_uuid", "frame_number", "timestamp_secs", "face_id",
             "x", "y", "width", "height", "confidence", "trace_id", "identity_id"],
            fd_path,
            conflict="id",
        ))

    # 4. Import chunks
    # NOTE(review): chunks.csv has no file_uuid column, yet the conflict
    # target and the verification query below key on it, so it is filled
    # from --uuid here — confirm this matches the exporter's intent.
    print("[4/5] Import chunks...")
    ch_path = os.path.join(pkg_abs, "chunks.csv")
    if os.path.exists(ch_path):
        run("chunks", _staged_import_sql(
            "dev.chunk",
            ["chunk_id", "chunk_type", "start_frame", "end_frame",
             "start_time", "end_time", "fps", "text_content"],
            ch_path,
            conflict="file_uuid, chunk_id",
            fixed={"file_uuid": uuid},
        ))

    # 5. Import vectors
    # NOTE(review): same situation as chunks — the CSV carries only
    # (chunk_id, embedding) while the verification query filters on uuid.
    print("[5/5] Import chunk_vectors...")
    vec_path = os.path.join(pkg_abs, "chunk_vectors.csv")
    if os.path.exists(vec_path):
        run("chunk_vectors", _staged_import_sql(
            "dev.chunk_vectors",
            ["chunk_id", "embedding"],
            vec_path,
            fixed={"uuid": uuid},
        ))

    # Verify
    print()
    print("=== Verification ===")
    chunk_total = count(
        f"SELECT count(*) FROM dev.chunk WHERE file_uuid={_sql_literal(uuid)}")
    print(f" Chunks: {chunk_total}")

    vector_total = count(
        f"SELECT count(*) FROM dev.chunk_vectors WHERE uuid={_sql_literal(uuid)}")
    print(f" Vectors: {vector_total}")

    print()
    if failed:
        # Surface partial imports instead of reporting success.
        print(f"=== Done with errors: {', '.join(failed)} ===")
        sys.exit(1)
    print("=== Done ===")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
225
scripts/package_delivery.sh
Normal file
225
scripts/package_delivery.sh
Normal file
@@ -0,0 +1,225 @@
|
||||
#!/bin/bash
|
||||
# Package Delivery — 依 M4 spec 三層打包
|
||||
# Usage:
|
||||
# bash package_delivery.sh dev <version>
|
||||
# bash package_delivery.sh prod <version>
|
||||
# bash package_delivery.sh file <file_uuid> <version>
|
||||
set -euo pipefail
|
||||
|
||||
CMD="${1:?Usage: $0 <dev|prod|file> <arg> [version]}"
|
||||
PROJECT="/Users/accusys/momentry_core_0.1"
|
||||
PG_BIN="/Users/accusys/pgsql/18.3/bin"
|
||||
T0=$(date +%s)
|
||||
|
||||
case "$CMD" in
|
||||
dev)
  # Dev upgrade bundle: migration SQL, debug binary, source archive and an
  # UPGRADE.md, packed into release/dev_upgrade_<version>.tar.gz.
  VERSION="${2:?Usage: $0 dev <version>}"
  OUTPUT="$PROJECT/release/dev_upgrade_${VERSION}"
  mkdir -p "$OUTPUT/schema" "$OUTPUT/bin" "$OUTPUT/src"

  echo "=== Dev Upgrade Package ${VERSION} ==="

  # Migration SQL (auto-generated from current schema)
  echo "[1/4] Migration SQL..."
  {
    printf '%s\n' \
      "-- Migration: dev schema ${VERSION}" \
      "-- Date: $(date +%Y-%m-%d)" \
      "-- Auto-generated from current DB schema" \
      "" \
      "SET search_path TO dev;" \
      ""
    # face_detections: timestamp_secs was added later
    "$PG_BIN/psql" -U accusys -d momentry -t -A -c "
      SELECT 'ALTER TABLE dev.face_detections ADD COLUMN IF NOT EXISTS ' || column_name || ' ' || udt_name || ';'
      FROM information_schema.columns
      WHERE table_schema='dev' AND table_name='face_detections'
      AND column_name = 'timestamp_secs';
    " 2>/dev/null
    # chunk: chunk_id was changed to short format
    printf '%s\n' \
      "" \
      "-- Chunk index removals (already applied)" \
      "-- chunk_vectors: standard columns"
  } > "$OUTPUT/schema/migration_${VERSION}.sql"
  echo " migration_${VERSION}.sql"

  # Binary (a missing build only produces a warning)
  echo "[2/4] Binary..."
  debug_bin="$PROJECT/target/debug/momentry_playground"
  if [[ -f "$debug_bin" ]]; then
    cp "$debug_bin" "$OUTPUT/bin/"
    echo " momentry_playground ($(ls -lh "$OUTPUT/bin/momentry_playground" | awk '{print $5}'))"
  else
    echo " WARN: binary not found at $debug_bin (build first: cargo build --bin momentry_playground)"
  fi

  # Source (optional)
  echo "[3/4] Source code..."
  cd "$PROJECT"
  src_archive="$OUTPUT/src/momentry_core_${VERSION}_source.tar.gz"
  git archive --format=tar.gz -o "$src_archive" HEAD 2>/dev/null
  echo " source.tar.gz ($(ls -lh "$src_archive" | awk '{print $5}'))"

  # UPGRADE.md — the delimiter is unquoted, so ${VERSION} and ${PROJECT} are
  # expanded now; the backticks are escaped to stay literal.
  echo "[4/4] UPGRADE.md..."
  cat > "$OUTPUT/UPGRADE.md" << UPEOF
# Dev Upgrade ${VERSION}

## Steps

\`\`\`bash
# 1. Apply migration
psql -U accusys -d momentry < schema/migration_${VERSION}.sql

# 2. Replace binary
cp bin/momentry_playground ${PROJECT}/target/debug/

# 3. Restart
pkill momentry_playground
DATABASE_SCHEMA=dev ${PROJECT}/target/debug/momentry_playground server --port 3003
\`\`\`
UPEOF

  # Package
  cd "$PROJECT/release"
  tarball="dev_upgrade_${VERSION}.tar.gz"
  tar czf "$tarball" -C "$(dirname "$OUTPUT")" "$(basename "$OUTPUT")"
  echo
  echo "=== Package: release/${tarball} ($(ls -lh "$tarball" | awk '{print $5}')) ==="
  ;;
|
||||
|
||||
prod)
  # Prod upgrade bundle: migration SQL template, release binary and an
  # UPGRADE.md, packed into release/prod_upgrade_<version>.tar.gz.
  VERSION="${2:?Usage: $0 prod <version>}"
  OUTPUT="$PROJECT/release/prod_upgrade_${VERSION}"
  mkdir -p "$OUTPUT/schema" "$OUTPUT/bin"

  echo "=== Prod Upgrade Package ${VERSION} ==="

  # The migration file is a template: only the header and search_path are
  # generated, the ALTER statement is left as a commented placeholder.
  echo "[1/3] Migration SQL..."
  cat > "$OUTPUT/schema/migration_${VERSION}.sql" << SQLEOF
-- Migration: public schema ${VERSION}
-- Date: $(date +%Y-%m-%d)
SET search_path TO public;

-- ALTER TABLE public.videos ADD COLUMN IF NOT EXISTS ...;
SQLEOF
  echo " migration_${VERSION}.sql"

  # Binary (a missing build only produces a warning)
  echo "[2/3] Binary..."
  release_bin="$PROJECT/target/release/momentry"
  if [[ -f "$release_bin" ]]; then
    cp "$release_bin" "$OUTPUT/bin/"
    echo " momentry ($(ls -lh "$OUTPUT/bin/momentry" | awk '{print $5}'))"
  else
    echo " WARN: release binary not found (build: cargo build --release --bin momentry)"
  fi

  # UPGRADE.md — unquoted delimiter: ${VERSION}/${PROJECT} expand now, the
  # escaped backticks stay literal.
  echo "[3/3] UPGRADE.md..."
  cat > "$OUTPUT/UPGRADE.md" << UPEOF
# Prod Upgrade ${VERSION}

## Pre-flight
\`\`\`bash
pg_dump -U accusys -d momentry --schema=public > backup_${VERSION}_pre.sql
\`\`\`

## Steps
\`\`\`bash
# 1. Migration
psql -U accusys -d momentry < schema/migration_${VERSION}.sql

# 2. Replace binary
cp bin/momentry ${PROJECT}/target/release/

# 3. Restart
sudo launchctl stop com.momentry.api
sudo launchctl start com.momentry.api
\`\`\`

## Rollback
\`\`\`bash
psql -U accusys -d momentry < backup_${VERSION}_pre.sql
\`\`\`
UPEOF

  # Package
  cd "$PROJECT/release"
  tarball="prod_upgrade_${VERSION}.tar.gz"
  tar czf "$tarball" -C "$(dirname "$OUTPUT")" "$(basename "$OUTPUT")"
  echo
  echo "=== Package: release/${tarball} ($(ls -lh "$tarball" | awk '{print $5}')) ==="
  ;;
|
||||
|
||||
file)
  # Single-file bundle: file_info.json, data.sql, checksums and a README,
  # packed into release/file_<uuid>_<version>.tar.gz.
  UUID="${2:?Usage: $0 file <uuid> [version]}"
  VERSION="${3:-v1.0}"

  # UUID ends up inside SQL text and file names, so allow only characters
  # that cannot break quoting or path handling.
  case "$UUID" in
    *[!A-Za-z0-9_-]*)
      echo "ERROR: invalid file uuid: $UUID" >&2
      exit 1
      ;;
  esac

  OUTPUT="$PROJECT/release/file_${UUID}_${VERSION}"
  mkdir -p "$OUTPUT"

  echo "=== File Package ${UUID} ${VERSION} ==="

  # file_info.json
  echo "[1/5] file_info.json..."
  FILE_INFO=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "
    SELECT json_build_object(
      'file_uuid', file_uuid,
      'file_name', file_name,
      'duration', duration,
      'fps', fps,
      'width', width,
      'height', height,
      'total_frames', total_frames,
      'status', status
    ) FROM dev.videos WHERE file_uuid='$UUID';
  ") || { echo "ERROR: psql query for $UUID failed" >&2; exit 1; }
  # An unknown UUID yields empty output; stop with a clear message instead of
  # a JSON parse traceback.
  if [ -z "$FILE_INFO" ]; then
    echo "ERROR: no dev.videos row for file_uuid=$UUID" >&2
    rmdir "$OUTPUT" 2>/dev/null || true
    exit 1
  fi
  # The target path is passed as an argument rather than pasted into the
  # Python source, so unusual characters in it cannot break the one-liner.
  printf '%s\n' "$FILE_INFO" \
    | python3 -c 'import json,sys;json.dump(json.load(sys.stdin),open(sys.argv[1],"w"),indent=2)' "$OUTPUT/file_info.json"

  # data.sql (using Python export script)
  echo "[2/5] data.sql..."
  python3 "$PROJECT/scripts/export_file_package.py" "$UUID" "$OUTPUT" 2>&1

  # checksums.md5 — use md5sum when installed, otherwise md5
  echo "[3/5] checksums.md5..."
  cd "$OUTPUT"
  if command -v md5sum >/dev/null 2>&1; then
    md5sum data.sql file_info.json > checksums.md5
  else
    md5 data.sql file_info.json > checksums.md5
  fi
  echo " checksums.md5"

  # README.md — unquoted delimiter: ${UUID} expands now, \${UUID} stays
  # literal so the README's own commands read the exported variable.
  echo "[4/5] README.md..."
  cat > "$OUTPUT/README.md" << READMEEOF
# File Package: ${UUID}

## Import
\`\`\`bash
export UUID=${UUID}

# 1. Clean existing data for this file
psql -U accusys -d momentry << SQL
DELETE FROM dev.chunk_vectors WHERE uuid = '\${UUID}'::text;
DELETE FROM dev.chunk WHERE file_uuid = '\${UUID}';
DELETE FROM dev.face_detections WHERE file_uuid = '\${UUID}';
DELETE FROM dev.videos WHERE file_uuid = '\${UUID}';
SQL

# 2. Restore
psql -U accusys -d momentry < data.sql

# 3. Verify
psql -U accusys -d momentry -c "SELECT file_uuid, status FROM dev.videos WHERE file_uuid = '\${UUID}'"
\`\`\`
## Verification
- \`GET /api/v1/file/{uuid}\` → 200, status=ready
- \`GET /api/v1/file/{uuid}/chunk/0-01\` → 200
- \`POST /api/v1/search/universal\` → results
READMEEOF

  # Package, then remove the staging directory (only the tarball is kept)
  echo "[5/5] Packaging tar.gz..."
  cd "$PROJECT/release"
  tar czf "file_${UUID}_${VERSION}.tar.gz" -C "$(dirname "$OUTPUT")" "$(basename "$OUTPUT")" 2>/dev/null
  echo " release/file_${UUID}_${VERSION}.tar.gz ($(ls -lh "file_${UUID}_${VERSION}.tar.gz" | awk '{print $5}'))"
  rm -rf "$OUTPUT"
  ;;
|
||||
|
||||
*)
|
||||
echo "Usage: $0 <dev|prod|file> <arg> [version]"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
ELAPSED=$(($(date +%s) - T0))
|
||||
echo " Elapsed: ${ELAPSED}s"
|
||||
119
scripts/package_file.sh
Normal file
119
scripts/package_file.sh
Normal file
@@ -0,0 +1,119 @@
|
||||
#!/bin/bash
# Package File Content — single video's complete data
# Usage: bash package_file.sh <file_uuid> [version]
#
# Writes metadata.json, processor outputs, CSV exports and RELEASE_INFO.txt
# to release/files/<uuid>/<version> and points the "latest" symlink at it.
set -euo pipefail
UUID="${1:?Usage: $0 <file_uuid> [version]}"
VERSION="${2:-v1.0.0}"
PROJECT="/Users/accusys/momentry_core_0.1"
OUTPUT="$PROJECT/release/files/$UUID/$VERSION"
OUTPUT_DEV="/Users/accusys/momentry/output_dev"
PG_BIN="/Users/accusys/pgsql/18.3/bin"
T0=$(date +%s)
WARNINGS=0

# UUID is embedded in SQL text and file paths below; allow only characters
# that cannot break quoting or path handling.
case "$UUID" in
  *[!A-Za-z0-9_-]*)
    echo "ERROR: invalid file uuid: $UUID" >&2
    exit 1
    ;;
esac

# export_csv <file-name> <select-query>
# Runs a server-side COPY of the query result into $OUTPUT/<file-name>.
# A failed export is reported and counted instead of being silently skipped;
# the script still continues, so the package stays best-effort.
export_csv() {
  local name="$1" query="$2"
  local target="$OUTPUT/$name"
  if "$PG_BIN/psql" -U accusys -d momentry -c "COPY ($query) TO '$target' WITH CSV HEADER;"; then
    echo " $(wc -l < "$target") rows"
  else
    echo " WARN: export of $name failed" >&2
    WARNINGS=$((WARNINGS + 1))
  fi
}

mkdir -p "$OUTPUT/processors"

echo "=== File Package ${UUID} ${VERSION} ==="

# 1. metadata
echo "[1/8] metadata.json..."
META_JSON=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "
  SELECT json_build_object(
    'file_uuid', file_uuid,
    'file_name', file_name,
    'file_path', file_path,
    'file_type', file_type,
    'duration', duration,
    'width', width,
    'height', height,
    'fps', fps,
    'status', status,
    'total_frames', total_frames,
    'registration_time', registration_time::text
  ) FROM dev.videos WHERE file_uuid='$UUID';
" 2>/dev/null) || META_JSON=""
# The output path is passed as an argument rather than pasted into the
# Python source; the size is printed only when the file was really written.
if [ -n "$META_JSON" ] && printf '%s\n' "$META_JSON" \
    | python3 -c 'import json,sys;d=json.load(sys.stdin);json.dump(d,open(sys.argv[1],"w"),indent=2)' "$OUTPUT/metadata.json" 2>/dev/null; then
  echo " $(ls -lh "$OUTPUT/metadata.json" | awk '{print $5}')"
else
  echo " WARN: no metadata"
  WARNINGS=$((WARNINGS + 1))
fi

# 2. Processor outputs (each one is optional)
echo "[2/8] Processor outputs..."
for proc in asr asrx asr-1 yolo face pose ocr cut scene; do
  src="$OUTPUT_DEV/${UUID}.${proc}.json"
  if [ -f "$src" ]; then
    cp "$src" "$OUTPUT/processors/"
    echo " ${proc}.json"
  fi
done

# 3. Identities (related to this file)
echo "[3/8] Identities..."
export_csv identities.csv "
  SELECT DISTINCT i.uuid, i.name, i.identity_type, i.source, i.status, i.metadata
  FROM dev.identities i
  JOIN dev.identity_bindings ib ON ib.identity_id = i.id
  WHERE ib.file_uuid = '$UUID'
"

# 4. Face detections
echo "[4/8] Face detections..."
export_csv face_detections.csv "
  SELECT id, file_uuid, frame_number, timestamp_secs, face_id, x, y, width, height, confidence, trace_id, identity_id
  FROM dev.face_detections WHERE file_uuid = '$UUID'
  ORDER BY frame_number
"

# 5. Chunks
echo "[5/8] Chunks..."
export_csv chunks.csv "
  SELECT chunk_id, chunk_type, start_frame, end_frame, start_time, end_time, fps, text_content
  FROM dev.chunk WHERE file_uuid = '$UUID'
  ORDER BY id
"

# 6. Vectors (only those whose chunk row exists)
echo "[6/8] chunk_vectors..."
export_csv chunk_vectors.csv "
  SELECT cv.chunk_id, cv.embedding::text
  FROM dev.chunk_vectors cv
  JOIN dev.chunk c ON c.file_uuid=cv.uuid AND c.chunk_id=cv.chunk_id
  WHERE cv.uuid = '$UUID'
"

# 7. TKG — unlike steps 3-6, a failure here stops the script (set -e)
echo "[7/8] TKG..."
"$PG_BIN/psql" -U accusys -d momentry -c "
  COPY (SELECT * FROM dev.tkg_nodes WHERE file_uuid='$UUID') TO '$OUTPUT/tkg_nodes.csv' WITH CSV HEADER;
" 2>/dev/null
"$PG_BIN/psql" -U accusys -d momentry -c "
  COPY (SELECT * FROM dev.tkg_edges WHERE file_uuid='$UUID') TO '$OUTPUT/tkg_edges.csv' WITH CSV HEADER;
" 2>/dev/null
echo " nodes+edges exported"

# 8. RELEASE_INFO
echo "[8/8] RELEASE_INFO..."
SENTENCE=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "SELECT count(*) FROM dev.chunk WHERE file_uuid='$UUID' AND chunk_type='sentence';" 2>/dev/null)
VECTORS=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "SELECT count(*) FROM dev.chunk_vectors cv JOIN dev.chunk c ON c.file_uuid=cv.uuid AND c.chunk_id=cv.chunk_id WHERE cv.uuid='$UUID';" 2>/dev/null)
cat > "$OUTPUT/RELEASE_INFO.txt" << EOF
Release: ${VERSION}
Type: file
UUID: ${UUID}
Date: $(date +%Y-%m-%d)

Chunks: sentence=${SENTENCE}
Vectors: ${VECTORS}
Processors: $(ls "$OUTPUT/processors/" 2>/dev/null | wc -l | tr -d ' ')
EOF

# Symlink latest
ln -sfn "$OUTPUT" "$PROJECT/release/files/$UUID/latest"

ELAPSED=$(($(date +%s) - T0))
echo ""
echo "=== File Package done (${ELAPSED}s) ==="
if [ "$WARNINGS" -gt 0 ]; then
  echo " WARN: ${WARNINGS} step(s) did not complete; package may be incomplete" >&2
fi
echo " $OUTPUT"
du -sh "$OUTPUT"
|
||||
100
scripts/package_release.sh
Normal file
100
scripts/package_release.sh
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/bin/bash
# ===========================================================
# Package Release — Standard Deliverable for M4
# Usage: bash scripts/package_release.sh <version> [file_uuid]
# version: e.g. v1.0.2
# file_uuid: default aeed71342a899fe4b4c57b7d41bcb692
# ===========================================================
set -euo pipefail

VERSION="${1:?Usage: $0 <version> [file_uuid]}"
UUID="${2:-aeed71342a899fe4b4c57b7d41bcb692}"
PG_BIN="/Users/accusys/pgsql/18.3/bin"
PROJECT="/Users/accusys/momentry_core_0.1"
OUTPUT="$PROJECT/release/phase1/${VERSION}"
M4_DIR="$PROJECT/docs_v1.0/M4_HANDOVER"
T0=$(date +%s)

echo "=========================================="
echo " Package Release ${VERSION}"
echo " UUID: ${UUID}"
echo "=========================================="
echo ""

mkdir -p "$OUTPUT"

# ── Step 1: DB backup (post-correction) ──
echo "[1/5] DB backup (post-correction)..."
"$PG_BIN/pg_dump" -U accusys -d momentry \
  --schema=dev \
  --table=dev.chunk \
  --table=dev.chunk_vectors \
  --data-only --column-inserts \
  > "$OUTPUT/dev_backup_post_correction.sql" 2>/dev/null
echo " $(ls -lh "$OUTPUT/dev_backup_post_correction.sql" | awk '{print $5}')"

# ── Step 2: Verify counts ──
# The release is only built when sentence chunks, vectors, and vectors that
# join to a chunk row all agree.
echo "[2/5] Verify counts..."
SENTENCE=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "SELECT count(*) FROM dev.chunk WHERE file_uuid='$UUID' AND chunk_type='sentence';" 2>/dev/null)
VECTORS=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "SELECT count(*) FROM dev.chunk_vectors WHERE uuid='$UUID';" 2>/dev/null)
MATCHED=$("$PG_BIN/psql" -U accusys -d momentry -t -A -c "
  SELECT count(*) FROM dev.chunk_vectors cv
  JOIN dev.chunk c ON c.file_uuid=cv.uuid AND c.chunk_id=cv.chunk_id
  WHERE cv.uuid='$UUID';" 2>/dev/null)
echo " sentence chunks: $SENTENCE"
echo " vectors: $VECTORS"
echo " matched: $MATCHED"

# An empty count means the query produced nothing; three empty strings must
# not be accepted as "consistent".
if [ -z "$SENTENCE" ] || [ "$SENTENCE" != "$VECTORS" ] || [ "$VECTORS" != "$MATCHED" ]; then
  echo " ❌ MISMATCH — aborting"
  exit 1
fi
echo " ✅ All counts consistent"

# ── Step 3: Source code archive ──
echo "[3/5] Source code (git archive)..."
cd "$PROJECT"
git archive --format=tar.gz -o "$OUTPUT/momentry_core_${VERSION}_source.tar.gz" HEAD 2>/dev/null
echo " $(ls -lh "$OUTPUT/momentry_core_${VERSION}_source.tar.gz" | awk '{print $5}')"

# ── Step 4: Correction record ──
echo "[4/5] Correction record (asr-1.json)..."
ASR1="/Users/accusys/momentry/output_dev/${UUID}.asr-1.json"
if [ -f "$ASR1" ]; then
  cp "$ASR1" "$OUTPUT/"
  echo " $(ls -lh "$OUTPUT/${UUID}.asr-1.json" | awk '{print $5}')"
else
  echo " ⚠️ No asr-1.json found for $UUID"
fi

# ── Step 5: Copy to M4_HANDOVER ──
echo "[5/5] Deploy to M4_HANDOVER..."
HANDOVER_SCRIPTS=(generate_asr1.py apply_asr_corrections.py clean_sentence_text.py pipeline_status.py)
HANDOVER_DOC="$PROJECT/docs_v1.0/API_V1.0.0/RELEASE/PHASE1_HANDOVER_V1.0.0.md"
API_TEST="/tmp/test_api.sh"

# Check every input BEFORE wiping the handover directory, so a missing file
# cannot leave M4_HANDOVER emptied and only half repopulated.
REQUIRED=("$HANDOVER_DOC" "$API_TEST")
for s in "${HANDOVER_SCRIPTS[@]}"; do
  REQUIRED+=("$PROJECT/scripts/$s")
done
MISSING=0
for f in "${REQUIRED[@]}"; do
  if [ ! -f "$f" ]; then
    echo " ❌ missing: $f" >&2
    MISSING=1
  fi
done
if [ "$MISSING" -ne 0 ]; then
  echo " ❌ handover inputs missing — M4_HANDOVER left untouched" >&2
  exit 1
fi

mkdir -p "$M4_DIR"
rm -rf "${M4_DIR:?}"/*
cp "$OUTPUT/dev_backup_post_correction.sql" "$M4_DIR/"
cp "$OUTPUT/momentry_core_${VERSION}_source.tar.gz" "$M4_DIR/"
if [ -f "$OUTPUT/${UUID}.asr-1.json" ]; then
  cp "$OUTPUT/${UUID}.asr-1.json" "$M4_DIR/"
fi
# Scripts
for s in "${HANDOVER_SCRIPTS[@]}"; do
  cp "$PROJECT/scripts/$s" "$M4_DIR/"
done
# Handover doc + test script
cp "$HANDOVER_DOC" "$M4_DIR/HANDOVER_${VERSION}.md"
cp "$API_TEST" "$M4_DIR/api_test.sh"

# Create RELEASE_INFO
cat > "$M4_DIR/RELEASE_INFO.txt" << EOF
Release: ${VERSION}
Date: $(date +%Y-%m-%d)
UUID: ${UUID}
Pipeline: $(python3 "$PROJECT/scripts/pipeline_status.py" 2>/dev/null | grep "TOTAL" | tr -d ' ')
EOF

ELAPSED=$(($(date +%s) - T0))
echo ""
echo "=========================================="
echo " Package ${VERSION} complete (${ELAPSED}s)"
echo " Output: $OUTPUT"
echo " M4: $M4_DIR"
echo "=========================================="
|
||||
100
scripts/package_system.sh
Normal file
100
scripts/package_system.sh
Normal file
@@ -0,0 +1,100 @@
|
||||
#!/bin/bash
# Package System Upgrade — dev or prod
# Usage: bash package_system.sh dev <version>
#        bash package_system.sh prod <version>
#
# Assembles a release bundle under
# $PROJECT/release/system/<target>/<version>.
set -euo pipefail

# Both positional arguments are mandatory; `:?` aborts with the usage text.
USAGE="Usage: $0 <dev|prod> <version>"
TARGET="${1:?$USAGE}"
VERSION="${2:?$USAGE}"

PROJECT="/Users/accusys/momentry_core_0.1"
OUTPUT="$PROJECT/release/system/$TARGET/$VERSION"
PG_BIN="/Users/accusys/pgsql/18.3/bin"
# Start timestamp, used for the elapsed-time report at the end.
T0=$(date +%s)

# Map the target onto its database schema and environment file.
if [ "$TARGET" = "dev" ]; then
    SCHEMA="dev"
    ENV_FILE=".env.development"
elif [ "$TARGET" = "prod" ]; then
    SCHEMA="public"
    ENV_FILE=".env"
else
    echo "Target must be 'dev' or 'prod'"
    exit 1
fi

# Create the bundle layout up front so later steps can copy straight into it.
for sub in scripts test migration portal; do
    mkdir -p "$OUTPUT/$sub"
done

echo "=== System Package (${TARGET}) ${VERSION} ==="
|
||||
|
||||
# 1. Source code
# Snapshot the committed tree (HEAD) as a tarball; uncommitted changes are
# not included. stderr is left visible on purpose: under `set -e` a failing
# `git archive` aborts the script, and its error text is the only diagnostic.
echo "[1/6] Source code (git archive)..."
cd "$PROJECT"
git archive --format=tar.gz -o "$OUTPUT/source.tar.gz" HEAD
# Print the archive size (5th column of `ls -lh`).
echo " $(ls -lh "$OUTPUT/source.tar.gz" | awk '{print $5}')"
|
||||
|
||||
# 2. Schema
# Dump DDL only (no row data) for the schema that matches the target.
# The binary path is quoted, and stderr is no longer discarded so that a
# connection or permission failure is visible when `set -e` aborts the run.
echo "[2/6] Schema DDL (${SCHEMA})..."
"$PG_BIN/pg_dump" -U accusys -d momentry --schema="$SCHEMA" --schema-only > "$OUTPUT/schema.sql"
# Print the dump size (5th column of `ls -lh`).
echo " $(ls -lh "$OUTPUT/schema.sql" | awk '{print $5}')"
|
||||
|
||||
# 3. Scripts (tools)
# Copy the helper scripts that exist. A missing script is reported in the
# same style as a missing env file in step 5 rather than skipped silently,
# so an incomplete bundle is noticeable in the log.
echo "[3/6] Tools & scripts..."
for s in pipeline_status.py generate_asr1.py apply_asr_corrections.py clean_sentence_text.py import_file_package.py; do
    if [ -f "$PROJECT/scripts/$s" ]; then
        cp "$PROJECT/scripts/$s" "$OUTPUT/scripts/"
        echo " $s"
    else
        echo " WARN: $s not found"
    fi
done
# The API test script is mandatory: a missing file makes cp fail and,
# under `set -e`, aborts the packaging run.
cp "$PROJECT/docs_v1.0/M4_HANDOVER/api_test.sh" "$OUTPUT/test/"
echo " api_test.sh"
|
||||
|
||||
# 4. Portal frontend (dev only)
# Ship the built portal with dev packages. Every target now prints a [4/6]
# line, so the step numbering in the log has no gap for prod.
if [ "$TARGET" != "dev" ]; then
    echo "[4/6] Portal frontend — SKIP (${TARGET} target)"
elif [ -d "$PROJECT/portal/dist" ]; then
    echo "[4/6] Portal frontend (dist)..."
    # Remove any copy left by an earlier run of the same version; otherwise
    # cp merges into the existing directory and stale files stay in the bundle.
    rm -rf "${OUTPUT:?}/portal/dist"
    cp -r "$PROJECT/portal/dist" "$OUTPUT/portal/"
    echo " dist/ ($(du -sh "$PROJECT/portal/dist" | awk '{print $1}'))"
else
    echo "[4/6] Portal frontend — SKIP (no dist, run 'npm run build' first)"
fi
|
||||
|
||||
# 5. Env config
# Include the environment file selected for this target, if present.
echo "[5/6] Environment config..."
if [ ! -f "$PROJECT/$ENV_FILE" ]; then
    # Non-fatal: the bundle is still produced without the env file.
    echo " WARN: $ENV_FILE not found"
else
    cp "$PROJECT/$ENV_FILE" "$OUTPUT/"
    echo " $ENV_FILE"
fi
|
||||
|
||||
# 6. RELEASE_INFO
# Write a manifest describing the bundle. The always-present artifacts are
# listed in the heredoc; optional ones (env file, portal build) are appended
# only when they actually exist in $OUTPUT, so the manifest never lists a
# file that was not packaged.
echo "[6/6] RELEASE_INFO..."
cat > "$OUTPUT/RELEASE_INFO.txt" << EOF
Release: ${VERSION}
Target: ${TARGET}
Schema: ${SCHEMA}
Date: $(date +%Y-%m-%d)

Contents:
- source.tar.gz
- schema.sql (${SCHEMA})
- scripts/* (tools)
- test/api_test.sh
EOF
# Listed only when the env-config step copied the file into the bundle.
if [ -f "$OUTPUT/$ENV_FILE" ]; then
    echo "- ${ENV_FILE}" >> "$OUTPUT/RELEASE_INFO.txt"
fi
if [ "$TARGET" = "dev" ] && [ -d "$OUTPUT/portal/dist" ]; then
    echo " - portal/dist/" >> "$OUTPUT/RELEASE_INFO.txt"
fi
|
||||
|
||||
# Symlink latest
# Point <target>/latest at the bundle just built; `-n` replaces an existing
# link instead of creating a new one inside the directory it points to.
RELEASE_ROOT="$PROJECT/release/system/$TARGET"
mkdir -p "$RELEASE_ROOT"
ln -sfn "$OUTPUT" "$RELEASE_ROOT/latest"

# Report elapsed wall-clock seconds and the total bundle size.
FINISHED_AT=$(date +%s)
ELAPSED=$((FINISHED_AT - T0))
echo
echo "=== System Package (${TARGET}) done (${ELAPSED}s) ==="
echo " $OUTPUT"
du -sh "$OUTPUT"
|
||||
47
scripts/setup/01_postgresql.sh
Normal file
47
scripts/setup/01_postgresql.sh
Normal file
@@ -0,0 +1,47 @@
|
||||
#!/bin/bash
# PostgreSQL 18.3 - build from source
# Usage: bash scripts/setup/01_postgresql.sh
#
# Downloads the source tarball, builds and installs it under
# $HOME/pgsql/<version>, then initializes a data directory.

set -euo pipefail

PG_VERSION="18.3"
# Install prefix and data directory both live under the user's home.
PG_PREFIX="$HOME/pgsql/${PG_VERSION}"
PG_DATA="$HOME/pgsql/data"
PG_SOURCE_URL="https://ftp.postgresql.org/pub/source/v${PG_VERSION}/postgresql-${PG_VERSION}.tar.gz"

echo "=== PostgreSQL ${PG_VERSION} Source Build ==="

# Step 1: Install build deps (via Homebrew)
echo "[1/5] Installing build dependencies..."
BUILD_DEPS=(readline zlib icu4c openssl e2fsprogs pkg-config)
brew install "${BUILD_DEPS[@]}"
|
||||
|
||||
# Step 2: Download source
# Fetch and unpack the release tarball, then enter the source tree.
echo "[2/5] Downloading PostgreSQL ${PG_VERSION} source..."
mkdir -p ~/momentry_core_0.1/services/postgresql
cd ~/momentry_core_0.1/services/postgresql
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page as the tarball; -S keeps the error message visible despite -s.
curl -fsSL "$PG_SOURCE_URL" -o "postgresql-${PG_VERSION}.tar.gz"
tar xzf "postgresql-${PG_VERSION}.tar.gz"
cd "postgresql-${PG_VERSION}"
|
||||
|
||||
# Step 3: Configure
# Point the build at the Homebrew-installed libraries, then run configure
# with UUID (e2fs), ICU and OpenSSL support enabled.
echo "[3/5] Configuring..."
PKG_CONFIG_PATH="/opt/homebrew/opt/zlib/lib/pkgconfig:/opt/homebrew/opt/readline/lib/pkgconfig:/opt/homebrew/opt/icu4c/lib/pkgconfig:/opt/homebrew/lib/pkgconfig"
LDFLAGS="-L/opt/homebrew/opt/openssl/lib"
CPPFLAGS="-I/opt/homebrew/opt/openssl/include"
export PKG_CONFIG_PATH LDFLAGS CPPFLAGS

CONFIGURE_ARGS=(
    --prefix="$PG_PREFIX"
    --with-uuid=e2fs
    --with-icu
    --with-openssl
)
./configure "${CONFIGURE_ARGS[@]}"
|
||||
|
||||
# Step 4: Build
# Compile with one make job per CPU reported by sysctl, then install
# into the configured prefix.
echo "[4/5] Building (parallel)..."
NCPU=$(sysctl -n hw.ncpu)
make -j"$NCPU"
make install
|
||||
|
||||
# Step 5: Initialize data directory
echo "[5/5] Initializing data directory..."
mkdir -p "$PG_DATA"
# initdb refuses to run on a non-empty directory, which under `set -e` would
# abort a re-run after a successful build. An initialized cluster contains a
# PG_VERSION file, so use it to make this step idempotent.
if [ -f "$PG_DATA/PG_VERSION" ]; then
    echo "Data directory already initialized, skipping initdb: $PG_DATA"
else
    "$PG_PREFIX/bin/initdb" -D "$PG_DATA"
fi

# Record checksum
echo "PostgreSQL SHA256: $(shasum -a 256 "$PG_PREFIX/bin/postgres" | cut -d' ' -f1)"
echo "=== PostgreSQL ${PG_VERSION} build complete ==="
# Print the full path: the binaries are installed under $PG_PREFIX/bin and
# this script does not add that directory to PATH.
echo "Start: $PG_PREFIX/bin/pg_ctl -D $PG_DATA -l $HOME/pgsql/pg.log start"
|
||||
@@ -1 +1 @@
|
||||
16433
|
||||
55053
|
||||
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user