Files
momentry_core/chunked_transcribe.py
Warren b54c2def30 feat: add migrations, test scripts, and utility tools
- Add database migrations (006-028) for face recognition, identity, file_uuid
- Add test scripts for ASR, face, search, processing
- Add portal frontend (Tauri)
- Add config, benchmark, and monitoring utilities
- Add model checkpoints and pretrained model references
2026-04-30 15:11:53 +08:00

201 lines
5.7 KiB
Python

#!/usr/bin/env python3
"""
Chunked transcription to handle large audio files.
"""
import sys
import time
import tempfile
import json
import subprocess
from pathlib import Path
import numpy as np
def split_audio(input_path, chunk_duration=1800, output_dir=None):
    """Split an audio file into fixed-length 16 kHz mono WAV chunks using ffmpeg.

    Args:
        input_path: Path of the audio file to split.
        chunk_duration: Length of each chunk in seconds; must be positive.
        output_dir: Directory that receives the chunk files. When ``None`` a
            fresh temporary directory (prefix ``audio_chunks_``) is created;
            removing it is left to the caller.

    Returns:
        A ``(chunks, output_dir)`` tuple. ``chunks`` is a list of dicts with
        the keys ``path``, ``start_time`` and ``end_time`` (seconds from the
        start of the input); ``output_dir`` is the ``Path`` the chunk files
        were written to.

    Raises:
        ValueError: If ``chunk_duration`` is not positive, or if ffprobe does
            not report a parseable duration for ``input_path``.
    """
    # A zero or negative step would never move `start` past the end of the
    # file, so reject it before doing any work.
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")

    # Get total duration (before creating any directory, so a failed probe
    # does not leave an empty temp directory behind).
    probe_cmd = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "format=duration",
        "-of",
        "csv=p=0",
        str(input_path),
    ]
    probe = subprocess.run(probe_cmd, capture_output=True, text=True)
    try:
        total_duration = float(probe.stdout.strip())
    except ValueError as err:
        detail = probe.stderr.strip() or probe.stdout.strip() or "no output from ffprobe"
        raise ValueError(
            f"Could not determine duration of {input_path}: {detail}"
        ) from err

    if output_dir is None:
        output_dir = Path(tempfile.mkdtemp(prefix="audio_chunks_"))
    else:
        output_dir = Path(output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)

    print(
        f"Total audio duration: {total_duration:.1f}s ({total_duration / 3600:.1f} hrs)"
    )
    print(f"Splitting into {chunk_duration}s chunks...")

    chunks = []
    start = 0
    chunk_idx = 0
    while start < total_duration:
        chunk_path = output_dir / f"chunk_{chunk_idx:04d}.wav"
        # -ss after -i is output-side seeking: ffmpeg decodes and discards
        # input until `start`, then writes at most `chunk_duration` seconds.
        cmd = [
            "ffmpeg",
            "-i",
            str(input_path),
            "-ss",
            str(start),
            "-t",
            str(chunk_duration),
            "-acodec",
            "pcm_s16le",
            "-ar",
            "16000",
            "-ac",
            "1",
            "-y",
            str(chunk_path),
        ]
        proc = subprocess.run(cmd, capture_output=True)
        if proc.returncode != 0:
            # Do not trust whatever is on disk: with a caller-supplied
            # output_dir it could be a stale file from an earlier run.
            print(
                f"Warning: ffmpeg failed on chunk {chunk_idx} "
                f"(exit code {proc.returncode})"
            )
        elif chunk_path.exists() and chunk_path.stat().st_size > 0:
            chunks.append(
                {
                    "path": chunk_path,
                    "start_time": start,
                    # The final chunk is usually shorter than chunk_duration.
                    "end_time": min(start + chunk_duration, total_duration),
                }
            )
        else:
            print(f"Warning: Chunk {chunk_idx} may be empty")
        start += chunk_duration
        chunk_idx += 1

    print(f"Created {len(chunks)} chunks in {output_dir}")
    return chunks, output_dir
def transcribe_chunk(chunk_info, model, chunk_idx, total_chunks):
    """Run the model on one chunk and return its segments on the full-file timeline.

    Args:
        chunk_info: Dict with ``path``, ``start_time`` and ``end_time`` for
            the chunk.
        model: Object whose ``transcribe(path, beam_size=...)`` returns a
            ``(segments, info)`` pair.
        chunk_idx: Zero-based position of this chunk (progress output only).
        total_chunks: Number of chunks overall (progress output only).

    Returns:
        ``(results, info)``: ``results`` is a list of dicts with ``start``,
        ``end`` and ``text``; ``info`` is passed through from the model.
    """
    offset = chunk_info["start_time"]
    print(
        f"[{chunk_idx + 1}/{total_chunks}] Transcribing chunk {offset:.1f}-{chunk_info['end_time']:.1f}"
    )

    began = time.time()
    raw_segments, info = model.transcribe(str(chunk_info["path"]), beam_size=5)

    # Segment times are relative to the chunk; shift them by the chunk's
    # start so they line up with the original recording.
    results = [
        {
            "start": seg.start + offset,
            "end": seg.end + offset,
            "text": seg.text.strip(),
        }
        for seg in raw_segments
    ]

    print(f"{len(results)} segments in {time.time() - began:.1f}s")
    return results, info
def main():
    """Command-line entry point: split an audio file, transcribe it, save JSON.

    Parses arguments from ``sys.argv``, splits the input with
    ``split_audio``, loads a faster-whisper model once on CPU, transcribes
    every chunk with ``transcribe_chunk`` and writes the merged, time-sorted
    segments to the ``--output`` path.

    Exits with status 1 when the input file does not exist or no chunks were
    produced, and with an argparse usage error when ``--chunk-duration`` is
    not positive. A chunk that fails to transcribe is reported and skipped so
    the remaining chunks still reach the output. The temporary chunk
    directory is removed on every exit path.
    """
    import argparse
    import shutil
    import traceback

    parser = argparse.ArgumentParser(description="Chunked transcription")
    parser.add_argument("audio_path", help="Audio file path")
    parser.add_argument(
        "--chunk-duration",
        type=int,
        default=1800,
        help="Chunk duration in seconds (default: 1800 = 30 min)",
    )
    parser.add_argument("--model-size", default="tiny", help="Whisper model size")
    parser.add_argument("--compute-type", default="int8", help="Compute type")
    parser.add_argument(
        "--output", "-o", default="chunked_transcription.json", help="Output JSON path"
    )
    args = parser.parse_args()

    # split_audio walks the file in chunk_duration steps, so the step has to
    # be positive for the split to make progress.
    if args.chunk_duration <= 0:
        parser.error("--chunk-duration must be a positive number of seconds")

    audio_path = Path(args.audio_path)
    if not audio_path.exists():
        print(f"Error: File not found: {audio_path}")
        sys.exit(1)

    print(f"Chunked Transcription for {audio_path}")
    print(f"Model: {args.model_size}, Compute: {args.compute_type}")
    print(
        f"Chunk duration: {args.chunk_duration}s ({args.chunk_duration / 60:.1f} min)"
    )

    # Split audio
    chunks, temp_dir = split_audio(audio_path, chunk_duration=args.chunk_duration)

    # Everything from here on runs under try/finally so the chunk directory
    # is removed even on early exit or an unexpected error.
    try:
        if not chunks:
            print("No chunks created")
            sys.exit(1)

        # Load model once; imported here so argument errors are reported
        # without paying for the import.
        print("Loading Whisper model...")
        from faster_whisper import WhisperModel

        model_start = time.time()
        model = WhisperModel(
            args.model_size, device="cpu", compute_type=args.compute_type
        )
        print(f"Model loaded in {time.time() - model_start:.1f}s")

        # Process each chunk
        all_segments = []
        failed_chunks = []
        language = None
        language_prob = None
        for i, chunk in enumerate(chunks):
            try:
                segments, info = transcribe_chunk(chunk, model, i, len(chunks))
            except Exception as e:
                # Best effort: report the failure and continue with next chunk.
                print(f"Error transcribing chunk {i}: {e}")
                traceback.print_exc()
                failed_chunks.append(i)
                continue
            all_segments.extend(segments)
            # The language reported for the first successful chunk is used
            # for the whole file.
            if language is None:
                language = info.language
                language_prob = info.language_probability

        # Sort segments by start time
        all_segments.sort(key=lambda x: x["start"])

        # Save results
        output = {
            "language": language or "unknown",
            "language_probability": language_prob or 0.0,
            "segments": all_segments,
            "chunk_count": len(chunks),
            "chunk_duration": args.chunk_duration,
            "total_segments": len(all_segments),
        }
        output_path = Path(args.output)
        output_path.parent.mkdir(exist_ok=True, parents=True)
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)

        print("\nTranscription completed:")
        print(f" Total segments: {len(all_segments)}")
        print(
            f" Language: {output['language']} (prob {output['language_probability']:.2f})"
        )
        if failed_chunks:
            print(
                f" Warning: {len(failed_chunks)} of {len(chunks)} chunks failed "
                f"and are missing from the output: {failed_chunks}"
            )
        print(f" Results saved to: {output_path}")
    finally:
        # Cleanup temp directory
        shutil.rmtree(temp_dir, ignore_errors=True)
# Script entry point: run the CLI only when this file is executed directly.
if __name__ == "__main__":
    main()