36 lines
994 B
Python
36 lines
994 B
Python
"""
|
|
Whisper Local - uses faster-whisper for per-segment transcription
|
|
"""
|
|
|
|
import numpy as np
|
|
|
|
|
|
def load_model(size="small", *, device="cpu", compute_type="int8"):
    """Create a faster-whisper ``WhisperModel``.

    Args:
        size: Model size name (or path) passed as the first argument to
            ``WhisperModel``. Defaults to ``"small"``.
        device: Device string forwarded to ``WhisperModel``. Defaults to
            ``"cpu"``, matching the previous hard-coded behavior.
        compute_type: Compute/quantization type forwarded to
            ``WhisperModel``. Defaults to ``"int8"``, matching the previous
            hard-coded behavior.

    Returns:
        The constructed ``WhisperModel`` instance.
    """
    # Imported lazily so that importing this module does not require
    # faster-whisper to be installed until a model is actually requested.
    from faster_whisper import WhisperModel

    return WhisperModel(size, device=device, compute_type=compute_type)
|
|
|
|
|
|
def transcribe_segment(wav, sample_rate, start_sec, end_sec, model):
    """Transcribe the ``[start_sec, end_sec)`` window of ``wav``.

    Args:
        wav: Sliceable audio buffer supporting ``len()`` (presumably a 1-D
            NumPy waveform -- confirm against the caller).
        sample_rate: Samples per second of ``wav``; used only to convert the
            second offsets into sample indices.
            NOTE(review): the slice is handed to ``model.transcribe`` without
            resampling, so the model presumably expects this rate -- confirm.
        start_sec: Window start, in seconds.
        end_sec: Window end, in seconds.
        model: Object exposing ``transcribe(audio, language=None)`` that
            returns ``(segments_iterable, info)``.

    Returns:
        A dict with keys ``"text"`` (stripped, space-joined segment texts),
        ``"language"``, ``"lang_prob"`` and ``"segments"`` (list of the
        segment objects yielded by the model). When the requested window
        contains no samples, the model is not called and an empty result
        (``""``, ``""``, ``0.0``, ``[]``) is returned.
    """
    total_samples = len(wav)

    # Clamp both bounds into [0, total_samples]. Without the clamp a negative
    # offset becomes a negative slice index, which Python interprets as
    # "count from the end" and silently selects the wrong audio.
    start_sample = max(0, int(start_sec * sample_rate))
    end_sample = min(int(end_sec * sample_rate), total_samples)

    # Covers start past the end of the audio as well as empty or inverted
    # windows; there is nothing to send to the model in any of these cases.
    if start_sample >= end_sample:
        return {"text": "", "language": "", "lang_prob": 0.0, "segments": []}

    segment_wav = wav[start_sample:end_sample]

    # language=None leaves language selection to the model.
    segments_generator, info = model.transcribe(segment_wav, language=None)

    language = info.language if info else ""
    lang_prob = info.language_probability if info else 0.0

    # Materialize once: the segments are both joined into text and returned.
    segs = list(segments_generator)
    text = " ".join(seg.text for seg in segs).strip()

    return {
        "text": text,
        "language": language,
        "lang_prob": lang_prob,
        "segments": segs,
    }
|