const fs = require("fs"); const path = require("path"); const { v4 } = require("uuid"); const defaultWhisper = "Xenova/whisper-small"; // Model Card: https://huggingface.co/Xenova/whisper-small const fileSize = { "Xenova/whisper-small": "250mb", "Xenova/whisper-large": "1.56GB", }; class LocalWhisper { constructor({ options }) { this.model = options?.WhisperModelPref ?? defaultWhisper; this.fileSize = fileSize[this.model]; this.cacheDir = path.resolve( process.env.STORAGE_DIR ? path.resolve(process.env.STORAGE_DIR, `models`) : path.resolve(__dirname, `../../../server/storage/models`) ); this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/")); // Make directory when it does not exist in existing installations if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir, { recursive: true }); this.#log("Initialized."); } #log(text, ...args) { console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args); } #validateAudioFile(wavFile) { const sampleRate = wavFile.fmt.sampleRate; const duration = wavFile.data.samples / sampleRate; // Most speech recognition systems expect minimum 8kHz // But we'll set it lower to be safe if (sampleRate < 4000) { // 4kHz minimum throw new Error( "Audio file sample rate is too low for accurate transcription. Minimum required is 4kHz." ); } // Typical audio file duration limits const MAX_DURATION_SECONDS = 4 * 60 * 60; // 4 hours if (duration > MAX_DURATION_SECONDS) { throw new Error("Audio file duration exceeds maximum limit of 4 hours."); } // Check final sample count after upsampling to prevent memory issues const targetSampleRate = 16000; const upsampledSamples = duration * targetSampleRate; const MAX_SAMPLES = 230_400_000; // ~4 hours at 16kHz if (upsampledSamples > MAX_SAMPLES) { throw new Error("Audio file exceeds maximum allowed length."); } return true; } async #convertToWavAudioData(sourcePath) { try { let buffer; const wavefile = require("wavefile"); const { FFMPEGWrapper } = require("./ffmpeg"); const ffmpeg = new FFMPEGWrapper(); const outFolder = path.resolve(__dirname, `../../storage/tmp`); if (!fs.existsSync(outFolder)) fs.mkdirSync(outFolder, { recursive: true }); const outputFile = path.resolve(outFolder, `${v4()}.wav`); const success = await ffmpeg.convertAudioToWav(sourcePath, outputFile); if (!success) throw new Error( "[Conversion Failed]: Could not convert file to .wav format!" ); buffer = fs.readFileSync(outputFile); fs.rmSync(outputFile); const wavFile = new wavefile.WaveFile(buffer); try { this.#validateAudioFile(wavFile); } catch (error) { this.#log(`Audio validation failed: ${error.message}`); throw new Error(`Invalid audio file: ${error.message}`); } // Although we use ffmpeg to convert to the correct format (16k hz 32f), // different versions of ffmpeg produce different results based on the // environment. To ensure consistency, we convert to the correct format again. wavFile.toBitDepth("32f"); wavFile.toSampleRate(16000); let audioData = wavFile.getSamples(); if (Array.isArray(audioData)) { if (audioData.length > 1) { const SCALING_FACTOR = Math.sqrt(2); // Merge channels into first channel to save memory for (let i = 0; i < audioData[0].length; ++i) { audioData[0][i] = (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2; } } audioData = audioData[0]; } return audioData; } catch (error) { console.error(`convertToWavAudioData`, error); return null; } } async client() { if (!fs.existsSync(this.modelPath)) { this.#log( `The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~${this.fileSize})` ); } try { // Convert ESM to CommonJS via import so we can load this library. const pipeline = (...args) => import("@xenova/transformers").then(({ pipeline }) => { return pipeline(...args); }); return await pipeline("automatic-speech-recognition", this.model, { cache_dir: this.cacheDir, ...(!fs.existsSync(this.modelPath) ? { // Show download progress if we need to download any files progress_callback: (data) => { if (!data.hasOwnProperty("progress")) return; console.log( `\x1b[34m[ONNXWhisper - Downloading Model Files]\x1b[0m ${ data.file } ${~~data?.progress}%` ); }, } : {}), }); } catch (error) { let errMsg = error.message; if (errMsg.includes("Could not locate file")) { errMsg = "The native whisper model failed to download from the huggingface.co CDN. Your internet connection may be unstable or blocked by Huggingface.co - you will need to download the model manually and place it in the storage/models folder to use local Whisper transcription."; } this.#log( `Failed to load the native whisper model: ${errMsg}`, error.stack ); throw new Error(errMsg); } } async processFile(fullFilePath, filename) { try { const audioDataPromise = new Promise((resolve) => this.#convertToWavAudioData(fullFilePath).then((audioData) => resolve(audioData) ) ); const [audioData, transcriber] = await Promise.all([ audioDataPromise, this.client(), ]); if (!audioData) { this.#log(`Failed to parse content from ${filename}.`); return { content: null, error: `Failed to parse content from ${filename}.`, }; } this.#log(`Transcribing audio data to text...`); const { text } = await transcriber(audioData, { chunk_length_s: 30, stride_length_s: 5, }); return { content: text, error: null }; } catch (error) { return { content: null, error: error.message }; } } } module.exports = { LocalWhisper, };