* refactor localWhisper to use new custom FFMPEGWrapper class * stub tests in github actions * add back wavefile conversion to 16khz 32f to fix docker builds * use afterEach for cleanup in ffmpeg tests * remove unused FFMPEG_PATH env check * use spawnSync for ffmpeg to capture and log output * lint * revert removal of try/catch around validateAudioFile for more helpful error msgs * use readFileSync instead of createReadStream for less overhead * change import to require for fix-path and stub import in tests * refactor to singleton to preserve ffmpeg path dev build --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
198 lines
6.3 KiB
JavaScript
198 lines
6.3 KiB
JavaScript
const fs = require("fs");
|
|
const path = require("path");
|
|
const { v4 } = require("uuid");
|
|
// Model used when no preference is configured.
// Model Card: https://huggingface.co/Xenova/whisper-small
const defaultWhisper = "Xenova/whisper-small";

// Approximate download size per supported model, shown to the user in the
// "model will be downloaded" log message. Frozen because it is a shared,
// read-only lookup table.
const fileSize = Object.freeze({
  "Xenova/whisper-small": "250mb",
  "Xenova/whisper-large": "1.56GB",
});
|
|
|
|
class LocalWhisper {
|
|
constructor({ options }) {
|
|
this.model = options?.WhisperModelPref ?? defaultWhisper;
|
|
this.fileSize = fileSize[this.model];
|
|
this.cacheDir = path.resolve(
|
|
process.env.STORAGE_DIR
|
|
? path.resolve(process.env.STORAGE_DIR, `models`)
|
|
: path.resolve(__dirname, `../../../server/storage/models`)
|
|
);
|
|
|
|
this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/"));
|
|
// Make directory when it does not exist in existing installations
|
|
if (!fs.existsSync(this.cacheDir))
|
|
fs.mkdirSync(this.cacheDir, { recursive: true });
|
|
|
|
this.#log("Initialized.");
|
|
}
|
|
|
|
#log(text, ...args) {
|
|
console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args);
|
|
}
|
|
|
|
#validateAudioFile(wavFile) {
|
|
const sampleRate = wavFile.fmt.sampleRate;
|
|
const duration = wavFile.data.samples / sampleRate;
|
|
|
|
// Most speech recognition systems expect minimum 8kHz
|
|
// But we'll set it lower to be safe
|
|
if (sampleRate < 4000) {
|
|
// 4kHz minimum
|
|
throw new Error(
|
|
"Audio file sample rate is too low for accurate transcription. Minimum required is 4kHz."
|
|
);
|
|
}
|
|
|
|
// Typical audio file duration limits
|
|
const MAX_DURATION_SECONDS = 4 * 60 * 60; // 4 hours
|
|
if (duration > MAX_DURATION_SECONDS) {
|
|
throw new Error("Audio file duration exceeds maximum limit of 4 hours.");
|
|
}
|
|
|
|
// Check final sample count after upsampling to prevent memory issues
|
|
const targetSampleRate = 16000;
|
|
const upsampledSamples = duration * targetSampleRate;
|
|
const MAX_SAMPLES = 230_400_000; // ~4 hours at 16kHz
|
|
|
|
if (upsampledSamples > MAX_SAMPLES) {
|
|
throw new Error("Audio file exceeds maximum allowed length.");
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
async #convertToWavAudioData(sourcePath) {
|
|
try {
|
|
let buffer;
|
|
const wavefile = require("wavefile");
|
|
const { FFMPEGWrapper } = require("./ffmpeg");
|
|
const ffmpeg = new FFMPEGWrapper();
|
|
const outFolder = path.resolve(__dirname, `../../storage/tmp`);
|
|
if (!fs.existsSync(outFolder))
|
|
fs.mkdirSync(outFolder, { recursive: true });
|
|
|
|
const outputFile = path.resolve(outFolder, `${v4()}.wav`);
|
|
const success = ffmpeg.convertAudioToWav(sourcePath, outputFile);
|
|
if (!success)
|
|
throw new Error(
|
|
"[Conversion Failed]: Could not convert file to .wav format!"
|
|
);
|
|
|
|
buffer = fs.readFileSync(outputFile);
|
|
fs.rmSync(outputFile);
|
|
|
|
const wavFile = new wavefile.WaveFile(buffer);
|
|
try {
|
|
this.#validateAudioFile(wavFile);
|
|
} catch (error) {
|
|
this.#log(`Audio validation failed: ${error.message}`);
|
|
throw new Error(`Invalid audio file: ${error.message}`);
|
|
}
|
|
|
|
// Although we use ffmpeg to convert to the correct format (16k hz 32f),
|
|
// different versions of ffmpeg produce different results based on the
|
|
// environment. To ensure consistency, we convert to the correct format again.
|
|
wavFile.toBitDepth("32f");
|
|
wavFile.toSampleRate(16000);
|
|
|
|
let audioData = wavFile.getSamples();
|
|
if (Array.isArray(audioData)) {
|
|
if (audioData.length > 1) {
|
|
const SCALING_FACTOR = Math.sqrt(2);
|
|
|
|
// Merge channels into first channel to save memory
|
|
for (let i = 0; i < audioData[0].length; ++i) {
|
|
audioData[0][i] =
|
|
(SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
|
|
}
|
|
}
|
|
audioData = audioData[0];
|
|
}
|
|
|
|
return audioData;
|
|
} catch (error) {
|
|
console.error(`convertToWavAudioData`, error);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
async client() {
|
|
if (!fs.existsSync(this.modelPath)) {
|
|
this.#log(
|
|
`The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~${this.fileSize})`
|
|
);
|
|
}
|
|
|
|
try {
|
|
// Convert ESM to CommonJS via import so we can load this library.
|
|
const pipeline = (...args) =>
|
|
import("@xenova/transformers").then(({ pipeline }) => {
|
|
return pipeline(...args);
|
|
});
|
|
return await pipeline("automatic-speech-recognition", this.model, {
|
|
cache_dir: this.cacheDir,
|
|
...(!fs.existsSync(this.modelPath)
|
|
? {
|
|
// Show download progress if we need to download any files
|
|
progress_callback: (data) => {
|
|
if (!data.hasOwnProperty("progress")) return;
|
|
console.log(
|
|
`\x1b[34m[Embedding - Downloading Model Files]\x1b[0m ${
|
|
data.file
|
|
} ${~~data?.progress}%`
|
|
);
|
|
},
|
|
}
|
|
: {}),
|
|
});
|
|
} catch (error) {
|
|
let errMsg = error.message;
|
|
if (errMsg.includes("Could not locate file")) {
|
|
errMsg =
|
|
"The native whisper model failed to download from the huggingface.co CDN. Your internet connection may be unstable or blocked by Huggingface.co - you will need to download the model manually and place it in the storage/models folder to use local Whisper transcription.";
|
|
}
|
|
|
|
this.#log(
|
|
`Failed to load the native whisper model: ${errMsg}`,
|
|
error.stack
|
|
);
|
|
throw new Error(errMsg);
|
|
}
|
|
}
|
|
|
|
async processFile(fullFilePath, filename) {
|
|
try {
|
|
const audioDataPromise = new Promise((resolve) =>
|
|
this.#convertToWavAudioData(fullFilePath).then((audioData) =>
|
|
resolve(audioData)
|
|
)
|
|
);
|
|
const [audioData, transcriber] = await Promise.all([
|
|
audioDataPromise,
|
|
this.client(),
|
|
]);
|
|
|
|
if (!audioData) {
|
|
this.#log(`Failed to parse content from ${filename}.`);
|
|
return {
|
|
content: null,
|
|
error: `Failed to parse content from ${filename}.`,
|
|
};
|
|
}
|
|
|
|
this.#log(`Transcribing audio data to text...`);
|
|
const { text } = await transcriber(audioData, {
|
|
chunk_length_s: 30,
|
|
stride_length_s: 5,
|
|
});
|
|
|
|
return { content: text, error: null };
|
|
} catch (error) {
|
|
return { content: null, error: error.message };
|
|
}
|
|
}
|
|
}
|
|
|
|
// Public surface of this module: the local Whisper transcription provider.
module.exports = { LocalWhisper };
|