// merlyn/collector/utils/WhisperProviders/localWhisper.js

const fs = require("fs");
const path = require("path");
const { v4 } = require("uuid");
// Model id used when the caller does not supply a WhisperModelPref.
// Model Card: https://huggingface.co/Xenova/whisper-small
const defaultWhisper = "Xenova/whisper-small";

// Approximate download size for each known model id; shown in the
// first-run "model will be downloaded" log message.
const fileSize = {
  [defaultWhisper]: "250mb",
  "Xenova/whisper-large": "1.56GB",
};

class LocalWhisper {
  constructor({ options }) {
    this.model = options?.WhisperModelPref ?? defaultWhisper;
    this.fileSize = fileSize[this.model];
    this.cacheDir = path.resolve(
      process.env.STORAGE_DIR
        ? path.resolve(process.env.STORAGE_DIR, `models`)
        : path.resolve(__dirname, `../../../server/storage/models`)
    );

    this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/"));
    // Make directory when it does not exist in existing installations
    if (!fs.existsSync(this.cacheDir))
      fs.mkdirSync(this.cacheDir, { recursive: true });

    this.#log("Initialized.");
  }

  #log(text, ...args) {
    console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args);
  }

  #validateAudioFile(wavFile) {
    const sampleRate = wavFile.fmt.sampleRate;
    const duration = wavFile.data.samples / sampleRate;

    // Most speech recognition systems expect minimum 8kHz
    // But we'll set it lower to be safe
    if (sampleRate < 4000) {
      // 4kHz minimum
      throw new Error(
        "Audio file sample rate is too low for accurate transcription. Minimum required is 4kHz."
      );
    }

    // Typical audio file duration limits
    const MAX_DURATION_SECONDS = 4 * 60 * 60; // 4 hours
    if (duration > MAX_DURATION_SECONDS) {
      throw new Error("Audio file duration exceeds maximum limit of 4 hours.");
    }

    // Check final sample count after upsampling to prevent memory issues
    const targetSampleRate = 16000;
    const upsampledSamples = duration * targetSampleRate;
    const MAX_SAMPLES = 230_400_000; // ~4 hours at 16kHz

    if (upsampledSamples > MAX_SAMPLES) {
      throw new Error("Audio file exceeds maximum allowed length.");
    }

    return true;
  }

  async #convertToWavAudioData(sourcePath) {
    try {
      let buffer;
      const wavefile = require("wavefile");
      const { FFMPEGWrapper } = require("./ffmpeg");
      const ffmpeg = new FFMPEGWrapper();
      const outFolder = path.resolve(__dirname, `../../storage/tmp`);
      if (!fs.existsSync(outFolder))
        fs.mkdirSync(outFolder, { recursive: true });

      const outputFile = path.resolve(outFolder, `${v4()}.wav`);
      const success = ffmpeg.convertAudioToWav(sourcePath, outputFile);
      if (!success)
        throw new Error(
          "[Conversion Failed]: Could not convert file to .wav format!"
        );

      buffer = fs.readFileSync(outputFile);
      fs.rmSync(outputFile);

      const wavFile = new wavefile.WaveFile(buffer);
      try {
        this.#validateAudioFile(wavFile);
      } catch (error) {
        this.#log(`Audio validation failed: ${error.message}`);
        throw new Error(`Invalid audio file: ${error.message}`);
      }

      // Although we use ffmpeg to convert to the correct format (16k hz 32f),
      // different versions of ffmpeg produce different results based on the
      // environment. To ensure consistency, we convert to the correct format again.
      wavFile.toBitDepth("32f");
      wavFile.toSampleRate(16000);

      let audioData = wavFile.getSamples();
      if (Array.isArray(audioData)) {
        if (audioData.length > 1) {
          const SCALING_FACTOR = Math.sqrt(2);

          // Merge channels into first channel to save memory
          for (let i = 0; i < audioData[0].length; ++i) {
            audioData[0][i] =
              (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2;
          }
        }
        audioData = audioData[0];
      }

      return audioData;
    } catch (error) {
      console.error(`convertToWavAudioData`, error);
      return null;
    }
  }

  async client() {
    if (!fs.existsSync(this.modelPath)) {
      this.#log(
        `The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~${this.fileSize})`
      );
    }

    try {
      // Convert ESM to CommonJS via import so we can load this library.
      const pipeline = (...args) =>
        import("@xenova/transformers").then(({ pipeline }) => {
          return pipeline(...args);
        });
      return await pipeline("automatic-speech-recognition", this.model, {
        cache_dir: this.cacheDir,
        ...(!fs.existsSync(this.modelPath)
          ? {
              // Show download progress if we need to download any files
              progress_callback: (data) => {
                if (!data.hasOwnProperty("progress")) return;
                console.log(
                  `\x1b[34m[Embedding - Downloading Model Files]\x1b[0m ${
                    data.file
                  } ${~~data?.progress}%`
                );
              },
            }
          : {}),
      });
    } catch (error) {
      let errMsg = error.message;
      if (errMsg.includes("Could not locate file")) {
        errMsg =
          "The native whisper model failed to download from the huggingface.co CDN. Your internet connection may be unstable or blocked by Huggingface.co - you will need to download the model manually and place it in the storage/models folder to use local Whisper transcription.";
      }

      this.#log(
        `Failed to load the native whisper model: ${errMsg}`,
        error.stack
      );
      throw new Error(errMsg);
    }
  }

  async processFile(fullFilePath, filename) {
    try {
      const audioDataPromise = new Promise((resolve) =>
        this.#convertToWavAudioData(fullFilePath).then((audioData) =>
          resolve(audioData)
        )
      );
      const [audioData, transcriber] = await Promise.all([
        audioDataPromise,
        this.client(),
      ]);

      if (!audioData) {
        this.#log(`Failed to parse content from ${filename}.`);
        return {
          content: null,
          error: `Failed to parse content from ${filename}.`,
        };
      }

      this.#log(`Transcribing audio data to text...`);
      const { text } = await transcriber(audioData, {
        chunk_length_s: 30,
        stride_length_s: 5,
      });

      return { content: text, error: null };
    } catch (error) {
      return { content: null, error: error.message };
    }
  }
}

// CommonJS export surface of this module: only the provider class is exposed.
module.exports = {
  LocalWhisper,
};