From 0ada8829915472cb9c94eff97382d5664aa0dec4 Mon Sep 17 00:00:00 2001 From: Timothy Carambat Date: Thu, 14 Mar 2024 15:43:26 -0700 Subject: [PATCH] Support external transcription providers (#909) * Support External Transcription providers * patch files * update docs * fix return data --- collector/index.js | 4 +- collector/package.json | 3 +- .../processSingleFile/convert/asAudio.js | 117 ++---------- collector/processSingleFile/index.js | 3 +- .../utils/WhisperProviders/OpenAiWhisper.js | 44 +++++ .../utils/WhisperProviders/localWhisper.js | 126 +++++++++++- collector/yarn.lock | 20 ++ docker/.env.example | 10 + frontend/src/App.jsx | 9 + .../src/components/SettingsSidebar/index.jsx | 11 +- .../NativeTranscriptionOptions/index.jsx | 38 ++++ .../OpenAiOptions/index.jsx | 41 ++++ .../TranscriptionPreference/index.jsx | 180 ++++++++++++++++++ frontend/src/utils/paths.js | 3 + server/.env.example | 10 + server/models/systemSettings.js | 1 + server/storage/models/README.md | 3 + server/utils/collectorApi/index.js | 14 +- server/utils/helpers/updateENV.js | 14 ++ 19 files changed, 541 insertions(+), 110 deletions(-) create mode 100644 collector/utils/WhisperProviders/OpenAiWhisper.js create mode 100644 frontend/src/components/TranscriptionSelection/NativeTranscriptionOptions/index.jsx create mode 100644 frontend/src/components/TranscriptionSelection/OpenAiOptions/index.jsx create mode 100644 frontend/src/pages/GeneralSettings/TranscriptionPreference/index.jsx diff --git a/collector/index.js b/collector/index.js index 9ebe5f1c..a1142d75 100644 --- a/collector/index.js +++ b/collector/index.js @@ -25,7 +25,7 @@ app.use( ); app.post("/process", async function (request, response) { - const { filename } = reqBody(request); + const { filename, options = {} } = reqBody(request); try { const targetFilename = path .normalize(filename) @@ -34,7 +34,7 @@ app.post("/process", async function (request, response) { success, reason, documents = [], - } = await processSingleFile(targetFilename); + } = await processSingleFile(targetFilename, options); response .status(200) .json({ filename: targetFilename, success, reason, documents }); diff --git a/collector/package.json b/collector/package.json index d145ab86..8a0441d7 100644 --- a/collector/package.json +++ b/collector/package.json @@ -33,6 +33,7 @@ "moment": "^2.29.4", "multer": "^1.4.5-lts.1", "officeparser": "^4.0.5", + "openai": "^3.2.1", "pdf-parse": "^1.1.1", "puppeteer": "~21.5.2", "slugify": "^1.6.6", @@ -46,4 +47,4 @@ "nodemon": "^2.0.22", "prettier": "^2.4.1" } -} +} \ No newline at end of file diff --git a/collector/processSingleFile/convert/asAudio.js b/collector/processSingleFile/convert/asAudio.js index 15ae5cf0..170426e4 100644 --- a/collector/processSingleFile/convert/asAudio.js +++ b/collector/processSingleFile/convert/asAudio.js @@ -1,5 +1,3 @@ -const fs = require("fs"); -const path = require("path"); const { v4 } = require("uuid"); const { createdDate, @@ -9,39 +7,35 @@ const { const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); const { LocalWhisper } = require("../../utils/WhisperProviders/localWhisper"); +const { OpenAiWhisper } = require("../../utils/WhisperProviders/OpenAiWhisper"); -async function asAudio({ fullFilePath = "", filename = "" }) { - const whisper = new LocalWhisper(); +const WHISPER_PROVIDERS = { + openai: OpenAiWhisper, + local: LocalWhisper, +}; + +async function asAudio({ fullFilePath = "", filename = "", options = {} }) { + const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty( + options?.whisperProvider + ) + ? WHISPER_PROVIDERS[options?.whisperProvider] + : WHISPER_PROVIDERS.local; console.log(`-- Working ${filename} --`); - const transcriberPromise = new Promise((resolve) => - whisper.client().then((client) => resolve(client)) - ); - const audioDataPromise = new Promise((resolve) => - convertToWavAudioData(fullFilePath).then((audioData) => resolve(audioData)) - ); - const [audioData, transcriber] = await Promise.all([ - audioDataPromise, - transcriberPromise, - ]); + const whisper = new WhisperProvider({ options }); + const { content, error } = await whisper.processFile(fullFilePath, filename); - if (!audioData) { - console.error(`Failed to parse content from ${filename}.`); + if (!!error) { + console.error(`Error encountered for parsing of ${filename}.`); trashFile(fullFilePath); return { success: false, - reason: `Failed to parse content from ${filename}.`, + reason: error, documents: [], }; } - console.log(`[Model Working]: Transcribing audio data to text`); - const { text: content } = await transcriber(audioData, { - chunk_length_s: 30, - stride_length_s: 5, - }); - - if (!content.length) { + if (!content?.length) { console.error(`Resulting text content was empty for ${filename}.`); trashFile(fullFilePath); return { @@ -76,79 +70,4 @@ async function asAudio({ fullFilePath = "", filename = "" }) { return { success: true, reason: null, documents: [document] }; } -async function convertToWavAudioData(sourcePath) { - try { - let buffer; - const wavefile = require("wavefile"); - const ffmpeg = require("fluent-ffmpeg"); - const outFolder = path.resolve(__dirname, `../../storage/tmp`); - if (!fs.existsSync(outFolder)) fs.mkdirSync(outFolder, { recursive: true }); - - const fileExtension = path.extname(sourcePath).toLowerCase(); - if (fileExtension !== ".wav") { - console.log( - `[Conversion Required] ${fileExtension} file detected - converting to .wav` - ); - const outputFile = path.resolve(outFolder, `${v4()}.wav`); - const convert = new Promise((resolve) => { - ffmpeg(sourcePath) - .toFormat("wav") - .on("error", (error) => { - console.error(`[Conversion Error] ${error.message}`); - resolve(false); - }) - .on("progress", (progress) => - console.log( - `[Conversion Processing]: ${progress.targetSize}KB converted` - ) - ) - .on("end", () => { - console.log("[Conversion Complete]: File converted to .wav!"); - resolve(true); - }) - .save(outputFile); - }); - const success = await convert; - if (!success) - throw new Error( - "[Conversion Failed]: Could not convert file to .wav format!" - ); - - const chunks = []; - const stream = fs.createReadStream(outputFile); - for await (let chunk of stream) chunks.push(chunk); - buffer = Buffer.concat(chunks); - fs.rmSync(outputFile); - } else { - const chunks = []; - const stream = fs.createReadStream(sourcePath); - for await (let chunk of stream) chunks.push(chunk); - buffer = Buffer.concat(chunks); - } - - const wavFile = new wavefile.WaveFile(buffer); - wavFile.toBitDepth("32f"); - wavFile.toSampleRate(16000); - - let audioData = wavFile.getSamples(); - if (Array.isArray(audioData)) { - if (audioData.length > 1) { - const SCALING_FACTOR = Math.sqrt(2); - - // Merge channels into first channel to save memory - for (let i = 0; i < audioData[0].length; ++i) { - audioData[0][i] = - (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2; - } - } - audioData = audioData[0]; - } - - return audioData; - } catch (error) { - console.error(`convertToWavAudioData`, error); - return null; - } -} - module.exports = asAudio; diff --git a/collector/processSingleFile/index.js b/collector/processSingleFile/index.js index 569a2cde..5d9e6a38 100644 --- a/collector/processSingleFile/index.js +++ b/collector/processSingleFile/index.js @@ -7,7 +7,7 @@ const { const { trashFile, isTextType } = require("../utils/files"); const RESERVED_FILES = ["__HOTDIR__.md"]; -async function processSingleFile(targetFilename) { +async function processSingleFile(targetFilename, options = {}) { const fullFilePath = path.resolve(WATCH_DIRECTORY, targetFilename); if (RESERVED_FILES.includes(targetFilename)) return { @@ -54,6 +54,7 @@ async function processSingleFile(targetFilename) { return await FileTypeProcessor({ fullFilePath, filename: targetFilename, + options, }); } diff --git a/collector/utils/WhisperProviders/OpenAiWhisper.js b/collector/utils/WhisperProviders/OpenAiWhisper.js new file mode 100644 index 00000000..3b9d08e6 --- /dev/null +++ b/collector/utils/WhisperProviders/OpenAiWhisper.js @@ -0,0 +1,44 @@ +const fs = require("fs"); + +class OpenAiWhisper { + constructor({ options }) { + const { Configuration, OpenAIApi } = require("openai"); + if (!options.openAiKey) throw new Error("No OpenAI API key was set."); + + const config = new Configuration({ + apiKey: options.openAiKey, + }); + this.openai = new OpenAIApi(config); + this.model = "whisper-1"; + this.temperature = 0; + this.#log("Initialized."); + } + + #log(text, ...args) { + console.log(`\x1b[32m[OpenAiWhisper]\x1b[0m ${text}`, ...args); + } + + async processFile(fullFilePath) { + return await this.openai + .createTranscription( + fs.createReadStream(fullFilePath), + this.model, + undefined, + "text", + this.temperature + ) + .then((res) => { + if (res.hasOwnProperty("data")) + return { content: res.data, error: null }; + return { content: "", error: "No content was able to be transcribed." }; + }) + .catch((e) => { + this.#log(`Could not get any response from openai whisper`, e.message); + return { content: "", error: e.message }; + }); + } +} + +module.exports = { + OpenAiWhisper, +}; diff --git a/collector/utils/WhisperProviders/localWhisper.js b/collector/utils/WhisperProviders/localWhisper.js index 6503e202..46dbe226 100644 --- a/collector/utils/WhisperProviders/localWhisper.js +++ b/collector/utils/WhisperProviders/localWhisper.js @@ -1,5 +1,6 @@ -const path = require("path"); const fs = require("fs"); +const path = require("path"); +const { v4 } = require("uuid"); class LocalWhisper { constructor() { @@ -16,12 +17,94 @@ class LocalWhisper { // Make directory when it does not exist in existing installations if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir, { recursive: true }); + + this.#log("Initialized."); + } + + #log(text, ...args) { + console.log(`\x1b[32m[LocalWhisper]\x1b[0m ${text}`, ...args); + } + + async #convertToWavAudioData(sourcePath) { + try { + let buffer; + const wavefile = require("wavefile"); + const ffmpeg = require("fluent-ffmpeg"); + const outFolder = path.resolve(__dirname, `../../storage/tmp`); + if (!fs.existsSync(outFolder)) + fs.mkdirSync(outFolder, { recursive: true }); + + const fileExtension = path.extname(sourcePath).toLowerCase(); + if (fileExtension !== ".wav") { + this.#log( + `File conversion required! ${fileExtension} file detected - converting to .wav` + ); + const outputFile = path.resolve(outFolder, `${v4()}.wav`); + const convert = new Promise((resolve) => { + ffmpeg(sourcePath) + .toFormat("wav") + .on("error", (error) => { + this.#log(`Conversion Error! ${error.message}`); + resolve(false); + }) + .on("progress", (progress) => + this.#log( + `Conversion Processing! ${progress.targetSize}KB converted` + ) + ) + .on("end", () => { + this.#log(`Conversion Complete! File converted to .wav!`); + resolve(true); + }) + .save(outputFile); + }); + const success = await convert; + if (!success) + throw new Error( + "[Conversion Failed]: Could not convert file to .wav format!" + ); + + const chunks = []; + const stream = fs.createReadStream(outputFile); + for await (let chunk of stream) chunks.push(chunk); + buffer = Buffer.concat(chunks); + fs.rmSync(outputFile); + } else { + const chunks = []; + const stream = fs.createReadStream(sourcePath); + for await (let chunk of stream) chunks.push(chunk); + buffer = Buffer.concat(chunks); + } + + const wavFile = new wavefile.WaveFile(buffer); + wavFile.toBitDepth("32f"); + wavFile.toSampleRate(16000); + + let audioData = wavFile.getSamples(); + if (Array.isArray(audioData)) { + if (audioData.length > 1) { + const SCALING_FACTOR = Math.sqrt(2); + + // Merge channels into first channel to save memory + for (let i = 0; i < audioData[0].length; ++i) { + audioData[0][i] = + (SCALING_FACTOR * (audioData[0][i] + audioData[1][i])) / 2; + } + } + audioData = audioData[0]; + } + + return audioData; + } catch (error) { + console.error(`convertToWavAudioData`, error); + return null; + } } async client() { if (!fs.existsSync(this.modelPath)) { - console.log( - "\x1b[34m[INFO]\x1b[0m The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~250MB)\n\n" + this.#log( + `The native whisper model has never been run and will be downloaded right now. Subsequent runs will be faster. (~250MB)` ); } @@ -48,10 +131,45 @@ class LocalWhisper { : {}), }); } catch (error) { - console.error("Failed to load the native whisper model:", error); + this.#log("Failed to load the native whisper model:", error); throw error; } } + + async processFile(fullFilePath, filename) { + try { + const transcriberPromise = new Promise((resolve) => + this.client().then((client) => resolve(client)) + ); + const audioDataPromise = new Promise((resolve) => + this.#convertToWavAudioData(fullFilePath).then((audioData) => + resolve(audioData) + ) + ); + const [audioData, transcriber] = await Promise.all([ + audioDataPromise, + transcriberPromise, + ]); + + if (!audioData) { + this.#log(`Failed to parse content from ${filename}.`); + return { + content: null, + error: `Failed to parse content from ${filename}.`, + }; + } + + this.#log(`Transcribing audio data to text...`); + const { text } = await transcriber(audioData, { + chunk_length_s: 30, + stride_length_s: 5, + }); + + return { content: text, error: null }; + } catch (error) { + return { content: null, error: error.message }; + } + } } module.exports = { diff --git a/collector/yarn.lock b/collector/yarn.lock index bf979c86..3bb0f1ea 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -372,6 +372,13 @@ asynckit@^0.4.0: resolved "https://registry.yarnpkg.com/asynckit/-/asynckit-0.4.0.tgz#c79ed97f7f34cb8f2ba1bc9790bcc366474b4b79" integrity sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q== +axios@^0.26.0: + version "0.26.1" + resolved "https://registry.yarnpkg.com/axios/-/axios-0.26.1.tgz#1ede41c51fcf51bbbd6fd43669caaa4f0495aaa9" + integrity sha512-fPwcX4EvnSHuInCMItEhAGnaSEXRBjtzh9fOtsE6E1G6p7vl7edEeZe11QHf18+6+9gR5PbKV/sGKNaD8YaMeA== + dependencies: + follow-redirects "^1.14.8" + b4a@^1.6.4: version "1.6.4" resolved "https://registry.yarnpkg.com/b4a/-/b4a-1.6.4.tgz#ef1c1422cae5ce6535ec191baeed7567443f36c9" @@ -1203,6 +1210,11 @@ fluent-ffmpeg@^2.1.2: async ">=0.2.9" which "^1.1.1" +follow-redirects@^1.14.8: + version "1.15.6" + resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.15.6.tgz#7f815c0cda4249c74ff09e95ef97c23b5fd0399b" + integrity sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA== + form-data-encoder@1.7.2: version "1.7.2" resolved "https://registry.yarnpkg.com/form-data-encoder/-/form-data-encoder-1.7.2.tgz#1f1ae3dccf58ed4690b86d87e4f57c654fbab040" @@ -2304,6 +2316,14 @@ onnxruntime-web@1.14.0: onnxruntime-common "~1.14.0" platform "^1.3.6" +openai@^3.2.1: + version "3.3.0" + resolved "https://registry.yarnpkg.com/openai/-/openai-3.3.0.tgz#a6408016ad0945738e1febf43f2fccca83a3f532" + integrity sha512-uqxI/Au+aPRnsaQRe8CojU0eCR7I0mBiKjD3sNMzY6DaC1ZVrc85u98mtJW6voDug8fgGN+DIZmTDxTthxb7dQ== + dependencies: + axios "^0.26.0" + form-data "^4.0.0" + openai@^4.19.0: version "4.20.1" resolved "https://registry.yarnpkg.com/openai/-/openai-4.20.1.tgz#afa0d496d125b5a0f6cebcb4b9aeabf71e00214e" diff --git a/docker/.env.example b/docker/.env.example index ae4913dc..ed6fd3bc 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -131,6 +131,16 @@ GID='1000' # ASTRA_DB_APPLICATION_TOKEN= # ASTRA_DB_ENDPOINT= +########################################### +######## Audio Model Selection ############ +########################################### +# (default) use built-in whisper-small model. +# WHISPER_PROVIDER="local" + +# use openai hosted whisper model. +# WHISPER_PROVIDER="openai" +# OPEN_AI_KEY=sk-xxxxxxxx + # CLOUD DEPLOYMENT VARIRABLES ONLY # AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting. # DISABLE_TELEMETRY="false" diff --git a/frontend/src/App.jsx b/frontend/src/App.jsx index 86f6eb08..8a57d27b 100644 --- a/frontend/src/App.jsx +++ b/frontend/src/App.jsx @@ -29,6 +29,9 @@ const GeneralApiKeys = lazy(() => import("@/pages/GeneralSettings/ApiKeys")); const GeneralLLMPreference = lazy( () => import("@/pages/GeneralSettings/LLMPreference") ); +const GeneralTranscriptionPreference = lazy( + () => import("@/pages/GeneralSettings/TranscriptionPreference") +); const GeneralEmbeddingPreference = lazy( () => import("@/pages/GeneralSettings/EmbeddingPreference") ); @@ -76,6 +79,12 @@ export default function App() { path="/settings/llm-preference" element={} /> + + } + /> } diff --git a/frontend/src/components/SettingsSidebar/index.jsx b/frontend/src/components/SettingsSidebar/index.jsx index 84b78064..a7aca7ff 100644 --- a/frontend/src/components/SettingsSidebar/index.jsx +++ b/frontend/src/components/SettingsSidebar/index.jsx @@ -19,6 +19,7 @@ import { Notepad, CodeBlock, Barcode, + ClosedCaptioning, } from "@phosphor-icons/react"; import useUser from "@/hooks/useUser"; import { USER_BACKGROUND_COLOR } from "@/utils/constants"; @@ -278,9 +279,17 @@ const SidebarOptions = ({ user = null }) => ( flex={true} allowedRole={["admin"]} /> +