merlyn/server/utils/EmbeddingEngines/ollama/index.js
Marcello Fitton 42a41201a8
feat: Document Embedding Status Events | Refactor Document Embedding to Job Queue and Forked Process (#5254)
* implement native embedder job queue

* persist embedding progress across renders

* add development worker timeouts

* change to static method

* native reranker

* remove useless return

* lint

* simplify

* make embedding worker timeout value configurable by admin

* add event emission for missing data

* lint

* remove onProgress callback argument

* make rerank to rerankDirect

* persists progress state across app reloads

* remove chunk level progress reporting

* remove unused variable

* make NATIVE_RERANKING_WORKER_TIMEOUT user configurable

* remove dead code

* scope embedding progress per-user and clear stale state on SSE reconnect

* lint

* revert vector databases and embedding engines to call their original methods

* simplify rerank

* simplify progress fetching by removing updateProgressFromApi

* remove duplicate jsdoc

* replace sessionStorage persistence with server-side history replay for embedding progress

* fix old comment

* fix: ignore premature SSE all_complete when embedding hasn't started yet

The SSE connection opens before the embedding API call fires, so the
server sees no buffered history and immediately sends all_complete.
Firefox dispatches this eagerly enough that it closes the EventSource
before real progress events arrive, causing the progress UI to clear
and fall back to the loading spinner. Chrome's EventSource timing
masks the race.

Track slugs where startEmbedding was called but no real progress event
has arrived yet via awaitingProgressRef. Ignore the first all_complete
for those slugs and keep the connection open for the real events.

* reduce duplication with progress emissions

* remove dead code

* refactor: streamline embedding progress handling

Removed unnecessary tracking of slugs for premature all_complete events in the EmbeddingProgressProvider. Updated the server-side logic to avoid sending all_complete when no embedding is in progress, allowing the connection to remain open for real events. Adjusted the embedding initiation flow to ensure the server processes the job before the SSE connection opens, improving the reliability of progress updates.

* fix stale comment

* remove unused function

* fix event emissions for document creation failure

* refactor: move Reranking Worker Idle Timeout input to LanceDBOptions component

Extracted the Reranking Worker Idle Timeout input from GeneralEmbeddingPreference and integrated it into the LanceDBOptions component. This change enhances modularity and maintains a cleaner structure for the settings interface.

* lint

* remove unused hadHistory vars

* refactor workspace directory by hoisting component and converting into functions

* moved EmbeddingProgressProvider to wrap Document Manager Modal

* refactor embed progress SSE connection to use fetchEventSource instead of native EventSource API.

* refactor message handling into a function and reduce duplication

* refactor: utilize writeResponseChunk for event emissions in document embedding progress SSE

* refactor: explicit in-proc embedding and rerank methods that are called by workers instead of process.send checks

* Abstract EmbeddingProgressBus and Worker Queue into modules

* remove error and toast messages on embed process result

* use safeJsonParse

* add chunk-level progress events with per-document progress bar in UI

* remove unused parameter

* rename all worker timeout references to use ttl | remove ttl updating from UI

* refactor: pass embedding context through job payload instead of global state

* lint

* add graceful shutdown for workers

* apply figma styles

* refactor embedding worker to use bree

* use existing WorkerQueue class as the management layer for jobs

* lint

* revert all reranking worker changes back to master state

Removes the reranking worker queue, rerankViaWorker/rerankInProcess
renames, and NATIVE_RERANKING_WORKER_TTL config so this branch
only contains the embedding worker job queue feature.

* remove breeManaged flag — WorkerQueue always spawns via Bree

* fix prompt embedding bug

* have embedTextInput call embedChunksInProcess

* add message field to `process.send()`

* remove nullish check and error throw

* remove bespoke graceful shutdown logic

* add spawnWorker method and abstract redundant flows into helper methods

* remove unneeded comment

* remove recomputation of TTL value

* frontend cleanup and refactor

* wip on backend refactor

* backend overhaul

* small lint

* second pass

* add logging, update endpoint

* simple refactor

* add reporting to all embedder providers

* fix styles

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2026-04-06 17:00:15 -07:00

137 lines
4.9 KiB
JavaScript

const {
maximumChunkLength,
reportEmbeddingProgress,
} = require("../../helpers");
const { Ollama } = require("ollama");
const { OllamaAILLM } = require("../../AiProviders/ollama");
/**
 * Embedding engine backed by an Ollama server.
 *
 * Configuration is read from the environment at construction time:
 * - EMBEDDING_BASE_PATH (required): base URL of the Ollama server.
 * - EMBEDDING_MODEL_PREF (required): name of the embedding model to use.
 * - OLLAMA_EMBEDDING_BATCH_SIZE (optional): chunks sent per embed request, defaults to 1.
 * - OLLAMA_AUTH_TOKEN (optional): sent as a Bearer token on every request when set.
 */
class OllamaEmbedder {
  /**
   * @throws {Error} When EMBEDDING_BASE_PATH or EMBEDDING_MODEL_PREF is not set.
   */
  constructor() {
    if (!process.env.EMBEDDING_BASE_PATH)
      throw new Error("No embedding base path was set.");
    if (!process.env.EMBEDDING_MODEL_PREF)
      throw new Error("No embedding model was set.");

    this.className = "OllamaEmbedder";
    this.basePath = process.env.EMBEDDING_BASE_PATH;
    this.model = process.env.EMBEDDING_MODEL_PREF;
    // A zero, negative or non-numeric batch size would produce empty batches,
    // so anything that is not a positive number falls back to 1.
    this.maxConcurrentChunks = OllamaEmbedder.#parseBatchSize(
      process.env.OLLAMA_EMBEDDING_BATCH_SIZE
    );
    this.embeddingMaxChunkLength = maximumChunkLength();
    this.authToken = process.env.OLLAMA_AUTH_TOKEN;

    const headers = this.authToken
      ? { Authorization: `Bearer ${this.authToken}` }
      : {};
    this.client = new Ollama({
      host: this.basePath,
      headers,
      fetch: OllamaAILLM.applyOllamaFetch(),
    });

    this.log(
      `initialized with model ${this.model} at ${this.basePath}. Batch size: ${this.maxConcurrentChunks}, num_ctx: ${this.embeddingMaxChunkLength}`
    );
  }

  /**
   * Normalizes a raw batch-size setting into a usable positive integer.
   * @param {string|number|undefined} raw - The configured batch size.
   * @returns {number} The floored value when it is a finite number >= 1, otherwise 1.
   */
  static #parseBatchSize(raw) {
    const parsed = Number(raw);
    return Number.isFinite(parsed) && parsed >= 1 ? Math.floor(parsed) : 1;
  }

  /**
   * Writes a message to stdout prefixed with the (cyan) class name.
   * @param {string} text - The message to log.
   * @param {...any} args - Extra values forwarded to console.log.
   */
  log(text, ...args) {
    console.log(`\x1b[36m[${this.className}]\x1b[0m ${text}`, ...args);
  }

  /**
   * Checks if the Ollama service is alive by pinging the base path.
   * @returns {Promise<boolean>} - A promise that resolves to true if the service is alive, false otherwise.
   */
  async #isAlive() {
    try {
      const res = await fetch(this.basePath);
      return res.ok;
    } catch (e) {
      this.log(e.message);
      return false;
    }
  }

  /**
   * Embeds a single text input (or the first entry of an array of inputs).
   * @param {string|string[]} textInput - The text to embed.
   * @returns {Promise<number[]>} - The first embedding, or an empty array when nothing was embedded.
   */
  async embedTextInput(textInput) {
    const result = await this.embedChunks(
      Array.isArray(textInput) ? textInput : [textInput]
    );
    return result?.[0] || [];
  }

  /**
   * This function takes an array of text chunks and embeds them using the Ollama API.
   * Chunks are processed in batches based on the maxConcurrentChunks setting to balance
   * resource usage on the Ollama endpoint.
   *
   * We will use the num_ctx option to set the maximum context window to the max chunk length defined by the user in the settings
   * so that the maximum context window is used and content is not truncated.
   *
   * We also assume the default keep alive option. This could cause issues with models being unloaded and reloaded
   * on low memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
   * constantly being loaded and unloaded, the user should use another LLM or Embedder to avoid this issue.
   * @param {string[]} textChunks - An array of text chunks to embed.
   * @returns {Promise<Array<number[]>|null>} - A promise that resolves to an array of embeddings
   * (one per chunk, in order), or null when there was nothing to embed.
   * @throws {Error} When the Ollama service is unreachable or any batch fails to embed.
   */
  async embedChunks(textChunks = []) {
    if (!(await this.#isAlive()))
      throw new Error(
        `Ollama service could not be reached. Is Ollama running?`
      );

    // Re-validate here as well so a bad value assigned after construction
    // cannot produce empty batches.
    const batchSize = OllamaEmbedder.#parseBatchSize(this.maxConcurrentChunks);
    this.log(
      `Embedding ${textChunks.length} chunks of text with ${this.model} in batches of ${batchSize}.`
    );

    const data = [];
    const totalBatches = Math.ceil(textChunks.length / batchSize);
    let currentBatch = 0;

    // Process chunks in batches based on maxConcurrentChunks
    for (let i = 0; i < textChunks.length; i += batchSize) {
      const batch = textChunks.slice(i, i + batchSize);
      currentBatch++;

      try {
        // Use input param instead of prompt param to support batch processing
        const res = await this.client.embed({
          model: this.model,
          input: batch,
          options: {
            // Always set the num_ctx to the max chunk length defined by the user in the settings
            // so that the maximum context window is used and content is not truncated.
            num_ctx: this.embeddingMaxChunkLength,
          },
        });

        const { embeddings } = res;
        if (!Array.isArray(embeddings) || embeddings.length === 0)
          throw new Error("Ollama returned empty embeddings for batch!");
        // Results are matched to chunks by position, so a short or long
        // response would silently misalign every following vector.
        if (embeddings.length !== batch.length)
          throw new Error(
            `Ollama returned ${embeddings.length} embeddings for a batch of ${batch.length} chunks!`
          );

        // Using prompt param in embed() would return a single embedding (number[])
        // but input param returns an array of embeddings (number[][]) for batch processing.
        // This is why we spread the embeddings array into the data array.
        data.push(...embeddings);
        reportEmbeddingProgress(data.length, textChunks.length);
        this.log(
          `Batch ${currentBatch}/${totalBatches}: Embedded ${embeddings.length} chunks. Total: ${data.length}/${textChunks.length}`
        );
      } catch (err) {
        // Fail the whole request on the first bad batch. Rethrowing here (rather
        // than testing a saved message string afterwards) ensures errors with an
        // empty or missing message are not swallowed.
        const reason = err?.message ?? String(err);
        this.log(reason);
        throw new Error(`Ollama Failed to embed: ${reason}`, { cause: err });
      }
    }

    return data.length > 0 ? data : null;
  }
}
// Export the embedder class by name on the CommonJS module object.
module.exports = {
OllamaEmbedder,
};