Add batch embedding support for Ollama embedding provider (#4553)

* add batch embedding support for ollama embedding provider

* lint

* simplify ollama embedder input
Sean Hatfield 2025-11-25 13:03:47 -08:00 committed by GitHub
parent 6a72ac2240
commit 66e44f65b4
4 changed files with 79 additions and 15 deletions
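In short: the embedder previously made one request per text chunk through the embeddings endpoint's prompt parameter and received a single vector back; it now groups chunks and sends each group through embed()'s input parameter, which takes an array of strings and returns an array of vectors. A condensed before/after of the call shape, using the same names that appear in the diff below:

// Before: one request per chunk, a single embedding (number[]) in res.embedding
const { embedding } = await client.embeddings({ model, prompt: chunk });

// After: one request per batch, an array of embeddings (number[][]) in res.embeddings
const { embeddings } = await client.embed({ model, input: batch });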

View File

@@ -23,11 +23,18 @@ export default function OllamaEmbeddingOptions({ settings }) {
const [maxChunkLength, setMaxChunkLength] = useState(
settings?.EmbeddingModelMaxChunkLength || 8192
);
+ const [batchSize, setBatchSize] = useState(
+ settings?.OllamaEmbeddingBatchSize || 1
+ );
const handleMaxChunkLengthChange = (e) => {
setMaxChunkLength(Number(e.target.value));
};
+ const handleBatchSizeChange = (e) => {
+ setBatchSize(Number(e.target.value));
+ };
return (
<div className="w-full flex flex-col gap-y-7">
<div className="w-full flex items-start gap-[36px] mt-1.5">
@@ -74,7 +81,7 @@ export default function OllamaEmbeddingOptions({ settings }) {
}}
className="border-none text-theme-text-primary hover:text-theme-text-secondary flex items-center text-sm"
>
- {showAdvancedControls ? "Hide" : "Show"} Manual Endpoint Input
+ {showAdvancedControls ? "Hide" : "Show"} Advanced Settings
{showAdvancedControls ? (
<CaretUp size={14} className="ml-1" />
) : (
@@ -121,6 +128,41 @@ export default function OllamaEmbeddingOptions({ settings }) {
Enter the URL where Ollama is running.
</p>
</div>
+ <div className="flex flex-col w-60">
+ <div
+ data-tooltip-place="top"
+ data-tooltip-id="ollama-batch-size-tooltip"
+ className="flex gap-x-1 items-center mb-3"
+ >
+ <Info
+ size={16}
+ className="text-theme-text-secondary cursor-pointer"
+ />
+ <label className="text-white text-sm font-semibold block">
+ Embedding batch size
+ </label>
+ <Tooltip id="ollama-batch-size-tooltip">
+ Number of text chunks to embed in parallel. Higher values
+ improve speed but use more memory. Default is 1.
+ </Tooltip>
+ </div>
+ <input
+ type="number"
+ name="OllamaEmbeddingBatchSize"
+ className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
+ placeholder="1"
+ min={1}
+ value={batchSize}
+ onChange={handleBatchSizeChange}
+ onScroll={(e) => e.target.blur()}
+ required={true}
+ autoComplete="off"
+ />
+ <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+ Increase this value to process multiple chunks simultaneously for
+ faster embedding.
+ </p>
+ </div>
</div>
</div>
</div>
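A wiring note on the new field: its name attribute, OllamaEmbeddingBatchSize, matches the KEY_MAPPING entry added at the end of this commit, which is presumably how the saved form value reaches the OLLAMA_EMBEDDING_BATCH_SIZE environment variable. So a saved settings payload would look roughly like this (values illustrative):

// Hypothetical payload produced by saving this form:
// { EmbeddingModelMaxChunkLength: 8192, OllamaEmbeddingBatchSize: 4, ... }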

View File

@@ -232,6 +232,7 @@ const SystemSettings = {
: process.env.EMBEDDING_MODEL_PREF,
EmbeddingModelMaxChunkLength:
process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
+ OllamaEmbeddingBatchSize: process.env.OLLAMA_EMBEDDING_BATCH_SIZE || 1,
VoyageAiApiKey: !!process.env.VOYAGEAI_API_KEY,
GenericOpenAiEmbeddingApiKey:
!!process.env.GENERIC_OPEN_AI_EMBEDDING_API_KEY,

View File

@@ -11,12 +11,13 @@ class OllamaEmbedder {
this.className = "OllamaEmbedder";
this.basePath = process.env.EMBEDDING_BASE_PATH;
this.model = process.env.EMBEDDING_MODEL_PREF;
// Limit of how many strings we can process in a single pass to stay within resource or network limits
- this.maxConcurrentChunks = 1;
+ this.maxConcurrentChunks = process.env.OLLAMA_EMBEDDING_BATCH_SIZE
+ ? Number(process.env.OLLAMA_EMBEDDING_BATCH_SIZE)
+ : 1;
this.embeddingMaxChunkLength = maximumChunkLength();
this.client = new Ollama({ host: this.basePath });
this.log(
- `initialized with model ${this.model} at ${this.basePath}. num_ctx: ${this.embeddingMaxChunkLength}`
+ `initialized with model ${this.model} at ${this.basePath}. Batch size: ${this.maxConcurrentChunks}, num_ctx: ${this.embeddingMaxChunkLength}`
);
}
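One subtlety in the new constructor guard: the ternary tests the raw string for truthiness before parsing, so an unset or empty variable falls back to 1, while the string "0" is truthy and would parse to a batch size of 0 (the nonZero check added to updateENV at the end of this commit keeps that value from being saved through the UI). Illustrative cases:

// OLLAMA_EMBEDDING_BATCH_SIZE unset -> maxConcurrentChunks = 1
// OLLAMA_EMBEDDING_BATCH_SIZE=""    -> falsy string, falls back to 1
// OLLAMA_EMBEDDING_BATCH_SIZE="8"   -> Number("8") === 8
// OLLAMA_EMBEDDING_BATCH_SIZE="0"   -> Number("0") === 0, only reachable by editing the env file by hand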
@@ -46,14 +47,14 @@
/**
* This function takes an array of text chunks and embeds them using the Ollama API.
- * chunks are processed sequentially to avoid overwhelming the API with too many requests
- * or running out of resources on the endpoint running the ollama instance.
+ * Chunks are processed in batches based on the maxConcurrentChunks setting to balance
+ * resource usage on the Ollama endpoint.
*
* We will use the num_ctx option to set the maximum context window to the max chunk length defined by the user in the settings
* so that the maximum context window is used and content is not truncated.
*
* We also assume the default keep alive option. This could cause issues with models being unloaded and reloaded
- * on load memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
+ * on low memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
* constantly being loaded and unloaded, the user should use another LLM or Embedder to avoid this issue.
* @param {string[]} textChunks - An array of text chunks to embed.
* @returns {Promise<Array<number[]>>} - A promise that resolves to an array of embeddings.
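For users who do hit the load/unload churn described above, Ollama's embed endpoint also accepts a keep_alive option controlling how long the model stays resident. A sketch of what overriding the default might look like (not part of this commit; the duration is illustrative):

// Hypothetical: pin the embedding model in memory for 30 minutes
const res = await this.client.embed({
  model: this.model,
  input: batch,
  keep_alive: "30m",
});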
@@ -64,17 +65,27 @@
`Ollama service could not be reached. Is Ollama running?`
);
this.log(
- `Embedding ${textChunks.length} chunks of text with ${this.model}.`
+ `Embedding ${textChunks.length} chunks of text with ${this.model} in batches of ${this.maxConcurrentChunks}.`
);
let data = [];
let error = null;
- for (const chunk of textChunks) {
+ // Process chunks in batches based on maxConcurrentChunks
+ const totalBatches = Math.ceil(
+ textChunks.length / this.maxConcurrentChunks
+ );
+ let currentBatch = 0;
+ for (let i = 0; i < textChunks.length; i += this.maxConcurrentChunks) {
+ const batch = textChunks.slice(i, i + this.maxConcurrentChunks);
+ currentBatch++;
try {
- const res = await this.client.embeddings({
+ // Use input param instead of prompt param to support batch processing
+ const res = await this.client.embed({
model: this.model,
- prompt: chunk,
+ input: batch,
options: {
// Always set the num_ctx to the max chunk length defined by the user in the settings
// so that the maximum context window is used and content is not truncated.
@@ -82,11 +93,17 @@
},
});
- const { embedding } = res;
- if (!Array.isArray(embedding) || embedding.length === 0)
- throw new Error("Ollama returned an empty embedding for chunk!");
+ const { embeddings } = res;
+ if (!Array.isArray(embeddings) || embeddings.length === 0)
+ throw new Error("Ollama returned empty embeddings for batch!");
- data.push(embedding);
+ // Using prompt param in embed() would return a single embedding (number[])
+ // but input param returns an array of embeddings (number[][]) for batch processing.
+ // This is why we spread the embeddings array into the data array.
+ data.push(...embeddings);
+ this.log(
+ `Batch ${currentBatch}/${totalBatches}: Embedded ${embeddings.length} chunks. Total: ${data.length}/${textChunks.length}`
+ );
} catch (err) {
this.log(err.message);
error = err.message;
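Distilled out of the diff, the new control flow is a plain stride loop over the chunk array. A minimal standalone sketch of the same pattern, assuming the ollama npm package and a locally running instance (the host, model name, and num_ctx value are illustrative):

import { Ollama } from "ollama";

const client = new Ollama({ host: "http://127.0.0.1:11434" });

async function embedInBatches(textChunks, batchSize = 1) {
  const results = [];
  // Stride through the chunks, batchSize at a time
  for (let i = 0; i < textChunks.length; i += batchSize) {
    const batch = textChunks.slice(i, i + batchSize);
    // embed() accepts an array input and returns its vectors in .embeddings (number[][])
    const { embeddings } = await client.embed({
      model: "nomic-embed-text",
      input: batch,
      options: { num_ctx: 8192 },
    });
    results.push(...embeddings);
  }
  return results;
}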

View File

@@ -307,6 +307,10 @@ const KEY_MAPPING = {
envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH",
checks: [nonZero],
},
+ OllamaEmbeddingBatchSize: {
+ envKey: "OLLAMA_EMBEDDING_BATCH_SIZE",
+ checks: [nonZero],
+ },
// Gemini Embedding Settings
GeminiEmbeddingApiKey: {
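Finally, with the KEY_MAPPING entry in place the setting round-trips: the UI writes OLLAMA_EMBEDDING_BATCH_SIZE, and anyone managing the server by hand can set the same variable directly in its env file, provided the value passes the nonZero check:

// In the server's .env file (illustrative value):
// OLLAMA_EMBEDDING_BATCH_SIZE=8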