Refactor Ollama context window setting (#4909)

Timothy Carambat 2026-01-27 10:50:40 -08:00 committed by GitHub
parent 64bff91998
commit fe78e1c667
7 changed files with 205 additions and 169 deletions

View File

@@ -1,10 +1,10 @@
 import React, { useEffect, useState } from "react";
 import System from "@/models/system";
-import PreLoader from "@/components/Preloader";
 import { OLLAMA_COMMON_URLS } from "@/utils/constants";
-import { CaretDown, CaretUp, Info } from "@phosphor-icons/react";
+import { CaretDown, CaretUp, Info, CircleNotch } from "@phosphor-icons/react";
 import useProviderEndpointAutoDiscovery from "@/hooks/useProviderEndpointAutoDiscovery";
 import { Tooltip } from "react-tooltip";
+import { Link } from "react-router-dom";

 export default function OllamaLLMOptions({ settings }) {
   const {
@@ -21,9 +21,6 @@ export default function OllamaLLMOptions({ settings }) {
     initialBasePath: settings?.OllamaLLMBasePath,
     ENDPOINTS: OLLAMA_COMMON_URLS,
   });
-  const [performanceMode, setPerformanceMode] = useState(
-    settings?.OllamaLLMPerformanceMode || "base"
-  );
   const [maxTokens, setMaxTokens] = useState(
     settings?.OllamaLLMTokenLimit || ""
   );
@@ -56,14 +53,36 @@ export default function OllamaLLMOptions({ settings }) {
       <div hidden={!showAdvancedControls}>
         <div className="flex flex-col">
-          <div className="w-full flex items-start gap-4">
+          <div className="w-full flex items-start gap-4 mb-4">
             <div className="flex flex-col w-60">
               <div className="flex justify-between items-center mb-2">
-                <label className="text-white text-sm font-semibold">
-                  Ollama Base URL
-                </label>
+                <div className="flex items-center gap-1">
+                  <label className="text-white text-sm font-semibold">
+                    Ollama Base URL
+                  </label>
+                  <Info
+                    size={18}
+                    className="text-theme-text-secondary cursor-pointer"
+                    data-tooltip-id="ollama-base-url"
+                    data-tooltip-content="Enter the URL where Ollama is running."
+                  />
+                  <Tooltip
+                    id="ollama-base-url"
+                    place="top"
+                    delayShow={300}
+                    className="tooltip !text-xs !opacity-100"
+                    style={{
+                      maxWidth: "250px",
+                      whiteSpace: "normal",
+                      wordWrap: "break-word",
+                    }}
+                  />
+                </div>
                 {loading ? (
-                  <PreLoader size="6" />
+                  <CircleNotch
+                    size={16}
+                    className="text-theme-text-secondary animate-spin"
+                  />
                 ) : (
                   <>
                     {!basePathValue.value && (
@@ -89,61 +108,45 @@ export default function OllamaLLMOptions({ settings }) {
                 onChange={basePath.onChange}
                 onBlur={basePath.onBlur}
               />
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Enter the URL where Ollama is running.
-              </p>
             </div>
             <div className="flex flex-col w-60">
-              <label className="text-white text-sm font-semibold mb-2 flex items-center">
-                Performance Mode
+              <div className="flex items-center mb-2 gap-x-1">
+                <label className="text-white text-sm font-semibold block">
+                  Ollama Keep Alive
+                </label>
                 <Info
-                  size={16}
-                  className="ml-2 text-white"
-                  data-tooltip-id="performance-mode-tooltip"
+                  size={18}
+                  className="text-theme-text-secondary cursor-pointer"
+                  data-tooltip-id="ollama-keep-alive"
                 />
-              </label>
-              <select
-                name="OllamaLLMPerformanceMode"
-                required={true}
-                className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
-                value={performanceMode}
-                onChange={(e) => setPerformanceMode(e.target.value)}
-              >
-                <option value="base">Base (Default)</option>
-                <option value="maximum">Maximum</option>
-              </select>
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Choose the performance mode for the Ollama model.
-              </p>
-              <Tooltip
-                id="performance-mode-tooltip"
-                place="bottom"
-                className="tooltip !text-xs max-w-xs"
-              >
-                <p className="text-red-500">
-                  <strong>Note:</strong> Be careful with the Maximum mode. It
-                  may increase resource usage significantly.
-                </p>
-                <br />
-                <p>
-                  <strong>Base:</strong> Ollama automatically limits the context
-                  to 2048 tokens, keeping resources usage low while maintaining
-                  good performance. Suitable for most users and models.
-                </p>
-                <br />
-                <p>
-                  <strong>Maximum:</strong> Uses the full context window (up to
-                  Max Tokens). Will result in increased resource usage but
-                  allows for larger context conversations. <br />
-                  <br />
-                  This is not recommended for most users.
-                </p>
-              </Tooltip>
-            </div>
-            <div className="flex flex-col w-60">
-              <label className="text-white text-sm font-semibold block mb-2">
-                Ollama Keep Alive
-              </label>
+                <Tooltip
+                  id="ollama-keep-alive"
+                  place="top"
+                  delayShow={300}
+                  delayHide={400}
+                  clickable={true}
+                  className="tooltip !text-xs !opacity-100"
+                  style={{
+                    maxWidth: "250px",
+                    whiteSpace: "normal",
+                    wordWrap: "break-word",
+                  }}
+                >
+                  <p className="text-xs leading-[18px] font-base">
+                    Choose how long Ollama should keep your model in memory
+                    before unloading.{" "}
+                    <Link
+                      className="underline text-blue-300"
+                      to="https://docs.ollama.com/faq#how-do-i-keep-a-model-loaded-in-memory-or-make-it-unload-immediately"
+                      target="_blank"
+                      rel="noreferrer"
+                    >
+                      Learn more &rarr;
+                    </Link>
+                  </p>
+                </Tooltip>
+              </div>
               <select
                 name="OllamaLLMKeepAliveSeconds"
                 required={true}
@@ -155,26 +158,53 @@ export default function OllamaLLMOptions({ settings }) {
                 <option value="3600">1 hour</option>
                 <option value="-1">Forever</option>
               </select>
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Choose how long Ollama should keep your model in memory before
-                unloading.
-                <a
-                  className="underline text-blue-300"
-                  href="https://github.com/ollama/ollama/blob/main/docs/faq.md#how-do-i-keep-a-model-loaded-in-memory-or-make-it-unload-immediately"
-                  target="_blank"
-                  rel="noreferrer"
-                >
-                  {" "}
-                  Learn more &rarr;
-                </a>
-              </p>
             </div>
           </div>
           <div className="w-full flex items-start gap-4">
             <div className="flex flex-col w-60">
-              <label className="text-white text-sm font-semibold block mb-2">
-                Max Tokens (Optional)
-              </label>
+              <div className="flex items-center mb-2 gap-x-1">
+                <label className="text-white text-sm font-semibold block">
+                  Model context window
+                </label>
+                <Info
+                  size={18}
+                  className="text-theme-text-secondary cursor-pointer"
+                  data-tooltip-id="ollama-model-context-window"
+                />
+                <Tooltip
+                  id="ollama-model-context-window"
+                  place="top"
+                  delayShow={300}
+                  delayHide={400}
+                  clickable={true}
+                  className="tooltip !text-xs !opacity-100"
+                  style={{
+                    maxWidth: "250px",
+                    whiteSpace: "normal",
+                    wordWrap: "break-word",
+                  }}
+                >
+                  <p className="text-xs leading-[18px] font-base">
+                    Specify the maximum number of tokens that can be used for
+                    the model context window.
+                    <br /> <br />
+                    If you leave this field blank, the context window limit
+                    will be auto-detected from the model and applied to all
+                    chats. If auto-detection fails, a fallback context window
+                    limit of 4096 will be used.
+                    <br /> <br />
+                    <b>Important:</b> Some models have very large context
+                    windows; using the full context window limit can
+                    dramatically increase the memory usage of your system. For
+                    this reason, we will automatically cap the context window
+                    limit to 16,384 tokens if the model supports more than
+                    that and no value is specified.
+                    <br /> <br />
+                    If an invalid value is entered, AnythingLLM will handle
+                    this for you so that chats do not fail.
+                  </p>
+                </Tooltip>
+              </div>
               <input
                 type="number"
                 name="OllamaLLMTokenLimit"
@@ -189,28 +219,44 @@ export default function OllamaLLMOptions({ settings }) {
                 required={false}
                 autoComplete="off"
               />
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Override the context window limit. Leave empty to auto-detect
-                from the model (defaults to 4096 if detection fails).
-              </p>
             </div>
-          </div>
-          <div className="w-full flex items-start gap-4 mt-4">
-            <div className="flex flex-col w-100">
+            <div className="flex flex-col w-60">
+              <div className="flex items-center mb-2 gap-x-1">
                 <label className="text-white text-sm font-semibold">
-                  Auth Token
+                  Authentication Token
                 </label>
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Enter a <code>Bearer</code> Auth Token for interacting with your
-                Ollama server.
-                <br />
-                Used <b>only</b> if running Ollama behind an authentication
-                server.
-              </p>
+                <Info
+                  size={18}
+                  className="text-theme-text-secondary cursor-pointer"
+                  data-tooltip-id="ollama-authentication-token"
+                />
+                <Tooltip
+                  id="ollama-authentication-token"
+                  place="top"
+                  delayShow={300}
+                  delayHide={400}
+                  clickable={true}
+                  className="tooltip !text-xs !opacity-100"
+                  style={{
+                    maxWidth: "250px",
+                    whiteSpace: "normal",
+                    wordWrap: "break-word",
+                  }}
+                >
+                  <p className="text-xs leading-[18px] font-base">
+                    Enter a <code>Bearer</code> Auth Token for interacting with
+                    your Ollama server.
+                    <br /> <br />
+                    Used <b>only</b> if running Ollama behind an authentication
+                    server.
+                  </p>
+                </Tooltip>
+              </div>
               <input
                 type="password"
                 name="OllamaLLMAuthToken"
-                className="border-none bg-theme-settings-input-bg mt-2 text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg outline-none block w-full p-2.5 focus:outline-primary-button active:outline-primary-button"
+                className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg outline-none block w-full p-2.5 focus:outline-primary-button active:outline-primary-button"
                 placeholder="Ollama Auth Token"
                 defaultValue={
                   settings?.OllamaLLMAuthToken ? "*".repeat(20) : ""

View File

@@ -518,7 +518,6 @@ const SystemSettings = {
       OllamaLLMModelPref: process.env.OLLAMA_MODEL_PREF,
       OllamaLLMTokenLimit: process.env.OLLAMA_MODEL_TOKEN_LIMIT || null,
       OllamaLLMKeepAliveSeconds: process.env.OLLAMA_KEEP_ALIVE_TIMEOUT ?? 300,
-      OllamaLLMPerformanceMode: process.env.OLLAMA_PERFORMANCE_MODE ?? "base",

       // Novita LLM Keys
       NovitaLLMApiKey: !!process.env.NOVITA_LLM_API_KEY,

View File

@@ -1,40 +0,0 @@
-# Common Issues with Ollama
-
-If you encounter an error stating `llama:streaming - could not stream chat. Error: connect ECONNREFUSED 172.17.0.1:11434` when using AnythingLLM in a Docker container, this indicates that the IP of the Host inside of the virtual docker network does not bind to port 11434 of the host system by default, due to Ollama's restriction to localhost and 127.0.0.1. To resolve this issue and ensure proper communication between the Dockerized AnythingLLM and the Ollama service, you must configure Ollama to bind to 0.0.0.0 or a specific IP address.
-
-### Setting Environment Variables on Mac
-
-If Ollama is run as a macOS application, environment variables should be set using `launchctl`:
-
-1. For each environment variable, call `launchctl setenv`.
-
-   ```bash
-   launchctl setenv OLLAMA_HOST "0.0.0.0"
-   ```
-
-2. Restart the Ollama application.
-
-### Setting Environment Variables on Linux
-
-If Ollama is run as a systemd service, environment variables should be set using `systemctl`:
-
-1. Edit the systemd service by calling `systemctl edit ollama.service`. This will open an editor.
-2. For each environment variable, add a line `Environment` under the section `[Service]`:
-
-   ```ini
-   [Service]
-   Environment="OLLAMA_HOST=0.0.0.0"
-   ```
-
-3. Save and exit.
-4. Reload `systemd` and restart Ollama:
-
-   ```bash
-   systemctl daemon-reload
-   systemctl restart ollama
-   ```
-
-### Setting Environment Variables on Windows
-
-On Windows, Ollama inherits your user and system environment variables.
-
-1. First, quit Ollama by clicking on it in the taskbar.
-2. Edit system environment variables from the Control Panel.
-3. Edit or create new variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
-4. Click OK/Apply to save.
-5. Run `ollama` from a new terminal window.
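
Note: the removed doc above described the classic Docker symptom (`ECONNREFUSED 172.17.0.1:11434`). A minimal sketch, not part of this commit, of probing that from Node using Ollama's `/api/tags` endpoint; the bridge IP is the one from the error message above:

```js
// Hypothetical connectivity probe -- illustrative only, not in this commit.
// From a Dockerized AnythingLLM, this only succeeds once Ollama is bound
// to 0.0.0.0 (via OLLAMA_HOST) instead of 127.0.0.1.
const OLLAMA_URL = "http://172.17.0.1:11434"; // Docker bridge IP from the error above

async function probeOllama() {
  try {
    const res = await fetch(`${OLLAMA_URL}/api/tags`); // lists installed models
    console.log("Ollama reachable:", res.ok);
  } catch (e) {
    // ECONNREFUSED here reproduces the exact failure the removed doc described.
    console.error("Ollama unreachable:", e.message);
  }
}

probeOllama();
```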

View File

@@ -23,7 +23,6 @@ class OllamaAILLM {
     this.authToken = process.env.OLLAMA_AUTH_TOKEN;
     this.basePath = process.env.OLLAMA_BASE_PATH;
     this.model = modelPreference || process.env.OLLAMA_MODEL_PREF;
-    this.performanceMode = process.env.OLLAMA_PERFORMANCE_MODE || "base";
     this.keepAlive = process.env.OLLAMA_KEEP_ALIVE_TIMEOUT
       ? Number(process.env.OLLAMA_KEEP_ALIVE_TIMEOUT)
       : 300; // Default 5-minute timeout for Ollama model loading.
@@ -43,9 +42,7 @@ class OllamaAILLM {
     this.limits = null;

     OllamaAILLM.cacheContextWindows(true);
-    this.#log(
-      `initialized with\nmodel: ${this.model}\nperf: ${this.performanceMode}`
-    );
+    this.#log(`initialized with\nmodel: ${this.model}`);
   }

   #log(text, ...args) {
@@ -64,6 +61,9 @@ class OllamaAILLM {
       system: this.promptWindowLimit() * 0.15,
       user: this.promptWindowLimit() * 0.7,
     };
+    this.#log(
+      `model ${this.model} is using a max context window of ${this.promptWindowLimit()}/${OllamaAILLM.maxContextWindow(this.model)} tokens.`
+    );
   }

   /**
@@ -176,8 +176,7 @@ class OllamaAILLM {
     }

     let userDefinedLimit = null;
-    const systemDefinedLimit =
-      Number(this.modelContextWindows[modelName]) || 4096;
+    const systemDefinedLimit = OllamaAILLM.maxContextWindow(modelName);

     if (
       process.env.OLLAMA_MODEL_TOKEN_LIMIT &&
@@ -190,13 +189,23 @@ class OllamaAILLM {
     // so we return the minimum of the two, if there is no user defined limit, we return the system defined limit as-is.
     if (userDefinedLimit !== null)
       return Math.min(userDefinedLimit, systemDefinedLimit);
-    return systemDefinedLimit;
+
+    // Cap the context window limit to 16,384 tokens if the model supports more than that and no value is specified by the user.
+    // This prevents super-large context windows from being used if the user does not specify a value
+    // as well as also having smaller context windows use the full context window limit.
+    return Math.min(systemDefinedLimit, 16384);
   }

   promptWindowLimit() {
     return this.constructor.promptWindowLimit(this.model);
   }

+  static maxContextWindow(modelName = null) {
+    if (Object.keys(OllamaAILLM.modelContextWindows).length === 0 || !modelName)
+      return 4096;
+    return Number(OllamaAILLM.modelContextWindows[modelName]) || 16384;
+  }
+
   async isValidChatCompletionModel(_ = "") {
     return true;
   }
@@ -266,10 +275,7 @@ class OllamaAILLM {
         options: {
           temperature,
           use_mlock: true,
-          // There are currently only two performance settings so if its not "base" - its max context.
-          ...(this.performanceMode === "base"
-            ? {} // TODO: if in base mode, maybe we just use half the context window when below <10K?
-            : { num_ctx: this.promptWindowLimit() }),
+          num_ctx: this.promptWindowLimit(),
         },
       })
       .then((res) => {
@@ -321,10 +327,7 @@ class OllamaAILLM {
         options: {
           temperature,
           use_mlock: true,
-          // There are currently only two performance settings so if its not "base" - its max context.
-          ...(this.performanceMode === "base"
-            ? {}
-            : { num_ctx: this.promptWindowLimit() }),
+          num_ctx: this.promptWindowLimit(),
         },
       }),
       messages,
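
Read together, the hunks above replace the old base/maximum performance toggle with a single resolution rule for `num_ctx`. A condensed sketch of that rule, restated outside the class for clarity (the function shapes mirror the diff; the worked cases in the trailing comments are illustrative):

```js
// Condensed restatement of the resolution rule from the hunks above.
// `modelContextWindows` stands in for the class's cached { model: tokens } map.
function maxContextWindow(modelContextWindows, modelName = null) {
  if (Object.keys(modelContextWindows).length === 0 || !modelName) return 4096;
  return Number(modelContextWindows[modelName]) || 16384;
}

function promptWindowLimit(modelContextWindows, modelName, userLimit = null) {
  const systemDefinedLimit = maxContextWindow(modelContextWindows, modelName);
  // A user-defined limit always wins, but never beyond what the model supports.
  if (userLimit !== null) return Math.min(userLimit, systemDefinedLimit);
  // No user value: cap auto-detected windows at 16,384 tokens to bound memory use.
  return Math.min(systemDefinedLimit, 16384);
}

// Illustrative cases:
// promptWindowLimit({ "llama3.1": 131072 }, "llama3.1")       // -> 16384 (capped)
// promptWindowLimit({ "llama3.1": 131072 }, "llama3.1", 8000) // -> 8000  (user value)
// promptWindowLimit({}, "llama3.1")                           // -> 4096  (detection failed)
```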

View File

@@ -464,16 +464,10 @@ class OllamaLangchainChatModel {
     });
   }

-  static performanceMode() {
-    return process.env.OLLAMA_PERFORMANCE_MODE || "base";
-  }
-
   static queryOptions(config = {}) {
     const model = config?.model || process.env.OLLAMA_MODEL_PREF;
     return {
-      ...(this.performanceMode() === "base"
-        ? {}
-        : { num_ctx: OllamaAILLM.promptWindowLimit(model) }),
+      num_ctx: OllamaAILLM.promptWindowLimit(model),
     };
   }
 }

View File

@@ -25,6 +25,7 @@ class OllamaProvider extends InheritMultiple([Provider, UnTooled]) {
     this._client = new Ollama({
       host: process.env.OLLAMA_BASE_PATH,
       headers: headers,
+      fetch: this.#applyFetch(),
     });
     this.model = model;
     this.verbose = true;
@@ -38,15 +39,12 @@ class OllamaProvider extends InheritMultiple([Provider, UnTooled]) {
     return true;
   }

-  get performanceMode() {
-    return process.env.OLLAMA_PERFORMANCE_MODE || "base";
-  }
-
   get queryOptions() {
+    this.providerLog(
+      `${this.model} is using a max context window of ${OllamaAILLM.promptWindowLimit(this.model)}/${OllamaAILLM.maxContextWindow(this.model)} tokens.`
+    );
     return {
-      ...(this.performanceMode === "base"
-        ? {}
-        : { num_ctx: OllamaAILLM.promptWindowLimit(this.model) }),
+      num_ctx: OllamaAILLM.promptWindowLimit(this.model),
     };
   }
@@ -366,6 +364,46 @@ class OllamaProvider extends InheritMultiple([Provider, UnTooled]) {
   getCost(_usage) {
     return 0;
   }

+  /**
+   * Apply a custom fetch function to the Ollama client.
+   * This is useful when we want to bypass the default 5m timeout for global fetch
+   * for machines which run responses very slowly.
+   * @returns {Function} The custom fetch function.
+   */
+  #applyFetch() {
+    try {
+      if (!("OLLAMA_RESPONSE_TIMEOUT" in process.env)) return fetch;
+      const { Agent } = require("undici");
+      const moment = require("moment");
+
+      let timeout = process.env.OLLAMA_RESPONSE_TIMEOUT;
+      if (!timeout || isNaN(Number(timeout)) || Number(timeout) <= 5 * 60_000) {
+        this.providerLog(
+          "Timeout option was not set, is not a number, or is less than 5 minutes in ms - falling back to default",
+          { timeout }
+        );
+        return fetch;
+      } else timeout = Number(timeout);
+
+      const noTimeoutFetch = (input, init = {}) => {
+        return fetch(input, {
+          ...init,
+          dispatcher: new Agent({ headersTimeout: timeout }),
+        });
+      };
+
+      const humanDiff = moment.duration(timeout).humanize();
+      this.providerLog(`Applying custom fetch w/timeout of ${humanDiff}.`);
+      return noTimeoutFetch;
+    } catch (error) {
+      this.providerLog(
+        "Error applying custom fetch - using default fetch",
+        error
+      );
+      return fetch;
+    }
+  }
 }

 module.exports = OllamaProvider;
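
The `#applyFetch` addition works because Node's global fetch is backed by undici, whose per-request `dispatcher` option accepts an `Agent` with a custom `headersTimeout` (the built-in default is roughly five minutes, which slow local models can exceed). A minimal standalone sketch of the same pattern; the ten-minute value is an example, not something set by this commit:

```js
// Standalone sketch of the #applyFetch pattern above.
const { Agent } = require("undici");

const TEN_MINUTES_MS = 10 * 60_000; // example OLLAMA_RESPONSE_TIMEOUT value

const patientFetch = (input, init = {}) =>
  fetch(input, {
    ...init,
    // Wait up to ten minutes for response headers from a slow model.
    dispatcher: new Agent({ headersTimeout: TEN_MINUTES_MS }),
  });

// The Ollama JS client accepts a custom fetch, as the constructor hunk shows:
// new Ollama({ host: process.env.OLLAMA_BASE_PATH, fetch: patientFetch });
```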

View File

@@ -125,10 +125,6 @@ const KEY_MAPPING = {
     envKey: "OLLAMA_MODEL_TOKEN_LIMIT",
     checks: [],
   },
-  OllamaLLMPerformanceMode: {
-    envKey: "OLLAMA_PERFORMANCE_MODE",
-    checks: [],
-  },
   OllamaLLMKeepAliveSeconds: {
     envKey: "OLLAMA_KEEP_ALIVE_TIMEOUT",
     checks: [isInteger],
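
For context on the removal above: each `KEY_MAPPING` entry pairs a settings-payload key with the env var it writes and the validators to run first, so dropping `OllamaLLMPerformanceMode` means the UI can no longer persist `OLLAMA_PERFORMANCE_MODE`. A rough sketch of how such a mapping is typically consumed; this consumer is an assumption for illustration, not the file's actual implementation:

```js
// Hypothetical consumer of KEY_MAPPING -- the real update helper may differ.
function applySetting(key, value) {
  const mapping = KEY_MAPPING[key];
  if (!mapping) return { error: `${key} is not a supported setting.` };

  // Run each validator; assume a check returns an error string or null.
  for (const check of mapping.checks) {
    const error = check(value);
    if (error) return { error };
  }

  process.env[mapping.envKey] = String(value); // e.g. OLLAMA_KEEP_ALIVE_TIMEOUT
  return { error: null };
}
```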