Refactor Ollama context window setting (#4909)

Timothy Carambat 2026-01-27 10:50:40 -08:00 committed by GitHub
parent 64bff91998
commit fe78e1c667
7 changed files with 205 additions and 169 deletions

View File

@@ -1,10 +1,10 @@
 import React, { useEffect, useState } from "react";
 import System from "@/models/system";
-import PreLoader from "@/components/Preloader";
 import { OLLAMA_COMMON_URLS } from "@/utils/constants";
-import { CaretDown, CaretUp, Info } from "@phosphor-icons/react";
+import { CaretDown, CaretUp, Info, CircleNotch } from "@phosphor-icons/react";
 import useProviderEndpointAutoDiscovery from "@/hooks/useProviderEndpointAutoDiscovery";
 import { Tooltip } from "react-tooltip";
+import { Link } from "react-router-dom";

 export default function OllamaLLMOptions({ settings }) {
   const {
@@ -21,9 +21,6 @@ export default function OllamaLLMOptions({ settings }) {
     initialBasePath: settings?.OllamaLLMBasePath,
     ENDPOINTS: OLLAMA_COMMON_URLS,
   });
-  const [performanceMode, setPerformanceMode] = useState(
-    settings?.OllamaLLMPerformanceMode || "base"
-  );
   const [maxTokens, setMaxTokens] = useState(
     settings?.OllamaLLMTokenLimit || ""
   );
@@ -56,14 +53,36 @@ export default function OllamaLLMOptions({ settings }) {
       <div hidden={!showAdvancedControls}>
         <div className="flex flex-col">
-          <div className="w-full flex items-start gap-4">
+          <div className="w-full flex items-start gap-4 mb-4">
             <div className="flex flex-col w-60">
               <div className="flex justify-between items-center mb-2">
-                <label className="text-white text-sm font-semibold">
-                  Ollama Base URL
-                </label>
+                <div className="flex items-center gap-1">
+                  <label className="text-white text-sm font-semibold">
+                    Ollama Base URL
+                  </label>
+                  <Info
+                    size={18}
+                    className="text-theme-text-secondary cursor-pointer"
+                    data-tooltip-id="ollama-base-url"
+                    data-tooltip-content="Enter the URL where Ollama is running."
+                  />
+                  <Tooltip
+                    id="ollama-base-url"
+                    place="top"
+                    delayShow={300}
+                    className="tooltip !text-xs !opacity-100"
+                    style={{
+                      maxWidth: "250px",
+                      whiteSpace: "normal",
+                      wordWrap: "break-word",
+                    }}
+                  />
+                </div>
                 {loading ? (
-                  <PreLoader size="6" />
+                  <CircleNotch
+                    size={16}
+                    className="text-theme-text-secondary animate-spin"
+                  />
                 ) : (
                   <>
                     {!basePathValue.value && (
@@ -89,61 +108,45 @@ export default function OllamaLLMOptions({ settings }) {
                 onChange={basePath.onChange}
                 onBlur={basePath.onBlur}
               />
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Enter the URL where Ollama is running.
-              </p>
             </div>
             <div className="flex flex-col w-60">
-              <label className="text-white text-sm font-semibold mb-2 flex items-center">
-                Performance Mode
+              <div className="flex items-center mb-2 gap-x-1">
+                <label className="text-white text-sm font-semibold block">
+                  Ollama Keep Alive
+                </label>
                 <Info
-                  size={16}
-                  className="ml-2 text-white"
-                  data-tooltip-id="performance-mode-tooltip"
+                  size={18}
+                  className="text-theme-text-secondary cursor-pointer"
+                  data-tooltip-id="ollama-keep-alive"
                 />
-              </label>
-              <select
-                name="OllamaLLMPerformanceMode"
-                required={true}
-                className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
-                value={performanceMode}
-                onChange={(e) => setPerformanceMode(e.target.value)}
-              >
-                <option value="base">Base (Default)</option>
-                <option value="maximum">Maximum</option>
-              </select>
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Choose the performance mode for the Ollama model.
-              </p>
-              <Tooltip
-                id="performance-mode-tooltip"
-                place="bottom"
-                className="tooltip !text-xs max-w-xs"
-              >
-                <p className="text-red-500">
-                  <strong>Note:</strong> Be careful with the Maximum mode. It
-                  may increase resource usage significantly.
-                </p>
-                <br />
-                <p>
-                  <strong>Base:</strong> Ollama automatically limits the context
-                  to 2048 tokens, keeping resources usage low while maintaining
-                  good performance. Suitable for most users and models.
-                </p>
-                <br />
-                <p>
-                  <strong>Maximum:</strong> Uses the full context window (up to
-                  Max Tokens). Will result in increased resource usage but
-                  allows for larger context conversations. <br />
-                  <br />
-                  This is not recommended for most users.
-                </p>
-              </Tooltip>
-            </div>
-            <div className="flex flex-col w-60">
-              <label className="text-white text-sm font-semibold block mb-2">
-                Ollama Keep Alive
-              </label>
+                <Tooltip
+                  id="ollama-keep-alive"
+                  place="top"
+                  delayShow={300}
+                  delayHide={400}
+                  clickable={true}
+                  className="tooltip !text-xs !opacity-100"
+                  style={{
+                    maxWidth: "250px",
+                    whiteSpace: "normal",
+                    wordWrap: "break-word",
+                  }}
+                >
+                  <p className="text-xs leading-[18px] font-base">
+                    Choose how long Ollama should keep your model in memory
+                    before unloading.{" "}
+                    <Link
+                      className="underline text-blue-300"
+                      to="https://docs.ollama.com/faq#how-do-i-keep-a-model-loaded-in-memory-or-make-it-unload-immediately"
+                      target="_blank"
+                      rel="noreferrer"
+                    >
+                      Learn more &rarr;
+                    </Link>
+                  </p>
+                </Tooltip>
+              </div>
               <select
                 name="OllamaLLMKeepAliveSeconds"
                 required={true}
@@ -155,26 +158,53 @@ export default function OllamaLLMOptions({ settings }) {
                 <option value="3600">1 hour</option>
                 <option value="-1">Forever</option>
               </select>
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Choose how long Ollama should keep your model in memory before
-                unloading.
-                <a
-                  className="underline text-blue-300"
-                  href="https://github.com/ollama/ollama/blob/main/docs/faq.md#how-do-i-keep-a-model-loaded-in-memory-or-make-it-unload-immediately"
-                  target="_blank"
-                  rel="noreferrer"
-                >
-                  {" "}
-                  Learn more &rarr;
-                </a>
-              </p>
             </div>
           </div>
           <div className="w-full flex items-start gap-4">
             <div className="flex flex-col w-60">
-              <label className="text-white text-sm font-semibold block mb-2">
-                Max Tokens (Optional)
-              </label>
+              <div className="flex items-center mb-2 gap-x-1">
+                <label className="text-white text-sm font-semibold block">
+                  Model context window
+                </label>
+                <Info
+                  size={18}
+                  className="text-theme-text-secondary cursor-pointer"
+                  data-tooltip-id="ollama-model-context-window"
+                />
+                <Tooltip
+                  id="ollama-model-context-window"
+                  place="top"
+                  delayShow={300}
+                  delayHide={400}
+                  clickable={true}
+                  className="tooltip !text-xs !opacity-100"
+                  style={{
+                    maxWidth: "250px",
+                    whiteSpace: "normal",
+                    wordWrap: "break-word",
+                  }}
+                >
+                  <p className="text-xs leading-[18px] font-base">
+                    Specify the maximum number of tokens that can be used for
+                    the model context window.
+                    <br /> <br />
+                    If you leave this field blank, the context window limit
+                    will be auto-detected from the model and applied to all
+                    chats. If auto-detection fails, a fallback context window
+                    limit of 4096 will be used.
+                    <br /> <br />
+                    <b>Important:</b> Some models have very large context
+                    windows; using the full context window limit can
+                    dramatically increase the memory usage of your system. For
+                    this reason, we will automatically cap the context window
+                    limit to 16,384 tokens if the model supports more than
+                    that and no value is specified.
+                    <br /> <br />
+                    If an invalid value is entered, AnythingLLM will handle
+                    this for you so that chats do not fail.
+                  </p>
+                </Tooltip>
+              </div>
               <input
                 type="number"
                 name="OllamaLLMTokenLimit"
@@ -189,28 +219,44 @@ export default function OllamaLLMOptions({ settings }) {
                 required={false}
                 autoComplete="off"
               />
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Override the context window limit. Leave empty to auto-detect
-                from the model (defaults to 4096 if detection fails).
-              </p>
             </div>
-          </div>
-          <div className="w-full flex items-start gap-4 mt-4">
-            <div className="flex flex-col w-100">
+            <div className="flex flex-col w-60">
+              <div className="flex items-center mb-2 gap-x-1">
                 <label className="text-white text-sm font-semibold">
-                  Auth Token
+                  Authentication Token
                 </label>
-              <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
-                Enter a <code>Bearer</code> Auth Token for interacting with your
-                Ollama server.
-                <br />
-                Used <b>only</b> if running Ollama behind an authentication
-                server.
-              </p>
+                <Info
+                  size={18}
+                  className="text-theme-text-secondary cursor-pointer"
+                  data-tooltip-id="ollama-authentication-token"
+                />
+                <Tooltip
+                  id="ollama-authentication-token"
+                  place="top"
+                  delayShow={300}
+                  delayHide={400}
+                  clickable={true}
+                  className="tooltip !text-xs !opacity-100"
+                  style={{
+                    maxWidth: "250px",
+                    whiteSpace: "normal",
+                    wordWrap: "break-word",
+                  }}
+                >
+                  <p className="text-xs leading-[18px] font-base">
+                    Enter a <code>Bearer</code> Auth Token for interacting with
+                    your Ollama server.
+                    <br /> <br />
+                    Used <b>only</b> if running Ollama behind an authentication
+                    server.
+                  </p>
+                </Tooltip>
+              </div>
               <input
                 type="password"
                 name="OllamaLLMAuthToken"
-                className="border-none bg-theme-settings-input-bg mt-2 text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg outline-none block w-full p-2.5 focus:outline-primary-button active:outline-primary-button"
+                className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg outline-none block w-full p-2.5 focus:outline-primary-button active:outline-primary-button"
                 placeholder="Ollama Auth Token"
                 defaultValue={
                   settings?.OllamaLLMAuthToken ? "*".repeat(20) : ""

View File

@@ -518,7 +518,6 @@ const SystemSettings = {
       OllamaLLMModelPref: process.env.OLLAMA_MODEL_PREF,
       OllamaLLMTokenLimit: process.env.OLLAMA_MODEL_TOKEN_LIMIT || null,
       OllamaLLMKeepAliveSeconds: process.env.OLLAMA_KEEP_ALIVE_TIMEOUT ?? 300,
-      OllamaLLMPerformanceMode: process.env.OLLAMA_PERFORMANCE_MODE ?? "base",

       // Novita LLM Keys
       NovitaLLMApiKey: !!process.env.NOVITA_LLM_API_KEY,

View File

@@ -1,40 +0,0 @@
-# Common Issues with Ollama
-
-If you encounter an error stating `llama:streaming - could not stream chat. Error: connect ECONNREFUSED 172.17.0.1:11434` when using AnythingLLM in a Docker container, this indicates that the IP of the Host inside of the virtual docker network does not bind to port 11434 of the host system by default, due to Ollama's restriction to localhost and 127.0.0.1. To resolve this issue and ensure proper communication between the Dockerized AnythingLLM and the Ollama service, you must configure Ollama to bind to 0.0.0.0 or a specific IP address.
-
-### Setting Environment Variables on Mac
-
-If Ollama is run as a macOS application, environment variables should be set using `launchctl`:
-
-1. For each environment variable, call `launchctl setenv`.
-
-   ```bash
-   launchctl setenv OLLAMA_HOST "0.0.0.0"
-   ```
-
-2. Restart the Ollama application.
-
-### Setting Environment Variables on Linux
-
-If Ollama is run as a systemd service, environment variables should be set using `systemctl`:
-
-1. Edit the systemd service by calling `systemctl edit ollama.service`. This will open an editor.
-2. For each environment variable, add a line `Environment` under the section `[Service]`:
-
-   ```ini
-   [Service]
-   Environment="OLLAMA_HOST=0.0.0.0"
-   ```
-
-3. Save and exit.
-4. Reload `systemd` and restart Ollama:
-
-   ```bash
-   systemctl daemon-reload
-   systemctl restart ollama
-   ```
-
-### Setting Environment Variables on Windows
-
-On Windows, Ollama inherits your user and system environment variables.
-
-1. First, quit Ollama by clicking on it in the taskbar.
-2. Edit system environment variables from the Control Panel.
-3. Edit or create new variable(s) for your user account for `OLLAMA_HOST`, `OLLAMA_MODELS`, etc.
-4. Click OK/Apply to save.
-5. Run `ollama` from a new terminal window.
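
Note: the removed doc above described the classic Docker symptom (`ECONNREFUSED 172.17.0.1:11434`). A minimal sketch, not part of this commit, of probing that from Node using Ollama's `/api/tags` endpoint; the bridge IP is the one from the error message above:

```js
// Hypothetical connectivity probe -- illustrative only, not in this commit.
// From a Dockerized AnythingLLM, this only succeeds once Ollama is bound
// to 0.0.0.0 (via OLLAMA_HOST) instead of 127.0.0.1.
const OLLAMA_URL = "http://172.17.0.1:11434"; // Docker bridge IP from the error above

async function probeOllama() {
  try {
    const res = await fetch(`${OLLAMA_URL}/api/tags`); // lists installed models
    console.log("Ollama reachable:", res.ok);
  } catch (e) {
    // ECONNREFUSED here reproduces the exact failure the removed doc described.
    console.error("Ollama unreachable:", e.message);
  }
}

probeOllama();
```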

View File

@@ -23,7 +23,6 @@ class OllamaAILLM {
     this.authToken = process.env.OLLAMA_AUTH_TOKEN;
     this.basePath = process.env.OLLAMA_BASE_PATH;
     this.model = modelPreference || process.env.OLLAMA_MODEL_PREF;
-    this.performanceMode = process.env.OLLAMA_PERFORMANCE_MODE || "base";
     this.keepAlive = process.env.OLLAMA_KEEP_ALIVE_TIMEOUT
       ? Number(process.env.OLLAMA_KEEP_ALIVE_TIMEOUT)
       : 300; // Default 5-minute timeout for Ollama model loading.
@@ -43,9 +42,7 @@ class OllamaAILLM {
     this.limits = null;

     OllamaAILLM.cacheContextWindows(true);
-    this.#log(
-      `initialized with\nmodel: ${this.model}\nperf: ${this.performanceMode}`
-    );
+    this.#log(`initialized with\nmodel: ${this.model}`);
   }

   #log(text, ...args) {
@@ -64,6 +61,9 @@ class OllamaAILLM {
       system: this.promptWindowLimit() * 0.15,
       user: this.promptWindowLimit() * 0.7,
     };
+    this.#log(
+      `model ${this.model} is using a max context window of ${this.promptWindowLimit()}/${OllamaAILLM.maxContextWindow(this.model)} tokens.`
+    );
   }

   /**
@@ -176,8 +176,7 @@ class OllamaAILLM {
     }

     let userDefinedLimit = null;
-    const systemDefinedLimit =
-      Number(this.modelContextWindows[modelName]) || 4096;
+    const systemDefinedLimit = OllamaAILLM.maxContextWindow(modelName);

     if (
       process.env.OLLAMA_MODEL_TOKEN_LIMIT &&
@@ -190,13 +189,23 @@ class OllamaAILLM {
     // so we return the minimum of the two, if there is no user defined limit, we return the system defined limit as-is.
     if (userDefinedLimit !== null)
       return Math.min(userDefinedLimit, systemDefinedLimit);
-    return systemDefinedLimit;
+
+    // Cap the context window limit to 16,384 tokens if the model supports more than that and no value is specified by the user.
+    // This prevents super-large context windows from being used if the user does not specify a value
+    // as well as also having smaller context windows use the full context window limit.
+    return Math.min(systemDefinedLimit, 16384);
   }

   promptWindowLimit() {
     return this.constructor.promptWindowLimit(this.model);
   }

+  static maxContextWindow(modelName = null) {
+    if (Object.keys(OllamaAILLM.modelContextWindows).length === 0 || !modelName)
+      return 4096;
+    return Number(OllamaAILLM.modelContextWindows[modelName]) || 16384;
+  }
+
   async isValidChatCompletionModel(_ = "") {
     return true;
   }
@@ -266,10 +275,7 @@ class OllamaAILLM {
         options: {
           temperature,
           use_mlock: true,
-          // There are currently only two performance settings so if its not "base" - its max context.
-          ...(this.performanceMode === "base"
-            ? {} // TODO: if in base mode, maybe we just use half the context window when below <10K?
-            : { num_ctx: this.promptWindowLimit() }),
+          num_ctx: this.promptWindowLimit(),
         },
       })
       .then((res) => {
@@ -321,10 +327,7 @@ class OllamaAILLM {
         options: {
           temperature,
           use_mlock: true,
-          // There are currently only two performance settings so if its not "base" - its max context.
-          ...(this.performanceMode === "base"
-            ? {}
-            : { num_ctx: this.promptWindowLimit() }),
+          num_ctx: this.promptWindowLimit(),
         },
       }),
       messages,
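
Read together, the hunks above replace the old base/maximum performance toggle with a single resolution rule for `num_ctx`. A condensed sketch of that rule, restated outside the class for clarity (the function shapes mirror the diff; the worked cases in the trailing comments are illustrative):

```js
// Condensed restatement of the resolution rule from the hunks above.
// `modelContextWindows` stands in for the class's cached { model: tokens } map.
function maxContextWindow(modelContextWindows, modelName = null) {
  if (Object.keys(modelContextWindows).length === 0 || !modelName) return 4096;
  return Number(modelContextWindows[modelName]) || 16384;
}

function promptWindowLimit(modelContextWindows, modelName, userLimit = null) {
  const systemDefinedLimit = maxContextWindow(modelContextWindows, modelName);
  // A user-defined limit always wins, but never beyond what the model supports.
  if (userLimit !== null) return Math.min(userLimit, systemDefinedLimit);
  // No user value: cap auto-detected windows at 16,384 tokens to bound memory use.
  return Math.min(systemDefinedLimit, 16384);
}

// Illustrative cases:
// promptWindowLimit({ "llama3.1": 131072 }, "llama3.1")       // -> 16384 (capped)
// promptWindowLimit({ "llama3.1": 131072 }, "llama3.1", 8000) // -> 8000  (user value)
// promptWindowLimit({}, "llama3.1")                           // -> 4096  (detection failed)
```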

View File

@@ -464,16 +464,10 @@ class OllamaLangchainChatModel {
     });
   }

-  static performanceMode() {
-    return process.env.OLLAMA_PERFORMANCE_MODE || "base";
-  }
-
   static queryOptions(config = {}) {
     const model = config?.model || process.env.OLLAMA_MODEL_PREF;
     return {
-      ...(this.performanceMode() === "base"
-        ? {}
-        : { num_ctx: OllamaAILLM.promptWindowLimit(model) }),
+      num_ctx: OllamaAILLM.promptWindowLimit(model),
     };
   }
 }

View File

@@ -25,6 +25,7 @@ class OllamaProvider extends InheritMultiple([Provider, UnTooled]) {
     this._client = new Ollama({
       host: process.env.OLLAMA_BASE_PATH,
       headers: headers,
+      fetch: this.#applyFetch(),
     });
     this.model = model;
     this.verbose = true;
@@ -38,15 +39,12 @@ class OllamaProvider extends InheritMultiple([Provider, UnTooled]) {
     return true;
   }

-  get performanceMode() {
-    return process.env.OLLAMA_PERFORMANCE_MODE || "base";
-  }
-
   get queryOptions() {
+    this.providerLog(
+      `${this.model} is using a max context window of ${OllamaAILLM.promptWindowLimit(this.model)}/${OllamaAILLM.maxContextWindow(this.model)} tokens.`
+    );
     return {
-      ...(this.performanceMode === "base"
-        ? {}
-        : { num_ctx: OllamaAILLM.promptWindowLimit(this.model) }),
+      num_ctx: OllamaAILLM.promptWindowLimit(this.model),
     };
   }
@@ -366,6 +364,46 @@ class OllamaProvider extends InheritMultiple([Provider, UnTooled]) {
   getCost(_usage) {
     return 0;
   }

+  /**
+   * Apply a custom fetch function to the Ollama client.
+   * This is useful when we want to bypass the default 5m timeout for global fetch
+   * for machines which run responses very slowly.
+   * @returns {Function} The custom fetch function.
+   */
+  #applyFetch() {
+    try {
+      if (!("OLLAMA_RESPONSE_TIMEOUT" in process.env)) return fetch;
+      const { Agent } = require("undici");
+      const moment = require("moment");
+
+      let timeout = process.env.OLLAMA_RESPONSE_TIMEOUT;
+      if (!timeout || isNaN(Number(timeout)) || Number(timeout) <= 5 * 60_000) {
+        this.providerLog(
+          "Timeout option was not set, is not a number, or is less than 5 minutes in ms - falling back to default",
+          { timeout }
+        );
+        return fetch;
+      } else timeout = Number(timeout);
+
+      const noTimeoutFetch = (input, init = {}) => {
+        return fetch(input, {
+          ...init,
+          dispatcher: new Agent({ headersTimeout: timeout }),
+        });
+      };
+
+      const humanDiff = moment.duration(timeout).humanize();
+      this.providerLog(`Applying custom fetch w/timeout of ${humanDiff}.`);
+      return noTimeoutFetch;
+    } catch (error) {
+      this.providerLog(
+        "Error applying custom fetch - using default fetch",
+        error
+      );
+      return fetch;
+    }
+  }
 }

 module.exports = OllamaProvider;
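
The `#applyFetch` addition works because Node's global fetch is backed by undici, whose per-request `dispatcher` option accepts an `Agent` with a custom `headersTimeout` (the built-in default is roughly five minutes, which slow local models can exceed). A minimal standalone sketch of the same pattern; the ten-minute value is an example, not something set by this commit:

```js
// Standalone sketch of the #applyFetch pattern above.
const { Agent } = require("undici");

const TEN_MINUTES_MS = 10 * 60_000; // example OLLAMA_RESPONSE_TIMEOUT value

const patientFetch = (input, init = {}) =>
  fetch(input, {
    ...init,
    // Wait up to ten minutes for response headers from a slow model.
    dispatcher: new Agent({ headersTimeout: TEN_MINUTES_MS }),
  });

// The Ollama JS client accepts a custom fetch, as the constructor hunk shows:
// new Ollama({ host: process.env.OLLAMA_BASE_PATH, fetch: patientFetch });
```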

View File

@@ -125,10 +125,6 @@ const KEY_MAPPING = {
     envKey: "OLLAMA_MODEL_TOKEN_LIMIT",
     checks: [],
   },
-  OllamaLLMPerformanceMode: {
-    envKey: "OLLAMA_PERFORMANCE_MODE",
-    checks: [],
-  },
   OllamaLLMKeepAliveSeconds: {
     envKey: "OLLAMA_KEEP_ALIVE_TIMEOUT",
     checks: [isInteger],
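
For context on the removal above: each `KEY_MAPPING` entry pairs a settings-payload key with the env var it writes and the validators to run first, so dropping `OllamaLLMPerformanceMode` means the UI can no longer persist `OLLAMA_PERFORMANCE_MODE`. A rough sketch of how such a mapping is typically consumed; this consumer is an assumption for illustration, not the file's actual implementation:

```js
// Hypothetical consumer of KEY_MAPPING -- the real update helper may differ.
function applySetting(key, value) {
  const mapping = KEY_MAPPING[key];
  if (!mapping) return { error: `${key} is not a supported setting.` };

  // Run each validator; assume a check returns an error string or null.
  for (const check of mapping.checks) {
    const error = check(value);
    if (error) return { error };
  }

  process.env[mapping.envKey] = String(value); // e.g. OLLAMA_KEEP_ALIVE_TIMEOUT
  return { error: null };
}
```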