Simplify cache condition for LMStudio and Ollama to prevent race condition (#4669)

closes #4597
resolves #4572
closes #4600
resolves #4599
Timothy Carambat 2025-11-20 16:32:02 -08:00 committed by GitHub
parent 49c29fb968
commit f0b3dab4c1
2 changed files with 51 additions and 21 deletions
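For reference, the race being closed here: the old constructors kicked off cacheContextWindows(true).then(...) and only assigned this.limits inside that callback, so a chat request arriving before the promise settled could read this.limits while it was still undefined. The new code sets this.limits = null immediately and resolves it lazily before any consumer touches it. A minimal sketch of the pattern, using a hypothetical SomeLLM provider (method names mirror the diff below; the bodies are simplified stubs, not the real implementations):

class SomeLLM {
  static modelContextWindows = {}; // process-wide cache of model -> context size

  constructor(model = "some-model") {
    this.model = model;
    // Fire-and-forget warmup. Crucially, this.limits is NOT assigned in a
    // .then() callback anymore, so there is no window where it is undefined.
    SomeLLM.cacheContextWindows(true);
    this.limits = null; // resolved lazily on first use
  }

  static async cacheContextWindows(force = false) {
    if (Object.keys(SomeLLM.modelContextWindows).length > 0 && !force) return;
    // Stand-in for the real provider API call that reports context sizes.
    SomeLLM.modelContextWindows = { "some-model": 8192 };
  }

  promptWindowLimit() {
    return Number(SomeLLM.modelContextWindows[this.model]) || 4096;
  }

  // Every consumer of this.limits awaits this first, so the cache is always
  // populated (or the fallback applied) before the budgets are computed.
  async assertModelContextLimits() {
    if (this.limits !== null) return;
    await SomeLLM.cacheContextWindows();
    const n = this.promptWindowLimit();
    this.limits = { history: n * 0.15, system: n * 0.15, user: n * 0.7 };
  }

  async compressMessages() {
    await this.assertModelContextLimits();
    return this.limits; // the real method compresses messages against these budgets
  }
}

With this shape, new SomeLLM().compressMessages() yields correct limits even if it runs before the warmup call finishes, because assertModelContextLimits() awaits the same cache itself.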


@@ -36,16 +36,11 @@ class LMStudioLLM {
     this.embedder = embedder ?? new NativeEmbedder();
     this.defaultTemp = 0.7;
-    LMStudioLLM.cacheContextWindows(true).then(() => {
-      this.limits = {
-        history: this.promptWindowLimit() * 0.15,
-        system: this.promptWindowLimit() * 0.15,
-        user: this.promptWindowLimit() * 0.7,
-      };
-      this.#log(
-        `initialized with\nmodel: ${this.model}\nn_ctx: ${this.promptWindowLimit()}`
-      );
-    });
+    // Lazy load the limits to avoid blocking the main thread on cacheContextWindows
+    this.limits = null;
+    LMStudioLLM.cacheContextWindows(true);
+    this.#log(`initialized with model: ${this.model}`);
   }
   #log(text, ...args) {
@@ -56,6 +51,16 @@ class LMStudioLLM {
     console.log(`\x1b[32m[LMStudio]\x1b[0m ${text}`, ...args);
   }
+  async assertModelContextLimits() {
+    if (this.limits !== null) return;
+    await LMStudioLLM.cacheContextWindows();
+    this.limits = {
+      history: this.promptWindowLimit() * 0.15,
+      system: this.promptWindowLimit() * 0.15,
+      user: this.promptWindowLimit() * 0.7,
+    };
+  }
   /**
    * Cache the context windows for the LMStudio models.
    * This is done once and then cached for the lifetime of the server. This is absolutely necessary to ensure that the context windows are correct.
@@ -71,7 +76,9 @@ class LMStudioLLM {
     if (Object.keys(LMStudioLLM.modelContextWindows).length > 0 && !force)
       return;
-    const endpoint = new URL(process.env.LMSTUDIO_BASE_PATH);
+    const endpoint = new URL(
+      parseLMStudioBasePath(process.env.LMSTUDIO_BASE_PATH)
+    );
     endpoint.pathname = "/api/v0/models";
     await fetch(endpoint.toString())
       .then((res) => {
@@ -115,6 +122,13 @@ class LMStudioLLM {
   }
   static promptWindowLimit(modelName) {
+    if (Object.keys(LMStudioLLM.modelContextWindows).length === 0) {
+      this.#slog(
+        "No context windows cached - Context window may be inaccurately reported."
+      );
+      return process.env.LMSTUDIO_MODEL_TOKEN_LIMIT || 4096;
+    }
     let userDefinedLimit = null;
     const systemDefinedLimit =
       Number(this.modelContextWindows[modelName]) || 4096;
@@ -255,6 +269,7 @@ class LMStudioLLM {
   }
   async compressMessages(promptArgs = {}, rawHistory = []) {
+    await this.assertModelContextLimits();
     const { messageArrayCompressor } = require("../../helpers/chat");
     const messageArray = this.constructPrompt(promptArgs);
     return await messageArrayCompressor(this, messageArray, rawHistory);

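The guard added to static promptWindowLimit() also changes what happens if it is called before the cache is populated: instead of reading an undefined entry, it logs a warning and falls back to the provider's token-limit env var or 4096. A rough sketch of just that branch, using a hypothetical resolveWindow() helper (the real method additionally honors a user-defined limit):

// Hypothetical helper approximating the cache-empty branch of promptWindowLimit().
function resolveWindow(cache, modelName, envLimit) {
  if (Object.keys(cache).length === 0) {
    // Cache not yet populated (e.g. called before cacheContextWindows resolved):
    // warn and fall back rather than reporting an undefined context window.
    console.warn("No context windows cached - context window may be inaccurate.");
    return Number(envLimit) || 4096;
  }
  return Number(cache[modelName]) || 4096;
}

console.log(resolveWindow({}, "some-model", process.env.LMSTUDIO_MODEL_TOKEN_LIMIT)); // env value or 4096
console.log(resolveWindow({ "some-model": 32768 }, "some-model")); // 32768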

@@ -39,16 +39,13 @@ class OllamaAILLM {
     this.embedder = embedder ?? new NativeEmbedder();
     this.defaultTemp = 0.7;
-    OllamaAILLM.cacheContextWindows(true).then(() => {
-      this.limits = {
-        history: this.promptWindowLimit() * 0.15,
-        system: this.promptWindowLimit() * 0.15,
-        user: this.promptWindowLimit() * 0.7,
-      };
-      this.#log(
-        `initialized with\nmodel: ${this.model}\nperf: ${this.performanceMode}\nn_ctx: ${this.promptWindowLimit()}`
-      );
-    });
+    // Lazy load the limits to avoid blocking the main thread on cacheContextWindows
+    this.limits = null;
+    OllamaAILLM.cacheContextWindows(true);
+    this.#log(
+      `initialized with\nmodel: ${this.model}\nperf: ${this.performanceMode}`
+    );
   }
   #log(text, ...args) {
@@ -59,6 +56,16 @@ class OllamaAILLM {
     console.log(`\x1b[32m[Ollama]\x1b[0m ${text}`, ...args);
   }
+  async assertModelContextLimits() {
+    if (this.limits !== null) return;
+    await OllamaAILLM.cacheContextWindows();
+    this.limits = {
+      history: this.promptWindowLimit() * 0.15,
+      system: this.promptWindowLimit() * 0.15,
+      user: this.promptWindowLimit() * 0.7,
+    };
+  }
   /**
    * Cache the context windows for the Ollama models.
    * This is done once and then cached for the lifetime of the server. This is absolutely necessary to ensure that the context windows are correct.
@@ -161,6 +168,13 @@ class OllamaAILLM {
   }
   static promptWindowLimit(modelName) {
+    if (Object.keys(OllamaAILLM.modelContextWindows).length === 0) {
+      this.#slog(
+        "No context windows cached - Context window may be inaccurately reported."
+      );
+      return process.env.OLLAMA_MODEL_TOKEN_LIMIT || 4096;
+    }
     let userDefinedLimit = null;
     const systemDefinedLimit =
       Number(this.modelContextWindows[modelName]) || 4096;
@@ -455,6 +469,7 @@ class OllamaAILLM {
   }
   async compressMessages(promptArgs = {}, rawHistory = []) {
+    await this.assertModelContextLimits();
     const { messageArrayCompressor } = require("../../helpers/chat");
     const messageArray = this.constructPrompt(promptArgs);
     return await messageArrayCompressor(this, messageArray, rawHistory);