Simplify cache condition for LMStudio and Ollama to prevent race condition (#4669)

closes #4597
resolves #4572
closes #4600
resolves #4599
Timothy Carambat 2025-11-20 16:32:02 -08:00 committed by GitHub
parent 49c29fb968
commit f0b3dab4c1
2 changed files with 51 additions and 21 deletions
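For reference, the race being closed here: the old constructors kicked off cacheContextWindows(true).then(...) and only assigned this.limits inside that callback, so a chat request arriving before the promise settled could read this.limits while it was still undefined. The new code sets this.limits = null immediately and resolves it lazily before any consumer touches it. A minimal sketch of the pattern, using a hypothetical SomeLLM provider (method names mirror the diff below; the bodies are simplified stubs, not the real implementations):

class SomeLLM {
  static modelContextWindows = {}; // process-wide cache of model -> context size

  constructor(model = "some-model") {
    this.model = model;
    // Fire-and-forget warmup. Crucially, this.limits is NOT assigned in a
    // .then() callback anymore, so there is no window where it is undefined.
    SomeLLM.cacheContextWindows(true);
    this.limits = null; // resolved lazily on first use
  }

  static async cacheContextWindows(force = false) {
    if (Object.keys(SomeLLM.modelContextWindows).length > 0 && !force) return;
    // Stand-in for the real provider API call that reports context sizes.
    SomeLLM.modelContextWindows = { "some-model": 8192 };
  }

  promptWindowLimit() {
    return Number(SomeLLM.modelContextWindows[this.model]) || 4096;
  }

  // Every consumer of this.limits awaits this first, so the cache is always
  // populated (or the fallback applied) before the budgets are computed.
  async assertModelContextLimits() {
    if (this.limits !== null) return;
    await SomeLLM.cacheContextWindows();
    const n = this.promptWindowLimit();
    this.limits = { history: n * 0.15, system: n * 0.15, user: n * 0.7 };
  }

  async compressMessages() {
    await this.assertModelContextLimits();
    return this.limits; // the real method compresses messages against these budgets
  }
}

With this shape, new SomeLLM().compressMessages() yields correct limits even if it runs before the warmup call finishes, because assertModelContextLimits() awaits the same cache itself.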


@@ -36,16 +36,11 @@ class LMStudioLLM {
     this.embedder = embedder ?? new NativeEmbedder();
     this.defaultTemp = 0.7;
-    LMStudioLLM.cacheContextWindows(true).then(() => {
-      this.limits = {
-        history: this.promptWindowLimit() * 0.15,
-        system: this.promptWindowLimit() * 0.15,
-        user: this.promptWindowLimit() * 0.7,
-      };
-      this.#log(
-        `initialized with\nmodel: ${this.model}\nn_ctx: ${this.promptWindowLimit()}`
-      );
-    });
+    // Lazy load the limits to avoid blocking the main thread on cacheContextWindows
+    this.limits = null;
+    LMStudioLLM.cacheContextWindows(true);
+    this.#log(`initialized with model: ${this.model}`);
   }
   #log(text, ...args) {
@@ -56,6 +51,16 @@ class LMStudioLLM {
     console.log(`\x1b[32m[LMStudio]\x1b[0m ${text}`, ...args);
   }
+  async assertModelContextLimits() {
+    if (this.limits !== null) return;
+    await LMStudioLLM.cacheContextWindows();
+    this.limits = {
+      history: this.promptWindowLimit() * 0.15,
+      system: this.promptWindowLimit() * 0.15,
+      user: this.promptWindowLimit() * 0.7,
+    };
+  }
   /**
    * Cache the context windows for the LMStudio models.
    * This is done once and then cached for the lifetime of the server. This is absolutely necessary to ensure that the context windows are correct.
@@ -71,7 +76,9 @@ class LMStudioLLM {
     if (Object.keys(LMStudioLLM.modelContextWindows).length > 0 && !force)
       return;
-    const endpoint = new URL(process.env.LMSTUDIO_BASE_PATH);
+    const endpoint = new URL(
+      parseLMStudioBasePath(process.env.LMSTUDIO_BASE_PATH)
+    );
     endpoint.pathname = "/api/v0/models";
     await fetch(endpoint.toString())
       .then((res) => {
@@ -115,6 +122,13 @@ class LMStudioLLM {
   }
   static promptWindowLimit(modelName) {
+    if (Object.keys(LMStudioLLM.modelContextWindows).length === 0) {
+      this.#slog(
+        "No context windows cached - Context window may be inaccurately reported."
+      );
+      return process.env.LMSTUDIO_MODEL_TOKEN_LIMIT || 4096;
+    }
     let userDefinedLimit = null;
     const systemDefinedLimit =
       Number(this.modelContextWindows[modelName]) || 4096;
@@ -255,6 +269,7 @@ class LMStudioLLM {
   }
   async compressMessages(promptArgs = {}, rawHistory = []) {
+    await this.assertModelContextLimits();
     const { messageArrayCompressor } = require("../../helpers/chat");
     const messageArray = this.constructPrompt(promptArgs);
     return await messageArrayCompressor(this, messageArray, rawHistory);

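The guard added to static promptWindowLimit() also changes what happens if it is called before the cache is populated: instead of reading an undefined entry, it logs a warning and falls back to the provider's token-limit env var or 4096. A rough sketch of just that branch, using a hypothetical resolveWindow() helper (the real method additionally honors a user-defined limit):

// Hypothetical helper approximating the cache-empty branch of promptWindowLimit().
function resolveWindow(cache, modelName, envLimit) {
  if (Object.keys(cache).length === 0) {
    // Cache not yet populated (e.g. called before cacheContextWindows resolved):
    // warn and fall back rather than reporting an undefined context window.
    console.warn("No context windows cached - context window may be inaccurate.");
    return Number(envLimit) || 4096;
  }
  return Number(cache[modelName]) || 4096;
}

console.log(resolveWindow({}, "some-model", process.env.LMSTUDIO_MODEL_TOKEN_LIMIT)); // env value or 4096
console.log(resolveWindow({ "some-model": 32768 }, "some-model")); // 32768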

@@ -39,16 +39,13 @@ class OllamaAILLM {
     this.embedder = embedder ?? new NativeEmbedder();
     this.defaultTemp = 0.7;
-    OllamaAILLM.cacheContextWindows(true).then(() => {
-      this.limits = {
-        history: this.promptWindowLimit() * 0.15,
-        system: this.promptWindowLimit() * 0.15,
-        user: this.promptWindowLimit() * 0.7,
-      };
-      this.#log(
-        `initialized with\nmodel: ${this.model}\nperf: ${this.performanceMode}\nn_ctx: ${this.promptWindowLimit()}`
-      );
-    });
+    // Lazy load the limits to avoid blocking the main thread on cacheContextWindows
+    this.limits = null;
+    OllamaAILLM.cacheContextWindows(true);
+    this.#log(
+      `initialized with\nmodel: ${this.model}\nperf: ${this.performanceMode}`
+    );
   }
   #log(text, ...args) {
@@ -59,6 +56,16 @@ class OllamaAILLM {
     console.log(`\x1b[32m[Ollama]\x1b[0m ${text}`, ...args);
   }
+  async assertModelContextLimits() {
+    if (this.limits !== null) return;
+    await OllamaAILLM.cacheContextWindows();
+    this.limits = {
+      history: this.promptWindowLimit() * 0.15,
+      system: this.promptWindowLimit() * 0.15,
+      user: this.promptWindowLimit() * 0.7,
+    };
+  }
   /**
    * Cache the context windows for the Ollama models.
    * This is done once and then cached for the lifetime of the server. This is absolutely necessary to ensure that the context windows are correct.
@@ -161,6 +168,13 @@ class OllamaAILLM {
   }
   static promptWindowLimit(modelName) {
+    if (Object.keys(OllamaAILLM.modelContextWindows).length === 0) {
+      this.#slog(
+        "No context windows cached - Context window may be inaccurately reported."
+      );
+      return process.env.OLLAMA_MODEL_TOKEN_LIMIT || 4096;
+    }
     let userDefinedLimit = null;
     const systemDefinedLimit =
       Number(this.modelContextWindows[modelName]) || 4096;
@@ -455,6 +469,7 @@ class OllamaAILLM {
   }
   async compressMessages(promptArgs = {}, rawHistory = []) {
+    await this.assertModelContextLimits();
     const { messageArrayCompressor } = require("../../helpers/chat");
     const messageArray = this.constructPrompt(promptArgs);
     return await messageArrayCompressor(this, messageArray, rawHistory);