Prompt caching for Anthropic LLM and Agent providers (#4488)

* Prompt caching for Anthropic LLM and agent providers

* Add UI control for the ENV setting and simplify the implementation

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
Sean Hatfield 2025-11-20 17:17:03 -08:00 committed by GitHub
parent f0b3dab4c1
commit c913a2d68c
7 changed files with 158 additions and 7 deletions
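
Concretely, the change turns the system prompt from a bare string into a single text block tagged with Anthropic's ephemeral cache_control. Below is a minimal sketch of the resulting Messages API call when ANTHROPIC_CACHE_CONTROL="5m" is set, assuming the official @anthropic-ai/sdk accepts the ttl field (as the diff relies on); the model name is illustrative, not the repo default.

import Anthropic from "@anthropic-ai/sdk";

const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });

const response = await anthropic.messages.create({
  model: "claude-3-5-sonnet-latest", // illustrative model
  max_tokens: 4096,
  // Instead of a plain string, the system prompt becomes a content block
  // carrying the ephemeral cache_control parsed from the ENV value.
  system: [
    {
      type: "text",
      text: "You are a helpful assistant for this workspace...",
      cache_control: { type: "ephemeral", ttl: "5m" },
    },
  ],
  messages: [{ role: "user", content: "Hello!" }],
});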

View File

@@ -27,6 +27,7 @@ GID='1000'
# LLM_PROVIDER='anthropic'
# ANTHROPIC_API_KEY=sk-ant-xxxx
# ANTHROPIC_MODEL_PREF='claude-2'
# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.
# LLM_PROVIDER='lmstudio'
# LMSTUDIO_BASE_PATH='http://your-server:1234/v1'

View File

@@ -1,7 +1,9 @@
import { useState, useEffect } from "react";
import System from "@/models/system";
import { CaretDown, CaretUp } from "@phosphor-icons/react";
export default function AnthropicAiOptions({ settings }) {
const [showAdvancedControls, setShowAdvancedControls] = useState(false);
const [inputValue, setInputValue] = useState(settings?.AnthropicApiKey);
const [anthropicApiKey, setAnthropicApiKey] = useState(
settings?.AnthropicApiKey
@@ -27,7 +29,6 @@ export default function AnthropicAiOptions({ settings }) {
onBlur={() => setAnthropicApiKey(inputValue)}
/>
</div>
{!settings?.credentialsOnly && (
<AnthropicModelSelection
apiKey={anthropicApiKey}
@@ -35,6 +36,56 @@ export default function AnthropicAiOptions({ settings }) {
/>
)}
</div>
<div className="flex justify-start mt-4">
<button
onClick={(e) => {
e.preventDefault();
setShowAdvancedControls(!showAdvancedControls);
}}
className="border-none text-theme-text-primary hover:text-theme-text-secondary flex items-center text-sm"
>
{showAdvancedControls ? "Hide" : "Show"} advanced settings
{showAdvancedControls ? (
<CaretUp size={14} className="ml-1" />
) : (
<CaretDown size={14} className="ml-1" />
)}
</button>
</div>
<div hidden={!showAdvancedControls}>
<div className="w-full flex items-start gap-4 mt-1.5">
<div className="flex flex-col w-60">
<div className="flex justify-between items-center mb-2">
<label className="text-white text-sm font-semibold">
Prompt Caching
</label>
</div>
<select
name="AnthropicCacheControl"
className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
>
<option
value="none"
selected={settings?.AnthropicCacheControl === "none"}
>
No caching
</option>
<option
value="5m"
selected={settings?.AnthropicCacheControl === "5m"}
>
5 minutes
</option>
<option
value="1h"
selected={settings?.AnthropicCacheControl === "1h"}
>
1 hour
</option>
</select>
</div>
</div>
</div>
</div>
);
}

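The new select is an uncontrolled input identified only by its name attribute, so its value is picked up when the surrounding LLM-preference form is submitted. A rough sketch of that flow follows; System.updateSystem and the payload keys are assumptions about the surrounding app, not part of this diff.

// Hypothetical submit handler; helper name and payload shape are assumed.
import System from "@/models/system";

async function handleSubmit(e) {
  e.preventDefault();
  const form = new FormData(e.target);
  const settings = {
    LLMProvider: "anthropic",
    AnthropicApiKey: form.get("AnthropicApiKey"),
    AnthropicModelPref: form.get("AnthropicModelPref"),
    // "none" | "5m" | "1h" from the Prompt Caching select above
    AnthropicCacheControl: form.get("AnthropicCacheControl"),
  };
  await System.updateSystem(settings); // server maps keys to ENV via KEY_MAPPING
}
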
View File

@@ -24,6 +24,7 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
# LLM_PROVIDER='anthropic'
# ANTHROPIC_API_KEY=sk-ant-xxxx
# ANTHROPIC_MODEL_PREF='claude-2'
# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.
# LLM_PROVIDER='lmstudio'
# LMSTUDIO_BASE_PATH='http://your-server:1234/v1'

View File

@@ -481,6 +481,7 @@ const SystemSettings = {
// Anthropic Keys
AnthropicApiKey: !!process.env.ANTHROPIC_API_KEY,
AnthropicModelPref: process.env.ANTHROPIC_MODEL_PREF || "claude-2",
AnthropicCacheControl: process.env.ANTHROPIC_CACHE_CONTROL || "none",
// Gemini Keys
GeminiLLMApiKey: !!process.env.GEMINI_API_KEY,

View File

@@ -34,7 +34,9 @@ class AnthropicLLM {
this.embedder = embedder ?? new NativeEmbedder();
this.defaultTemp = 0.7;
this.log(`Initialized with ${this.model}`);
this.log(
`Initialized with ${this.model}. Cache ${this.cacheControl ? `enabled (${this.cacheControl.ttl})` : "disabled"}`
);
}
log(text, ...args) {
@@ -57,6 +59,48 @@ class AnthropicLLM {
return true;
}
/**
* Parses the ANTHROPIC_CACHE_CONTROL ENV variable.
*
* If caching is enabled we can pass prompts shorter than 1024 tokens and Anthropic
* will simply ignore the cache_control marker whenever the content is below the
* model's minimum cacheable length. Since this feature is opt-in, we can safely
* pass the content as-is without checking its length.
* https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
*
* Reads the ENV value ("5m" or "1h"); any other value disables caching.
* @returns {null|{type: "ephemeral", ttl: "5m" | "1h"}} Cache control configuration
*/
get cacheControl() {
// Store result in instance variable to avoid recalculating
if (this._cacheControl) return this._cacheControl;
if (!process.env.ANTHROPIC_CACHE_CONTROL) this._cacheControl = null;
else {
const normalized =
process.env.ANTHROPIC_CACHE_CONTROL.toLowerCase().trim();
if (["5m", "1h"].includes(normalized))
this._cacheControl = { type: "ephemeral", ttl: normalized };
else this._cacheControl = null;
}
return this._cacheControl;
}
/**
* Builds system parameter with cache control if applicable
* @param {string} systemContent - The system prompt content
* @returns {string|array} System parameter for API call
*/
#buildSystemPrompt(systemContent) {
if (!systemContent || !this.cacheControl) return systemContent;
return [
{
type: "text",
text: systemContent,
cache_control: this.cacheControl,
},
];
}
/**
* Generates appropriate content array for a message + attachments.
* @param {{userPrompt:string, attachments: import("../../helpers").Attachment[]}}
@@ -105,11 +149,12 @@ class AnthropicLLM {
async getChatCompletion(messages = null, { temperature = 0.7 }) {
try {
const systemContent = messages[0].content;
const result = await LLMPerformanceMonitor.measureAsyncFunction(
this.anthropic.messages.create({
model: this.model,
max_tokens: 4096,
system: messages[0].content, // Strip out the system message
system: this.#buildSystemPrompt(systemContent),
messages: messages.slice(1), // Pop off the system message
temperature: Number(temperature ?? this.defaultTemp),
})
@@ -117,6 +162,7 @@
const promptTokens = result.output.usage.input_tokens;
const completionTokens = result.output.usage.output_tokens;
return {
textResponse: result.output.content[0].text,
metrics: {
@@ -134,11 +180,12 @@
}
async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
const systemContent = messages[0].content;
const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
this.anthropic.messages.stream({
model: this.model,
max_tokens: 4096,
system: messages[0].content, // Strip out the system message
system: this.#buildSystemPrompt(systemContent),
messages: messages.slice(1), // Pop off the system message
temperature: Number(temperature ?? this.defaultTemp),
}),

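Together, the cacheControl getter and #buildSystemPrompt amount to a small, pure transformation of the ENV value into the system parameter. Restated outside the class for illustration (same logic as the diff above):

// Standalone restatement of the parsing and system-prompt shaping above.
function parseCacheControl(envValue) {
  if (!envValue) return null;
  const normalized = envValue.toLowerCase().trim();
  return ["5m", "1h"].includes(normalized)
    ? { type: "ephemeral", ttl: normalized }
    : null; // "none", typos, and unset values all disable caching
}

function buildSystemPrompt(systemContent, cacheControl) {
  if (!systemContent || !cacheControl) return systemContent;
  return [{ type: "text", text: systemContent, cache_control: cacheControl }];
}

buildSystemPrompt("You are helpful.", parseCacheControl("none"));
// -> "You are helpful." (plain string, caching disabled)
buildSystemPrompt("You are helpful.", parseCacheControl("5m"));
// -> [{ type: "text", text: "You are helpful.",
//       cache_control: { type: "ephemeral", ttl: "5m" } }]
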
View File

@@ -23,14 +23,55 @@ class AnthropicProvider extends Provider {
const client = new Anthropic(options);
super(client);
this.model = model;
}
/**
* Parses the ANTHROPIC_CACHE_CONTROL ENV variable.
*
* If caching is enabled we can pass prompts shorter than 1024 tokens and Anthropic
* will simply ignore the cache_control marker whenever the content is below the
* model's minimum cacheable length. Since this feature is opt-in, we can safely
* pass the content as-is without checking its length.
* https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
*
* Reads the ENV value ("5m" or "1h"); any other value disables caching.
* @returns {null|{type: "ephemeral", ttl: "5m" | "1h"}} Cache control configuration
*/
get cacheControl() {
// Store result in instance variable to avoid recalculating
if (this._cacheControl) return this._cacheControl;
if (!process.env.ANTHROPIC_CACHE_CONTROL) this._cacheControl = null;
else {
const normalized =
process.env.ANTHROPIC_CACHE_CONTROL.toLowerCase().trim();
if (["5m", "1h"].includes(normalized))
this._cacheControl = { type: "ephemeral", ttl: normalized };
else this._cacheControl = null;
}
return this._cacheControl;
}
get supportsAgentStreaming() {
return true;
}
/**
* Builds system parameter with cache control if applicable
* @param {string} systemContent - The system prompt content
* @returns {string|array} System parameter for API call
*/
#buildSystemPrompt(systemContent) {
if (!systemContent || !this.cacheControl) return systemContent;
return [
{
type: "text",
text: systemContent,
cache_control: this.cacheControl,
},
];
}
#prepareMessages(messages = []) {
// Extract system prompt and filter out any system messages from the main chat.
let systemPrompt =
@@ -149,7 +190,7 @@ class AnthropicProvider extends Provider {
{
model: this.model,
max_tokens: 4096,
system: systemPrompt,
system: this.#buildSystemPrompt(systemPrompt),
messages: chats,
stream: true,
...(Array.isArray(functions) && functions?.length > 0
@@ -276,7 +317,7 @@ class AnthropicProvider extends Provider {
{
model: this.model,
max_tokens: 4096,
system: systemPrompt,
system: this.#buildSystemPrompt(systemPrompt),
messages: chats,
stream: false,
...(Array.isArray(functions) && functions?.length > 0

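Because prompts under the model's minimum cacheable length are silently ignored (the limitation the JSDoc links to), the simplest way to confirm caching is active is to inspect the usage block of a response. The sketch below uses the cache_creation_input_tokens and cache_read_input_tokens fields from Anthropic's prompt-caching documentation; it is illustrative and not part of this commit.

import Anthropic from "@anthropic-ai/sdk";

// Illustrative check that the cache is actually written and later read.
async function checkCacheUsage(largeSystemPrompt) {
  const client = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
  const response = await client.messages.create({
    model: "claude-3-5-sonnet-latest", // example model
    max_tokens: 1024,
    system: [
      {
        type: "text",
        text: largeSystemPrompt, // assumed to exceed the 1024-token minimum
        cache_control: { type: "ephemeral", ttl: "5m" },
      },
    ],
    messages: [{ role: "user", content: "ping" }],
  });
  // Expect cache_creation_input_tokens > 0 on the call that seeds the cache,
  // and cache_read_input_tokens > 0 on repeat calls within the TTL.
  console.log(response.usage.cache_creation_input_tokens);
  console.log(response.usage.cache_read_input_tokens);
  return response;
}
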
View File

@@ -58,6 +58,15 @@ const KEY_MAPPING = {
envKey: "ANTHROPIC_MODEL_PREF",
checks: [isNotEmpty],
},
AnthropicCacheControl: {
envKey: "ANTHROPIC_CACHE_CONTROL",
checks: [
(input) =>
["none", "5m", "1h"].includes(input)
? null
: "Invalid cache control. Must be one of: 5m, 1h.",
],
},
GeminiLLMApiKey: {
envKey: "GEMINI_API_KEY",