Prompt caching for Anthropic LLM and Agent providers (#4488)

* Prompt caching for Anthropic LLM and agent providers

* Add UI control for the ENV setting and simplify the implementation

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
Sean Hatfield 2025-11-20 17:17:03 -08:00 committed by GitHub
parent f0b3dab4c1
commit c913a2d68c
7 changed files with 158 additions and 7 deletions
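
Concretely, the change turns the system prompt from a bare string into a single text block tagged with Anthropic's ephemeral cache_control. Below is a minimal sketch of the resulting Messages API call when ANTHROPIC_CACHE_CONTROL="5m" is set, assuming the official @anthropic-ai/sdk accepts the ttl field (as the diff relies on); the model name is illustrative, not the repo default.

import Anthropic from "@anthropic-ai/sdk";

const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });

const response = await anthropic.messages.create({
  model: "claude-3-5-sonnet-latest", // illustrative model
  max_tokens: 4096,
  // Instead of a plain string, the system prompt becomes a content block
  // carrying the ephemeral cache_control parsed from the ENV value.
  system: [
    {
      type: "text",
      text: "You are a helpful assistant for this workspace...",
      cache_control: { type: "ephemeral", ttl: "5m" },
    },
  ],
  messages: [{ role: "user", content: "Hello!" }],
});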

View File

@@ -27,6 +27,7 @@ GID='1000'
# LLM_PROVIDER='anthropic'
# ANTHROPIC_API_KEY=sk-ant-xxxx
# ANTHROPIC_MODEL_PREF='claude-2'
# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.
# LLM_PROVIDER='lmstudio'
# LMSTUDIO_BASE_PATH='http://your-server:1234/v1'

View File

@@ -1,7 +1,9 @@
import { useState, useEffect } from "react";
import System from "@/models/system";
import { CaretDown, CaretUp } from "@phosphor-icons/react";
export default function AnthropicAiOptions({ settings }) {
const [showAdvancedControls, setShowAdvancedControls] = useState(false);
const [inputValue, setInputValue] = useState(settings?.AnthropicApiKey);
const [anthropicApiKey, setAnthropicApiKey] = useState(
settings?.AnthropicApiKey
@@ -27,7 +29,6 @@ export default function AnthropicAiOptions({ settings }) {
onBlur={() => setAnthropicApiKey(inputValue)}
/>
</div>
{!settings?.credentialsOnly && (
<AnthropicModelSelection
apiKey={anthropicApiKey}
@@ -35,6 +36,56 @@ export default function AnthropicAiOptions({ settings }) {
/>
)}
</div>
<div className="flex justify-start mt-4">
<button
onClick={(e) => {
e.preventDefault();
setShowAdvancedControls(!showAdvancedControls);
}}
className="border-none text-theme-text-primary hover:text-theme-text-secondary flex items-center text-sm"
>
{showAdvancedControls ? "Hide" : "Show"} advanced settings
{showAdvancedControls ? (
<CaretUp size={14} className="ml-1" />
) : (
<CaretDown size={14} className="ml-1" />
)}
</button>
</div>
<div hidden={!showAdvancedControls}>
<div className="w-full flex items-start gap-4 mt-1.5">
<div className="flex flex-col w-60">
<div className="flex justify-between items-center mb-2">
<label className="text-white text-sm font-semibold">
Prompt Caching
</label>
</div>
<select
name="AnthropicCacheControl"
className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
>
<option
value="none"
selected={settings?.AnthropicCacheControl === "none"}
>
No caching
</option>
<option
value="5m"
selected={settings?.AnthropicCacheControl === "5m"}
>
5 minutes
</option>
<option
value="1h"
selected={settings?.AnthropicCacheControl === "1h"}
>
1 hour
</option>
</select>
</div>
</div>
</div>
</div>
);
}

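The new select is an uncontrolled input identified only by its name attribute, so its value is picked up when the surrounding LLM-preference form is submitted. A rough sketch of that flow follows; System.updateSystem and the payload keys are assumptions about the surrounding app, not part of this diff.

// Hypothetical submit handler; helper name and payload shape are assumed.
import System from "@/models/system";

async function handleSubmit(e) {
  e.preventDefault();
  const form = new FormData(e.target);
  const settings = {
    LLMProvider: "anthropic",
    AnthropicApiKey: form.get("AnthropicApiKey"),
    AnthropicModelPref: form.get("AnthropicModelPref"),
    // "none" | "5m" | "1h" from the Prompt Caching select above
    AnthropicCacheControl: form.get("AnthropicCacheControl"),
  };
  await System.updateSystem(settings); // server maps keys to ENV via KEY_MAPPING
}
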
View File

@@ -24,6 +24,7 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
# LLM_PROVIDER='anthropic'
# ANTHROPIC_API_KEY=sk-ant-xxxx
# ANTHROPIC_MODEL_PREF='claude-2'
# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.
# LLM_PROVIDER='lmstudio'
# LMSTUDIO_BASE_PATH='http://your-server:1234/v1'

View File

@@ -481,6 +481,7 @@ const SystemSettings = {
// Anthropic Keys
AnthropicApiKey: !!process.env.ANTHROPIC_API_KEY,
AnthropicModelPref: process.env.ANTHROPIC_MODEL_PREF || "claude-2",
AnthropicCacheControl: process.env.ANTHROPIC_CACHE_CONTROL || "none",
// Gemini Keys
GeminiLLMApiKey: !!process.env.GEMINI_API_KEY,

View File

@@ -34,7 +34,9 @@ class AnthropicLLM {
this.embedder = embedder ?? new NativeEmbedder();
this.defaultTemp = 0.7;
this.log(`Initialized with ${this.model}`);
this.log(
`Initialized with ${this.model}. Cache ${this.cacheControl ? `enabled (${this.cacheControl.ttl})` : "disabled"}`
);
}
log(text, ...args) {
@@ -57,6 +59,48 @@ class AnthropicLLM {
return true;
}
/**
* Parses the ANTHROPIC_CACHE_CONTROL ENV variable.
*
* If caching is enabled we can pass prompts shorter than 1024 tokens and Anthropic
* will simply ignore the cache_control marker whenever the content is below the
* model's minimum cacheable length. Since this feature is opt-in, we can safely
* pass the content as-is without checking its length.
* https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
*
* Reads the ENV value ("5m" or "1h"); any other value disables caching.
* @returns {null|{type: "ephemeral", ttl: "5m" | "1h"}} Cache control configuration
*/
get cacheControl() {
// Store result in instance variable to avoid recalculating
if (this._cacheControl) return this._cacheControl;
if (!process.env.ANTHROPIC_CACHE_CONTROL) this._cacheControl = null;
else {
const normalized =
process.env.ANTHROPIC_CACHE_CONTROL.toLowerCase().trim();
if (["5m", "1h"].includes(normalized))
this._cacheControl = { type: "ephemeral", ttl: normalized };
else this._cacheControl = null;
}
return this._cacheControl;
}
/**
* Builds system parameter with cache control if applicable
* @param {string} systemContent - The system prompt content
* @returns {string|array} System parameter for API call
*/
#buildSystemPrompt(systemContent) {
if (!systemContent || !this.cacheControl) return systemContent;
return [
{
type: "text",
text: systemContent,
cache_control: this.cacheControl,
},
];
}
/**
* Generates appropriate content array for a message + attachments.
* @param {{userPrompt:string, attachments: import("../../helpers").Attachment[]}}
@@ -105,11 +149,12 @@ class AnthropicLLM {
async getChatCompletion(messages = null, { temperature = 0.7 }) {
try {
const systemContent = messages[0].content;
const result = await LLMPerformanceMonitor.measureAsyncFunction(
this.anthropic.messages.create({
model: this.model,
max_tokens: 4096,
system: messages[0].content, // Strip out the system message
system: this.#buildSystemPrompt(systemContent),
messages: messages.slice(1), // Pop off the system message
temperature: Number(temperature ?? this.defaultTemp),
})
@@ -117,6 +162,7 @@
const promptTokens = result.output.usage.input_tokens;
const completionTokens = result.output.usage.output_tokens;
return {
textResponse: result.output.content[0].text,
metrics: {
@@ -134,11 +180,12 @@
}
async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
const systemContent = messages[0].content;
const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
this.anthropic.messages.stream({
model: this.model,
max_tokens: 4096,
system: messages[0].content, // Strip out the system message
system: this.#buildSystemPrompt(systemContent),
messages: messages.slice(1), // Pop off the system message
temperature: Number(temperature ?? this.defaultTemp),
}),

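Together, the cacheControl getter and #buildSystemPrompt amount to a small, pure transformation of the ENV value into the system parameter. Restated outside the class for illustration (same logic as the diff above):

// Standalone restatement of the parsing and system-prompt shaping above.
function parseCacheControl(envValue) {
  if (!envValue) return null;
  const normalized = envValue.toLowerCase().trim();
  return ["5m", "1h"].includes(normalized)
    ? { type: "ephemeral", ttl: normalized }
    : null; // "none", typos, and unset values all disable caching
}

function buildSystemPrompt(systemContent, cacheControl) {
  if (!systemContent || !cacheControl) return systemContent;
  return [{ type: "text", text: systemContent, cache_control: cacheControl }];
}

buildSystemPrompt("You are helpful.", parseCacheControl("none"));
// -> "You are helpful." (plain string, caching disabled)
buildSystemPrompt("You are helpful.", parseCacheControl("5m"));
// -> [{ type: "text", text: "You are helpful.",
//       cache_control: { type: "ephemeral", ttl: "5m" } }]
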
View File

@@ -23,14 +23,55 @@ class AnthropicProvider extends Provider {
const client = new Anthropic(options);
super(client);
this.model = model;
}
/**
* Parses the ANTHROPIC_CACHE_CONTROL ENV variable.
*
* If caching is enabled we can pass prompts shorter than 1024 tokens and Anthropic
* will simply ignore the cache_control marker whenever the content is below the
* model's minimum cacheable length. Since this feature is opt-in, we can safely
* pass the content as-is without checking its length.
* https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
*
* Reads the ENV value ("5m" or "1h"); any other value disables caching.
* @returns {null|{type: "ephemeral", ttl: "5m" | "1h"}} Cache control configuration
*/
get cacheControl() {
// Store result in instance variable to avoid recalculating
if (this._cacheControl) return this._cacheControl;
if (!process.env.ANTHROPIC_CACHE_CONTROL) this._cacheControl = null;
else {
const normalized =
process.env.ANTHROPIC_CACHE_CONTROL.toLowerCase().trim();
if (["5m", "1h"].includes(normalized))
this._cacheControl = { type: "ephemeral", ttl: normalized };
else this._cacheControl = null;
}
return this._cacheControl;
}
get supportsAgentStreaming() {
return true;
}
/**
* Builds system parameter with cache control if applicable
* @param {string} systemContent - The system prompt content
* @returns {string|array} System parameter for API call
*/
#buildSystemPrompt(systemContent) {
if (!systemContent || !this.cacheControl) return systemContent;
return [
{
type: "text",
text: systemContent,
cache_control: this.cacheControl,
},
];
}
#prepareMessages(messages = []) {
// Extract system prompt and filter out any system messages from the main chat.
let systemPrompt =
@@ -149,7 +190,7 @@ class AnthropicProvider extends Provider {
{
model: this.model,
max_tokens: 4096,
system: systemPrompt,
system: this.#buildSystemPrompt(systemPrompt),
messages: chats,
stream: true,
...(Array.isArray(functions) && functions?.length > 0
@@ -276,7 +317,7 @@ class AnthropicProvider extends Provider {
{
model: this.model,
max_tokens: 4096,
system: systemPrompt,
system: this.#buildSystemPrompt(systemPrompt),
messages: chats,
stream: false,
...(Array.isArray(functions) && functions?.length > 0

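Because prompts under the model's minimum cacheable length are silently ignored (the limitation the JSDoc links to), the simplest way to confirm caching is active is to inspect the usage block of a response. The sketch below uses the cache_creation_input_tokens and cache_read_input_tokens fields from Anthropic's prompt-caching documentation; it is illustrative and not part of this commit.

import Anthropic from "@anthropic-ai/sdk";

// Illustrative check that the cache is actually written and later read.
async function checkCacheUsage(largeSystemPrompt) {
  const client = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
  const response = await client.messages.create({
    model: "claude-3-5-sonnet-latest", // example model
    max_tokens: 1024,
    system: [
      {
        type: "text",
        text: largeSystemPrompt, // assumed to exceed the 1024-token minimum
        cache_control: { type: "ephemeral", ttl: "5m" },
      },
    ],
    messages: [{ role: "user", content: "ping" }],
  });
  // Expect cache_creation_input_tokens > 0 on the call that seeds the cache,
  // and cache_read_input_tokens > 0 on repeat calls within the TTL.
  console.log(response.usage.cache_creation_input_tokens);
  console.log(response.usage.cache_read_input_tokens);
  return response;
}
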
View File

@@ -58,6 +58,15 @@ const KEY_MAPPING = {
envKey: "ANTHROPIC_MODEL_PREF",
checks: [isNotEmpty],
},
AnthropicCacheControl: {
envKey: "ANTHROPIC_CACHE_CONTROL",
checks: [
(input) =>
["none", "5m", "1h"].includes(input)
? null
: "Invalid cache control. Must be one of: 5m, 1h.",
],
},
GeminiLLMApiKey: {
envKey: "GEMINI_API_KEY",