Prompt caching for Anthropic LLM and Agent providers (#4488)
* Prompt caching for Anthropic LLM and agent providers
* Add UI for ENV control; simplify implementation

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
parent f0b3dab4c1
commit c913a2d68c
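In short: when `ANTHROPIC_CACHE_CONTROL` is set to `5m` or `1h`, both the LLM and agent providers wrap the system prompt in a `cache_control` block instead of passing a bare string to the Messages API. A minimal sketch of the resulting request shape (the model and prompt strings are placeholders, not values from this diff):

```js
import Anthropic from "@anthropic-ai/sdk";

// Sketch of the request both providers build when ANTHROPIC_CACHE_CONTROL="5m".
const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
const response = await anthropic.messages.create({
  model: "claude-2", // placeholder; the real value comes from ANTHROPIC_MODEL_PREF
  max_tokens: 4096,
  // With caching enabled, `system` becomes an array of text blocks tagged for
  // caching so Anthropic can reuse the system prompt across requests.
  system: [
    {
      type: "text",
      text: "You are a helpful assistant.", // placeholder system prompt
      cache_control: { type: "ephemeral", ttl: "5m" }, // parsed from the ENV
    },
  ],
  messages: [{ role: "user", content: "Hello" }],
});
console.log(response.content[0].text);
```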
@@ -27,6 +27,7 @@ GID='1000'
 # LLM_PROVIDER='anthropic'
 # ANTHROPIC_API_KEY=sk-ant-xxxx
 # ANTHROPIC_MODEL_PREF='claude-2'
+# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.
 
 # LLM_PROVIDER='lmstudio'
 # LMSTUDIO_BASE_PATH='http://your-server:1234/v1'
@@ -1,7 +1,9 @@
 import { useState, useEffect } from "react";
 import System from "@/models/system";
+import { CaretDown, CaretUp } from "@phosphor-icons/react";
 
 export default function AnthropicAiOptions({ settings }) {
+  const [showAdvancedControls, setShowAdvancedControls] = useState(false);
   const [inputValue, setInputValue] = useState(settings?.AnthropicApiKey);
   const [anthropicApiKey, setAnthropicApiKey] = useState(
     settings?.AnthropicApiKey
@@ -27,7 +29,6 @@ export default function AnthropicAiOptions({ settings }) {
             onBlur={() => setAnthropicApiKey(inputValue)}
           />
         </div>
-
         {!settings?.credentialsOnly && (
           <AnthropicModelSelection
             apiKey={anthropicApiKey}
@@ -35,6 +36,56 @@ export default function AnthropicAiOptions({ settings }) {
           />
         )}
       </div>
+      <div className="flex justify-start mt-4">
+        <button
+          onClick={(e) => {
+            e.preventDefault();
+            setShowAdvancedControls(!showAdvancedControls);
+          }}
+          className="border-none text-theme-text-primary hover:text-theme-text-secondary flex items-center text-sm"
+        >
+          {showAdvancedControls ? "Hide" : "Show"} advanced settings
+          {showAdvancedControls ? (
+            <CaretUp size={14} className="ml-1" />
+          ) : (
+            <CaretDown size={14} className="ml-1" />
+          )}
+        </button>
+      </div>
+      <div hidden={!showAdvancedControls}>
+        <div className="w-full flex items-start gap-4 mt-1.5">
+          <div className="flex flex-col w-60">
+            <div className="flex justify-between items-center mb-2">
+              <label className="text-white text-sm font-semibold">
+                Prompt Caching
+              </label>
+            </div>
+            <select
+              name="AnthropicCacheControl"
+              className="border-none bg-theme-settings-input-bg border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
+            >
+              <option
+                value="none"
+                selected={settings?.AnthropicCacheControl === "none"}
+              >
+                No caching
+              </option>
+              <option
+                value="5m"
+                selected={settings?.AnthropicCacheControl === "5m"}
+              >
+                5 minutes
+              </option>
+              <option
+                value="1h"
+                selected={settings?.AnthropicCacheControl === "1h"}
+              >
+                1 hour
+              </option>
+            </select>
+          </div>
+        </div>
+      </div>
     </div>
   );
 }
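The select's `name="AnthropicCacheControl"` is what ties this UI to the ENV: saving the settings form posts that key, and the `KEY_MAPPING` entry added at the end of this diff validates it and writes it to `ANTHROPIC_CACHE_CONTROL`. A hypothetical save call (the `System.updateSystem` helper name is an assumption for illustration, not part of this diff):

```js
// Hypothetical: saving the form with "5 minutes" selected posts this payload;
// updateENV validates the value against ["none", "5m", "1h"] before persisting it.
await System.updateSystem({ AnthropicCacheControl: "5m" });
```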
@@ -24,6 +24,7 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
 # LLM_PROVIDER='anthropic'
 # ANTHROPIC_API_KEY=sk-ant-xxxx
 # ANTHROPIC_MODEL_PREF='claude-2'
+# ANTHROPIC_CACHE_CONTROL="5m" # Enable prompt caching (5m=5min cache, 1h=1hour cache). Reduces costs and improves speed by caching system prompts.
 
 # LLM_PROVIDER='lmstudio'
 # LMSTUDIO_BASE_PATH='http://your-server:1234/v1'
@@ -481,6 +481,7 @@ const SystemSettings = {
     // Anthropic Keys
     AnthropicApiKey: !!process.env.ANTHROPIC_API_KEY,
     AnthropicModelPref: process.env.ANTHROPIC_MODEL_PREF || "claude-2",
+    AnthropicCacheControl: process.env.ANTHROPIC_CACHE_CONTROL || "none",
 
     // Gemini Keys
     GeminiLLMApiKey: !!process.env.GEMINI_API_KEY,
@@ -34,7 +34,9 @@ class AnthropicLLM {
 
     this.embedder = embedder ?? new NativeEmbedder();
     this.defaultTemp = 0.7;
-    this.log(`Initialized with ${this.model}`);
+    this.log(
+      `Initialized with ${this.model}. Cache ${this.cacheControl ? `enabled (${this.cacheControl.ttl})` : "disabled"}`
+    );
   }
 
   log(text, ...args) {
@@ -57,6 +59,48 @@ class AnthropicLLM {
     return true;
   }
 
+  /**
+   * Parses the cache control ENV variable
+   *
+   * If caching is enabled, we can pass less than 1024 tokens and Anthropic will just
+   * ignore it unless it is above the model's minimum. Since this feature is opt-in
+   * we can safely assume that if caching is enabled that we should just pass the content as is.
+   * https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
+   *
+   * @param {string} value - The ENV value (5m or 1h)
+   * @returns {null|{type: "ephemeral", ttl: "5m" | "1h"}} Cache control configuration
+   */
+  get cacheControl() {
+    // Store result in instance variable to avoid recalculating
+    if (this._cacheControl) return this._cacheControl;
+
+    if (!process.env.ANTHROPIC_CACHE_CONTROL) this._cacheControl = null;
+    else {
+      const normalized =
+        process.env.ANTHROPIC_CACHE_CONTROL.toLowerCase().trim();
+      if (["5m", "1h"].includes(normalized))
+        this._cacheControl = { type: "ephemeral", ttl: normalized };
+      else this._cacheControl = null;
+    }
+    return this._cacheControl;
+  }
+
+  /**
+   * Builds system parameter with cache control if applicable
+   * @param {string} systemContent - The system prompt content
+   * @returns {string|array} System parameter for API call
+   */
+  #buildSystemPrompt(systemContent) {
+    if (!systemContent || !this.cacheControl) return systemContent;
+    return [
+      {
+        type: "text",
+        text: systemContent,
+        cache_control: this.cacheControl,
+      },
+    ];
+  }
+
   /**
    * Generates appropriate content array for a message + attachments.
    * @param {{userPrompt:string, attachments: import("../../helpers").Attachment[]}}
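The getter trims and lowercases the ENV value, accepts only `5m` or `1h`, and memoizes the result on the instance. Note that a `null` result is recomputed on every access, since only truthy values survive the `this._cacheControl` check. A sketch of the expected mapping:

```js
// Expected normalization performed by the cacheControl getter:
//   "5m"              -> { type: "ephemeral", ttl: "5m" }
//   "  1H "           -> { type: "ephemeral", ttl: "1h" }  (trimmed and lowercased)
//   unset/"none"/"2h" -> null (caching disabled)
process.env.ANTHROPIC_CACHE_CONTROL = "  1H ";
const llm = new AnthropicLLM(); // assumes a valid ANTHROPIC_API_KEY is set
console.log(llm.cacheControl); // { type: "ephemeral", ttl: "1h" }
```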
@@ -105,11 +149,12 @@ class AnthropicLLM {
 
   async getChatCompletion(messages = null, { temperature = 0.7 }) {
     try {
+      const systemContent = messages[0].content;
       const result = await LLMPerformanceMonitor.measureAsyncFunction(
         this.anthropic.messages.create({
           model: this.model,
           max_tokens: 4096,
-          system: messages[0].content, // Strip out the system message
+          system: this.#buildSystemPrompt(systemContent),
           messages: messages.slice(1), // Pop off the system message
           temperature: Number(temperature ?? this.defaultTemp),
         })
@@ -117,6 +162,7 @@ class AnthropicLLM {
 
     const promptTokens = result.output.usage.input_tokens;
     const completionTokens = result.output.usage.output_tokens;
+
     return {
       textResponse: result.output.content[0].text,
       metrics: {
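One caveat this diff does not touch: when a cache write or cache hit occurs, Anthropic reports those tokens in separate usage fields rather than in `input_tokens`, so `promptTokens` above will undercount cached prompts. A hedged sketch, assuming the documented usage shape of the Messages API:

```js
// Assumption: the Messages API usage object also exposes cache counters.
// Neither field is folded into the metrics recorded by this commit.
const {
  input_tokens, // uncached prompt tokens billed at the normal rate
  cache_creation_input_tokens, // tokens written to the cache on this call
  cache_read_input_tokens, // tokens served from the cache on this call
} = result.output.usage;
```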
@@ -134,11 +180,12 @@ class AnthropicLLM {
   }
 
   async streamGetChatCompletion(messages = null, { temperature = 0.7 }) {
+    const systemContent = messages[0].content;
     const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
       this.anthropic.messages.stream({
         model: this.model,
         max_tokens: 4096,
-        system: messages[0].content, // Strip out the system message
+        system: this.#buildSystemPrompt(systemContent),
         messages: messages.slice(1), // Pop off the system message
         temperature: Number(temperature ?? this.defaultTemp),
       }),
@@ -23,14 +23,55 @@ class AnthropicProvider extends Provider {
     const client = new Anthropic(options);
 
     super(client);
 
     this.model = model;
   }
 
+  /**
+   * Parses the cache control ENV variable
+   *
+   * If caching is enabled, we can pass less than 1024 tokens and Anthropic will just
+   * ignore it unless it is above the model's minimum. Since this feature is opt-in
+   * we can safely assume that if caching is enabled that we should just pass the content as is.
+   * https://docs.claude.com/en/docs/build-with-claude/prompt-caching#cache-limitations
+   *
+   * @param {string} value - The ENV value (5m or 1h)
+   * @returns {null|{type: "ephemeral", ttl: "5m" | "1h"}} Cache control configuration
+   */
+  get cacheControl() {
+    // Store result in instance variable to avoid recalculating
+    if (this._cacheControl) return this._cacheControl;
+
+    if (!process.env.ANTHROPIC_CACHE_CONTROL) this._cacheControl = null;
+    else {
+      const normalized =
+        process.env.ANTHROPIC_CACHE_CONTROL.toLowerCase().trim();
+      if (["5m", "1h"].includes(normalized))
+        this._cacheControl = { type: "ephemeral", ttl: normalized };
+      else this._cacheControl = null;
+    }
+    return this._cacheControl;
+  }
+
   get supportsAgentStreaming() {
     return true;
   }
 
+  /**
+   * Builds system parameter with cache control if applicable
+   * @param {string} systemContent - The system prompt content
+   * @returns {string|array} System parameter for API call
+   */
+  #buildSystemPrompt(systemContent) {
+    if (!systemContent || !this.cacheControl) return systemContent;
+    return [
+      {
+        type: "text",
+        text: systemContent,
+        cache_control: this.cacheControl,
+      },
+    ];
+  }
+
   #prepareMessages(messages = []) {
     // Extract system prompt and filter out any system messages from the main chat.
     let systemPrompt =
@@ -149,7 +190,7 @@ class AnthropicProvider extends Provider {
       {
         model: this.model,
         max_tokens: 4096,
-        system: systemPrompt,
+        system: this.#buildSystemPrompt(systemPrompt),
         messages: chats,
         stream: true,
         ...(Array.isArray(functions) && functions?.length > 0
@@ -276,7 +317,7 @@ class AnthropicProvider extends Provider {
       {
         model: this.model,
         max_tokens: 4096,
-        system: systemPrompt,
+        system: this.#buildSystemPrompt(systemPrompt),
         messages: chats,
         stream: false,
         ...(Array.isArray(functions) && functions?.length > 0
@@ -58,6 +58,15 @@ const KEY_MAPPING = {
     envKey: "ANTHROPIC_MODEL_PREF",
     checks: [isNotEmpty],
   },
+  AnthropicCacheControl: {
+    envKey: "ANTHROPIC_CACHE_CONTROL",
+    checks: [
+      (input) =>
+        ["none", "5m", "1h"].includes(input)
+          ? null
+          : "Invalid cache control. Must be one of: 5m, 1h.",
+    ],
+  },
 
   GeminiLLMApiKey: {
     envKey: "GEMINI_API_KEY",
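The check follows the validator convention used throughout `KEY_MAPPING`: return `null` to accept the input, or an error string to reject it. (The error string omits `none` even though the check accepts it.) For example:

```js
// Validator convention used by KEY_MAPPING checks: null = pass, string = error.
const check = (input) =>
  ["none", "5m", "1h"].includes(input)
    ? null
    : "Invalid cache control. Must be one of: 5m, 1h.";

console.log(check("5m")); // null (accepted)
console.log(check("10m")); // "Invalid cache control. Must be one of: 5m, 1h."
```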