4601 log model on response (#4781)

* add model tag to chatCompletion

* add modelTag `model` to async streaming
keeps default arguments for prompt token calculation where applied via explicit arg

* fix HF default arg

* render all performance metrics when available for backward compatibility
add `timestamp` to both sync/async chat methods

* extract metrics string to function
Timothy Carambat 2025-12-14 14:46:55 -08:00 committed by GitHub
parent bc3ad06de4
commit 664f466e3f
34 changed files with 176 additions and 42 deletions
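For reference, the shape of the metrics object now attached to chat responses is roughly the following (a sketch assembled from the diffs below; the field values are invented):

```js
// Illustrative metrics payload after this change; values are made up.
const metrics = {
  prompt_tokens: 420,
  completion_tokens: 96,
  total_tokens: 516,
  outputTps: 45.2, // completion tokens per second of generation
  duration: 2.12, // seconds the request took
  model: "gpt-4o", // new: the provider class's `this.model` tag
  timestamp: new Date(), // new: when the completion/stream resolved
};
```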

View File

@ -1,3 +1,4 @@
import { formatDateTimeAsMoment } from "@/utils/directories";
import { numberWithCommas } from "@/utils/numbers";
import React, { useEffect, useState, useContext } from "react";
const MetricsContext = React.createContext();
@ -41,6 +42,26 @@ function getAutoShowMetrics() {
return window?.localStorage?.getItem(SHOW_METRICS_KEY) === "true";
}
/**
* Build the metrics string for a given metrics object
* - Model name
* - Duration and output TPS
* - Timestamp
* @param {metrics: {duration:number, outputTps: number, model?: string, timestamp?: number}} metrics
* @returns {string}
*/
function buildMetricsString(metrics = {}) {
return [
metrics?.model ? metrics.model : "",
`${formatDuration(metrics.duration)} (${formatTps(metrics.outputTps)} tok/s)`,
metrics?.timestamp
? formatDateTimeAsMoment(metrics.timestamp, "MMM D, h:mm A")
: "",
]
.filter(Boolean)
.join(" · ");
}
/**
* Toggle the show metrics setting in localStorage `anythingllm_show_chat_metrics` key
* @returns {void}
@ -88,7 +109,7 @@ export function MetricsProvider({ children }) {
/**
* Render the metrics for a given chat, if available
* @param {metrics: {duration:number, outputTps: number}} props
* @param {metrics: {duration:number, outputTps: number, model: string, timestamp: number}} props
* @returns
*/
export default function RenderMetrics({ metrics = {} }) {
@ -110,8 +131,7 @@ export default function RenderMetrics({ metrics = {} }) {
className={`border-none flex justify-end items-center gap-x-[8px] ${showMetricsAutomatically ? "opacity-100" : "opacity-0"} md:group-hover:opacity-100 transition-all duration-300`}
>
<p className="cursor-pointer text-xs font-mono text-theme-text-secondary opacity-50">
{formatDuration(metrics.duration)} ({formatTps(metrics.outputTps)}{" "}
tok/s)
{buildMetricsString(metrics)}
</p>
</button>
);
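To illustrate what `buildMetricsString` renders, here is a hypothetical call; the exact duration/TPS segments depend on `formatDuration` and `formatTps`, which live elsewhere in this file:

```js
// Hypothetical input, not taken from the codebase:
buildMetricsString({
  duration: 2.12,
  outputTps: 45.2,
  model: "gpt-4o",
  timestamp: 1734216415000,
});
// => something like "gpt-4o · 2.12s (45.2 tok/s) · Dec 14, 2:46 PM"

// Older chats without `model`/`timestamp` still render, since empty
// segments are dropped by .filter(Boolean):
buildMetricsString({ duration: 2.12, outputTps: 45.2 });
// => something like "2.12s (45.2 tok/s)"
```

The `.filter(Boolean).join(" · ")` step is what provides the backward compatibility called out in the commit message: older chat records simply omit the new segments.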

View File

@ -171,6 +171,8 @@ class AnthropicLLM {
total_tokens: promptTokens + completionTokens,
outputTps: completionTokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
} catch (error) {
@ -190,7 +192,8 @@ class AnthropicLLM {
temperature: Number(temperature ?? this.defaultTemp),
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;

View File

@ -220,6 +220,8 @@ class ApiPieLLM {
outputTps:
(result.output.usage?.completion_tokens || 0) / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -237,7 +239,9 @@ class ApiPieLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -174,6 +174,8 @@ class AzureOpenAiLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -192,7 +194,9 @@ class AzureOpenAiLLM {
n: 1,
stream: true,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;

View File

@ -423,9 +423,7 @@ class AWSBedrockLLM {
);
}
throw new Error(`AWSBedrock::getChatCompletion failed. ${e.message}`);
}),
messages,
false
})
);
const response = result.output;
@ -450,6 +448,8 @@ class AWSBedrockLLM {
total_tokens: response?.usage?.totalTokens ?? 0,
outputTps: outputTps,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -492,7 +492,8 @@ class AWSBedrockLLM {
const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
stream,
messages,
false // Indicate it's not a function call measurement
false,
this.model
);
return measuredStreamRequest;
} catch (e) {

View File

@ -124,6 +124,8 @@ class CohereLLM {
total_tokens: promptTokens + completionTokens,
outputTps: completionTokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -139,7 +141,8 @@ class CohereLLM {
temperature,
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;

View File

@ -225,6 +225,8 @@ class CometApiLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -242,7 +244,9 @@ class CometApiLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -130,6 +130,8 @@ class DeepSeekLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -148,7 +150,8 @@ class DeepSeekLLM {
temperature,
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;

View File

@ -165,6 +165,8 @@ class DellProAiStudioLLM {
total_tokens: result.output.usage?.total_tokens || 0,
outputTps: result.output.usage?.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -182,7 +184,9 @@ class DellProAiStudioLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -163,6 +163,8 @@ class FireworksAiLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -181,7 +183,8 @@ class FireworksAiLLM {
temperature,
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;
}

View File

@ -224,6 +224,8 @@ class FoundryLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -242,7 +244,9 @@ class FoundryLLM {
temperature,
max_completion_tokens: this.promptWindowLimit(),
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -405,6 +405,8 @@ class GeminiLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -421,7 +423,8 @@ class GeminiLLM {
},
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;

View File

@ -193,6 +193,8 @@ class GenericOpenAiLLM {
outputTps:
(result.output?.usage?.completion_tokens || 0) / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -206,9 +208,10 @@ class GenericOpenAiLLM {
temperature,
max_tokens: this.maxTokens,
}),
messages
messages,
// runPromptTokenCalculation: true - There is no way to know if the generic provider connected is returning
// the correct usage metrics if any at all since any provider could be connected.
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -170,6 +170,8 @@ class GiteeAILLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -183,7 +185,8 @@ class GiteeAILLM {
temperature,
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;

View File

@ -203,6 +203,8 @@ class GroqLLM {
result.output.usage.completion_tokens /
result.output.usage.completion_time,
duration: result.output.usage.total_time,
model: this.model,
timestamp: new Date(),
},
};
}
@ -221,7 +223,8 @@ class GroqLLM {
temperature,
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;

View File

@ -117,6 +117,8 @@ class HuggingFaceLLM {
outputTps:
(result.output.usage?.completion_tokens || 0) / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -129,7 +131,9 @@ class HuggingFaceLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -160,6 +160,8 @@ class KoboldCPPLLM {
total_tokens: promptTokens + completionTokens,
outputTps: completionTokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -173,7 +175,9 @@ class KoboldCPPLLM {
temperature,
max_tokens: this.maxTokens,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -154,6 +154,8 @@ class LiteLLM {
outputTps:
(result.output.usage?.completion_tokens || 0) / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -167,9 +169,10 @@ class LiteLLM {
temperature,
max_tokens: parseInt(this.maxTokens), // LiteLLM requires int
}),
messages
messages,
// runPromptTokenCalculation: true - We manually count the tokens because they may or may not be provided in the stream
// responses depending on the LLM connected. If they are provided, the manual count was redundant, but that is better than nothing.
true,
this.model
);
return measuredStreamRequest;

View File

@ -234,6 +234,8 @@ class LMStudioLLM {
total_tokens: result.output.usage?.total_tokens || 0,
outputTps: result.output.usage?.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -251,7 +253,9 @@ class LMStudioLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -145,6 +145,8 @@ class LocalAiLLM {
total_tokens: promptTokens + completionTokens,
outputTps: completionTokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -162,7 +164,9 @@ class LocalAiLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -139,6 +139,8 @@ class MistralLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -157,7 +159,8 @@ class MistralLLM {
temperature,
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;
}

View File

@ -136,6 +136,8 @@ class MoonshotAiLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -148,7 +150,9 @@ class MoonshotAiLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;

View File

@ -225,6 +225,8 @@ class NovitaLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -242,7 +244,9 @@ class NovitaLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -184,6 +184,8 @@ class NvidiaNimLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -201,7 +203,9 @@ class NvidiaNimLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -305,6 +305,8 @@ class OllamaAILLM {
outputTps:
result.output.usage.completion_tokens / result.output.usage.duration,
duration: result.output.usage.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -326,7 +328,8 @@ class OllamaAILLM {
},
}),
messages,
false
false,
this.model
).catch((e) => {
throw this.#errorHandler(e);
});

View File

@ -175,6 +175,8 @@ class OpenAiLLM {
? usage.output_tokens / result.duration
: 0,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -194,7 +196,8 @@ class OpenAiLLM {
temperature: this.#temperature(this.model, temperature),
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;

View File

@ -276,6 +276,8 @@ class OpenRouterLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -300,13 +302,15 @@ class OpenRouterLLM {
include_reasoning: true,
user: user?.id ? `user_${user.id}` : "",
}),
messages
messages,
// We have to manually count the tokens.
// OpenRouter fronts a ton of providers and they can all report usage slightly differently:
// some return chunk.usage on STOP, some emit it after stop; it's inconsistent.
// The reported metrics may therefore be inaccurate, since we cannot reliably
// catch them before resolving the stream, so we just pretend this functionality
// is not available.
true,
this.model
);
return measuredStreamRequest;

View File

@ -117,6 +117,8 @@ class PerplexityLLM {
total_tokens: result.output.usage?.total_tokens || 0,
outputTps: result.output.usage?.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -134,7 +136,9 @@ class PerplexityLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -176,6 +176,8 @@ class PPIOLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -193,7 +195,9 @@ class PPIOLLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -150,6 +150,8 @@ class TextGenWebUILLM {
total_tokens: result.output.usage?.total_tokens || 0,
outputTps: result.output.usage?.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -162,7 +164,9 @@ class TextGenWebUILLM {
messages,
temperature,
}),
messages
messages,
true,
this.model
);
return measuredStreamRequest;
}

View File

@ -209,6 +209,8 @@ class TogetherAiLLM {
total_tokens: result.output.usage?.total_tokens || 0,
outputTps: result.output.usage?.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -227,7 +229,8 @@ class TogetherAiLLM {
temperature,
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;
}

View File

@ -147,6 +147,8 @@ class XAiLLM {
total_tokens: result.output.usage.total_tokens || 0,
outputTps: result.output.usage.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -165,7 +167,8 @@ class XAiLLM {
temperature,
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;

View File

@ -136,6 +136,8 @@ class ZAiLLM {
total_tokens: result.output.usage?.total_tokens || 0,
outputTps: result.output.usage?.completion_tokens / result.duration,
duration: result.duration,
model: this.model,
timestamp: new Date(),
},
};
}
@ -149,7 +151,8 @@ class ZAiLLM {
temperature,
}),
messages,
false
false,
this.model
);
return measuredStreamRequest;

View File

@ -59,13 +59,15 @@ class LLMPerformanceMonitor {
* Also attaches an `endMeasurement` method to the stream that will calculate the duration of the stream and metrics.
* @param {Promise<OpenAICompatibleStream>} func
* @param {Messages} messages - the messages sent to the LLM so we can calculate the prompt tokens since most providers do not return this on stream
* @param {boolean} runPromptTokenCalculation - whether to run the prompt token calculation to estimate the `prompt_tokens` metric. This is useful for providers that do not return this on stream.
* @param {boolean} runPromptTokenCalculation - [default: true] whether to run the prompt token calculation to estimate the `prompt_tokens` metric. This is useful for providers that do not return this on stream.
* @param {string} modelTag - the tag of the model that was used to generate the stream (eg: gpt-4o, claude-3-5-sonnet, qwen3/72b-instruct, etc.)
* @returns {Promise<MonitoredStream>}
*/
static async measureStream(
func,
messages = [],
runPromptTokenCalculation = true
runPromptTokenCalculation = true,
modelTag = ""
) {
const stream = await func;
stream.start = Date.now();
@ -76,6 +78,7 @@ class LLMPerformanceMonitor {
total_tokens: 0,
outputTps: 0,
duration: 0,
...(modelTag ? { model: modelTag } : {}),
};
stream.endMeasurement = (reportedUsage = {}) => {
@ -88,6 +91,7 @@ class LLMPerformanceMonitor {
...stream.metrics,
...reportedUsage,
duration: reportedUsage?.duration ?? estimatedDuration,
timestamp: new Date(),
};
stream.metrics.total_tokens =
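
Putting the monitor-side change together with the provider diffs above, the streaming call pattern now looks roughly like this (a sketch, not a verbatim excerpt; `client` stands in for whichever SDK the provider wraps):

```js
// Inside a provider's streaming chat method:
const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
  client.chat.completions.create({
    model: this.model,
    stream: true,
    messages,
    temperature,
  }),
  messages,
  true, // runPromptTokenCalculation: estimate prompt_tokens locally
  this.model // modelTag: stamped onto stream.metrics as `model`
);
// When the stream finishes, the caller closes the measurement:
//   measuredStreamRequest.endMeasurement(usageReportedByProvider);
// and the resulting metrics carry `model` and a `timestamp` alongside
// the token counts, duration, and outputTps.
```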