4601 log model on response (#4781)
* add model tag to chatCompletion
* add modelTag `model` to async streaming; keeps default arguments for prompt token calculation where applied via explicit arg
* fix HF default arg
* render all performance metrics as available for backward compatibility; add `timestamp` to both sync/async chat methods
* extract metrics string to function
parent bc3ad06de4
commit 664f466e3f
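The same pattern repeats across every provider in the diff below: the synchronous chat completion now stamps its metrics payload with the model and a timestamp, and the streaming path forwards the model tag (plus the previously implicit prompt-token flag) into the performance monitor. An illustrative sketch only — the provider class, `this.client`, and `measureAsyncFunction` are placeholders/assumptions, not part of this diff:

// Sketch: not part of the commit. Assumes LLMPerformanceMonitor is required
// from the shared chat helpers and exposes measureAsyncFunction/measureStream.
class ExampleProviderLLM {
  async getChatCompletion(messages = []) {
    const result = await LLMPerformanceMonitor.measureAsyncFunction(
      this.client.chat.completions.create({ model: this.model, messages })
    );
    return {
      textResponse: result.output.choices[0].message.content,
      metrics: {
        completion_tokens: result.output.usage?.completion_tokens || 0,
        total_tokens: result.output.usage?.total_tokens || 0,
        outputTps: (result.output.usage?.completion_tokens || 0) / result.duration,
        duration: result.duration,
        model: this.model,     // new in this commit
        timestamp: new Date(), // new in this commit
      },
    };
  }

  async streamGetChatCompletion(messages = []) {
    // The prompt-token flag is now passed explicitly and the model tag is the
    // new trailing argument picked up by the monitor.
    return await LLMPerformanceMonitor.measureStream(
      this.client.chat.completions.create({ model: this.model, messages, stream: true }),
      messages,
      true,
      this.model
    );
  }
}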
@@ -1,3 +1,4 @@
+import { formatDateTimeAsMoment } from "@/utils/directories";
 import { numberWithCommas } from "@/utils/numbers";
 import React, { useEffect, useState, useContext } from "react";
 const MetricsContext = React.createContext();
@@ -41,6 +42,26 @@ function getAutoShowMetrics() {
   return window?.localStorage?.getItem(SHOW_METRICS_KEY) === "true";
 }

+/**
+ * Build the metrics string for a given metrics object
+ * - Model name
+ * - Duration and output TPS
+ * - Timestamp
+ * @param {metrics: {duration:number, outputTps: number, model?: string, timestamp?: number}} metrics
+ * @returns {string}
+ */
+function buildMetricsString(metrics = {}) {
+  return [
+    metrics?.model ? metrics.model : "",
+    `${formatDuration(metrics.duration)} (${formatTps(metrics.outputTps)} tok/s)`,
+    metrics?.timestamp
+      ? formatDateTimeAsMoment(metrics.timestamp, "MMM D, h:mm A")
+      : "",
+  ]
+    .filter(Boolean)
+    .join(" · ");
+}
+
 /**
  * Toggle the show metrics setting in localStorage `anythingllm_show_chat_metrics` key
  * @returns {void}
@@ -88,7 +109,7 @@ export function MetricsProvider({ children }) {

 /**
  * Render the metrics for a given chat, if available
- * @param {metrics: {duration:number, outputTps: number}} props
+ * @param {metrics: {duration:number, outputTps: number, model: string, timestamp: number}} props
  * @returns
  */
 export default function RenderMetrics({ metrics = {} }) {
@@ -110,8 +131,7 @@ export default function RenderMetrics({ metrics = {} }) {
       className={`border-none flex justify-end items-center gap-x-[8px] ${showMetricsAutomatically ? "opacity-100" : "opacity-0"} md:group-hover:opacity-100 transition-all duration-300`}
     >
       <p className="cursor-pointer text-xs font-mono text-theme-text-secondary opacity-50">
-        {formatDuration(metrics.duration)} ({formatTps(metrics.outputTps)}{" "}
-        tok/s)
+        {buildMetricsString(metrics)}
       </p>
     </button>
   );
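For context on the frontend change above: a fully populated metrics object now renders as one dot-separated line, while older chat records that lack the new fields keep rendering only the duration/TPS segment. Illustrative values only — the exact strings depend on formatDuration, formatTps, and formatDateTimeAsMoment:

// Hypothetical inputs/outputs, assuming formatDuration -> "4.2s" and formatTps -> "38.1"
buildMetricsString({ duration: 4.2, outputTps: 38.1, model: "gpt-4o", timestamp: Date.now() });
// => "gpt-4o · 4.2s (38.1 tok/s) · Mar 3, 2:15 PM"
buildMetricsString({ duration: 4.2, outputTps: 38.1 }); // older chat without the new fields
// => "4.2s (38.1 tok/s)"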
@@ -171,6 +171,8 @@ class AnthropicLLM {
         total_tokens: promptTokens + completionTokens,
         outputTps: completionTokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   } catch (error) {
@@ -190,7 +192,8 @@ class AnthropicLLM {
         temperature: Number(temperature ?? this.defaultTemp),
       }),
       messages,
-      false
+      false,
+      this.model
     );

     return measuredStreamRequest;
@@ -220,6 +220,8 @@ class ApiPieLLM {
         outputTps:
           (result.output.usage?.completion_tokens || 0) / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -237,7 +239,9 @@ class ApiPieLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -174,6 +174,8 @@ class AzureOpenAiLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -192,7 +194,9 @@ class AzureOpenAiLLM {
         n: 1,
         stream: true,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );

     return measuredStreamRequest;
@@ -423,9 +423,7 @@ class AWSBedrockLLM {
           );
         }
         throw new Error(`AWSBedrock::getChatCompletion failed. ${e.message}`);
-      }),
-      messages,
-      false
+      })
     );

     const response = result.output;
@@ -450,6 +448,8 @@ class AWSBedrockLLM {
         total_tokens: response?.usage?.totalTokens ?? 0,
         outputTps: outputTps,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -492,7 +492,8 @@ class AWSBedrockLLM {
       const measuredStreamRequest = await LLMPerformanceMonitor.measureStream(
         stream,
         messages,
-        false // Indicate it's not a function call measurement
+        false,
+        this.model
       );
       return measuredStreamRequest;
     } catch (e) {
@@ -124,6 +124,8 @@ class CohereLLM {
         total_tokens: promptTokens + completionTokens,
         outputTps: completionTokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -139,7 +141,8 @@ class CohereLLM {
         temperature,
       }),
       messages,
-      false
+      false,
+      this.model
     );

     return measuredStreamRequest;
@@ -225,6 +225,8 @@ class CometApiLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -242,7 +244,9 @@ class CometApiLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -130,6 +130,8 @@ class DeepSeekLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -148,7 +150,8 @@ class DeepSeekLLM {
         temperature,
       }),
       messages,
-      false
+      false,
+      this.model
     );

     return measuredStreamRequest;
@@ -165,6 +165,8 @@ class DellProAiStudioLLM {
         total_tokens: result.output.usage?.total_tokens || 0,
         outputTps: result.output.usage?.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -182,7 +184,9 @@ class DellProAiStudioLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -163,6 +163,8 @@ class FireworksAiLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -181,7 +183,8 @@ class FireworksAiLLM {
         temperature,
       }),
       messages,
-      false
+      false,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -224,6 +224,8 @@ class FoundryLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -242,7 +244,9 @@ class FoundryLLM {
         temperature,
         max_completion_tokens: this.promptWindowLimit(),
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -405,6 +405,8 @@ class GeminiLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -421,7 +423,8 @@ class GeminiLLM {
         },
       }),
       messages,
-      false
+      false,
+      this.model
     );

     return measuredStreamRequest;
@@ -193,6 +193,8 @@ class GenericOpenAiLLM {
         outputTps:
           (result.output?.usage?.completion_tokens || 0) / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -206,9 +208,10 @@ class GenericOpenAiLLM {
         temperature,
         max_tokens: this.maxTokens,
       }),
-      messages
+      messages,
       // runPromptTokenCalculation: true - There is not way to know if the generic provider connected is returning
       // the correct usage metrics if any at all since any provider could be connected.
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -170,6 +170,8 @@ class GiteeAILLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -183,7 +185,8 @@ class GiteeAILLM {
         temperature,
       }),
       messages,
-      false
+      false,
+      this.model
     );

     return measuredStreamRequest;
@@ -203,6 +203,8 @@ class GroqLLM {
           result.output.usage.completion_tokens /
           result.output.usage.completion_time,
         duration: result.output.usage.total_time,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -221,7 +223,8 @@ class GroqLLM {
         temperature,
       }),
       messages,
-      false
+      false,
+      this.model
     );

     return measuredStreamRequest;
@@ -117,6 +117,8 @@ class HuggingFaceLLM {
         outputTps:
           (result.output.usage?.completion_tokens || 0) / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -129,7 +131,9 @@ class HuggingFaceLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -160,6 +160,8 @@ class KoboldCPPLLM {
         total_tokens: promptTokens + completionTokens,
         outputTps: completionTokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -173,7 +175,9 @@ class KoboldCPPLLM {
         temperature,
         max_tokens: this.maxTokens,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -154,6 +154,8 @@ class LiteLLM {
         outputTps:
           (result.output.usage?.completion_tokens || 0) / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -167,9 +169,10 @@ class LiteLLM {
         temperature,
         max_tokens: parseInt(this.maxTokens), // LiteLLM requires int
       }),
-      messages
+      messages,
       // runPromptTokenCalculation: true - We manually count the tokens because they may or may not be provided in the stream
       // responses depending on LLM connected. If they are provided, then we counted for nothing, but better than nothing.
+      true,
+      this.model
     );

     return measuredStreamRequest;
@@ -234,6 +234,8 @@ class LMStudioLLM {
         total_tokens: result.output.usage?.total_tokens || 0,
         outputTps: result.output.usage?.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -251,7 +253,9 @@ class LMStudioLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -145,6 +145,8 @@ class LocalAiLLM {
         total_tokens: promptTokens + completionTokens,
         outputTps: completionTokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -162,7 +164,9 @@ class LocalAiLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -139,6 +139,8 @@ class MistralLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -157,7 +159,8 @@ class MistralLLM {
         temperature,
       }),
       messages,
-      false
+      false,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -136,6 +136,8 @@ class MoonshotAiLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -148,7 +150,9 @@ class MoonshotAiLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );

     return measuredStreamRequest;
@@ -225,6 +225,8 @@ class NovitaLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -242,7 +244,9 @@ class NovitaLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -184,6 +184,8 @@ class NvidiaNimLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -201,7 +203,9 @@ class NvidiaNimLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -305,6 +305,8 @@ class OllamaAILLM {
         outputTps:
           result.output.usage.completion_tokens / result.output.usage.duration,
         duration: result.output.usage.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -326,7 +328,8 @@ class OllamaAILLM {
         },
       }),
       messages,
-      false
+      false,
+      this.model
     ).catch((e) => {
       throw this.#errorHandler(e);
     });
@@ -175,6 +175,8 @@ class OpenAiLLM {
           ? usage.output_tokens / result.duration
           : 0,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -194,7 +196,8 @@ class OpenAiLLM {
         temperature: this.#temperature(this.model, temperature),
       }),
       messages,
-      false
+      false,
+      this.model
     );

     return measuredStreamRequest;
@@ -276,6 +276,8 @@ class OpenRouterLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -300,13 +302,15 @@ class OpenRouterLLM {
         include_reasoning: true,
         user: user?.id ? `user_${user.id}` : "",
       }),
-      messages
+      messages,
       // We have to manually count the tokens
       // OpenRouter has a ton of providers and they all can return slightly differently
       // some return chunk.usage on STOP, some do it after stop, its inconsistent.
       // So it is possible reported metrics are inaccurate since we cannot reliably
       // catch the metrics before resolving the stream - so we just pretend this functionality
      // is not available.
+      true,
+      this.model
     );

     return measuredStreamRequest;
@@ -117,6 +117,8 @@ class PerplexityLLM {
         total_tokens: result.output.usage?.total_tokens || 0,
         outputTps: result.output.usage?.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -134,7 +136,9 @@ class PerplexityLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -176,6 +176,8 @@ class PPIOLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -193,7 +195,9 @@ class PPIOLLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -150,6 +150,8 @@ class TextGenWebUILLM {
         total_tokens: result.output.usage?.total_tokens || 0,
         outputTps: result.output.usage?.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -162,7 +164,9 @@ class TextGenWebUILLM {
         messages,
         temperature,
       }),
-      messages
+      messages,
+      true,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -209,6 +209,8 @@ class TogetherAiLLM {
         total_tokens: result.output.usage?.total_tokens || 0,
         outputTps: result.output.usage?.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -227,7 +229,8 @@ class TogetherAiLLM {
         temperature,
       }),
       messages,
-      false
+      false,
+      this.model
     );
     return measuredStreamRequest;
   }
@@ -147,6 +147,8 @@ class XAiLLM {
         total_tokens: result.output.usage.total_tokens || 0,
         outputTps: result.output.usage.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -165,7 +167,8 @@ class XAiLLM {
         temperature,
       }),
       messages,
-      false
+      false,
+      this.model
     );

     return measuredStreamRequest;
@@ -136,6 +136,8 @@ class ZAiLLM {
         total_tokens: result.output.usage?.total_tokens || 0,
         outputTps: result.output.usage?.completion_tokens / result.duration,
         duration: result.duration,
+        model: this.model,
+        timestamp: new Date(),
       },
     };
   }
@@ -149,7 +151,8 @@ class ZAiLLM {
         temperature,
       }),
       messages,
-      false
+      false,
+      this.model
     );

     return measuredStreamRequest;
@@ -59,13 +59,15 @@ class LLMPerformanceMonitor {
    * Also attaches an `endMeasurement` method to the stream that will calculate the duration of the stream and metrics.
    * @param {Promise<OpenAICompatibleStream>} func
    * @param {Messages} messages - the messages sent to the LLM so we can calculate the prompt tokens since most providers do not return this on stream
-   * @param {boolean} runPromptTokenCalculation - whether to run the prompt token calculation to estimate the `prompt_tokens` metric. This is useful for providers that do not return this on stream.
+   * @param {boolean} runPromptTokenCalculation - [default: true] whether to run the prompt token calculation to estimate the `prompt_tokens` metric. This is useful for providers that do not return this on stream.
+   * @param {string} modelTag - the tag of the model that was used to generate the stream (eg: gpt-4o, claude-3-5-sonnet, qwen3/72b-instruct, etc.)
    * @returns {Promise<MonitoredStream>}
    */
   static async measureStream(
     func,
     messages = [],
-    runPromptTokenCalculation = true
+    runPromptTokenCalculation = true,
+    modelTag = ""
   ) {
     const stream = await func;
     stream.start = Date.now();
@@ -76,6 +78,7 @@ class LLMPerformanceMonitor {
       total_tokens: 0,
       outputTps: 0,
       duration: 0,
+      ...(modelTag ? { model: modelTag } : {}),
     };

     stream.endMeasurement = (reportedUsage = {}) => {
@@ -88,6 +91,7 @@ class LLMPerformanceMonitor {
         ...stream.metrics,
         ...reportedUsage,
         duration: reportedUsage?.duration ?? estimatedDuration,
+        timestamp: new Date(),
       };

       stream.metrics.total_tokens =
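Taken together, a provider-side call into the monitor now looks roughly like the sketch below. This is illustrative only: the OpenAI-style client object and the reported usage values are placeholders, and argument names follow the JSDoc above.

// Sketch: not part of the commit.
const monitored = await LLMPerformanceMonitor.measureStream(
  client.chat.completions.create({ model: "gpt-4o", messages, stream: true }),
  messages,
  true,     // runPromptTokenCalculation, now always passed explicitly
  "gpt-4o"  // modelTag, surfaced on the metrics object as `model`
);

for await (const chunk of monitored) {
  // forward chunks to the response as before
}

monitored.endMeasurement({ completion_tokens: 128 });
// monitored.metrics now carries duration, outputTps, token counts,
// model: "gpt-4o", and the new timestamp set when measurement ended.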