fix: correct TPS calculation for Generic OpenAI provider with llama.cpp (#4981)
* add check for timings field on final chunk to override usage data

* refactor: extract llama.cpp timings into reusable private method

  Move timings extraction into #extractLlamaCppTimings so it can be shared by
  both the streaming (handleStream) and non-streaming (getChatCompletion)
  code paths.

* lint and cleanup

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
Parent: 5fb1281891
Commit: 1ccf468158
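Why the corrected TPS matters, as a hedged sketch before the diff: llama.cpp's OpenAI-compatible server reports its own generation statistics in a timings object, with predicted_n tokens generated over predicted_ms milliseconds (those two field names are exactly what this commit reads). All numeric values below are invented, and the assumption that result.duration is a client-side whole-request measurement is inferred from the commit's "generation-only" wording, not shown here.

    // Hypothetical final response from a llama.cpp server (values invented;
    // the predicted_n / predicted_ms field names are what this commit reads).
    const response = {
      usage: { prompt_tokens: 42, completion_tokens: 128, total_tokens: 170 },
      timings: { predicted_n: 128, predicted_ms: 3200 }, // generation-only stats
    };

    // Before: TPS divided completion_tokens by the whole request duration,
    // which presumably also counts prompt processing and network overhead.
    const wallClockSeconds = 5.1; // hypothetical result.duration
    const naiveTps = response.usage.completion_tokens / wallClockSeconds; // ~25.1

    // After: TPS uses the server-reported, generation-only timing instead.
    const generationSeconds = Number(response.timings.predicted_ms) / 1000; // 3.2
    const correctedTps =
      Number(response.timings.predicted_n) / generationSeconds; // 40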
@@ -149,6 +149,23 @@ class GenericOpenAiLLM {
     ];
   }
 
+  /**
+   * Extracts accurate generation-only timing and token count from a llama.cpp
+   * response or streaming chunk. Mutates the provided usage object in place
+   * so it can be used by both streaming and non-streaming code paths.
+   * @param {Object} response - the API response or final streaming chunk
+   * @param {Object} usage - the usage object to mutate
+   */
+  #extractLlamaCppTimings(response, usage) {
+    if (!response || !response.timings) return;
+
+    if (response.timings.hasOwnProperty("predicted_n"))
+      usage.completion_tokens = Number(response.timings.predicted_n);
+
+    if (response.timings.hasOwnProperty("predicted_ms"))
+      usage.duration = Number(response.timings.predicted_ms) / 1000;
+  }
+
   /**
    * Parses and prepends reasoning from the response and returns the full text response.
    * @param {Object} response
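To see the in-place mutation, here is the same logic lifted out of the class as a free function (private #-methods are not callable from outside the class), exercised with an invented payload:

    // Mirror of #extractLlamaCppTimings above, as a standalone sketch.
    function extractLlamaCppTimings(response, usage) {
      if (!response || !response.timings) return;
      if (response.timings.hasOwnProperty("predicted_n"))
        usage.completion_tokens = Number(response.timings.predicted_n);
      if (response.timings.hasOwnProperty("predicted_ms"))
        usage.duration = Number(response.timings.predicted_ms) / 1000;
    }

    // llama.cpp response: both fields get overridden (values invented).
    const usage = { completion_tokens: 0, duration: 5.1 };
    extractLlamaCppTimings(
      { timings: { predicted_n: 96, predicted_ms: 2400 } },
      usage
    );
    console.log(usage); // { completion_tokens: 96, duration: 2.4 }

    // Any other OpenAI-compatible backend: no timings field, usage untouched.
    extractLlamaCppTimings({ usage: { completion_tokens: 10 } }, usage);
    console.log(usage); // still { completion_tokens: 96, duration: 2.4 }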
@@ -184,15 +201,19 @@ class GenericOpenAiLLM {
     )
       return null;
 
+    const usage = {
+      prompt_tokens: result.output?.usage?.prompt_tokens || 0,
+      completion_tokens: result.output?.usage?.completion_tokens || 0,
+      total_tokens: result.output?.usage?.total_tokens || 0,
+      duration: result.duration,
+    };
+    this.#extractLlamaCppTimings(result.output, usage);
+
     return {
       textResponse: this.#parseReasoningFromResponse(result.output.choices[0]),
       metrics: {
-        prompt_tokens: result.output?.usage?.prompt_tokens || 0,
-        completion_tokens: result.output?.usage?.completion_tokens || 0,
-        total_tokens: result.output?.usage?.total_tokens || 0,
-        outputTps:
-          (result.output?.usage?.completion_tokens || 0) / result.duration,
-        duration: result.duration,
+        ...usage,
+        outputTps: usage.completion_tokens / usage.duration,
         model: this.model,
         provider: this.className,
         timestamp: new Date(),
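The refactor above builds usage first, lets the llama.cpp override run, then spreads it into metrics, so outputTps is always computed from the final values. A minimal sketch of the resulting object (numbers invented; the model and provider strings are placeholders for this.model and this.className):

    const usage = {
      prompt_tokens: 42,
      completion_tokens: 128,
      total_tokens: 170,
      duration: 3.2, // overridden from timings.predicted_ms when present
    };
    const metrics = {
      ...usage,
      outputTps: usage.completion_tokens / usage.duration, // 40 tokens/s
      model: "some-local-model", // placeholder for this.model
      provider: "GenericOpenAiLLM", // placeholder for this.className
      timestamp: new Date(),
    };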
@@ -332,6 +353,8 @@ class GenericOpenAiLLM {
         close: true,
         error: false,
       });
+      this.#extractLlamaCppTimings(chunk, usage);
+
       response.removeListener("close", handleAbort);
       stream?.endMeasurement(usage);
       resolve(fullText);
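In the streaming path the override runs as the stream is closed out because, per the commit message, the timings field arrives on the final chunk. A hedged sketch of what that last chunk might look like (shape and values invented; only the timings field names come from this commit):

    // Hypothetical final streaming chunk from a llama.cpp server.
    const finalChunk = {
      choices: [{ delta: {}, finish_reason: "stop" }],
      usage: { prompt_tokens: 42, completion_tokens: 128, total_tokens: 170 },
      timings: { predicted_n: 128, predicted_ms: 3200 },
    };
    // this.#extractLlamaCppTimings(finalChunk, usage) then overrides
    // usage.completion_tokens and usage.duration just before
    // stream?.endMeasurement(usage) computes the reported TPS.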