fix: correct TPS calculation for Generic OpenAI provider with llama.cpp (#4981)
* add check for timings field on final chunk to override usage data

* refactor: extract llama.cpp timings into reusable private method

  Move timings extraction into #extractLlamaCppTimings so it can be shared by
  both the streaming (handleStream) and non-streaming (getChatCompletion)
  code paths.

* lint and cleanup

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
Parent: 5fb1281891
Commit: 1ccf468158
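Why the corrected TPS matters, as a hedged sketch before the diff: llama.cpp's OpenAI-compatible server reports its own generation statistics in a timings object, with predicted_n tokens generated over predicted_ms milliseconds (those two field names are exactly what this commit reads). All numeric values below are invented, and the assumption that result.duration is a client-side whole-request measurement is inferred from the commit's "generation-only" wording, not shown here.

    // Hypothetical final response from a llama.cpp server (values invented;
    // the predicted_n / predicted_ms field names are what this commit reads).
    const response = {
      usage: { prompt_tokens: 42, completion_tokens: 128, total_tokens: 170 },
      timings: { predicted_n: 128, predicted_ms: 3200 }, // generation-only stats
    };

    // Before: TPS divided completion_tokens by the whole request duration,
    // which presumably also counts prompt processing and network overhead.
    const wallClockSeconds = 5.1; // hypothetical result.duration
    const naiveTps = response.usage.completion_tokens / wallClockSeconds; // ~25.1

    // After: TPS uses the server-reported, generation-only timing instead.
    const generationSeconds = Number(response.timings.predicted_ms) / 1000; // 3.2
    const correctedTps =
      Number(response.timings.predicted_n) / generationSeconds; // 40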
@@ -149,6 +149,23 @@ class GenericOpenAiLLM {
     ];
   }
 
+  /**
+   * Extracts accurate generation-only timing and token count from a llama.cpp
+   * response or streaming chunk. Mutates the provided usage object in place
+   * so it can be used by both streaming and non-streaming code paths.
+   * @param {Object} response - the API response or final streaming chunk
+   * @param {Object} usage - the usage object to mutate
+   */
+  #extractLlamaCppTimings(response, usage) {
+    if (!response || !response.timings) return;
+
+    if (response.timings.hasOwnProperty("predicted_n"))
+      usage.completion_tokens = Number(response.timings.predicted_n);
+
+    if (response.timings.hasOwnProperty("predicted_ms"))
+      usage.duration = Number(response.timings.predicted_ms) / 1000;
+  }
+
   /**
    * Parses and prepends reasoning from the response and returns the full text response.
    * @param {Object} response
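To see the in-place mutation, here is the same logic lifted out of the class as a free function (private #-methods are not callable from outside the class), exercised with an invented payload:

    // Mirror of #extractLlamaCppTimings above, as a standalone sketch.
    function extractLlamaCppTimings(response, usage) {
      if (!response || !response.timings) return;
      if (response.timings.hasOwnProperty("predicted_n"))
        usage.completion_tokens = Number(response.timings.predicted_n);
      if (response.timings.hasOwnProperty("predicted_ms"))
        usage.duration = Number(response.timings.predicted_ms) / 1000;
    }

    // llama.cpp response: both fields get overridden (values invented).
    const usage = { completion_tokens: 0, duration: 5.1 };
    extractLlamaCppTimings(
      { timings: { predicted_n: 96, predicted_ms: 2400 } },
      usage
    );
    console.log(usage); // { completion_tokens: 96, duration: 2.4 }

    // Any other OpenAI-compatible backend: no timings field, usage untouched.
    extractLlamaCppTimings({ usage: { completion_tokens: 10 } }, usage);
    console.log(usage); // still { completion_tokens: 96, duration: 2.4 }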
@@ -184,15 +201,19 @@ class GenericOpenAiLLM {
     )
       return null;
 
+    const usage = {
+      prompt_tokens: result.output?.usage?.prompt_tokens || 0,
+      completion_tokens: result.output?.usage?.completion_tokens || 0,
+      total_tokens: result.output?.usage?.total_tokens || 0,
+      duration: result.duration,
+    };
+    this.#extractLlamaCppTimings(result.output, usage);
+
     return {
       textResponse: this.#parseReasoningFromResponse(result.output.choices[0]),
       metrics: {
-        prompt_tokens: result.output?.usage?.prompt_tokens || 0,
-        completion_tokens: result.output?.usage?.completion_tokens || 0,
-        total_tokens: result.output?.usage?.total_tokens || 0,
-        outputTps:
-          (result.output?.usage?.completion_tokens || 0) / result.duration,
-        duration: result.duration,
+        ...usage,
+        outputTps: usage.completion_tokens / usage.duration,
         model: this.model,
         provider: this.className,
         timestamp: new Date(),
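The refactor above builds usage first, lets the llama.cpp override run, then spreads it into metrics, so outputTps is always computed from the final values. A minimal sketch of the resulting object (numbers invented; the model and provider strings are placeholders for this.model and this.className):

    const usage = {
      prompt_tokens: 42,
      completion_tokens: 128,
      total_tokens: 170,
      duration: 3.2, // overridden from timings.predicted_ms when present
    };
    const metrics = {
      ...usage,
      outputTps: usage.completion_tokens / usage.duration, // 40 tokens/s
      model: "some-local-model", // placeholder for this.model
      provider: "GenericOpenAiLLM", // placeholder for this.className
      timestamp: new Date(),
    };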
@@ -332,6 +353,8 @@ class GenericOpenAiLLM {
         close: true,
         error: false,
       });
+      this.#extractLlamaCppTimings(chunk, usage);
+
       response.removeListener("close", handleAbort);
       stream?.endMeasurement(usage);
       resolve(fullText);
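In the streaming path the override runs as the stream is closed out because, per the commit message, the timings field arrives on the final chunk. A hedged sketch of what that last chunk might look like (shape and values invented; only the timings field names come from this commit):

    // Hypothetical final streaming chunk from a llama.cpp server.
    const finalChunk = {
      choices: [{ delta: {}, finish_reason: "stop" }],
      usage: { prompt_tokens: 42, completion_tokens: 128, total_tokens: 170 },
      timings: { predicted_n: 128, predicted_ms: 3200 },
    };
    // this.#extractLlamaCppTimings(finalChunk, usage) then overrides
    // usage.completion_tokens and usage.duration just before
    // stream?.endMeasurement(usage) computes the reported TPS.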