diff --git a/server/utils/AiProviders/openRouter/index.js b/server/utils/AiProviders/openRouter/index.js
index c8e12748..28d4306b 100644
--- a/server/utils/AiProviders/openRouter/index.js
+++ b/server/utils/AiProviders/openRouter/index.js
@@ -304,13 +304,9 @@ class OpenRouterLLM {
         user: user?.id ? `user_${user.id}` : "",
       }),
       messages,
-      // We have to manually count the tokens
-      // OpenRouter has a ton of providers and they all can return slightly differently
-      // some return chunk.usage on STOP, some do it after stop, its inconsistent.
-      // So it is possible reported metrics are inaccurate since we cannot reliably
-      // catch the metrics before resolving the stream - so we just pretend this functionality
-      // is not available.
-      runPromptTokenCalculation: true,
+      // OpenRouter returns the usage in the stream as the very last chunk **after** the finish reason,
+      // so we don't need to run the prompt token calculation ourselves.
+      runPromptTokenCalculation: false,
       modelTag: this.model,
       provider: this.className,
     });
@@ -320,6 +316,8 @@ class OpenRouterLLM {
 
   /**
    * Handles the default stream response for a chat.
+   * - Handles the weird OpenRouter timeout behavior where the stream never self-closes.
+   * - Handles the usage metrics being returned in the stream as the very last chunk **after** the finish reason.
    * @param {import("express").Response} response
    * @param {import('../../helpers/chat/LLMPerformanceMonitor').MonitoredStream} stream
    * @param {Object} responseProps
@@ -328,6 +326,8 @@ class OpenRouterLLM {
   handleStream(response, stream, responseProps) {
     const timeoutThresholdMs = this.timeout;
     const { uuid = uuidv4(), sources = [] } = responseProps;
+    let hasUsageMetrics = false;
+    let usage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
 
     return new Promise(async (resolve) => {
       let fullText = "";
@@ -336,14 +336,8 @@
       let pplxCitations = []; // Array of inline citations for Perplexity models (if applicable)
       let isPerplexity = this.isPerplexityModel;
 
-      // Establish listener to early-abort a streaming response
-      // in case things go sideways or the user does not like the response.
-      // We preserve the generated text but continue as if chat was completed
-      // to preserve previously generated content.
       const handleAbort = () => {
-        stream?.endMeasurement({
-          completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
-        });
+        stream?.endMeasurement(usage);
         clientAbortedHandler(resolve, fullText);
       };
       response.on("close", handleAbort);
@@ -375,9 +369,7 @@
           });
           clearInterval(timeoutCheck);
           response.removeListener("close", handleAbort);
-          stream?.endMeasurement({
-            completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
-          });
+          stream?.endMeasurement(usage);
           resolve(fullText);
         }
       }, 500);
@@ -389,6 +381,15 @@
           const reasoningToken = message?.delta?.reasoning;
           lastChunkTime = Number(new Date());
 
+          if (chunk.hasOwnProperty("usage") && !hasUsageMetrics) {
+            hasUsageMetrics = true;
+            usage = {
+              prompt_tokens: chunk.usage.prompt_tokens,
+              completion_tokens: chunk.usage.completion_tokens,
+              total_tokens: chunk.usage.total_tokens,
+            };
+          }
+
           // Some models will return citations (e.g. Perplexity) - we should preserve them for inline citations if applicable.
           if (
             isPerplexity &&
@@ -464,7 +465,7 @@
             });
           }
 
-          if (message.finish_reason !== null) {
+          if (message?.finish_reason) {
             writeResponseChunk(response, {
               uuid,
               sources,
@@ -473,14 +474,14 @@
               close: true,
               error: false,
             });
-            response.removeListener("close", handleAbort);
-            clearInterval(timeoutCheck);
-            stream?.endMeasurement({
-              completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
-            });
-            resolve(fullText);
           }
         }
+
+        // Stream completed naturally - resolve with final metrics
+        response.removeListener("close", handleAbort);
+        clearInterval(timeoutCheck);
+        stream?.endMeasurement(usage);
+        resolve(fullText);
       } catch (e) {
         writeResponseChunk(response, {
           uuid,
@@ -492,9 +493,7 @@
         });
         response.removeListener("close", handleAbort);
         clearInterval(timeoutCheck);
-        stream?.endMeasurement({
-          completion_tokens: LLMPerformanceMonitor.countTokens(fullText),
-        });
+        stream?.endMeasurement(usage);
         resolve(fullText);
       }
     });
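
The behavior this patch relies on can be reproduced outside AnythingLLM. Below is a minimal sketch (not part of the patch), assuming the `openai` Node SDK pointed at OpenRouter's OpenAI-compatible endpoint; the model slug and the `OPENROUTER_API_KEY` environment variable are placeholders. Because the usage chunk arrives *after* the chunk carrying `finish_reason`, the consumer must drain the stream fully rather than returning at `finish_reason`, which is exactly why the patch hoists `resolve()` out of the `for await` loop:

```js
// Minimal sketch: capture OpenRouter's trailing usage chunk from a stream.
// Assumptions: `openai` Node SDK, OpenRouter's OpenAI-compatible base URL,
// placeholder model slug and OPENROUTER_API_KEY env var.
const OpenAI = require("openai");

const client = new OpenAI({
  baseURL: "https://openrouter.ai/api/v1",
  apiKey: process.env.OPENROUTER_API_KEY,
});

async function main() {
  const stream = await client.chat.completions.create({
    model: "openai/gpt-4o-mini", // placeholder model slug
    messages: [{ role: "user", content: "Say hello." }],
    stream: true,
  });

  let fullText = "";
  let usage = null;

  for await (const chunk of stream) {
    const message = chunk?.choices?.[0];
    fullText += message?.delta?.content || "";

    // Do NOT return when finish_reason appears - the usage chunk
    // is sent after it, so keep iterating until the stream ends.
    if (chunk.usage) usage = chunk.usage;
  }

  // Only here, after the stream is fully drained, are the metrics reliable.
  console.log(fullText, usage);
}

main();
```

Resolving after the loop also means the 500ms `timeoutCheck` sweep remains the only early exit, so real provider-reported usage is recorded whenever it was sent, and the zeroed defaults are reported otherwise.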