Add tokenizer improvements via Singleton class and estimation (#3072)
* Add tokenizer improvements via Singleton class + linting
* dev build
* Estimation fallback when string exceeds a fixed byte size
* Add notice to tiktoken on backend
This commit is contained in:
parent e1af72daa7
commit d1ca16f7f8
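Note on the estimation fallback: with the constants introduced below (MAX_KB_ESTIMATE = 10, DIVISOR = 8, and the "1 character is roughly 2 bytes" assumption), any string of 5,120 characters or more skips real tiktoken encoding and is estimated instead. A minimal sketch of that math, mirroring the new TikTokenTokenizer class further down in this diff:

const MAX_KB_ESTIMATE = 10;
const DIVISOR = 8;
const input = "x".repeat(100000); // e.g. a very large scraped page
const kbEstimate = Math.floor((input.length * 2) / 1024); // 195 KB estimate
const tooLong = kbEstimate >= MAX_KB_ESTIMATE; // true, so encoding is skipped
const tokenEstimate = Math.ceil(input.length / DIVISOR); // 12500 tokens, returned directly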
.github/workflows/dev-build.yaml (vendored, 2 lines changed)
@@ -6,7 +6,7 @@ concurrency:
 on:
   push:
-    branches: ['agent-ui-animations'] # put your current branch to create a build. Core team only.
+    branches: ['3069-tokenizer-collector-improvements'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'
@@ -41,7 +41,7 @@ async function scrapeGenericUrl(link, textOnly = false) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
@@ -55,7 +55,7 @@ async function processRawText(textContent, metadata) {
     published: METADATA_KEYS.possible.published(metadata),
     wordCount: textContent.split(" ").length,
     pageContent: textContent,
-    token_count_estimate: tokenizeString(textContent).length,
+    token_count_estimate: tokenizeString(textContent),
   };

   const document = writeToServerDocuments(
@@ -56,7 +56,7 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
@@ -42,7 +42,7 @@ async function asDocX({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
@@ -40,7 +40,7 @@ async function asEPub({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
@@ -53,7 +53,7 @@ async function asMbox({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   item++;
@@ -38,7 +38,7 @@ async function asOfficeMime({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
@@ -49,7 +49,7 @@ async function asPdf({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
@@ -38,7 +38,7 @@ async function asTxt({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
@@ -67,7 +67,7 @@ async function asXlsx({ fullFilePath = "", filename = "" }) {
     published: createdDate(fullFilePath),
     wordCount: content.split(/\s+/).length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   const document = writeToServerDocuments(
@@ -104,7 +104,7 @@ async function loadConfluence(
     published: new Date().toLocaleString(),
     wordCount: doc.pageContent.split(" ").length,
     pageContent: doc.pageContent,
-    token_count_estimate: tokenizeString(doc.pageContent).length,
+    token_count_estimate: tokenizeString(doc.pageContent),
   };

   console.log(
@@ -66,7 +66,7 @@ async function loadGithubRepo(args, response) {
     published: new Date().toLocaleString(),
     wordCount: doc.pageContent.split(" ").length,
     pageContent: doc.pageContent,
-    token_count_estimate: tokenizeString(doc.pageContent).length,
+    token_count_estimate: tokenizeString(doc.pageContent),
   };
   console.log(
     `[Github Loader]: Saving ${doc.metadata.source} to ${outFolder}`
@@ -82,7 +82,7 @@ async function loadGitlabRepo(args, response) {
   }

   data.wordCount = pageContent.split(" ").length;
-  data.token_count_estimate = tokenizeString(pageContent).length;
+  data.token_count_estimate = tokenizeString(pageContent);
   data.pageContent = pageContent;

   console.log(
@@ -122,7 +122,7 @@ async function bulkScrapePages(links, outFolderPath) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   writeToServerDocuments(data, data.title, outFolderPath);
@@ -115,7 +115,7 @@ async function loadYouTubeTranscript({ url }) {
     published: new Date().toLocaleString(),
     wordCount: content.split(" ").length,
     pageContent: content,
-    token_count_estimate: tokenizeString(content).length,
+    token_count_estimate: tokenizeString(content),
   };

   console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);
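Every converter/loader hunk above is the same mechanical change: the collector's tokenizeString helper now returns a token count (a number) instead of the encoded token array, so call sites drop the trailing .length. A before/after sketch (the require path varies per file and is shown here only for illustration):

const { tokenizeString } = require("../../utils/tokenizer"); // illustrative path
// Before: the helper returned the encoded token array.
const estimateBefore = tokenizeString(content).length;
// After: the helper returns a number directly (an exact count,
// or a length-based estimate for very large strings).
const estimateAfter = tokenizeString(content);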
@@ -1,15 +1,66 @@
 const { getEncoding } = require("js-tiktoken");

-function tokenizeString(input = "") {
-  try {
-    const encoder = getEncoding("cl100k_base");
-    return encoder.encode(input);
-  } catch (e) {
-    console.error("Could not tokenize string!");
-    return [];
-  }
-}
+class TikTokenTokenizer {
+  static MAX_KB_ESTIMATE = 10;
+  static DIVISOR = 8;
+
+  constructor() {
+    if (TikTokenTokenizer.instance) {
+      this.log(
+        "Singleton instance already exists. Returning existing instance."
+      );
+      return TikTokenTokenizer.instance;
+    }
+
+    this.encoder = getEncoding("cl100k_base");
+    TikTokenTokenizer.instance = this;
+    this.log("Initialized new TikTokenTokenizer instance.");
+  }
+
+  log(text, ...args) {
+    console.log(`\x1b[35m[TikTokenTokenizer]\x1b[0m ${text}`, ...args);
+  }
+
+  /**
+   * Check if the input is too long to encode
+   * this is more of a rough estimate and a sanity check to prevent
+   * CPU issues from encoding too large of strings
+   * Assumes 1 character = 2 bytes in JS
+   * @param {string} input
+   * @returns {boolean}
+   */
+  #isTooLong(input) {
+    const bytesEstimate = input.length * 2;
+    const kbEstimate = Math.floor(bytesEstimate / 1024);
+    return kbEstimate >= TikTokenTokenizer.MAX_KB_ESTIMATE;
+  }
+
+  /**
+   * Encode a string into tokens for rough token count estimation.
+   * @param {string} input
+   * @returns {number}
+   */
+  tokenizeString(input = "") {
+    try {
+      if (this.#isTooLong(input)) {
+        this.log("Input will take too long to encode - estimating");
+        return Math.ceil(input.length / TikTokenTokenizer.DIVISOR);
+      }
+
+      return this.encoder.encode(input).length;
+    } catch (e) {
+      this.log("Could not tokenize string! Estimating...", e.message, e.stack);
+      return Math.ceil(input?.length / TikTokenTokenizer.DIVISOR) || 0;
+    }
+  }
+}

+const tokenizer = new TikTokenTokenizer();
 module.exports = {
-  tokenizeString,
+  /**
+   * Encode a string into tokens for rough token count estimation.
+   * @param {string} input
+   * @returns {number}
+   */
+  tokenizeString: (input) => tokenizer.tokenizeString(input),
 };
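For context, a rough usage sketch of the new collector tokenizer (paths and values illustrative): because TikTokenTokenizer is a singleton, the cl100k_base encoder is built once per process no matter how many converters require the module.

const { tokenizeString } = require("../../utils/tokenizer"); // illustrative path
tokenizeString("hello world"); // small input: real encode, returns the token count
tokenizeString("x".repeat(1000000)); // huge input: logs a notice, returns Math.ceil(1000000 / 8)
// Constructing the class again elsewhere just returns the existing instance,
// so the expensive getEncoding("cl100k_base") call only ever happens once.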
@@ -4,7 +4,10 @@ const {
 } = require("../../helpers/chat/LLMPerformanceMonitor");
 const { v4: uuidv4 } = require("uuid");
 const { MODEL_MAP } = require("../modelMap");
-const { writeResponseChunk, clientAbortedHandler } = require("../../helpers/chat/responses");
+const {
+  writeResponseChunk,
+  clientAbortedHandler,
+} = require("../../helpers/chat/responses");

 class DeepSeekLLM {
   constructor(embedder = null, modelPreference = null) {
@@ -1,10 +1,36 @@
 const { getEncodingNameForModel, getEncoding } = require("js-tiktoken");

+/**
+ * @class TokenManager
+ *
+ * @notice
+ * We cannot do estimation of tokens here like we do in the collector
+ * because we need to know the model to do it.
+ * Other issues are we also do reverse tokenization here for the chat history during cannonballing.
+ * So here we are stuck doing the actual tokenization and encoding until we figure out what to do with prompt overflows.
+ */
 class TokenManager {
+  static instance = null;
+  static currentModel = null;
+
   constructor(model = "gpt-3.5-turbo") {
+    if (TokenManager.instance && TokenManager.currentModel === model) {
+      this.log("Returning existing instance for model:", model);
+      return TokenManager.instance;
+    }
+
     this.model = model;
     this.encoderName = this.#getEncodingFromModel(model);
     this.encoder = getEncoding(this.encoderName);
+
+    TokenManager.instance = this;
+    TokenManager.currentModel = model;
+    this.log("Initialized new TokenManager instance for model:", model);
+    return this;
   }

+  log(text, ...args) {
+    console.log(`\x1b[35m[TokenManager]\x1b[0m ${text}`, ...args);
+  }
+
   #getEncodingFromModel(model) {
@@ -15,9 +41,11 @@ class TokenManager {
     }
   }

-  // Pass in an empty array of disallowedSpecials to handle all tokens as text and to be tokenized.
-  // https://github.com/openai/tiktoken/blob/9e79899bc248d5313c7dd73562b5e211d728723d/tiktoken/core.py#L91C20-L91C38
-  // Returns number[]
+  /**
+   * Pass in an empty array of disallowedSpecials to handle all tokens as text and to be tokenized.
+   * @param {string} input
+   * @returns {number[]}
+   */
   tokensFromString(input = "") {
     try {
       const tokens = this.encoder.encode(String(input), undefined, []);
@@ -28,17 +56,31 @@ class TokenManager {
     }
   }

+  /**
+   * Converts an array of tokens back to a string.
+   * @param {number[]} tokens
+   * @returns {string}
+   */
   bytesFromTokens(tokens = []) {
     const bytes = this.encoder.decode(tokens);
     return bytes;
   }

-  // Returns number
+  /**
+   * Counts the number of tokens in a string.
+   * @param {string} input
+   * @returns {number}
+   */
   countFromString(input = "") {
     const tokens = this.tokensFromString(input);
     return tokens.length;
   }

+  /**
+   * Estimates the number of tokens in a string or array of strings.
+   * @param {string | string[]} input
+   * @returns {number}
+   */
   statsFrom(input) {
     if (typeof input === "string") return this.countFromString(input);
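As the @notice above says, the server-side TokenManager keeps real encode/decode rather than estimation, because prompt-overflow handling needs to turn token windows back into text. A rough, illustrative round trip with the class shown in this diff (the slice-based truncation here is a sketch, not the actual overflow code):

const manager = new TokenManager("gpt-3.5-turbo"); // reuses the cached instance for this model
const chatHistoryText = "..."; // hypothetical long chat history string
const tokens = manager.tokensFromString(chatHistoryText); // number[]
const count = manager.countFromString(chatHistoryText); // tokens.length
// Reverse tokenization: keep only the most recent window of tokens and
// decode it back into a string.
const truncated = manager.bytesFromTokens(tokens.slice(-4096));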