* WIP on embedder selection TODO: apply splitting and query prefixes (if applicable) * wip on upsert * Support base model support nomic-text-embed-v1 support multilingual-e5-small Add prefixing for both embedding and query for RAG tasks Add chunking prefix to all vector dbs to apply prefix when possible Show dropdown and auto-pull on new selection * norm translations * move supported models to constants handle null selection or invalid selection on dropdown update comments * dev * patch text splitter maximums for now * normalize translations * add tests for splitter functionality * normalize --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
64 lines
2.5 KiB
JavaScript
64 lines
2.5 KiB
JavaScript
/**
 * Registry of the embedding models supported by the native embedder,
 * keyed by model id.
 *
 * Fields on each entry:
 * - `maxConcurrentChunks`: presumably the number of chunks embedded per
 *   batch (NOTE(review): confirm against the embedder that reads it).
 * - `embeddingMaxChunkLength`: maximum chunk size in CHARACTERS, not tokens.
 *   The value is roughly the model card's token limit multiplied by an
 *   assumed 2 characters per token, rounded down to undershoot.
 * - `chunkPrefix` / `queryPrefix`: task prefixes for document chunks and for
 *   queries respectively; an empty string means no prefix.
 *   (NOTE(review): confirm where the prefixes are applied.)
 * - `apiInfo`: descriptive metadata about the model (id, display name,
 *   description, language, download size, model card URL).
 */
const SUPPORTED_NATIVE_EMBEDDING_MODELS = {
  "Xenova/all-MiniLM-L6-v2": {
    maxConcurrentChunks: 25,
    // Character limit, NOT a token limit. The model card states a maximum of
    // 512 tokens; at roughly 2 characters per token (undershooting) that is
    // overridden to 1,000 characters per pass.
    embeddingMaxChunkLength: 1_000,
    chunkPrefix: "",
    queryPrefix: "",
    apiInfo: {
      id: "Xenova/all-MiniLM-L6-v2",
      name: "all-MiniLM-L6-v2",
      description:
        "A lightweight and fast model for embedding text. The default model for AnythingLLM.",
      lang: "English",
      size: "23MB",
      modelCard: "https://huggingface.co/Xenova/all-MiniLM-L6-v2",
    },
  },
  "Xenova/nomic-embed-text-v1": {
    maxConcurrentChunks: 5,
    // Character limit, NOT a token limit. The model card states a maximum of
    // 8192 tokens; at roughly 2 characters per token (undershooting) that is
    // overridden to 16,000 characters per pass.
    embeddingMaxChunkLength: 16_000,
    chunkPrefix: "search_document: ",
    queryPrefix: "search_query: ",
    apiInfo: {
      id: "Xenova/nomic-embed-text-v1",
      name: "nomic-embed-text-v1",
      description:
        "A high-performing open embedding model with a large token context window. Requires more processing power and memory.",
      lang: "English",
      size: "139MB",
      modelCard: "https://huggingface.co/Xenova/nomic-embed-text-v1",
    },
  },
  "MintplexLabs/multilingual-e5-small": {
    maxConcurrentChunks: 5,
    // Character limit, NOT a token limit. The model card states a maximum of
    // 512 tokens; at roughly 2 characters per token (undershooting) that is
    // overridden to 1,000 characters per pass.
    embeddingMaxChunkLength: 1_000,
    chunkPrefix: "passage: ",
    queryPrefix: "query: ",
    apiInfo: {
      id: "MintplexLabs/multilingual-e5-small",
      name: "multilingual-e5-small",
      description:
        "A larger multilingual embedding model that supports 100+ languages. Requires more processing power and memory.",
      lang: "100+ languages",
      size: "487MB",
      modelCard: "https://huggingface.co/intfloat/multilingual-e5-small",
    },
  },
};
module.exports = {
|
|
SUPPORTED_NATIVE_EMBEDDING_MODELS,
|
|
};
|