merlyn/server/utils/EmbeddingEngines/native/constants.js
Timothy Carambat 2c19dd09ed
Native Embedder model selection (incl: Multilingual support) (#3835)
* WIP on embedder selection
TODO: apply splitting and query prefixes (if applicable)

* wip on upsert

* Support base model
support nomic-text-embed-v1
support multilingual-e5-small
Add prefixing for both embedding and query for RAG tasks
Add chunking prefix to all vector dbs to apply prefix when possible
Show dropdown and auto-pull on new selection

* norm translations

* move supported models to constants
handle null selection or invalid selection on dropdown
update comments

* dev

* patch text splitter maximums for now

* normalize translations

* add tests for splitter functionality

* normalize

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
2025-07-22 10:07:20 -07:00

64 lines
2.5 KiB
JavaScript

/**
 * Registry of embedding models supported by the native (in-process) embedder,
 * keyed by their HuggingFace model id.
 *
 * Per-model fields:
 * - maxConcurrentChunks: presumably the number of text chunks embedded in
 *   parallel per batch — confirm against the native embedder implementation.
 * - embeddingMaxChunkLength: max input size per pass, measured in CHARACTERS,
 *   not tokens (see the inline notes below on each entry).
 * - chunkPrefix / queryPrefix: strings prepended to document chunks and to
 *   search queries respectively; some models (nomic, e5) require these
 *   task prefixes for RAG-quality embeddings. Empty string = no prefix.
 * - apiInfo: display metadata (id, human-readable name, description, language
 *   coverage, download size, model card URL) for the model-selection UI.
 */
const SUPPORTED_NATIVE_EMBEDDING_MODELS = {
  "Xenova/all-MiniLM-L6-v2": {
    maxConcurrentChunks: 25,
    // Right now, this is NOT the token length, and is instead the number of characters
    // that can be processed in a single pass. So we override to 1,000 characters.
    // roughly the max number of tokens assuming 2 characters per token. (undershooting)
    // embeddingMaxChunkLength: 512, (from the model card)
    embeddingMaxChunkLength: 1_000,
    chunkPrefix: "",
    queryPrefix: "",
    apiInfo: {
      id: "Xenova/all-MiniLM-L6-v2",
      name: "all-MiniLM-L6-v2",
      description:
        "A lightweight and fast model for embedding text. The default model for AnythingLLM.",
      lang: "English",
      size: "23MB",
      modelCard: "https://huggingface.co/Xenova/all-MiniLM-L6-v2",
    },
  },
  "Xenova/nomic-embed-text-v1": {
    maxConcurrentChunks: 5,
    // Right now, this is NOT the token length, and is instead the number of characters
    // that can be processed in a single pass. So we override to 16,000 characters.
    // roughly the max number of tokens assuming 2 characters per token. (undershooting)
    // embeddingMaxChunkLength: 8192, (from the model card)
    embeddingMaxChunkLength: 16_000,
    // nomic-embed-text requires task-instruction prefixes on inputs.
    chunkPrefix: "search_document: ",
    queryPrefix: "search_query: ",
    apiInfo: {
      id: "Xenova/nomic-embed-text-v1",
      name: "nomic-embed-text-v1",
      description:
        "A high-performing open embedding model with a large token context window. Requires more processing power and memory.",
      lang: "English",
      size: "139MB",
      modelCard: "https://huggingface.co/Xenova/nomic-embed-text-v1",
    },
  },
  "MintplexLabs/multilingual-e5-small": {
    maxConcurrentChunks: 5,
    // Right now, this is NOT the token length, and is instead the number of characters
    // that can be processed in a single pass. So we override to 1,000 characters.
    // roughly the max number of tokens assuming 2 characters per token. (undershooting)
    // embeddingMaxChunkLength: 512, (from the model card)
    embeddingMaxChunkLength: 1_000,
    // e5 models require "passage: "/"query: " prefixes on inputs.
    chunkPrefix: "passage: ",
    queryPrefix: "query: ",
    apiInfo: {
      id: "MintplexLabs/multilingual-e5-small",
      name: "multilingual-e5-small",
      description:
        "A larger multilingual embedding model that supports 100+ languages. Requires more processing power and memory.",
      lang: "100+ languages",
      size: "487MB",
      // NOTE: model id is a MintplexLabs mirror, but the card points at the
      // upstream intfloat repo — presumably intentional; verify if updating.
      modelCard: "https://huggingface.co/intfloat/multilingual-e5-small",
    },
  },
};
module.exports = {
  SUPPORTED_NATIVE_EMBEDDING_MODELS,
};