Native Embedder model selection (incl: Multilingual support) (#3835)

* WIP on embedder selection
TODO: apply splitting and query prefixes (if applicable)

* wip on upsert

* Support base model
support nomic-embed-text-v1
support multilingual-e5-small
Add prefixing for both embedding and query for RAG tasks
Add chunking prefix to all vector dbs to apply prefix when possible
Show dropdown and auto-pull on new selection

* norm translations

* move supported models to constants
handle null selection or invalid selection on dropdown
update comments

* dev

* patch text splitter maximums for now

* normalize translations

* add tests for splitter functionality

* normalize

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
This commit is contained in:
Timothy Carambat 2025-07-22 10:07:20 -07:00 committed by GitHub
parent 31a8ead823
commit 2c19dd09ed
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
44 changed files with 463 additions and 80 deletions

View File

@ -6,7 +6,7 @@ concurrency:
on:
push:
branches: ['4034-version-control'] # put your current branch to create a build. Core team only.
branches: ['multilingual-native-embedder-selection'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'

View File

@ -140,6 +140,10 @@ GID='1000'
###########################################
######## Embedding API SELECTION ##########
###########################################
# This will be the assumed default embedding selection and model
# EMBEDDING_ENGINE='native'
# EMBEDDING_MODEL_PREF='Xenova/all-MiniLM-L6-v2'
# Only used if you are using an LLM that does not natively support embedding (openai or Azure)
# EMBEDDING_ENGINE='openai'
# OPEN_AI_KEY=sk-xxxx

View File

@ -1,12 +1,100 @@
import { useTranslation } from "react-i18next";
import { useEffect, useState } from "react";
import { Link } from "react-router-dom";
import System from "@/models/system";
export default function NativeEmbeddingOptions({ settings }) {
const [loading, setLoading] = useState(true);
const [availableModels, setAvailableModels] = useState([]);
const [selectedModel, setSelectedModel] = useState(
settings?.EmbeddingModelPref
);
const [selectedModelInfo, setSelectedModelInfo] = useState();
useEffect(() => {
System.customModels("native-embedder")
.then(({ models }) => {
if (models?.length > 0) {
setAvailableModels(models);
const _selectedModel =
models.find((model) => model.id === settings?.EmbeddingModelPref) ??
models[0];
setSelectedModel(_selectedModel.id);
setSelectedModelInfo(_selectedModel);
}
})
.finally(() => {
setLoading(false);
});
}, []);
useEffect(() => {
if (!availableModels?.length || !selectedModel) return;
setSelectedModelInfo(
availableModels.find((model) => model.id === selectedModel)
);
}, [selectedModel, availableModels]);
export default function NativeEmbeddingOptions() {
const { t } = useTranslation();
return (
<div className="w-full h-10 items-center flex">
<p className="text-sm font-base text-white text-opacity-60">
{t("embedding.provider.description")}
</p>
<div className="w-full flex flex-col gap-y-4">
<div className="w-full flex flex-col mt-1.5">
<div className="flex flex-col w-96">
<label className="text-white text-sm font-semibold block mb-3">
Model Preference
</label>
<select
name="EmbeddingModelPref"
required={true}
defaultValue={selectedModel}
className="border-none bg-theme-settings-input-bg border-gray-500 text-theme-text-primary text-sm rounded-lg block w-60 p-2.5"
onChange={(e) => setSelectedModel(e.target.value)}
>
{loading ? (
<option
value="--loading-available-models--"
disabled={true}
selected={true}
>
--loading available models--
</option>
) : (
<optgroup label="Available embedding models">
{availableModels.map((model) => {
return (
<option
key={model.id}
value={model.id}
selected={selectedModel === model.id}
>
{model.name}
</option>
);
})}
</optgroup>
)}
</select>
</div>
{selectedModelInfo && (
<div className="flex flex-col gap-y-2 mt-2">
<p className="text-theme-text-secondary text-xs font-normal block">
{selectedModelInfo?.description}
</p>
<p className="text-theme-text-secondary text-xs font-normal block">
Trained on: {selectedModelInfo?.lang}
</p>
<p className="text-theme-text-secondary text-xs font-normal block">
Download Size: {selectedModelInfo?.size}
</p>
<Link
to={selectedModelInfo?.modelCard}
target="_blank"
rel="noopener noreferrer"
className="text-theme-text-secondary text-xs font-normal block underline hover:text-theme-text-primary"
>
View model card on Hugging Face &rarr;
</Link>
</div>
)}
</div>
</div>
);
}

View File

@ -122,7 +122,7 @@ const Citation = memo(({ source, onClick, textSizeClass }) => {
});
/**
 * Strips the <document_metadata>…</document_metadata> header block from a
 * chunk of citation text, returning only the body content.
 * Fix: the old `startsWith` guard was left in place above the corrected
 * `includes` guard; when the metadata block was prefixed (e.g. by a chunk
 * prefix) the stale guard returned early and the header was never stripped.
 * @param {string} text - Raw chunk text, possibly with a metadata header.
 * @returns {string} The chunk body without the metadata header.
 */
function omitChunkHeader(text) {
  if (!text.includes("<document_metadata>")) return text;
  return text.split("</document_metadata>")[1].trim();
}

View File

@ -406,8 +406,6 @@ const TRANSLATIONS = {
"التضمين هو عملية تحويل النص إلى متجهات. هذه البيانات مطلوبة لتحويل ملفاتك ومطالباتك إلى تنسيق يمكن لـ إني ثينك إلْلْمْ استخدامه للمعالجة.",
provider: {
title: "موفر التضمين",
description:
"لا يلزم إجراء أي إعداد عند استخدام محرك التضمين الأصلي الخاص بـ إني ثينك إلْلْمْ.",
},
},
text: {

View File

@ -408,8 +408,6 @@ const TRANSLATIONS = {
"Indlejring er processen med at omdanne tekst til vektorer. Disse legitimationsoplysninger er nødvendige for at omdanne dine filer og prompts til et format, som AnythingLLM kan bruge til behandling.",
provider: {
title: "Indlejringsudbyder",
description:
"Ingen opsætning er nødvendig, når du bruger AnythingLLM's indbyggede indlejringsmotor.",
},
},
text: {

View File

@ -597,8 +597,6 @@ const TRANSLATIONS = {
"Einbettung ist der Prozess, Text in Vektoren umzuwandeln. Diese Anmeldeinformationen sind erforderlich, um Ihre Dateien und Prompts in ein Format umzuwandeln, das AnythingLLM zur Verarbeitung verwenden kann.",
provider: {
title: "Einbettungsanbieter",
description:
"Bei Verwendung der nativen Einbettungs-Engine von AnythingLLM ist keine Einrichtung erforderlich.",
},
},
text: {

View File

@ -620,8 +620,6 @@ const TRANSLATIONS = {
"Embedding is the process of turning text into vectors. These credentials are required to turn your files and prompts into a format which AnythingLLM can use to process.",
provider: {
title: "Embedding Provider",
description:
"There is no set up required when using AnythingLLM's native embedding engine.",
},
},

View File

@ -405,8 +405,6 @@ const TRANSLATIONS = {
"La incrustación es el proceso de convertir texto en vectores. Estas credenciales son necesarias para convertir tus archivos y prompts en un formato que AnythingLLM pueda usar para procesar.",
provider: {
title: "Proveedor de incrustación",
description:
"No se requiere configuración cuando se utiliza el motor de incrustación nativo de AnythingLLM.",
},
},
text: {

View File

@ -573,8 +573,6 @@ const TRANSLATIONS = {
"Embedding muudab teksti vektoriteks. Need võtmed on vajalikud, et AnythingLLM saaks sinu failid ja päringud töödelda.",
provider: {
title: "Embedding-i pakkuja",
description:
"AnythingLLM-i sisseehitatud embedding-mootor ei vaja seadistust.",
},
},
text: {

View File

@ -398,8 +398,6 @@ const TRANSLATIONS = {
"جاسازی فرآیند تبدیل متن به بردارها است. این اعتبارنامه‌ها برای تبدیل فایل‌ها و درخواست‌های شما به فرمتی که AnythingLLM بتواند پردازش کند، ضروری هستند.",
provider: {
title: "ارائه‌دهنده جاسازی",
description:
"هنگام استفاده از موتور جاسازی داخلی AnythingLLM نیازی به تنظیمات نیست.",
},
},
text: {

View File

@ -406,8 +406,6 @@ const TRANSLATIONS = {
"L'intégration est le processus de transformation du texte en vecteurs. Ces identifiants sont nécessaires pour transformer vos fichiers et invites en un format que AnythingLLM peut utiliser pour traiter.",
provider: {
title: "Fournisseur d'intégration",
description:
"Aucune configuration n'est nécessaire lors de l'utilisation du moteur d'intégration natif de AnythingLLM.",
},
},
text: {

View File

@ -394,8 +394,6 @@ const TRANSLATIONS = {
"הטבעה היא תהליך הפיכת טקסט לווקטורים. אישורי הרשאה אלה נדרשים כדי להפוך את הקבצים והבקשות שלך לפורמט ש-AnythingLLM יכול להשתמש בו לעיבוד.",
provider: {
title: "ספק הטבעה",
description:
"אין צורך בהגדרה בעת שימוש במנוע ההטבעה המקורי של AnythingLLM.",
},
},
text: {

View File

@ -404,8 +404,6 @@ const TRANSLATIONS = {
"L'embedding è il processo di trasformazione del testo in vettori. Queste credenziali sono necessarie per trasformare i file e i prompt in un formato che AnythingLLM può utilizzare per l'elaborazione.",
provider: {
title: "Provider di embedding",
description:
"Non è richiesta alcuna configurazione quando si utilizza il motore di embedding nativo di AnythingLLM.",
},
},
text: {

View File

@ -406,8 +406,6 @@ const TRANSLATIONS = {
"埋め込みとは、テキストをベクトルに変換するプロセスです。これらの認証情報は、ファイルやプロンプトをAnythingLLMが処理できるフォーマットに変換するために必要です。",
provider: {
title: "埋め込みプロバイダー",
description:
"AnythingLLMのネイティブ埋め込みエンジンを使用する場合、特に設定は必要ありません。",
},
},
text: {

View File

@ -581,8 +581,6 @@ const TRANSLATIONS = {
"임베딩은 텍스트를 벡터로 변환하는 과정입니다. 파일과 프롬프트를 AnythingLLM이 처리할 수 있는 형식으로 변환하려면 이러한 인증이 필요합니다.",
provider: {
title: "임베딩 제공자",
description:
"AnythingLLM의 기본 임베딩 엔진을 사용할 때는 설정이 필요하지 않습니다.",
},
},
text: {

View File

@ -592,8 +592,6 @@ const TRANSLATIONS = {
"Iegulšana ir process, ar kuru teksts tiek pārveidots vektoros. Šie akreditācijas dati ir nepieciešami, lai pārveidotu jūsu failus un vaicājumus formātā, kuru AnythingLLM var izmantot apstrādei.",
provider: {
title: "Iegulšanas pakalpojuma sniedzējs",
description:
"Nav nepieciešama iestatīšana, izmantojot AnythingLLM iebūvēto iegulšanas dzinēju.",
},
},
text: {

View File

@ -401,8 +401,6 @@ const TRANSLATIONS = {
"Inbedding is het proces van het omzetten van tekst in vectoren. Deze inloggegevens zijn vereist om je bestanden en prompts om te zetten naar een formaat dat AnythingLLM kan gebruiken om te verwerken.",
provider: {
title: "Inbedding Provider",
description:
"Er is geen instelling vereist bij gebruik van de ingebouwde inbeddingengine van AnythingLLM.",
},
},
text: {

View File

@ -94,6 +94,11 @@ function normalizeTranslations(lang, source, target, subdir = null) {
);
}
// If a non-en file has a key that is NOT in the en file, it will be removed
for (const key of Object.keys(normalized)) {
if (!source[key]) delete normalized[key];
}
return normalized;
}

View File

@ -597,8 +597,6 @@ const TRANSLATIONS = {
"Embedding to proces przekształcania tekstu na wektory. Poświadczenia są wymagane do przekształcenia plików i tekstu za pomocą wybranego modelu.",
provider: {
title: "Model używany do tworzenia embeddingów",
description:
"Podczas korzystania z natywnego silnika osadzania AnythingLLM nie jest wymagana żadna konfiguracja.",
},
},
text: {

View File

@ -579,8 +579,6 @@ const TRANSLATIONS = {
"Vínculo é o processo de transformar texto em vetores. Essas credenciais são necessárias para processar arquivos e prompts.",
provider: {
title: "Provedor de Vínculo",
description:
"Nenhuma configuração é necessária ao usar o mecanismo nativo do AnythingLLM.",
},
},
text: {

View File

@ -410,8 +410,6 @@ const TRANSLATIONS = {
"Встраивание - это процесс превращения текста в векторы. Эти учетные данные необходимы для превращения ваших файлов и подсказок в формат, который AnythingLLM может использовать для обработки.",
provider: {
title: "Поставщик встраивания",
description:
"Нет необходимости в настройке при использовании встроенного механизма встраивания AnythingLLM.",
},
},
text: {

View File

@ -401,8 +401,6 @@ const TRANSLATIONS = {
"Gömme, metni vektörlere dönüştürme sürecidir. Dosyalarınızın ve komutlarınızın işlenebilmesi için AnythingLLM, bu kimlik bilgilerine ihtiyaç duyar.",
provider: {
title: "Embedding Sağlayıcısı",
description:
"AnythingLLM'nin yerel gömme motoru kullanıldığında ek bir kurulum gerekmez.",
},
},
text: {

View File

@ -400,8 +400,6 @@ const TRANSLATIONS = {
"Embedding is the process of turning text into vectors. These credentials are required to turn your files and prompts into a format which AnythingLLM can use to process.",
provider: {
title: "Embedding Provider",
description:
"There is no set up required when using AnythingLLM's native embedding engine.",
},
},
text: {

View File

@ -557,7 +557,6 @@ const TRANSLATIONS = {
"嵌入是将文本转换为矢量的过程。需要这些凭据才能将你的文件和提示转换为 AnythingLLM 可以用来处理的格式。",
provider: {
title: "嵌入引擎提供商",
description: "使用 AnythingLLM 的本机嵌入引擎时不需要设置。",
},
},
text: {

View File

@ -389,7 +389,6 @@ const TRANSLATIONS = {
"嵌入是將文字轉換成向量的過程。這些憑證是用於將您的檔案和提示詞轉換成 AnythingLLM 可以處理的格式。",
provider: {
title: "向量嵌入提供者",
description: "使用 AnythingLLM 的原生嵌入引擎時,不需要任何設定。",
},
},
text: {

View File

@ -138,6 +138,10 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
###########################################
######## Embedding API SELECTION ##########
###########################################
# This will be the assumed default embedding selection and model
# EMBEDDING_ENGINE='native'
# EMBEDDING_MODEL_PREF='Xenova/all-MiniLM-L6-v2'
# Only used if you are using an LLM that does not natively support embedding (openai or Azure)
# EMBEDDING_ENGINE='openai'
# OPEN_AI_KEY=sk-xxxx

View File

@ -0,0 +1,104 @@
// Unit tests for the TextSplitter chunking utility: chunk sizing, overlap
// validation, header metadata construction, and chunkPrefix application.
const { TextSplitter } = require("../../../utils/TextSplitter");
// NOTE(review): lodash appears unused in this file - confirm before removing.
const _ = require("lodash");
describe("TextSplitter", () => {
  // Basic chunking: repeated text should be split into fixed-size chunks.
  test("should split long text into n sized chunks", async () => {
    const text = "This is a test text to be split into chunks".repeat(2);
    const textSplitter = new TextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
    });
    const chunks = await textSplitter.splitText(text);
    expect(chunks.length).toEqual(5);
  });
  // When chunkOverlap is omitted/invalid the splitter falls back to its
  // default overlap of 20 characters.
  test("applies chunk overlap of 20 characters on invalid chunkOverlap", async () => {
    const text = "This is a test text to be split into chunks".repeat(2);
    const textSplitter = new TextSplitter({
      chunkSize: 30,
    });
    const chunks = await textSplitter.splitText(text);
    expect(chunks.length).toEqual(6);
  });
  // Constructor must reject configs where overlap >= chunk size.
  test("does not allow chunkOverlap to be greater than chunkSize", async () => {
    expect(() => {
      new TextSplitter({
        chunkSize: 20,
        chunkOverlap: 21,
      });
    }).toThrow();
  });
  // buildHeaderMeta should project only the whitelisted document fields.
  test("applies specific metadata to stringifyHeader to each chunk", async () => {
    const metadata = {
      id: "123e4567-e89b-12d3-a456-426614174000",
      url: "https://example.com",
      title: "Example",
      docAuthor: "John Doe",
      published: "2021-01-01",
      chunkSource: "link://https://example.com",
      description: "This is a test text to be split into chunks",
    };
    const chunkHeaderMeta = TextSplitter.buildHeaderMeta(metadata);
    expect(chunkHeaderMeta).toEqual({
      sourceDocument: metadata.title,
      source: metadata.url,
      published: metadata.published,
    });
  });
  // chunkPrefix handling: present, swapped, undefined, and empty-string
  // prefixes, plus prefix combined with a metadata header.
  test("applies a valid chunkPrefix to each chunk", async () => {
    const text = "This is a test text to be split into chunks".repeat(2);
    let textSplitter = new TextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
      chunkPrefix: "testing: ",
    });
    let chunks = await textSplitter.splitText(text);
    expect(chunks.length).toEqual(5);
    expect(chunks.every(chunk => chunk.startsWith("testing: "))).toBe(true);
    textSplitter = new TextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
      chunkPrefix: "testing2: ",
    });
    chunks = await textSplitter.splitText(text);
    expect(chunks.length).toEqual(5);
    expect(chunks.every(chunk => chunk.startsWith("testing2: "))).toBe(true);
    textSplitter = new TextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
      chunkPrefix: undefined,
    });
    chunks = await textSplitter.splitText(text);
    expect(chunks.length).toEqual(5);
    expect(chunks.every(chunk => !chunk.startsWith(": "))).toBe(true);
    textSplitter = new TextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
      chunkPrefix: "",
    });
    chunks = await textSplitter.splitText(text);
    expect(chunks.length).toEqual(5);
    expect(chunks.every(chunk => !chunk.startsWith(": "))).toBe(true);
    // Applied chunkPrefix with chunkHeaderMeta
    textSplitter = new TextSplitter({
      chunkSize: 20,
      chunkOverlap: 0,
      chunkHeaderMeta: TextSplitter.buildHeaderMeta({
        title: "Example",
        url: "https://example.com",
        published: "2021-01-01",
      }),
      chunkPrefix: "testing3: ",
    });
    chunks = await textSplitter.splitText(text);
    expect(chunks.length).toEqual(5);
    expect(chunks.every(chunk => chunk.startsWith("testing3: <document_metadata>"))).toBe(true);
  });
});

View File

@ -8,6 +8,7 @@ const prisma = require("../utils/prisma");
const { v4 } = require("uuid");
const { MetaGenerator } = require("../utils/boot/MetaGenerator");
const { PGVector } = require("../utils/vectorDbProviders/pgvector");
const { NativeEmbedder } = require("../utils/EmbeddingEngines/native");
const { getBaseLLMProviderModel } = require("../utils/helpers");
function isNullOrNaN(value) {
@ -194,6 +195,7 @@ const SystemSettings = {
const { hasVectorCachedFiles } = require("../utils/files");
const llmProvider = process.env.LLM_PROVIDER;
const vectorDB = process.env.VECTOR_DB;
const embeddingEngine = process.env.EMBEDDING_ENGINE ?? "native";
return {
// --------------------------------------------------------
// General Settings
@ -208,11 +210,14 @@ const SystemSettings = {
// --------------------------------------------------------
// Embedder Provider Selection Settings & Configs
// --------------------------------------------------------
EmbeddingEngine: process.env.EMBEDDING_ENGINE,
EmbeddingEngine: embeddingEngine,
HasExistingEmbeddings: await this.hasEmbeddings(), // check if they have any currently embedded documents active in workspaces.
HasCachedEmbeddings: hasVectorCachedFiles(), // check if they any currently cached embedded docs.
EmbeddingBasePath: process.env.EMBEDDING_BASE_PATH,
EmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
EmbeddingModelPref:
embeddingEngine === "native"
? NativeEmbedder._getEmbeddingModel()
: process.env.EMBEDDING_MODEL_PREF,
EmbeddingModelMaxChunkLength:
process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
VoyageAiApiKey: !!process.env.VOYAGEAI_API_KEY,

View File

@ -10,3 +10,4 @@ togetherAi
tesseract
ppio
context-windows/*
MintplexLabs

View File

@ -0,0 +1,63 @@
/**
 * Registry of embedding models supported by the native (on-device) embedder.
 * Keys are HuggingFace model identifiers. Each entry carries:
 * - maxConcurrentChunks: how many strings are embedded per pass.
 * - embeddingMaxChunkLength: max input length in CHARACTERS (not tokens).
 * - chunkPrefix / queryPrefix: prefixes some models require for RAG tasks.
 * - apiInfo: metadata surfaced to the frontend model dropdown.
 */
const SUPPORTED_NATIVE_EMBEDDING_MODELS = {
  "Xenova/all-MiniLM-L6-v2": {
    maxConcurrentChunks: 25,
    // Right now, this is NOT the token length, and is instead the number of characters
    // that can be processed in a single pass. So we override to 1,000 characters.
    // roughly the max number of tokens assuming 2 characters per token. (undershooting)
    // embeddingMaxChunkLength: 512, (from the model card)
    embeddingMaxChunkLength: 1_000,
    chunkPrefix: "",
    queryPrefix: "",
    apiInfo: {
      id: "Xenova/all-MiniLM-L6-v2",
      name: "all-MiniLM-L6-v2",
      description:
        "A lightweight and fast model for embedding text. The default model for AnythingLLM.",
      lang: "English",
      size: "23MB",
      modelCard: "https://huggingface.co/Xenova/all-MiniLM-L6-v2",
    },
  },
  "Xenova/nomic-embed-text-v1": {
    maxConcurrentChunks: 5,
    // Right now, this is NOT the token length, and is instead the number of characters
    // that can be processed in a single pass. So we override to 16,000 characters.
    // roughly the max number of tokens assuming 2 characters per token. (undershooting)
    // embeddingMaxChunkLength: 8192, (from the model card)
    embeddingMaxChunkLength: 16_000,
    chunkPrefix: "search_document: ",
    queryPrefix: "search_query: ",
    apiInfo: {
      id: "Xenova/nomic-embed-text-v1",
      name: "nomic-embed-text-v1",
      description:
        "A high-performing open embedding model with a large token context window. Requires more processing power and memory.",
      lang: "English",
      size: "139MB",
      modelCard: "https://huggingface.co/Xenova/nomic-embed-text-v1",
    },
  },
  "MintplexLabs/multilingual-e5-small": {
    maxConcurrentChunks: 5,
    // Right now, this is NOT the token length, and is instead the number of characters
    // that can be processed in a single pass. So we override to 1,000 characters.
    // roughly the max number of tokens assuming 2 characters per token. (undershooting)
    // embeddingMaxChunkLength: 512, (from the model card)
    embeddingMaxChunkLength: 1_000,
    chunkPrefix: "passage: ",
    queryPrefix: "query: ",
    apiInfo: {
      id: "MintplexLabs/multilingual-e5-small",
      name: "multilingual-e5-small",
      description:
        "A larger multilingual embedding model that supports 100+ languages. Requires more processing power and memory.",
      lang: "100+ languages",
      size: "487MB",
      modelCard: "https://huggingface.co/intfloat/multilingual-e5-small",
    },
  },
};
module.exports = {
  SUPPORTED_NATIVE_EMBEDDING_MODELS,
};

View File

@ -2,37 +2,114 @@ const path = require("path");
const fs = require("fs");
const { toChunks } = require("../../helpers");
const { v4 } = require("uuid");
const { SUPPORTED_NATIVE_EMBEDDING_MODELS } = require("./constants");
class NativeEmbedder {
static defaultModel = "Xenova/all-MiniLM-L6-v2";
/**
* Supported embedding models for native.
* @type {Record<string, {
* chunkPrefix: string;
* queryPrefix: string;
* apiInfo: {
* id: string;
* name: string;
* description: string;
* lang: string;
* size: string;
* modelCard: string;
* };
* }>}
*/
static supportedModels = SUPPORTED_NATIVE_EMBEDDING_MODELS;
// This is a folder that Mintplex Labs hosts for those who cannot capture the HF model download
// endpoint for various reasons. This endpoint is not guaranteed to be active or maintained
// and may go offline at any time at Mintplex Labs's discretion.
#fallbackHost = "https://cdn.anythingllm.com/support/models/";
constructor() {
// Model Card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
this.model = "Xenova/all-MiniLM-L6-v2";
this.model = this.getEmbeddingModel();
this.modelInfo = this.getEmbedderInfo();
this.cacheDir = path.resolve(
process.env.STORAGE_DIR
? path.resolve(process.env.STORAGE_DIR, `models`)
: path.resolve(__dirname, `../../../storage/models`)
);
this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2");
this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/"));
this.modelDownloaded = fs.existsSync(this.modelPath);
// Limit of how many strings we can process in a single pass to stay with resource or network limits
this.maxConcurrentChunks = 25;
this.embeddingMaxChunkLength = 1_000;
this.maxConcurrentChunks = this.modelInfo.maxConcurrentChunks;
this.embeddingMaxChunkLength = this.modelInfo.embeddingMaxChunkLength;
// Make directory when it does not exist in existing installations
if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir);
this.log("Initialized");
this.log(`Initialized ${this.model}`);
}
/**
 * Namespaced console logger for this embedder (cyan "[NativeEmbedder]" tag).
 * @param {string} text - Message to print.
 * @param {...any} args - Extra values forwarded to console.log.
 */
log(text, ...args) {
  console.log(`\x1b[36m[NativeEmbedder]\x1b[0m ${text}`, ...args);
}
/**
* Get the selected model from the environment variable.
* @returns {string}
*/
static _getEmbeddingModel() {
const envModel =
process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
return NativeEmbedder.defaultModel;
}
get embeddingPrefix() {
return NativeEmbedder.supportedModels[this.model]?.chunkPrefix || "";
}
get queryPrefix() {
return NativeEmbedder.supportedModels[this.model]?.queryPrefix || "";
}
/**
* Get the available models in an API response format
* we can use to populate the frontend dropdown.
* @returns {{id: string, name: string, description: string, lang: string, size: string, modelCard: string}[]}
*/
static availableModels() {
return Object.values(NativeEmbedder.supportedModels).map(
(model) => model.apiInfo
);
}
/**
* Get the embedding model to use.
* We only support a few models and will default to the default model if the environment variable is not set or not supported.
*
* Why only a few? Because we need to mirror them on the CDN so non-US users can download them.
* eg: "Xenova/all-MiniLM-L6-v2"
* eg: "Xenova/nomic-embed-text-v1"
* @returns {string}
*/
getEmbeddingModel() {
const envModel =
process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
return NativeEmbedder.defaultModel;
}
/**
* Get the embedding model info.
*
* Will always fallback to the default model if the model is not supported.
* @returns {Object}
*/
getEmbedderInfo() {
const model = this.getEmbeddingModel();
return NativeEmbedder.supportedModels[model];
}
#tempfilePath() {
const filename = `${v4()}.tmp`;
const tmpPath = process.env.STORAGE_DIR
@ -124,7 +201,27 @@ class NativeEmbedder {
throw fetchResponse.error;
}
/**
* Apply the query prefix to the text input if it is required by the model.
* eg: nomic-embed-text-v1 requires a query prefix for embedding/searching.
* @param {string|string[]} textInput - The text to embed.
* @returns {string|string[]} The text with the prefix applied.
*/
#applyQueryPrefix(textInput) {
if (!this.queryPrefix) return textInput;
if (Array.isArray(textInput))
textInput = textInput.map((text) => `${this.queryPrefix}${text}`);
else textInput = `${this.queryPrefix}${textInput}`;
return textInput;
}
/**
* Embed a single text input.
* @param {string|string[]} textInput - The text to embed.
* @returns {Promise<Array<number>>} The embedded text.
*/
async embedTextInput(textInput) {
textInput = this.#applyQueryPrefix(textInput);
const result = await this.embedChunks(
Array.isArray(textInput) ? textInput : [textInput]
);

View File

@ -20,22 +20,16 @@ function isNullOrNaN(value) {
class TextSplitter {
#splitter;
/**
* Creates a new TextSplitter instance.
* @param {Object} config
* @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
* @param {number} [config.chunkSize = 1000] - The size of each chunk.
* @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
* @param {Object} [config.chunkHeaderMeta = null] - Metadata to be added to the start of each chunk - will come after the prefix.
*/
constructor(config = {}) {
  // `config` may carry additional keys depending on which underlying
  // splitter is selected. Non-splitter related keys:
  //   splitByFilename: string // TODO
  // Default splitter ("RecursiveCharacterTextSplitter") config:
  //   chunkSize: number
  //   chunkOverlap: number
  //   chunkHeaderMeta: object | null // appended to top of each chunk as metadata
  //   chunkPrefix: string // embedder-required prefix prepended to each chunk
  this.config = config;
  this.#splitter = this.#setSplitter(config);
}
@ -124,20 +118,41 @@ class TextSplitter {
}
/**
* Creates a string of metadata to be prepended to each chunk.
* Apply the chunk prefix to the text if it is present.
* @param {string} text - The text to apply the prefix to.
* @returns {string} The text with the embedder model prefix applied.
*/
#applyPrefix(text = "") {
if (!this.config.chunkPrefix) return text;
return `${this.config.chunkPrefix}${text}`;
}
/**
* Creates a string of metadata to be prepended to each chunk.
* Will additionally prepend a prefix to the text if it was provided (requirement for some embedders).
* @returns {string} The text with the embedder model prefix applied.
*/
stringifyHeader() {
if (!this.config.chunkHeaderMeta) return null;
let content = "";
if (!this.config.chunkHeaderMeta) return this.#applyPrefix(content);
Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => {
if (!key || !value) return;
content += `${key}: ${value}\n`;
});
if (!content) return null;
return `<document_metadata>\n${content}</document_metadata>\n\n`;
if (!content) return this.#applyPrefix(content);
return this.#applyPrefix(
`<document_metadata>\n${content}</document_metadata>\n\n`
);
}
/**
* Sets the splitter to use a defined config passes to other subclasses.
* @param {Object} config
* @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
* @param {number} [config.chunkSize = 1000] - The size of each chunk.
* @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
*/
#setSplitter(config = {}) {
// if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
return new RecursiveSplitter({
@ -160,7 +175,11 @@ class RecursiveSplitter {
const {
RecursiveCharacterTextSplitter,
} = require("@langchain/textsplitters");
this.log(`Will split with`, { chunkSize, chunkOverlap });
this.log(`Will split with`, {
chunkSize,
chunkOverlap,
chunkHeader: chunkHeader ? `${chunkHeader?.slice(0, 50)}...` : null,
});
this.chunkHeader = chunkHeader;
this.engine = new RecursiveCharacterTextSplitter({
chunkSize,

View File

@ -1,7 +1,6 @@
const { fetchOpenRouterModels } = require("../AiProviders/openRouter");
const { fetchApiPieModels } = require("../AiProviders/apipie");
const { perplexityModels } = require("../AiProviders/perplexity");
const { togetherAiModels } = require("../AiProviders/togetherAi");
const { fireworksAiModels } = require("../AiProviders/fireworksAi");
const { ElevenLabsTTS } = require("../TextToSpeech/elevenLabs");
const { fetchNovitaModels } = require("../AiProviders/novita");
@ -34,6 +33,8 @@ const SUPPORT_CUSTOM_MODELS = [
"ppio",
"dpais",
"moonshotai",
// Embedding Engines
"native-embedder",
];
async function getCustomModels(provider = "", apiKey = null, basePath = null) {
@ -87,6 +88,8 @@ async function getCustomModels(provider = "", apiKey = null, basePath = null) {
return await getDellProAiStudioModels(basePath);
case "moonshotai":
return await getMoonshotAiModels(apiKey);
case "native-embedder":
return await getNativeEmbedderModels();
default:
return { models: [], error: "Invalid provider for custom models" };
}
@ -678,6 +681,11 @@ async function getDellProAiStudioModels(basePath = null) {
}
}
// List the native embedder's supported models for the custom-models API.
// Never fails: the model set is a static in-process constant.
function getNativeEmbedderModels() {
  const { NativeEmbedder } = require("../EmbeddingEngines/native");
  const models = NativeEmbedder.availableModels();
  return { models, error: null };
}
async function getMoonshotAiModels(_apiKey = null) {
const apiKey =
_apiKey === true

View File

@ -288,7 +288,7 @@ const KEY_MAPPING = {
EmbeddingModelPref: {
envKey: "EMBEDDING_MODEL_PREF",
checks: [isNotEmpty],
postUpdate: [handleVectorStoreReset],
postUpdate: [handleVectorStoreReset, downloadEmbeddingModelIfRequired],
},
EmbeddingModelMaxChunkLength: {
envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH",
@ -927,6 +927,22 @@ async function handleVectorStoreReset(key, prevValue, nextValue) {
return false;
}
/**
* Downloads the embedding model in background if the user has selected a different model
* - Only supported for the native embedder
* - Must have the native embedder selected prior (otherwise will download on embed)
*/
async function downloadEmbeddingModelIfRequired(key, prevValue, nextValue) {
  if (prevValue === nextValue) return; // selection unchanged - nothing to do
  // Only relevant when the native embedder's model preference was updated.
  if (key !== "EmbeddingModelPref" || process.env.EMBEDDING_ENGINE !== "native")
    return;
  const { NativeEmbedder } = require("../EmbeddingEngines/native");
  if (!NativeEmbedder.supportedModels[nextValue]) return; // if the model is not supported, don't download it
  // Fire-and-forget: kick off the client in the background so the model is
  // fetched ahead of first embed. NOTE(review): promise intentionally not
  // awaited - confirm embedderClient surfaces its own errors.
  new NativeEmbedder().embedderClient();
  return false;
}
/**
* Validates the Postgres connection string for the PGVector options.
* @param {string} input - The Postgres connection string to validate.

View File

@ -206,6 +206,7 @@ const AstraDB = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -252,6 +252,7 @@ const Chroma = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -328,6 +328,7 @@ const LanceDb = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -204,6 +204,7 @@ const Milvus = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -536,6 +536,7 @@ const PGVector = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -150,6 +150,7 @@ const PineconeDB = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -222,6 +222,7 @@ const QDrant = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -263,6 +263,7 @@ const Weaviate = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);

View File

@ -197,6 +197,7 @@ const Zilliz = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);