Add batch embedding support for Ollama embedding provider (#4553)
* add batch embedding support for ollama embedding provider
* lint
* simplify ollama embedder input
This commit is contained in:
parent 6a72ac2240
commit 66e44f65b4
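At its core, the change swaps the per-chunk embeddings({ prompt }) call for a batched embed({ input }) call, which accepts an array of strings and returns one vector per string. A minimal standalone sketch of the pattern using the ollama JS client (the host, model name, and batch size below are illustrative, not values from this PR):

import { Ollama } from "ollama";

const client = new Ollama({ host: "http://127.0.0.1:11434" }); // illustrative host
const BATCH_SIZE = 4; // AnythingLLM reads this from OLLAMA_EMBEDDING_BATCH_SIZE

async function embedInBatches(textChunks, model = "nomic-embed-text") {
  const data = [];
  for (let i = 0; i < textChunks.length; i += BATCH_SIZE) {
    const batch = textChunks.slice(i, i + BATCH_SIZE);
    // embed() takes an array via `input` and returns { embeddings: number[][] },
    // unlike the older embeddings() call, which took a single `prompt` string
    // and returned { embedding: number[] }.
    const { embeddings } = await client.embed({ model, input: batch });
    data.push(...embeddings);
  }
  return data;
}

Because each batch is awaited before the next one starts, the flattened result keeps the same order as the input chunks.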
@@ -23,11 +23,18 @@ export default function OllamaEmbeddingOptions({ settings }) {
   const [maxChunkLength, setMaxChunkLength] = useState(
     settings?.EmbeddingModelMaxChunkLength || 8192
   );
+  const [batchSize, setBatchSize] = useState(
+    settings?.OllamaEmbeddingBatchSize || 1
+  );
 
   const handleMaxChunkLengthChange = (e) => {
     setMaxChunkLength(Number(e.target.value));
   };
 
+  const handleBatchSizeChange = (e) => {
+    setBatchSize(Number(e.target.value));
+  };
+
   return (
     <div className="w-full flex flex-col gap-y-7">
       <div className="w-full flex items-start gap-[36px] mt-1.5">
@@ -74,7 +81,7 @@ export default function OllamaEmbeddingOptions({ settings }) {
           }}
           className="border-none text-theme-text-primary hover:text-theme-text-secondary flex items-center text-sm"
         >
-          {showAdvancedControls ? "Hide" : "Show"} Manual Endpoint Input
+          {showAdvancedControls ? "Hide" : "Show"} Advanced Settings
           {showAdvancedControls ? (
             <CaretUp size={14} className="ml-1" />
           ) : (
@@ -121,6 +128,41 @@ export default function OllamaEmbeddingOptions({ settings }) {
               Enter the URL where Ollama is running.
             </p>
           </div>
+          <div className="flex flex-col w-60">
+            <div
+              data-tooltip-place="top"
+              data-tooltip-id="ollama-batch-size-tooltip"
+              className="flex gap-x-1 items-center mb-3"
+            >
+              <Info
+                size={16}
+                className="text-theme-text-secondary cursor-pointer"
+              />
+              <label className="text-white text-sm font-semibold block">
+                Embedding batch size
+              </label>
+              <Tooltip id="ollama-batch-size-tooltip">
+                Number of text chunks to embed in parallel. Higher values
+                improve speed but use more memory. Default is 1.
+              </Tooltip>
+            </div>
+            <input
+              type="number"
+              name="OllamaEmbeddingBatchSize"
+              className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
+              placeholder="1"
+              min={1}
+              value={batchSize}
+              onChange={handleBatchSizeChange}
+              onScroll={(e) => e.target.blur()}
+              required={true}
+              autoComplete="off"
+            />
+            <p className="text-xs leading-[18px] font-base text-white text-opacity-60 mt-2">
+              Increase this value to process multiple chunks simultaneously for
+              faster embedding.
+            </p>
+          </div>
         </div>
       </div>
     </div>
@@ -232,6 +232,7 @@ const SystemSettings = {
       : process.env.EMBEDDING_MODEL_PREF,
     EmbeddingModelMaxChunkLength:
       process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
+    OllamaEmbeddingBatchSize: process.env.OLLAMA_EMBEDDING_BATCH_SIZE || 1,
     VoyageAiApiKey: !!process.env.VOYAGEAI_API_KEY,
     GenericOpenAiEmbeddingApiKey:
       !!process.env.GENERIC_OPEN_AI_EMBEDDING_API_KEY,
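Unlike the API keys around it, which are masked to booleans with !!, the batch size is surfaced to the frontend as its raw value with a default of 1, which is what the component reads via settings?.OllamaEmbeddingBatchSize || 1 above. An illustrative slice of the settings payload (the values here are made up):

// Hypothetical shape of what the frontend receives for these keys.
const settings = {
  EmbeddingModelMaxChunkLength: "8192", // raw env value
  OllamaEmbeddingBatchSize: 4,          // raw value, falls back to 1 when unset
  VoyageAiApiKey: true,                 // masked to a boolean with !!
};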
@@ -11,12 +11,13 @@ class OllamaEmbedder {
     this.className = "OllamaEmbedder";
     this.basePath = process.env.EMBEDDING_BASE_PATH;
     this.model = process.env.EMBEDDING_MODEL_PREF;
-    // Limit of how many strings we can process in a single pass to stay with resource or network limits
-    this.maxConcurrentChunks = 1;
+    this.maxConcurrentChunks = process.env.OLLAMA_EMBEDDING_BATCH_SIZE
+      ? Number(process.env.OLLAMA_EMBEDDING_BATCH_SIZE)
+      : 1;
     this.embeddingMaxChunkLength = maximumChunkLength();
     this.client = new Ollama({ host: this.basePath });
     this.log(
-      `initialized with model ${this.model} at ${this.basePath}. num_ctx: ${this.embeddingMaxChunkLength}`
+      `initialized with model ${this.model} at ${this.basePath}. Batch size: ${this.maxConcurrentChunks}, num_ctx: ${this.embeddingMaxChunkLength}`
     );
   }
 
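The batch size is read once in the constructor, so it behaves like the other embedder settings: it can be set from the new "Embedding batch size" field in the UI or directly via the environment, and an unset or empty OLLAMA_EMBEDDING_BATCH_SIZE keeps the previous one-chunk-per-request behaviour. A quick sketch of how that ternary resolves for a few sample values (not code from the PR, just the same expression pulled out for illustration):

// Same fallback expression as the constructor, isolated for illustration.
const parseBatchSize = (raw) => (raw ? Number(raw) : 1);

parseBatchSize(undefined); // 1 -> variable unset, keep single-chunk batches
parseBatchSize("");        // 1 -> empty string is falsy, falls back to 1
parseBatchSize("8");       // 8 -> eight chunks per embed() call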
@@ -46,14 +47,14 @@ class OllamaEmbedder {
 
   /**
    * This function takes an array of text chunks and embeds them using the Ollama API.
-   * chunks are processed sequentially to avoid overwhelming the API with too many requests
-   * or running out of resources on the endpoint running the ollama instance.
+   * Chunks are processed in batches based on the maxConcurrentChunks setting to balance
+   * resource usage on the Ollama endpoint.
    *
    * We will use the num_ctx option to set the maximum context window to the max chunk length defined by the user in the settings
    * so that the maximum context window is used and content is not truncated.
    *
    * We also assume the default keep alive option. This could cause issues with models being unloaded and reloaded
-   * on load memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
+   * on low memory machines, but that is simply a user-end issue we cannot control. If the LLM and embedder are
    * constantly being loaded and unloaded, the user should use another LLM or Embedder to avoid this issue.
    * @param {string[]} textChunks - An array of text chunks to embed.
    * @returns {Promise<Array<number[]>>} - A promise that resolves to an array of embeddings.
@@ -64,17 +65,27 @@ class OllamaEmbedder {
         `Ollama service could not be reached. Is Ollama running?`
       );
     this.log(
-      `Embedding ${textChunks.length} chunks of text with ${this.model}.`
+      `Embedding ${textChunks.length} chunks of text with ${this.model} in batches of ${this.maxConcurrentChunks}.`
     );
 
     let data = [];
     let error = null;
 
-    for (const chunk of textChunks) {
+    // Process chunks in batches based on maxConcurrentChunks
+    const totalBatches = Math.ceil(
+      textChunks.length / this.maxConcurrentChunks
+    );
+    let currentBatch = 0;
+
+    for (let i = 0; i < textChunks.length; i += this.maxConcurrentChunks) {
+      const batch = textChunks.slice(i, i + this.maxConcurrentChunks);
+      currentBatch++;
+
       try {
-        const res = await this.client.embeddings({
+        // Use input param instead of prompt param to support batch processing
+        const res = await this.client.embed({
           model: this.model,
-          prompt: chunk,
+          input: batch,
           options: {
             // Always set the num_ctx to the max chunk length defined by the user in the settings
             // so that the maximum context window is used and content is not truncated.
@@ -82,11 +93,17 @@ class OllamaEmbedder {
           },
         });
 
-        const { embedding } = res;
-        if (!Array.isArray(embedding) || embedding.length === 0)
-          throw new Error("Ollama returned an empty embedding for chunk!");
+        const { embeddings } = res;
+        if (!Array.isArray(embeddings) || embeddings.length === 0)
+          throw new Error("Ollama returned empty embeddings for batch!");
 
-        data.push(embedding);
+        // Using prompt param in embed() would return a single embedding (number[])
+        // but input param returns an array of embeddings (number[][]) for batch processing.
+        // This is why we spread the embeddings array into the data array.
+        data.push(...embeddings);
+        this.log(
+          `Batch ${currentBatch}/${totalBatches}: Embedded ${embeddings.length} chunks. Total: ${data.length}/${textChunks.length}`
+        );
       } catch (err) {
         this.log(err.message);
         error = err.message;
@@ -307,6 +307,10 @@ const KEY_MAPPING = {
     envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH",
     checks: [nonZero],
   },
+  OllamaEmbeddingBatchSize: {
+    envKey: "OLLAMA_EMBEDDING_BATCH_SIZE",
+    checks: [nonZero],
+  },
 
   // Gemini Embedding Settings
   GeminiEmbeddingApiKey: {
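On the settings-update path, the new KEY_MAPPING entry routes OllamaEmbeddingBatchSize to the OLLAMA_EMBEDDING_BATCH_SIZE variable and runs it through the same nonZero check already applied to the max chunk length. A hypothetical validator in that style, purely to illustrate the contract (the real nonZero lives elsewhere in updateENV.js and may differ):

// Hypothetical sketch of a KEY_MAPPING-style check: return an error string
// when the value is rejected, or null when it passes.
function nonZero(input = "") {
  if (Number(input) <= 0) return "Value must be greater than zero.";
  return null;
}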