merlyn/server/__tests__/utils/TextSplitter/index.test.js
Timothy Carambat 2c19dd09ed
Native Embedder model selection (incl: Multilingual support) (#3835)
* WIP on embedder selection
TODO: apply splitting and query prefixes (if applicable)

* wip on upsert

* Support base model
support nomic-text-embed-v1
support multilingual-e5-small
Add prefixing for both embedding and query for RAG tasks
Add chunking prefix to all vector dbs to apply prefix when possible
Show dropdown and auto-pull on new selection

* norm translations

* move supported models to constants
handle null selection or invalid selection on dropdown
update comments

* dev

* patch text splitter maximums for now

* normalize translations

* add tests for splitter functionality

* normalize

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
2025-07-22 10:07:20 -07:00

105 lines
3.3 KiB
JavaScript

const { TextSplitter } = require("../../../utils/TextSplitter");
const _ = require("lodash");
/**
 * Jest suite for the TextSplitter utility.
 *
 * Covers: chunk counts for a fixed sample text, construction with an omitted
 * chunkOverlap, rejection of a chunkOverlap larger than chunkSize, the shape
 * returned by TextSplitter.buildHeaderMeta, and chunkPrefix handling (with and
 * without chunk header metadata).
 */
describe("TextSplitter", () => {
  // One 43-character sentence repeated twice; every splitting test uses it.
  const sampleText = "This is a test text to be split into chunks".repeat(2);

  /**
   * Builds a TextSplitter from `options` and splits the shared sample text.
   * @param {object} options - Configuration handed straight to the constructor.
   * @returns {Promise<string[]>} Whatever `splitText` yields for the sample text.
   */
  const splitSample = async (options) => {
    const splitter = new TextSplitter(options);
    return splitter.splitText(sampleText);
  };

  test("should split long text into n sized chunks", async () => {
    const pieces = await splitSample({ chunkSize: 20, chunkOverlap: 0 });

    expect(pieces.length).toEqual(5);
  });

  test("applies chunk overlap of 20 characters on invalid chunkOverlap", async () => {
    // chunkOverlap is deliberately left out; the assertion pins the chunk
    // count produced by whatever overlap the splitter falls back to.
    const pieces = await splitSample({ chunkSize: 30 });

    expect(pieces.length).toEqual(6);
  });

  test("does not allow chunkOverlap to be greater than chunkSize", async () => {
    // Construction alone (no splitText call) is expected to throw.
    const buildInvalidSplitter = () =>
      new TextSplitter({ chunkSize: 20, chunkOverlap: 21 });

    expect(buildInvalidSplitter).toThrow();
  });

  test("applies specific metadata to stringifyHeader to each chunk", async () => {
    const docMetadata = {
      id: "123e4567-e89b-12d3-a456-426614174000",
      url: "https://example.com",
      title: "Example",
      docAuthor: "John Doe",
      published: "2021-01-01",
      chunkSource: "link://https://example.com",
      description: "This is a test text to be split into chunks",
    };

    const headerMeta = TextSplitter.buildHeaderMeta(docMetadata);

    // Only title, url and published are expected to survive, under these keys.
    const { title, url, published } = docMetadata;
    expect(headerMeta).toEqual({
      sourceDocument: title,
      source: url,
      published,
    });
  });

  test("applies a valid chunkPrefix to each chunk", async () => {
    const baseOptions = { chunkSize: 20, chunkOverlap: 0 };

    // A non-empty prefix must lead every chunk.
    for (const prefix of ["testing: ", "testing2: "]) {
      const prefixed = await splitSample({ ...baseOptions, chunkPrefix: prefix });
      expect(prefixed.length).toEqual(5);
      expect(prefixed.every((piece) => piece.startsWith(prefix))).toBe(true);
    }

    // With an undefined or empty prefix, no chunk may start with ": ".
    for (const prefix of [undefined, ""]) {
      const unprefixed = await splitSample({ ...baseOptions, chunkPrefix: prefix });
      expect(unprefixed.length).toEqual(5);
      expect(unprefixed.every((piece) => !piece.startsWith(": "))).toBe(true);
    }

    // Applied chunkPrefix with chunkHeaderMeta
    const withHeader = await splitSample({
      ...baseOptions,
      chunkHeaderMeta: TextSplitter.buildHeaderMeta({
        title: "Example",
        url: "https://example.com",
        published: "2021-01-01",
      }),
      chunkPrefix: "testing3: ",
    });
    expect(withHeader.length).toEqual(5);
    expect(
      withHeader.every((piece) =>
        piece.startsWith("testing3: <document_metadata>")
      )
    ).toBe(true);
  });
});