From 7ca2753c24b97ce10a3467aee6e0a3a1b24b7ae2 Mon Sep 17 00:00:00 2001
From: Marcello Fitton <106866560+angelplusultra@users.noreply.github.com>
Date: Mon, 29 Sep 2025 13:49:45 -0700
Subject: [PATCH] Sanitize Metadata Before PG Vector Database Insertion (#4434)

* Fix JSDoc for updateOrCreateCollection

* Add sanitizeForJsonb method to PGVector for safe JSONB handling

This new method recursively sanitizes values intended for JSONB storage, removing disallowed control characters and ensuring safe insertion into PostgreSQL. The method is integrated into the vector insertion process to sanitize metadata before database operations.

* Add unit tests for PGVector.sanitizeForJsonb method

This commit introduces a comprehensive test suite for the PGVector.sanitizeForJsonb method, ensuring it correctly handles various input types, including null, undefined, strings with disallowed control characters, objects, arrays, and Date objects. The tests verify that the method sanitizes inputs without mutating the original data structures.

--------- Co-authored-by: Timothy Carambat --- .../vectorDbProviders/pgvector/index.test.js | 76 +++++++++++++++++++ .../utils/vectorDbProviders/pgvector/index.js | 60 ++++++++++++++- 2 files changed, 132 insertions(+), 4 deletions(-) create mode 100644 server/__tests__/utils/vectorDbProviders/pgvector/index.test.js diff --git a/server/__tests__/utils/vectorDbProviders/pgvector/index.test.js b/server/__tests__/utils/vectorDbProviders/pgvector/index.test.js new file mode 100644 index 00000000..33d6266a --- /dev/null +++ b/server/__tests__/utils/vectorDbProviders/pgvector/index.test.js @@ -0,0 +1,76 @@ +const { PGVector } = require("../../../../utils/vectorDbProviders/pgvector"); + +describe("PGVector.sanitizeForJsonb", () => { + it("returns null/undefined as-is", () => { + expect(PGVector.sanitizeForJsonb(null)).toBeNull(); + expect(PGVector.sanitizeForJsonb(undefined)).toBeUndefined(); + }); + + it("keeps safe whitespace (tab, LF, CR) and removes disallowed C0 controls", () => { + const input = "a\u0000\u0001\u0002\tline\ncarriage\rreturn\u001Fend"; + const result = PGVector.sanitizeForJsonb(input); + // Expect all < 0x20 except 9,10,13 removed; keep letters and allowed whitespace + expect(result).toBe("a\tline\ncarriage\rreturnend"); + }); + + it("removes only disallowed control chars; keeps normal printable chars", () => { + const input = "Hello\u0000, World! \u0007\u0008\u000B\u000C\u001F"; + const result = PGVector.sanitizeForJsonb(input); + expect(result).toBe("Hello, World! 
"); + }); + + it("deeply sanitizes objects", () => { + const input = { + plain: "ok", + bad: "has\u0000nul", + nested: { + arr: ["fine", "bad\u0001", { deep: "\u0002oops" }], + }, + }; + const result = PGVector.sanitizeForJsonb(input); + expect(result).toEqual({ + plain: "ok", + bad: "hasnul", + nested: { arr: ["fine", "bad", { deep: "oops" }] }, + }); + }); + + it("deeply sanitizes arrays", () => { + const input = ["\u0000", 1, true, { s: "bad\u0003" }, ["ok", "\u0004bad"]]; + const result = PGVector.sanitizeForJsonb(input); + expect(result).toEqual(["", 1, true, { s: "bad" }, ["ok", "bad"]]); + }); + + it("converts Date to ISO string", () => { + const d = new Date("2020-01-02T03:04:05.000Z"); + expect(PGVector.sanitizeForJsonb(d)).toBe(d.toISOString()); + }); + + it("returns primitives unchanged (number, boolean, bigint)", () => { + expect(PGVector.sanitizeForJsonb(42)).toBe(42); + expect(PGVector.sanitizeForJsonb(3.14)).toBe(3.14); + expect(PGVector.sanitizeForJsonb(true)).toBe(true); + expect(PGVector.sanitizeForJsonb(false)).toBe(false); + expect(PGVector.sanitizeForJsonb(BigInt(1))).toBe(BigInt(1)); + }); + + it("returns symbol unchanged", () => { + const sym = Symbol("x"); + expect(PGVector.sanitizeForJsonb(sym)).toBe(sym); + }); + + it("does not mutate original objects/arrays", () => { + const obj = { a: "bad\u0000", nested: { b: "ok" } }; + const arr = ["\u0001", { c: "bad\u0002" }]; + const objCopy = JSON.parse(JSON.stringify(obj)); + const arrCopy = JSON.parse(JSON.stringify(arr)); + const resultObj = PGVector.sanitizeForJsonb(obj); + const resultArr = PGVector.sanitizeForJsonb(arr); + // Original inputs remain unchanged + expect(obj).toEqual(objCopy); + expect(arr).toEqual(arrCopy); + // Results are sanitized copies + expect(resultObj).toEqual({ a: "bad", nested: { b: "ok" } }); + expect(resultArr).toEqual(["", { c: "bad" }]); + }); +}); diff --git a/server/utils/vectorDbProviders/pgvector/index.js b/server/utils/vectorDbProviders/pgvector/index.js 
index d5c86907..990498eb 100644 --- a/server/utils/vectorDbProviders/pgvector/index.js +++ b/server/utils/vectorDbProviders/pgvector/index.js @@ -52,6 +52,55 @@ const PGVector = { console.log(`\x1b[35m[PGVectorDb]\x1b[0m ${message}`, ...args); }, + /** + * Recursively sanitize values intended for JSONB to prevent Postgres errors + * like "unsupported Unicode escape sequence". This primarily removes the + * NUL character (\u0000) and other disallowed control characters from + * strings. Arrays and objects are traversed and sanitized deeply. + * @param {any} value + * @returns {any} + */ + sanitizeForJsonb: function (value) { + // Fast path for null/undefined and primitives that do not need changes + if (value === null || value === undefined) return value; + + // Strings: strip NUL and unsafe C0 control characters except common whitespace + if (typeof value === "string") { + // Build a sanitized string by excluding C0 control characters except + // horizontal tab (9), line feed (10), and carriage return (13). + let sanitized = ""; + for (let i = 0; i < value.length; i++) { + const code = value.charCodeAt(i); + if (code === 9 || code === 10 || code === 13 || code >= 0x20) { + sanitized += value[i]; + } + } + return sanitized; + } + + // Arrays: sanitize each element + if (Array.isArray(value)) { + return value.map((item) => this.sanitizeForJsonb(item)); + } + + // Dates: keep as ISO string + if (value instanceof Date) { + return value.toISOString(); + } + + // Objects: sanitize each property value + if (typeof value === "object") { + const result = {}; + for (const [k, v] of Object.entries(value)) { + result[k] = this.sanitizeForJsonb(v); + } + return result; + } + + // Numbers, booleans, etc. 
+ return value; + }, + client: function (connectionString = null) { return new pgsql.Client({ connectionString: connectionString || PGVector.connectionString(), @@ -362,9 +411,11 @@ const PGVector = { /** * Update or create a collection in the database - * @param {pgsql.Connection} connection - * @param {{id: number, vector: number[], metadata: Object}[]} submissions - * @param {string} namespace + * @param {Object} params + * @param {pgsql.Connection} params.connection + * @param {{id: number, vector: number[], metadata: Object}[]} params.submissions + * @param {string} params.namespace + * @param {number} params.dimensions * @returns {Promise} */ updateOrCreateCollection: async function ({ @@ -381,9 +432,10 @@ const PGVector = { await connection.query(`BEGIN`); for (const submission of submissions) { const embedding = `[${submission.vector.map(Number).join(",")}]`; // stringify the vector for pgvector + const sanitizedMetadata = this.sanitizeForJsonb(submission.metadata); await connection.query( `INSERT INTO "${PGVector.tableName()}" (id, namespace, embedding, metadata) VALUES ($1, $2, $3, $4)`, - [submission.id, namespace, embedding, submission.metadata] + [submission.id, namespace, embedding, sanitizedMetadata] ); } this.log(`Committing ${submissions.length} vectors to ${namespace}`);