Sanitize Metadata Before PG Vector Database Insertion (#4434)

* Fix JSDoc for updateOrCreateCollection

* Add sanitizeForJsonb method to PGVector for safe JSONB handling

This new method recursively sanitizes values intended for JSONB storage, removing disallowed control characters and ensuring safe insertion into PostgreSQL. The method is integrated into the vector insertion process to sanitize metadata before database operations.

* Add unit tests for PGVector.sanitizeForJsonb method

This commit introduces a comprehensive test suite for the PGVector.sanitizeForJsonb method, ensuring it correctly handles various input types, including null, undefined, strings with disallowed control characters, objects, arrays, and Date objects. The tests verify that the method sanitizes inputs without mutating the original data structures.

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
Marcello Fitton 2025-09-29 13:49:45 -07:00 committed by GitHub
parent eb77876127
commit 7ca2753c24
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 132 additions and 4 deletions

View File

@ -0,0 +1,76 @@
const { PGVector } = require("../../../../utils/vectorDbProviders/pgvector");
// Behavioural spec for PGVector.sanitizeForJsonb: which characters are
// stripped from strings, how containers and Dates are handled, and that
// the caller's input is never modified.
describe("PGVector.sanitizeForJsonb", () => {
  // Always invoke through the PGVector object so the method keeps its receiver.
  const sanitize = (value) => PGVector.sanitizeForJsonb(value);

  it("returns null/undefined as-is", () => {
    const fromNull = sanitize(null);
    const fromUndefined = sanitize(undefined);
    expect(fromNull).toBeNull();
    expect(fromUndefined).toBeUndefined();
  });

  it("keeps safe whitespace (tab, LF, CR) and removes disallowed C0 controls", () => {
    // Every code unit below 0x20 other than 9, 10 and 13 must disappear,
    // while letters and the permitted whitespace survive.
    const dirty = "a\u0000\u0001\u0002\tline\ncarriage\rreturn\u001Fend";
    const expected = "a\tline\ncarriage\rreturnend";
    expect(sanitize(dirty)).toBe(expected);
  });

  it("removes only disallowed control chars; keeps normal printable chars", () => {
    const dirty = "Hello\u0000, World! \u0007\u0008\u000B\u000C\u001F";
    expect(sanitize(dirty)).toBe("Hello, World! ");
  });

  it("deeply sanitizes objects", () => {
    const dirty = {
      plain: "ok",
      bad: "has\u0000nul",
      nested: {
        arr: ["fine", "bad\u0001", { deep: "\u0002oops" }],
      },
    };
    const expected = {
      plain: "ok",
      bad: "hasnul",
      nested: { arr: ["fine", "bad", { deep: "oops" }] },
    };
    expect(sanitize(dirty)).toEqual(expected);
  });

  it("deeply sanitizes arrays", () => {
    const dirty = ["\u0000", 1, true, { s: "bad\u0003" }, ["ok", "\u0004bad"]];
    const expected = ["", 1, true, { s: "bad" }, ["ok", "bad"]];
    expect(sanitize(dirty)).toEqual(expected);
  });

  it("converts Date to ISO string", () => {
    const moment = new Date("2020-01-02T03:04:05.000Z");
    const expected = moment.toISOString();
    expect(sanitize(moment)).toBe(expected);
  });

  it("returns primitives unchanged (number, boolean, bigint)", () => {
    for (const primitive of [42, 3.14, true, false]) {
      expect(sanitize(primitive)).toBe(primitive);
    }
    expect(sanitize(BigInt(1))).toBe(BigInt(1));
  });

  it("returns symbol unchanged", () => {
    const marker = Symbol("x");
    expect(sanitize(marker)).toBe(marker);
  });

  it("does not mutate original objects/arrays", () => {
    const sourceObject = { a: "bad\u0000", nested: { b: "ok" } };
    const sourceArray = ["\u0001", { c: "bad\u0002" }];

    // Snapshots taken before sanitizing, used to detect in-place edits.
    const objectSnapshot = JSON.parse(JSON.stringify(sourceObject));
    const arraySnapshot = JSON.parse(JSON.stringify(sourceArray));

    const cleanedObject = sanitize(sourceObject);
    const cleanedArray = sanitize(sourceArray);

    // The inputs still match their snapshots...
    expect(sourceObject).toEqual(objectSnapshot);
    expect(sourceArray).toEqual(arraySnapshot);

    // ...while the returned values are cleaned copies.
    expect(cleanedObject).toEqual({ a: "bad", nested: { b: "ok" } });
    expect(cleanedArray).toEqual(["", { c: "bad" }]);
  });
});

View File

@ -52,6 +52,55 @@ const PGVector = {
console.log(`\x1b[35m[PGVectorDb]\x1b[0m ${message}`, ...args);
},
/**
* Recursively sanitize values intended for JSONB to prevent Postgres errors
* like "unsupported Unicode escape sequence". This primarily removes the
* NUL character (\u0000) and other disallowed control characters from
* strings. Arrays and objects are traversed and sanitized deeply.
* @param {any} value
* @returns {any}
*/
sanitizeForJsonb: function (value) {
// Fast path for null/undefined and primitives that do not need changes
if (value === null || value === undefined) return value;
// Strings: strip NUL and unsafe C0 control characters except common whitespace
if (typeof value === "string") {
// Build a sanitized string by excluding C0 control characters except
// horizontal tab (9), line feed (10), and carriage return (13).
let sanitized = "";
for (let i = 0; i < value.length; i++) {
const code = value.charCodeAt(i);
if (code === 9 || code === 10 || code === 13 || code >= 0x20) {
sanitized += value[i];
}
}
return sanitized;
}
// Arrays: sanitize each element
if (Array.isArray(value)) {
return value.map((item) => this.sanitizeForJsonb(item));
}
// Dates: keep as ISO string
if (value instanceof Date) {
return value.toISOString();
}
// Objects: sanitize each property value
if (typeof value === "object") {
const result = {};
for (const [k, v] of Object.entries(value)) {
result[k] = this.sanitizeForJsonb(v);
}
return result;
}
// Numbers, booleans, etc.
return value;
},
client: function (connectionString = null) {
return new pgsql.Client({
connectionString: connectionString || PGVector.connectionString(),
@ -362,9 +411,11 @@ const PGVector = {
/**
* Update or create a collection in the database
* @param {pgsql.Connection} connection
* @param {{id: number, vector: number[], metadata: Object}[]} submissions
* @param {string} namespace
* @param {Object} params
* @param {pgsql.Connection} params.connection
* @param {{id: number, vector: number[], metadata: Object}[]} params.submissions
* @param {string} params.namespace
* @param {number} params.dimensions
* @returns {Promise<boolean>}
*/
updateOrCreateCollection: async function ({
@ -381,9 +432,10 @@ const PGVector = {
await connection.query(`BEGIN`);
for (const submission of submissions) {
const embedding = `[${submission.vector.map(Number).join(",")}]`; // stringify the vector for pgvector
const sanitizedMetadata = this.sanitizeForJsonb(submission.metadata);
await connection.query(
`INSERT INTO "${PGVector.tableName()}" (id, namespace, embedding, metadata) VALUES ($1, $2, $3, $4)`,
[submission.id, namespace, embedding, submission.metadata]
[submission.id, namespace, embedding, sanitizedMetadata]
);
}
this.log(`Committing ${submissions.length} vectors to ${namespace}`);