Sanitize Metadata Before PG Vector Database Insertion (#4434)
* Fix JSDoc for updateOrCreateCollection
* Add sanitizeForJsonb method to PGVector for safe JSONB handling. This new method recursively sanitizes values intended for JSONB storage, removing disallowed control characters and ensuring safe insertion into PostgreSQL. The method is integrated into the vector insertion process to sanitize metadata before database operations.
* Add unit tests for PGVector.sanitizeForJsonb method. This commit introduces a comprehensive test suite for the PGVector.sanitizeForJsonb method, ensuring it correctly handles various input types, including null, undefined, strings with disallowed control characters, objects, arrays, and Date objects. The tests verify that the method sanitizes inputs without mutating the original data structures.
---------
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
parent
eb77876127
commit
7ca2753c24
@ -0,0 +1,76 @@
|
||||
const { PGVector } = require("../../../../utils/vectorDbProviders/pgvector");
|
||||
|
||||
describe("PGVector.sanitizeForJsonb", () => {
  // Every case calls the method on the PGVector object itself.
  const sanitize = (value) => PGVector.sanitizeForJsonb(value);

  it("returns null/undefined as-is", () => {
    const fromNull = sanitize(null);
    const fromUndefined = sanitize(undefined);

    expect(fromNull).toBeNull();
    expect(fromUndefined).toBeUndefined();
  });

  it("keeps safe whitespace (tab, LF, CR) and removes disallowed C0 controls", () => {
    // Every code point below 0x20 other than 9, 10 and 13 must disappear;
    // letters and the three allowed whitespace characters must survive.
    const dirty = "a\u0000\u0001\u0002\tline\ncarriage\rreturn\u001Fend";
    const expected = "a\tline\ncarriage\rreturnend";

    expect(sanitize(dirty)).toBe(expected);
  });

  it("removes only disallowed control chars; keeps normal printable chars", () => {
    const dirty = "Hello\u0000, World! \u0007\u0008\u000B\u000C\u001F";

    expect(sanitize(dirty)).toBe("Hello, World! ");
  });

  it("deeply sanitizes objects", () => {
    const dirty = {
      plain: "ok",
      bad: "has\u0000nul",
      nested: {
        arr: ["fine", "bad\u0001", { deep: "\u0002oops" }],
      },
    };
    const expected = {
      plain: "ok",
      bad: "hasnul",
      nested: { arr: ["fine", "bad", { deep: "oops" }] },
    };

    expect(sanitize(dirty)).toEqual(expected);
  });

  it("deeply sanitizes arrays", () => {
    const dirty = ["\u0000", 1, true, { s: "bad\u0003" }, ["ok", "\u0004bad"]];
    const expected = ["", 1, true, { s: "bad" }, ["ok", "bad"]];

    expect(sanitize(dirty)).toEqual(expected);
  });

  it("converts Date to ISO string", () => {
    const when = new Date("2020-01-02T03:04:05.000Z");

    expect(sanitize(when)).toBe(when.toISOString());
  });

  it("returns primitives unchanged (number, boolean, bigint)", () => {
    expect(sanitize(42)).toBe(42);
    expect(sanitize(3.14)).toBe(3.14);
    expect(sanitize(true)).toBe(true);
    expect(sanitize(false)).toBe(false);
    expect(sanitize(BigInt(1))).toBe(BigInt(1));
  });

  it("returns symbol unchanged", () => {
    const marker = Symbol("x");

    expect(sanitize(marker)).toBe(marker);
  });

  it("does not mutate original objects/arrays", () => {
    const sourceObject = { a: "bad\u0000", nested: { b: "ok" } };
    const sourceArray = ["\u0001", { c: "bad\u0002" }];

    // Snapshots taken before sanitizing, used to detect in-place mutation.
    const objectSnapshot = JSON.parse(JSON.stringify(sourceObject));
    const arraySnapshot = JSON.parse(JSON.stringify(sourceArray));

    const cleanedObject = sanitize(sourceObject);
    const cleanedArray = sanitize(sourceArray);

    // The inputs still match their pre-call snapshots.
    expect(sourceObject).toEqual(objectSnapshot);
    expect(sourceArray).toEqual(arraySnapshot);

    // The return values are sanitized copies.
    expect(cleanedObject).toEqual({ a: "bad", nested: { b: "ok" } });
    expect(cleanedArray).toEqual(["", { c: "bad" }]);
  });
});
|
||||
@ -52,6 +52,55 @@ const PGVector = {
|
||||
console.log(`\x1b[35m[PGVectorDb]\x1b[0m ${message}`, ...args);
|
||||
},
|
||||
|
||||
/**
|
||||
* Recursively sanitize values intended for JSONB to prevent Postgres errors
|
||||
* like "unsupported Unicode escape sequence". This primarily removes the
|
||||
* NUL character (\u0000) and other disallowed control characters from
|
||||
* strings. Arrays and objects are traversed and sanitized deeply.
|
||||
* @param {any} value
|
||||
* @returns {any}
|
||||
*/
|
||||
sanitizeForJsonb: function (value) {
|
||||
// Fast path for null/undefined and primitives that do not need changes
|
||||
if (value === null || value === undefined) return value;
|
||||
|
||||
// Strings: strip NUL and unsafe C0 control characters except common whitespace
|
||||
if (typeof value === "string") {
|
||||
// Build a sanitized string by excluding C0 control characters except
|
||||
// horizontal tab (9), line feed (10), and carriage return (13).
|
||||
let sanitized = "";
|
||||
for (let i = 0; i < value.length; i++) {
|
||||
const code = value.charCodeAt(i);
|
||||
if (code === 9 || code === 10 || code === 13 || code >= 0x20) {
|
||||
sanitized += value[i];
|
||||
}
|
||||
}
|
||||
return sanitized;
|
||||
}
|
||||
|
||||
// Arrays: sanitize each element
|
||||
if (Array.isArray(value)) {
|
||||
return value.map((item) => this.sanitizeForJsonb(item));
|
||||
}
|
||||
|
||||
// Dates: keep as ISO string
|
||||
if (value instanceof Date) {
|
||||
return value.toISOString();
|
||||
}
|
||||
|
||||
// Objects: sanitize each property value
|
||||
if (typeof value === "object") {
|
||||
const result = {};
|
||||
for (const [k, v] of Object.entries(value)) {
|
||||
result[k] = this.sanitizeForJsonb(v);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Numbers, booleans, etc.
|
||||
return value;
|
||||
},
|
||||
|
||||
client: function (connectionString = null) {
|
||||
return new pgsql.Client({
|
||||
connectionString: connectionString || PGVector.connectionString(),
|
||||
@ -362,9 +411,11 @@ const PGVector = {
|
||||
|
||||
/**
|
||||
* Update or create a collection in the database
|
||||
* @param {pgsql.Connection} connection
|
||||
* @param {{id: number, vector: number[], metadata: Object}[]} submissions
|
||||
* @param {string} namespace
|
||||
* @param {Object} params
|
||||
* @param {pgsql.Connection} params.connection
|
||||
* @param {{id: number, vector: number[], metadata: Object}[]} params.submissions
|
||||
* @param {string} params.namespace
|
||||
* @param {number} params.dimensions
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
updateOrCreateCollection: async function ({
|
||||
@ -381,9 +432,10 @@ const PGVector = {
|
||||
await connection.query(`BEGIN`);
|
||||
for (const submission of submissions) {
|
||||
const embedding = `[${submission.vector.map(Number).join(",")}]`; // stringify the vector for pgvector
|
||||
const sanitizedMetadata = this.sanitizeForJsonb(submission.metadata);
|
||||
await connection.query(
|
||||
`INSERT INTO "${PGVector.tableName()}" (id, namespace, embedding, metadata) VALUES ($1, $2, $3, $4)`,
|
||||
[submission.id, namespace, embedding, submission.metadata]
|
||||
[submission.id, namespace, embedding, sanitizedMetadata]
|
||||
);
|
||||
}
|
||||
this.log(`Committing ${submissions.length} vectors to ${namespace}`);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user