VectorDB class migration (#4787)

* Migrate Astra to class (#4722)

migrate astra to class

* Migrate LanceDB to class (#4721)

migrate lancedb to class

* Migrate Pinecone to class (#4726)

migrate pinecone to class

* Migrate Zilliz to class (#4729)

migrate zilliz to class

* Migrate Weaviate to class (#4728)

migrate weaviate to class

* Migrate Qdrant to class (#4727)

migrate qdrant to class

* Migrate Milvus to class (#4725)

migrate milvus to class

* Migrate Chroma to class (#4723)

migrate chroma to class

* Migrate Chroma Cloud to class (#4724)

* migrate chroma to class

* migrate chroma cloud to class

* move limits to class field

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>

* Migrate PGVector to class (#4730)

* migrate pgvector to class

* patch pgvector test

* convert connectionString, tableName, and validateConnection to static methods

* move instance properties to class fields

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>

* Refactor Zilliz Cloud vector DB provider (#4749)

simplify zilliz implementation by using milvus as base class

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>

* VectorDatabase base class (#4738)

create generic VectorDatabase base class

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>

* Extend VectorDatabase base class to all providers (#4755)

extend VectorDatabase base class to all providers

* patch lancedb import

* breakout name and add generic logger

* dev tag build

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
Sean Hatfield 2026-01-13 15:24:42 -08:00 committed by GitHub
parent 7c3b7906e7
commit 5039045f0c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 892 additions and 860 deletions

View File

@ -6,7 +6,7 @@ concurrency:
on:
push:
branches: ['4841-aws-bedrock-api-key'] # put your current branch to create a build. Core team only.
branches: ['vectordb-class-migration'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'

View File

@ -1,4 +1,6 @@
const { PGVector } = require("../../../../utils/vectorDbProviders/pgvector");
const { PGVector: PGVectorClass } = require("../../../../utils/vectorDbProviders/pgvector");
const PGVector = new PGVectorClass();
describe("PGVector.sanitizeForJsonb", () => {
it("returns null/undefined as-is", () => {

View File

@ -86,40 +86,40 @@ function getVectorDbClass(getExactly = null) {
switch (vectorSelection) {
case "pinecone":
const { Pinecone } = require("../vectorDbProviders/pinecone");
return Pinecone;
return new Pinecone();
case "chroma":
const { Chroma } = require("../vectorDbProviders/chroma");
return Chroma;
return new Chroma();
case "chromacloud":
const { ChromaCloud } = require("../vectorDbProviders/chromacloud");
return ChromaCloud;
return new ChromaCloud();
case "lancedb":
const { LanceDb } = require("../vectorDbProviders/lance");
return LanceDb;
return new LanceDb();
case "weaviate":
const { Weaviate } = require("../vectorDbProviders/weaviate");
return Weaviate;
return new Weaviate();
case "qdrant":
const { QDrant } = require("../vectorDbProviders/qdrant");
return QDrant;
return new QDrant();
case "milvus":
const { Milvus } = require("../vectorDbProviders/milvus");
return Milvus;
return new Milvus();
case "zilliz":
const { Zilliz } = require("../vectorDbProviders/zilliz");
return Zilliz;
return new Zilliz();
case "astra":
const { AstraDB } = require("../vectorDbProviders/astra");
return AstraDB;
return new AstraDB();
case "pgvector":
const { PGVector } = require("../vectorDbProviders/pgvector");
return PGVector;
return new PGVector();
default:
console.error(
`\x1b[31m[ENV ERROR]\x1b[0m No VECTOR_DB value found in environment! Falling back to LanceDB`
);
const { LanceDb: DefaultLanceDb } = require("../vectorDbProviders/lance");
return DefaultLanceDb;
return new DefaultLanceDb();
}
}

View File

@ -5,6 +5,7 @@ const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid");
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const { VectorDatabase } = require("../base");
const sanitizeNamespace = (namespace) => {
// If namespace already starts with ns_, don't add it again
@ -22,14 +23,21 @@ const collectionExists = async function (client, namespace) {
return collections.includes(namespace);
}
} catch (error) {
console.log("Astra::collectionExists check error", error?.message || error);
this.logger("collectionExists check error", error?.message || error);
return false; // Return false for any error to allow creation attempt
}
};
const AstraDB = {
name: "AstraDB",
connect: async function () {
class AstraDB extends VectorDatabase {
constructor() {
super();
}
get name() {
return "AstraDB";
}
async connect() {
if (process.env.VECTOR_DB !== "astra")
throw new Error("AstraDB::Invalid ENV settings");
@ -38,21 +46,24 @@ const AstraDB = {
process?.env?.ASTRA_DB_ENDPOINT
);
return { client };
},
heartbeat: async function () {
}
async heartbeat() {
return { heartbeat: Number(new Date()) };
},
}
// Astra interface will return a valid collection object even if the collection
// does not actually exist. So we run a simple check which will always throw
// when the table truly does not exist. Faster than iterating all collections.
isRealCollection: async function (astraCollection = null) {
async isRealCollection(astraCollection = null) {
if (!astraCollection) return false;
return await astraCollection
.countDocuments()
.then(() => true)
.catch(() => false);
},
totalVectors: async function () {
}
async totalVectors() {
const { client } = await this.connect();
const collectionNames = await this.allNamespaces(client);
var totalVectors = 0;
@ -62,13 +73,15 @@ const AstraDB = {
totalVectors += count ? count : 0;
}
return totalVectors;
},
namespaceCount: async function (_namespace = null) {
}
async namespaceCount(_namespace = null) {
const { client } = await this.connect();
const namespace = await this.namespace(client, _namespace);
return namespace?.vectorCount || 0;
},
namespace: async function (client, namespace = null) {
}
async namespace(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const sanitizedNamespace = sanitizeNamespace(namespace);
const collection = await client
@ -77,7 +90,7 @@ const AstraDB = {
if (!(await this.isRealCollection(collection))) return null;
const count = await collection.countDocuments().catch((e) => {
console.error("Astra::namespaceExists", e.message);
this.logger("namespaceExists", e.message);
return null;
});
@ -86,27 +99,31 @@ const AstraDB = {
...collection,
vectorCount: typeof count === "number" ? count : 0,
};
},
hasNamespace: async function (namespace = null) {
}
async hasNamespace(namespace = null) {
if (!namespace) return false;
const { client } = await this.connect();
return await this.namespaceExists(client, namespace);
},
namespaceExists: async function (client, namespace = null) {
}
async namespaceExists(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const sanitizedNamespace = sanitizeNamespace(namespace);
const collection = await client.collection(sanitizedNamespace);
return await this.isRealCollection(collection);
},
deleteVectorsInNamespace: async function (client, namespace = null) {
}
async deleteVectorsInNamespace(client, namespace = null) {
const sanitizedNamespace = sanitizeNamespace(namespace);
await client.dropCollection(sanitizedNamespace);
return true;
},
}
// AstraDB requires a dimension aspect for collection creation
// we pass this in from the first chunk to infer the dimensions like other
// providers do.
getOrCreateCollection: async function (client, namespace, dimensions = null) {
async getOrCreateCollection(client, namespace, dimensions = null) {
const sanitizedNamespace = sanitizeNamespace(namespace);
try {
const exists = await collectionExists(client, sanitizedNamespace);
@ -132,14 +149,12 @@ const AstraDB = {
return await client.collection(sanitizedNamespace);
} catch (error) {
console.error(
"Astra::getOrCreateCollection error",
error?.message || error
);
this.logger("getOrCreateCollection", error?.message || error);
throw error;
}
},
addDocumentToNamespace: async function (
}
async addDocumentToNamespace(
namespace,
documentData = {},
fullFilePath = null,
@ -151,7 +166,7 @@ const AstraDB = {
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
this.logger("Adding new vectorized document into namespace", namespace);
if (!skipCache) {
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
@ -210,7 +225,7 @@ const AstraDB = {
});
const textChunks = await textSplitter.splitText(pageContent);
console.log("Snippets created from document:", textChunks.length);
this.logger("Snippets created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
@ -246,7 +261,7 @@ const AstraDB = {
if (vectors.length > 0) {
const chunks = [];
console.log("Inserting vectorized chunks into Astra DB.");
this.logger("Inserting vectorized chunks into Astra DB.");
// AstraDB has maximum upsert size of 20 records per-request so we have to use a lower chunk size here
// in order to do the queries - this takes a lot more time than other providers but there
@ -266,11 +281,12 @@ const AstraDB = {
await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null };
} catch (e) {
console.error("addDocumentToNamespace", e.message);
this.logger("addDocumentToNamespace", e.message);
return { vectorized: false, error: e.message };
}
},
deleteDocumentFromNamespace: async function (namespace, docId) {
}
async deleteDocumentFromNamespace(namespace, docId) {
const { DocumentVectors } = require("../../../models/vectors");
const { client } = await this.connect();
namespace = sanitizeNamespace(namespace);
@ -293,8 +309,9 @@ const AstraDB = {
const indexes = knownDocuments.map((doc) => doc.id);
await DocumentVectors.deleteIds(indexes);
return true;
},
performSimilaritySearch: async function ({
}
async performSimilaritySearch({
namespace = null,
input = "",
LLMConnector = null,
@ -336,8 +353,9 @@ const AstraDB = {
sources: this.curateSources(sources),
message: false,
};
},
similarityResponse: async function ({
}
async similarityResponse({
client,
namespace,
queryVector,
@ -367,8 +385,8 @@ const AstraDB = {
responses.forEach((response) => {
if (response.$similarity < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(response.metadata))) {
console.log(
"AstraDB: A source was filtered from context as it's parent document is pinned."
this.logger(
"A source was filtered from context as it's parent document is pinned."
);
return;
}
@ -380,8 +398,9 @@ const AstraDB = {
result.scores.push(response.$similarity);
});
return result;
},
allNamespaces: async function (client) {
}
async allNamespaces(client) {
try {
let header = new Headers();
header.append("Token", client?.httpClient?.applicationToken);
@ -403,11 +422,12 @@ const AstraDB = {
const collections = resp ? JSON.parse(resp)?.status?.collections : [];
return collections;
} catch (e) {
console.error("Astra::AllNamespace", e);
this.logger("AllNamespace", e);
return [];
}
},
"namespace-stats": async function (reqBody = {}) {
}
async "namespace-stats"(reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { client } = await this.connect();
@ -417,8 +437,9 @@ const AstraDB = {
return stats
? stats
: { message: "No stats were able to be fetched from DB for namespace" };
},
"delete-namespace": async function (reqBody = {}) {
}
async "delete-namespace"(reqBody = {}) {
const { namespace = null } = reqBody;
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace)))
@ -431,8 +452,9 @@ const AstraDB = {
details?.vectorCount || "all"
} vectors.`,
};
},
curateSources: function (sources = []) {
}
curateSources(sources = []) {
const documents = [];
for (const source of sources) {
if (Object.keys(source).length > 0) {
@ -446,7 +468,7 @@ const AstraDB = {
}
return documents;
},
};
}
}
module.exports.AstraDB = AstraDB;

View File

@ -0,0 +1,201 @@
/**
 * Base class for all Vector Database providers.
 * All vector database providers should extend this class and implement/override the necessary methods.
 */
class VectorDatabase {
  /**
   * Provider name used in log prefixes and error messages.
   * Subclasses must override this getter with their own name.
   * @returns {string}
   */
  get name() {
    return "VectorDatabase";
  }

  constructor() {
    // This class only defines the provider contract - block direct instantiation.
    if (this.constructor === VectorDatabase) {
      throw new Error("VectorDatabase cannot be instantiated directly");
    }
  }

  /**
   * Throws a descriptive error identifying the provider and the missing method.
   * Naming both makes a partially-migrated provider far easier to trace than
   * a generic "must be implemented" message.
   * @param {string} method - Name of the unimplemented method
   * @throws {Error} always
   */
  #unimplemented(method) {
    throw new Error(`${this.name}::${method} must be implemented by provider`);
  }

  /**
   * Connect to vector database client
   * @returns {Promise<{client: any}>}
   */
  async connect() {
    this.#unimplemented("connect");
  }

  /**
   * Heartbeat check for vector database client
   * @returns {Promise<{heartbeat: number}>}
   */
  async heartbeat() {
    this.#unimplemented("heartbeat");
  }

  /**
   * Get total number of vectors across all namespaces
   * @returns {Promise<number>}
   */
  async totalVectors() {
    this.#unimplemented("totalVectors");
  }

  /**
   * Get count of vectors in a specific namespace
   * @param {string} namespace - Namespace to count vectors in
   * @returns {Promise<number>}
   */
  async namespaceCount(namespace = null) {
    this.#unimplemented("namespaceCount");
  }

  /**
   * Get namespace details
   * @param {any} client - Vector database client
   * @param {string} namespace - Namespace to get
   * @returns {Promise<any>}
   */
  async namespace(client, namespace = null) {
    this.#unimplemented("namespace");
  }

  /**
   * Check if a namespace exists
   * @param {string} namespace - Namespace to check
   * @returns {Promise<boolean>}
   */
  async hasNamespace(namespace = null) {
    this.#unimplemented("hasNamespace");
  }

  /**
   * Check if a namespace exists with a client
   * @param {any} client - Vector database client
   * @param {string} namespace - Namespace to check
   * @returns {Promise<boolean>}
   */
  async namespaceExists(client, namespace = null) {
    this.#unimplemented("namespaceExists");
  }

  /**
   * Delete all vectors in a namespace
   * @param {any} client - Vector database client
   * @param {string} namespace - Namespace to delete vectors from
   * @returns {Promise<boolean>}
   */
  async deleteVectorsInNamespace(client, namespace = null) {
    this.#unimplemented("deleteVectorsInNamespace");
  }

  /**
   * Add a document to a namespace
   * @param {string} namespace - Namespace to add document to
   * @param {Object} documentData - Document data
   * @param {string} fullFilePath - Full file path
   * @param {boolean} skipCache - Skip cache
   * @returns {Promise<{vectorized: boolean, error: string|null}>}
   */
  async addDocumentToNamespace(
    namespace,
    documentData = {},
    fullFilePath = null,
    skipCache = false
  ) {
    this.#unimplemented("addDocumentToNamespace");
  }

  /**
   * Delete a document from namespace
   * @param {string} namespace - Namespace to delete document from
   * @param {string} docId - Document id
   * @returns {Promise<boolean>}
   */
  async deleteDocumentFromNamespace(namespace, docId) {
    this.#unimplemented("deleteDocumentFromNamespace");
  }

  /**
   * Perform a similarity search
   * @param {Object} params - Search parameters
   * @param {string} params.namespace - Namespace to search in
   * @param {string} params.input - Input text to search for
   * @param {any} params.LLMConnector - LLM connector for embeddings
   * @param {number} params.similarityThreshold - Similarity threshold
   * @param {number} params.topN - Number of results to return
   * @param {string[]} params.filterIdentifiers - Identifiers to filter out
   * @returns {Promise<{contextTexts: string[], sources: any[], message: string|boolean}>}
   */
  async performSimilaritySearch({
    namespace = null,
    input = "",
    LLMConnector = null,
    similarityThreshold = 0.25,
    topN = 4,
    filterIdentifiers = [],
  }) {
    this.#unimplemented("performSimilaritySearch");
  }

  /**
   * Perform a similarity search and return raw results
   * @param {Object} params - Search parameters
   * @param {any} params.client - Vector database client
   * @param {string} params.namespace - Namespace to search in
   * @param {number[]} params.queryVector - Query vector
   * @param {number} params.similarityThreshold - Similarity threshold
   * @param {number} params.topN - Number of results to return
   * @param {string[]} params.filterIdentifiers - Identifiers to filter out
   * @returns {Promise<{contextTexts: string[], sourceDocuments: any[], scores: number[]}>}
   */
  async similarityResponse({
    client,
    namespace,
    queryVector,
    similarityThreshold = 0.25,
    topN = 4,
    filterIdentifiers = [],
  }) {
    this.#unimplemented("similarityResponse");
  }

  /**
   * Get namespace stats
   * @param {Object} reqBody - Request body
   * @param {string} reqBody.namespace - Namespace to get stats for
   * @returns {Promise<any>}
   */
  async "namespace-stats"(reqBody = {}) {
    this.#unimplemented("namespace-stats");
  }

  /**
   * Delete a namespace
   * @param {Object} reqBody - Request body
   * @param {string} reqBody.namespace - Namespace to delete
   * @returns {Promise<{message: string}>}
   */
  async "delete-namespace"(reqBody = {}) {
    this.#unimplemented("delete-namespace");
  }

  /**
   * Reset vector database (delete all data)
   * @returns {Promise<{reset: boolean}>}
   */
  async reset() {
    this.#unimplemented("reset");
  }

  /**
   * Curate sources from search results
   * @param {any[]} sources - Sources to curate
   * @returns {any[]}
   */
  curateSources(sources = []) {
    this.#unimplemented("curateSources");
  }

  /**
   * Generic prefixed logger shared by all providers.
   * @param {string|null} message - Message to log
   * @param {...any} args - Additional values passed through to console.log
   */
  logger(message = null, ...args) {
    console.log(`\x1b[36m[VectorDB::${this.name}]\x1b[0m ${message}`, ...args);
  }
}
module.exports = { VectorDatabase };

View File

@ -6,12 +6,20 @@ const { v4: uuidv4 } = require("uuid");
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { parseAuthHeader } = require("../../http");
const { sourceIdentifier } = require("../../chats");
const { VectorDatabase } = require("../base");
const COLLECTION_REGEX = new RegExp(
/^(?!\d+\.\d+\.\d+\.\d+$)(?!.*\.\.)(?=^[a-zA-Z0-9][a-zA-Z0-9_-]{1,61}[a-zA-Z0-9]$).{3,63}$/
);
const Chroma = {
name: "Chroma",
class Chroma extends VectorDatabase {
constructor() {
super();
}
get name() {
return "Chroma";
}
// Chroma DB has specific requirements for collection names:
// (1) Must contain 3-63 characters
// (2) Must start and end with an alphanumeric character
@ -20,7 +28,7 @@ const Chroma = {
// (5) Cannot be a valid IPv4 address
// We need to enforce these rules by normalizing the collection names
// before communicating with the Chroma DB.
normalize: function (inputString) {
normalize(inputString) {
if (COLLECTION_REGEX.test(inputString)) return inputString;
let normalized = inputString.replace(/[^a-zA-Z0-9_-]/g, "-");
@ -54,8 +62,9 @@ const Chroma = {
}
return normalized;
},
connect: async function () {
}
async connect() {
if (process.env.VECTOR_DB !== "chroma")
throw new Error("Chroma::Invalid ENV settings");
@ -79,12 +88,14 @@ const Chroma = {
"ChromaDB::Invalid Heartbeat received - is the instance online?"
);
return { client };
},
heartbeat: async function () {
}
async heartbeat() {
const { client } = await this.connect();
return { heartbeat: await client.heartbeat() };
},
totalVectors: async function () {
}
async totalVectors() {
const { client } = await this.connect();
const collections = await client.listCollections();
var totalVectors = 0;
@ -96,19 +107,22 @@ const Chroma = {
totalVectors += await collection.count();
}
return totalVectors;
},
distanceToSimilarity: function (distance = null) {
}
distanceToSimilarity(distance = null) {
if (distance === null || typeof distance !== "number") return 0.0;
if (distance >= 1.0) return 1;
if (distance < 0) return 1 - Math.abs(distance);
return 1 - distance;
},
namespaceCount: async function (_namespace = null) {
}
async namespaceCount(_namespace = null) {
const { client } = await this.connect();
const namespace = await this.namespace(client, this.normalize(_namespace));
return namespace?.vectorCount || 0;
},
similarityResponse: async function ({
}
async similarityResponse({
client,
namespace,
queryVector,
@ -137,8 +151,8 @@ const Chroma = {
if (
filterIdentifiers.includes(sourceIdentifier(response.metadatas[0][i]))
) {
console.log(
"Chroma: A source was filtered from context as it's parent document is pinned."
this.logger(
"A source was filtered from context as it's parent document is pinned."
);
return;
}
@ -149,8 +163,9 @@ const Chroma = {
});
return result;
},
namespace: async function (client, namespace = null) {
}
async namespace(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client
.getCollection({ name: this.normalize(namespace) })
@ -161,27 +176,31 @@ const Chroma = {
...collection,
vectorCount: await collection.count(),
};
},
hasNamespace: async function (namespace = null) {
}
async hasNamespace(namespace = null) {
if (!namespace) return false;
const { client } = await this.connect();
return await this.namespaceExists(client, this.normalize(namespace));
},
namespaceExists: async function (client, namespace = null) {
}
async namespaceExists(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client
.getCollection({ name: this.normalize(namespace) })
.catch((e) => {
console.error("ChromaDB::namespaceExists", e.message);
this.logger("namespaceExists", e.message);
return null;
});
return !!collection;
},
deleteVectorsInNamespace: async function (client, namespace = null) {
}
async deleteVectorsInNamespace(client, namespace = null) {
await client.deleteCollection({ name: this.normalize(namespace) });
return true;
},
addDocumentToNamespace: async function (
}
async addDocumentToNamespace(
namespace,
documentData = {},
fullFilePath = null,
@ -192,7 +211,7 @@ const Chroma = {
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
this.logger("Adding new vectorized document into namespace", namespace);
if (!skipCache) {
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
@ -254,7 +273,7 @@ const Chroma = {
});
const textChunks = await textSplitter.splitText(pageContent);
console.log("Snippets created from document:", textChunks.length);
this.logger("Snippets created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
@ -298,16 +317,16 @@ const Chroma = {
if (vectors.length > 0) {
const chunks = [];
console.log("Inserting vectorized chunks into Chroma collection.");
this.logger("Inserting vectorized chunks into Chroma collection.");
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
try {
await this.smartAdd(collection, submission);
console.log(
this.logger(
`Successfully added ${submission.ids.length} vectors to collection ${this.normalize(namespace)}`
);
} catch (error) {
console.error("Error adding to ChromaDB:", error);
this.logger("Error adding to ChromaDB:", error);
throw new Error(`Error embedding into ChromaDB: ${error.message}`);
}
@ -317,11 +336,12 @@ const Chroma = {
await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null };
} catch (e) {
console.error("addDocumentToNamespace", e.message);
this.logger("addDocumentToNamespace", e.message);
return { vectorized: false, error: e.message };
}
},
deleteDocumentFromNamespace: async function (namespace, docId) {
}
async deleteDocumentFromNamespace(namespace, docId) {
const { DocumentVectors } = require("../../../models/vectors");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) return;
@ -338,8 +358,9 @@ const Chroma = {
const indexes = knownDocuments.map((doc) => doc.id);
await DocumentVectors.deleteIds(indexes);
return true;
},
performSimilaritySearch: async function ({
}
async performSimilaritySearch({
namespace = null,
input = "",
LLMConnector = null,
@ -383,8 +404,9 @@ const Chroma = {
sources: this.curateSources(sources),
message: false,
};
},
"namespace-stats": async function (reqBody = {}) {
}
async "namespace-stats"(reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { client } = await this.connect();
@ -394,8 +416,9 @@ const Chroma = {
return stats
? stats
: { message: "No stats were able to be fetched from DB for namespace" };
},
"delete-namespace": async function (reqBody = {}) {
}
async "delete-namespace"(reqBody = {}) {
const { namespace = null } = reqBody;
const { client } = await this.connect();
if (!(await this.namespaceExists(client, this.normalize(namespace))))
@ -406,13 +429,15 @@ const Chroma = {
return {
message: `Namespace ${namespace} was deleted along with ${details?.vectorCount} vectors.`,
};
},
reset: async function () {
}
async reset() {
const { client } = await this.connect();
await client.reset();
return { reset: true };
},
curateSources: function (sources = []) {
}
curateSources(sources = []) {
const documents = [];
for (const source of sources) {
const { metadata = {} } = source;
@ -427,7 +452,8 @@ const Chroma = {
}
return documents;
},
}
/**
* This method is a wrapper around the ChromaCollection.add method.
* It will return true if the add was successful, false otherwise.
@ -436,10 +462,11 @@ const Chroma = {
* @param {{ids: string[], embeddings: number[], metadatas: Record<string, any>[], documents: string[]}[]} submissions
* @returns {Promise<boolean>} True if the add was successful, false otherwise.
*/
smartAdd: async function (collection, submissions) {
async smartAdd(collection, submissions) {
await collection.add(submissions);
return true;
},
}
/**
* This method is a wrapper around the ChromaCollection.delete method.
* It will return the result of the delete method directly.
@ -448,10 +475,10 @@ const Chroma = {
* @param {string[]} vectorIds
* @returns {Promise<boolean>} True if the delete was successful, false otherwise.
*/
smartDelete: async function (collection, vectorIds) {
async smartDelete(collection, vectorIds) {
await collection.delete({ ids: vectorIds });
return true;
},
};
}
}
module.exports.Chroma = Chroma;

View File

@ -6,20 +6,27 @@ const { toChunks } = require("../../helpers");
* ChromaCloud works nearly the same as Chroma so we can just extend the
* Chroma class and override the connect method to use the CloudClient for major differences in API functionality.
*/
const ChromaCloud = {
...Chroma,
name: "ChromaCloud",
class ChromaCloud extends Chroma {
constructor() {
super();
}
get name() {
return "ChromaCloud";
}
/**
* Basic quota/limitations for Chroma Cloud for accounts. Does not lookup client-specific limits.
* @see https://docs.trychroma.com/cloud/quotas-limits
*/
limits: {
limits = {
maxEmbeddingDim: 4_096,
maxDocumentBytes: 16_384,
maxMetadataBytes: 4_096,
maxRecordsPerWrite: 300,
},
connect: async function () {
};
async connect() {
if (process.env.VECTOR_DB !== "chromacloud")
throw new Error("ChromaCloud::Invalid ENV settings");
@ -35,7 +42,8 @@ const ChromaCloud = {
"ChromaCloud::Invalid Heartbeat received - is the instance online?"
);
return { client };
},
}
/**
* Chroma Cloud has some basic limitations on upserts to protect performance and latency.
* Local deployments do not have these limitations since they are self-hosted.
@ -47,7 +55,7 @@ const ChromaCloud = {
* @returns {Promise<boolean>} True if the upsert was successful, false otherwise.
* If the upsert was not successful, the error message will be returned.
*/
smartAdd: async function (collection, submission) {
async smartAdd(collection, submission) {
const testSubmission = {
id: submission.ids[0],
embedding: submission.embeddings[0],
@ -77,8 +85,8 @@ const ChromaCloud = {
return true;
}
console.log(
`ChromaCloud::Upsert Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
this.logger(
`Upsert Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
);
const chunks = [];
let chunkedSubmission = {
@ -93,7 +101,7 @@ const ChromaCloud = {
chunkedSubmission.metadatas.push(submission.metadatas[i]);
chunkedSubmission.documents.push(submission.documents[i]);
if (chunkedSubmission.ids.length === this.limits.maxRecordsPerWrite) {
console.log(
this.logger(
`ChromaCloud::Adding chunk payload ${chunks.length + 1} of ${Math.ceil(submission.ids.length / this.limits.maxRecordsPerWrite)}`
);
chunks.push(chunkedSubmission);
@ -114,7 +122,8 @@ const ChromaCloud = {
counter++;
}
return true;
},
}
/**
* This method is a wrapper around the ChromaCollection.delete method.
* It will return the result of the delete method directly.
@ -127,22 +136,22 @@ const ChromaCloud = {
* @param {string[]} vectorIds
* @returns {Promise<boolean>} True if the delete was successful, false otherwise.
*/
smartDelete: async function (collection, vectorIds) {
async smartDelete(collection, vectorIds) {
if (vectorIds.length <= this.limits.maxRecordsPerWrite)
return await collection.delete({ ids: vectorIds });
console.log(
`ChromaCloud::Delete Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
this.logger(
`Delete Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
);
const chunks = toChunks(vectorIds, this.limits.maxRecordsPerWrite);
let counter = 1;
for (const chunk of chunks) {
console.log(`ChromaCloud::Deleting chunk ${counter} of ${chunks.length}`);
this.logger(`Deleting chunk ${counter} of ${chunks.length}`);
await collection.delete({ ids: chunk });
counter++;
}
return true;
},
};
}
}
module.exports.ChromaCloud = ChromaCloud;

View File

@ -6,38 +6,54 @@ const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid");
const { sourceIdentifier } = require("../../chats");
const { NativeEmbeddingReranker } = require("../../EmbeddingRerankers/native");
const { VectorDatabase } = require("../base");
const path = require("path");
/**
* LancedDB Client connection object
* @typedef {import('@lancedb/lancedb').Connection} LanceClient
*/
const LanceDb = {
uri: `${
!!process.env.STORAGE_DIR ? `${process.env.STORAGE_DIR}/` : "./storage/"
}lancedb`,
name: "LanceDb",
class LanceDb extends VectorDatabase {
constructor() {
super();
}
get uri() {
const basePath = !!process.env.STORAGE_DIR
? process.env.STORAGE_DIR
: path.resolve(__dirname, "../../../storage");
return path.resolve(basePath, "lancedb");
}
get name() {
return "LanceDb";
}
/** @returns {Promise<{client: LanceClient}>} */
connect: async function () {
async connect() {
const client = await lancedb.connect(this.uri);
return { client };
},
distanceToSimilarity: function (distance = null) {
}
distanceToSimilarity(distance = null) {
if (distance === null || typeof distance !== "number") return 0.0;
if (distance >= 1.0) return 1;
if (distance < 0) return 1 - Math.abs(distance);
return 1 - distance;
},
heartbeat: async function () {
}
async heartbeat() {
await this.connect();
return { heartbeat: Number(new Date()) };
},
tables: async function () {
}
async tables() {
const { client } = await this.connect();
return await client.tableNames();
},
totalVectors: async function () {
}
async totalVectors() {
const { client } = await this.connect();
const tables = await client.tableNames();
let count = 0;
@ -46,15 +62,17 @@ const LanceDb = {
count += await table.countRows();
}
return count;
},
namespaceCount: async function (_namespace = null) {
}
async namespaceCount(_namespace = null) {
const { client } = await this.connect();
const exists = await this.namespaceExists(client, _namespace);
if (!exists) return 0;
const table = await client.openTable(_namespace);
return (await table.countRows()) || 0;
},
}
/**
* Performs a SimilaritySearch + Reranking on a namespace.
* @param {Object} params - The parameters for the rerankedSimilarityResponse.
@ -67,7 +85,7 @@ const LanceDb = {
* @param {string[]} params.filterIdentifiers - The identifiers of the documents to filter out.
* @returns
*/
rerankedSimilarityResponse: async function ({
async rerankedSimilarityResponse({
client,
namespace,
query,
@ -116,8 +134,8 @@ const LanceDb = {
return;
const { vector: _, ...rest } = item;
if (filterIdentifiers.includes(sourceIdentifier(rest))) {
console.log(
"LanceDB: A source was filtered from context as it's parent document is pinned."
this.logger(
"A source was filtered from context as it's parent document is pinned."
);
return;
}
@ -133,12 +151,12 @@ const LanceDb = {
});
})
.catch((e) => {
console.error(e);
console.error("LanceDB::rerankedSimilarityResponse", e.message);
this.logger(e);
this.logger("rerankedSimilarityResponse", e.message);
});
return result;
},
}
/**
* Performs a SimilaritySearch on a give LanceDB namespace.
@ -151,7 +169,7 @@ const LanceDb = {
* @param {string[]} params.filterIdentifiers
* @returns
*/
similarityResponse: async function ({
async similarityResponse({
client,
namespace,
queryVector,
@ -177,8 +195,8 @@ const LanceDb = {
return;
const { vector: _, ...rest } = item;
if (filterIdentifiers.includes(sourceIdentifier(rest))) {
console.log(
"LanceDB: A source was filtered from context as it's parent document is pinned."
this.logger(
"A source was filtered from context as it's parent document is pinned."
);
return;
}
@ -192,14 +210,15 @@ const LanceDb = {
});
return result;
},
}
/**
*
* @param {LanceClient} client
* @param {string} namespace
* @returns
*/
namespace: async function (client, namespace = null) {
async namespace(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client.openTable(namespace).catch(() => false);
if (!collection) return null;
@ -207,7 +226,8 @@ const LanceDb = {
return {
...collection,
};
},
}
/**
*
* @param {LanceClient} client
@ -215,7 +235,7 @@ const LanceDb = {
* @param {string} namespace
* @returns
*/
updateOrCreateCollection: async function (client, data = [], namespace) {
async updateOrCreateCollection(client, data = [], namespace) {
const hasNamespace = await this.hasNamespace(namespace);
if (hasNamespace) {
const collection = await client.openTable(namespace);
@ -225,40 +245,44 @@ const LanceDb = {
await client.createTable(namespace, data);
return true;
},
hasNamespace: async function (namespace = null) {
}
async hasNamespace(namespace = null) {
if (!namespace) return false;
const { client } = await this.connect();
const exists = await this.namespaceExists(client, namespace);
return exists;
},
}
/**
*
* @param {LanceClient} client
* @param {string} namespace
* @returns
*/
namespaceExists: async function (client, namespace = null) {
async namespaceExists(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collections = await client.tableNames();
return collections.includes(namespace);
},
}
/**
*
* @param {LanceClient} client
* @param {string} namespace
* @returns
*/
deleteVectorsInNamespace: async function (client, namespace = null) {
async deleteVectorsInNamespace(client, namespace = null) {
await client.dropTable(namespace);
return true;
},
deleteDocumentFromNamespace: async function (namespace, docId) {
}
async deleteDocumentFromNamespace(namespace, docId) {
const { client } = await this.connect();
const exists = await this.namespaceExists(client, namespace);
if (!exists) {
console.error(
`LanceDB:deleteDocumentFromNamespace - namespace ${namespace} does not exist.`
this.logger(
`deleteDocumentFromNamespace - namespace ${namespace} does not exist.`
);
return;
}
@ -272,8 +296,9 @@ const LanceDb = {
if (vectorIds.length === 0) return;
await table.delete(`id IN (${vectorIds.map((v) => `'${v}'`).join(",")})`);
return true;
},
addDocumentToNamespace: async function (
}
async addDocumentToNamespace(
namespace,
documentData = {},
fullFilePath = null,
@ -284,7 +309,7 @@ const LanceDb = {
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
this.logger("Adding new vectorized document into namespace", namespace);
if (!skipCache) {
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
@ -329,7 +354,7 @@ const LanceDb = {
});
const textChunks = await textSplitter.splitText(pageContent);
console.log("Snippets created from document:", textChunks.length);
this.logger("Snippets created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const submissions = [];
@ -364,7 +389,7 @@ const LanceDb = {
const chunks = [];
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
console.log("Inserting vectorized chunks into LanceDB collection.");
this.logger("Inserting vectorized chunks into LanceDB collection.");
const { client } = await this.connect();
await this.updateOrCreateCollection(client, submissions, namespace);
await storeVectorResult(chunks, fullFilePath);
@ -373,11 +398,12 @@ const LanceDb = {
await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null };
} catch (e) {
console.error("addDocumentToNamespace", e.message);
this.logger("addDocumentToNamespace", e.message);
return { vectorized: false, error: e.message };
}
},
performSimilaritySearch: async function ({
}
async performSimilaritySearch({
namespace = null,
input = "",
LLMConnector = null,
@ -427,8 +453,9 @@ const LanceDb = {
sources: this.curateSources(sources),
message: false,
};
},
"namespace-stats": async function (reqBody = {}) {
}
async "namespace-stats"(reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { client } = await this.connect();
@ -438,8 +465,9 @@ const LanceDb = {
return stats
? stats
: { message: "No stats were able to be fetched from DB for namespace" };
},
"delete-namespace": async function (reqBody = {}) {
}
async "delete-namespace"(reqBody = {}) {
const { namespace = null } = reqBody;
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace)))
@ -449,14 +477,16 @@ const LanceDb = {
return {
message: `Namespace ${namespace} was deleted.`,
};
},
reset: async function () {
}
async reset() {
const { client } = await this.connect();
const fs = require("fs");
fs.rm(`${client.uri}`, { recursive: true }, () => null);
return { reset: true };
},
curateSources: function (sources = []) {
}
curateSources(sources = []) {
const documents = [];
for (const source of sources) {
const { text, vector: _v, _distance: _d, ...rest } = source;
@ -470,7 +500,7 @@ const LanceDb = {
}
return documents;
},
};
}
}
module.exports.LanceDb = LanceDb;

View File

@ -10,22 +10,31 @@ const { v4: uuidv4 } = require("uuid");
const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const { VectorDatabase } = require("../base");
class Milvus extends VectorDatabase {
constructor() {
super();
}
get name() {
return "Milvus";
}
const Milvus = {
name: "Milvus",
// Milvus/Zilliz only allows letters, numbers, and underscores in collection names
// so we need to enforce that by re-normalizing the names when communicating with
// the DB.
// If the first char of the collection is not an underscore or letter the collection name will be invalid.
normalize: function (inputString) {
normalize(inputString) {
let normalized = inputString.replace(/[^a-zA-Z0-9_]/g, "_");
if (new RegExp(/^[a-zA-Z_]/).test(normalized.slice(0, 1)))
normalized = `anythingllm_${normalized}`;
return normalized;
},
connect: async function () {
}
async connect() {
if (process.env.VECTOR_DB !== "milvus")
throw new Error("Milvus::Invalid ENV settings");
throw new Error(`${this.name}::Invalid ENV settings`);
const client = new MilvusClient({
address: process.env.MILVUS_ADDRESS,
@ -36,16 +45,18 @@ const Milvus = {
const { isHealthy } = await client.checkHealth();
if (!isHealthy)
throw new Error(
"MilvusDB::Invalid Heartbeat received - is the instance online?"
`${this.name}::Invalid Heartbeat received - is the instance online?`
);
return { client };
},
heartbeat: async function () {
}
async heartbeat() {
await this.connect();
return { heartbeat: Number(new Date()) };
},
totalVectors: async function () {
}
async totalVectors() {
const { client } = await this.connect();
const { collection_names } = await client.listCollections();
const total = collection_names.reduce(async (acc, collection_name) => {
@ -55,49 +66,55 @@ const Milvus = {
return Number(acc) + Number(statistics?.data?.row_count ?? 0);
}, 0);
return total;
},
namespaceCount: async function (_namespace = null) {
}
async namespaceCount(_namespace = null) {
const { client } = await this.connect();
const statistics = await client.getCollectionStatistics({
collection_name: this.normalize(_namespace),
});
return Number(statistics?.data?.row_count ?? 0);
},
namespace: async function (client, namespace = null) {
}
async namespace(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client
.getCollectionStatistics({ collection_name: this.normalize(namespace) })
.catch(() => null);
return collection;
},
hasNamespace: async function (namespace = null) {
}
async hasNamespace(namespace = null) {
if (!namespace) return false;
const { client } = await this.connect();
return await this.namespaceExists(client, namespace);
},
namespaceExists: async function (client, namespace = null) {
}
async namespaceExists(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const { value } = await client
.hasCollection({ collection_name: this.normalize(namespace) })
.catch((e) => {
console.error("MilvusDB::namespaceExists", e.message);
console.error(`${this.name}::namespaceExists`, e.message);
return { value: false };
});
return value;
},
deleteVectorsInNamespace: async function (client, namespace = null) {
}
async deleteVectorsInNamespace(client, namespace = null) {
await client.dropCollection({ collection_name: this.normalize(namespace) });
return true;
},
}
// Milvus requires a dimension aspect for collection creation
// we pass this in from the first chunk to infer the dimensions like other
// providers do.
getOrCreateCollection: async function (client, namespace, dimensions = null) {
async getOrCreateCollection(client, namespace, dimensions = null) {
const isExists = await this.namespaceExists(client, namespace);
if (!isExists) {
if (!dimensions)
throw new Error(
`Milvus:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on GitHub for support.`
`${this.name}::getOrCreateCollection Unable to infer vector dimension from input. Open an issue on GitHub for support.`
);
await client.createCollection({
@ -133,8 +150,9 @@ const Milvus = {
collection_name: this.normalize(namespace),
});
}
},
addDocumentToNamespace: async function (
}
async addDocumentToNamespace(
namespace,
documentData = {},
fullFilePath = null,
@ -146,7 +164,7 @@ const Milvus = {
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
this.logger("Adding new vectorized document into namespace", namespace);
if (!skipCache) {
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
@ -172,7 +190,7 @@ const Milvus = {
if (insertResult?.status.error_code !== "Success") {
throw new Error(
`Error embedding into Milvus! Reason:${insertResult?.status.reason}`
`Error embedding into ${this.name}! Reason:${insertResult?.status.reason}`
);
}
}
@ -208,7 +226,7 @@ const Milvus = {
});
const textChunks = await textSplitter.splitText(pageContent);
console.log("Snippets created from document:", textChunks.length);
this.logger("Snippets created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
@ -238,7 +256,7 @@ const Milvus = {
const { client } = await this.connect();
await this.getOrCreateCollection(client, namespace, vectorDimension);
console.log("Inserting vectorized chunks into Milvus.");
this.logger(`Inserting vectorized chunks into ${this.name}.`);
for (const chunk of toChunks(vectors, 100)) {
chunks.push(chunk);
const insertResult = await client.insert({
@ -252,7 +270,7 @@ const Milvus = {
if (insertResult?.status.error_code !== "Success") {
throw new Error(
`Error embedding into Milvus! Reason:${insertResult?.status.reason}`
`Error embedding into ${this.name}! Reason:${insertResult?.status.reason}`
);
}
}
@ -265,11 +283,12 @@ const Milvus = {
await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null };
} catch (e) {
console.error("addDocumentToNamespace", e.message);
this.logger("addDocumentToNamespace", e.message);
return { vectorized: false, error: e.message };
}
},
deleteDocumentFromNamespace: async function (namespace, docId) {
}
async deleteDocumentFromNamespace(namespace, docId) {
const { DocumentVectors } = require("../../../models/vectors");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) return;
@ -291,8 +310,9 @@ const Milvus = {
// on a later call.
await client.flushSync({ collection_names: [this.normalize(namespace)] });
return true;
},
performSimilaritySearch: async function ({
}
async performSimilaritySearch({
namespace = null,
input = "",
LLMConnector = null,
@ -331,8 +351,9 @@ const Milvus = {
sources: this.curateSources(sources),
message: false,
};
},
similarityResponse: async function ({
}
async similarityResponse({
client,
namespace,
queryVector,
@ -353,8 +374,8 @@ const Milvus = {
response.results.forEach((match) => {
if (match.score < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
console.log(
"Milvus: A source was filtered from context as it's parent document is pinned."
this.logger(
`${this.name}: A source was filtered from context as its parent document is pinned.`
);
return;
}
@ -367,8 +388,9 @@ const Milvus = {
result.scores.push(match.score);
});
return result;
},
"namespace-stats": async function (reqBody = {}) {
}
async "namespace-stats"(reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { client } = await this.connect();
@ -378,8 +400,9 @@ const Milvus = {
return stats
? stats
: { message: "No stats were able to be fetched from DB for namespace" };
},
"delete-namespace": async function (reqBody = {}) {
}
async "delete-namespace"(reqBody = {}) {
const { namespace = null } = reqBody;
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace)))
@ -391,8 +414,9 @@ const Milvus = {
return {
message: `Namespace ${namespace} was deleted along with ${vectorCount} vectors.`,
};
},
curateSources: function (sources = []) {
}
curateSources(sources = []) {
const documents = [];
for (const source of sources) {
const { metadata = {} } = source;
@ -404,7 +428,7 @@ const Milvus = {
}
}
return documents;
},
};
}
}
module.exports.Milvus = Milvus;

View File

@ -3,6 +3,7 @@ const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { TextSplitter } = require("../../TextSplitter");
const { v4: uuidv4 } = require("uuid");
const { sourceIdentifier } = require("../../chats");
const { VectorDatabase } = require("../base");
/*
Embedding Table Schema (table name defined by user)
@ -13,44 +14,53 @@ const { sourceIdentifier } = require("../../chats");
- created_at: TIMESTAMP
*/
const PGVector = {
name: "PGVector",
connectionTimeout: 30_000,
/**
* Get the table name for the PGVector database.
* - Defaults to "anythingllm_vectors" if no table name is provided.
* @returns {string}
*/
tableName: () => process.env.PGVECTOR_TABLE_NAME || "anythingllm_vectors",
class PGVector extends VectorDatabase {
constructor() {
super();
}
/**
* Get the connection string for the PGVector database.
* - Requires a connection string to be present in the environment variables.
* @returns {string | null}
*/
connectionString: () => process.env.PGVECTOR_CONNECTION_STRING,
get name() {
return "PGVector";
}
connectionTimeout = 30_000;
// Possible for this to be a user-configurable option in the future.
// Will require a handler per operator to ensure scores are normalized.
operator: {
operator = {
l2: "<->",
innerProduct: "<#>",
cosine: "<=>",
l1: "<+>",
hamming: "<~>",
jaccard: "<%>",
},
getTablesSql:
"SELECT * FROM pg_catalog.pg_tables WHERE schemaname = 'public'",
getEmbeddingTableSchemaSql:
"SELECT column_name,data_type FROM information_schema.columns WHERE table_name = $1",
createExtensionSql: "CREATE EXTENSION IF NOT EXISTS vector;",
createTableSql: (dimensions) =>
`CREATE TABLE IF NOT EXISTS "${PGVector.tableName()}" (id UUID PRIMARY KEY, namespace TEXT, embedding vector(${Number(dimensions)}), metadata JSONB, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)`,
};
getTablesSql =
"SELECT * FROM pg_catalog.pg_tables WHERE schemaname = 'public'";
getEmbeddingTableSchemaSql =
"SELECT column_name,data_type FROM information_schema.columns WHERE table_name = $1";
createExtensionSql = "CREATE EXTENSION IF NOT EXISTS vector;";
log: function (message = null, ...args) {
console.log(`\x1b[35m[PGVectorDb]\x1b[0m ${message}`, ...args);
},
/**
* Get the table name for the PGVector database.
* - Defaults to "anythingllm_vectors" if no table name is provided.
* @returns {string}
*/
static tableName() {
return process.env.PGVECTOR_TABLE_NAME || "anythingllm_vectors";
}
/**
* Get the connection string for the PGVector database.
* - Requires a connection string to be present in the environment variables.
* @returns {string | null}
*/
static connectionString() {
return process.env.PGVECTOR_CONNECTION_STRING;
}
createTableSql(dimensions) {
return `CREATE TABLE IF NOT EXISTS "${PGVector.tableName()}" (id UUID PRIMARY KEY, namespace TEXT, embedding vector(${Number(dimensions)}), metadata JSONB, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)`;
}
/**
* Recursively sanitize values intended for JSONB to prevent Postgres errors
@ -60,7 +70,7 @@ const PGVector = {
* @param {any} value
* @returns {any}
*/
sanitizeForJsonb: function (value) {
sanitizeForJsonb(value) {
// Fast path for null/undefined and primitives that do not need changes
if (value === null || value === undefined) return value;
@ -99,13 +109,13 @@ const PGVector = {
// Numbers, booleans, etc.
return value;
},
}
client: function (connectionString = null) {
client(connectionString = null) {
return new pgsql.Client({
connectionString: connectionString || PGVector.connectionString(),
});
},
}
/**
* Validate the existing embedding table schema.
@ -113,7 +123,7 @@ const PGVector = {
* @param {string} tableName
* @returns {Promise<boolean>}
*/
validateExistingEmbeddingTableSchema: async function (pgClient, tableName) {
async validateExistingEmbeddingTableSchema(pgClient, tableName) {
const result = await pgClient.query(this.getEmbeddingTableSchemaSql, [
tableName,
]);
@ -178,11 +188,11 @@ const PGVector = {
);
}
this.log(
this.logger(
`✅ The pgvector table '${tableName}' was found and meets the minimum expected schema for an embedding table.`
);
return true;
},
}
/**
* Validate the connection to the database and verify that the table does not already exist.
@ -191,35 +201,36 @@ const PGVector = {
* @param {{connectionString: string | null, tableName: string | null}} params
* @returns {Promise<{error: string | null, success: boolean}>}
*/
validateConnection: async function ({
static async validateConnection({
connectionString = null,
tableName = null,
}) {
if (!connectionString) throw new Error("No connection string provided");
const instance = new PGVector();
try {
const timeoutPromise = new Promise((resolve) => {
setTimeout(() => {
resolve({
error: `Connection timeout (${(PGVector.connectionTimeout / 1000).toFixed(0)}s). Please check your connection string and try again.`,
error: `Connection timeout (${(instance.connectionTimeout / 1000).toFixed(0)}s). Please check your connection string and try again.`,
success: false,
});
}, PGVector.connectionTimeout);
}, instance.connectionTimeout);
});
const connectionPromise = new Promise(async (resolve) => {
let pgClient = null;
try {
pgClient = this.client(connectionString);
pgClient = instance.client(connectionString);
await pgClient.connect();
const result = await pgClient.query(this.getTablesSql);
const result = await pgClient.query(instance.getTablesSql);
if (result.rows.length !== 0 && !!tableName) {
const tableExists = result.rows.some(
(row) => row.tablename === tableName
);
if (tableExists)
await this.validateExistingEmbeddingTableSchema(
await instance.validateExistingEmbeddingTableSchema(
pgClient,
tableName
);
@ -236,7 +247,7 @@ const PGVector = {
const result = await Promise.race([connectionPromise, timeoutPromise]);
return result;
} catch (err) {
this.log("Validation Error:", err.message);
instance.logger("Validation Error:", err.message);
let readableError = err.message;
switch (true) {
case err.message.includes("ECONNREFUSED"):
@ -248,13 +259,13 @@ const PGVector = {
}
return { error: readableError, success: false };
}
},
}
/**
* Test the connection to the database directly.
* @returns {{error: string | null, success: boolean}}
*/
testConnectionToDB: async function () {
async testConnectionToDB() {
try {
const pgClient = await this.connect();
await pgClient.query(this.getTablesSql);
@ -263,14 +274,14 @@ const PGVector = {
} catch (err) {
return { error: err.message, success: false };
}
},
}
/**
* Connect to the database.
* - Throws an error if the connection string or table name is not provided.
* @returns {Promise<pgsql.Client>}
*/
connect: async function () {
async connect() {
if (!PGVector.connectionString())
throw new Error("No connection string provided");
if (!PGVector.tableName()) throw new Error("No table name provided");
@ -278,21 +289,21 @@ const PGVector = {
const client = this.client();
await client.connect();
return client;
},
}
/**
* Test the connection to the database with already set credentials via ENV
* @returns {{error: string | null, success: boolean}}
*/
heartbeat: async function () {
async heartbeat() {
return this.testConnectionToDB();
},
}
/**
* Check if the anythingllm embedding table exists in the database
* @returns {Promise<boolean>}
*/
dbTableExists: async function () {
async dbTableExists() {
let connection = null;
try {
connection = await this.connect();
@ -307,9 +318,9 @@ const PGVector = {
} finally {
if (connection) await connection.end();
}
},
}
totalVectors: async function () {
async totalVectors() {
if (!(await this.dbTableExists())) return 0;
let connection = null;
try {
@ -323,17 +334,17 @@ const PGVector = {
} finally {
if (connection) await connection.end();
}
},
}
// Distance for cosine is just the distance for pgvector.
distanceToSimilarity: function (distance = null) {
distanceToSimilarity(distance = null) {
if (distance === null || typeof distance !== "number") return 0.0;
if (distance >= 1.0) return 1;
if (distance < 0) return 1 - Math.abs(distance);
return 1 - distance;
},
}
namespaceCount: async function (namespace = null) {
async namespaceCount(namespace = null) {
if (!(await this.dbTableExists())) return 0;
let connection = null;
try {
@ -348,7 +359,7 @@ const PGVector = {
} finally {
if (connection) await connection.end();
}
},
}
/**
* Performs a SimilaritySearch on a given PGVector namespace.
@ -361,7 +372,7 @@ const PGVector = {
* @param {string[]} params.filterIdentifiers
* @returns
*/
similarityResponse: async function ({
async similarityResponse({
client,
namespace,
queryVector,
@ -384,7 +395,7 @@ const PGVector = {
if (this.distanceToSimilarity(item._distance) < similarityThreshold)
return;
if (filterIdentifiers.includes(sourceIdentifier(item.metadata))) {
this.log(
this.logger(
"A source was filtered from context as it's parent document is pinned."
);
return;
@ -399,15 +410,15 @@ const PGVector = {
});
return result;
},
}
normalizeVector: function (vector) {
normalizeVector(vector) {
const magnitude = Math.sqrt(
vector.reduce((sum, val) => sum + val * val, 0)
);
if (magnitude === 0) return vector; // Avoid division by zero
return vector.map((val) => val / magnitude);
},
}
/**
* Update or create a collection in the database
@ -418,14 +429,14 @@ const PGVector = {
* @param {number} params.dimensions
* @returns {Promise<boolean>}
*/
updateOrCreateCollection: async function ({
async updateOrCreateCollection({
connection,
submissions,
namespace,
dimensions = 384,
}) {
await this.createTableIfNotExists(connection, dimensions);
this.log(`Updating or creating collection ${namespace}`);
this.logger(`Updating or creating collection ${namespace}`);
try {
// Create a transaction of all inserts
@ -438,17 +449,17 @@ const PGVector = {
[submission.id, namespace, embedding, sanitizedMetadata]
);
}
this.log(`Committing ${submissions.length} vectors to ${namespace}`);
this.logger(`Committing ${submissions.length} vectors to ${namespace}`);
await connection.query(`COMMIT`);
} catch (err) {
this.log(
this.logger(
`Rolling back ${submissions.length} vectors to ${namespace}`,
err
);
await connection.query(`ROLLBACK`);
}
return true;
},
}
/**
* create a table if it doesn't exist
@ -456,12 +467,12 @@ const PGVector = {
* @param {number} dimensions
* @returns
*/
createTableIfNotExists: async function (connection, dimensions = 384) {
this.log(`Creating embedding table with ${dimensions} dimensions`);
async createTableIfNotExists(connection, dimensions = 384) {
this.logger(`Creating embedding table with ${dimensions} dimensions`);
await connection.query(this.createExtensionSql);
await connection.query(this.createTableSql(dimensions));
return true;
},
}
/**
* Get the namespace from the database
@ -469,21 +480,21 @@ const PGVector = {
* @param {string} namespace
* @returns {Promise<{name: string, vectorCount: number}>}
*/
namespace: async function (connection, namespace = null) {
async namespace(connection, namespace = null) {
if (!namespace) throw new Error("No namespace provided");
const result = await connection.query(
`SELECT COUNT(id) FROM "${PGVector.tableName()}" WHERE namespace = $1`,
[namespace]
);
return { name: namespace, vectorCount: result.rows[0].count };
},
}
/**
* Check if the namespace exists in the database
* @param {string} namespace
* @returns {Promise<boolean>}
*/
hasNamespace: async function (namespace = null) {
async hasNamespace(namespace = null) {
if (!namespace) throw new Error("No namespace provided");
let connection = null;
try {
@ -494,7 +505,7 @@ const PGVector = {
} finally {
if (connection) await connection.end();
}
},
}
/**
* Check if the namespace exists in the database
@ -502,14 +513,14 @@ const PGVector = {
* @param {string} namespace
* @returns {Promise<boolean>}
*/
namespaceExists: async function (connection, namespace = null) {
async namespaceExists(connection, namespace = null) {
if (!namespace) throw new Error("No namespace provided");
const result = await connection.query(
`SELECT COUNT(id) FROM "${PGVector.tableName()}" WHERE namespace = $1 LIMIT 1`,
[namespace]
);
return result.rows[0].count > 0;
},
}
/**
* Delete all vectors in the namespace
@ -517,16 +528,16 @@ const PGVector = {
* @param {string} namespace
* @returns {Promise<boolean>}
*/
deleteVectorsInNamespace: async function (connection, namespace = null) {
async deleteVectorsInNamespace(connection, namespace = null) {
if (!namespace) throw new Error("No namespace provided");
await connection.query(
`DELETE FROM "${PGVector.tableName()}" WHERE namespace = $1`,
[namespace]
);
return true;
},
}
addDocumentToNamespace: async function (
async addDocumentToNamespace(
namespace,
documentData = {},
fullFilePath = null,
@ -544,7 +555,7 @@ const PGVector = {
if (!pageContent || pageContent.length == 0) return false;
connection = await this.connect();
this.log("Adding new vectorized document into namespace", namespace);
this.logger("Adding new vectorized document into namespace", namespace);
if (!skipCache) {
const cacheResult = await cachedVectorInformation(fullFilePath);
let vectorDimensions;
@ -594,7 +605,7 @@ const PGVector = {
});
const textChunks = await textSplitter.splitText(pageContent);
this.log("Snippets created from document:", textChunks.length);
this.logger("Snippets created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const submissions = [];
@ -628,7 +639,7 @@ const PGVector = {
const chunks = [];
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
this.log("Inserting vectorized chunks into PGVector collection.");
this.logger("Inserting vectorized chunks into PGVector collection.");
await this.updateOrCreateCollection({
connection,
submissions,
@ -641,12 +652,12 @@ const PGVector = {
await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null };
} catch (err) {
this.log("addDocumentToNamespace", err.message);
this.logger("addDocumentToNamespace", err.message);
return { vectorized: false, error: err.message };
} finally {
if (connection) await connection.end();
}
},
}
/**
* Delete a document from the namespace
@ -654,7 +665,7 @@ const PGVector = {
* @param {string} docId
* @returns {Promise<boolean>}
*/
deleteDocumentFromNamespace: async function (namespace, docId) {
async deleteDocumentFromNamespace(namespace, docId) {
if (!namespace) throw new Error("No namespace provided");
if (!docId) throw new Error("No docId provided");
@ -686,21 +697,21 @@ const PGVector = {
throw err;
}
this.log(
this.logger(
`Deleted ${vectorIds.length} vectors from namespace ${namespace}`
);
return true;
} catch (err) {
this.log(
this.logger(
`Error deleting document from namespace ${namespace}: ${err.message}`
);
return false;
} finally {
if (connection) await connection.end();
}
},
}
performSimilaritySearch: async function ({
async performSimilaritySearch({
namespace = null,
input = "",
LLMConnector = null,
@ -716,7 +727,7 @@ const PGVector = {
connection = await this.connect();
const exists = await this.namespaceExists(connection, namespace);
if (!exists) {
this.log(
this.logger(
`The namespace ${namespace} does not exist or has no vectors. Returning empty results.`
);
return {
@ -750,9 +761,9 @@ const PGVector = {
} finally {
if (connection) await connection.end();
}
},
}
"namespace-stats": async function (reqBody = {}) {
async "namespace-stats"(reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
if (!(await this.dbTableExists()))
@ -774,9 +785,9 @@ const PGVector = {
} finally {
if (connection) await connection.end();
}
},
}
"delete-namespace": async function (reqBody = {}) {
async "delete-namespace"(reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("No namespace provided");
@ -800,13 +811,13 @@ const PGVector = {
} finally {
if (connection) await connection.end();
}
},
}
/**
* Reset the entire vector database table associated with anythingllm
* @returns {Promise<{reset: boolean}>}
*/
reset: async function () {
async reset() {
let connection = null;
try {
connection = await this.connect();
@ -817,9 +828,9 @@ const PGVector = {
} finally {
if (connection) await connection.end();
}
},
}
curateSources: function (sources = []) {
curateSources(sources = []) {
const documents = [];
for (const source of sources) {
const { text, vector: _v, _distance: _d, ...rest } = source;
@ -833,7 +844,7 @@ const PGVector = {
}
return documents;
},
};
}
}
module.exports.PGVector = PGVector;

View File

@ -5,10 +5,18 @@ const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid");
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const { VectorDatabase } = require("../base");
const PineconeDB = {
name: "Pinecone",
connect: async function () {
class PineconeDB extends VectorDatabase {
constructor() {
super();
}
get name() {
return "Pinecone";
}
async connect() {
if (process.env.VECTOR_DB !== "pinecone")
throw new Error("Pinecone::Invalid ENV settings");
@ -21,8 +29,9 @@ const PineconeDB = {
if (!status.ready) throw new Error("Pinecone::Index not ready.");
return { client, pineconeIndex, indexName: process.env.PINECONE_INDEX };
},
totalVectors: async function () {
}
async totalVectors() {
const { pineconeIndex } = await this.connect();
const { namespaces } = await pineconeIndex.describeIndexStats();
@ -30,13 +39,15 @@ const PineconeDB = {
(a, b) => a + (b?.recordCount || 0),
0
);
},
namespaceCount: async function (_namespace = null) {
}
async namespaceCount(_namespace = null) {
const { pineconeIndex } = await this.connect();
const namespace = await this.namespace(pineconeIndex, _namespace);
return namespace?.recordCount || 0;
},
similarityResponse: async function ({
}
async similarityResponse({
client,
namespace,
queryVector,
@ -60,7 +71,7 @@ const PineconeDB = {
response.matches.forEach((match) => {
if (match.score < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
console.log(
this.logger(
"Pinecone: A source was filtered from context as it's parent document is pinned."
);
return;
@ -75,28 +86,33 @@ const PineconeDB = {
});
return result;
},
namespace: async function (index, namespace = null) {
}
async namespace(index, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const { namespaces } = await index.describeIndexStats();
return namespaces.hasOwnProperty(namespace) ? namespaces[namespace] : null;
},
hasNamespace: async function (namespace = null) {
}
async hasNamespace(namespace = null) {
if (!namespace) return false;
const { pineconeIndex } = await this.connect();
return await this.namespaceExists(pineconeIndex, namespace);
},
namespaceExists: async function (index, namespace = null) {
}
async namespaceExists(index, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const { namespaces } = await index.describeIndexStats();
return namespaces.hasOwnProperty(namespace);
},
deleteVectorsInNamespace: async function (index, namespace = null) {
}
async deleteVectorsInNamespace(index, namespace = null) {
const pineconeNamespace = index.namespace(namespace);
await pineconeNamespace.deleteAll();
return true;
},
addDocumentToNamespace: async function (
}
async addDocumentToNamespace(
namespace,
documentData = {},
fullFilePath = null,
@ -107,7 +123,7 @@ const PineconeDB = {
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
this.logger("Adding new vectorized document into namespace", namespace);
if (!skipCache) {
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
@ -154,7 +170,7 @@ const PineconeDB = {
});
const textChunks = await textSplitter.splitText(pageContent);
console.log("Snippets created from document:", textChunks.length);
this.logger("Snippets created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
@ -183,7 +199,7 @@ const PineconeDB = {
const chunks = [];
const { pineconeIndex } = await this.connect();
const pineconeNamespace = pineconeIndex.namespace(namespace);
console.log("Inserting vectorized chunks into Pinecone.");
this.logger("Inserting vectorized chunks into Pinecone.");
for (const chunk of toChunks(vectors, 100)) {
chunks.push(chunk);
await pineconeNamespace.upsert([...chunk]);
@ -194,11 +210,12 @@ const PineconeDB = {
await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null };
} catch (e) {
console.error("addDocumentToNamespace", e.message);
this.logger("addDocumentToNamespace", e.message);
return { vectorized: false, error: e.message };
}
},
deleteDocumentFromNamespace: async function (namespace, docId) {
}
async deleteDocumentFromNamespace(namespace, docId) {
const { DocumentVectors } = require("../../../models/vectors");
const { pineconeIndex } = await this.connect();
if (!(await this.namespaceExists(pineconeIndex, namespace))) return;
@ -216,8 +233,9 @@ const PineconeDB = {
const indexes = knownDocuments.map((doc) => doc.id);
await DocumentVectors.deleteIds(indexes);
return true;
},
"namespace-stats": async function (reqBody = {}) {
}
async "namespace-stats"(reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { pineconeIndex } = await this.connect();
@ -227,8 +245,9 @@ const PineconeDB = {
return stats
? stats
: { message: "No stats were able to be fetched from DB" };
},
"delete-namespace": async function (reqBody = {}) {
}
async "delete-namespace"(reqBody = {}) {
const { namespace = null } = reqBody;
const { pineconeIndex } = await this.connect();
if (!(await this.namespaceExists(pineconeIndex, namespace)))
@ -239,8 +258,9 @@ const PineconeDB = {
return {
message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`,
};
},
performSimilaritySearch: async function ({
}
async performSimilaritySearch({
namespace = null,
input = "",
LLMConnector = null,
@ -275,8 +295,9 @@ const PineconeDB = {
sources: this.curateSources(sources),
message: false,
};
},
curateSources: function (sources = []) {
}
curateSources(sources = []) {
const documents = [];
for (const source of sources) {
const { metadata = {} } = source;
@ -290,7 +311,7 @@ const PineconeDB = {
}
}
return documents;
},
};
}
}
module.exports.Pinecone = PineconeDB;

View File

@ -5,10 +5,18 @@ const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { v4: uuidv4 } = require("uuid");
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const { VectorDatabase } = require("../base");
const QDrant = {
name: "QDrant",
connect: async function () {
class QDrant extends VectorDatabase {
constructor() {
super();
}
get name() {
return "QDrant";
}
async connect() {
if (process.env.VECTOR_DB !== "qdrant")
throw new Error("QDrant::Invalid ENV settings");
@ -26,12 +34,14 @@ const QDrant = {
);
return { client };
},
heartbeat: async function () {
}
async heartbeat() {
await this.connect();
return { heartbeat: Number(new Date()) };
},
totalVectors: async function () {
}
async totalVectors() {
const { client } = await this.connect();
const { collections } = await client.getCollections();
var totalVectors = 0;
@ -41,13 +51,15 @@ const QDrant = {
(await this.namespace(client, collection.name))?.vectorCount || 0;
}
return totalVectors;
},
namespaceCount: async function (_namespace = null) {
}
async namespaceCount(_namespace = null) {
const { client } = await this.connect();
const namespace = await this.namespace(client, _namespace);
return namespace?.vectorCount || 0;
},
similarityResponse: async function ({
}
async similarityResponse({
client,
namespace,
queryVector,
@ -70,7 +82,7 @@ const QDrant = {
responses.forEach((response) => {
if (response.score < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(response?.payload))) {
console.log(
this.logger(
"QDrant: A source was filtered from context as it's parent document is pinned."
);
return;
@ -86,8 +98,9 @@ const QDrant = {
});
return result;
},
namespace: async function (client, namespace = null) {
}
async namespace(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client.getCollection(namespace).catch(() => null);
if (!collection) return null;
@ -97,28 +110,32 @@ const QDrant = {
...collection,
vectorCount: (await client.count(namespace, { exact: true })).count,
};
},
hasNamespace: async function (namespace = null) {
}
async hasNamespace(namespace = null) {
if (!namespace) return false;
const { client } = await this.connect();
return await this.namespaceExists(client, namespace);
},
namespaceExists: async function (client, namespace = null) {
}
async namespaceExists(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client.getCollection(namespace).catch((e) => {
console.error("QDrant::namespaceExists", e.message);
this.logger("namespaceExists", e.message);
return null;
});
return !!collection;
},
deleteVectorsInNamespace: async function (client, namespace = null) {
}
async deleteVectorsInNamespace(client, namespace = null) {
await client.deleteCollection(namespace);
return true;
},
}
// QDrant requires a dimension aspect for collection creation
// we pass this in from the first chunk to infer the dimensions like other
// providers do.
getOrCreateCollection: async function (client, namespace, dimensions = null) {
async getOrCreateCollection(client, namespace, dimensions = null) {
if (await this.namespaceExists(client, namespace)) {
return await client.getCollection(namespace);
}
@ -133,8 +150,9 @@ const QDrant = {
},
});
return await client.getCollection(namespace);
},
addDocumentToNamespace: async function (
}
async addDocumentToNamespace(
namespace,
documentData = {},
fullFilePath = null,
@ -146,7 +164,7 @@ const QDrant = {
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
this.logger("Adding new vectorized document into namespace", namespace);
if (!skipCache) {
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
@ -227,7 +245,7 @@ const QDrant = {
});
const textChunks = await textSplitter.splitText(pageContent);
console.log("Snippets created from document:", textChunks.length);
this.logger("Snippets created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
@ -276,7 +294,7 @@ const QDrant = {
if (vectors.length > 0) {
const chunks = [];
console.log("Inserting vectorized chunks into QDrant collection.");
this.logger("Inserting vectorized chunks into QDrant collection.");
for (const chunk of toChunks(vectors, 500)) {
const batchIds = [],
batchVectors = [],
@ -306,11 +324,12 @@ const QDrant = {
await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null };
} catch (e) {
console.error("addDocumentToNamespace", e.message);
this.logger("addDocumentToNamespace", e.message);
return { vectorized: false, error: e.message };
}
},
deleteDocumentFromNamespace: async function (namespace, docId) {
}
async deleteDocumentFromNamespace(namespace, docId) {
const { DocumentVectors } = require("../../../models/vectors");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) return;
@ -327,8 +346,9 @@ const QDrant = {
const indexes = knownDocuments.map((doc) => doc.id);
await DocumentVectors.deleteIds(indexes);
return true;
},
performSimilaritySearch: async function ({
}
async performSimilaritySearch({
namespace = null,
input = "",
LLMConnector = null,
@ -366,8 +386,9 @@ const QDrant = {
sources: this.curateSources(sources),
message: false,
};
},
"namespace-stats": async function (reqBody = {}) {
}
async "namespace-stats"(reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { client } = await this.connect();
@ -377,8 +398,9 @@ const QDrant = {
return stats
? stats
: { message: "No stats were able to be fetched from DB for namespace" };
},
"delete-namespace": async function (reqBody = {}) {
}
async "delete-namespace"(reqBody = {}) {
const { namespace = null } = reqBody;
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace)))
@ -389,16 +411,18 @@ const QDrant = {
return {
message: `Namespace ${namespace} was deleted along with ${details?.vectorCount} vectors.`,
};
},
reset: async function () {
}
async reset() {
const { client } = await this.connect();
const response = await client.getCollections();
for (const collection of response.collections) {
await client.deleteCollection(collection.name);
}
return { reset: true };
},
curateSources: function (sources = []) {
}
curateSources(sources = []) {
const documents = [];
for (const source of sources) {
if (Object.keys(source).length > 0) {
@ -412,7 +436,7 @@ const QDrant = {
}
return documents;
},
};
}
}
module.exports.QDrant = QDrant;

View File

@ -6,10 +6,18 @@ const { v4: uuidv4 } = require("uuid");
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { camelCase } = require("../../helpers/camelcase");
const { sourceIdentifier } = require("../../chats");
const { VectorDatabase } = require("../base");
const Weaviate = {
name: "Weaviate",
connect: async function () {
class Weaviate extends VectorDatabase {
constructor() {
super();
}
get name() {
return "Weaviate";
}
async connect() {
if (process.env.VECTOR_DB !== "weaviate")
throw new Error("Weaviate::Invalid ENV settings");
@ -28,12 +36,14 @@ const Weaviate = {
"Weaviate::Invalid Alive signal received - is the service online?"
);
return { client };
},
heartbeat: async function () {
}
async heartbeat() {
await this.connect();
return { heartbeat: Number(new Date()) };
},
totalVectors: async function () {
}
async totalVectors() {
const { client } = await this.connect();
const collectionNames = await this.allNamespaces(client);
var totalVectors = 0;
@ -41,8 +51,9 @@ const Weaviate = {
totalVectors += await this.namespaceCountWithClient(client, name);
}
return totalVectors;
},
namespaceCountWithClient: async function (client, namespace) {
}
async namespaceCountWithClient(client, namespace) {
try {
const response = await client.graphql
.aggregate()
@ -53,11 +64,12 @@ const Weaviate = {
response?.data?.Aggregate?.[camelCase(namespace)]?.[0]?.meta?.count || 0
);
} catch (e) {
console.error(`Weaviate:namespaceCountWithClient`, e.message);
this.logger(`namespaceCountWithClient`, e.message);
return 0;
}
},
namespaceCount: async function (namespace = null) {
}
async namespaceCount(namespace = null) {
try {
const { client } = await this.connect();
const response = await client.graphql
@ -70,11 +82,12 @@ const Weaviate = {
response?.data?.Aggregate?.[camelCase(namespace)]?.[0]?.meta?.count || 0
);
} catch (e) {
console.error(`Weaviate:namespaceCountWithClient`, e.message);
this.logger(`namespaceCountWithClient`, e.message);
return 0;
}
},
similarityResponse: async function ({
}
async similarityResponse({
client,
namespace,
queryVector,
@ -109,8 +122,8 @@ const Weaviate = {
} = response;
if (certainty < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(rest))) {
console.log(
"Weaviate: A source was filtered from context as it's parent document is pinned."
this.logger(
"A source was filtered from context as it's parent document is pinned."
);
return;
}
@ -120,17 +133,19 @@ const Weaviate = {
});
return result;
},
allNamespaces: async function (client) {
}
async allNamespaces(client) {
try {
const { classes = [] } = await client.schema.getter().do();
return classes.map((classObj) => classObj.class);
} catch (e) {
console.error("Weaviate::AllNamespace", e);
this.logger("AllNamespace", e);
return [];
}
},
namespace: async function (client, namespace = null) {
}
async namespace(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
if (!(await this.namespaceExists(client, namespace))) return null;
@ -143,8 +158,9 @@ const Weaviate = {
...weaviateClass,
vectorCount: await this.namespaceCount(namespace),
};
},
addVectors: async function (client, vectors = []) {
}
async addVectors(client, vectors = []) {
const response = { success: true, errors: new Set([]) };
const results = await client.batch
.objectsBatcher()
@ -160,23 +176,27 @@ const Weaviate = {
response.errors = [...response.errors];
return response;
},
hasNamespace: async function (namespace = null) {
}
async hasNamespace(namespace = null) {
if (!namespace) return false;
const { client } = await this.connect();
const weaviateClasses = await this.allNamespaces(client);
return weaviateClasses.includes(camelCase(namespace));
},
namespaceExists: async function (client, namespace = null) {
}
async namespaceExists(client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const weaviateClasses = await this.allNamespaces(client);
return weaviateClasses.includes(camelCase(namespace));
},
deleteVectorsInNamespace: async function (client, namespace = null) {
}
async deleteVectorsInNamespace(client, namespace = null) {
await client.schema.classDeleter().withClassName(camelCase(namespace)).do();
return true;
},
addDocumentToNamespace: async function (
}
async addDocumentToNamespace(
namespace,
documentData = {},
fullFilePath = null,
@ -192,7 +212,7 @@ const Weaviate = {
} = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
this.logger("Adding new vectorized document into namespace", namespace);
if (!skipCache) {
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
@ -236,7 +256,7 @@ const Weaviate = {
const { success: additionResult, errors = [] } =
await this.addVectors(client, vectors);
if (!additionResult) {
console.error("Weaviate::addVectors failed to insert", errors);
this.logger("addVectors failed to insert", errors);
throw new Error("Error embedding into Weaviate");
}
}
@ -267,7 +287,7 @@ const Weaviate = {
});
const textChunks = await textSplitter.splitText(pageContent);
console.log("Snippets created from document:", textChunks.length);
this.logger("Snippets created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
@ -322,13 +342,13 @@ const Weaviate = {
const chunks = [];
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
console.log("Inserting vectorized chunks into Weaviate collection.");
this.logger("Inserting vectorized chunks into Weaviate collection.");
const { success: additionResult, errors = [] } = await this.addVectors(
client,
vectors
);
if (!additionResult) {
console.error("Weaviate::addVectors failed to insert", errors);
this.logger("addVectors failed to insert", errors);
throw new Error("Error embedding into Weaviate");
}
await storeVectorResult(chunks, fullFilePath);
@ -337,11 +357,12 @@ const Weaviate = {
await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null };
} catch (e) {
console.error("addDocumentToNamespace", e.message);
this.logger("addDocumentToNamespace", e.message);
return { vectorized: false, error: e.message };
}
},
deleteDocumentFromNamespace: async function (namespace, docId) {
}
async deleteDocumentFromNamespace(namespace, docId) {
const { DocumentVectors } = require("../../../models/vectors");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) return;
@ -360,8 +381,9 @@ const Weaviate = {
const indexes = knownDocuments.map((doc) => doc.id);
await DocumentVectors.deleteIds(indexes);
return true;
},
performSimilaritySearch: async function ({
}
async performSimilaritySearch({
namespace = null,
input = "",
LLMConnector = null,
@ -399,8 +421,9 @@ const Weaviate = {
sources: this.curateSources(sources),
message: false,
};
},
"namespace-stats": async function (reqBody = {}) {
}
async "namespace-stats"(reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { client } = await this.connect();
@ -408,8 +431,9 @@ const Weaviate = {
return stats
? stats
: { message: "No stats were able to be fetched from DB for namespace" };
},
"delete-namespace": async function (reqBody = {}) {
}
async "delete-namespace"(reqBody = {}) {
const { namespace = null } = reqBody;
const { client } = await this.connect();
const details = await this.namespace(client, namespace);
@ -419,16 +443,18 @@ const Weaviate = {
details?.vectorCount
} vectors.`,
};
},
reset: async function () {
}
async reset() {
const { client } = await this.connect();
const weaviateClasses = await this.allNamespaces(client);
for (const weaviateClass of weaviateClasses) {
await client.schema.classDeleter().withClassName(weaviateClass).do();
}
return { reset: true };
},
curateSources: function (sources = []) {
}
curateSources(sources = []) {
const documents = [];
for (const source of sources) {
if (Object.keys(source).length > 0) {
@ -440,8 +466,9 @@ const Weaviate = {
}
return documents;
},
flattenObjectForWeaviate: function (obj = {}) {
}
flattenObjectForWeaviate(obj = {}) {
// Note this function is not generic, it is designed specifically for Weaviate
// https://weaviate.io/developers/weaviate/config-refs/datatypes#introduction
// Credit to LangchainJS
@ -478,7 +505,7 @@ const Weaviate = {
}
return flattenedObject;
},
};
}
}
module.exports.Weaviate = Weaviate;

View File

@ -1,33 +1,22 @@
const {
DataType,
MetricType,
IndexType,
MilvusClient,
} = require("@zilliz/milvus2-sdk-node");
const { TextSplitter } = require("../../TextSplitter");
const { SystemSettings } = require("../../../models/systemSettings");
const { v4: uuidv4 } = require("uuid");
const { storeVectorResult, cachedVectorInformation } = require("../../files");
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
const { sourceIdentifier } = require("../../chats");
const { MilvusClient } = require("@zilliz/milvus2-sdk-node");
const { Milvus } = require("../milvus");
// Zilliz is basically a copy of Milvus DB class with a different constructor
// to connect to the cloud
const Zilliz = {
name: "Zilliz",
// Milvus/Zilliz only allows letters, numbers, and underscores in collection names
// so we need to enforce that by re-normalizing the names when communicating with
// the DB.
// If the first char of the collection is not an underscore or letter the collection name will be invalid.
normalize: function (inputString) {
let normalized = inputString.replace(/[^a-zA-Z0-9_]/g, "_");
if (new RegExp(/^[a-zA-Z_]/).test(normalized.slice(0, 1)))
normalized = `anythingllm_${normalized}`;
return normalized;
},
connect: async function () {
/**
* Zilliz is the cloud version of Milvus so we can just extend the
* Milvus class and override the connect method
*/
class Zilliz extends Milvus {
constructor() {
super();
}
get name() {
return "Zilliz";
}
async connect() {
if (process.env.VECTOR_DB !== "zilliz")
throw new Error("Zilliz::Invalid ENV settings");
throw new Error(`${this.name}::Invalid ENV settings`);
const client = new MilvusClient({
address: process.env.ZILLIZ_ENDPOINT,
@ -37,366 +26,11 @@ const Zilliz = {
const { isHealthy } = await client.checkHealth();
if (!isHealthy)
throw new Error(
"Zilliz::Invalid Heartbeat received - is the instance online?"
`${this.name}::Invalid Heartbeat received - is the instance online?`
);
return { client };
},
heartbeat: async function () {
await this.connect();
return { heartbeat: Number(new Date()) };
},
totalVectors: async function () {
const { client } = await this.connect();
const { collection_names } = await client.listCollections();
const total = collection_names.reduce(async (acc, collection_name) => {
const statistics = await client.getCollectionStatistics({
collection_name: this.normalize(collection_name),
});
return Number(acc) + Number(statistics?.data?.row_count ?? 0);
}, 0);
return total;
},
namespaceCount: async function (_namespace = null) {
const { client } = await this.connect();
const statistics = await client.getCollectionStatistics({
collection_name: this.normalize(_namespace),
});
return Number(statistics?.data?.row_count ?? 0);
},
namespace: async function (client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const collection = await client
.getCollectionStatistics({ collection_name: this.normalize(namespace) })
.catch(() => null);
return collection;
},
hasNamespace: async function (namespace = null) {
if (!namespace) return false;
const { client } = await this.connect();
return await this.namespaceExists(client, namespace);
},
namespaceExists: async function (client, namespace = null) {
if (!namespace) throw new Error("No namespace value provided.");
const { value } = await client
.hasCollection({ collection_name: this.normalize(namespace) })
.catch((e) => {
console.error("Zilliz::namespaceExists", e.message);
return { value: false };
});
return value;
},
deleteVectorsInNamespace: async function (client, namespace = null) {
await client.dropCollection({ collection_name: this.normalize(namespace) });
return true;
},
// Zilliz requires a dimension aspect for collection creation
// we pass this in from the first chunk to infer the dimensions like other
// providers do.
getOrCreateCollection: async function (client, namespace, dimensions = null) {
const isExists = await this.namespaceExists(client, namespace);
if (!isExists) {
if (!dimensions)
throw new Error(
`Zilliz:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on GitHub for support.`
);
await client.createCollection({
collection_name: this.normalize(namespace),
fields: [
{
name: "id",
description: "id",
data_type: DataType.VarChar,
max_length: 255,
is_primary_key: true,
},
{
name: "vector",
description: "vector",
data_type: DataType.FloatVector,
dim: dimensions,
},
{
name: "metadata",
description: "metadata",
data_type: DataType.JSON,
},
],
});
await client.createIndex({
collection_name: this.normalize(namespace),
field_name: "vector",
index_type: IndexType.AUTOINDEX,
metric_type: MetricType.COSINE,
});
await client.loadCollectionSync({
collection_name: this.normalize(namespace),
});
}
},
addDocumentToNamespace: async function (
namespace,
documentData = {},
fullFilePath = null,
skipCache = false
) {
const { DocumentVectors } = require("../../../models/vectors");
try {
let vectorDimension = null;
const { pageContent, docId, ...metadata } = documentData;
if (!pageContent || pageContent.length == 0) return false;
console.log("Adding new vectorized document into namespace", namespace);
if (!skipCache) {
const cacheResult = await cachedVectorInformation(fullFilePath);
if (cacheResult.exists) {
const { client } = await this.connect();
const { chunks } = cacheResult;
const documentVectors = [];
vectorDimension = chunks[0][0].values.length || null;
await this.getOrCreateCollection(client, namespace, vectorDimension);
for (const chunk of chunks) {
// Before sending to Pinecone and saving the records to our db
// we need to assign the id of each chunk that is stored in the cached file.
const newChunks = chunk.map((chunk) => {
const id = uuidv4();
documentVectors.push({ docId, vectorId: id });
return { id, vector: chunk.values, metadata: chunk.metadata };
});
const insertResult = await client.insert({
collection_name: this.normalize(namespace),
data: newChunks,
});
if (insertResult?.status.error_code !== "Success") {
throw new Error(
`Error embedding into Zilliz! Reason:${insertResult?.status.reason}`
);
}
}
await DocumentVectors.bulkInsert(documentVectors);
await client.flushSync({
collection_names: [this.normalize(namespace)],
});
return { vectorized: true, error: null };
}
}
const EmbedderEngine = getEmbeddingEngineSelection();
const textSplitter = new TextSplitter({
chunkSize: TextSplitter.determineMaxChunkSize(
await SystemSettings.getValueOrFallback({
label: "text_splitter_chunk_size",
}),
EmbedderEngine?.embeddingMaxChunkLength
),
chunkOverlap: await SystemSettings.getValueOrFallback(
{ label: "text_splitter_chunk_overlap" },
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);
console.log("Snippets created from document:", textChunks.length);
const documentVectors = [];
const vectors = [];
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
if (!!vectorValues && vectorValues.length > 0) {
for (const [i, vector] of vectorValues.entries()) {
if (!vectorDimension) vectorDimension = vector.length;
const vectorRecord = {
id: uuidv4(),
values: vector,
// [DO NOT REMOVE]
// LangChain will be unable to find your text if you embed manually and dont include the `text` key.
metadata: { ...metadata, text: textChunks[i] },
};
vectors.push(vectorRecord);
documentVectors.push({ docId, vectorId: vectorRecord.id });
}
} else {
throw new Error(
"Could not embed document chunks! This document will not be recorded."
);
}
if (vectors.length > 0) {
const chunks = [];
const { client } = await this.connect();
await this.getOrCreateCollection(client, namespace, vectorDimension);
console.log("Inserting vectorized chunks into Zilliz.");
for (const chunk of toChunks(vectors, 100)) {
chunks.push(chunk);
const insertResult = await client.insert({
collection_name: this.normalize(namespace),
data: chunk.map((item) => ({
id: item.id,
vector: item.values,
metadata: item.metadata,
})),
});
if (insertResult?.status.error_code !== "Success") {
throw new Error(
`Error embedding into Zilliz! Reason:${insertResult?.status.reason}`
);
}
}
await storeVectorResult(chunks, fullFilePath);
await client.flushSync({
collection_names: [this.normalize(namespace)],
});
}
await DocumentVectors.bulkInsert(documentVectors);
return { vectorized: true, error: null };
} catch (e) {
console.error("addDocumentToNamespace", e.message);
return { vectorized: false, error: e.message };
}
},
deleteDocumentFromNamespace: async function (namespace, docId) {
const { DocumentVectors } = require("../../../models/vectors");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) return;
const knownDocuments = await DocumentVectors.where({ docId });
if (knownDocuments.length === 0) return;
const vectorIds = knownDocuments.map((doc) => doc.vectorId);
const queryIn = vectorIds.map((v) => `'${v}'`).join(",");
await client.deleteEntities({
collection_name: this.normalize(namespace),
expr: `id in [${queryIn}]`,
});
const indexes = knownDocuments.map((doc) => doc.id);
await DocumentVectors.deleteIds(indexes);
// Even after flushing Zilliz can take some time to re-calc the count
// so all we can hope to do is flushSync so that the count can be correct
// on a later call.
await client.flushSync({ collection_names: [this.normalize(namespace)] });
return true;
},
performSimilaritySearch: async function ({
namespace = null,
input = "",
LLMConnector = null,
similarityThreshold = 0.25,
topN = 4,
filterIdentifiers = [],
}) {
if (!namespace || !input || !LLMConnector)
throw new Error("Invalid request to performSimilaritySearch.");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace))) {
return {
contextTexts: [],
sources: [],
message: "Invalid query - no documents found for workspace!",
};
}
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse({
client,
namespace,
queryVector,
similarityThreshold,
topN,
filterIdentifiers,
});
const sources = sourceDocuments.map((doc, i) => {
return { metadata: doc, text: contextTexts[i] };
});
return {
contextTexts,
sources: this.curateSources(sources),
message: false,
};
},
similarityResponse: async function ({
client,
namespace,
queryVector,
similarityThreshold = 0.25,
topN = 4,
filterIdentifiers = [],
}) {
const result = {
contextTexts: [],
sourceDocuments: [],
scores: [],
};
const response = await client.search({
collection_name: this.normalize(namespace),
vectors: queryVector,
limit: topN,
});
response.results.forEach((match) => {
if (match.score < similarityThreshold) return;
if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
console.log(
"Zilliz: A source was filtered from context as it's parent document is pinned."
);
return;
}
result.contextTexts.push(match.metadata.text);
result.sourceDocuments.push({
...match.metadata,
score: match.score,
});
result.scores.push(match.score);
});
return result;
},
"namespace-stats": async function (reqBody = {}) {
const { namespace = null } = reqBody;
if (!namespace) throw new Error("namespace required");
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace)))
throw new Error("Namespace by that name does not exist.");
const stats = await this.namespace(client, namespace);
return stats
? stats
: { message: "No stats were able to be fetched from DB for namespace" };
},
"delete-namespace": async function (reqBody = {}) {
const { namespace = null } = reqBody;
const { client } = await this.connect();
if (!(await this.namespaceExists(client, namespace)))
throw new Error("Namespace by that name does not exist.");
const statistics = await this.namespace(client, namespace);
await this.deleteVectorsInNamespace(client, namespace);
const vectorCount = Number(statistics?.data?.row_count ?? 0);
return {
message: `Namespace ${namespace} was deleted along with ${vectorCount} vectors.`,
};
},
curateSources: function (sources = []) {
const documents = [];
for (const source of sources) {
const { metadata = {} } = source;
if (Object.keys(metadata).length > 0) {
documents.push({
...metadata,
...(source.text ? { text: source.text } : {}),
});
}
}
return documents;
},
};
module.exports.Zilliz = Zilliz;