VectorDB class migration (#4787)
* Migrate Astra to class (#4722) migrate astra to class * Migrate LanceDB to class (#4721) migrate lancedb to class * Migrate Pinecone to class (#4726) migrate pinecone to class * Migrate Zilliz to class (#4729) migrate zilliz to class * Migrate Weaviate to class (#4728) migrate weaviate to class * Migrate Qdrant to class (#4727) migrate qdrant to class * Migrate Milvus to class (#4725) migrate milvus to class * Migrate Chroma to class (#4723) migrate chroma to class * Migrate Chroma Cloud to class (#4724) * migrate chroma to class * migrate chroma cloud to class * move limits to class field --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> * Migrate PGVector to class (#4730) * migrate pgvector to class * patch pgvector test * convert connectionString, tableName, and validateConnection to static methods * move instance properties to class fields --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> * Refactor Zilliz Cloud vector DB provider (#4749) simplify zilliz implementation by using milvus as base class Co-authored-by: Timothy Carambat <rambat1010@gmail.com> * VectorDatabase base class (#4738) create generic VectorDatabase base class Co-authored-by: Timothy Carambat <rambat1010@gmail.com> * Extend VectorDatabase base class to all providers (#4755) extend VectorDatabase base class to all providers * patch lancedb import * breakout name and add generic logger * dev tag build --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
parent
7c3b7906e7
commit
5039045f0c
2
.github/workflows/dev-build.yaml
vendored
2
.github/workflows/dev-build.yaml
vendored
@ -6,7 +6,7 @@ concurrency:
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ['4841-aws-bedrock-api-key'] # put your current branch to create a build. Core team only.
|
||||
branches: ['vectordb-class-migration'] # put your current branch to create a build. Core team only.
|
||||
paths-ignore:
|
||||
- '**.md'
|
||||
- 'cloud-deployments/*'
|
||||
|
||||
@ -1,4 +1,6 @@
|
||||
const { PGVector } = require("../../../../utils/vectorDbProviders/pgvector");
|
||||
const { PGVector: PGVectorClass } = require("../../../../utils/vectorDbProviders/pgvector");
|
||||
|
||||
const PGVector = new PGVectorClass();
|
||||
|
||||
describe("PGVector.sanitizeForJsonb", () => {
|
||||
it("returns null/undefined as-is", () => {
|
||||
|
||||
@ -86,40 +86,40 @@ function getVectorDbClass(getExactly = null) {
|
||||
switch (vectorSelection) {
|
||||
case "pinecone":
|
||||
const { Pinecone } = require("../vectorDbProviders/pinecone");
|
||||
return Pinecone;
|
||||
return new Pinecone();
|
||||
case "chroma":
|
||||
const { Chroma } = require("../vectorDbProviders/chroma");
|
||||
return Chroma;
|
||||
return new Chroma();
|
||||
case "chromacloud":
|
||||
const { ChromaCloud } = require("../vectorDbProviders/chromacloud");
|
||||
return ChromaCloud;
|
||||
return new ChromaCloud();
|
||||
case "lancedb":
|
||||
const { LanceDb } = require("../vectorDbProviders/lance");
|
||||
return LanceDb;
|
||||
return new LanceDb();
|
||||
case "weaviate":
|
||||
const { Weaviate } = require("../vectorDbProviders/weaviate");
|
||||
return Weaviate;
|
||||
return new Weaviate();
|
||||
case "qdrant":
|
||||
const { QDrant } = require("../vectorDbProviders/qdrant");
|
||||
return QDrant;
|
||||
return new QDrant();
|
||||
case "milvus":
|
||||
const { Milvus } = require("../vectorDbProviders/milvus");
|
||||
return Milvus;
|
||||
return new Milvus();
|
||||
case "zilliz":
|
||||
const { Zilliz } = require("../vectorDbProviders/zilliz");
|
||||
return Zilliz;
|
||||
return new Zilliz();
|
||||
case "astra":
|
||||
const { AstraDB } = require("../vectorDbProviders/astra");
|
||||
return AstraDB;
|
||||
return new AstraDB();
|
||||
case "pgvector":
|
||||
const { PGVector } = require("../vectorDbProviders/pgvector");
|
||||
return PGVector;
|
||||
return new PGVector();
|
||||
default:
|
||||
console.error(
|
||||
`\x1b[31m[ENV ERROR]\x1b[0m No VECTOR_DB value found in environment! Falling back to LanceDB`
|
||||
);
|
||||
const { LanceDb: DefaultLanceDb } = require("../vectorDbProviders/lance");
|
||||
return DefaultLanceDb;
|
||||
return new DefaultLanceDb();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -5,6 +5,7 @@ const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||
const { v4: uuidv4 } = require("uuid");
|
||||
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
|
||||
const { sourceIdentifier } = require("../../chats");
|
||||
const { VectorDatabase } = require("../base");
|
||||
|
||||
const sanitizeNamespace = (namespace) => {
|
||||
// If namespace already starts with ns_, don't add it again
|
||||
@ -22,14 +23,21 @@ const collectionExists = async function (client, namespace) {
|
||||
return collections.includes(namespace);
|
||||
}
|
||||
} catch (error) {
|
||||
console.log("Astra::collectionExists check error", error?.message || error);
|
||||
this.logger("collectionExists check error", error?.message || error);
|
||||
return false; // Return false for any error to allow creation attempt
|
||||
}
|
||||
};
|
||||
|
||||
const AstraDB = {
|
||||
name: "AstraDB",
|
||||
connect: async function () {
|
||||
class AstraDB extends VectorDatabase {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
get name() {
|
||||
return "AstraDB";
|
||||
}
|
||||
|
||||
async connect() {
|
||||
if (process.env.VECTOR_DB !== "astra")
|
||||
throw new Error("AstraDB::Invalid ENV settings");
|
||||
|
||||
@ -38,21 +46,24 @@ const AstraDB = {
|
||||
process?.env?.ASTRA_DB_ENDPOINT
|
||||
);
|
||||
return { client };
|
||||
},
|
||||
heartbeat: async function () {
|
||||
}
|
||||
|
||||
async heartbeat() {
|
||||
return { heartbeat: Number(new Date()) };
|
||||
},
|
||||
}
|
||||
|
||||
// Astra interface will return a valid collection object even if the collection
|
||||
// does not actually exist. So we run a simple check which will always throw
|
||||
// when the table truly does not exist. Faster than iterating all collections.
|
||||
isRealCollection: async function (astraCollection = null) {
|
||||
async isRealCollection(astraCollection = null) {
|
||||
if (!astraCollection) return false;
|
||||
return await astraCollection
|
||||
.countDocuments()
|
||||
.then(() => true)
|
||||
.catch(() => false);
|
||||
},
|
||||
totalVectors: async function () {
|
||||
}
|
||||
|
||||
async totalVectors() {
|
||||
const { client } = await this.connect();
|
||||
const collectionNames = await this.allNamespaces(client);
|
||||
var totalVectors = 0;
|
||||
@ -62,13 +73,15 @@ const AstraDB = {
|
||||
totalVectors += count ? count : 0;
|
||||
}
|
||||
return totalVectors;
|
||||
},
|
||||
namespaceCount: async function (_namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceCount(_namespace = null) {
|
||||
const { client } = await this.connect();
|
||||
const namespace = await this.namespace(client, _namespace);
|
||||
return namespace?.vectorCount || 0;
|
||||
},
|
||||
namespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespace(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const sanitizedNamespace = sanitizeNamespace(namespace);
|
||||
const collection = await client
|
||||
@ -77,7 +90,7 @@ const AstraDB = {
|
||||
if (!(await this.isRealCollection(collection))) return null;
|
||||
|
||||
const count = await collection.countDocuments().catch((e) => {
|
||||
console.error("Astra::namespaceExists", e.message);
|
||||
this.logger("namespaceExists", e.message);
|
||||
return null;
|
||||
});
|
||||
|
||||
@ -86,27 +99,31 @@ const AstraDB = {
|
||||
...collection,
|
||||
vectorCount: typeof count === "number" ? count : 0,
|
||||
};
|
||||
},
|
||||
hasNamespace: async function (namespace = null) {
|
||||
}
|
||||
|
||||
async hasNamespace(namespace = null) {
|
||||
if (!namespace) return false;
|
||||
const { client } = await this.connect();
|
||||
return await this.namespaceExists(client, namespace);
|
||||
},
|
||||
namespaceExists: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceExists(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const sanitizedNamespace = sanitizeNamespace(namespace);
|
||||
const collection = await client.collection(sanitizedNamespace);
|
||||
return await this.isRealCollection(collection);
|
||||
},
|
||||
deleteVectorsInNamespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async deleteVectorsInNamespace(client, namespace = null) {
|
||||
const sanitizedNamespace = sanitizeNamespace(namespace);
|
||||
await client.dropCollection(sanitizedNamespace);
|
||||
return true;
|
||||
},
|
||||
}
|
||||
|
||||
// AstraDB requires a dimension aspect for collection creation
|
||||
// we pass this in from the first chunk to infer the dimensions like other
|
||||
// providers do.
|
||||
getOrCreateCollection: async function (client, namespace, dimensions = null) {
|
||||
async getOrCreateCollection(client, namespace, dimensions = null) {
|
||||
const sanitizedNamespace = sanitizeNamespace(namespace);
|
||||
try {
|
||||
const exists = await collectionExists(client, sanitizedNamespace);
|
||||
@ -132,14 +149,12 @@ const AstraDB = {
|
||||
|
||||
return await client.collection(sanitizedNamespace);
|
||||
} catch (error) {
|
||||
console.error(
|
||||
"Astra::getOrCreateCollection error",
|
||||
error?.message || error
|
||||
);
|
||||
this.logger("getOrCreateCollection", error?.message || error);
|
||||
throw error;
|
||||
}
|
||||
},
|
||||
addDocumentToNamespace: async function (
|
||||
}
|
||||
|
||||
async addDocumentToNamespace(
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
@ -151,7 +166,7 @@ const AstraDB = {
|
||||
const { pageContent, docId, ...metadata } = documentData;
|
||||
if (!pageContent || pageContent.length == 0) return false;
|
||||
|
||||
console.log("Adding new vectorized document into namespace", namespace);
|
||||
this.logger("Adding new vectorized document into namespace", namespace);
|
||||
if (!skipCache) {
|
||||
const cacheResult = await cachedVectorInformation(fullFilePath);
|
||||
if (cacheResult.exists) {
|
||||
@ -210,7 +225,7 @@ const AstraDB = {
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
console.log("Snippets created from document:", textChunks.length);
|
||||
this.logger("Snippets created from document:", textChunks.length);
|
||||
const documentVectors = [];
|
||||
const vectors = [];
|
||||
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
|
||||
@ -246,7 +261,7 @@ const AstraDB = {
|
||||
if (vectors.length > 0) {
|
||||
const chunks = [];
|
||||
|
||||
console.log("Inserting vectorized chunks into Astra DB.");
|
||||
this.logger("Inserting vectorized chunks into Astra DB.");
|
||||
|
||||
// AstraDB has maximum upsert size of 20 records per-request so we have to use a lower chunk size here
|
||||
// in order to do the queries - this takes a lot more time than other providers but there
|
||||
@ -266,11 +281,12 @@ const AstraDB = {
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (e) {
|
||||
console.error("addDocumentToNamespace", e.message);
|
||||
this.logger("addDocumentToNamespace", e.message);
|
||||
return { vectorized: false, error: e.message };
|
||||
}
|
||||
},
|
||||
deleteDocumentFromNamespace: async function (namespace, docId) {
|
||||
}
|
||||
|
||||
async deleteDocumentFromNamespace(namespace, docId) {
|
||||
const { DocumentVectors } = require("../../../models/vectors");
|
||||
const { client } = await this.connect();
|
||||
namespace = sanitizeNamespace(namespace);
|
||||
@ -293,8 +309,9 @@ const AstraDB = {
|
||||
const indexes = knownDocuments.map((doc) => doc.id);
|
||||
await DocumentVectors.deleteIds(indexes);
|
||||
return true;
|
||||
},
|
||||
performSimilaritySearch: async function ({
|
||||
}
|
||||
|
||||
async performSimilaritySearch({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
@ -336,8 +353,9 @@ const AstraDB = {
|
||||
sources: this.curateSources(sources),
|
||||
message: false,
|
||||
};
|
||||
},
|
||||
similarityResponse: async function ({
|
||||
}
|
||||
|
||||
async similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
@ -367,8 +385,8 @@ const AstraDB = {
|
||||
responses.forEach((response) => {
|
||||
if (response.$similarity < similarityThreshold) return;
|
||||
if (filterIdentifiers.includes(sourceIdentifier(response.metadata))) {
|
||||
console.log(
|
||||
"AstraDB: A source was filtered from context as it's parent document is pinned."
|
||||
this.logger(
|
||||
"A source was filtered from context as it's parent document is pinned."
|
||||
);
|
||||
return;
|
||||
}
|
||||
@ -380,8 +398,9 @@ const AstraDB = {
|
||||
result.scores.push(response.$similarity);
|
||||
});
|
||||
return result;
|
||||
},
|
||||
allNamespaces: async function (client) {
|
||||
}
|
||||
|
||||
async allNamespaces(client) {
|
||||
try {
|
||||
let header = new Headers();
|
||||
header.append("Token", client?.httpClient?.applicationToken);
|
||||
@ -403,11 +422,12 @@ const AstraDB = {
|
||||
const collections = resp ? JSON.parse(resp)?.status?.collections : [];
|
||||
return collections;
|
||||
} catch (e) {
|
||||
console.error("Astra::AllNamespace", e);
|
||||
this.logger("AllNamespace", e);
|
||||
return [];
|
||||
}
|
||||
},
|
||||
"namespace-stats": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "namespace-stats"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("namespace required");
|
||||
const { client } = await this.connect();
|
||||
@ -417,8 +437,9 @@ const AstraDB = {
|
||||
return stats
|
||||
? stats
|
||||
: { message: "No stats were able to be fetched from DB for namespace" };
|
||||
},
|
||||
"delete-namespace": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "delete-namespace"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace)))
|
||||
@ -431,8 +452,9 @@ const AstraDB = {
|
||||
details?.vectorCount || "all"
|
||||
} vectors.`,
|
||||
};
|
||||
},
|
||||
curateSources: function (sources = []) {
|
||||
}
|
||||
|
||||
curateSources(sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
if (Object.keys(source).length > 0) {
|
||||
@ -446,7 +468,7 @@ const AstraDB = {
|
||||
}
|
||||
|
||||
return documents;
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.AstraDB = AstraDB;
|
||||
|
||||
201
server/utils/vectorDbProviders/base.js
Normal file
201
server/utils/vectorDbProviders/base.js
Normal file
@ -0,0 +1,201 @@
|
||||
/**
|
||||
* Base class for all Vector Database providers.
|
||||
* All vector database providers should extend this class and implement/override the necessary methods.
|
||||
*/
|
||||
class VectorDatabase {
|
||||
get name() {
|
||||
return "VectorDatabase";
|
||||
}
|
||||
|
||||
constructor() {
|
||||
if (this.constructor === VectorDatabase) {
|
||||
throw new Error("VectorDatabase cannot be instantiated directly");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Connect to vector database client
|
||||
* @returns {Promise<{client: any}>}
|
||||
*/
|
||||
async connect() {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Heartbeat check for vector database client
|
||||
* @returns {Promise<{heartbeat: number}>}
|
||||
*/
|
||||
async heartbeat() {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get total number of vectors across all namespaces
|
||||
* @returns {Promise<number>}
|
||||
*/
|
||||
async totalVectors() {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get count of vectors in a specific namespace
|
||||
* @param {string} namespace - Namespace to count vectors in
|
||||
* @returns {Promise<number>}
|
||||
*/
|
||||
async namespaceCount(namespace = null) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get namespace details
|
||||
* @param {any} client - Vector database client
|
||||
* @param {string} namespace - Namespace to get
|
||||
* @returns {Promise<any>}
|
||||
*/
|
||||
async namespace(client, namespace = null) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a namespace exists
|
||||
* @param {string} namespace - Namespace to check
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
async hasNamespace(namespace = null) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a namespace exists with a client
|
||||
* @param {any} client - Vector database client
|
||||
* @param {string} namespace - Namespace to check
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
async namespaceExists(client, namespace = null) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all vectors in a namespace
|
||||
* @param {any} client - Vector database client
|
||||
* @param {string} namespace - Namespace to delete vectors from
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
async deleteVectorsInNamespace(client, namespace = null) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a document to a namespace
|
||||
* @param {string} namespace - Namespace to add document to
|
||||
* @param {Object} documentData - Document data
|
||||
* @param {string} fullFilePath - Full file path
|
||||
* @param {boolean} skipCache - Skip cache
|
||||
* @returns {Promise<{vectorized: boolean, error: string|null}>}
|
||||
*/
|
||||
async addDocumentToNamespace(
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
skipCache = false
|
||||
) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a document from namespace
|
||||
* @param {string} namespace - Namespace to delete document from
|
||||
* @param {string} docId - Document id
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
async deleteDocumentFromNamespace(namespace, docId) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform a similarity search
|
||||
* @param {Object} params - Search parameters
|
||||
* @param {string} params.namespace - Namespace to search in
|
||||
* @param {string} params.input - Input text to search for
|
||||
* @param {any} params.LLMConnector - LLM connector for embeddings
|
||||
* @param {number} params.similarityThreshold - Similarity threshold
|
||||
* @param {number} params.topN - Number of results to return
|
||||
* @param {string[]} params.filterIdentifiers - Identifiers to filter out
|
||||
* @returns {Promise<{contextTexts: string[], sources: any[], message: string|boolean}>}
|
||||
*/
|
||||
async performSimilaritySearch({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
similarityThreshold = 0.25,
|
||||
topN = 4,
|
||||
filterIdentifiers = [],
|
||||
}) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Perform a similarity search and return raw results
|
||||
* @param {Object} params - Search parameters
|
||||
* @param {any} params.client - Vector database client
|
||||
* @param {string} params.namespace - Namespace to search in
|
||||
* @param {number[]} params.queryVector - Query vector
|
||||
* @param {number} params.similarityThreshold - Similarity threshold
|
||||
* @param {number} params.topN - Number of results to return
|
||||
* @param {string[]} params.filterIdentifiers - Identifiers to filter out
|
||||
* @returns {Promise<{contextTexts: string[], sourceDocuments: any[], scores: number[]}>}
|
||||
*/
|
||||
async similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
similarityThreshold = 0.25,
|
||||
topN = 4,
|
||||
filterIdentifiers = [],
|
||||
}) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get namespace stats
|
||||
* @param {Object} reqBody - Request body
|
||||
* @param {string} reqBody.namespace - Namespace to get stats for
|
||||
* @returns {Promise<any>}
|
||||
*/
|
||||
async "namespace-stats"(reqBody = {}) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a namespace
|
||||
* @param {Object} reqBody - Request body
|
||||
* @param {string} reqBody.namespace - Namespace to delete
|
||||
* @returns {Promise<{message: string}>}
|
||||
*/
|
||||
async "delete-namespace"(reqBody = {}) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset vector database (delete all data)
|
||||
* @returns {Promise<{reset: boolean}>}
|
||||
*/
|
||||
async reset() {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
/**
|
||||
* Curate sources from search results
|
||||
* @param {any[]} sources - Sources to curate
|
||||
* @returns {any[]}
|
||||
*/
|
||||
curateSources(sources = []) {
|
||||
throw new Error("Must be implemented by provider");
|
||||
}
|
||||
|
||||
logger(message = null, ...args) {
|
||||
console.log(`\x1b[36m[VectorDB::${this.name}]\x1b[0m ${message}`, ...args);
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = { VectorDatabase };
|
||||
@ -6,12 +6,20 @@ const { v4: uuidv4 } = require("uuid");
|
||||
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
|
||||
const { parseAuthHeader } = require("../../http");
|
||||
const { sourceIdentifier } = require("../../chats");
|
||||
const { VectorDatabase } = require("../base");
|
||||
const COLLECTION_REGEX = new RegExp(
|
||||
/^(?!\d+\.\d+\.\d+\.\d+$)(?!.*\.\.)(?=^[a-zA-Z0-9][a-zA-Z0-9_-]{1,61}[a-zA-Z0-9]$).{3,63}$/
|
||||
);
|
||||
|
||||
const Chroma = {
|
||||
name: "Chroma",
|
||||
class Chroma extends VectorDatabase {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
get name() {
|
||||
return "Chroma";
|
||||
}
|
||||
|
||||
// Chroma DB has specific requirements for collection names:
|
||||
// (1) Must contain 3-63 characters
|
||||
// (2) Must start and end with an alphanumeric character
|
||||
@ -20,7 +28,7 @@ const Chroma = {
|
||||
// (5) Cannot be a valid IPv4 address
|
||||
// We need to enforce these rules by normalizing the collection names
|
||||
// before communicating with the Chroma DB.
|
||||
normalize: function (inputString) {
|
||||
normalize(inputString) {
|
||||
if (COLLECTION_REGEX.test(inputString)) return inputString;
|
||||
let normalized = inputString.replace(/[^a-zA-Z0-9_-]/g, "-");
|
||||
|
||||
@ -54,8 +62,9 @@ const Chroma = {
|
||||
}
|
||||
|
||||
return normalized;
|
||||
},
|
||||
connect: async function () {
|
||||
}
|
||||
|
||||
async connect() {
|
||||
if (process.env.VECTOR_DB !== "chroma")
|
||||
throw new Error("Chroma::Invalid ENV settings");
|
||||
|
||||
@ -79,12 +88,14 @@ const Chroma = {
|
||||
"ChromaDB::Invalid Heartbeat received - is the instance online?"
|
||||
);
|
||||
return { client };
|
||||
},
|
||||
heartbeat: async function () {
|
||||
}
|
||||
|
||||
async heartbeat() {
|
||||
const { client } = await this.connect();
|
||||
return { heartbeat: await client.heartbeat() };
|
||||
},
|
||||
totalVectors: async function () {
|
||||
}
|
||||
|
||||
async totalVectors() {
|
||||
const { client } = await this.connect();
|
||||
const collections = await client.listCollections();
|
||||
var totalVectors = 0;
|
||||
@ -96,19 +107,22 @@ const Chroma = {
|
||||
totalVectors += await collection.count();
|
||||
}
|
||||
return totalVectors;
|
||||
},
|
||||
distanceToSimilarity: function (distance = null) {
|
||||
}
|
||||
|
||||
distanceToSimilarity(distance = null) {
|
||||
if (distance === null || typeof distance !== "number") return 0.0;
|
||||
if (distance >= 1.0) return 1;
|
||||
if (distance < 0) return 1 - Math.abs(distance);
|
||||
return 1 - distance;
|
||||
},
|
||||
namespaceCount: async function (_namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceCount(_namespace = null) {
|
||||
const { client } = await this.connect();
|
||||
const namespace = await this.namespace(client, this.normalize(_namespace));
|
||||
return namespace?.vectorCount || 0;
|
||||
},
|
||||
similarityResponse: async function ({
|
||||
}
|
||||
|
||||
async similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
@ -137,8 +151,8 @@ const Chroma = {
|
||||
if (
|
||||
filterIdentifiers.includes(sourceIdentifier(response.metadatas[0][i]))
|
||||
) {
|
||||
console.log(
|
||||
"Chroma: A source was filtered from context as it's parent document is pinned."
|
||||
this.logger(
|
||||
"A source was filtered from context as it's parent document is pinned."
|
||||
);
|
||||
return;
|
||||
}
|
||||
@ -149,8 +163,9 @@ const Chroma = {
|
||||
});
|
||||
|
||||
return result;
|
||||
},
|
||||
namespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespace(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const collection = await client
|
||||
.getCollection({ name: this.normalize(namespace) })
|
||||
@ -161,27 +176,31 @@ const Chroma = {
|
||||
...collection,
|
||||
vectorCount: await collection.count(),
|
||||
};
|
||||
},
|
||||
hasNamespace: async function (namespace = null) {
|
||||
}
|
||||
|
||||
async hasNamespace(namespace = null) {
|
||||
if (!namespace) return false;
|
||||
const { client } = await this.connect();
|
||||
return await this.namespaceExists(client, this.normalize(namespace));
|
||||
},
|
||||
namespaceExists: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceExists(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const collection = await client
|
||||
.getCollection({ name: this.normalize(namespace) })
|
||||
.catch((e) => {
|
||||
console.error("ChromaDB::namespaceExists", e.message);
|
||||
this.logger("namespaceExists", e.message);
|
||||
return null;
|
||||
});
|
||||
return !!collection;
|
||||
},
|
||||
deleteVectorsInNamespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async deleteVectorsInNamespace(client, namespace = null) {
|
||||
await client.deleteCollection({ name: this.normalize(namespace) });
|
||||
return true;
|
||||
},
|
||||
addDocumentToNamespace: async function (
|
||||
}
|
||||
|
||||
async addDocumentToNamespace(
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
@ -192,7 +211,7 @@ const Chroma = {
|
||||
const { pageContent, docId, ...metadata } = documentData;
|
||||
if (!pageContent || pageContent.length == 0) return false;
|
||||
|
||||
console.log("Adding new vectorized document into namespace", namespace);
|
||||
this.logger("Adding new vectorized document into namespace", namespace);
|
||||
if (!skipCache) {
|
||||
const cacheResult = await cachedVectorInformation(fullFilePath);
|
||||
if (cacheResult.exists) {
|
||||
@ -254,7 +273,7 @@ const Chroma = {
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
console.log("Snippets created from document:", textChunks.length);
|
||||
this.logger("Snippets created from document:", textChunks.length);
|
||||
const documentVectors = [];
|
||||
const vectors = [];
|
||||
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
|
||||
@ -298,16 +317,16 @@ const Chroma = {
|
||||
|
||||
if (vectors.length > 0) {
|
||||
const chunks = [];
|
||||
console.log("Inserting vectorized chunks into Chroma collection.");
|
||||
this.logger("Inserting vectorized chunks into Chroma collection.");
|
||||
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
|
||||
|
||||
try {
|
||||
await this.smartAdd(collection, submission);
|
||||
console.log(
|
||||
this.logger(
|
||||
`Successfully added ${submission.ids.length} vectors to collection ${this.normalize(namespace)}`
|
||||
);
|
||||
} catch (error) {
|
||||
console.error("Error adding to ChromaDB:", error);
|
||||
this.logger("Error adding to ChromaDB:", error);
|
||||
throw new Error(`Error embedding into ChromaDB: ${error.message}`);
|
||||
}
|
||||
|
||||
@ -317,11 +336,12 @@ const Chroma = {
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (e) {
|
||||
console.error("addDocumentToNamespace", e.message);
|
||||
this.logger("addDocumentToNamespace", e.message);
|
||||
return { vectorized: false, error: e.message };
|
||||
}
|
||||
},
|
||||
deleteDocumentFromNamespace: async function (namespace, docId) {
|
||||
}
|
||||
|
||||
async deleteDocumentFromNamespace(namespace, docId) {
|
||||
const { DocumentVectors } = require("../../../models/vectors");
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace))) return;
|
||||
@ -338,8 +358,9 @@ const Chroma = {
|
||||
const indexes = knownDocuments.map((doc) => doc.id);
|
||||
await DocumentVectors.deleteIds(indexes);
|
||||
return true;
|
||||
},
|
||||
performSimilaritySearch: async function ({
|
||||
}
|
||||
|
||||
async performSimilaritySearch({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
@ -383,8 +404,9 @@ const Chroma = {
|
||||
sources: this.curateSources(sources),
|
||||
message: false,
|
||||
};
|
||||
},
|
||||
"namespace-stats": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "namespace-stats"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("namespace required");
|
||||
const { client } = await this.connect();
|
||||
@ -394,8 +416,9 @@ const Chroma = {
|
||||
return stats
|
||||
? stats
|
||||
: { message: "No stats were able to be fetched from DB for namespace" };
|
||||
},
|
||||
"delete-namespace": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "delete-namespace"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, this.normalize(namespace))))
|
||||
@ -406,13 +429,15 @@ const Chroma = {
|
||||
return {
|
||||
message: `Namespace ${namespace} was deleted along with ${details?.vectorCount} vectors.`,
|
||||
};
|
||||
},
|
||||
reset: async function () {
|
||||
}
|
||||
|
||||
async reset() {
|
||||
const { client } = await this.connect();
|
||||
await client.reset();
|
||||
return { reset: true };
|
||||
},
|
||||
curateSources: function (sources = []) {
|
||||
}
|
||||
|
||||
curateSources(sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
const { metadata = {} } = source;
|
||||
@ -427,7 +452,8 @@ const Chroma = {
|
||||
}
|
||||
|
||||
return documents;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* This method is a wrapper around the ChromaCollection.add method.
|
||||
* It will return true if the add was successful, false otherwise.
|
||||
@ -436,10 +462,11 @@ const Chroma = {
|
||||
* @param {{ids: string[], embeddings: number[], metadatas: Record<string, any>[], documents: string[]}[]} submissions
|
||||
* @returns {Promise<boolean>} True if the add was successful, false otherwise.
|
||||
*/
|
||||
smartAdd: async function (collection, submissions) {
|
||||
async smartAdd(collection, submissions) {
|
||||
await collection.add(submissions);
|
||||
return true;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* This method is a wrapper around the ChromaCollection.delete method.
|
||||
* It will return the result of the delete method directly.
|
||||
@ -448,10 +475,10 @@ const Chroma = {
|
||||
* @param {string[]} vectorIds
|
||||
* @returns {Promise<boolean>} True if the delete was successful, false otherwise.
|
||||
*/
|
||||
smartDelete: async function (collection, vectorIds) {
|
||||
async smartDelete(collection, vectorIds) {
|
||||
await collection.delete({ ids: vectorIds });
|
||||
return true;
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.Chroma = Chroma;
|
||||
|
||||
@ -6,20 +6,27 @@ const { toChunks } = require("../../helpers");
|
||||
* ChromaCloud works nearly the same as Chroma so we can just extend the
|
||||
* Chroma class and override the connect method to use the CloudClient for major differences in API functionality.
|
||||
*/
|
||||
const ChromaCloud = {
|
||||
...Chroma,
|
||||
name: "ChromaCloud",
|
||||
class ChromaCloud extends Chroma {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
get name() {
|
||||
return "ChromaCloud";
|
||||
}
|
||||
|
||||
/**
|
||||
* Basic quota/limitations for Chroma Cloud for accounts. Does not lookup client-specific limits.
|
||||
* @see https://docs.trychroma.com/cloud/quotas-limits
|
||||
*/
|
||||
limits: {
|
||||
limits = {
|
||||
maxEmbeddingDim: 4_096,
|
||||
maxDocumentBytes: 16_384,
|
||||
maxMetadataBytes: 4_096,
|
||||
maxRecordsPerWrite: 300,
|
||||
},
|
||||
connect: async function () {
|
||||
};
|
||||
|
||||
async connect() {
|
||||
if (process.env.VECTOR_DB !== "chromacloud")
|
||||
throw new Error("ChromaCloud::Invalid ENV settings");
|
||||
|
||||
@ -35,7 +42,8 @@ const ChromaCloud = {
|
||||
"ChromaCloud::Invalid Heartbeat received - is the instance online?"
|
||||
);
|
||||
return { client };
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Chroma Cloud has some basic limitations on upserts to protect performance and latency.
|
||||
* Local deployments do not have these limitations since they are self-hosted.
|
||||
@ -47,7 +55,7 @@ const ChromaCloud = {
|
||||
* @returns {Promise<boolean>} True if the upsert was successful, false otherwise.
|
||||
* If the upsert was not successful, the error message will be returned.
|
||||
*/
|
||||
smartAdd: async function (collection, submission) {
|
||||
async smartAdd(collection, submission) {
|
||||
const testSubmission = {
|
||||
id: submission.ids[0],
|
||||
embedding: submission.embeddings[0],
|
||||
@ -77,8 +85,8 @@ const ChromaCloud = {
|
||||
return true;
|
||||
}
|
||||
|
||||
console.log(
|
||||
`ChromaCloud::Upsert Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
|
||||
this.logger(
|
||||
`Upsert Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
|
||||
);
|
||||
const chunks = [];
|
||||
let chunkedSubmission = {
|
||||
@ -93,7 +101,7 @@ const ChromaCloud = {
|
||||
chunkedSubmission.metadatas.push(submission.metadatas[i]);
|
||||
chunkedSubmission.documents.push(submission.documents[i]);
|
||||
if (chunkedSubmission.ids.length === this.limits.maxRecordsPerWrite) {
|
||||
console.log(
|
||||
this.logger(
|
||||
`ChromaCloud::Adding chunk payload ${chunks.length + 1} of ${Math.ceil(submission.ids.length / this.limits.maxRecordsPerWrite)}`
|
||||
);
|
||||
chunks.push(chunkedSubmission);
|
||||
@ -114,7 +122,8 @@ const ChromaCloud = {
|
||||
counter++;
|
||||
}
|
||||
return true;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* This method is a wrapper around the ChromaCollection.delete method.
|
||||
* It will return the result of the delete method directly.
|
||||
@ -127,22 +136,22 @@ const ChromaCloud = {
|
||||
* @param {string[]} vectorIds
|
||||
* @returns {Promise<boolean>} True if the delete was successful, false otherwise.
|
||||
*/
|
||||
smartDelete: async function (collection, vectorIds) {
|
||||
async smartDelete(collection, vectorIds) {
|
||||
if (vectorIds.length <= this.limits.maxRecordsPerWrite)
|
||||
return await collection.delete({ ids: vectorIds });
|
||||
|
||||
console.log(
|
||||
`ChromaCloud::Delete Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
|
||||
this.logger(
|
||||
`Delete Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
|
||||
);
|
||||
const chunks = toChunks(vectorIds, this.limits.maxRecordsPerWrite);
|
||||
let counter = 1;
|
||||
for (const chunk of chunks) {
|
||||
console.log(`ChromaCloud::Deleting chunk ${counter} of ${chunks.length}`);
|
||||
this.logger(`Deleting chunk ${counter} of ${chunks.length}`);
|
||||
await collection.delete({ ids: chunk });
|
||||
counter++;
|
||||
}
|
||||
return true;
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.ChromaCloud = ChromaCloud;
|
||||
|
||||
@ -6,38 +6,54 @@ const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||
const { v4: uuidv4 } = require("uuid");
|
||||
const { sourceIdentifier } = require("../../chats");
|
||||
const { NativeEmbeddingReranker } = require("../../EmbeddingRerankers/native");
|
||||
const { VectorDatabase } = require("../base");
|
||||
const path = require("path");
|
||||
|
||||
/**
|
||||
* LancedDB Client connection object
|
||||
* @typedef {import('@lancedb/lancedb').Connection} LanceClient
|
||||
*/
|
||||
|
||||
const LanceDb = {
|
||||
uri: `${
|
||||
!!process.env.STORAGE_DIR ? `${process.env.STORAGE_DIR}/` : "./storage/"
|
||||
}lancedb`,
|
||||
name: "LanceDb",
|
||||
class LanceDb extends VectorDatabase {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
get uri() {
|
||||
const basePath = !!process.env.STORAGE_DIR
|
||||
? process.env.STORAGE_DIR
|
||||
: path.resolve(__dirname, "../../../storage");
|
||||
return path.resolve(basePath, "lancedb");
|
||||
}
|
||||
|
||||
get name() {
|
||||
return "LanceDb";
|
||||
}
|
||||
|
||||
/** @returns {Promise<{client: LanceClient}>} */
|
||||
connect: async function () {
|
||||
async connect() {
|
||||
const client = await lancedb.connect(this.uri);
|
||||
return { client };
|
||||
},
|
||||
distanceToSimilarity: function (distance = null) {
|
||||
}
|
||||
|
||||
distanceToSimilarity(distance = null) {
|
||||
if (distance === null || typeof distance !== "number") return 0.0;
|
||||
if (distance >= 1.0) return 1;
|
||||
if (distance < 0) return 1 - Math.abs(distance);
|
||||
return 1 - distance;
|
||||
},
|
||||
heartbeat: async function () {
|
||||
}
|
||||
|
||||
async heartbeat() {
|
||||
await this.connect();
|
||||
return { heartbeat: Number(new Date()) };
|
||||
},
|
||||
tables: async function () {
|
||||
}
|
||||
|
||||
async tables() {
|
||||
const { client } = await this.connect();
|
||||
return await client.tableNames();
|
||||
},
|
||||
totalVectors: async function () {
|
||||
}
|
||||
|
||||
async totalVectors() {
|
||||
const { client } = await this.connect();
|
||||
const tables = await client.tableNames();
|
||||
let count = 0;
|
||||
@ -46,15 +62,17 @@ const LanceDb = {
|
||||
count += await table.countRows();
|
||||
}
|
||||
return count;
|
||||
},
|
||||
namespaceCount: async function (_namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceCount(_namespace = null) {
|
||||
const { client } = await this.connect();
|
||||
const exists = await this.namespaceExists(client, _namespace);
|
||||
if (!exists) return 0;
|
||||
|
||||
const table = await client.openTable(_namespace);
|
||||
return (await table.countRows()) || 0;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs a SimilaritySearch + Reranking on a namespace.
|
||||
* @param {Object} params - The parameters for the rerankedSimilarityResponse.
|
||||
@ -67,7 +85,7 @@ const LanceDb = {
|
||||
* @param {string[]} params.filterIdentifiers - The identifiers of the documents to filter out.
|
||||
* @returns
|
||||
*/
|
||||
rerankedSimilarityResponse: async function ({
|
||||
async rerankedSimilarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
query,
|
||||
@ -116,8 +134,8 @@ const LanceDb = {
|
||||
return;
|
||||
const { vector: _, ...rest } = item;
|
||||
if (filterIdentifiers.includes(sourceIdentifier(rest))) {
|
||||
console.log(
|
||||
"LanceDB: A source was filtered from context as it's parent document is pinned."
|
||||
this.logger(
|
||||
"A source was filtered from context as it's parent document is pinned."
|
||||
);
|
||||
return;
|
||||
}
|
||||
@ -133,12 +151,12 @@ const LanceDb = {
|
||||
});
|
||||
})
|
||||
.catch((e) => {
|
||||
console.error(e);
|
||||
console.error("LanceDB::rerankedSimilarityResponse", e.message);
|
||||
this.logger(e);
|
||||
this.logger("rerankedSimilarityResponse", e.message);
|
||||
});
|
||||
|
||||
return result;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs a SimilaritySearch on a give LanceDB namespace.
|
||||
@ -151,7 +169,7 @@ const LanceDb = {
|
||||
* @param {string[]} params.filterIdentifiers
|
||||
* @returns
|
||||
*/
|
||||
similarityResponse: async function ({
|
||||
async similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
@ -177,8 +195,8 @@ const LanceDb = {
|
||||
return;
|
||||
const { vector: _, ...rest } = item;
|
||||
if (filterIdentifiers.includes(sourceIdentifier(rest))) {
|
||||
console.log(
|
||||
"LanceDB: A source was filtered from context as it's parent document is pinned."
|
||||
this.logger(
|
||||
"A source was filtered from context as it's parent document is pinned."
|
||||
);
|
||||
return;
|
||||
}
|
||||
@ -192,14 +210,15 @@ const LanceDb = {
|
||||
});
|
||||
|
||||
return result;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {LanceClient} client
|
||||
* @param {string} namespace
|
||||
* @returns
|
||||
*/
|
||||
namespace: async function (client, namespace = null) {
|
||||
async namespace(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const collection = await client.openTable(namespace).catch(() => false);
|
||||
if (!collection) return null;
|
||||
@ -207,7 +226,8 @@ const LanceDb = {
|
||||
return {
|
||||
...collection,
|
||||
};
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {LanceClient} client
|
||||
@ -215,7 +235,7 @@ const LanceDb = {
|
||||
* @param {string} namespace
|
||||
* @returns
|
||||
*/
|
||||
updateOrCreateCollection: async function (client, data = [], namespace) {
|
||||
async updateOrCreateCollection(client, data = [], namespace) {
|
||||
const hasNamespace = await this.hasNamespace(namespace);
|
||||
if (hasNamespace) {
|
||||
const collection = await client.openTable(namespace);
|
||||
@ -225,40 +245,44 @@ const LanceDb = {
|
||||
|
||||
await client.createTable(namespace, data);
|
||||
return true;
|
||||
},
|
||||
hasNamespace: async function (namespace = null) {
|
||||
}
|
||||
|
||||
async hasNamespace(namespace = null) {
|
||||
if (!namespace) return false;
|
||||
const { client } = await this.connect();
|
||||
const exists = await this.namespaceExists(client, namespace);
|
||||
return exists;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {LanceClient} client
|
||||
* @param {string} namespace
|
||||
* @returns
|
||||
*/
|
||||
namespaceExists: async function (client, namespace = null) {
|
||||
async namespaceExists(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const collections = await client.tableNames();
|
||||
return collections.includes(namespace);
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param {LanceClient} client
|
||||
* @param {string} namespace
|
||||
* @returns
|
||||
*/
|
||||
deleteVectorsInNamespace: async function (client, namespace = null) {
|
||||
async deleteVectorsInNamespace(client, namespace = null) {
|
||||
await client.dropTable(namespace);
|
||||
return true;
|
||||
},
|
||||
deleteDocumentFromNamespace: async function (namespace, docId) {
|
||||
}
|
||||
|
||||
async deleteDocumentFromNamespace(namespace, docId) {
|
||||
const { client } = await this.connect();
|
||||
const exists = await this.namespaceExists(client, namespace);
|
||||
if (!exists) {
|
||||
console.error(
|
||||
`LanceDB:deleteDocumentFromNamespace - namespace ${namespace} does not exist.`
|
||||
this.logger(
|
||||
`deleteDocumentFromNamespace - namespace ${namespace} does not exist.`
|
||||
);
|
||||
return;
|
||||
}
|
||||
@ -272,8 +296,9 @@ const LanceDb = {
|
||||
if (vectorIds.length === 0) return;
|
||||
await table.delete(`id IN (${vectorIds.map((v) => `'${v}'`).join(",")})`);
|
||||
return true;
|
||||
},
|
||||
addDocumentToNamespace: async function (
|
||||
}
|
||||
|
||||
async addDocumentToNamespace(
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
@ -284,7 +309,7 @@ const LanceDb = {
|
||||
const { pageContent, docId, ...metadata } = documentData;
|
||||
if (!pageContent || pageContent.length == 0) return false;
|
||||
|
||||
console.log("Adding new vectorized document into namespace", namespace);
|
||||
this.logger("Adding new vectorized document into namespace", namespace);
|
||||
if (!skipCache) {
|
||||
const cacheResult = await cachedVectorInformation(fullFilePath);
|
||||
if (cacheResult.exists) {
|
||||
@ -329,7 +354,7 @@ const LanceDb = {
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
console.log("Snippets created from document:", textChunks.length);
|
||||
this.logger("Snippets created from document:", textChunks.length);
|
||||
const documentVectors = [];
|
||||
const vectors = [];
|
||||
const submissions = [];
|
||||
@ -364,7 +389,7 @@ const LanceDb = {
|
||||
const chunks = [];
|
||||
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
|
||||
|
||||
console.log("Inserting vectorized chunks into LanceDB collection.");
|
||||
this.logger("Inserting vectorized chunks into LanceDB collection.");
|
||||
const { client } = await this.connect();
|
||||
await this.updateOrCreateCollection(client, submissions, namespace);
|
||||
await storeVectorResult(chunks, fullFilePath);
|
||||
@ -373,11 +398,12 @@ const LanceDb = {
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (e) {
|
||||
console.error("addDocumentToNamespace", e.message);
|
||||
this.logger("addDocumentToNamespace", e.message);
|
||||
return { vectorized: false, error: e.message };
|
||||
}
|
||||
},
|
||||
performSimilaritySearch: async function ({
|
||||
}
|
||||
|
||||
async performSimilaritySearch({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
@ -427,8 +453,9 @@ const LanceDb = {
|
||||
sources: this.curateSources(sources),
|
||||
message: false,
|
||||
};
|
||||
},
|
||||
"namespace-stats": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "namespace-stats"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("namespace required");
|
||||
const { client } = await this.connect();
|
||||
@ -438,8 +465,9 @@ const LanceDb = {
|
||||
return stats
|
||||
? stats
|
||||
: { message: "No stats were able to be fetched from DB for namespace" };
|
||||
},
|
||||
"delete-namespace": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "delete-namespace"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace)))
|
||||
@ -449,14 +477,16 @@ const LanceDb = {
|
||||
return {
|
||||
message: `Namespace ${namespace} was deleted.`,
|
||||
};
|
||||
},
|
||||
reset: async function () {
|
||||
}
|
||||
|
||||
async reset() {
|
||||
const { client } = await this.connect();
|
||||
const fs = require("fs");
|
||||
fs.rm(`${client.uri}`, { recursive: true }, () => null);
|
||||
return { reset: true };
|
||||
},
|
||||
curateSources: function (sources = []) {
|
||||
}
|
||||
|
||||
curateSources(sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
const { text, vector: _v, _distance: _d, ...rest } = source;
|
||||
@ -470,7 +500,7 @@ const LanceDb = {
|
||||
}
|
||||
|
||||
return documents;
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.LanceDb = LanceDb;
|
||||
|
||||
@ -10,22 +10,31 @@ const { v4: uuidv4 } = require("uuid");
|
||||
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
|
||||
const { sourceIdentifier } = require("../../chats");
|
||||
const { VectorDatabase } = require("../base");
|
||||
|
||||
class Milvus extends VectorDatabase {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
get name() {
|
||||
return "Milvus";
|
||||
}
|
||||
|
||||
const Milvus = {
|
||||
name: "Milvus",
|
||||
// Milvus/Zilliz only allows letters, numbers, and underscores in collection names
|
||||
// so we need to enforce that by re-normalizing the names when communicating with
|
||||
// the DB.
|
||||
// If the first char of the collection is not an underscore or letter the collection name will be invalid.
|
||||
normalize: function (inputString) {
|
||||
normalize(inputString) {
|
||||
let normalized = inputString.replace(/[^a-zA-Z0-9_]/g, "_");
|
||||
if (new RegExp(/^[a-zA-Z_]/).test(normalized.slice(0, 1)))
|
||||
normalized = `anythingllm_${normalized}`;
|
||||
return normalized;
|
||||
},
|
||||
connect: async function () {
|
||||
}
|
||||
|
||||
async connect() {
|
||||
if (process.env.VECTOR_DB !== "milvus")
|
||||
throw new Error("Milvus::Invalid ENV settings");
|
||||
throw new Error(`${this.name}::Invalid ENV settings`);
|
||||
|
||||
const client = new MilvusClient({
|
||||
address: process.env.MILVUS_ADDRESS,
|
||||
@ -36,16 +45,18 @@ const Milvus = {
|
||||
const { isHealthy } = await client.checkHealth();
|
||||
if (!isHealthy)
|
||||
throw new Error(
|
||||
"MilvusDB::Invalid Heartbeat received - is the instance online?"
|
||||
`${this.name}::Invalid Heartbeat received - is the instance online?`
|
||||
);
|
||||
|
||||
return { client };
|
||||
},
|
||||
heartbeat: async function () {
|
||||
}
|
||||
|
||||
async heartbeat() {
|
||||
await this.connect();
|
||||
return { heartbeat: Number(new Date()) };
|
||||
},
|
||||
totalVectors: async function () {
|
||||
}
|
||||
|
||||
async totalVectors() {
|
||||
const { client } = await this.connect();
|
||||
const { collection_names } = await client.listCollections();
|
||||
const total = collection_names.reduce(async (acc, collection_name) => {
|
||||
@ -55,49 +66,55 @@ const Milvus = {
|
||||
return Number(acc) + Number(statistics?.data?.row_count ?? 0);
|
||||
}, 0);
|
||||
return total;
|
||||
},
|
||||
namespaceCount: async function (_namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceCount(_namespace = null) {
|
||||
const { client } = await this.connect();
|
||||
const statistics = await client.getCollectionStatistics({
|
||||
collection_name: this.normalize(_namespace),
|
||||
});
|
||||
return Number(statistics?.data?.row_count ?? 0);
|
||||
},
|
||||
namespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespace(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const collection = await client
|
||||
.getCollectionStatistics({ collection_name: this.normalize(namespace) })
|
||||
.catch(() => null);
|
||||
return collection;
|
||||
},
|
||||
hasNamespace: async function (namespace = null) {
|
||||
}
|
||||
|
||||
async hasNamespace(namespace = null) {
|
||||
if (!namespace) return false;
|
||||
const { client } = await this.connect();
|
||||
return await this.namespaceExists(client, namespace);
|
||||
},
|
||||
namespaceExists: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceExists(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const { value } = await client
|
||||
.hasCollection({ collection_name: this.normalize(namespace) })
|
||||
.catch((e) => {
|
||||
console.error("MilvusDB::namespaceExists", e.message);
|
||||
console.error(`${this.name}::namespaceExists`, e.message);
|
||||
return { value: false };
|
||||
});
|
||||
return value;
|
||||
},
|
||||
deleteVectorsInNamespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async deleteVectorsInNamespace(client, namespace = null) {
|
||||
await client.dropCollection({ collection_name: this.normalize(namespace) });
|
||||
return true;
|
||||
},
|
||||
}
|
||||
|
||||
// Milvus requires a dimension aspect for collection creation
|
||||
// we pass this in from the first chunk to infer the dimensions like other
|
||||
// providers do.
|
||||
getOrCreateCollection: async function (client, namespace, dimensions = null) {
|
||||
async getOrCreateCollection(client, namespace, dimensions = null) {
|
||||
const isExists = await this.namespaceExists(client, namespace);
|
||||
if (!isExists) {
|
||||
if (!dimensions)
|
||||
throw new Error(
|
||||
`Milvus:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on GitHub for support.`
|
||||
`${this.name}::getOrCreateCollection Unable to infer vector dimension from input. Open an issue on GitHub for support.`
|
||||
);
|
||||
|
||||
await client.createCollection({
|
||||
@ -133,8 +150,9 @@ const Milvus = {
|
||||
collection_name: this.normalize(namespace),
|
||||
});
|
||||
}
|
||||
},
|
||||
addDocumentToNamespace: async function (
|
||||
}
|
||||
|
||||
async addDocumentToNamespace(
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
@ -146,7 +164,7 @@ const Milvus = {
|
||||
const { pageContent, docId, ...metadata } = documentData;
|
||||
if (!pageContent || pageContent.length == 0) return false;
|
||||
|
||||
console.log("Adding new vectorized document into namespace", namespace);
|
||||
this.logger("Adding new vectorized document into namespace", namespace);
|
||||
if (!skipCache) {
|
||||
const cacheResult = await cachedVectorInformation(fullFilePath);
|
||||
if (cacheResult.exists) {
|
||||
@ -172,7 +190,7 @@ const Milvus = {
|
||||
|
||||
if (insertResult?.status.error_code !== "Success") {
|
||||
throw new Error(
|
||||
`Error embedding into Milvus! Reason:${insertResult?.status.reason}`
|
||||
`Error embedding into ${this.name}! Reason:${insertResult?.status.reason}`
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -208,7 +226,7 @@ const Milvus = {
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
console.log("Snippets created from document:", textChunks.length);
|
||||
this.logger("Snippets created from document:", textChunks.length);
|
||||
const documentVectors = [];
|
||||
const vectors = [];
|
||||
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
|
||||
@ -238,7 +256,7 @@ const Milvus = {
|
||||
const { client } = await this.connect();
|
||||
await this.getOrCreateCollection(client, namespace, vectorDimension);
|
||||
|
||||
console.log("Inserting vectorized chunks into Milvus.");
|
||||
this.logger(`Inserting vectorized chunks into ${this.name}.`);
|
||||
for (const chunk of toChunks(vectors, 100)) {
|
||||
chunks.push(chunk);
|
||||
const insertResult = await client.insert({
|
||||
@ -252,7 +270,7 @@ const Milvus = {
|
||||
|
||||
if (insertResult?.status.error_code !== "Success") {
|
||||
throw new Error(
|
||||
`Error embedding into Milvus! Reason:${insertResult?.status.reason}`
|
||||
`Error embedding into ${this.name}! Reason:${insertResult?.status.reason}`
|
||||
);
|
||||
}
|
||||
}
|
||||
@ -265,11 +283,12 @@ const Milvus = {
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (e) {
|
||||
console.error("addDocumentToNamespace", e.message);
|
||||
this.logger("addDocumentToNamespace", e.message);
|
||||
return { vectorized: false, error: e.message };
|
||||
}
|
||||
},
|
||||
deleteDocumentFromNamespace: async function (namespace, docId) {
|
||||
}
|
||||
|
||||
async deleteDocumentFromNamespace(namespace, docId) {
|
||||
const { DocumentVectors } = require("../../../models/vectors");
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace))) return;
|
||||
@ -291,8 +310,9 @@ const Milvus = {
|
||||
// on a later call.
|
||||
await client.flushSync({ collection_names: [this.normalize(namespace)] });
|
||||
return true;
|
||||
},
|
||||
performSimilaritySearch: async function ({
|
||||
}
|
||||
|
||||
async performSimilaritySearch({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
@ -331,8 +351,9 @@ const Milvus = {
|
||||
sources: this.curateSources(sources),
|
||||
message: false,
|
||||
};
|
||||
},
|
||||
similarityResponse: async function ({
|
||||
}
|
||||
|
||||
async similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
@ -353,8 +374,8 @@ const Milvus = {
|
||||
response.results.forEach((match) => {
|
||||
if (match.score < similarityThreshold) return;
|
||||
if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
|
||||
console.log(
|
||||
"Milvus: A source was filtered from context as it's parent document is pinned."
|
||||
this.logger(
|
||||
`${this.name}: A source was filtered from context as its parent document is pinned.`
|
||||
);
|
||||
return;
|
||||
}
|
||||
@ -367,8 +388,9 @@ const Milvus = {
|
||||
result.scores.push(match.score);
|
||||
});
|
||||
return result;
|
||||
},
|
||||
"namespace-stats": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "namespace-stats"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("namespace required");
|
||||
const { client } = await this.connect();
|
||||
@ -378,8 +400,9 @@ const Milvus = {
|
||||
return stats
|
||||
? stats
|
||||
: { message: "No stats were able to be fetched from DB for namespace" };
|
||||
},
|
||||
"delete-namespace": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "delete-namespace"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace)))
|
||||
@ -391,8 +414,9 @@ const Milvus = {
|
||||
return {
|
||||
message: `Namespace ${namespace} was deleted along with ${vectorCount} vectors.`,
|
||||
};
|
||||
},
|
||||
curateSources: function (sources = []) {
|
||||
}
|
||||
|
||||
curateSources(sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
const { metadata = {} } = source;
|
||||
@ -404,7 +428,7 @@ const Milvus = {
|
||||
}
|
||||
}
|
||||
return documents;
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.Milvus = Milvus;
|
||||
|
||||
@ -3,6 +3,7 @@ const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
|
||||
const { TextSplitter } = require("../../TextSplitter");
|
||||
const { v4: uuidv4 } = require("uuid");
|
||||
const { sourceIdentifier } = require("../../chats");
|
||||
const { VectorDatabase } = require("../base");
|
||||
|
||||
/*
|
||||
Embedding Table Schema (table name defined by user)
|
||||
@ -13,44 +14,53 @@ const { sourceIdentifier } = require("../../chats");
|
||||
- created_at: TIMESTAMP
|
||||
*/
|
||||
|
||||
const PGVector = {
|
||||
name: "PGVector",
|
||||
connectionTimeout: 30_000,
|
||||
/**
|
||||
* Get the table name for the PGVector database.
|
||||
* - Defaults to "anythingllm_vectors" if no table name is provided.
|
||||
* @returns {string}
|
||||
*/
|
||||
tableName: () => process.env.PGVECTOR_TABLE_NAME || "anythingllm_vectors",
|
||||
class PGVector extends VectorDatabase {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the connection string for the PGVector database.
|
||||
* - Requires a connection string to be present in the environment variables.
|
||||
* @returns {string | null}
|
||||
*/
|
||||
connectionString: () => process.env.PGVECTOR_CONNECTION_STRING,
|
||||
get name() {
|
||||
return "PGVector";
|
||||
}
|
||||
|
||||
connectionTimeout = 30_000;
|
||||
// Possible for this to be a user-configurable option in the future.
|
||||
// Will require a handler per operator to ensure scores are normalized.
|
||||
operator: {
|
||||
operator = {
|
||||
l2: "<->",
|
||||
innerProduct: "<#>",
|
||||
cosine: "<=>",
|
||||
l1: "<+>",
|
||||
hamming: "<~>",
|
||||
jaccard: "<%>",
|
||||
},
|
||||
getTablesSql:
|
||||
"SELECT * FROM pg_catalog.pg_tables WHERE schemaname = 'public'",
|
||||
getEmbeddingTableSchemaSql:
|
||||
"SELECT column_name,data_type FROM information_schema.columns WHERE table_name = $1",
|
||||
createExtensionSql: "CREATE EXTENSION IF NOT EXISTS vector;",
|
||||
createTableSql: (dimensions) =>
|
||||
`CREATE TABLE IF NOT EXISTS "${PGVector.tableName()}" (id UUID PRIMARY KEY, namespace TEXT, embedding vector(${Number(dimensions)}), metadata JSONB, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)`,
|
||||
};
|
||||
getTablesSql =
|
||||
"SELECT * FROM pg_catalog.pg_tables WHERE schemaname = 'public'";
|
||||
getEmbeddingTableSchemaSql =
|
||||
"SELECT column_name,data_type FROM information_schema.columns WHERE table_name = $1";
|
||||
createExtensionSql = "CREATE EXTENSION IF NOT EXISTS vector;";
|
||||
|
||||
log: function (message = null, ...args) {
|
||||
console.log(`\x1b[35m[PGVectorDb]\x1b[0m ${message}`, ...args);
|
||||
},
|
||||
/**
|
||||
* Get the table name for the PGVector database.
|
||||
* - Defaults to "anythingllm_vectors" if no table name is provided.
|
||||
* @returns {string}
|
||||
*/
|
||||
static tableName() {
|
||||
return process.env.PGVECTOR_TABLE_NAME || "anythingllm_vectors";
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the connection string for the PGVector database.
|
||||
* - Requires a connection string to be present in the environment variables.
|
||||
* @returns {string | null}
|
||||
*/
|
||||
static connectionString() {
|
||||
return process.env.PGVECTOR_CONNECTION_STRING;
|
||||
}
|
||||
|
||||
createTableSql(dimensions) {
|
||||
return `CREATE TABLE IF NOT EXISTS "${PGVector.tableName()}" (id UUID PRIMARY KEY, namespace TEXT, embedding vector(${Number(dimensions)}), metadata JSONB, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively sanitize values intended for JSONB to prevent Postgres errors
|
||||
@ -60,7 +70,7 @@ const PGVector = {
|
||||
* @param {any} value
|
||||
* @returns {any}
|
||||
*/
|
||||
sanitizeForJsonb: function (value) {
|
||||
sanitizeForJsonb(value) {
|
||||
// Fast path for null/undefined and primitives that do not need changes
|
||||
if (value === null || value === undefined) return value;
|
||||
|
||||
@ -99,13 +109,13 @@ const PGVector = {
|
||||
|
||||
// Numbers, booleans, etc.
|
||||
return value;
|
||||
},
|
||||
}
|
||||
|
||||
client: function (connectionString = null) {
|
||||
client(connectionString = null) {
|
||||
return new pgsql.Client({
|
||||
connectionString: connectionString || PGVector.connectionString(),
|
||||
});
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate the existing embedding table schema.
|
||||
@ -113,7 +123,7 @@ const PGVector = {
|
||||
* @param {string} tableName
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
validateExistingEmbeddingTableSchema: async function (pgClient, tableName) {
|
||||
async validateExistingEmbeddingTableSchema(pgClient, tableName) {
|
||||
const result = await pgClient.query(this.getEmbeddingTableSchemaSql, [
|
||||
tableName,
|
||||
]);
|
||||
@ -178,11 +188,11 @@ const PGVector = {
|
||||
);
|
||||
}
|
||||
|
||||
this.log(
|
||||
this.logger(
|
||||
`✅ The pgvector table '${tableName}' was found and meets the minimum expected schema for an embedding table.`
|
||||
);
|
||||
return true;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate the connection to the database and verify that the table does not already exist.
|
||||
@ -191,35 +201,36 @@ const PGVector = {
|
||||
* @param {{connectionString: string | null, tableName: string | null}} params
|
||||
* @returns {Promise<{error: string | null, success: boolean}>}
|
||||
*/
|
||||
validateConnection: async function ({
|
||||
static async validateConnection({
|
||||
connectionString = null,
|
||||
tableName = null,
|
||||
}) {
|
||||
if (!connectionString) throw new Error("No connection string provided");
|
||||
const instance = new PGVector();
|
||||
|
||||
try {
|
||||
const timeoutPromise = new Promise((resolve) => {
|
||||
setTimeout(() => {
|
||||
resolve({
|
||||
error: `Connection timeout (${(PGVector.connectionTimeout / 1000).toFixed(0)}s). Please check your connection string and try again.`,
|
||||
error: `Connection timeout (${(instance.connectionTimeout / 1000).toFixed(0)}s). Please check your connection string and try again.`,
|
||||
success: false,
|
||||
});
|
||||
}, PGVector.connectionTimeout);
|
||||
}, instance.connectionTimeout);
|
||||
});
|
||||
|
||||
const connectionPromise = new Promise(async (resolve) => {
|
||||
let pgClient = null;
|
||||
try {
|
||||
pgClient = this.client(connectionString);
|
||||
pgClient = instance.client(connectionString);
|
||||
await pgClient.connect();
|
||||
const result = await pgClient.query(this.getTablesSql);
|
||||
const result = await pgClient.query(instance.getTablesSql);
|
||||
|
||||
if (result.rows.length !== 0 && !!tableName) {
|
||||
const tableExists = result.rows.some(
|
||||
(row) => row.tablename === tableName
|
||||
);
|
||||
if (tableExists)
|
||||
await this.validateExistingEmbeddingTableSchema(
|
||||
await instance.validateExistingEmbeddingTableSchema(
|
||||
pgClient,
|
||||
tableName
|
||||
);
|
||||
@ -236,7 +247,7 @@ const PGVector = {
|
||||
const result = await Promise.race([connectionPromise, timeoutPromise]);
|
||||
return result;
|
||||
} catch (err) {
|
||||
this.log("Validation Error:", err.message);
|
||||
instance.logger("Validation Error:", err.message);
|
||||
let readableError = err.message;
|
||||
switch (true) {
|
||||
case err.message.includes("ECONNREFUSED"):
|
||||
@ -248,13 +259,13 @@ const PGVector = {
|
||||
}
|
||||
return { error: readableError, success: false };
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the connection to the database directly.
|
||||
* @returns {{error: string | null, success: boolean}}
|
||||
*/
|
||||
testConnectionToDB: async function () {
|
||||
async testConnectionToDB() {
|
||||
try {
|
||||
const pgClient = await this.connect();
|
||||
await pgClient.query(this.getTablesSql);
|
||||
@ -263,14 +274,14 @@ const PGVector = {
|
||||
} catch (err) {
|
||||
return { error: err.message, success: false };
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Connect to the database.
|
||||
* - Throws an error if the connection string or table name is not provided.
|
||||
* @returns {Promise<pgsql.Client>}
|
||||
*/
|
||||
connect: async function () {
|
||||
async connect() {
|
||||
if (!PGVector.connectionString())
|
||||
throw new Error("No connection string provided");
|
||||
if (!PGVector.tableName()) throw new Error("No table name provided");
|
||||
@ -278,21 +289,21 @@ const PGVector = {
|
||||
const client = this.client();
|
||||
await client.connect();
|
||||
return client;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the connection to the database with already set credentials via ENV
|
||||
* @returns {{error: string | null, success: boolean}}
|
||||
*/
|
||||
heartbeat: async function () {
|
||||
async heartbeat() {
|
||||
return this.testConnectionToDB();
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the anythingllm embedding table exists in the database
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
dbTableExists: async function () {
|
||||
async dbTableExists() {
|
||||
let connection = null;
|
||||
try {
|
||||
connection = await this.connect();
|
||||
@ -307,9 +318,9 @@ const PGVector = {
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
totalVectors: async function () {
|
||||
async totalVectors() {
|
||||
if (!(await this.dbTableExists())) return 0;
|
||||
let connection = null;
|
||||
try {
|
||||
@ -323,17 +334,17 @@ const PGVector = {
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
// Distance for cosine is just the distance for pgvector.
|
||||
distanceToSimilarity: function (distance = null) {
|
||||
distanceToSimilarity(distance = null) {
|
||||
if (distance === null || typeof distance !== "number") return 0.0;
|
||||
if (distance >= 1.0) return 1;
|
||||
if (distance < 0) return 1 - Math.abs(distance);
|
||||
return 1 - distance;
|
||||
},
|
||||
}
|
||||
|
||||
namespaceCount: async function (namespace = null) {
|
||||
async namespaceCount(namespace = null) {
|
||||
if (!(await this.dbTableExists())) return 0;
|
||||
let connection = null;
|
||||
try {
|
||||
@ -348,7 +359,7 @@ const PGVector = {
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Performs a SimilaritySearch on a given PGVector namespace.
|
||||
@ -361,7 +372,7 @@ const PGVector = {
|
||||
* @param {string[]} params.filterIdentifiers
|
||||
* @returns
|
||||
*/
|
||||
similarityResponse: async function ({
|
||||
async similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
@ -384,7 +395,7 @@ const PGVector = {
|
||||
if (this.distanceToSimilarity(item._distance) < similarityThreshold)
|
||||
return;
|
||||
if (filterIdentifiers.includes(sourceIdentifier(item.metadata))) {
|
||||
this.log(
|
||||
this.logger(
|
||||
"A source was filtered from context as it's parent document is pinned."
|
||||
);
|
||||
return;
|
||||
@ -399,15 +410,15 @@ const PGVector = {
|
||||
});
|
||||
|
||||
return result;
|
||||
},
|
||||
}
|
||||
|
||||
normalizeVector: function (vector) {
|
||||
normalizeVector(vector) {
|
||||
const magnitude = Math.sqrt(
|
||||
vector.reduce((sum, val) => sum + val * val, 0)
|
||||
);
|
||||
if (magnitude === 0) return vector; // Avoid division by zero
|
||||
return vector.map((val) => val / magnitude);
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Update or create a collection in the database
|
||||
@ -418,14 +429,14 @@ const PGVector = {
|
||||
* @param {number} params.dimensions
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
updateOrCreateCollection: async function ({
|
||||
async updateOrCreateCollection({
|
||||
connection,
|
||||
submissions,
|
||||
namespace,
|
||||
dimensions = 384,
|
||||
}) {
|
||||
await this.createTableIfNotExists(connection, dimensions);
|
||||
this.log(`Updating or creating collection ${namespace}`);
|
||||
this.logger(`Updating or creating collection ${namespace}`);
|
||||
|
||||
try {
|
||||
// Create a transaction of all inserts
|
||||
@ -438,17 +449,17 @@ const PGVector = {
|
||||
[submission.id, namespace, embedding, sanitizedMetadata]
|
||||
);
|
||||
}
|
||||
this.log(`Committing ${submissions.length} vectors to ${namespace}`);
|
||||
this.logger(`Committing ${submissions.length} vectors to ${namespace}`);
|
||||
await connection.query(`COMMIT`);
|
||||
} catch (err) {
|
||||
this.log(
|
||||
this.logger(
|
||||
`Rolling back ${submissions.length} vectors to ${namespace}`,
|
||||
err
|
||||
);
|
||||
await connection.query(`ROLLBACK`);
|
||||
}
|
||||
return true;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* create a table if it doesn't exist
|
||||
@ -456,12 +467,12 @@ const PGVector = {
|
||||
* @param {number} dimensions
|
||||
* @returns
|
||||
*/
|
||||
createTableIfNotExists: async function (connection, dimensions = 384) {
|
||||
this.log(`Creating embedding table with ${dimensions} dimensions`);
|
||||
async createTableIfNotExists(connection, dimensions = 384) {
|
||||
this.logger(`Creating embedding table with ${dimensions} dimensions`);
|
||||
await connection.query(this.createExtensionSql);
|
||||
await connection.query(this.createTableSql(dimensions));
|
||||
return true;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the namespace from the database
|
||||
@ -469,21 +480,21 @@ const PGVector = {
|
||||
* @param {string} namespace
|
||||
* @returns {Promise<{name: string, vectorCount: number}>}
|
||||
*/
|
||||
namespace: async function (connection, namespace = null) {
|
||||
async namespace(connection, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace provided");
|
||||
const result = await connection.query(
|
||||
`SELECT COUNT(id) FROM "${PGVector.tableName()}" WHERE namespace = $1`,
|
||||
[namespace]
|
||||
);
|
||||
return { name: namespace, vectorCount: result.rows[0].count };
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the namespace exists in the database
|
||||
* @param {string} namespace
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
hasNamespace: async function (namespace = null) {
|
||||
async hasNamespace(namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace provided");
|
||||
let connection = null;
|
||||
try {
|
||||
@ -494,7 +505,7 @@ const PGVector = {
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the namespace exists in the database
|
||||
@ -502,14 +513,14 @@ const PGVector = {
|
||||
* @param {string} namespace
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
namespaceExists: async function (connection, namespace = null) {
|
||||
async namespaceExists(connection, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace provided");
|
||||
const result = await connection.query(
|
||||
`SELECT COUNT(id) FROM "${PGVector.tableName()}" WHERE namespace = $1 LIMIT 1`,
|
||||
[namespace]
|
||||
);
|
||||
return result.rows[0].count > 0;
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete all vectors in the namespace
|
||||
@ -517,16 +528,16 @@ const PGVector = {
|
||||
* @param {string} namespace
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
deleteVectorsInNamespace: async function (connection, namespace = null) {
|
||||
async deleteVectorsInNamespace(connection, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace provided");
|
||||
await connection.query(
|
||||
`DELETE FROM "${PGVector.tableName()}" WHERE namespace = $1`,
|
||||
[namespace]
|
||||
);
|
||||
return true;
|
||||
},
|
||||
}
|
||||
|
||||
addDocumentToNamespace: async function (
|
||||
async addDocumentToNamespace(
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
@ -544,7 +555,7 @@ const PGVector = {
|
||||
if (!pageContent || pageContent.length == 0) return false;
|
||||
connection = await this.connect();
|
||||
|
||||
this.log("Adding new vectorized document into namespace", namespace);
|
||||
this.logger("Adding new vectorized document into namespace", namespace);
|
||||
if (!skipCache) {
|
||||
const cacheResult = await cachedVectorInformation(fullFilePath);
|
||||
let vectorDimensions;
|
||||
@ -594,7 +605,7 @@ const PGVector = {
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
this.log("Snippets created from document:", textChunks.length);
|
||||
this.logger("Snippets created from document:", textChunks.length);
|
||||
const documentVectors = [];
|
||||
const vectors = [];
|
||||
const submissions = [];
|
||||
@ -628,7 +639,7 @@ const PGVector = {
|
||||
const chunks = [];
|
||||
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
|
||||
|
||||
this.log("Inserting vectorized chunks into PGVector collection.");
|
||||
this.logger("Inserting vectorized chunks into PGVector collection.");
|
||||
await this.updateOrCreateCollection({
|
||||
connection,
|
||||
submissions,
|
||||
@ -641,12 +652,12 @@ const PGVector = {
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (err) {
|
||||
this.log("addDocumentToNamespace", err.message);
|
||||
this.logger("addDocumentToNamespace", err.message);
|
||||
return { vectorized: false, error: err.message };
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a document from the namespace
|
||||
@ -654,7 +665,7 @@ const PGVector = {
|
||||
* @param {string} docId
|
||||
* @returns {Promise<boolean>}
|
||||
*/
|
||||
deleteDocumentFromNamespace: async function (namespace, docId) {
|
||||
async deleteDocumentFromNamespace(namespace, docId) {
|
||||
if (!namespace) throw new Error("No namespace provided");
|
||||
if (!docId) throw new Error("No docId provided");
|
||||
|
||||
@ -686,21 +697,21 @@ const PGVector = {
|
||||
throw err;
|
||||
}
|
||||
|
||||
this.log(
|
||||
this.logger(
|
||||
`Deleted ${vectorIds.length} vectors from namespace ${namespace}`
|
||||
);
|
||||
return true;
|
||||
} catch (err) {
|
||||
this.log(
|
||||
this.logger(
|
||||
`Error deleting document from namespace ${namespace}: ${err.message}`
|
||||
);
|
||||
return false;
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
performSimilaritySearch: async function ({
|
||||
async performSimilaritySearch({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
@ -716,7 +727,7 @@ const PGVector = {
|
||||
connection = await this.connect();
|
||||
const exists = await this.namespaceExists(connection, namespace);
|
||||
if (!exists) {
|
||||
this.log(
|
||||
this.logger(
|
||||
`The namespace ${namespace} does not exist or has no vectors. Returning empty results.`
|
||||
);
|
||||
return {
|
||||
@ -750,9 +761,9 @@ const PGVector = {
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
"namespace-stats": async function (reqBody = {}) {
|
||||
async "namespace-stats"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("namespace required");
|
||||
if (!(await this.dbTableExists()))
|
||||
@ -774,9 +785,9 @@ const PGVector = {
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
"delete-namespace": async function (reqBody = {}) {
|
||||
async "delete-namespace"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("No namespace provided");
|
||||
|
||||
@ -800,13 +811,13 @@ const PGVector = {
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the entire vector database table associated with anythingllm
|
||||
* @returns {Promise<{reset: boolean}>}
|
||||
*/
|
||||
reset: async function () {
|
||||
async reset() {
|
||||
let connection = null;
|
||||
try {
|
||||
connection = await this.connect();
|
||||
@ -817,9 +828,9 @@ const PGVector = {
|
||||
} finally {
|
||||
if (connection) await connection.end();
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
curateSources: function (sources = []) {
|
||||
curateSources(sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
const { text, vector: _v, _distance: _d, ...rest } = source;
|
||||
@ -833,7 +844,7 @@ const PGVector = {
|
||||
}
|
||||
|
||||
return documents;
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.PGVector = PGVector;
|
||||
|
||||
@ -5,10 +5,18 @@ const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||
const { v4: uuidv4 } = require("uuid");
|
||||
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
|
||||
const { sourceIdentifier } = require("../../chats");
|
||||
const { VectorDatabase } = require("../base");
|
||||
|
||||
const PineconeDB = {
|
||||
name: "Pinecone",
|
||||
connect: async function () {
|
||||
class PineconeDB extends VectorDatabase {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
get name() {
|
||||
return "Pinecone";
|
||||
}
|
||||
|
||||
async connect() {
|
||||
if (process.env.VECTOR_DB !== "pinecone")
|
||||
throw new Error("Pinecone::Invalid ENV settings");
|
||||
|
||||
@ -21,8 +29,9 @@ const PineconeDB = {
|
||||
|
||||
if (!status.ready) throw new Error("Pinecone::Index not ready.");
|
||||
return { client, pineconeIndex, indexName: process.env.PINECONE_INDEX };
|
||||
},
|
||||
totalVectors: async function () {
|
||||
}
|
||||
|
||||
async totalVectors() {
|
||||
const { pineconeIndex } = await this.connect();
|
||||
const { namespaces } = await pineconeIndex.describeIndexStats();
|
||||
|
||||
@ -30,13 +39,15 @@ const PineconeDB = {
|
||||
(a, b) => a + (b?.recordCount || 0),
|
||||
0
|
||||
);
|
||||
},
|
||||
namespaceCount: async function (_namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceCount(_namespace = null) {
|
||||
const { pineconeIndex } = await this.connect();
|
||||
const namespace = await this.namespace(pineconeIndex, _namespace);
|
||||
return namespace?.recordCount || 0;
|
||||
},
|
||||
similarityResponse: async function ({
|
||||
}
|
||||
|
||||
async similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
@ -60,7 +71,7 @@ const PineconeDB = {
|
||||
response.matches.forEach((match) => {
|
||||
if (match.score < similarityThreshold) return;
|
||||
if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
|
||||
console.log(
|
||||
this.logger(
|
||||
"Pinecone: A source was filtered from context as it's parent document is pinned."
|
||||
);
|
||||
return;
|
||||
@ -75,28 +86,33 @@ const PineconeDB = {
|
||||
});
|
||||
|
||||
return result;
|
||||
},
|
||||
namespace: async function (index, namespace = null) {
|
||||
}
|
||||
|
||||
async namespace(index, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const { namespaces } = await index.describeIndexStats();
|
||||
return namespaces.hasOwnProperty(namespace) ? namespaces[namespace] : null;
|
||||
},
|
||||
hasNamespace: async function (namespace = null) {
|
||||
}
|
||||
|
||||
async hasNamespace(namespace = null) {
|
||||
if (!namespace) return false;
|
||||
const { pineconeIndex } = await this.connect();
|
||||
return await this.namespaceExists(pineconeIndex, namespace);
|
||||
},
|
||||
namespaceExists: async function (index, namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceExists(index, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const { namespaces } = await index.describeIndexStats();
|
||||
return namespaces.hasOwnProperty(namespace);
|
||||
},
|
||||
deleteVectorsInNamespace: async function (index, namespace = null) {
|
||||
}
|
||||
|
||||
async deleteVectorsInNamespace(index, namespace = null) {
|
||||
const pineconeNamespace = index.namespace(namespace);
|
||||
await pineconeNamespace.deleteAll();
|
||||
return true;
|
||||
},
|
||||
addDocumentToNamespace: async function (
|
||||
}
|
||||
|
||||
async addDocumentToNamespace(
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
@ -107,7 +123,7 @@ const PineconeDB = {
|
||||
const { pageContent, docId, ...metadata } = documentData;
|
||||
if (!pageContent || pageContent.length == 0) return false;
|
||||
|
||||
console.log("Adding new vectorized document into namespace", namespace);
|
||||
this.logger("Adding new vectorized document into namespace", namespace);
|
||||
if (!skipCache) {
|
||||
const cacheResult = await cachedVectorInformation(fullFilePath);
|
||||
if (cacheResult.exists) {
|
||||
@ -154,7 +170,7 @@ const PineconeDB = {
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
console.log("Snippets created from document:", textChunks.length);
|
||||
this.logger("Snippets created from document:", textChunks.length);
|
||||
const documentVectors = [];
|
||||
const vectors = [];
|
||||
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
|
||||
@ -183,7 +199,7 @@ const PineconeDB = {
|
||||
const chunks = [];
|
||||
const { pineconeIndex } = await this.connect();
|
||||
const pineconeNamespace = pineconeIndex.namespace(namespace);
|
||||
console.log("Inserting vectorized chunks into Pinecone.");
|
||||
this.logger("Inserting vectorized chunks into Pinecone.");
|
||||
for (const chunk of toChunks(vectors, 100)) {
|
||||
chunks.push(chunk);
|
||||
await pineconeNamespace.upsert([...chunk]);
|
||||
@ -194,11 +210,12 @@ const PineconeDB = {
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (e) {
|
||||
console.error("addDocumentToNamespace", e.message);
|
||||
this.logger("addDocumentToNamespace", e.message);
|
||||
return { vectorized: false, error: e.message };
|
||||
}
|
||||
},
|
||||
deleteDocumentFromNamespace: async function (namespace, docId) {
|
||||
}
|
||||
|
||||
async deleteDocumentFromNamespace(namespace, docId) {
|
||||
const { DocumentVectors } = require("../../../models/vectors");
|
||||
const { pineconeIndex } = await this.connect();
|
||||
if (!(await this.namespaceExists(pineconeIndex, namespace))) return;
|
||||
@ -216,8 +233,9 @@ const PineconeDB = {
|
||||
const indexes = knownDocuments.map((doc) => doc.id);
|
||||
await DocumentVectors.deleteIds(indexes);
|
||||
return true;
|
||||
},
|
||||
"namespace-stats": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "namespace-stats"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("namespace required");
|
||||
const { pineconeIndex } = await this.connect();
|
||||
@ -227,8 +245,9 @@ const PineconeDB = {
|
||||
return stats
|
||||
? stats
|
||||
: { message: "No stats were able to be fetched from DB" };
|
||||
},
|
||||
"delete-namespace": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "delete-namespace"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
const { pineconeIndex } = await this.connect();
|
||||
if (!(await this.namespaceExists(pineconeIndex, namespace)))
|
||||
@ -239,8 +258,9 @@ const PineconeDB = {
|
||||
return {
|
||||
message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`,
|
||||
};
|
||||
},
|
||||
performSimilaritySearch: async function ({
|
||||
}
|
||||
|
||||
async performSimilaritySearch({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
@ -275,8 +295,9 @@ const PineconeDB = {
|
||||
sources: this.curateSources(sources),
|
||||
message: false,
|
||||
};
|
||||
},
|
||||
curateSources: function (sources = []) {
|
||||
}
|
||||
|
||||
curateSources(sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
const { metadata = {} } = source;
|
||||
@ -290,7 +311,7 @@ const PineconeDB = {
|
||||
}
|
||||
}
|
||||
return documents;
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.Pinecone = PineconeDB;
|
||||
|
||||
@ -5,10 +5,18 @@ const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||
const { v4: uuidv4 } = require("uuid");
|
||||
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
|
||||
const { sourceIdentifier } = require("../../chats");
|
||||
const { VectorDatabase } = require("../base");
|
||||
|
||||
const QDrant = {
|
||||
name: "QDrant",
|
||||
connect: async function () {
|
||||
class QDrant extends VectorDatabase {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
get name() {
|
||||
return "QDrant";
|
||||
}
|
||||
|
||||
async connect() {
|
||||
if (process.env.VECTOR_DB !== "qdrant")
|
||||
throw new Error("QDrant::Invalid ENV settings");
|
||||
|
||||
@ -26,12 +34,14 @@ const QDrant = {
|
||||
);
|
||||
|
||||
return { client };
|
||||
},
|
||||
heartbeat: async function () {
|
||||
}
|
||||
|
||||
async heartbeat() {
|
||||
await this.connect();
|
||||
return { heartbeat: Number(new Date()) };
|
||||
},
|
||||
totalVectors: async function () {
|
||||
}
|
||||
|
||||
async totalVectors() {
|
||||
const { client } = await this.connect();
|
||||
const { collections } = await client.getCollections();
|
||||
var totalVectors = 0;
|
||||
@ -41,13 +51,15 @@ const QDrant = {
|
||||
(await this.namespace(client, collection.name))?.vectorCount || 0;
|
||||
}
|
||||
return totalVectors;
|
||||
},
|
||||
namespaceCount: async function (_namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceCount(_namespace = null) {
|
||||
const { client } = await this.connect();
|
||||
const namespace = await this.namespace(client, _namespace);
|
||||
return namespace?.vectorCount || 0;
|
||||
},
|
||||
similarityResponse: async function ({
|
||||
}
|
||||
|
||||
async similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
@ -70,7 +82,7 @@ const QDrant = {
|
||||
responses.forEach((response) => {
|
||||
if (response.score < similarityThreshold) return;
|
||||
if (filterIdentifiers.includes(sourceIdentifier(response?.payload))) {
|
||||
console.log(
|
||||
this.logger(
|
||||
"QDrant: A source was filtered from context as it's parent document is pinned."
|
||||
);
|
||||
return;
|
||||
@ -86,8 +98,9 @@ const QDrant = {
|
||||
});
|
||||
|
||||
return result;
|
||||
},
|
||||
namespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespace(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const collection = await client.getCollection(namespace).catch(() => null);
|
||||
if (!collection) return null;
|
||||
@ -97,28 +110,32 @@ const QDrant = {
|
||||
...collection,
|
||||
vectorCount: (await client.count(namespace, { exact: true })).count,
|
||||
};
|
||||
},
|
||||
hasNamespace: async function (namespace = null) {
|
||||
}
|
||||
|
||||
async hasNamespace(namespace = null) {
|
||||
if (!namespace) return false;
|
||||
const { client } = await this.connect();
|
||||
return await this.namespaceExists(client, namespace);
|
||||
},
|
||||
namespaceExists: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceExists(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const collection = await client.getCollection(namespace).catch((e) => {
|
||||
console.error("QDrant::namespaceExists", e.message);
|
||||
this.logger("namespaceExists", e.message);
|
||||
return null;
|
||||
});
|
||||
return !!collection;
|
||||
},
|
||||
deleteVectorsInNamespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async deleteVectorsInNamespace(client, namespace = null) {
|
||||
await client.deleteCollection(namespace);
|
||||
return true;
|
||||
},
|
||||
}
|
||||
|
||||
// QDrant requires a dimension aspect for collection creation
|
||||
// we pass this in from the first chunk to infer the dimensions like other
|
||||
// providers do.
|
||||
getOrCreateCollection: async function (client, namespace, dimensions = null) {
|
||||
async getOrCreateCollection(client, namespace, dimensions = null) {
|
||||
if (await this.namespaceExists(client, namespace)) {
|
||||
return await client.getCollection(namespace);
|
||||
}
|
||||
@ -133,8 +150,9 @@ const QDrant = {
|
||||
},
|
||||
});
|
||||
return await client.getCollection(namespace);
|
||||
},
|
||||
addDocumentToNamespace: async function (
|
||||
}
|
||||
|
||||
async addDocumentToNamespace(
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
@ -146,7 +164,7 @@ const QDrant = {
|
||||
const { pageContent, docId, ...metadata } = documentData;
|
||||
if (!pageContent || pageContent.length == 0) return false;
|
||||
|
||||
console.log("Adding new vectorized document into namespace", namespace);
|
||||
this.logger("Adding new vectorized document into namespace", namespace);
|
||||
if (!skipCache) {
|
||||
const cacheResult = await cachedVectorInformation(fullFilePath);
|
||||
if (cacheResult.exists) {
|
||||
@ -227,7 +245,7 @@ const QDrant = {
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
console.log("Snippets created from document:", textChunks.length);
|
||||
this.logger("Snippets created from document:", textChunks.length);
|
||||
const documentVectors = [];
|
||||
const vectors = [];
|
||||
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
|
||||
@ -276,7 +294,7 @@ const QDrant = {
|
||||
if (vectors.length > 0) {
|
||||
const chunks = [];
|
||||
|
||||
console.log("Inserting vectorized chunks into QDrant collection.");
|
||||
this.logger("Inserting vectorized chunks into QDrant collection.");
|
||||
for (const chunk of toChunks(vectors, 500)) {
|
||||
const batchIds = [],
|
||||
batchVectors = [],
|
||||
@ -306,11 +324,12 @@ const QDrant = {
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (e) {
|
||||
console.error("addDocumentToNamespace", e.message);
|
||||
this.logger("addDocumentToNamespace", e.message);
|
||||
return { vectorized: false, error: e.message };
|
||||
}
|
||||
},
|
||||
deleteDocumentFromNamespace: async function (namespace, docId) {
|
||||
}
|
||||
|
||||
async deleteDocumentFromNamespace(namespace, docId) {
|
||||
const { DocumentVectors } = require("../../../models/vectors");
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace))) return;
|
||||
@ -327,8 +346,9 @@ const QDrant = {
|
||||
const indexes = knownDocuments.map((doc) => doc.id);
|
||||
await DocumentVectors.deleteIds(indexes);
|
||||
return true;
|
||||
},
|
||||
performSimilaritySearch: async function ({
|
||||
}
|
||||
|
||||
async performSimilaritySearch({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
@ -366,8 +386,9 @@ const QDrant = {
|
||||
sources: this.curateSources(sources),
|
||||
message: false,
|
||||
};
|
||||
},
|
||||
"namespace-stats": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "namespace-stats"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("namespace required");
|
||||
const { client } = await this.connect();
|
||||
@ -377,8 +398,9 @@ const QDrant = {
|
||||
return stats
|
||||
? stats
|
||||
: { message: "No stats were able to be fetched from DB for namespace" };
|
||||
},
|
||||
"delete-namespace": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "delete-namespace"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace)))
|
||||
@ -389,16 +411,18 @@ const QDrant = {
|
||||
return {
|
||||
message: `Namespace ${namespace} was deleted along with ${details?.vectorCount} vectors.`,
|
||||
};
|
||||
},
|
||||
reset: async function () {
|
||||
}
|
||||
|
||||
async reset() {
|
||||
const { client } = await this.connect();
|
||||
const response = await client.getCollections();
|
||||
for (const collection of response.collections) {
|
||||
await client.deleteCollection(collection.name);
|
||||
}
|
||||
return { reset: true };
|
||||
},
|
||||
curateSources: function (sources = []) {
|
||||
}
|
||||
|
||||
curateSources(sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
if (Object.keys(source).length > 0) {
|
||||
@ -412,7 +436,7 @@ const QDrant = {
|
||||
}
|
||||
|
||||
return documents;
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.QDrant = QDrant;
|
||||
|
||||
@ -6,10 +6,18 @@ const { v4: uuidv4 } = require("uuid");
|
||||
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
|
||||
const { camelCase } = require("../../helpers/camelcase");
|
||||
const { sourceIdentifier } = require("../../chats");
|
||||
const { VectorDatabase } = require("../base");
|
||||
|
||||
const Weaviate = {
|
||||
name: "Weaviate",
|
||||
connect: async function () {
|
||||
class Weaviate extends VectorDatabase {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
get name() {
|
||||
return "Weaviate";
|
||||
}
|
||||
|
||||
async connect() {
|
||||
if (process.env.VECTOR_DB !== "weaviate")
|
||||
throw new Error("Weaviate::Invalid ENV settings");
|
||||
|
||||
@ -28,12 +36,14 @@ const Weaviate = {
|
||||
"Weaviate::Invalid Alive signal received - is the service online?"
|
||||
);
|
||||
return { client };
|
||||
},
|
||||
heartbeat: async function () {
|
||||
}
|
||||
|
||||
async heartbeat() {
|
||||
await this.connect();
|
||||
return { heartbeat: Number(new Date()) };
|
||||
},
|
||||
totalVectors: async function () {
|
||||
}
|
||||
|
||||
async totalVectors() {
|
||||
const { client } = await this.connect();
|
||||
const collectionNames = await this.allNamespaces(client);
|
||||
var totalVectors = 0;
|
||||
@ -41,8 +51,9 @@ const Weaviate = {
|
||||
totalVectors += await this.namespaceCountWithClient(client, name);
|
||||
}
|
||||
return totalVectors;
|
||||
},
|
||||
namespaceCountWithClient: async function (client, namespace) {
|
||||
}
|
||||
|
||||
async namespaceCountWithClient(client, namespace) {
|
||||
try {
|
||||
const response = await client.graphql
|
||||
.aggregate()
|
||||
@ -53,11 +64,12 @@ const Weaviate = {
|
||||
response?.data?.Aggregate?.[camelCase(namespace)]?.[0]?.meta?.count || 0
|
||||
);
|
||||
} catch (e) {
|
||||
console.error(`Weaviate:namespaceCountWithClient`, e.message);
|
||||
this.logger(`namespaceCountWithClient`, e.message);
|
||||
return 0;
|
||||
}
|
||||
},
|
||||
namespaceCount: async function (namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceCount(namespace = null) {
|
||||
try {
|
||||
const { client } = await this.connect();
|
||||
const response = await client.graphql
|
||||
@ -70,11 +82,12 @@ const Weaviate = {
|
||||
response?.data?.Aggregate?.[camelCase(namespace)]?.[0]?.meta?.count || 0
|
||||
);
|
||||
} catch (e) {
|
||||
console.error(`Weaviate:namespaceCountWithClient`, e.message);
|
||||
this.logger(`namespaceCountWithClient`, e.message);
|
||||
return 0;
|
||||
}
|
||||
},
|
||||
similarityResponse: async function ({
|
||||
}
|
||||
|
||||
async similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
@ -109,8 +122,8 @@ const Weaviate = {
|
||||
} = response;
|
||||
if (certainty < similarityThreshold) return;
|
||||
if (filterIdentifiers.includes(sourceIdentifier(rest))) {
|
||||
console.log(
|
||||
"Weaviate: A source was filtered from context as it's parent document is pinned."
|
||||
this.logger(
|
||||
"A source was filtered from context as it's parent document is pinned."
|
||||
);
|
||||
return;
|
||||
}
|
||||
@ -120,17 +133,19 @@ const Weaviate = {
|
||||
});
|
||||
|
||||
return result;
|
||||
},
|
||||
allNamespaces: async function (client) {
|
||||
}
|
||||
|
||||
async allNamespaces(client) {
|
||||
try {
|
||||
const { classes = [] } = await client.schema.getter().do();
|
||||
return classes.map((classObj) => classObj.class);
|
||||
} catch (e) {
|
||||
console.error("Weaviate::AllNamespace", e);
|
||||
this.logger("AllNamespace", e);
|
||||
return [];
|
||||
}
|
||||
},
|
||||
namespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespace(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
if (!(await this.namespaceExists(client, namespace))) return null;
|
||||
|
||||
@ -143,8 +158,9 @@ const Weaviate = {
|
||||
...weaviateClass,
|
||||
vectorCount: await this.namespaceCount(namespace),
|
||||
};
|
||||
},
|
||||
addVectors: async function (client, vectors = []) {
|
||||
}
|
||||
|
||||
async addVectors(client, vectors = []) {
|
||||
const response = { success: true, errors: new Set([]) };
|
||||
const results = await client.batch
|
||||
.objectsBatcher()
|
||||
@ -160,23 +176,27 @@ const Weaviate = {
|
||||
|
||||
response.errors = [...response.errors];
|
||||
return response;
|
||||
},
|
||||
hasNamespace: async function (namespace = null) {
|
||||
}
|
||||
|
||||
async hasNamespace(namespace = null) {
|
||||
if (!namespace) return false;
|
||||
const { client } = await this.connect();
|
||||
const weaviateClasses = await this.allNamespaces(client);
|
||||
return weaviateClasses.includes(camelCase(namespace));
|
||||
},
|
||||
namespaceExists: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async namespaceExists(client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const weaviateClasses = await this.allNamespaces(client);
|
||||
return weaviateClasses.includes(camelCase(namespace));
|
||||
},
|
||||
deleteVectorsInNamespace: async function (client, namespace = null) {
|
||||
}
|
||||
|
||||
async deleteVectorsInNamespace(client, namespace = null) {
|
||||
await client.schema.classDeleter().withClassName(camelCase(namespace)).do();
|
||||
return true;
|
||||
},
|
||||
addDocumentToNamespace: async function (
|
||||
}
|
||||
|
||||
async addDocumentToNamespace(
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
@ -192,7 +212,7 @@ const Weaviate = {
|
||||
} = documentData;
|
||||
if (!pageContent || pageContent.length == 0) return false;
|
||||
|
||||
console.log("Adding new vectorized document into namespace", namespace);
|
||||
this.logger("Adding new vectorized document into namespace", namespace);
|
||||
if (!skipCache) {
|
||||
const cacheResult = await cachedVectorInformation(fullFilePath);
|
||||
if (cacheResult.exists) {
|
||||
@ -236,7 +256,7 @@ const Weaviate = {
|
||||
const { success: additionResult, errors = [] } =
|
||||
await this.addVectors(client, vectors);
|
||||
if (!additionResult) {
|
||||
console.error("Weaviate::addVectors failed to insert", errors);
|
||||
this.logger("addVectors failed to insert", errors);
|
||||
throw new Error("Error embedding into Weaviate");
|
||||
}
|
||||
}
|
||||
@ -267,7 +287,7 @@ const Weaviate = {
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
console.log("Snippets created from document:", textChunks.length);
|
||||
this.logger("Snippets created from document:", textChunks.length);
|
||||
const documentVectors = [];
|
||||
const vectors = [];
|
||||
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
|
||||
@ -322,13 +342,13 @@ const Weaviate = {
|
||||
const chunks = [];
|
||||
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
|
||||
|
||||
console.log("Inserting vectorized chunks into Weaviate collection.");
|
||||
this.logger("Inserting vectorized chunks into Weaviate collection.");
|
||||
const { success: additionResult, errors = [] } = await this.addVectors(
|
||||
client,
|
||||
vectors
|
||||
);
|
||||
if (!additionResult) {
|
||||
console.error("Weaviate::addVectors failed to insert", errors);
|
||||
this.logger("addVectors failed to insert", errors);
|
||||
throw new Error("Error embedding into Weaviate");
|
||||
}
|
||||
await storeVectorResult(chunks, fullFilePath);
|
||||
@ -337,11 +357,12 @@ const Weaviate = {
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (e) {
|
||||
console.error("addDocumentToNamespace", e.message);
|
||||
this.logger("addDocumentToNamespace", e.message);
|
||||
return { vectorized: false, error: e.message };
|
||||
}
|
||||
},
|
||||
deleteDocumentFromNamespace: async function (namespace, docId) {
|
||||
}
|
||||
|
||||
async deleteDocumentFromNamespace(namespace, docId) {
|
||||
const { DocumentVectors } = require("../../../models/vectors");
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace))) return;
|
||||
@ -360,8 +381,9 @@ const Weaviate = {
|
||||
const indexes = knownDocuments.map((doc) => doc.id);
|
||||
await DocumentVectors.deleteIds(indexes);
|
||||
return true;
|
||||
},
|
||||
performSimilaritySearch: async function ({
|
||||
}
|
||||
|
||||
async performSimilaritySearch({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
@ -399,8 +421,9 @@ const Weaviate = {
|
||||
sources: this.curateSources(sources),
|
||||
message: false,
|
||||
};
|
||||
},
|
||||
"namespace-stats": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "namespace-stats"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("namespace required");
|
||||
const { client } = await this.connect();
|
||||
@ -408,8 +431,9 @@ const Weaviate = {
|
||||
return stats
|
||||
? stats
|
||||
: { message: "No stats were able to be fetched from DB for namespace" };
|
||||
},
|
||||
"delete-namespace": async function (reqBody = {}) {
|
||||
}
|
||||
|
||||
async "delete-namespace"(reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
const { client } = await this.connect();
|
||||
const details = await this.namespace(client, namespace);
|
||||
@ -419,16 +443,18 @@ const Weaviate = {
|
||||
details?.vectorCount
|
||||
} vectors.`,
|
||||
};
|
||||
},
|
||||
reset: async function () {
|
||||
}
|
||||
|
||||
async reset() {
|
||||
const { client } = await this.connect();
|
||||
const weaviateClasses = await this.allNamespaces(client);
|
||||
for (const weaviateClass of weaviateClasses) {
|
||||
await client.schema.classDeleter().withClassName(weaviateClass).do();
|
||||
}
|
||||
return { reset: true };
|
||||
},
|
||||
curateSources: function (sources = []) {
|
||||
}
|
||||
|
||||
curateSources(sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
if (Object.keys(source).length > 0) {
|
||||
@ -440,8 +466,9 @@ const Weaviate = {
|
||||
}
|
||||
|
||||
return documents;
|
||||
},
|
||||
flattenObjectForWeaviate: function (obj = {}) {
|
||||
}
|
||||
|
||||
flattenObjectForWeaviate(obj = {}) {
|
||||
// Note this function is not generic, it is designed specifically for Weaviate
|
||||
// https://weaviate.io/developers/weaviate/config-refs/datatypes#introduction
|
||||
// Credit to LangchainJS
|
||||
@ -478,7 +505,7 @@ const Weaviate = {
|
||||
}
|
||||
|
||||
return flattenedObject;
|
||||
},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.Weaviate = Weaviate;
|
||||
|
||||
@ -1,33 +1,22 @@
|
||||
const {
|
||||
DataType,
|
||||
MetricType,
|
||||
IndexType,
|
||||
MilvusClient,
|
||||
} = require("@zilliz/milvus2-sdk-node");
|
||||
const { TextSplitter } = require("../../TextSplitter");
|
||||
const { SystemSettings } = require("../../../models/systemSettings");
|
||||
const { v4: uuidv4 } = require("uuid");
|
||||
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||
const { toChunks, getEmbeddingEngineSelection } = require("../../helpers");
|
||||
const { sourceIdentifier } = require("../../chats");
|
||||
const { MilvusClient } = require("@zilliz/milvus2-sdk-node");
|
||||
const { Milvus } = require("../milvus");
|
||||
|
||||
// Zilliz is basically a copy of Milvus DB class with a different constructor
|
||||
// to connect to the cloud
|
||||
const Zilliz = {
|
||||
name: "Zilliz",
|
||||
// Milvus/Zilliz only allows letters, numbers, and underscores in collection names
|
||||
// so we need to enforce that by re-normalizing the names when communicating with
|
||||
// the DB.
|
||||
// If the first char of the collection is not an underscore or letter the collection name will be invalid.
|
||||
normalize: function (inputString) {
|
||||
let normalized = inputString.replace(/[^a-zA-Z0-9_]/g, "_");
|
||||
if (new RegExp(/^[a-zA-Z_]/).test(normalized.slice(0, 1)))
|
||||
normalized = `anythingllm_${normalized}`;
|
||||
return normalized;
|
||||
},
|
||||
connect: async function () {
|
||||
/**
|
||||
* Zilliz is the cloud version of Milvus so we can just extend the
|
||||
* Milvus class and override the connect method
|
||||
*/
|
||||
class Zilliz extends Milvus {
|
||||
constructor() {
|
||||
super();
|
||||
}
|
||||
|
||||
get name() {
|
||||
return "Zilliz";
|
||||
}
|
||||
|
||||
async connect() {
|
||||
if (process.env.VECTOR_DB !== "zilliz")
|
||||
throw new Error("Zilliz::Invalid ENV settings");
|
||||
throw new Error(`${this.name}::Invalid ENV settings`);
|
||||
|
||||
const client = new MilvusClient({
|
||||
address: process.env.ZILLIZ_ENDPOINT,
|
||||
@ -37,366 +26,11 @@ const Zilliz = {
|
||||
const { isHealthy } = await client.checkHealth();
|
||||
if (!isHealthy)
|
||||
throw new Error(
|
||||
"Zilliz::Invalid Heartbeat received - is the instance online?"
|
||||
`${this.name}::Invalid Heartbeat received - is the instance online?`
|
||||
);
|
||||
|
||||
return { client };
|
||||
},
|
||||
heartbeat: async function () {
|
||||
await this.connect();
|
||||
return { heartbeat: Number(new Date()) };
|
||||
},
|
||||
totalVectors: async function () {
|
||||
const { client } = await this.connect();
|
||||
const { collection_names } = await client.listCollections();
|
||||
const total = collection_names.reduce(async (acc, collection_name) => {
|
||||
const statistics = await client.getCollectionStatistics({
|
||||
collection_name: this.normalize(collection_name),
|
||||
});
|
||||
return Number(acc) + Number(statistics?.data?.row_count ?? 0);
|
||||
}, 0);
|
||||
return total;
|
||||
},
|
||||
namespaceCount: async function (_namespace = null) {
|
||||
const { client } = await this.connect();
|
||||
const statistics = await client.getCollectionStatistics({
|
||||
collection_name: this.normalize(_namespace),
|
||||
});
|
||||
return Number(statistics?.data?.row_count ?? 0);
|
||||
},
|
||||
namespace: async function (client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const collection = await client
|
||||
.getCollectionStatistics({ collection_name: this.normalize(namespace) })
|
||||
.catch(() => null);
|
||||
return collection;
|
||||
},
|
||||
hasNamespace: async function (namespace = null) {
|
||||
if (!namespace) return false;
|
||||
const { client } = await this.connect();
|
||||
return await this.namespaceExists(client, namespace);
|
||||
},
|
||||
namespaceExists: async function (client, namespace = null) {
|
||||
if (!namespace) throw new Error("No namespace value provided.");
|
||||
const { value } = await client
|
||||
.hasCollection({ collection_name: this.normalize(namespace) })
|
||||
.catch((e) => {
|
||||
console.error("Zilliz::namespaceExists", e.message);
|
||||
return { value: false };
|
||||
});
|
||||
return value;
|
||||
},
|
||||
deleteVectorsInNamespace: async function (client, namespace = null) {
|
||||
await client.dropCollection({ collection_name: this.normalize(namespace) });
|
||||
return true;
|
||||
},
|
||||
// Zilliz requires a dimension aspect for collection creation
|
||||
// we pass this in from the first chunk to infer the dimensions like other
|
||||
// providers do.
|
||||
getOrCreateCollection: async function (client, namespace, dimensions = null) {
|
||||
const isExists = await this.namespaceExists(client, namespace);
|
||||
if (!isExists) {
|
||||
if (!dimensions)
|
||||
throw new Error(
|
||||
`Zilliz:getOrCreateCollection Unable to infer vector dimension from input. Open an issue on GitHub for support.`
|
||||
);
|
||||
|
||||
await client.createCollection({
|
||||
collection_name: this.normalize(namespace),
|
||||
fields: [
|
||||
{
|
||||
name: "id",
|
||||
description: "id",
|
||||
data_type: DataType.VarChar,
|
||||
max_length: 255,
|
||||
is_primary_key: true,
|
||||
},
|
||||
{
|
||||
name: "vector",
|
||||
description: "vector",
|
||||
data_type: DataType.FloatVector,
|
||||
dim: dimensions,
|
||||
},
|
||||
{
|
||||
name: "metadata",
|
||||
description: "metadata",
|
||||
data_type: DataType.JSON,
|
||||
},
|
||||
],
|
||||
});
|
||||
await client.createIndex({
|
||||
collection_name: this.normalize(namespace),
|
||||
field_name: "vector",
|
||||
index_type: IndexType.AUTOINDEX,
|
||||
metric_type: MetricType.COSINE,
|
||||
});
|
||||
await client.loadCollectionSync({
|
||||
collection_name: this.normalize(namespace),
|
||||
});
|
||||
}
|
||||
},
|
||||
addDocumentToNamespace: async function (
|
||||
namespace,
|
||||
documentData = {},
|
||||
fullFilePath = null,
|
||||
skipCache = false
|
||||
) {
|
||||
const { DocumentVectors } = require("../../../models/vectors");
|
||||
try {
|
||||
let vectorDimension = null;
|
||||
const { pageContent, docId, ...metadata } = documentData;
|
||||
if (!pageContent || pageContent.length == 0) return false;
|
||||
|
||||
console.log("Adding new vectorized document into namespace", namespace);
|
||||
if (!skipCache) {
|
||||
const cacheResult = await cachedVectorInformation(fullFilePath);
|
||||
if (cacheResult.exists) {
|
||||
const { client } = await this.connect();
|
||||
const { chunks } = cacheResult;
|
||||
const documentVectors = [];
|
||||
vectorDimension = chunks[0][0].values.length || null;
|
||||
|
||||
await this.getOrCreateCollection(client, namespace, vectorDimension);
|
||||
for (const chunk of chunks) {
|
||||
// Before sending to Pinecone and saving the records to our db
|
||||
// we need to assign the id of each chunk that is stored in the cached file.
|
||||
const newChunks = chunk.map((chunk) => {
|
||||
const id = uuidv4();
|
||||
documentVectors.push({ docId, vectorId: id });
|
||||
return { id, vector: chunk.values, metadata: chunk.metadata };
|
||||
});
|
||||
const insertResult = await client.insert({
|
||||
collection_name: this.normalize(namespace),
|
||||
data: newChunks,
|
||||
});
|
||||
|
||||
if (insertResult?.status.error_code !== "Success") {
|
||||
throw new Error(
|
||||
`Error embedding into Zilliz! Reason:${insertResult?.status.reason}`
|
||||
);
|
||||
}
|
||||
}
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
await client.flushSync({
|
||||
collection_names: [this.normalize(namespace)],
|
||||
});
|
||||
return { vectorized: true, error: null };
|
||||
}
|
||||
}
|
||||
|
||||
const EmbedderEngine = getEmbeddingEngineSelection();
|
||||
const textSplitter = new TextSplitter({
|
||||
chunkSize: TextSplitter.determineMaxChunkSize(
|
||||
await SystemSettings.getValueOrFallback({
|
||||
label: "text_splitter_chunk_size",
|
||||
}),
|
||||
EmbedderEngine?.embeddingMaxChunkLength
|
||||
),
|
||||
chunkOverlap: await SystemSettings.getValueOrFallback(
|
||||
{ label: "text_splitter_chunk_overlap" },
|
||||
20
|
||||
),
|
||||
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
|
||||
chunkPrefix: EmbedderEngine?.embeddingPrefix,
|
||||
});
|
||||
const textChunks = await textSplitter.splitText(pageContent);
|
||||
|
||||
console.log("Snippets created from document:", textChunks.length);
|
||||
const documentVectors = [];
|
||||
const vectors = [];
|
||||
const vectorValues = await EmbedderEngine.embedChunks(textChunks);
|
||||
|
||||
if (!!vectorValues && vectorValues.length > 0) {
|
||||
for (const [i, vector] of vectorValues.entries()) {
|
||||
if (!vectorDimension) vectorDimension = vector.length;
|
||||
const vectorRecord = {
|
||||
id: uuidv4(),
|
||||
values: vector,
|
||||
// [DO NOT REMOVE]
|
||||
// LangChain will be unable to find your text if you embed manually and dont include the `text` key.
|
||||
metadata: { ...metadata, text: textChunks[i] },
|
||||
};
|
||||
|
||||
vectors.push(vectorRecord);
|
||||
documentVectors.push({ docId, vectorId: vectorRecord.id });
|
||||
}
|
||||
} else {
|
||||
throw new Error(
|
||||
"Could not embed document chunks! This document will not be recorded."
|
||||
);
|
||||
}
|
||||
|
||||
if (vectors.length > 0) {
|
||||
const chunks = [];
|
||||
const { client } = await this.connect();
|
||||
await this.getOrCreateCollection(client, namespace, vectorDimension);
|
||||
|
||||
console.log("Inserting vectorized chunks into Zilliz.");
|
||||
for (const chunk of toChunks(vectors, 100)) {
|
||||
chunks.push(chunk);
|
||||
const insertResult = await client.insert({
|
||||
collection_name: this.normalize(namespace),
|
||||
data: chunk.map((item) => ({
|
||||
id: item.id,
|
||||
vector: item.values,
|
||||
metadata: item.metadata,
|
||||
})),
|
||||
});
|
||||
|
||||
if (insertResult?.status.error_code !== "Success") {
|
||||
throw new Error(
|
||||
`Error embedding into Zilliz! Reason:${insertResult?.status.reason}`
|
||||
);
|
||||
}
|
||||
}
|
||||
await storeVectorResult(chunks, fullFilePath);
|
||||
await client.flushSync({
|
||||
collection_names: [this.normalize(namespace)],
|
||||
});
|
||||
}
|
||||
|
||||
await DocumentVectors.bulkInsert(documentVectors);
|
||||
return { vectorized: true, error: null };
|
||||
} catch (e) {
|
||||
console.error("addDocumentToNamespace", e.message);
|
||||
return { vectorized: false, error: e.message };
|
||||
}
|
||||
},
|
||||
deleteDocumentFromNamespace: async function (namespace, docId) {
|
||||
const { DocumentVectors } = require("../../../models/vectors");
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace))) return;
|
||||
const knownDocuments = await DocumentVectors.where({ docId });
|
||||
if (knownDocuments.length === 0) return;
|
||||
|
||||
const vectorIds = knownDocuments.map((doc) => doc.vectorId);
|
||||
const queryIn = vectorIds.map((v) => `'${v}'`).join(",");
|
||||
await client.deleteEntities({
|
||||
collection_name: this.normalize(namespace),
|
||||
expr: `id in [${queryIn}]`,
|
||||
});
|
||||
|
||||
const indexes = knownDocuments.map((doc) => doc.id);
|
||||
await DocumentVectors.deleteIds(indexes);
|
||||
|
||||
// Even after flushing Zilliz can take some time to re-calc the count
|
||||
// so all we can hope to do is flushSync so that the count can be correct
|
||||
// on a later call.
|
||||
await client.flushSync({ collection_names: [this.normalize(namespace)] });
|
||||
return true;
|
||||
},
|
||||
performSimilaritySearch: async function ({
|
||||
namespace = null,
|
||||
input = "",
|
||||
LLMConnector = null,
|
||||
similarityThreshold = 0.25,
|
||||
topN = 4,
|
||||
filterIdentifiers = [],
|
||||
}) {
|
||||
if (!namespace || !input || !LLMConnector)
|
||||
throw new Error("Invalid request to performSimilaritySearch.");
|
||||
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace))) {
|
||||
return {
|
||||
contextTexts: [],
|
||||
sources: [],
|
||||
message: "Invalid query - no documents found for workspace!",
|
||||
};
|
||||
}
|
||||
|
||||
const queryVector = await LLMConnector.embedTextInput(input);
|
||||
const { contextTexts, sourceDocuments } = await this.similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
similarityThreshold,
|
||||
topN,
|
||||
filterIdentifiers,
|
||||
});
|
||||
|
||||
const sources = sourceDocuments.map((doc, i) => {
|
||||
return { metadata: doc, text: contextTexts[i] };
|
||||
});
|
||||
return {
|
||||
contextTexts,
|
||||
sources: this.curateSources(sources),
|
||||
message: false,
|
||||
};
|
||||
},
|
||||
similarityResponse: async function ({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
similarityThreshold = 0.25,
|
||||
topN = 4,
|
||||
filterIdentifiers = [],
|
||||
}) {
|
||||
const result = {
|
||||
contextTexts: [],
|
||||
sourceDocuments: [],
|
||||
scores: [],
|
||||
};
|
||||
const response = await client.search({
|
||||
collection_name: this.normalize(namespace),
|
||||
vectors: queryVector,
|
||||
limit: topN,
|
||||
});
|
||||
response.results.forEach((match) => {
|
||||
if (match.score < similarityThreshold) return;
|
||||
if (filterIdentifiers.includes(sourceIdentifier(match.metadata))) {
|
||||
console.log(
|
||||
"Zilliz: A source was filtered from context as it's parent document is pinned."
|
||||
);
|
||||
return;
|
||||
}
|
||||
result.contextTexts.push(match.metadata.text);
|
||||
result.sourceDocuments.push({
|
||||
...match.metadata,
|
||||
score: match.score,
|
||||
});
|
||||
result.scores.push(match.score);
|
||||
});
|
||||
return result;
|
||||
},
|
||||
"namespace-stats": async function (reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
if (!namespace) throw new Error("namespace required");
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace)))
|
||||
throw new Error("Namespace by that name does not exist.");
|
||||
const stats = await this.namespace(client, namespace);
|
||||
return stats
|
||||
? stats
|
||||
: { message: "No stats were able to be fetched from DB for namespace" };
|
||||
},
|
||||
"delete-namespace": async function (reqBody = {}) {
|
||||
const { namespace = null } = reqBody;
|
||||
const { client } = await this.connect();
|
||||
if (!(await this.namespaceExists(client, namespace)))
|
||||
throw new Error("Namespace by that name does not exist.");
|
||||
|
||||
const statistics = await this.namespace(client, namespace);
|
||||
await this.deleteVectorsInNamespace(client, namespace);
|
||||
const vectorCount = Number(statistics?.data?.row_count ?? 0);
|
||||
return {
|
||||
message: `Namespace ${namespace} was deleted along with ${vectorCount} vectors.`,
|
||||
};
|
||||
},
|
||||
curateSources: function (sources = []) {
|
||||
const documents = [];
|
||||
for (const source of sources) {
|
||||
const { metadata = {} } = source;
|
||||
if (Object.keys(metadata).length > 0) {
|
||||
documents.push({
|
||||
...metadata,
|
||||
...(source.text ? { text: source.text } : {}),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return documents;
|
||||
},
|
||||
};
|
||||
|
||||
module.exports.Zilliz = Zilliz;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user