const { CloudClient } = require("chromadb");
const { Chroma } = require("../chroma");
const { toChunks } = require("../../helpers");

/**
 * ChromaCloud works nearly the same as Chroma, so we can extend the Chroma
 * class and override the connect method (to use the CloudClient) along with
 * the few methods where the cloud API's functionality differs.
 */
class ChromaCloud extends Chroma {
  constructor() {
    super();
  }

  get name() {
    return "ChromaCloud";
  }

  /**
   * Basic quotas/limitations for Chroma Cloud accounts. Does not look up
   * client-specific limits.
   * @see https://docs.trychroma.com/cloud/quotas-limits
   */
  limits = {
    maxEmbeddingDim: 4_096,
    maxDocumentBytes: 16_384,
    maxMetadataBytes: 4_096,
    maxRecordsPerWrite: 300,
  };

  async connect() {
    if (process.env.VECTOR_DB !== "chromacloud")
      throw new Error("ChromaCloud::Invalid ENV settings");

    const client = new CloudClient({
      apiKey: process.env.CHROMACLOUD_API_KEY,
      tenant: process.env.CHROMACLOUD_TENANT,
      database: process.env.CHROMACLOUD_DATABASE,
    });

    const isAlive = await client.heartbeat();
    if (!isAlive)
      throw new Error(
        "ChromaCloud::Invalid Heartbeat received - is the instance online?"
      );
    return { client };
  }

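  // For reference, connect() expects an environment configured like the
  // following (values are placeholders, not real credentials):
  //
  //   VECTOR_DB="chromacloud"
  //   CHROMACLOUD_API_KEY="ck-..."
  //   CHROMACLOUD_TENANT="my-tenant"
  //   CHROMACLOUD_DATABASE="my-database"
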
  /**
   * Chroma Cloud enforces some basic limitations on upserts to protect
   * performance and latency. Local deployments are self-hosted and do not
   * have these limitations.
   *
   * This method applies some simple heuristics to ensure the upsert payload
   * is not too large - otherwise Chroma Cloud may reject it with a 422.
   * For example, 750 records are split into chunks of 300, 300, and 150.
   * @param {import("chromadb").Collection} collection
   * @param {{ids: string[], embeddings: number[][], metadatas: Record<string, any>[], documents: string[]}} submission
   * @returns {Promise<boolean>} True if the upsert succeeded - throws if a
   * chunk is rejected.
   */
  async smartAdd(collection, submission) {
    // Spot-check the first record only as a cheap heuristic instead of
    // validating every record.
    const testSubmission = {
      id: submission.ids[0],
      embedding: submission.embeddings[0],
      metadata: submission.metadatas[0],
      document: submission.documents[0],
    };

    // Note: JS string length is used as a cheap proxy for byte size in the
    // document and metadata checks below.
    if (testSubmission.embedding.length > this.limits.maxEmbeddingDim)
      console.warn(
        `ChromaCloud::Embedding dimension too large (default max is ${this.limits.maxEmbeddingDim}). Got ${testSubmission.embedding.length}. Upsert may fail!`
      );
    if (testSubmission.document.length > this.limits.maxDocumentBytes)
      console.warn(
        `ChromaCloud::Document length too large (default max is ${this.limits.maxDocumentBytes}). Got ${testSubmission.document.length}. Upsert may fail!`
      );
    if (
      JSON.stringify(testSubmission.metadata).length >
      this.limits.maxMetadataBytes
    )
      console.warn(
        `ChromaCloud::Metadata length too large (default max is ${this.limits.maxMetadataBytes}). Got ${JSON.stringify(testSubmission.metadata).length}. Upsert may fail!`
      );

    // If the submissions are not too large, just add them directly.
    if (submission.ids.length <= this.limits.maxRecordsPerWrite) {
      await collection.add(submission);
      return true;
    }

    this.logger(
      `Upsert Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
    );
    const chunks = [];
    let chunkedSubmission = {
      ids: [],
      embeddings: [],
      metadatas: [],
      documents: [],
    };
    for (let i = 0; i < submission.ids.length; i++) {
      chunkedSubmission.ids.push(submission.ids[i]);
      chunkedSubmission.embeddings.push(submission.embeddings[i]);
      chunkedSubmission.metadatas.push(submission.metadatas[i]);
      chunkedSubmission.documents.push(submission.documents[i]);
      if (chunkedSubmission.ids.length === this.limits.maxRecordsPerWrite) {
        this.logger(
          `ChromaCloud::Adding chunk payload ${chunks.length + 1} of ${Math.ceil(submission.ids.length / this.limits.maxRecordsPerWrite)}`
        );
        chunks.push(chunkedSubmission);
        chunkedSubmission = {
          ids: [],
          embeddings: [],
          metadatas: [],
          documents: [],
        };
      }
    }
    // Push remaining submissions to the last chunk
    if (chunkedSubmission.ids.length > 0) chunks.push(chunkedSubmission);

    for (const chunk of chunks) {
      await collection.add(chunk);
    }
    return true;
  }

  /**
   * A wrapper around the ChromaCollection.delete method. When the payload is
   * under the write limit, the result of the delete call is returned
   * directly.
   * Chroma Cloud enforces some basic limitations on deletes to protect
   * performance and latency. Local deployments are self-hosted and do not
   * have these limitations.
   *
   * This method applies some simple heuristics to ensure the delete payload
   * is not too large - otherwise Chroma Cloud may reject it with a 422.
   * @param {import("chromadb").Collection} collection
   * @param {string[]} vectorIds
   * @returns {Promise<boolean>} True if the delete was successful.
   */
  async smartDelete(collection, vectorIds) {
    if (vectorIds.length <= this.limits.maxRecordsPerWrite)
      return await collection.delete({ ids: vectorIds });

    this.logger(
      `Delete Payload is too large (max is ${this.limits.maxRecordsPerWrite} records). Splitting into chunks of ${this.limits.maxRecordsPerWrite} records.`
    );
    const chunks = toChunks(vectorIds, this.limits.maxRecordsPerWrite);
    let counter = 1;
    for (const chunk of chunks) {
      this.logger(`Deleting chunk ${counter} of ${chunks.length}`);
      await collection.delete({ ids: chunk });
      counter++;
    }
    return true;
  }
}

module.exports.ChromaCloud = ChromaCloud;
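
// A minimal usage sketch, kept as a comment so nothing runs on require. It
// assumes the env vars above are set; the require path, collection name, and
// record values below are illustrative, not part of this module's API.
//
//   const { ChromaCloud } = require("./chromaCloud");
//
//   async function main() {
//     const store = new ChromaCloud();
//     const { client } = await store.connect();
//     const collection = await client.getOrCreateCollection({ name: "docs" });
//     await store.smartAdd(collection, {
//       ids: ["doc-1"],
//       embeddings: [[0.1, 0.2, 0.3]],
//       metadatas: [{ source: "example.txt" }],
//       documents: ["hello world"],
//     });
//     await store.smartDelete(collection, ["doc-1"]);
//   }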