fix chroma db + add similarity offset (#3458)

* fix chroma db + add similarity offset

* patch chroma scoring

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
Sean Hatfield 2025-03-17 17:48:23 -07:00 committed by GitHub
parent 0d5e869f5c
commit f6239a39f8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 984 additions and 916 deletions

View File

@@ -44,7 +44,7 @@
     "chalk": "^4",
     "check-disk-space": "^3.4.0",
     "cheerio": "^1.0.0",
-    "chromadb": "^1.5.2",
+    "chromadb": "^2.0.1",
    "cohere-ai": "^7.9.5",
    "cors": "^2.8.5",
    "dotenv": "^16.0.3",

View File

@@ -129,12 +129,10 @@ const Chroma = {
       queryEmbeddings: queryVector,
       nResults: topN,
     });
     response.ids[0].forEach((_, i) => {
-      if (
-        this.distanceToSimilarity(response.distances[0][i]) <
-        similarityThreshold
-      )
-        return;
+      const similarity = this.distanceToSimilarity(response.distances[0][i]);
+      if (similarity < similarityThreshold) return;
       if (
         filterIdentifiers.includes(sourceIdentifier(response.metadatas[0][i]))
@@ -144,9 +142,10 @@ const Chroma = {
         );
         return;
       }
       result.contextTexts.push(response.documents[0][i]);
       result.sourceDocuments.push(response.metadatas[0][i]);
-      result.scores.push(this.distanceToSimilarity(response.distances[0][i]));
+      result.scores.push(similarity);
     });
     return result;
@@ -200,6 +199,7 @@ const Chroma = {
     const { client } = await this.connect();
     const collection = await client.getOrCreateCollection({
       name: this.normalize(namespace),
+      // returns [-1, 1] unit vector
       metadata: { "hnsw:space": "cosine" },
     });
     const { chunks } = cacheResult;
@@ -299,13 +299,18 @@ const Chroma = {
     if (vectors.length > 0) {
       const chunks = [];
       console.log("Inserting vectorized chunks into Chroma collection.");
       for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
-      const additionResult = await collection.add(submission);
-      if (!additionResult)
-        throw new Error("Error embedding into ChromaDB", additionResult);
+      try {
+        await collection.add(submission);
+        console.log(
+          `Successfully added ${submission.ids.length} vectors to collection ${this.normalize(namespace)}`
+        );
+      } catch (error) {
+        console.error("Error adding to ChromaDB:", error);
+        throw new Error(`Error embedding into ChromaDB: ${error.message}`);
+      }
       await storeVectorResult(chunks, fullFilePath);
     }
@@ -356,18 +361,24 @@ const Chroma = {
     }

     const queryVector = await LLMConnector.embedTextInput(input);
-    const { contextTexts, sourceDocuments } = await this.similarityResponse({
-      client,
-      namespace,
-      queryVector,
-      similarityThreshold,
-      topN,
-      filterIdentifiers,
-    });
+    const { contextTexts, sourceDocuments, scores } =
+      await this.similarityResponse({
+        client,
+        namespace,
+        queryVector,
+        similarityThreshold,
+        topN,
+        filterIdentifiers,
+      });

-    const sources = sourceDocuments.map((metadata, i) => {
-      return { metadata: { ...metadata, text: contextTexts[i] } };
-    });
+    const sources = sourceDocuments.map((metadata, i) => ({
+      metadata: {
+        ...metadata,
+        text: contextTexts[i],
+        score: scores?.[i] || null,
+      },
+    }));
     return {
       contextTexts,
       sources: this.curateSources(sources),

File diff suppressed because it is too large Load Diff