fix chroma db + add similarity offset (#3458)

* fix chroma db + add similarity offset

* patch chroma scoring

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
Sean Hatfield 2025-03-17 17:48:23 -07:00 committed by GitHub
parent 0d5e869f5c
commit f6239a39f8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 984 additions and 916 deletions

View File

@@ -44,7 +44,7 @@
"chalk": "^4",
"check-disk-space": "^3.4.0",
"cheerio": "^1.0.0",
"chromadb": "^1.5.2",
"chromadb": "^2.0.1",
"cohere-ai": "^7.9.5",
"cors": "^2.8.5",
"dotenv": "^16.0.3",

View File

@@ -129,12 +129,10 @@ const Chroma = {
queryEmbeddings: queryVector,
nResults: topN,
});
response.ids[0].forEach((_, i) => {
if (
this.distanceToSimilarity(response.distances[0][i]) <
similarityThreshold
)
return;
const similarity = this.distanceToSimilarity(response.distances[0][i]);
if (similarity < similarityThreshold) return;
if (
filterIdentifiers.includes(sourceIdentifier(response.metadatas[0][i]))
@@ -144,9 +142,10 @@ const Chroma = {
);
return;
}
result.contextTexts.push(response.documents[0][i]);
result.sourceDocuments.push(response.metadatas[0][i]);
result.scores.push(this.distanceToSimilarity(response.distances[0][i]));
result.scores.push(similarity);
});
return result;
@@ -200,6 +199,7 @@ const Chroma = {
const { client } = await this.connect();
const collection = await client.getOrCreateCollection({
name: this.normalize(namespace),
// returns [-1, 1] unit vector
metadata: { "hnsw:space": "cosine" },
});
const { chunks } = cacheResult;
@@ -299,13 +299,18 @@ const Chroma = {
if (vectors.length > 0) {
const chunks = [];
console.log("Inserting vectorized chunks into Chroma collection.");
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
const additionResult = await collection.add(submission);
if (!additionResult)
throw new Error("Error embedding into ChromaDB", additionResult);
try {
await collection.add(submission);
console.log(
`Successfully added ${submission.ids.length} vectors to collection ${this.normalize(namespace)}`
);
} catch (error) {
console.error("Error adding to ChromaDB:", error);
throw new Error(`Error embedding into ChromaDB: ${error.message}`);
}
await storeVectorResult(chunks, fullFilePath);
}
@@ -356,18 +361,24 @@ const Chroma = {
}
const queryVector = await LLMConnector.embedTextInput(input);
const { contextTexts, sourceDocuments } = await this.similarityResponse({
client,
namespace,
queryVector,
similarityThreshold,
topN,
filterIdentifiers,
});
const { contextTexts, sourceDocuments, scores } =
await this.similarityResponse({
client,
namespace,
queryVector,
similarityThreshold,
topN,
filterIdentifiers,
});
const sources = sourceDocuments.map((metadata, i) => ({
metadata: {
...metadata,
text: contextTexts[i],
score: scores?.[i] || null,
},
}));
const sources = sourceDocuments.map((metadata, i) => {
return { metadata: { ...metadata, text: contextTexts[i] } };
});
return {
contextTexts,
sources: this.curateSources(sources),

File diff suppressed because it is too large Load Diff