fix chroma db + add similarity offset (#3458)
* fix chroma db + add similarity offset
* patch chroma scoring

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
0d5e869f5c
commit
f6239a39f8
@ -44,7 +44,7 @@
|
||||
"chalk": "^4",
|
||||
"check-disk-space": "^3.4.0",
|
||||
"cheerio": "^1.0.0",
|
||||
"chromadb": "^1.5.2",
|
||||
"chromadb": "^2.0.1",
|
||||
"cohere-ai": "^7.9.5",
|
||||
"cors": "^2.8.5",
|
||||
"dotenv": "^16.0.3",
|
||||
|
||||
@ -129,12 +129,10 @@ const Chroma = {
|
||||
queryEmbeddings: queryVector,
|
||||
nResults: topN,
|
||||
});
|
||||
|
||||
response.ids[0].forEach((_, i) => {
|
||||
if (
|
||||
this.distanceToSimilarity(response.distances[0][i]) <
|
||||
similarityThreshold
|
||||
)
|
||||
return;
|
||||
const similarity = this.distanceToSimilarity(response.distances[0][i]);
|
||||
if (similarity < similarityThreshold) return;
|
||||
|
||||
if (
|
||||
filterIdentifiers.includes(sourceIdentifier(response.metadatas[0][i]))
|
||||
@ -144,9 +142,10 @@ const Chroma = {
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
result.contextTexts.push(response.documents[0][i]);
|
||||
result.sourceDocuments.push(response.metadatas[0][i]);
|
||||
result.scores.push(this.distanceToSimilarity(response.distances[0][i]));
|
||||
result.scores.push(similarity);
|
||||
});
|
||||
|
||||
return result;
|
||||
@ -200,6 +199,7 @@ const Chroma = {
|
||||
const { client } = await this.connect();
|
||||
const collection = await client.getOrCreateCollection({
|
||||
name: this.normalize(namespace),
|
||||
// returns [-1, 1] unit vector
|
||||
metadata: { "hnsw:space": "cosine" },
|
||||
});
|
||||
const { chunks } = cacheResult;
|
||||
@ -299,13 +299,18 @@ const Chroma = {
|
||||
|
||||
if (vectors.length > 0) {
|
||||
const chunks = [];
|
||||
|
||||
console.log("Inserting vectorized chunks into Chroma collection.");
|
||||
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
|
||||
|
||||
const additionResult = await collection.add(submission);
|
||||
if (!additionResult)
|
||||
throw new Error("Error embedding into ChromaDB", additionResult);
|
||||
try {
|
||||
await collection.add(submission);
|
||||
console.log(
|
||||
`Successfully added ${submission.ids.length} vectors to collection ${this.normalize(namespace)}`
|
||||
);
|
||||
} catch (error) {
|
||||
console.error("Error adding to ChromaDB:", error);
|
||||
throw new Error(`Error embedding into ChromaDB: ${error.message}`);
|
||||
}
|
||||
|
||||
await storeVectorResult(chunks, fullFilePath);
|
||||
}
|
||||
@ -356,18 +361,24 @@ const Chroma = {
|
||||
}
|
||||
|
||||
const queryVector = await LLMConnector.embedTextInput(input);
|
||||
const { contextTexts, sourceDocuments } = await this.similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
similarityThreshold,
|
||||
topN,
|
||||
filterIdentifiers,
|
||||
});
|
||||
const { contextTexts, sourceDocuments, scores } =
|
||||
await this.similarityResponse({
|
||||
client,
|
||||
namespace,
|
||||
queryVector,
|
||||
similarityThreshold,
|
||||
topN,
|
||||
filterIdentifiers,
|
||||
});
|
||||
|
||||
const sources = sourceDocuments.map((metadata, i) => ({
|
||||
metadata: {
|
||||
...metadata,
|
||||
text: contextTexts[i],
|
||||
score: scores?.[i] || null,
|
||||
},
|
||||
}));
|
||||
|
||||
const sources = sourceDocuments.map((metadata, i) => {
|
||||
return { metadata: { ...metadata, text: contextTexts[i] } };
|
||||
});
|
||||
return {
|
||||
contextTexts,
|
||||
sources: this.curateSources(sources),
|
||||
|
||||
1845
server/yarn.lock
1845
server/yarn.lock
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user