fix chroma db + add similarity offset (#3458)
* fix chroma db + add similarity offset
* patch chroma scoring
---------
Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
0d5e869f5c
commit
f6239a39f8
@ -44,7 +44,7 @@
|
|||||||
"chalk": "^4",
|
"chalk": "^4",
|
||||||
"check-disk-space": "^3.4.0",
|
"check-disk-space": "^3.4.0",
|
||||||
"cheerio": "^1.0.0",
|
"cheerio": "^1.0.0",
|
||||||
"chromadb": "^1.5.2",
|
"chromadb": "^2.0.1",
|
||||||
"cohere-ai": "^7.9.5",
|
"cohere-ai": "^7.9.5",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dotenv": "^16.0.3",
|
"dotenv": "^16.0.3",
|
||||||
|
|||||||
@ -129,12 +129,10 @@ const Chroma = {
|
|||||||
queryEmbeddings: queryVector,
|
queryEmbeddings: queryVector,
|
||||||
nResults: topN,
|
nResults: topN,
|
||||||
});
|
});
|
||||||
|
|
||||||
response.ids[0].forEach((_, i) => {
|
response.ids[0].forEach((_, i) => {
|
||||||
if (
|
const similarity = this.distanceToSimilarity(response.distances[0][i]);
|
||||||
this.distanceToSimilarity(response.distances[0][i]) <
|
if (similarity < similarityThreshold) return;
|
||||||
similarityThreshold
|
|
||||||
)
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
filterIdentifiers.includes(sourceIdentifier(response.metadatas[0][i]))
|
filterIdentifiers.includes(sourceIdentifier(response.metadatas[0][i]))
|
||||||
@ -144,9 +142,10 @@ const Chroma = {
|
|||||||
);
|
);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
result.contextTexts.push(response.documents[0][i]);
|
result.contextTexts.push(response.documents[0][i]);
|
||||||
result.sourceDocuments.push(response.metadatas[0][i]);
|
result.sourceDocuments.push(response.metadatas[0][i]);
|
||||||
result.scores.push(this.distanceToSimilarity(response.distances[0][i]));
|
result.scores.push(similarity);
|
||||||
});
|
});
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
@ -200,6 +199,7 @@ const Chroma = {
|
|||||||
const { client } = await this.connect();
|
const { client } = await this.connect();
|
||||||
const collection = await client.getOrCreateCollection({
|
const collection = await client.getOrCreateCollection({
|
||||||
name: this.normalize(namespace),
|
name: this.normalize(namespace),
|
||||||
|
// returns [-1, 1] unit vector
|
||||||
metadata: { "hnsw:space": "cosine" },
|
metadata: { "hnsw:space": "cosine" },
|
||||||
});
|
});
|
||||||
const { chunks } = cacheResult;
|
const { chunks } = cacheResult;
|
||||||
@ -299,13 +299,18 @@ const Chroma = {
|
|||||||
|
|
||||||
if (vectors.length > 0) {
|
if (vectors.length > 0) {
|
||||||
const chunks = [];
|
const chunks = [];
|
||||||
|
|
||||||
console.log("Inserting vectorized chunks into Chroma collection.");
|
console.log("Inserting vectorized chunks into Chroma collection.");
|
||||||
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
|
for (const chunk of toChunks(vectors, 500)) chunks.push(chunk);
|
||||||
|
|
||||||
const additionResult = await collection.add(submission);
|
try {
|
||||||
if (!additionResult)
|
await collection.add(submission);
|
||||||
throw new Error("Error embedding into ChromaDB", additionResult);
|
console.log(
|
||||||
|
`Successfully added ${submission.ids.length} vectors to collection ${this.normalize(namespace)}`
|
||||||
|
);
|
||||||
|
} catch (error) {
|
||||||
|
console.error("Error adding to ChromaDB:", error);
|
||||||
|
throw new Error(`Error embedding into ChromaDB: ${error.message}`);
|
||||||
|
}
|
||||||
|
|
||||||
await storeVectorResult(chunks, fullFilePath);
|
await storeVectorResult(chunks, fullFilePath);
|
||||||
}
|
}
|
||||||
@ -356,18 +361,24 @@ const Chroma = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
const queryVector = await LLMConnector.embedTextInput(input);
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse({
|
const { contextTexts, sourceDocuments, scores } =
|
||||||
client,
|
await this.similarityResponse({
|
||||||
namespace,
|
client,
|
||||||
queryVector,
|
namespace,
|
||||||
similarityThreshold,
|
queryVector,
|
||||||
topN,
|
similarityThreshold,
|
||||||
filterIdentifiers,
|
topN,
|
||||||
});
|
filterIdentifiers,
|
||||||
|
});
|
||||||
|
|
||||||
|
const sources = sourceDocuments.map((metadata, i) => ({
|
||||||
|
metadata: {
|
||||||
|
...metadata,
|
||||||
|
text: contextTexts[i],
|
||||||
|
score: scores?.[i] || null,
|
||||||
|
},
|
||||||
|
}));
|
||||||
|
|
||||||
const sources = sourceDocuments.map((metadata, i) => {
|
|
||||||
return { metadata: { ...metadata, text: contextTexts[i] } };
|
|
||||||
});
|
|
||||||
return {
|
return {
|
||||||
contextTexts,
|
contextTexts,
|
||||||
sources: this.curateSources(sources),
|
sources: this.curateSources(sources),
|
||||||
|
|||||||
1845
server/yarn.lock
1845
server/yarn.lock
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user