Infinite prompt input and compression implementation (#332)
* WIP on continuous prompt window summary * wip * Move chat out of VDB simplify chat interface normalize LLM model interface have compression abstraction Cleanup compressor TODO: Anthropic stuff * Implement compression for Anythropic Fix lancedb sources * cleanup vectorDBs and check that lance, chroma, and pinecone are returning valid metadata sources * Resolve Weaviate citation sources not working with schema * comment cleanup
This commit is contained in:
parent
0751fb1fdd
commit
be9d8b0397
@ -24,7 +24,7 @@ export default function AnthropicAiOptions({ settings, showAlert = false }) {
|
|||||||
<div className="w-full flex items-center gap-4">
|
<div className="w-full flex items-center gap-4">
|
||||||
<div className="flex flex-col w-60">
|
<div className="flex flex-col w-60">
|
||||||
<label className="text-white text-sm font-semibold block mb-4">
|
<label className="text-white text-sm font-semibold block mb-4">
|
||||||
Anthropic Claude-2 API Key
|
Anthropic API Key
|
||||||
</label>
|
</label>
|
||||||
<input
|
<input
|
||||||
type="password"
|
type="password"
|
||||||
@ -48,7 +48,7 @@ export default function AnthropicAiOptions({ settings, showAlert = false }) {
|
|||||||
required={true}
|
required={true}
|
||||||
className="bg-zinc-900 border border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
|
className="bg-zinc-900 border border-gray-500 text-white text-sm rounded-lg block w-full p-2.5"
|
||||||
>
|
>
|
||||||
{["claude-2"].map((model) => {
|
{["claude-2", "claude-instant-1"].map((model) => {
|
||||||
return (
|
return (
|
||||||
<option key={model} value={model}>
|
<option key={model} value={model}>
|
||||||
{model}
|
{model}
|
||||||
|
|||||||
@ -49,6 +49,23 @@ export default function AzureAiOptions({ settings }) {
|
|||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div className="flex flex-col w-60">
|
||||||
|
<label className="text-white text-sm font-semibold block mb-4">
|
||||||
|
Chat Model Token Limit
|
||||||
|
</label>
|
||||||
|
<select
|
||||||
|
name="AzureOpenAiTokenLimit"
|
||||||
|
defaultValue={settings?.AzureOpenAiTokenLimit || 4096}
|
||||||
|
className="bg-zinc-900 text-white placeholder-white placeholder-opacity-60 text-sm rounded-lg focus:border-white block w-full p-2.5"
|
||||||
|
required={true}
|
||||||
|
>
|
||||||
|
<option value={4096}>4,096 (gpt-3.5-turbo)</option>
|
||||||
|
<option value={16384}>16,384 (gpt-3.5-16k)</option>
|
||||||
|
<option value={8192}>8,192 (gpt-4)</option>
|
||||||
|
<option value={32768}>32,768 (gpt-4-32k)</option>
|
||||||
|
</select>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div className="flex flex-col w-60">
|
<div className="flex flex-col w-60">
|
||||||
<label className="text-white text-sm font-semibold block mb-4">
|
<label className="text-white text-sm font-semibold block mb-4">
|
||||||
Embedding Deployment Name
|
Embedding Deployment Name
|
||||||
|
|||||||
@ -224,7 +224,6 @@ export default function WorkspaceSettings({ workspace }) {
|
|||||||
</div>
|
</div>
|
||||||
<textarea
|
<textarea
|
||||||
name="openAiPrompt"
|
name="openAiPrompt"
|
||||||
maxLength={500}
|
|
||||||
rows={5}
|
rows={5}
|
||||||
defaultValue={chatPrompt(workspace)}
|
defaultValue={chatPrompt(workspace)}
|
||||||
className="bg-zinc-900 text-white text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5"
|
className="bg-zinc-900 text-white text-sm rounded-lg focus:ring-blue-500 focus:border-blue-500 block w-full p-2.5"
|
||||||
|
|||||||
@ -55,7 +55,6 @@ export default function PromptInput({
|
|||||||
onKeyDown={captureEnter}
|
onKeyDown={captureEnter}
|
||||||
onChange={onChange}
|
onChange={onChange}
|
||||||
required={true}
|
required={true}
|
||||||
maxLength={240}
|
|
||||||
disabled={inputDisabled}
|
disabled={inputDisabled}
|
||||||
onFocus={() => setFocused(true)}
|
onFocus={() => setFocused(true)}
|
||||||
onBlur={(e) => {
|
onBlur={(e) => {
|
||||||
|
|||||||
@ -71,6 +71,7 @@ function chatEndpoints(app) {
|
|||||||
});
|
});
|
||||||
response.status(200).json({ ...result });
|
response.status(200).json({ ...result });
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
console.error(e);
|
||||||
response.status(500).json({
|
response.status(500).json({
|
||||||
id: uuidv4(),
|
id: uuidv4(),
|
||||||
type: "abort",
|
type: "abort",
|
||||||
|
|||||||
69
server/models/cacheData.js
Normal file
69
server/models/cacheData.js
Normal file
@ -0,0 +1,69 @@
|
|||||||
|
const prisma = require("../utils/prisma");
|
||||||
|
|
||||||
|
const CacheData = {
|
||||||
|
new: async function (inputs = {}) {
|
||||||
|
try {
|
||||||
|
const cache = await prisma.cache_data.create({
|
||||||
|
data: inputs,
|
||||||
|
});
|
||||||
|
return { cache, message: null };
|
||||||
|
} catch (error) {
|
||||||
|
console.error(error.message);
|
||||||
|
return { cache: null, message: error.message };
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
get: async function (clause = {}, limit = null, orderBy = null) {
|
||||||
|
try {
|
||||||
|
const cache = await prisma.cache_data.findFirst({
|
||||||
|
where: clause,
|
||||||
|
...(limit !== null ? { take: limit } : {}),
|
||||||
|
...(orderBy !== null ? { orderBy } : {}),
|
||||||
|
});
|
||||||
|
return cache || null;
|
||||||
|
} catch (error) {
|
||||||
|
console.error(error.message);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
delete: async function (clause = {}) {
|
||||||
|
try {
|
||||||
|
await prisma.cache_data.deleteMany({
|
||||||
|
where: clause,
|
||||||
|
});
|
||||||
|
return true;
|
||||||
|
} catch (error) {
|
||||||
|
console.error(error.message);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
where: async function (clause = {}, limit = null, orderBy = null) {
|
||||||
|
try {
|
||||||
|
const caches = await prisma.cache_data.findMany({
|
||||||
|
where: clause,
|
||||||
|
...(limit !== null ? { take: limit } : {}),
|
||||||
|
...(orderBy !== null ? { orderBy } : {}),
|
||||||
|
});
|
||||||
|
return caches;
|
||||||
|
} catch (error) {
|
||||||
|
console.error(error.message);
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
count: async function (clause = {}) {
|
||||||
|
try {
|
||||||
|
const count = await prisma.cache_data.count({
|
||||||
|
where: clause,
|
||||||
|
});
|
||||||
|
return count;
|
||||||
|
} catch (error) {
|
||||||
|
console.error(error.message);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
module.exports = { CacheData };
|
||||||
@ -65,6 +65,7 @@ const SystemSettings = {
|
|||||||
AzureOpenAiKey: !!process.env.AZURE_OPENAI_KEY,
|
AzureOpenAiKey: !!process.env.AZURE_OPENAI_KEY,
|
||||||
AzureOpenAiModelPref: process.env.OPEN_MODEL_PREF,
|
AzureOpenAiModelPref: process.env.OPEN_MODEL_PREF,
|
||||||
AzureOpenAiEmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
|
AzureOpenAiEmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
|
||||||
|
AzureOpenAiTokenLimit: process.env.AZURE_OPENAI_TOKEN_LIMIT || 4096,
|
||||||
}
|
}
|
||||||
: {}),
|
: {}),
|
||||||
|
|
||||||
|
|||||||
@ -36,6 +36,7 @@
|
|||||||
"express": "^4.18.2",
|
"express": "^4.18.2",
|
||||||
"extract-zip": "^2.0.1",
|
"extract-zip": "^2.0.1",
|
||||||
"graphql": "^16.7.1",
|
"graphql": "^16.7.1",
|
||||||
|
"js-tiktoken": "^1.0.7",
|
||||||
"jsonwebtoken": "^8.5.1",
|
"jsonwebtoken": "^8.5.1",
|
||||||
"langchain": "^0.0.90",
|
"langchain": "^0.0.90",
|
||||||
"mime": "^3.0.0",
|
"mime": "^3.0.0",
|
||||||
|
|||||||
11
server/prisma/migrations/20231101195421_init/migration.sql
Normal file
11
server/prisma/migrations/20231101195421_init/migration.sql
Normal file
@ -0,0 +1,11 @@
|
|||||||
|
-- CreateTable
|
||||||
|
CREATE TABLE "cache_data" (
|
||||||
|
"id" INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT,
|
||||||
|
"name" TEXT NOT NULL,
|
||||||
|
"data" TEXT NOT NULL,
|
||||||
|
"belongsTo" TEXT,
|
||||||
|
"byId" INTEGER,
|
||||||
|
"expiresAt" DATETIME,
|
||||||
|
"createdAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||||
|
"lastUpdatedAt" DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||||
|
);
|
||||||
@ -116,3 +116,14 @@ model workspace_users {
|
|||||||
workspaces workspaces @relation(fields: [workspace_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
|
workspaces workspaces @relation(fields: [workspace_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
|
||||||
users users @relation(fields: [user_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
|
users users @relation(fields: [user_id], references: [id], onDelete: Cascade, onUpdate: Cascade)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
model cache_data {
|
||||||
|
id Int @id @default(autoincrement())
|
||||||
|
name String
|
||||||
|
data String
|
||||||
|
belongsTo String?
|
||||||
|
byId Int?
|
||||||
|
expiresAt DateTime?
|
||||||
|
createdAt DateTime @default(now())
|
||||||
|
lastUpdatedAt DateTime @default(now())
|
||||||
|
}
|
||||||
|
|||||||
@ -12,6 +12,12 @@ class AnthropicLLM {
|
|||||||
apiKey: process.env.ANTHROPIC_API_KEY,
|
apiKey: process.env.ANTHROPIC_API_KEY,
|
||||||
});
|
});
|
||||||
this.anthropic = anthropic;
|
this.anthropic = anthropic;
|
||||||
|
this.model = process.env.ANTHROPIC_MODEL_PREF;
|
||||||
|
this.limits = {
|
||||||
|
history: this.promptWindowLimit() * 0.15,
|
||||||
|
system: this.promptWindowLimit() * 0.15,
|
||||||
|
user: this.promptWindowLimit() * 0.7,
|
||||||
|
};
|
||||||
|
|
||||||
if (!embedder)
|
if (!embedder)
|
||||||
throw new Error(
|
throw new Error(
|
||||||
@ -21,8 +27,19 @@ class AnthropicLLM {
|
|||||||
this.answerKey = v4().split("-")[0];
|
this.answerKey = v4().split("-")[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
isValidChatModel(modelName = "") {
|
promptWindowLimit() {
|
||||||
const validModels = ["claude-2"];
|
switch (this.model) {
|
||||||
|
case "claude-instant-1":
|
||||||
|
return 72_000;
|
||||||
|
case "claude-2":
|
||||||
|
return 100_000;
|
||||||
|
default:
|
||||||
|
return 72_000; // assume a claude-instant-1 model
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
isValidChatCompletionModel(modelName = "") {
|
||||||
|
const validModels = ["claude-2", "claude-instant-1"];
|
||||||
return validModels.includes(modelName);
|
return validModels.includes(modelName);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -62,24 +79,25 @@ class AnthropicLLM {
|
|||||||
\n\nAssistant:`;
|
\n\nAssistant:`;
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is the interface used when no embeddings are present in the workspace
|
async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
|
||||||
// This is just having a conversation with the LLM as one would normally.
|
if (!this.isValidChatCompletionModel(this.model))
|
||||||
async sendChat(chatHistory = [], prompt, workspace = {}) {
|
|
||||||
const model = process.env.ANTHROPIC_MODEL_PREF || "claude-2";
|
|
||||||
if (!this.isValidChatModel(model))
|
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Anthropic chat: ${model} is not valid for chat completion!`
|
`Anthropic chat: ${this.model} is not valid for chat completion!`
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const compressedPrompt = await this.compressMessages(
|
||||||
|
{
|
||||||
|
systemPrompt: chatPrompt(workspace),
|
||||||
|
userPrompt: prompt,
|
||||||
|
chatHistory,
|
||||||
|
},
|
||||||
|
rawHistory
|
||||||
|
);
|
||||||
const { content, error } = await this.anthropic.completions
|
const { content, error } = await this.anthropic.completions
|
||||||
.create({
|
.create({
|
||||||
model: "claude-2",
|
model: this.model,
|
||||||
max_tokens_to_sample: 300,
|
max_tokens_to_sample: 300,
|
||||||
prompt: this.constructPrompt({
|
prompt: compressedPrompt,
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
userPrompt: prompt,
|
|
||||||
chatHistory,
|
|
||||||
}),
|
|
||||||
})
|
})
|
||||||
.then((res) => {
|
.then((res) => {
|
||||||
const { completion } = res;
|
const { completion } = res;
|
||||||
@ -100,15 +118,14 @@ class AnthropicLLM {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async getChatCompletion(prompt = "", _opts = {}) {
|
async getChatCompletion(prompt = "", _opts = {}) {
|
||||||
const model = process.env.ANTHROPIC_MODEL_PREF || "claude-2";
|
if (!this.isValidChatCompletionModel(this.model))
|
||||||
if (!this.isValidChatModel(model))
|
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`Anthropic chat: ${model} is not valid for chat completion!`
|
`Anthropic chat: ${this.model} is not valid for chat completion!`
|
||||||
);
|
);
|
||||||
|
|
||||||
const { content, error } = await this.anthropic.completions
|
const { content, error } = await this.anthropic.completions
|
||||||
.create({
|
.create({
|
||||||
model: "claude-2",
|
model: this.model,
|
||||||
max_tokens_to_sample: 300,
|
max_tokens_to_sample: 300,
|
||||||
prompt,
|
prompt,
|
||||||
})
|
})
|
||||||
@ -130,6 +147,16 @@ class AnthropicLLM {
|
|||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async compressMessages(promptArgs = {}, rawHistory = []) {
|
||||||
|
const { messageStringCompressor } = require("../../helpers/chat");
|
||||||
|
const compressedPrompt = await messageStringCompressor(
|
||||||
|
this,
|
||||||
|
promptArgs,
|
||||||
|
rawHistory
|
||||||
|
);
|
||||||
|
return compressedPrompt;
|
||||||
|
}
|
||||||
|
|
||||||
// Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
|
// Simple wrapper for dynamic embedder & normalize interface for all LLM implementations
|
||||||
async embedTextInput(textInput) {
|
async embedTextInput(textInput) {
|
||||||
return await this.embedder.embedTextInput(textInput);
|
return await this.embedder.embedTextInput(textInput);
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
const { AzureOpenAiEmbedder } = require("../../EmbeddingEngines/azureOpenAi");
|
const { AzureOpenAiEmbedder } = require("../../EmbeddingEngines/azureOpenAi");
|
||||||
|
const { chatPrompt } = require("../../chats");
|
||||||
|
|
||||||
class AzureOpenAiLLM extends AzureOpenAiEmbedder {
|
class AzureOpenAiLLM extends AzureOpenAiEmbedder {
|
||||||
constructor() {
|
constructor() {
|
||||||
@ -13,9 +14,24 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
|
|||||||
process.env.AZURE_OPENAI_ENDPOINT,
|
process.env.AZURE_OPENAI_ENDPOINT,
|
||||||
new AzureKeyCredential(process.env.AZURE_OPENAI_KEY)
|
new AzureKeyCredential(process.env.AZURE_OPENAI_KEY)
|
||||||
);
|
);
|
||||||
|
this.model = process.env.OPEN_MODEL_PREF;
|
||||||
|
this.limits = {
|
||||||
|
history: this.promptWindowLimit() * 0.15,
|
||||||
|
system: this.promptWindowLimit() * 0.15,
|
||||||
|
user: this.promptWindowLimit() * 0.7,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
isValidChatModel(_modelName = "") {
|
// Sure the user selected a proper value for the token limit
|
||||||
|
// could be any of these https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-models
|
||||||
|
// and if undefined - assume it is the lowest end.
|
||||||
|
promptWindowLimit() {
|
||||||
|
return !!process.env.AZURE_OPENAI_TOKEN_LIMIT
|
||||||
|
? Number(process.env.AZURE_OPENAI_TOKEN_LIMIT)
|
||||||
|
: 4096;
|
||||||
|
}
|
||||||
|
|
||||||
|
isValidChatCompletionModel(_modelName = "") {
|
||||||
// The Azure user names their "models" as deployments and they can be any name
|
// The Azure user names their "models" as deployments and they can be any name
|
||||||
// so we rely on the user to put in the correct deployment as only they would
|
// so we rely on the user to put in the correct deployment as only they would
|
||||||
// know it.
|
// know it.
|
||||||
@ -31,7 +47,7 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
|
|||||||
const prompt = {
|
const prompt = {
|
||||||
role: "system",
|
role: "system",
|
||||||
content: `${systemPrompt}
|
content: `${systemPrompt}
|
||||||
Context:
|
Context:
|
||||||
${contextTexts
|
${contextTexts
|
||||||
.map((text, i) => {
|
.map((text, i) => {
|
||||||
return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
|
return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
|
||||||
@ -46,26 +62,25 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
|
|||||||
return { safe: true, reasons: [] };
|
return { safe: true, reasons: [] };
|
||||||
}
|
}
|
||||||
|
|
||||||
async sendChat(chatHistory = [], prompt, workspace = {}) {
|
async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
|
||||||
const model = process.env.OPEN_MODEL_PREF;
|
if (!this.model)
|
||||||
if (!model)
|
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"No OPEN_MODEL_PREF ENV defined. This must the name of a deployment on your Azure account for an LLM chat model like GPT-3.5."
|
"No OPEN_MODEL_PREF ENV defined. This must the name of a deployment on your Azure account for an LLM chat model like GPT-3.5."
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const messages = await this.compressMessages(
|
||||||
|
{
|
||||||
|
systemPrompt: chatPrompt(workspace),
|
||||||
|
userPrompt: prompt,
|
||||||
|
chatHistory,
|
||||||
|
},
|
||||||
|
rawHistory
|
||||||
|
);
|
||||||
const textResponse = await this.openai
|
const textResponse = await this.openai
|
||||||
.getChatCompletions(
|
.getChatCompletions(this.model, messages, {
|
||||||
model,
|
temperature: Number(workspace?.openAiTemp ?? 0.7),
|
||||||
[
|
n: 1,
|
||||||
{ role: "system", content: "" },
|
})
|
||||||
...chatHistory,
|
|
||||||
{ role: "user", content: prompt },
|
|
||||||
],
|
|
||||||
{
|
|
||||||
temperature: Number(workspace?.openAiTemp ?? 0.7),
|
|
||||||
n: 1,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
.then((res) => {
|
.then((res) => {
|
||||||
if (!res.hasOwnProperty("choices"))
|
if (!res.hasOwnProperty("choices"))
|
||||||
throw new Error("OpenAI chat: No results!");
|
throw new Error("OpenAI chat: No results!");
|
||||||
@ -83,18 +98,23 @@ class AzureOpenAiLLM extends AzureOpenAiEmbedder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async getChatCompletion(messages = [], { temperature = 0.7 }) {
|
async getChatCompletion(messages = [], { temperature = 0.7 }) {
|
||||||
const model = process.env.OPEN_MODEL_PREF;
|
if (!this.model)
|
||||||
if (!model)
|
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"No OPEN_MODEL_PREF ENV defined. This must the name of a deployment on your Azure account for an LLM chat model like GPT-3.5."
|
"No OPEN_MODEL_PREF ENV defined. This must the name of a deployment on your Azure account for an LLM chat model like GPT-3.5."
|
||||||
);
|
);
|
||||||
|
|
||||||
const data = await this.openai.getChatCompletions(model, messages, {
|
const data = await this.openai.getChatCompletions(this.model, messages, {
|
||||||
temperature,
|
temperature,
|
||||||
});
|
});
|
||||||
if (!data.hasOwnProperty("choices")) return null;
|
if (!data.hasOwnProperty("choices")) return null;
|
||||||
return data.choices[0].message.content;
|
return data.choices[0].message.content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async compressMessages(promptArgs = {}, rawHistory = []) {
|
||||||
|
const { messageArrayCompressor } = require("../../helpers/chat");
|
||||||
|
const messageArray = this.constructPrompt(promptArgs);
|
||||||
|
return await messageArrayCompressor(this, messageArray, rawHistory);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
const { OpenAiEmbedder } = require("../../EmbeddingEngines/openAi");
|
const { OpenAiEmbedder } = require("../../EmbeddingEngines/openAi");
|
||||||
|
const { chatPrompt } = require("../../chats");
|
||||||
|
|
||||||
class OpenAiLLM extends OpenAiEmbedder {
|
class OpenAiLLM extends OpenAiEmbedder {
|
||||||
constructor() {
|
constructor() {
|
||||||
@ -10,6 +11,23 @@ class OpenAiLLM extends OpenAiEmbedder {
|
|||||||
apiKey: process.env.OPEN_AI_KEY,
|
apiKey: process.env.OPEN_AI_KEY,
|
||||||
});
|
});
|
||||||
this.openai = new OpenAIApi(config);
|
this.openai = new OpenAIApi(config);
|
||||||
|
this.model = process.env.OPEN_MODEL_PREF;
|
||||||
|
this.limits = {
|
||||||
|
history: this.promptWindowLimit() * 0.15,
|
||||||
|
system: this.promptWindowLimit() * 0.15,
|
||||||
|
user: this.promptWindowLimit() * 0.7,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
promptWindowLimit() {
|
||||||
|
switch (this.model) {
|
||||||
|
case "gpt-3.5-turbo":
|
||||||
|
return 4096;
|
||||||
|
case "gpt-4":
|
||||||
|
return 8192;
|
||||||
|
default:
|
||||||
|
return 4096; // assume a fine-tune 3.5
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async isValidChatCompletionModel(modelName = "") {
|
async isValidChatCompletionModel(modelName = "") {
|
||||||
@ -33,7 +51,7 @@ class OpenAiLLM extends OpenAiEmbedder {
|
|||||||
const prompt = {
|
const prompt = {
|
||||||
role: "system",
|
role: "system",
|
||||||
content: `${systemPrompt}
|
content: `${systemPrompt}
|
||||||
Context:
|
Context:
|
||||||
${contextTexts
|
${contextTexts
|
||||||
.map((text, i) => {
|
.map((text, i) => {
|
||||||
return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
|
return `[CONTEXT ${i}]:\n${text}\n[END CONTEXT ${i}]\n\n`;
|
||||||
@ -75,7 +93,7 @@ class OpenAiLLM extends OpenAiEmbedder {
|
|||||||
return { safe: false, reasons };
|
return { safe: false, reasons };
|
||||||
}
|
}
|
||||||
|
|
||||||
async sendChat(chatHistory = [], prompt, workspace = {}) {
|
async sendChat(chatHistory = [], prompt, workspace = {}, rawHistory = []) {
|
||||||
const model = process.env.OPEN_MODEL_PREF;
|
const model = process.env.OPEN_MODEL_PREF;
|
||||||
if (!(await this.isValidChatCompletionModel(model)))
|
if (!(await this.isValidChatCompletionModel(model)))
|
||||||
throw new Error(
|
throw new Error(
|
||||||
@ -87,11 +105,14 @@ class OpenAiLLM extends OpenAiEmbedder {
|
|||||||
model,
|
model,
|
||||||
temperature: Number(workspace?.openAiTemp ?? 0.7),
|
temperature: Number(workspace?.openAiTemp ?? 0.7),
|
||||||
n: 1,
|
n: 1,
|
||||||
messages: [
|
messages: await this.compressMessages(
|
||||||
{ role: "system", content: "" },
|
{
|
||||||
...chatHistory,
|
systemPrompt: chatPrompt(workspace),
|
||||||
{ role: "user", content: prompt },
|
userPrompt: prompt,
|
||||||
],
|
chatHistory,
|
||||||
|
},
|
||||||
|
rawHistory
|
||||||
|
),
|
||||||
})
|
})
|
||||||
.then((json) => {
|
.then((json) => {
|
||||||
const res = json.data;
|
const res = json.data;
|
||||||
@ -111,14 +132,13 @@ class OpenAiLLM extends OpenAiEmbedder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
async getChatCompletion(messages = null, { temperature = 0.7 }) {
|
async getChatCompletion(messages = null, { temperature = 0.7 }) {
|
||||||
const model = process.env.OPEN_MODEL_PREF || "gpt-3.5-turbo";
|
if (!(await this.isValidChatCompletionModel(this.model)))
|
||||||
if (!(await this.isValidChatCompletionModel(model)))
|
|
||||||
throw new Error(
|
throw new Error(
|
||||||
`OpenAI chat: ${model} is not valid for chat completion!`
|
`OpenAI chat: ${this.model} is not valid for chat completion!`
|
||||||
);
|
);
|
||||||
|
|
||||||
const { data } = await this.openai.createChatCompletion({
|
const { data } = await this.openai.createChatCompletion({
|
||||||
model,
|
model: this.model,
|
||||||
messages,
|
messages,
|
||||||
temperature,
|
temperature,
|
||||||
});
|
});
|
||||||
@ -126,6 +146,12 @@ class OpenAiLLM extends OpenAiEmbedder {
|
|||||||
if (!data.hasOwnProperty("choices")) return null;
|
if (!data.hasOwnProperty("choices")) return null;
|
||||||
return data.choices[0].message.content;
|
return data.choices[0].message.content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async compressMessages(promptArgs = {}, rawHistory = []) {
|
||||||
|
const { messageArrayCompressor } = require("../../helpers/chat");
|
||||||
|
const messageArray = this.constructPrompt(promptArgs);
|
||||||
|
return await messageArrayCompressor(this, messageArray, rawHistory);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|||||||
@ -91,91 +91,146 @@ async function chatWithWorkspace(
|
|||||||
const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
|
const hasVectorizedSpace = await VectorDb.hasNamespace(workspace.slug);
|
||||||
const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
|
const embeddingsCount = await VectorDb.namespaceCount(workspace.slug);
|
||||||
if (!hasVectorizedSpace || embeddingsCount === 0) {
|
if (!hasVectorizedSpace || embeddingsCount === 0) {
|
||||||
const rawHistory = (
|
// If there are no embeddings - chat like a normal LLM chat interface.
|
||||||
user
|
return await emptyEmbeddingChat({
|
||||||
? await WorkspaceChats.forWorkspaceByUser(
|
uuid,
|
||||||
workspace.id,
|
user,
|
||||||
user.id,
|
|
||||||
messageLimit,
|
|
||||||
{ id: "desc" }
|
|
||||||
)
|
|
||||||
: await WorkspaceChats.forWorkspace(workspace.id, messageLimit, {
|
|
||||||
id: "desc",
|
|
||||||
})
|
|
||||||
).reverse();
|
|
||||||
const chatHistory = convertToPromptHistory(rawHistory);
|
|
||||||
const response = await LLMConnector.sendChat(
|
|
||||||
chatHistory,
|
|
||||||
message,
|
message,
|
||||||
workspace
|
|
||||||
);
|
|
||||||
const data = { text: response, sources: [], type: "chat" };
|
|
||||||
|
|
||||||
await WorkspaceChats.new({
|
|
||||||
workspaceId: workspace.id,
|
|
||||||
prompt: message,
|
|
||||||
response: data,
|
|
||||||
user,
|
|
||||||
});
|
|
||||||
return {
|
|
||||||
id: uuid,
|
|
||||||
type: "textResponse",
|
|
||||||
textResponse: response,
|
|
||||||
sources: [],
|
|
||||||
close: true,
|
|
||||||
error: null,
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
const rawHistory = (
|
|
||||||
user
|
|
||||||
? await WorkspaceChats.forWorkspaceByUser(
|
|
||||||
workspace.id,
|
|
||||||
user.id,
|
|
||||||
messageLimit,
|
|
||||||
{ id: "desc" }
|
|
||||||
)
|
|
||||||
: await WorkspaceChats.forWorkspace(workspace.id, messageLimit, {
|
|
||||||
id: "desc",
|
|
||||||
})
|
|
||||||
).reverse();
|
|
||||||
const chatHistory = convertToPromptHistory(rawHistory);
|
|
||||||
const {
|
|
||||||
response,
|
|
||||||
sources,
|
|
||||||
message: error,
|
|
||||||
} = await VectorDb[chatMode]({
|
|
||||||
namespace: workspace.slug,
|
|
||||||
input: message,
|
|
||||||
workspace,
|
workspace,
|
||||||
chatHistory,
|
messageLimit,
|
||||||
|
LLMConnector,
|
||||||
});
|
});
|
||||||
if (!response) {
|
}
|
||||||
return {
|
|
||||||
id: uuid,
|
|
||||||
type: "abort",
|
|
||||||
textResponse: null,
|
|
||||||
sources: [],
|
|
||||||
close: true,
|
|
||||||
error,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const data = { text: response, sources, type: chatMode };
|
const { rawHistory, chatHistory } = await recentChatHistory(
|
||||||
await WorkspaceChats.new({
|
user,
|
||||||
workspaceId: workspace.id,
|
workspace,
|
||||||
prompt: message,
|
messageLimit,
|
||||||
response: data,
|
chatMode
|
||||||
user,
|
);
|
||||||
});
|
const {
|
||||||
|
contextTexts = [],
|
||||||
|
sources = [],
|
||||||
|
message: error,
|
||||||
|
} = await VectorDb.performSimilaritySearch({
|
||||||
|
namespace: workspace.slug,
|
||||||
|
input: message,
|
||||||
|
LLMConnector,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Failed similarity search.
|
||||||
|
if (!!error) {
|
||||||
return {
|
return {
|
||||||
id: uuid,
|
id: uuid,
|
||||||
type: "textResponse",
|
type: "abort",
|
||||||
textResponse: response,
|
textResponse: null,
|
||||||
sources,
|
sources: [],
|
||||||
close: true,
|
close: true,
|
||||||
error,
|
error,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Compress message to ensure prompt passes token limit with room for response
|
||||||
|
// and build system messages based on inputs and history.
|
||||||
|
const messages = await LLMConnector.compressMessages(
|
||||||
|
{
|
||||||
|
systemPrompt: chatPrompt(workspace),
|
||||||
|
userPrompt: message,
|
||||||
|
contextTexts,
|
||||||
|
chatHistory,
|
||||||
|
},
|
||||||
|
rawHistory
|
||||||
|
);
|
||||||
|
|
||||||
|
// Send the text completion.
|
||||||
|
const textResponse = await LLMConnector.getChatCompletion(messages, {
|
||||||
|
temperature: workspace?.openAiTemp ?? 0.7,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!textResponse) {
|
||||||
|
return {
|
||||||
|
id: uuid,
|
||||||
|
type: "abort",
|
||||||
|
textResponse: null,
|
||||||
|
sources: [],
|
||||||
|
close: true,
|
||||||
|
error: "No text completion could be completed with this input.",
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
await WorkspaceChats.new({
|
||||||
|
workspaceId: workspace.id,
|
||||||
|
prompt: message,
|
||||||
|
response: { text: textResponse, sources, type: chatMode },
|
||||||
|
user,
|
||||||
|
});
|
||||||
|
return {
|
||||||
|
id: uuid,
|
||||||
|
type: "textResponse",
|
||||||
|
close: true,
|
||||||
|
textResponse,
|
||||||
|
sources,
|
||||||
|
error,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// On query we dont return message history. All other chat modes and when chatting
|
||||||
|
// with no embeddings we return history.
|
||||||
|
async function recentChatHistory(
|
||||||
|
user = null,
|
||||||
|
workspace,
|
||||||
|
messageLimit = 20,
|
||||||
|
chatMode = null
|
||||||
|
) {
|
||||||
|
if (chatMode === "query") return [];
|
||||||
|
const rawHistory = (
|
||||||
|
user
|
||||||
|
? await WorkspaceChats.forWorkspaceByUser(
|
||||||
|
workspace.id,
|
||||||
|
user.id,
|
||||||
|
messageLimit,
|
||||||
|
{ id: "desc" }
|
||||||
|
)
|
||||||
|
: await WorkspaceChats.forWorkspace(workspace.id, messageLimit, {
|
||||||
|
id: "desc",
|
||||||
|
})
|
||||||
|
).reverse();
|
||||||
|
return { rawHistory, chatHistory: convertToPromptHistory(rawHistory) };
|
||||||
|
}
|
||||||
|
|
||||||
|
async function emptyEmbeddingChat({
|
||||||
|
uuid,
|
||||||
|
user,
|
||||||
|
message,
|
||||||
|
workspace,
|
||||||
|
messageLimit,
|
||||||
|
LLMConnector,
|
||||||
|
}) {
|
||||||
|
const { rawHistory, chatHistory } = await recentChatHistory(
|
||||||
|
user,
|
||||||
|
workspace,
|
||||||
|
messageLimit
|
||||||
|
);
|
||||||
|
const textResponse = await LLMConnector.sendChat(
|
||||||
|
chatHistory,
|
||||||
|
message,
|
||||||
|
workspace,
|
||||||
|
rawHistory
|
||||||
|
);
|
||||||
|
await WorkspaceChats.new({
|
||||||
|
workspaceId: workspace.id,
|
||||||
|
prompt: message,
|
||||||
|
response: { text: textResponse, sources: [], type: "chat" },
|
||||||
|
user,
|
||||||
|
});
|
||||||
|
return {
|
||||||
|
id: uuid,
|
||||||
|
type: "textResponse",
|
||||||
|
sources: [],
|
||||||
|
close: true,
|
||||||
|
error: null,
|
||||||
|
textResponse,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
function chatPrompt(workspace) {
|
function chatPrompt(workspace) {
|
||||||
@ -186,6 +241,7 @@ function chatPrompt(workspace) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
convertToPromptHistory,
|
||||||
convertToChatHistory,
|
convertToChatHistory,
|
||||||
chatWithWorkspace,
|
chatWithWorkspace,
|
||||||
chatPrompt,
|
chatPrompt,
|
||||||
|
|||||||
325
server/utils/helpers/chat/index.js
Normal file
325
server/utils/helpers/chat/index.js
Normal file
@ -0,0 +1,325 @@
|
|||||||
|
const { convertToPromptHistory } = require("../../chats");
|
||||||
|
const { TokenManager } = require("../tiktoken");
|
||||||
|
|
||||||
|
/*
|
||||||
|
What is the message Array compressor?
|
||||||
|
TLDR: So anyway, i started blasting (your prompts & stuff)
|
||||||
|
|
||||||
|
messageArrayCompressor arose out of a need for users to be able to insert unlimited token prompts
|
||||||
|
and also maintain coherent history, system instructions and context, if applicable.
|
||||||
|
|
||||||
|
We took an opinionated approach that after much back-testing we have found retained a highly coherent answer
|
||||||
|
under most user conditions that a user would take while using this specific system. While other systems may
|
||||||
|
use a more advanced model for compressing message history or simplify text through a recursive approach - ours is much simpler.
|
||||||
|
|
||||||
|
We "cannonball" the input.
|
||||||
|
Cannonball (verb): To ensure a prompt fits through a model window we blast a hole in the center of any inputs blocking our path to doing so.
|
||||||
|
This starts by dissecting the input into tokens and deleting from the middle out, bi-directionally, until the prompt window is satisfied.
|
||||||
|
You may think: "Doesn't this result in massive data loss?" - yes & no.
|
||||||
|
Under the use cases we expect the tool to be used, which is mostly chatting with documents, we are able to use this approach with minimal blowback
|
||||||
|
on the quality of responses.
|
||||||
|
|
||||||
|
We accomplish this by taking a rate-limit approach that is proportional to the model capacity. Since we support more than openAI models, this needs to
|
||||||
|
be generic and reliance on a "better summary" model just is not a luxury we can afford. The added latency overhead during prompting is also unacceptable.
|
||||||
|
In general:
|
||||||
|
system: at best 15% of token capacity
|
||||||
|
history: at best 15% of token capacity
|
||||||
|
prompt: at best 70% of token capacity.
|
||||||
|
|
||||||
|
we handle overflows by taking an aggressive path for two main cases.
|
||||||
|
|
||||||
|
1. Very large user prompt
|
||||||
|
- Likely uninterested in context, history, or even system prompt. This is a "standalone" prompt that hijacks the whole thread.
|
||||||
|
- We run this prompt on its own since a prompt that is over 70% of context window certainly is standalone.
|
||||||
|
|
||||||
|
2. Context window is exceeded in regular use.
|
||||||
|
- We do not touch prompt since it is very likely to be <70% of window.
|
||||||
|
- We check system prompt is not outrageous - if it is we cannonball it and keep context if present.
|
||||||
|
- We check a sliding window of history, only allowing up to 15% of the history to pass through if it fits, with a
|
||||||
|
preference for recent history if we can cannonball to fit it, otherwise it is omitted.
|
||||||
|
|
||||||
|
We end up with a rather large prompt that fits through a given window with a lot of room for response in most use-cases.
|
||||||
|
We also take the approach that history is the least important and most flexible of the items in this array of responses.
|
||||||
|
|
||||||
|
There is a supplemental version of this function that also returns a formatted string for models like Claude-2
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Compresses an OpenAI-style message array (system, ...history, user) so the
 * full prompt plus an estimated reply fits inside the model's context window.
 * See the "cannonball" explanation at the top of this file for the strategy.
 *
 * @param {object} llm - LLM connector; must expose `.model`, `.promptWindowLimit()` and `.limits` {system, history, user}.
 * @param {Array<{role: string, content: string}>} messages - Full message array: [system, ...history, user].
 * @param {Array} rawHistory - Raw chat records convertible via convertToPromptHistory. Not mutated.
 * @returns {Promise<Array<{role: string, content: string}>>} A message array that fits the prompt window.
 */
async function messageArrayCompressor(llm, messages = [], rawHistory = []) {
  // Assume the response will need at least 600 tokens. If the total prompt +
  // reply would exceed the window we must proactively compress so the model
  // has room to answer. Realistically most users are never impacted by this.
  const tokenBuffer = 600;
  const tokenManager = new TokenManager(llm.model);

  // If no work needs to be done, just pass through.
  if (tokenManager.statsFrom(messages) + tokenBuffer < llm.promptWindowLimit())
    return messages;

  const system = messages.shift();
  const user = messages.pop();
  const userPromptSize = tokenManager.countFromString(user.content);

  // The user prompt is the main focus here - we prioritize it and allow it to
  // hijack the entire conversation thread. We cannonball the prompt through to
  // ensure the reply still has at least 20% of the token supply to reply with.
  if (userPromptSize > llm.limits.user) {
    return [
      {
        role: "user",
        content: cannonball({
          input: user.content,
          targetTokenSize: llm.promptWindowLimit() * 0.8,
          tiktokenInstance: tokenManager,
        }),
      },
    ];
  }

  // System prompt: pass through when under its budget, otherwise cannonball
  // the instruction portion while preserving any "Context:" section verbatim.
  const compressedSystem = new Promise((resolve) => {
    const count = tokenManager.countFromString(system.content);
    if (count < llm.limits.system) {
      resolve(system);
      return;
    }

    // Split context from system prompt - cannonball since it's over the window.
    // We assume the context + user prompt is enough tokens to fit.
    const [prompt, context = ""] = system.content.split("Context:");
    system.content = `${cannonball({
      input: prompt,
      targetTokenSize: llm.limits.system,
      tiktokenInstance: tokenManager,
    })}${context ? `\nContext: ${context}` : ""}`;
    resolve(system);
  });

  // Prompt is allowed to take up to 70% of the window - we know it's under
  // if we are here, so pass through.
  const compressedPrompt = Promise.resolve(user);

  // We always aggressively compress history because it is the least
  // important data to retain in full fidelity.
  const compressedHistory = new Promise((resolve) => {
    const eligibleHistoryItems = [];
    let historyTokenCount = 0;

    // Iterate newest-first over a copy: Array.prototype.reverse mutates in
    // place, and rawHistory belongs to the caller.
    for (const [i, history] of [...rawHistory].reverse().entries()) {
      const [histUser, histAssistant] = convertToPromptHistory([history]);
      const [userTokens, assistantTokens] = [
        tokenManager.countFromString(histUser.content),
        tokenManager.countFromString(histAssistant.content),
      ];
      const total = userTokens + assistantTokens;

      // If the token cost of adding this history pair is small enough,
      // add it to history and move on to the next.
      if (historyTokenCount + total < llm.limits.history) {
        eligibleHistoryItems.unshift(histUser, histAssistant);
        historyTokenCount += total;
        continue;
      }

      // Adding this history item would exceed the limit. We only keep trying
      // for the 3 most recent message pairs - past those, stop entirely.
      if (i > 2) break;

      // We are over the limit but within the 3 most recent chats, so
      // cannonball the offending side(s) to make the pair fit. Each component
      // may be at most ~50% of the history budget; token math is not exact,
      // so divide by 2.2 instead of 2 as a safety fudge factor.
      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
      if (userTokens > maxTargetSize) {
        histUser.content = cannonball({
          input: histUser.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }

      if (assistantTokens > maxTargetSize) {
        histAssistant.content = cannonball({
          input: histAssistant.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }

      const newTotal = tokenManager.statsFrom([histUser, histAssistant]);
      if (historyTokenCount + newTotal > llm.limits.history) continue;
      eligibleHistoryItems.unshift(histUser, histAssistant);
      historyTokenCount += newTotal;
    }
    resolve(eligibleHistoryItems);
  });

  const [cSystem, cHistory, cPrompt] = await Promise.all([
    compressedSystem,
    compressedHistory,
    compressedPrompt,
  ]);
  return [cSystem, ...cHistory, cPrompt];
}
|
||||||
|
|
||||||
|
// Implementation of messageArrayCompressor, but for string only completion models
|
||||||
|
/**
 * Implementation of messageArrayCompressor, but for string-only completion
 * models (e.g. Claude-style prompts built via llm.constructPrompt).
 *
 * @param {object} llm - LLM connector; must expose `.model`, `.promptWindowLimit()`, `.limits` and `.constructPrompt()`.
 * @param {object} promptArgs - Arguments for llm.constructPrompt ({systemPrompt, userPrompt, contextTexts, ...}).
 * @param {Array} rawHistory - Raw chat records convertible via convertToPromptHistory. Not mutated.
 * @returns {Promise<string>} A constructed prompt string that fits the prompt window.
 */
async function messageStringCompressor(llm, promptArgs = {}, rawHistory = []) {
  // Reserve ~600 tokens for the reply; only compress when the constructed
  // prompt + reply would overflow the model window.
  const tokenBuffer = 600;
  const tokenManager = new TokenManager(llm.model);
  const initialPrompt = llm.constructPrompt(promptArgs);
  if (
    tokenManager.statsFrom(initialPrompt) + tokenBuffer <
    llm.promptWindowLimit()
  )
    return initialPrompt;

  const system = promptArgs.systemPrompt;
  const user = promptArgs.userPrompt;
  const userPromptSize = tokenManager.countFromString(user);

  // The user prompt is the main focus here - we prioritize it and allow it to
  // hijack the entire conversation thread. We cannonball the prompt through to
  // ensure the reply still has at least 20% of the token supply to reply with.
  if (userPromptSize > llm.limits.user) {
    return llm.constructPrompt({
      userPrompt: cannonball({
        input: user,
        targetTokenSize: llm.promptWindowLimit() * 0.8,
        tiktokenInstance: tokenManager,
      }),
    });
  }

  // System prompt: pass through when under budget, otherwise cannonball it.
  const compressedSystem = new Promise((resolve) => {
    const count = tokenManager.countFromString(system);
    if (count < llm.limits.system) {
      resolve(system);
      return;
    }
    resolve(
      cannonball({
        input: system,
        targetTokenSize: llm.limits.system,
        tiktokenInstance: tokenManager,
      })
    );
  });

  // Prompt is allowed to take up to 70% of the window - we know it's under
  // if we are here, so pass through.
  const compressedPrompt = Promise.resolve(user);

  // We always aggressively compress history because it is the least
  // important data to retain in full fidelity.
  const compressedHistory = new Promise((resolve) => {
    const eligibleHistoryItems = [];
    let historyTokenCount = 0;

    // Iterate newest-first over a copy: Array.prototype.reverse mutates in
    // place, and rawHistory belongs to the caller.
    for (const [i, history] of [...rawHistory].reverse().entries()) {
      const [histUser, histAssistant] = convertToPromptHistory([history]);
      const [userTokens, assistantTokens] = [
        tokenManager.countFromString(histUser.content),
        tokenManager.countFromString(histAssistant.content),
      ];
      const total = userTokens + assistantTokens;

      // If the token cost of adding this history pair is small enough,
      // add it to history and move on to the next.
      if (historyTokenCount + total < llm.limits.history) {
        eligibleHistoryItems.unshift(histUser, histAssistant);
        historyTokenCount += total;
        continue;
      }

      // Adding this history item would exceed the limit. We only keep trying
      // for the 3 most recent message pairs - past those, stop entirely.
      if (i > 2) break;

      // We are over the limit but within the 3 most recent chats, so
      // cannonball the offending side(s) to make the pair fit. Each component
      // may be at most ~50% of the history budget; token math is not exact,
      // so divide by 2.2 instead of 2 as a safety fudge factor.
      const maxTargetSize = Math.floor(llm.limits.history / 2.2);
      if (userTokens > maxTargetSize) {
        histUser.content = cannonball({
          input: histUser.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }

      if (assistantTokens > maxTargetSize) {
        histAssistant.content = cannonball({
          input: histAssistant.content,
          targetTokenSize: maxTargetSize,
          tiktokenInstance: tokenManager,
        });
      }

      const newTotal = tokenManager.statsFrom([histUser, histAssistant]);
      if (historyTokenCount + newTotal > llm.limits.history) continue;
      eligibleHistoryItems.unshift(histUser, histAssistant);
      historyTokenCount += newTotal;
    }
    resolve(eligibleHistoryItems);
  });

  const [cSystem, cHistory, cPrompt] = await Promise.all([
    compressedSystem,
    compressedHistory,
    compressedPrompt,
  ]);

  return llm.constructPrompt({
    systemPrompt: cSystem,
    contextTexts: promptArgs?.contextTexts || [],
    chatHistory: cHistory,
    userPrompt: cPrompt,
  });
}
|
||||||
|
|
||||||
|
// Cannonball prompting: aka where we shoot a proportionally big cannonball through a proportional large prompt
|
||||||
|
// Nobody should be sending prompts this big, but there is no reason we shouldn't allow it if results are good even by doing it.
|
||||||
|
/**
 * Cannonball prompting: middle-truncates an input string (measured in tokens)
 * down to roughly `targetTokenSize` by blasting a hole through its center,
 * keeping the head and tail and inserting a truncation marker between them.
 *
 * @param {object} args
 * @param {string} args.input - The text to truncate. Returned unchanged if empty or already under target.
 * @param {number} args.targetTokenSize - Desired token budget; 0 disables truncation.
 * @param {object|null} args.tiktokenInstance - Optional TokenManager to reuse; a default one is built otherwise.
 * @param {string|null} args.ellipsesStr - Optional marker inserted at the cut point.
 * @returns {string} The (possibly) truncated text.
 */
function cannonball({
  input = "",
  targetTokenSize = 0,
  tiktokenInstance = null,
  ellipsesStr = null,
}) {
  if (!input || !targetTokenSize) return input;
  const tokenManager = tiktokenInstance || new TokenManager();
  const truncText = ellipsesStr || "\n\n--prompt truncated for brevity--\n\n";
  const initialInputSize = tokenManager.countFromString(input);
  if (initialInputSize < targetTokenSize) return input;

  // delta is the number of tokens between where the prompt is in size
  // and where we ideally need to land.
  const delta = initialInputSize - targetTokenSize;
  const tokenChunks = tokenManager.tokensFromString(input);
  const middleIdx = Math.floor(tokenChunks.length / 2);
  const removeHalf = Math.round(delta / 2);

  // Middle-truncate going left and right of the midpoint. Clamp both slice
  // bounds: a negative end index in slice(0, end) counts from the array end
  // and would wrongly KEEP almost everything when delta/2 exceeds middleIdx.
  const leftChunks = tokenChunks.slice(0, Math.max(0, middleIdx - removeHalf));
  const rightChunks = tokenChunks.slice(
    Math.min(tokenChunks.length, middleIdx + removeHalf)
  );
  const truncatedText =
    tokenManager.bytesFromTokens(leftChunks) +
    truncText +
    tokenManager.bytesFromTokens(rightChunks);

  console.log(
    `Cannonball results ${initialInputSize} -> ${tokenManager.countFromString(
      truncatedText
    )} tokens.`
  );
  return truncatedText;
}
|
||||||
|
|
||||||
|
// Public API: prompt compression for chat-message-array models and
// string-completion models respectively.
module.exports = {
  messageArrayCompressor,
  messageStringCompressor,
};
|
||||||
57
server/utils/helpers/tiktoken.js
Normal file
57
server/utils/helpers/tiktoken.js
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
const { getEncodingNameForModel, getEncoding } = require("js-tiktoken");
|
||||||
|
|
||||||
|
/**
 * Thin wrapper around js-tiktoken that resolves the correct encoder for a
 * model name and exposes token counting/encoding/decoding helpers.
 */
class TokenManager {
  constructor(model = "gpt-3.5-turbo") {
    this.model = model;
    this.encoderName = this.getEncodingFromModel(model);
    this.encoder = getEncoding(this.encoderName);
    this.buffer = 50;
  }

  // Resolve the tiktoken encoding name for a model; unknown models fall back
  // to the cl100k_base encoding.
  getEncodingFromModel(model) {
    try {
      return getEncodingNameForModel(model);
    } catch {
      return "cl100k_base";
    }
  }

  // Encode a string into its token array.
  tokensFromString(input = "") {
    return this.encoder.encode(input);
  }

  // Decode a token array back into text.
  bytesFromTokens(tokens = []) {
    return this.encoder.decode(tokens);
  }

  // Count the tokens in a string.
  countFromString(input = "") {
    return this.encoder.encode(input).length;
  }

  // Token stats for either a raw string or an OpenAI-style message array.
  statsFrom(input) {
    if (typeof input === "string") return this.countFromString(input);

    // What is going on here?
    // https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb Item 6.
    // Exact counts are not possible for chat arrays, so we estimate: a fixed
    // per-message overhead plus content tokens plus an empirically measured
    // correction factor (repeated testing showed we were consistently 2 off,
    // so as of Nov 1, 2023 the additional factor changed from 3 to 5).
    if (Array.isArray(input)) {
      const diffCoefficient = 5;
      let tokensFromContent = 0;
      for (const message of input) {
        tokensFromContent += this.countFromString(message.content);
      }
      return input.length * 3 + tokensFromContent + diffCoefficient;
    }

    throw new Error("Not a supported tokenized format.");
  }
}
|
||||||
|
|
||||||
|
// Public API: token counting/encoding utilities built on js-tiktoken.
module.exports = {
  TokenManager,
};
|
||||||
@ -17,6 +17,10 @@ const KEY_MAPPING = {
|
|||||||
envKey: "AZURE_OPENAI_ENDPOINT",
|
envKey: "AZURE_OPENAI_ENDPOINT",
|
||||||
checks: [isNotEmpty, validAzureURL],
|
checks: [isNotEmpty, validAzureURL],
|
||||||
},
|
},
|
||||||
|
AzureOpenAiTokenLimit: {
|
||||||
|
envKey: "AZURE_OPENAI_TOKEN_LIMIT",
|
||||||
|
checks: [validOpenAiTokenLimit],
|
||||||
|
},
|
||||||
AzureOpenAiKey: {
|
AzureOpenAiKey: {
|
||||||
envKey: "AZURE_OPENAI_KEY",
|
envKey: "AZURE_OPENAI_KEY",
|
||||||
checks: [isNotEmpty],
|
checks: [isNotEmpty],
|
||||||
@ -137,7 +141,7 @@ function supportedLLM(input = "") {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function validAnthropicModel(input = "") {
|
function validAnthropicModel(input = "") {
|
||||||
const validModels = ["claude-2"];
|
const validModels = ["claude-2", "claude-instant-1"];
|
||||||
return validModels.includes(input)
|
return validModels.includes(input)
|
||||||
? null
|
? null
|
||||||
: `Invalid Model type. Must be one of ${validModels.join(", ")}.`;
|
: `Invalid Model type. Must be one of ${validModels.join(", ")}.`;
|
||||||
@ -174,6 +178,14 @@ function validAzureURL(input = "") {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Validates that an Azure OpenAI token-limit setting is one of the supported
 * context-window sizes.
 *
 * @param {string} input - The raw env value to validate.
 * @returns {string|null} null when valid, otherwise an error message string.
 */
function validOpenAiTokenLimit(input = "") {
  const tokenLimit = Number(input);
  // Number.isNaN avoids the coercing global isNaN (which would also treat
  // non-numeric objects oddly); Number() has already done the conversion.
  if (Number.isNaN(tokenLimit)) return "Token limit is not a number";
  if (![4_096, 8_192, 16_384, 32_768].includes(tokenLimit))
    return "Invalid OpenAI token limit.";
  return null;
}
|
||||||
|
|
||||||
function requiresForceMode(_, forceModeEnabled = false) {
|
function requiresForceMode(_, forceModeEnabled = false) {
|
||||||
return forceModeEnabled === true ? null : "Cannot set this setting.";
|
return forceModeEnabled === true ? null : "Cannot set this setting.";
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
|
|||||||
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||||
const { v4: uuidv4 } = require("uuid");
|
const { v4: uuidv4 } = require("uuid");
|
||||||
const { toChunks, getLLMProvider } = require("../../helpers");
|
const { toChunks, getLLMProvider } = require("../../helpers");
|
||||||
const { chatPrompt } = require("../../chats");
|
|
||||||
|
|
||||||
const Chroma = {
|
const Chroma = {
|
||||||
name: "Chroma",
|
name: "Chroma",
|
||||||
@ -253,92 +252,35 @@ const Chroma = {
|
|||||||
await DocumentVectors.deleteIds(indexes);
|
await DocumentVectors.deleteIds(indexes);
|
||||||
return true;
|
return true;
|
||||||
},
|
},
|
||||||
query: async function (reqBody = {}) {
|
performSimilaritySearch: async function ({
|
||||||
const { namespace = null, input, workspace = {} } = reqBody;
|
namespace = null,
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
input = "",
|
||||||
|
LLMConnector = null,
|
||||||
|
}) {
|
||||||
|
if (!namespace || !input || !LLMConnector)
|
||||||
|
throw new Error("Invalid request to performSimilaritySearch.");
|
||||||
|
|
||||||
const { client } = await this.connect();
|
const { client } = await this.connect();
|
||||||
if (!(await this.namespaceExists(client, namespace))) {
|
if (!(await this.namespaceExists(client, namespace))) {
|
||||||
return {
|
return {
|
||||||
response: null,
|
contextTexts: [],
|
||||||
sources: [],
|
sources: [],
|
||||||
message: "Invalid query - no documents found for workspace!",
|
message: "Invalid query - no documents found for workspace!",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
const queryVector = await LLMConnector.embedTextInput(input);
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
||||||
client,
|
client,
|
||||||
namespace,
|
namespace,
|
||||||
queryVector
|
queryVector
|
||||||
);
|
);
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
// When we roll out own response we have separate metadata and texts,
|
|
||||||
// so for source collection we need to combine them.
|
|
||||||
const sources = sourceDocuments.map((metadata, i) => {
|
const sources = sourceDocuments.map((metadata, i) => {
|
||||||
return { metadata: { ...metadata, text: contextTexts[i] } };
|
return { metadata: { ...metadata, text: contextTexts[i] } };
|
||||||
});
|
});
|
||||||
return {
|
return {
|
||||||
response: responseText,
|
contextTexts,
|
||||||
sources: this.curateSources(sources),
|
|
||||||
message: false,
|
|
||||||
};
|
|
||||||
},
|
|
||||||
// This implementation of chat uses the chat history and modifies the system prompt at execution
|
|
||||||
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
|
|
||||||
// because then multi-user support will have all conversations mutating the base vector collection to which then
|
|
||||||
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
|
|
||||||
chat: async function (reqBody = {}) {
|
|
||||||
const {
|
|
||||||
namespace = null,
|
|
||||||
input,
|
|
||||||
workspace = {},
|
|
||||||
chatHistory = [],
|
|
||||||
} = reqBody;
|
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
|
||||||
|
|
||||||
const { client } = await this.connect();
|
|
||||||
if (!(await this.namespaceExists(client, namespace))) {
|
|
||||||
return {
|
|
||||||
response: null,
|
|
||||||
sources: [],
|
|
||||||
message: "Invalid query - no documents found for workspace!",
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
|
||||||
client,
|
|
||||||
namespace,
|
|
||||||
queryVector
|
|
||||||
);
|
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
chatHistory,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
// When we roll out own response we have separate metadata and texts,
|
|
||||||
// so for source collection we need to combine them.
|
|
||||||
const sources = sourceDocuments.map((metadata, i) => {
|
|
||||||
return { metadata: { ...metadata, text: contextTexts[i] } };
|
|
||||||
});
|
|
||||||
return {
|
|
||||||
response: responseText,
|
|
||||||
sources: this.curateSources(sources),
|
sources: this.curateSources(sources),
|
||||||
message: false,
|
message: false,
|
||||||
};
|
};
|
||||||
|
|||||||
@ -4,7 +4,6 @@ const { OpenAIEmbeddings } = require("langchain/embeddings/openai");
|
|||||||
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
|
const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
|
||||||
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||||
const { v4: uuidv4 } = require("uuid");
|
const { v4: uuidv4 } = require("uuid");
|
||||||
const { chatPrompt } = require("../../chats");
|
|
||||||
|
|
||||||
const LanceDb = {
|
const LanceDb = {
|
||||||
uri: `${
|
uri: `${
|
||||||
@ -226,83 +225,36 @@ const LanceDb = {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
query: async function (reqBody = {}) {
|
performSimilaritySearch: async function ({
|
||||||
const { namespace = null, input, workspace = {} } = reqBody;
|
namespace = null,
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
input = "",
|
||||||
|
LLMConnector = null,
|
||||||
|
}) {
|
||||||
|
if (!namespace || !input || !LLMConnector)
|
||||||
|
throw new Error("Invalid request to performSimilaritySearch.");
|
||||||
|
|
||||||
const { client } = await this.connect();
|
const { client } = await this.connect();
|
||||||
if (!(await this.namespaceExists(client, namespace))) {
|
if (!(await this.namespaceExists(client, namespace))) {
|
||||||
return {
|
return {
|
||||||
response: null,
|
contextTexts: [],
|
||||||
sources: [],
|
sources: [],
|
||||||
message: "Invalid query - no documents found for workspace!",
|
message: "Invalid query - no documents found for workspace!",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
const queryVector = await LLMConnector.embedTextInput(input);
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
||||||
client,
|
client,
|
||||||
namespace,
|
namespace,
|
||||||
queryVector
|
queryVector
|
||||||
);
|
);
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
|
const sources = sourceDocuments.map((metadata, i) => {
|
||||||
|
return { metadata: { ...metadata, text: contextTexts[i] } };
|
||||||
|
});
|
||||||
return {
|
return {
|
||||||
response: responseText,
|
contextTexts,
|
||||||
sources: this.curateSources(sourceDocuments),
|
sources: this.curateSources(sources),
|
||||||
message: false,
|
|
||||||
};
|
|
||||||
},
|
|
||||||
// This implementation of chat uses the chat history and modifies the system prompt at execution
|
|
||||||
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
|
|
||||||
// because then multi-user support will have all conversations mutating the base vector collection to which then
|
|
||||||
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
|
|
||||||
chat: async function (reqBody = {}) {
|
|
||||||
const {
|
|
||||||
namespace = null,
|
|
||||||
input,
|
|
||||||
workspace = {},
|
|
||||||
chatHistory = [],
|
|
||||||
} = reqBody;
|
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
|
||||||
|
|
||||||
const { client } = await this.connect();
|
|
||||||
if (!(await this.namespaceExists(client, namespace))) {
|
|
||||||
return {
|
|
||||||
response: null,
|
|
||||||
sources: [],
|
|
||||||
message: "Invalid query - no documents found for workspace!",
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
|
||||||
client,
|
|
||||||
namespace,
|
|
||||||
queryVector
|
|
||||||
);
|
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
chatHistory,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
response: responseText,
|
|
||||||
sources: this.curateSources(sourceDocuments),
|
|
||||||
message: false,
|
message: false,
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
@ -337,9 +289,13 @@ const LanceDb = {
|
|||||||
curateSources: function (sources = []) {
|
curateSources: function (sources = []) {
|
||||||
const documents = [];
|
const documents = [];
|
||||||
for (const source of sources) {
|
for (const source of sources) {
|
||||||
const { text, vector: _v, score: _s, ...metadata } = source;
|
const { text, vector: _v, score: _s, ...rest } = source;
|
||||||
|
const metadata = rest.hasOwnProperty("metadata") ? rest.metadata : rest;
|
||||||
if (Object.keys(metadata).length > 0) {
|
if (Object.keys(metadata).length > 0) {
|
||||||
documents.push({ ...metadata, text });
|
documents.push({
|
||||||
|
...metadata,
|
||||||
|
...(text ? { text } : {}),
|
||||||
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
|
|||||||
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||||
const { v4: uuidv4 } = require("uuid");
|
const { v4: uuidv4 } = require("uuid");
|
||||||
const { toChunks, getLLMProvider } = require("../../helpers");
|
const { toChunks, getLLMProvider } = require("../../helpers");
|
||||||
const { chatPrompt } = require("../../chats");
|
|
||||||
|
|
||||||
const Pinecone = {
|
const Pinecone = {
|
||||||
name: "Pinecone",
|
name: "Pinecone",
|
||||||
@ -222,80 +221,33 @@ const Pinecone = {
|
|||||||
message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`,
|
message: `Namespace ${namespace} was deleted along with ${details.vectorCount} vectors.`,
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
query: async function (reqBody = {}) {
|
performSimilaritySearch: async function ({
|
||||||
const { namespace = null, input, workspace = {} } = reqBody;
|
namespace = null,
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
input = "",
|
||||||
|
LLMConnector = null,
|
||||||
const { pineconeIndex } = await this.connect();
|
}) {
|
||||||
if (!(await this.namespaceExists(pineconeIndex, namespace))) {
|
if (!namespace || !input || !LLMConnector)
|
||||||
return {
|
throw new Error("Invalid request to performSimilaritySearch.");
|
||||||
response: null,
|
|
||||||
sources: [],
|
|
||||||
message: "Invalid query - no documents found for workspace!",
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
|
||||||
pineconeIndex,
|
|
||||||
namespace,
|
|
||||||
queryVector
|
|
||||||
);
|
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
response: responseText,
|
|
||||||
sources: this.curateSources(sourceDocuments),
|
|
||||||
message: false,
|
|
||||||
};
|
|
||||||
},
|
|
||||||
// This implementation of chat uses the chat history and modifies the system prompt at execution
|
|
||||||
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
|
|
||||||
// because then multi-user support will have all conversations mutating the base vector collection to which then
|
|
||||||
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
|
|
||||||
chat: async function (reqBody = {}) {
|
|
||||||
const {
|
|
||||||
namespace = null,
|
|
||||||
input,
|
|
||||||
workspace = {},
|
|
||||||
chatHistory = [],
|
|
||||||
} = reqBody;
|
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
|
||||||
|
|
||||||
const { pineconeIndex } = await this.connect();
|
const { pineconeIndex } = await this.connect();
|
||||||
if (!(await this.namespaceExists(pineconeIndex, namespace)))
|
if (!(await this.namespaceExists(pineconeIndex, namespace)))
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"Invalid namespace - has it been collected and seeded yet?"
|
"Invalid namespace - has it been collected and populated yet?"
|
||||||
);
|
);
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
const queryVector = await LLMConnector.embedTextInput(input);
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
||||||
pineconeIndex,
|
pineconeIndex,
|
||||||
namespace,
|
namespace,
|
||||||
queryVector
|
queryVector
|
||||||
);
|
);
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
chatHistory,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
|
const sources = sourceDocuments.map((metadata, i) => {
|
||||||
|
return { ...metadata, text: contextTexts[i] };
|
||||||
|
});
|
||||||
return {
|
return {
|
||||||
response: responseText,
|
contextTexts,
|
||||||
sources: this.curateSources(sourceDocuments),
|
sources: this.curateSources(sources),
|
||||||
message: false,
|
message: false,
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
|
|||||||
@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
|
|||||||
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||||
const { v4: uuidv4 } = require("uuid");
|
const { v4: uuidv4 } = require("uuid");
|
||||||
const { toChunks, getLLMProvider } = require("../../helpers");
|
const { toChunks, getLLMProvider } = require("../../helpers");
|
||||||
const { chatPrompt } = require("../../chats");
|
|
||||||
|
|
||||||
const QDrant = {
|
const QDrant = {
|
||||||
name: "QDrant",
|
name: "QDrant",
|
||||||
@ -262,83 +261,36 @@ const QDrant = {
|
|||||||
await DocumentVectors.deleteIds(indexes);
|
await DocumentVectors.deleteIds(indexes);
|
||||||
return true;
|
return true;
|
||||||
},
|
},
|
||||||
query: async function (reqBody = {}) {
|
performSimilaritySearch: async function ({
|
||||||
const { namespace = null, input, workspace = {} } = reqBody;
|
namespace = null,
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
input = "",
|
||||||
|
LLMConnector = null,
|
||||||
|
}) {
|
||||||
|
if (!namespace || !input || !LLMConnector)
|
||||||
|
throw new Error("Invalid request to performSimilaritySearch.");
|
||||||
|
|
||||||
const { client } = await this.connect();
|
const { client } = await this.connect();
|
||||||
if (!(await this.namespaceExists(client, namespace))) {
|
if (!(await this.namespaceExists(client, namespace))) {
|
||||||
return {
|
return {
|
||||||
response: null,
|
contextTexts: [],
|
||||||
sources: [],
|
sources: [],
|
||||||
message: "Invalid query - no documents found for workspace!",
|
message: "Invalid query - no documents found for workspace!",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
const queryVector = await LLMConnector.embedTextInput(input);
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
||||||
client,
|
client,
|
||||||
namespace,
|
namespace,
|
||||||
queryVector
|
queryVector
|
||||||
);
|
);
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
|
const sources = sourceDocuments.map((metadata, i) => {
|
||||||
|
return { ...metadata, text: contextTexts[i] };
|
||||||
|
});
|
||||||
return {
|
return {
|
||||||
response: responseText,
|
contextTexts,
|
||||||
sources: this.curateSources(sourceDocuments),
|
sources: this.curateSources(sources),
|
||||||
message: false,
|
|
||||||
};
|
|
||||||
},
|
|
||||||
// This implementation of chat uses the chat history and modifies the system prompt at execution
|
|
||||||
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
|
|
||||||
// because then multi-user support will have all conversations mutating the base vector collection to which then
|
|
||||||
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
|
|
||||||
chat: async function (reqBody = {}) {
|
|
||||||
const {
|
|
||||||
namespace = null,
|
|
||||||
input,
|
|
||||||
workspace = {},
|
|
||||||
chatHistory = [],
|
|
||||||
} = reqBody;
|
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
|
||||||
|
|
||||||
const { client } = await this.connect();
|
|
||||||
if (!(await this.namespaceExists(client, namespace))) {
|
|
||||||
return {
|
|
||||||
response: null,
|
|
||||||
sources: [],
|
|
||||||
message: "Invalid query - no documents found for workspace!",
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
|
||||||
client,
|
|
||||||
namespace,
|
|
||||||
queryVector
|
|
||||||
);
|
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
chatHistory,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
response: responseText,
|
|
||||||
sources: this.curateSources(sourceDocuments),
|
|
||||||
message: false,
|
message: false,
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
@ -377,8 +329,11 @@ const QDrant = {
|
|||||||
const documents = [];
|
const documents = [];
|
||||||
for (const source of sources) {
|
for (const source of sources) {
|
||||||
if (Object.keys(source).length > 0) {
|
if (Object.keys(source).length > 0) {
|
||||||
|
const metadata = source.hasOwnProperty("metadata")
|
||||||
|
? source.metadata
|
||||||
|
: source;
|
||||||
documents.push({
|
documents.push({
|
||||||
...source,
|
...metadata,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -3,7 +3,6 @@ const { RecursiveCharacterTextSplitter } = require("langchain/text_splitter");
|
|||||||
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
const { storeVectorResult, cachedVectorInformation } = require("../../files");
|
||||||
const { v4: uuidv4 } = require("uuid");
|
const { v4: uuidv4 } = require("uuid");
|
||||||
const { toChunks, getLLMProvider } = require("../../helpers");
|
const { toChunks, getLLMProvider } = require("../../helpers");
|
||||||
const { chatPrompt } = require("../../chats");
|
|
||||||
const { camelCase } = require("../../helpers/camelcase");
|
const { camelCase } = require("../../helpers/camelcase");
|
||||||
|
|
||||||
const Weaviate = {
|
const Weaviate = {
|
||||||
@ -333,83 +332,36 @@ const Weaviate = {
|
|||||||
await DocumentVectors.deleteIds(indexes);
|
await DocumentVectors.deleteIds(indexes);
|
||||||
return true;
|
return true;
|
||||||
},
|
},
|
||||||
query: async function (reqBody = {}) {
|
performSimilaritySearch: async function ({
|
||||||
const { namespace = null, input, workspace = {} } = reqBody;
|
namespace = null,
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
input = "",
|
||||||
|
LLMConnector = null,
|
||||||
|
}) {
|
||||||
|
if (!namespace || !input || !LLMConnector)
|
||||||
|
throw new Error("Invalid request to performSimilaritySearch.");
|
||||||
|
|
||||||
const { client } = await this.connect();
|
const { client } = await this.connect();
|
||||||
if (!(await this.namespaceExists(client, namespace))) {
|
if (!(await this.namespaceExists(client, namespace))) {
|
||||||
return {
|
return {
|
||||||
response: null,
|
contextTexts: [],
|
||||||
sources: [],
|
sources: [],
|
||||||
message: "Invalid query - no documents found for workspace!",
|
message: "Invalid query - no documents found for workspace!",
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
const queryVector = await LLMConnector.embedTextInput(input);
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
||||||
client,
|
client,
|
||||||
namespace,
|
namespace,
|
||||||
queryVector
|
queryVector
|
||||||
);
|
);
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
|
const sources = sourceDocuments.map((metadata, i) => {
|
||||||
|
return { ...metadata, text: contextTexts[i] };
|
||||||
|
});
|
||||||
return {
|
return {
|
||||||
response: responseText,
|
contextTexts,
|
||||||
sources: this.curateSources(sourceDocuments),
|
sources: this.curateSources(sources),
|
||||||
message: false,
|
|
||||||
};
|
|
||||||
},
|
|
||||||
// This implementation of chat uses the chat history and modifies the system prompt at execution
|
|
||||||
// this is improved over the regular langchain implementation so that chats do not directly modify embeddings
|
|
||||||
// because then multi-user support will have all conversations mutating the base vector collection to which then
|
|
||||||
// the only solution is replicating entire vector databases per user - which will very quickly consume space on VectorDbs
|
|
||||||
chat: async function (reqBody = {}) {
|
|
||||||
const {
|
|
||||||
namespace = null,
|
|
||||||
input,
|
|
||||||
workspace = {},
|
|
||||||
chatHistory = [],
|
|
||||||
} = reqBody;
|
|
||||||
if (!namespace || !input) throw new Error("Invalid request body");
|
|
||||||
|
|
||||||
const { client } = await this.connect();
|
|
||||||
if (!(await this.namespaceExists(client, namespace))) {
|
|
||||||
return {
|
|
||||||
response: null,
|
|
||||||
sources: [],
|
|
||||||
message: "Invalid query - no documents found for workspace!",
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
const LLMConnector = getLLMProvider();
|
|
||||||
const queryVector = await LLMConnector.embedTextInput(input);
|
|
||||||
const { contextTexts, sourceDocuments } = await this.similarityResponse(
|
|
||||||
client,
|
|
||||||
namespace,
|
|
||||||
queryVector
|
|
||||||
);
|
|
||||||
const memory = LLMConnector.constructPrompt({
|
|
||||||
systemPrompt: chatPrompt(workspace),
|
|
||||||
contextTexts: contextTexts,
|
|
||||||
userPrompt: input,
|
|
||||||
chatHistory,
|
|
||||||
});
|
|
||||||
const responseText = await LLMConnector.getChatCompletion(memory, {
|
|
||||||
temperature: workspace?.openAiTemp ?? 0.7,
|
|
||||||
});
|
|
||||||
|
|
||||||
return {
|
|
||||||
response: responseText,
|
|
||||||
sources: this.curateSources(sourceDocuments),
|
|
||||||
message: false,
|
message: false,
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
@ -445,7 +397,10 @@ const Weaviate = {
|
|||||||
const documents = [];
|
const documents = [];
|
||||||
for (const source of sources) {
|
for (const source of sources) {
|
||||||
if (Object.keys(source).length > 0) {
|
if (Object.keys(source).length > 0) {
|
||||||
documents.push(source);
|
const metadata = source.hasOwnProperty("metadata")
|
||||||
|
? source.metadata
|
||||||
|
: source;
|
||||||
|
documents.push({ ...metadata });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1556,7 +1556,7 @@ isomorphic-fetch@^3.0.0:
|
|||||||
node-fetch "^2.6.1"
|
node-fetch "^2.6.1"
|
||||||
whatwg-fetch "^3.4.1"
|
whatwg-fetch "^3.4.1"
|
||||||
|
|
||||||
js-tiktoken@^1.0.6:
|
js-tiktoken@^1.0.6, js-tiktoken@^1.0.7:
|
||||||
version "1.0.7"
|
version "1.0.7"
|
||||||
resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.7.tgz#56933fcd2093e8304060dfde3071bda91812e6f5"
|
resolved "https://registry.yarnpkg.com/js-tiktoken/-/js-tiktoken-1.0.7.tgz#56933fcd2093e8304060dfde3071bda91812e6f5"
|
||||||
integrity sha512-biba8u/clw7iesNEWLOLwrNGoBP2lA+hTaBLs/D45pJdUPFXyxD6nhcDVtADChghv4GgyAiMKYMiRx7x6h7Biw==
|
integrity sha512-biba8u/clw7iesNEWLOLwrNGoBP2lA+hTaBLs/D45pJdUPFXyxD6nhcDVtADChghv4GgyAiMKYMiRx7x6h7Biw==
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user