Added metadata parameter to document/upload, document/upload/{folderName}, and document/upload-link (#4342)

* Added the ability to pass in metadata to the /document/upload/{folderName} endpoint
* Added the ability to pass in metadata to the /document/upload-link endpoint
* feat: added metadata to document/upload api endpoint
* simplify optional metadata in document dev api endpoints
* lint
* patch handling of metadata in dev api
* Linting, small comments

---------

Co-authored-by: jstawskigmi <jstawski@getmyinterns.org>
Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2025-09-17 14:17:29 -04:00 · 2025-09-17 14:17:29 -04:00 · b8d4cc3454
commit b8d4cc3454
parent 9841deb513
16 changed files with 229 additions and 103 deletions
--- a/collector/index.js
+++ b/collector/index.js
@ -32,7 +32,7 @@ app.post(
  "/process",
  [verifyPayloadIntegrity],
  async function (request, response) {
-    const { filename, options = {} } = reqBody(request);
+    const { filename, options = {}, metadata = {} } = reqBody(request);
    try {
      const targetFilename = path
        .normalize(filename)
@ -41,7 +41,7 @@ app.post(
        success,
        reason,
        documents = [],
-      } = await processSingleFile(targetFilename, options);
+      } = await processSingleFile(targetFilename, options, metadata);
      response
        .status(200)
        .json({ filename: targetFilename, success, reason, documents });
@ -95,13 +95,13 @@ app.post(
  "/process-link",
  [verifyPayloadIntegrity],
  async function (request, response) {
-    const { link, scraperHeaders = {} } = reqBody(request);
+    const { link, scraperHeaders = {}, metadata = {} } = reqBody(request);
    try {
      const {
        success,
        reason,
        documents = [],
-      } = await processLink(link, scraperHeaders);
+      } = await processLink(link, scraperHeaders, metadata);
      response.status(200).json({ url: link, success, reason, documents });
    } catch (e) {
      console.error(e);
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@ -13,6 +13,7 @@ const { default: slugify } = require("slugify");
 * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
 * @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
 * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
+ * @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
 * @returns {Promise<Object>} - The content of the page
 */
 async function scrapeGenericUrl({
@ -20,6 +21,7 @@ async function scrapeGenericUrl({
  captureAs = "text",
  processAsDocument = true,
  scraperHeaders = {},
+  metadata = {},
 }) {
  console.log(`-- Working URL ${link} => (${captureAs}) --`);
  const content = await getPageContent({
@ -51,10 +53,10 @@ async function scrapeGenericUrl({
  const data = {
    id: v4(),
    url: "file://" + slugify(filename) + ".html",
-    title: slugify(filename) + ".html",
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "URL link uploaded by the user.",
+    title: metadata.title || slugify(filename) + ".html",
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "URL link uploaded by the user.",
    chunkSource: `link://${link}`,
    published: new Date().toLocaleString(),
    wordCount: content.split(" ").length,
--- a/collector/processLink/index.js
+++ b/collector/processLink/index.js
@ -6,15 +6,17 @@ const { scrapeGenericUrl } = require("./convert/generic");
 * so it can be used for embedding later.
 * @param {string} link - The link to process
 * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link
+ * @param {Object} metadata - Optional metadata to attach to the document
 * @returns {Promise<{success: boolean, content: string}>} - Response from collector
 */
-async function processLink(link, scraperHeaders = {}) {
+async function processLink(link, scraperHeaders = {}, metadata = {}) {
  if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
  return await scrapeGenericUrl({
    link,
    captureAs: "text",
    processAsDocument: true,
    scraperHeaders,
+    metadata,
  });
 }

--- a/collector/processSingleFile/convert/asAudio.js
+++ b/collector/processSingleFile/convert/asAudio.js
@ -14,7 +14,12 @@ const WHISPER_PROVIDERS = {
  local: LocalWhisper,
 };

-async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
+async function asAudio({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
  const WhisperProvider = WHISPER_PROVIDERS.hasOwnProperty(
    options?.whisperProvider
  )
@ -48,11 +53,11 @@ async function asAudio({ fullFilePath = "", filename = "", options = {} }) {
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "audio file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
--- a/collector/processSingleFile/convert/asDocx.js
+++ b/collector/processSingleFile/convert/asDocx.js
@ -8,7 +8,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");

-async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
+async function asDocX({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
  const loader = new DocxLoader(fullFilePath);

  console.log(`-- Working ${filename} --`);
@ -34,11 +39,11 @@ async function asDocX({ fullFilePath = "", filename = "", options = {} }) {
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "docx file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
--- a/collector/processSingleFile/convert/asEPub.js
+++ b/collector/processSingleFile/convert/asEPub.js
@ -8,7 +8,12 @@ const {
 } = require("../../utils/files");
 const { default: slugify } = require("slugify");

-async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
+async function asEPub({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
  let content = "";
  try {
    const loader = new EPubLoader(fullFilePath, { splitChapters: false });
@ -32,11 +37,11 @@ async function asEPub({ fullFilePath = "", filename = "", options = {} }) {
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a epub file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "epub file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
--- a/collector/processSingleFile/convert/asImage.js
+++ b/collector/processSingleFile/convert/asImage.js
@ -8,7 +8,12 @@ const {
 const OCRLoader = require("../../utils/OCRLoader");
 const { default: slugify } = require("slugify");

-async function asImage({ fullFilePath = "", filename = "", options = {} }) {
+async function asImage({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
  let content = await new OCRLoader({
    targetLanguages: options?.ocr?.langList,
  }).ocrImage(fullFilePath);
@ -27,11 +32,11 @@ async function asImage({ fullFilePath = "", filename = "", options = {} }) {
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a text file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "image file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
--- a/collector/processSingleFile/convert/asMbox.js
+++ b/collector/processSingleFile/convert/asMbox.js
@ -9,7 +9,12 @@ const {
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");

-async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
+async function asMbox({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
  console.log(`-- Working ${filename} --`);

  const mails = await mboxParser(fs.createReadStream(fullFilePath))
@ -43,13 +48,16 @@ async function asMbox({ fullFilePath = "", filename = "", options = {} }) {
    const data = {
      id: v4(),
      url: "file://" + fullFilePath,
-      title: mail?.subject
-        ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
-        : `msg_${item}-${filename}`,
-      docAuthor: mail?.from?.text,
-      description: "No description found.",
-      docSource: "Mbox message file uploaded by the user.",
-      chunkSource: "",
+      title:
+        metadata.title ||
+        (mail?.subject
+          ? slugify(mail?.subject?.replace(".", "")) + ".mbox"
+          : `msg_${item}-${filename}`),
+      docAuthor: metadata.docAuthor || mail?.from?.text,
+      description: metadata.description || "No description found.",
+      docSource:
+        metadata.docSource || "Mbox message file uploaded by the user.",
+      chunkSource: metadata.chunkSource || "",
      published: createdDate(fullFilePath),
      wordCount: content.split(" ").length,
      pageContent: content,
--- a/collector/processSingleFile/convert/asOfficeMime.js
+++ b/collector/processSingleFile/convert/asOfficeMime.js
@ -12,6 +12,7 @@ async function asOfficeMime({
  fullFilePath = "",
  filename = "",
  options = {},
+  metadata = {},
 }) {
  console.log(`-- Working ${filename} --`);
  let content = "";
@ -34,11 +35,11 @@ async function asOfficeMime({
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "no author found",
-    description: "No description found.",
-    docSource: "Office file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "no author found",
+    description: metadata.description || "No description found.",
+    docSource: metadata.docSource || "Office file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
--- a/collector/processSingleFile/convert/asPDF/index.js
+++ b/collector/processSingleFile/convert/asPDF/index.js
@ -9,7 +9,12 @@ const { default: slugify } = require("slugify");
 const PDFLoader = require("./PDFLoader");
 const OCRLoader = require("../../../utils/OCRLoader");

-async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
+async function asPdf({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
  const pdfLoader = new PDFLoader(fullFilePath, {
    splitPages: true,
  });
@ -51,11 +56,17 @@ async function asPdf({ fullFilePath = "", filename = "", options = {} }) {
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: docs[0]?.metadata?.pdf?.info?.Creator || "no author found",
-    description: docs[0]?.metadata?.pdf?.info?.Title || "No description found.",
-    docSource: "pdf file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor:
+      metadata.docAuthor ||
+      docs[0]?.metadata?.pdf?.info?.Creator ||
+      "no author found",
+    description:
+      metadata.description ||
+      docs[0]?.metadata?.pdf?.info?.Title ||
+      "No description found.",
+    docSource: metadata.docSource || "pdf file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
--- a/collector/processSingleFile/convert/asTxt.js
+++ b/collector/processSingleFile/convert/asTxt.js
@ -8,7 +8,12 @@ const {
 } = require("../../utils/files");
 const { default: slugify } = require("slugify");

-async function asTxt({ fullFilePath = "", filename = "", options = {} }) {
+async function asTxt({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
  let content = "";
  try {
    content = fs.readFileSync(fullFilePath, "utf8");
@ -30,11 +35,11 @@ async function asTxt({ fullFilePath = "", filename = "", options = {} }) {
  const data = {
    id: v4(),
    url: "file://" + fullFilePath,
-    title: filename,
-    docAuthor: "Unknown", // TODO: Find a better author
-    description: "Unknown", // TODO: Find a better description
-    docSource: "a text file uploaded by the user.",
-    chunkSource: "",
+    title: metadata.title || filename,
+    docAuthor: metadata.docAuthor || "Unknown",
+    description: metadata.description || "Unknown",
+    docSource: metadata.docSource || "a text file uploaded by the user.",
+    chunkSource: metadata.chunkSource || "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
--- a/collector/processSingleFile/convert/asXlsx.js
+++ b/collector/processSingleFile/convert/asXlsx.js
@ -27,7 +27,12 @@ function convertToCSV(data) {
    .join("\n");
 }

-async function asXlsx({ fullFilePath = "", filename = "", options = {} }) {
+async function asXlsx({
+  fullFilePath = "",
+  filename = "",
+  options = {},
+  metadata = {},
+}) {
  const documents = [];
  const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, {
    lower: true,
@ -56,11 +61,12 @@ async function asXlsx({ fullFilePath = "", filename = "", options = {} }) {
        const sheetData = {
          id: v4(),
          url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`,
-          title: `${filename} - Sheet:${name}`,
-          docAuthor: "Unknown",
-          description: `Spreadsheet data from sheet: ${name}`,
-          docSource: "an xlsx file uploaded by the user.",
-          chunkSource: "",
+          title: metadata.title || `${filename} - Sheet:${name}`,
+          docAuthor: metadata.docAuthor || "Unknown",
+          description:
+            metadata.description || `Spreadsheet data from sheet: ${name}`,
+          docSource: metadata.docSource || "an xlsx file uploaded by the user.",
+          chunkSource: metadata.chunkSource || "",
          published: createdDate(fullFilePath),
          wordCount: content.split(/\s+/).length,
          pageContent: content,
--- a/collector/processSingleFile/index.js
+++ b/collector/processSingleFile/index.js
@ -12,7 +12,14 @@ const {
 } = require("../utils/files");
 const RESERVED_FILES = ["__HOTDIR__.md"];

-async function processSingleFile(targetFilename, options = {}) {
+/**
+ * Process a single file and return the documents
+ * @param {string} targetFilename - The filename to process
+ * @param {Object} options - The options for the file processing
+ * @param {Object} metadata - The metadata for the file processing
+ * @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing
+ */
+async function processSingleFile(targetFilename, options = {}, metadata = {}) {
  const fullFilePath = path.resolve(
    WATCH_DIRECTORY,
    normalizePath(targetFilename)
@ -70,6 +77,7 @@ async function processSingleFile(targetFilename, options = {}) {
    fullFilePath,
    filename: targetFilename,
    options,
+    metadata,
  });
 }

--- a/server/endpoints/api/document/index.js
+++ b/server/endpoints/api/document/index.js
@ -8,7 +8,7 @@ const {
  normalizePath,
  isWithin,
 } = require("../../../utils/files");
-const { reqBody } = require("../../../utils/http");
+const { reqBody, safeJsonParse } = require("../../../utils/http");
 const { EventLogs } = require("../../../models/eventLogs");
 const { CollectorApi } = require("../../../utils/collectorApi");
 const fs = require("fs");
@ -29,7 +29,7 @@ function apiDocumentEndpoints(app) {
    async (request, response) => {
      /*
    #swagger.tags = ['Documents']
-    #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding.'
+    #swagger.description = 'Upload a new file to AnythingLLM to be parsed and prepared for embedding, with optional metadata.'
    #swagger.requestBody = {
      description: 'File to be uploaded.',
      required: true,
@ -47,6 +47,11 @@ function apiDocumentEndpoints(app) {
              addToWorkspaces: {
                type: 'string',
                description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2',
+              },
+              metadata: {
+                type: 'object',
+                description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.',
+                example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' }
              }
            },
            required: ['file']
@ -91,7 +96,12 @@ function apiDocumentEndpoints(app) {
      try {
        const Collector = new CollectorApi();
        const { originalname } = request.file;
-        const { addToWorkspaces = "" } = reqBody(request);
+        const { addToWorkspaces = "", metadata: _metadata = {} } =
+          reqBody(request);
+        const metadata =
+          typeof _metadata === "string"
+            ? safeJsonParse(_metadata, {})
+            : _metadata;
        const processingOnline = await Collector.online();

        if (!processingOnline) {
@ -105,14 +115,16 @@ function apiDocumentEndpoints(app) {
          return;
        }

-        const { success, reason, documents } =
-          await Collector.processDocument(originalname);
+        const { success, reason, documents } = await Collector.processDocument(
+          originalname,
+          metadata
+        );
+
        if (!success) {
-          response
+          return response
            .status(500)
            .json({ success: false, error: reason, documents })
            .end();
-          return;
        }

        Collector.log(
@ -151,7 +163,7 @@ function apiDocumentEndpoints(app) {
        example: 'my-folder'
      }
      #swagger.requestBody = {
-        description: 'File to be uploaded.',
+        description: 'File to be uploaded, with optional metadata.',
        required: true,
        content: {
          "multipart/form-data": {
@ -167,6 +179,11 @@ function apiDocumentEndpoints(app) {
                addToWorkspaces: {
                  type: 'string',
                  description: 'comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2',
+                },
+                metadata: {
+                  type: 'object',
+                  description: 'Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.',
+                  example: { 'title': 'Custom Title', 'docAuthor': 'Author Name', 'description': 'A brief description', 'docSource': 'Source of the document' }
                }
              }
            }
@ -221,7 +238,13 @@ function apiDocumentEndpoints(app) {
      */
      try {
        const { originalname } = request.file;
-        const { addToWorkspaces = "" } = reqBody(request);
+        const { addToWorkspaces = "", metadata: _metadata = {} } =
+          reqBody(request);
+        const metadata =
+          typeof _metadata === "string"
+            ? safeJsonParse(_metadata, {})
+            : _metadata;
+
        let folder = request.params?.folderName || "custom-documents";
        folder = normalizePath(folder);
        const targetFolderPath = path.join(documentsPath, folder);
@ -236,25 +259,25 @@ function apiDocumentEndpoints(app) {
        const Collector = new CollectorApi();
        const processingOnline = await Collector.online();
        if (!processingOnline) {
-          response
+          return response
            .status(500)
            .json({
              success: false,
              error: `Document processing API is not online. Document ${originalname} will not be processed automatically.`,
            })
            .end();
-          return;
        }

-        // Process the uploaded document
-        const { success, reason, documents } =
-          await Collector.processDocument(originalname);
+        // Process the uploaded document with metadata
+        const { success, reason, documents } = await Collector.processDocument(
+          originalname,
+          metadata
+        );
        if (!success) {
-          response
+          return response
            .status(500)
            .json({ success: false, error: reason, documents })
            .end();
-          return;
        }

        // For each processed document, check if it is already in the desired folder.
@ -314,7 +337,7 @@ function apiDocumentEndpoints(app) {
    #swagger.tags = ['Documents']
    #swagger.description = 'Upload a valid URL for AnythingLLM to scrape and prepare for embedding. Optionally, specify a comma-separated list of workspace slugs to embed the document into post-upload.'
    #swagger.requestBody = {
-      description: 'Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload.',
+      description: 'Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload, and optional metadata.',
      required: true,
      content: {
          "application/json": {
@ -326,6 +349,12 @@ function apiDocumentEndpoints(app) {
                "scraperHeaders": {
                  "Authorization": "Bearer token123",
                  "My-Custom-Header": "value"
+                },
+                "metadata": {
+                  "title": "Custom Title",
+                  "docAuthor": "Author Name",
+                  "description": "A brief description",
+                  "docSource": "Source of the document"
                }
              }
            }
@ -373,30 +402,34 @@ function apiDocumentEndpoints(app) {
          link,
          addToWorkspaces = "",
          scraperHeaders = {},
+          metadata: _metadata = {},
        } = reqBody(request);
+        const metadata =
+          typeof _metadata === "string"
+            ? safeJsonParse(_metadata, {})
+            : _metadata;
        const processingOnline = await Collector.online();

        if (!processingOnline) {
-          response
+          return response
            .status(500)
            .json({
              success: false,
              error: `Document processing API is not online. Link ${link} will not be processed automatically.`,
            })
            .end();
-          return;
        }

        const { success, reason, documents } = await Collector.processLink(
          link,
-          scraperHeaders
+          scraperHeaders,
+          metadata
        );
        if (!success) {
-          response
+          return response
            .status(500)
            .json({ success: false, error: reason, documents })
            .end();
-          return;
        }

        Collector.log(
@ -488,20 +521,23 @@ function apiDocumentEndpoints(app) {
        const requiredMetadata = ["title"];
        const {
          textContent,
-          metadata = {},
+          metadata: _metadata = {},
          addToWorkspaces = "",
        } = reqBody(request);
+        const metadata =
+          typeof _metadata === "string"
+            ? safeJsonParse(_metadata, {})
+            : _metadata;
        const processingOnline = await Collector.online();

        if (!processingOnline) {
-          response
+          return response
            .status(500)
            .json({
              success: false,
              error: `Document processing API is not online. Request will not be processed.`,
            })
            .end();
-          return;
        }

        if (
@ -510,7 +546,7 @@ function apiDocumentEndpoints(app) {
              Object.keys(metadata).includes(reqKey) && !!metadata[reqKey]
          )
        ) {
-          response
+          return response
            .status(422)
            .json({
              success: false,
@ -519,18 +555,16 @@ function apiDocumentEndpoints(app) {
                .join(", ")}`,
            })
            .end();
-          return;
        }

        if (!textContent || textContent?.length === 0) {
-          response
+          return response
            .status(422)
            .json({
              success: false,
              error: `The 'textContent' key cannot have an empty value.`,
            })
            .end();
-          return;
        }

        const { success, reason, documents } = await Collector.processRawText(
@ -538,11 +572,10 @@ function apiDocumentEndpoints(app) {
          metadata
        );
        if (!success) {
-          response
+          return response
            .status(500)
            .json({ success: false, error: reason, documents })
            .end();
-          return;
        }

        Collector.log(
--- a/server/swagger/openapi.json
+++ b/server/swagger/openapi.json
@ -843,7 +843,7 @@
        "tags": [
          "Documents"
        ],
-        "description": "Upload a new file to AnythingLLM to be parsed and prepared for embedding.",
+        "description": "Upload a new file to AnythingLLM to be parsed and prepared for embedding, with optional metadata.",
        "parameters": [],
        "responses": {
          "200": {
@ -913,6 +913,16 @@
                  "addToWorkspaces": {
                    "type": "string",
                    "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2"
+                  },
+                  "metadata": {
+                    "type": "object",
+                    "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.",
+                    "example": {
+                      "title": "Custom Title",
+                      "docAuthor": "Author Name",
+                      "description": "A brief description",
+                      "docSource": "Source of the document"
+                    }
                  }
                }
              }
@ -1000,7 +1010,7 @@
          }
        },
        "requestBody": {
-          "description": "File to be uploaded.",
+          "description": "File to be uploaded, with optional metadata.",
          "required": true,
          "content": {
            "multipart/form-data": {
@ -1018,6 +1028,16 @@
                  "addToWorkspaces": {
                    "type": "string",
                    "description": "comma-separated text-string of workspace slugs to embed the document into post-upload. eg: workspace1,workspace2"
+                  },
+                  "metadata": {
+                    "type": "object",
+                    "description": "Key:Value pairs of metadata to attach to the document in JSON Object format. Only specific keys are allowed - see example.",
+                    "example": {
+                      "title": "Custom Title",
+                      "docAuthor": "Author Name",
+                      "description": "A brief description",
+                      "docSource": "Source of the document"
+                    }
                  }
                }
              }
@ -1084,7 +1104,7 @@
          }
        },
        "requestBody": {
-          "description": "Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload.",
+          "description": "Link of web address to be scraped and optionally a comma-separated list of workspace slugs to embed the document into post-upload, and optional metadata.",
          "required": true,
          "content": {
            "application/json": {
@ -1096,6 +1116,12 @@
                  "scraperHeaders": {
                    "Authorization": "Bearer token123",
                    "My-Custom-Header": "value"
+                  },
+                  "metadata": {
+                    "title": "Custom Title",
+                    "docAuthor": "Author Name",
+                    "description": "A brief description",
+                    "docSource": "Source of the document"
                  }
                }
              }
--- a/server/utils/collectorApi/index.js
+++ b/server/utils/collectorApi/index.js
@ -63,15 +63,17 @@ class CollectorApi {

  /**
   * Process a document
-   * - Will append the options to the request body
+   * - Will append the options and optional metadata to the request body
   * @param {string} filename - The filename of the document to process
+   * @param {Object} metadata - Optional metadata key:value pairs
   * @returns {Promise<Object>} - The response from the collector API
   */
-  async processDocument(filename = "") {
+  async processDocument(filename = "", metadata = {}) {
    if (!filename) return false;

    const data = JSON.stringify({
      filename,
+      metadata,
      options: this.#attachOptions(),
    });

@ -102,15 +104,17 @@ class CollectorApi {
   * - Will append the options to the request body
   * @param {string} link - The link to process
   * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply to the web-scraping request URL
+   * @param {{[key: string]: string}} metadata - Optional metadata to attach to the document
   * @returns {Promise<Object>} - The response from the collector API
   */
-  async processLink(link = "", scraperHeaders = {}) {
+  async processLink(link = "", scraperHeaders = {}, metadata = {}) {
    if (!link) return false;

    const data = JSON.stringify({
      link,
      scraperHeaders,
      options: this.#attachOptions(),
+      metadata: metadata,
    });

    return await fetch(`${this.endpoint}/process-link`, {
@ -139,7 +143,7 @@ class CollectorApi {
   * Process raw text as a document for the collector
   * - Will append the options to the request body
   * @param {string} textContent - The text to process
-   * @param {Object} metadata - The metadata to process
+   * @param {{[key: string]: string}} metadata - The metadata to process
   * @returns {Promise<Object>} - The response from the collector API
   */
  async processRawText(textContent = "", metadata = {}) {