Fetch, Parse, and Create Documents for Statically Hosted Files (#4398)
* Add capability to web scraping feature for document creation to download and parse statically hosted files * lint * Remove unneeded comment * Simplified process by using key of ACCEPTED_MIMES to validate the response content type, as a result unlocked all supported files * Add TODO comments for future implementation of asDoc.js to handle standard MS Word files in constants.js * Return captureAs argument to be exposed by scrapeGenericUrl and passed into getPageContent | Return explicit argument of captureAs into scrapeGenericUrl in processLink fn * Return debug log for scrapeGenericUrl * Change conditional to a guard clause. * Add error handling, validation, and JSDOC to getContentType helper fn * remove unneeded comments * Simplify URL validation by reusing module * Rename downloadFileToHotDir to downloadURIToFile and moved up to a global module | Add URL validation to downloadURIToFile * refactor * add support for webp remove unused imports --------- Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
004327264a
commit
f7b90571be
@ -1,10 +1,15 @@
|
||||
const { v4 } = require("uuid");
|
||||
const path = require("path");
|
||||
const {
|
||||
PuppeteerWebBaseLoader,
|
||||
} = require("langchain/document_loaders/web/puppeteer");
|
||||
const { writeToServerDocuments } = require("../../utils/files");
|
||||
const { tokenizeString } = require("../../utils/tokenizer");
|
||||
const { default: slugify } = require("slugify");
|
||||
const { getContentTypeFromURL, returnResult } = require("../helpers");
|
||||
const { processSingleFile } = require("../../processSingleFile");
|
||||
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
|
||||
const { ACCEPTED_MIMES } = require("../../utils/constants");
|
||||
const RuntimeSettings = require("../../utils/runtimeSettings");
|
||||
|
||||
/**
|
||||
@ -12,19 +17,86 @@ const RuntimeSettings = require("../../utils/runtimeSettings");
|
||||
* @param {Object} config - The configuration object
|
||||
* @param {string} config.link - The URL to scrape
|
||||
* @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
|
||||
* @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
|
||||
* @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
|
||||
* @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
|
||||
* @param {boolean} config.saveAsDocument - Whether to save the content as a document. Default is true
|
||||
* @returns {Promise<Object>} - The content of the page
|
||||
*/
|
||||
async function scrapeGenericUrl({
|
||||
link,
|
||||
captureAs = "text",
|
||||
processAsDocument = true,
|
||||
scraperHeaders = {},
|
||||
metadata = {},
|
||||
saveAsDocument = true,
|
||||
}) {
|
||||
console.log(`-- Working URL ${link} => (${captureAs}) --`);
|
||||
/** @type {'web' | 'file'} */
|
||||
let processVia = "web";
|
||||
console.log(`-- Working URL ${link} => (captureAs: ${captureAs}) --`);
|
||||
|
||||
const contentType = await getContentTypeFromURL(link)
|
||||
.then((result) => {
|
||||
// If there is a reason, log it, but continue with the process
|
||||
if (!!result.reason) console.error(result.reason);
|
||||
return result.contentType;
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error("Error getting content type from URL", error);
|
||||
return null;
|
||||
});
|
||||
|
||||
// If the content is unlikely to be a webpage, assume it is a file and process it as a file
|
||||
if (
|
||||
!["text/html", "text/plain"].includes(contentType) &&
|
||||
contentType in ACCEPTED_MIMES
|
||||
)
|
||||
processVia = "file";
|
||||
|
||||
console.log(`-- URL determined to be ${contentType} (${processVia}) --`);
|
||||
// If the content type is a file, download the file to the hotdir and process it
|
||||
// Then return the content of the file as a document or whatever the captureAs dictates.
|
||||
if (processVia === "file") {
|
||||
const fileContentResult = await downloadURIToFile(link);
|
||||
if (!fileContentResult.success)
|
||||
return returnResult({
|
||||
success: false,
|
||||
reason: fileContentResult.reason,
|
||||
documents: [],
|
||||
content: null,
|
||||
saveAsDocument,
|
||||
});
|
||||
|
||||
const fileFilePath = fileContentResult.fileLocation;
|
||||
const targetFilename = path.basename(fileFilePath);
|
||||
|
||||
// If the saveAsDocument is false, we are only interested in the text content
|
||||
// and can delete the file after we have the text content via the parseOnly option
|
||||
const processSingleFileResult = await processSingleFile(targetFilename, {
|
||||
parseOnly: saveAsDocument === false,
|
||||
});
|
||||
if (!processSingleFileResult.success) {
|
||||
return returnResult({
|
||||
success: false,
|
||||
reason: processSingleFileResult.reason,
|
||||
documents: [],
|
||||
content: null,
|
||||
saveAsDocument,
|
||||
});
|
||||
}
|
||||
|
||||
// If we intend to return only the text content, return the content from the file
|
||||
// and then delete the file - otherwise it will be saved as a document
|
||||
if (!saveAsDocument) {
|
||||
return returnResult({
|
||||
success: true,
|
||||
content: processSingleFileResult.documents[0].pageContent,
|
||||
saveAsDocument,
|
||||
});
|
||||
}
|
||||
|
||||
return processSingleFileResult;
|
||||
}
|
||||
|
||||
// Otherwise, assume the content is a webpage and scrape the content from the webpage
|
||||
const content = await getPageContent({
|
||||
link,
|
||||
captureAs,
|
||||
@ -33,24 +105,29 @@ async function scrapeGenericUrl({
|
||||
|
||||
if (!content.length) {
|
||||
console.error(`Resulting URL content was empty at ${link}.`);
|
||||
return {
|
||||
return returnResult({
|
||||
success: false,
|
||||
reason: `No URL content found at ${link}.`,
|
||||
documents: [],
|
||||
};
|
||||
content: null,
|
||||
saveAsDocument,
|
||||
});
|
||||
}
|
||||
|
||||
if (!processAsDocument) {
|
||||
return {
|
||||
// If the captureAs is text, return the content as a string immediately
|
||||
// so that we dont save the content as a document
|
||||
if (!saveAsDocument) {
|
||||
return returnResult({
|
||||
success: true,
|
||||
content,
|
||||
};
|
||||
saveAsDocument,
|
||||
});
|
||||
}
|
||||
|
||||
// Save the content as a document from the URL
|
||||
const url = new URL(link);
|
||||
const decodedPathname = decodeURIComponent(url.pathname);
|
||||
const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;
|
||||
|
||||
const data = {
|
||||
id: v4(),
|
||||
url: "file://" + slugify(filename) + ".html",
|
||||
|
||||
72
collector/processLink/helpers/index.js
Normal file
72
collector/processLink/helpers/index.js
Normal file
@ -0,0 +1,72 @@
|
||||
const { validURL } = require("../../utils/url");
|
||||
|
||||
/**
 * Resolve the MIME type of a remote resource.
 * Issues a HEAD request (aborted after a 5 second timeout) and reads the
 * Content-Type response header, discarding any charset suffix.
 * @param {string} url - The URL to inspect
 * @returns {Promise<{success: boolean, reason: string|null, contentType: string|null}>} - The content type of the resource
 */
async function getContentTypeFromURL(url) {
  try {
    const isUsableUrl = !!url && typeof url === "string" && validURL(url);
    if (!isUsableUrl)
      return { success: false, reason: "Not a valid URL.", contentType: null };

    // Abort the HEAD request if the server takes longer than 5s to respond.
    const controller = new AbortController();
    const timer = setTimeout(() => {
      controller.abort();
      console.error("Timeout fetching content type for URL:", url.toString());
    }, 5_000);

    let res;
    try {
      res = await fetch(url, { method: "HEAD", signal: controller.signal });
    } finally {
      clearTimeout(timer);
    }

    if (!res.ok) {
      return {
        success: false,
        reason: `HTTP ${res.status}: ${res.statusText}`,
        contentType: null,
      };
    }

    // "text/html; charset=utf-8" -> "text/html"
    const rawType = res.headers.get("Content-Type")?.toLowerCase();
    const mime = rawType?.split(";")[0].trim();
    if (!mime) {
      return {
        success: false,
        reason: "No Content-Type found.",
        contentType: null,
      };
    }

    return { success: true, reason: null, contentType: mime };
  } catch (error) {
    return {
      success: false,
      reason: `Error: ${error.message}`,
      contentType: null,
    };
  }
}
|
||||
|
||||
/**
 * Shape a handler result based on where the content is headed.
 * @param {Object} result
 * @param {boolean} result.success - Whether the operation succeeded
 * @param {string|null} result.reason - Failure reason, if any
 * @param {Object[]} result.documents - Documents produced by the operation
 * @param {string|null} result.content - Raw text content of the resource
 * @param {boolean} [result.saveAsDocument=true] - When false only the raw
 * content is returned; otherwise the document-oriented fields are returned.
 * @returns {Object} - The trimmed result object
 */
function returnResult({
  success,
  reason,
  documents,
  content,
  saveAsDocument = true,
}) {
  return saveAsDocument
    ? { success, reason, documents }
    : { success, content };
}
|
||||
|
||||
module.exports = {
|
||||
returnResult,
|
||||
getContentTypeFromURL,
|
||||
};
|
||||
@ -14,9 +14,9 @@ async function processLink(link, scraperHeaders = {}, metadata = {}) {
|
||||
return await scrapeGenericUrl({
|
||||
link,
|
||||
captureAs: "text",
|
||||
processAsDocument: true,
|
||||
scraperHeaders,
|
||||
metadata,
|
||||
saveAsDocument: true,
|
||||
});
|
||||
}
|
||||
|
||||
@ -32,7 +32,7 @@ async function getLinkText(link, captureAs = "text") {
|
||||
return await scrapeGenericUrl({
|
||||
link,
|
||||
captureAs,
|
||||
processAsDocument: false,
|
||||
saveAsDocument: false,
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@ -16,6 +16,7 @@ const RESERVED_FILES = ["__HOTDIR__.md"];
|
||||
* Process a single file and return the documents
|
||||
* @param {string} targetFilename - The filename to process
|
||||
* @param {Object} options - The options for the file processing
|
||||
* @param {boolean} options.parseOnly - If true, the file will not be saved as a document even when `writeToServerDocuments` is called in the handler. Must be explicitly set to true to use.
|
||||
* @param {Object} metadata - The metadata for the file processing
|
||||
* @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing
|
||||
*/
|
||||
|
||||
@ -3,6 +3,10 @@ const WATCH_DIRECTORY = require("path").resolve(__dirname, "../hotdir");
|
||||
const ACCEPTED_MIMES = {
|
||||
"text/plain": [".txt", ".md", ".org", ".adoc", ".rst"],
|
||||
"text/html": [".html"],
|
||||
"text/csv": [".csv"],
|
||||
"application/json": [".json"],
|
||||
// TODO: Create asDoc.js that works for standard MS Word files.
|
||||
// "application/msword": [".doc"],
|
||||
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": [
|
||||
".docx",
|
||||
@ -30,6 +34,7 @@ const ACCEPTED_MIMES = {
|
||||
"image/png": [".png"],
|
||||
"image/jpeg": [".jpg"],
|
||||
"image/jpg": [".jpg"],
|
||||
"image/webp": [".webp"],
|
||||
};
|
||||
|
||||
const SUPPORTED_FILETYPE_CONVERTERS = {
|
||||
@ -38,11 +43,16 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
|
||||
".org": "./convert/asTxt.js",
|
||||
".adoc": "./convert/asTxt.js",
|
||||
".rst": "./convert/asTxt.js",
|
||||
".csv": "./convert/asTxt.js",
|
||||
".json": "./convert/asTxt.js",
|
||||
|
||||
".html": "./convert/asTxt.js",
|
||||
".pdf": "./convert/asPDF/index.js",
|
||||
|
||||
".docx": "./convert/asDocx.js",
|
||||
// TODO: Create asDoc.js that works for standard MS Word files.
|
||||
// ".doc": "./convert/asDoc.js",
|
||||
|
||||
".pptx": "./convert/asOfficeMime.js",
|
||||
|
||||
".odt": "./convert/asOfficeMime.js",
|
||||
@ -62,6 +72,7 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
|
||||
".png": "./convert/asImage.js",
|
||||
".jpg": "./convert/asImage.js",
|
||||
".jpeg": "./convert/asImage.js",
|
||||
".webp": "./convert/asImage.js",
|
||||
};
|
||||
|
||||
module.exports = {
|
||||
|
||||
48
collector/utils/downloadURIToFile/index.js
Normal file
48
collector/utils/downloadURIToFile/index.js
Normal file
@ -0,0 +1,48 @@
|
||||
const { WATCH_DIRECTORY } = require("../constants");
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const { pipeline } = require("stream/promises");
|
||||
const { validURL } = require("../url");
|
||||
|
||||
/**
 * Download a remote file into the hotdir so it can be picked up by the
 * single-file document processor.
 * @param {string} url - The URL of the file to download
 * @param {number} maxTimeout - The maximum timeout in milliseconds before the request is aborted
 * @returns {Promise<{success: boolean, fileLocation: string|null, reason: string|null}>} - The path to the downloaded file
 */
async function downloadURIToFile(url, maxTimeout = 10_000) {
  if (!url || typeof url !== "string" || !validURL(url))
    return { success: false, reason: "Not a valid URL.", fileLocation: null };

  try {
    // Abort the download if it exceeds maxTimeout.
    const abortController = new AbortController();
    const timeout = setTimeout(() => {
      abortController.abort();
      console.error(
        `Timeout ${maxTimeout}ms reached while downloading file for URL:`,
        url.toString()
      );
    }, maxTimeout);

    const res = await fetch(url, { signal: abortController.signal })
      .then((res) => {
        if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`);
        return res;
      })
      .finally(() => clearTimeout(timeout));

    // Derive the filename from the URL *pathname* only. Using the raw URL
    // string (path.basename(url)) would leak query strings/fragments such as
    // "?token=..." into the on-disk filename and break extension-based
    // converter lookup downstream.
    const targetFilename = path.basename(
      decodeURIComponent(new URL(url).pathname)
    );
    if (!targetFilename)
      return {
        success: false,
        reason: "Could not determine a filename from the URL.",
        fileLocation: null,
      };

    const localFilePath = path.join(WATCH_DIRECTORY, targetFilename);
    const writeStream = fs.createWriteStream(localFilePath);
    await pipeline(res.body, writeStream);

    console.log(`[SUCCESS]: File ${localFilePath} downloaded to hotdir.`);
    return { success: true, fileLocation: localFilePath, reason: null };
  } catch (error) {
    console.error(`Error writing to hotdir: ${error} for URL: ${url}`);
    return { success: false, reason: error.message, fileLocation: null };
  }
}
|
||||
|
||||
// Export the hotdir download helper.
module.exports = {
  downloadURIToFile,
};
|
||||
@ -22,6 +22,10 @@ class RuntimeSettings {
|
||||
// Any settings here will be persisted across requests
|
||||
// and must be explicitly defined here.
|
||||
settingConfigs = {
|
||||
seenAnyIpWarning: {
|
||||
default: false,
|
||||
validate: (value) => String(value) === "true",
|
||||
},
|
||||
allowAnyIp: {
|
||||
default: false,
|
||||
// Value must be explicitly "true" or "false" as a string
|
||||
|
||||
@ -26,9 +26,12 @@ const runtimeSettings = new RuntimeSettings();
|
||||
*/
|
||||
function isInvalidIp({ hostname }) {
|
||||
if (runtimeSettings.get("allowAnyIp")) {
|
||||
console.log(
|
||||
"\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m"
|
||||
);
|
||||
if (!runtimeSettings.get("seenAnyIpWarning")) {
|
||||
console.log(
|
||||
"\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m"
|
||||
);
|
||||
runtimeSettings.set("seenAnyIpWarning", true);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user