Fetch, Parse, and Create Documents for Statically Hosted Files (#4398)

* Add capability to web scraping feature for document creation to download and parse statically hosted files

* lint

* Remove unneeded comment

* Simplified process by using key of ACCEPTED_MIMES to validate the response content type, as a result unlocked all supported files

* Add TODO comments for future implementation of asDoc.js to handle standard MS Word files in constants.js

* Return captureAs argument to be exposed by scrapeGenericUrl and passed into getPageContent | Return explicit argument of captureAs into scrapeGenericUrl in processLink fn

* Return debug log for scrapeGenericUrl

* Change conditional to a guard clause.

* Add error handling, validation, and JSDOC to getContentType helper fn

* remove unneeded comments

* Simplify URL validation by reusing module

* Rename downloadFileToHotDir to downloadURIToFile and move it up to a global module | Add URL validation to downloadURIToFile

* refactor

* add support for webp
remove unused imports

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
Marcello Fitton 2025-10-01 15:49:05 -07:00 committed by GitHub
parent 004327264a
commit f7b90571be
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 230 additions and 14 deletions

View File

@ -1,10 +1,15 @@
const { v4 } = require("uuid"); const { v4 } = require("uuid");
const path = require("path");
const { const {
PuppeteerWebBaseLoader, PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer"); } = require("langchain/document_loaders/web/puppeteer");
const { writeToServerDocuments } = require("../../utils/files"); const { writeToServerDocuments } = require("../../utils/files");
const { tokenizeString } = require("../../utils/tokenizer"); const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify"); const { default: slugify } = require("slugify");
const { getContentTypeFromURL, returnResult } = require("../helpers");
const { processSingleFile } = require("../../processSingleFile");
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
const { ACCEPTED_MIMES } = require("../../utils/constants");
const RuntimeSettings = require("../../utils/runtimeSettings"); const RuntimeSettings = require("../../utils/runtimeSettings");
/** /**
@ -12,19 +17,86 @@ const RuntimeSettings = require("../../utils/runtimeSettings");
* @param {Object} config - The configuration object * @param {Object} config - The configuration object
* @param {string} config.link - The URL to scrape * @param {string} config.link - The URL to scrape
* @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text' * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
* @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
* @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
* @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document * @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
* @param {boolean} config.saveAsDocument - Whether to save the content as a document. Default is true
* @returns {Promise<Object>} - The content of the page * @returns {Promise<Object>} - The content of the page
*/ */
async function scrapeGenericUrl({ async function scrapeGenericUrl({
link, link,
captureAs = "text", captureAs = "text",
processAsDocument = true,
scraperHeaders = {}, scraperHeaders = {},
metadata = {}, metadata = {},
saveAsDocument = true,
}) { }) {
console.log(`-- Working URL ${link} => (${captureAs}) --`); /** @type {'web' | 'file'} */
let processVia = "web";
console.log(`-- Working URL ${link} => (captureAs: ${captureAs}) --`);
const contentType = await getContentTypeFromURL(link)
.then((result) => {
// If there is a reason, log it, but continue with the process
if (!!result.reason) console.error(result.reason);
return result.contentType;
})
.catch((error) => {
console.error("Error getting content type from URL", error);
return null;
});
// If the content is unlikely to be a webpage, assume it is a file and process it as a file
if (
!["text/html", "text/plain"].includes(contentType) &&
contentType in ACCEPTED_MIMES
)
processVia = "file";
console.log(`-- URL determined to be ${contentType} (${processVia}) --`);
// If the content type is a file, download the file to the hotdir and process it
// Then return the content of the file as a document or whatever the captureAs dictates.
if (processVia === "file") {
const fileContentResult = await downloadURIToFile(link);
if (!fileContentResult.success)
return returnResult({
success: false,
reason: fileContentResult.reason,
documents: [],
content: null,
saveAsDocument,
});
const fileFilePath = fileContentResult.fileLocation;
const targetFilename = path.basename(fileFilePath);
// If the saveAsDocument is false, we are only interested in the text content
// and can delete the file after we have the text content via the parseOnly option
const processSingleFileResult = await processSingleFile(targetFilename, {
parseOnly: saveAsDocument === false,
});
if (!processSingleFileResult.success) {
return returnResult({
success: false,
reason: processSingleFileResult.reason,
documents: [],
content: null,
saveAsDocument,
});
}
// If we intend to return only the text content, return the content from the file
// and then delete the file - otherwise it will be saved as a document
if (!saveAsDocument) {
return returnResult({
success: true,
content: processSingleFileResult.documents[0].pageContent,
saveAsDocument,
});
}
return processSingleFileResult;
}
// Otherwise, assume the content is a webpage and scrape the content from the webpage
const content = await getPageContent({ const content = await getPageContent({
link, link,
captureAs, captureAs,
@ -33,24 +105,29 @@ async function scrapeGenericUrl({
if (!content.length) { if (!content.length) {
console.error(`Resulting URL content was empty at ${link}.`); console.error(`Resulting URL content was empty at ${link}.`);
return { return returnResult({
success: false, success: false,
reason: `No URL content found at ${link}.`, reason: `No URL content found at ${link}.`,
documents: [], documents: [],
}; content: null,
saveAsDocument,
});
} }
if (!processAsDocument) { // If the captureAs is text, return the content as a string immediately
return { // so that we dont save the content as a document
if (!saveAsDocument) {
return returnResult({
success: true, success: true,
content, content,
}; saveAsDocument,
});
} }
// Save the content as a document from the URL
const url = new URL(link); const url = new URL(link);
const decodedPathname = decodeURIComponent(url.pathname); const decodedPathname = decodeURIComponent(url.pathname);
const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`; const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;
const data = { const data = {
id: v4(), id: v4(),
url: "file://" + slugify(filename) + ".html", url: "file://" + slugify(filename) + ".html",

View File

@ -0,0 +1,72 @@
const { validURL } = require("../../utils/url");
/**
 * Resolve the Content-Type of a remote resource.
 * Issues a HEAD request (aborted after a 5 second timeout) and reports the
 * `Content-Type` response header with any charset suffix (";charset=...") stripped.
 * @param {string} url - The URL to inspect
 * @returns {Promise<{success: boolean, reason: string|null, contentType: string|null}>} - The content type of the resource
 */
async function getContentTypeFromURL(url) {
  // Small helper so every failure path returns the same shape.
  const failure = (reason) => ({ success: false, reason, contentType: null });

  if (!url || typeof url !== "string" || !validURL(url))
    return failure("Not a valid URL.");

  const controller = new AbortController();
  const timer = setTimeout(() => {
    controller.abort();
    console.error("Timeout fetching content type for URL:", url.toString());
  }, 5_000);

  try {
    const response = await fetch(url, {
      method: "HEAD",
      signal: controller.signal,
    });
    if (!response.ok)
      return failure(`HTTP ${response.status}: ${response.statusText}`);

    // Header may be absent, or carry parameters like "; charset=utf-8" —
    // we only want the bare MIME type, lowercased.
    const rawHeader = response.headers.get("Content-Type")?.toLowerCase();
    const mimeOnly = rawHeader?.split(";")[0].trim();
    if (!mimeOnly) return failure("No Content-Type found.");

    return { success: true, reason: null, contentType: mimeOnly };
  } catch (error) {
    return failure(`Error: ${error.message}`);
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Shape a scrape result for the caller depending on intent.
 * When `saveAsDocument` is false the caller only wants the raw content,
 * so only `{success, content}` is returned; otherwise the document-style
 * payload `{success, reason, documents}` is returned.
 * @param {Object} result
 * @param {boolean} result.success - Whether the operation succeeded
 * @param {string|null} result.reason - Failure reason (document mode only)
 * @param {Object[]} result.documents - Created documents (document mode only)
 * @param {string|null} result.content - Raw content (content mode only)
 * @param {boolean} [result.saveAsDocument=true] - Selects the payload shape
 * @returns {Object} - The shaped result payload
 */
function returnResult({
  success,
  reason,
  documents,
  content,
  saveAsDocument = true,
}) {
  if (saveAsDocument) return { success, reason, documents };
  return { success, content };
}
module.exports = {
returnResult,
getContentTypeFromURL,
};

View File

@ -14,9 +14,9 @@ async function processLink(link, scraperHeaders = {}, metadata = {}) {
return await scrapeGenericUrl({ return await scrapeGenericUrl({
link, link,
captureAs: "text", captureAs: "text",
processAsDocument: true,
scraperHeaders, scraperHeaders,
metadata, metadata,
saveAsDocument: true,
}); });
} }
@ -32,7 +32,7 @@ async function getLinkText(link, captureAs = "text") {
return await scrapeGenericUrl({ return await scrapeGenericUrl({
link, link,
captureAs, captureAs,
processAsDocument: false, saveAsDocument: false,
}); });
} }

View File

@ -16,6 +16,7 @@ const RESERVED_FILES = ["__HOTDIR__.md"];
* Process a single file and return the documents * Process a single file and return the documents
* @param {string} targetFilename - The filename to process * @param {string} targetFilename - The filename to process
* @param {Object} options - The options for the file processing * @param {Object} options - The options for the file processing
* @param {boolean} options.parseOnly - If true, the file will not be saved as a document even when `writeToServerDocuments` is called in the handler. Must be explicitly set to true to use.
* @param {Object} metadata - The metadata for the file processing * @param {Object} metadata - The metadata for the file processing
* @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing * @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing
*/ */

View File

@ -3,6 +3,10 @@ const WATCH_DIRECTORY = require("path").resolve(__dirname, "../hotdir");
const ACCEPTED_MIMES = { const ACCEPTED_MIMES = {
"text/plain": [".txt", ".md", ".org", ".adoc", ".rst"], "text/plain": [".txt", ".md", ".org", ".adoc", ".rst"],
"text/html": [".html"], "text/html": [".html"],
"text/csv": [".csv"],
"application/json": [".json"],
// TODO: Create asDoc.js that works for standard MS Word files.
// "application/msword": [".doc"],
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": [ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": [
".docx", ".docx",
@ -30,6 +34,7 @@ const ACCEPTED_MIMES = {
"image/png": [".png"], "image/png": [".png"],
"image/jpeg": [".jpg"], "image/jpeg": [".jpg"],
"image/jpg": [".jpg"], "image/jpg": [".jpg"],
"image/webp": [".webp"],
}; };
const SUPPORTED_FILETYPE_CONVERTERS = { const SUPPORTED_FILETYPE_CONVERTERS = {
@ -38,11 +43,16 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".org": "./convert/asTxt.js", ".org": "./convert/asTxt.js",
".adoc": "./convert/asTxt.js", ".adoc": "./convert/asTxt.js",
".rst": "./convert/asTxt.js", ".rst": "./convert/asTxt.js",
".csv": "./convert/asTxt.js",
".json": "./convert/asTxt.js",
".html": "./convert/asTxt.js", ".html": "./convert/asTxt.js",
".pdf": "./convert/asPDF/index.js", ".pdf": "./convert/asPDF/index.js",
".docx": "./convert/asDocx.js", ".docx": "./convert/asDocx.js",
// TODO: Create asDoc.js that works for standard MS Word files.
// ".doc": "./convert/asDoc.js",
".pptx": "./convert/asOfficeMime.js", ".pptx": "./convert/asOfficeMime.js",
".odt": "./convert/asOfficeMime.js", ".odt": "./convert/asOfficeMime.js",
@ -62,6 +72,7 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".png": "./convert/asImage.js", ".png": "./convert/asImage.js",
".jpg": "./convert/asImage.js", ".jpg": "./convert/asImage.js",
".jpeg": "./convert/asImage.js", ".jpeg": "./convert/asImage.js",
".webp": "./convert/asImage.js",
}; };
module.exports = { module.exports = {

View File

@ -0,0 +1,48 @@
const { WATCH_DIRECTORY } = require("../constants");
const fs = require("fs");
const path = require("path");
const { pipeline } = require("stream/promises");
const { validURL } = require("../url");
/**
 * Download a file to the hotdir
 * @param {string} url - The URL of the file to download
 * @param {number} maxTimeout - The maximum timeout in milliseconds
 * @returns {Promise<{success: boolean, fileLocation: string|null, reason: string|null}>} - The path to the downloaded file
 */
async function downloadURIToFile(url, maxTimeout = 10_000) {
  if (!url || typeof url !== "string" || !validURL(url))
    return { success: false, reason: "Not a valid URL.", fileLocation: null };

  const abortController = new AbortController();
  const timeout = setTimeout(() => {
    abortController.abort();
    console.error(
      `Timeout ${maxTimeout}ms reached while downloading file for URL:`,
      url.toString()
    );
  }, maxTimeout);

  try {
    const res = await fetch(url, { signal: abortController.signal });
    if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`);
    if (!res.body) throw new Error("Response had no body to download.");

    // Derive the filename from the URL *pathname* only. Using the raw URL
    // string (path.basename(url)) would leak query strings/fragments
    // (eg. "file.pdf?token=abc") into the filename and can produce
    // invalid paths on some filesystems.
    const { pathname } = new URL(url);
    const basename =
      decodeURIComponent(path.basename(pathname)) || `download-${Date.now()}`;
    const localFilePath = path.join(WATCH_DIRECTORY, basename);

    const writeStream = fs.createWriteStream(localFilePath);
    // Keep the timeout armed while streaming the body so a stalled
    // download is aborted too — not just a slow initial response.
    await pipeline(res.body, writeStream);
    console.log(`[SUCCESS]: File ${localFilePath} downloaded to hotdir.`);
    return { success: true, fileLocation: localFilePath, reason: null };
  } catch (error) {
    console.error(`Error writing to hotdir: ${error} for URL: ${url}`);
    return { success: false, reason: error.message, fileLocation: null };
  } finally {
    clearTimeout(timeout);
  }
}
module.exports = {
downloadURIToFile,
};

View File

@ -22,6 +22,10 @@ class RuntimeSettings {
// Any settings here will be persisted across requests // Any settings here will be persisted across requests
// and must be explicitly defined here. // and must be explicitly defined here.
settingConfigs = { settingConfigs = {
seenAnyIpWarning: {
default: false,
validate: (value) => String(value) === "true",
},
allowAnyIp: { allowAnyIp: {
default: false, default: false,
// Value must be explicitly "true" or "false" as a string // Value must be explicitly "true" or "false" as a string

View File

@ -26,9 +26,12 @@ const runtimeSettings = new RuntimeSettings();
*/ */
function isInvalidIp({ hostname }) { function isInvalidIp({ hostname }) {
if (runtimeSettings.get("allowAnyIp")) { if (runtimeSettings.get("allowAnyIp")) {
console.log( if (!runtimeSettings.get("seenAnyIpWarning")) {
"\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m" console.log(
); "\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m"
);
runtimeSettings.set("seenAnyIpWarning", true);
}
return false; return false;
} }