Fetch, Parse, and Create Documents for Statically Hosted Files (#4398)

* Add capability to web scraping feature for document creation to download and parse statically hosted files

* lint

* Remove unneeded comment

* Simplified process by using key of ACCEPTED_MIMES to validate the response content type, as a result unlocked all supported files

* Add TODO comments for future implementation of asDoc.js to handle standard MS Word files in constants.js

* Return captureAs argument to be exposed by scrapeGenericUrl and passed into getPageContent | Return explicit argument of captureAs into scrapeGenericUrl in processLink fn

* Return debug log for scrapeGenericUrl

* Change conditional to a guard clause.

* Add error handling, validation, and JSDOC to getContentType helper fn

* remove unneeded comments

* Simplify URL validation by reusing module

* Rename downloadFileToHotDir to downloadURIToFile and move it up to a global module | Add URL validation to downloadURIToFile

* refactor

* add support for webp
remove unused imports

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
Marcello Fitton 2025-10-01 15:49:05 -07:00 committed by GitHub
parent 004327264a
commit f7b90571be
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 230 additions and 14 deletions

View File

@ -1,10 +1,15 @@
const { v4 } = require("uuid"); const { v4 } = require("uuid");
const path = require("path");
const { const {
PuppeteerWebBaseLoader, PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer"); } = require("langchain/document_loaders/web/puppeteer");
const { writeToServerDocuments } = require("../../utils/files"); const { writeToServerDocuments } = require("../../utils/files");
const { tokenizeString } = require("../../utils/tokenizer"); const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify"); const { default: slugify } = require("slugify");
const { getContentTypeFromURL, returnResult } = require("../helpers");
const { processSingleFile } = require("../../processSingleFile");
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
const { ACCEPTED_MIMES } = require("../../utils/constants");
const RuntimeSettings = require("../../utils/runtimeSettings"); const RuntimeSettings = require("../../utils/runtimeSettings");
/** /**
@ -12,19 +17,86 @@ const RuntimeSettings = require("../../utils/runtimeSettings");
* @param {Object} config - The configuration object * @param {Object} config - The configuration object
* @param {string} config.link - The URL to scrape * @param {string} config.link - The URL to scrape
* @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text' * @param {('html' | 'text')} config.captureAs - The format to capture the page content as. Default is 'text'
* @param {boolean} config.processAsDocument - Whether to process the content as a document or return the content directly. Default is true
* @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request * @param {{[key: string]: string}} config.scraperHeaders - Custom headers to use when making the request
* @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document * @param {{[key: string]: string}} config.metadata - Metadata to use when creating the document
* @param {boolean} config.saveAsDocument - Whether to save the content as a document. Default is true
* @returns {Promise<Object>} - The content of the page * @returns {Promise<Object>} - The content of the page
*/ */
async function scrapeGenericUrl({ async function scrapeGenericUrl({
link, link,
captureAs = "text", captureAs = "text",
processAsDocument = true,
scraperHeaders = {}, scraperHeaders = {},
metadata = {}, metadata = {},
saveAsDocument = true,
}) { }) {
console.log(`-- Working URL ${link} => (${captureAs}) --`); /** @type {'web' | 'file'} */
let processVia = "web";
console.log(`-- Working URL ${link} => (captureAs: ${captureAs}) --`);
const contentType = await getContentTypeFromURL(link)
.then((result) => {
// If there is a reason, log it, but continue with the process
if (!!result.reason) console.error(result.reason);
return result.contentType;
})
.catch((error) => {
console.error("Error getting content type from URL", error);
return null;
});
// If the content is unlikely to be a webpage, assume it is a file and process it as a file
if (
!["text/html", "text/plain"].includes(contentType) &&
contentType in ACCEPTED_MIMES
)
processVia = "file";
console.log(`-- URL determined to be ${contentType} (${processVia}) --`);
// If the content type is a file, download the file to the hotdir and process it
// Then return the content of the file as a document or whatever the captureAs dictates.
if (processVia === "file") {
const fileContentResult = await downloadURIToFile(link);
if (!fileContentResult.success)
return returnResult({
success: false,
reason: fileContentResult.reason,
documents: [],
content: null,
saveAsDocument,
});
const fileFilePath = fileContentResult.fileLocation;
const targetFilename = path.basename(fileFilePath);
// If the saveAsDocument is false, we are only interested in the text content
// and can delete the file after we have the text content via the parseOnly option
const processSingleFileResult = await processSingleFile(targetFilename, {
parseOnly: saveAsDocument === false,
});
if (!processSingleFileResult.success) {
return returnResult({
success: false,
reason: processSingleFileResult.reason,
documents: [],
content: null,
saveAsDocument,
});
}
// If we intend to return only the text content, return the content from the file
// and then delete the file - otherwise it will be saved as a document
if (!saveAsDocument) {
return returnResult({
success: true,
content: processSingleFileResult.documents[0].pageContent,
saveAsDocument,
});
}
return processSingleFileResult;
}
// Otherwise, assume the content is a webpage and scrape the content from the webpage
const content = await getPageContent({ const content = await getPageContent({
link, link,
captureAs, captureAs,
@ -33,24 +105,29 @@ async function scrapeGenericUrl({
if (!content.length) { if (!content.length) {
console.error(`Resulting URL content was empty at ${link}.`); console.error(`Resulting URL content was empty at ${link}.`);
return { return returnResult({
success: false, success: false,
reason: `No URL content found at ${link}.`, reason: `No URL content found at ${link}.`,
documents: [], documents: [],
}; content: null,
saveAsDocument,
});
} }
if (!processAsDocument) { // If the captureAs is text, return the content as a string immediately
return { // so that we dont save the content as a document
if (!saveAsDocument) {
return returnResult({
success: true, success: true,
content, content,
}; saveAsDocument,
});
} }
// Save the content as a document from the URL
const url = new URL(link); const url = new URL(link);
const decodedPathname = decodeURIComponent(url.pathname); const decodedPathname = decodeURIComponent(url.pathname);
const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`; const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;
const data = { const data = {
id: v4(), id: v4(),
url: "file://" + slugify(filename) + ".html", url: "file://" + slugify(filename) + ".html",

View File

@ -0,0 +1,72 @@
const { validURL } = require("../../utils/url");
/**
 * Resolve the Content-Type of a remote resource.
 * Issues a HEAD request (aborted after a 5 second timeout) and reports the
 * `Content-Type` response header with any charset suffix (";charset=...") stripped.
 * @param {string} url - The URL to inspect
 * @returns {Promise<{success: boolean, reason: string|null, contentType: string|null}>} - The content type of the resource
 */
async function getContentTypeFromURL(url) {
  // Small helper so every failure path returns the same shape.
  const failure = (reason) => ({ success: false, reason, contentType: null });

  if (!url || typeof url !== "string" || !validURL(url))
    return failure("Not a valid URL.");

  const controller = new AbortController();
  const timer = setTimeout(() => {
    controller.abort();
    console.error("Timeout fetching content type for URL:", url.toString());
  }, 5_000);

  try {
    const response = await fetch(url, {
      method: "HEAD",
      signal: controller.signal,
    });
    if (!response.ok)
      return failure(`HTTP ${response.status}: ${response.statusText}`);

    // Header may be absent, or carry parameters like "; charset=utf-8" —
    // we only want the bare MIME type, lowercased.
    const rawHeader = response.headers.get("Content-Type")?.toLowerCase();
    const mimeOnly = rawHeader?.split(";")[0].trim();
    if (!mimeOnly) return failure("No Content-Type found.");

    return { success: true, reason: null, contentType: mimeOnly };
  } catch (error) {
    return failure(`Error: ${error.message}`);
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Shape a scrape result for the caller depending on intent.
 * When `saveAsDocument` is false the caller only wants the raw content,
 * so only `{success, content}` is returned; otherwise the document-style
 * payload `{success, reason, documents}` is returned.
 * @param {Object} result
 * @param {boolean} result.success - Whether the operation succeeded
 * @param {string|null} result.reason - Failure reason (document mode only)
 * @param {Object[]} result.documents - Created documents (document mode only)
 * @param {string|null} result.content - Raw content (content mode only)
 * @param {boolean} [result.saveAsDocument=true] - Selects the payload shape
 * @returns {Object} - The shaped result payload
 */
function returnResult({
  success,
  reason,
  documents,
  content,
  saveAsDocument = true,
}) {
  if (saveAsDocument) return { success, reason, documents };
  return { success, content };
}
module.exports = {
returnResult,
getContentTypeFromURL,
};

View File

@ -14,9 +14,9 @@ async function processLink(link, scraperHeaders = {}, metadata = {}) {
return await scrapeGenericUrl({ return await scrapeGenericUrl({
link, link,
captureAs: "text", captureAs: "text",
processAsDocument: true,
scraperHeaders, scraperHeaders,
metadata, metadata,
saveAsDocument: true,
}); });
} }
@ -32,7 +32,7 @@ async function getLinkText(link, captureAs = "text") {
return await scrapeGenericUrl({ return await scrapeGenericUrl({
link, link,
captureAs, captureAs,
processAsDocument: false, saveAsDocument: false,
}); });
} }

View File

@ -16,6 +16,7 @@ const RESERVED_FILES = ["__HOTDIR__.md"];
* Process a single file and return the documents * Process a single file and return the documents
* @param {string} targetFilename - The filename to process * @param {string} targetFilename - The filename to process
* @param {Object} options - The options for the file processing * @param {Object} options - The options for the file processing
* @param {boolean} options.parseOnly - If true, the file will not be saved as a document even when `writeToServerDocuments` is called in the handler. Must be explicitly set to true to use.
* @param {Object} metadata - The metadata for the file processing * @param {Object} metadata - The metadata for the file processing
* @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing * @returns {Promise<{success: boolean, reason: string, documents: Object[]}>} - The documents from the file processing
*/ */

View File

@ -3,6 +3,10 @@ const WATCH_DIRECTORY = require("path").resolve(__dirname, "../hotdir");
const ACCEPTED_MIMES = { const ACCEPTED_MIMES = {
"text/plain": [".txt", ".md", ".org", ".adoc", ".rst"], "text/plain": [".txt", ".md", ".org", ".adoc", ".rst"],
"text/html": [".html"], "text/html": [".html"],
"text/csv": [".csv"],
"application/json": [".json"],
// TODO: Create asDoc.js that works for standard MS Word files.
// "application/msword": [".doc"],
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": [ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": [
".docx", ".docx",
@ -30,6 +34,7 @@ const ACCEPTED_MIMES = {
"image/png": [".png"], "image/png": [".png"],
"image/jpeg": [".jpg"], "image/jpeg": [".jpg"],
"image/jpg": [".jpg"], "image/jpg": [".jpg"],
"image/webp": [".webp"],
}; };
const SUPPORTED_FILETYPE_CONVERTERS = { const SUPPORTED_FILETYPE_CONVERTERS = {
@ -38,11 +43,16 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".org": "./convert/asTxt.js", ".org": "./convert/asTxt.js",
".adoc": "./convert/asTxt.js", ".adoc": "./convert/asTxt.js",
".rst": "./convert/asTxt.js", ".rst": "./convert/asTxt.js",
".csv": "./convert/asTxt.js",
".json": "./convert/asTxt.js",
".html": "./convert/asTxt.js", ".html": "./convert/asTxt.js",
".pdf": "./convert/asPDF/index.js", ".pdf": "./convert/asPDF/index.js",
".docx": "./convert/asDocx.js", ".docx": "./convert/asDocx.js",
// TODO: Create asDoc.js that works for standard MS Word files.
// ".doc": "./convert/asDoc.js",
".pptx": "./convert/asOfficeMime.js", ".pptx": "./convert/asOfficeMime.js",
".odt": "./convert/asOfficeMime.js", ".odt": "./convert/asOfficeMime.js",
@ -62,6 +72,7 @@ const SUPPORTED_FILETYPE_CONVERTERS = {
".png": "./convert/asImage.js", ".png": "./convert/asImage.js",
".jpg": "./convert/asImage.js", ".jpg": "./convert/asImage.js",
".jpeg": "./convert/asImage.js", ".jpeg": "./convert/asImage.js",
".webp": "./convert/asImage.js",
}; };
module.exports = { module.exports = {

View File

@ -0,0 +1,48 @@
const { WATCH_DIRECTORY } = require("../constants");
const fs = require("fs");
const path = require("path");
const { pipeline } = require("stream/promises");
const { validURL } = require("../url");
/**
 * Download a file to the hotdir
 * @param {string} url - The URL of the file to download
 * @param {number} maxTimeout - The maximum timeout in milliseconds
 * @returns {Promise<{success: boolean, fileLocation: string|null, reason: string|null}>} - The path to the downloaded file
 */
async function downloadURIToFile(url, maxTimeout = 10_000) {
  if (!url || typeof url !== "string" || !validURL(url))
    return { success: false, reason: "Not a valid URL.", fileLocation: null };

  const abortController = new AbortController();
  const timeout = setTimeout(() => {
    abortController.abort();
    console.error(
      `Timeout ${maxTimeout}ms reached while downloading file for URL:`,
      url.toString()
    );
  }, maxTimeout);

  try {
    const res = await fetch(url, { signal: abortController.signal });
    if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`);
    if (!res.body) throw new Error("Response had no body to download.");

    // Derive the filename from the URL *pathname* only. Using the raw URL
    // string (path.basename(url)) would leak query strings/fragments
    // (eg. "file.pdf?token=abc") into the filename and can produce
    // invalid paths on some filesystems.
    const { pathname } = new URL(url);
    const basename =
      decodeURIComponent(path.basename(pathname)) || `download-${Date.now()}`;
    const localFilePath = path.join(WATCH_DIRECTORY, basename);

    const writeStream = fs.createWriteStream(localFilePath);
    // Keep the timeout armed while streaming the body so a stalled
    // download is aborted too — not just a slow initial response.
    await pipeline(res.body, writeStream);
    console.log(`[SUCCESS]: File ${localFilePath} downloaded to hotdir.`);
    return { success: true, fileLocation: localFilePath, reason: null };
  } catch (error) {
    console.error(`Error writing to hotdir: ${error} for URL: ${url}`);
    return { success: false, reason: error.message, fileLocation: null };
  } finally {
    clearTimeout(timeout);
  }
}
module.exports = {
downloadURIToFile,
};

View File

@ -22,6 +22,10 @@ class RuntimeSettings {
// Any settings here will be persisted across requests // Any settings here will be persisted across requests
// and must be explicitly defined here. // and must be explicitly defined here.
settingConfigs = { settingConfigs = {
seenAnyIpWarning: {
default: false,
validate: (value) => String(value) === "true",
},
allowAnyIp: { allowAnyIp: {
default: false, default: false,
// Value must be explicitly "true" or "false" as a string // Value must be explicitly "true" or "false" as a string

View File

@ -26,9 +26,12 @@ const runtimeSettings = new RuntimeSettings();
*/ */
function isInvalidIp({ hostname }) { function isInvalidIp({ hostname }) {
if (runtimeSettings.get("allowAnyIp")) { if (runtimeSettings.get("allowAnyIp")) {
console.log( if (!runtimeSettings.get("seenAnyIpWarning")) {
"\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m" console.log(
); "\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m"
);
runtimeSettings.set("seenAnyIpWarning", true);
}
return false; return false;
} }