merlyn/collector/utils/downloadURIToFile/index.js
Marcello Fitton f7b90571be
Fetch, Parse, and Create Documents for Statically Hosted Files (#4398)
* Add capability to web scraping feature for document creation to download and parse statically hosted files

* lint

* Remove unneeded comment

* Simplified process by using key of ACCEPTED_MIMES to validate the response content type, as a result unlocked all supported files

* Add TODO comments for future implementation of asDoc.js to handle standard MS Word files in constants.js

* Return captureAs argument to be exposed by scrapeGenericUrl and passed into getPageContent | Return explicit argument of captureAs into scrapeGenericUrl in processLink fn

* Return debug log for scrapeGenericUrl

* Change conditional to a guard clause.

* Add error handling, validation, and JSDOC to getContentType helper fn

* remove unneeded comments

* Simplify URL validation by reusing module

* Rename downloadFileToHotDir to downloadURIToFile and move it up to a global module | Add URL validation to downloadURIToFile

* refactor

* add support for webp
remove unused imports

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
2025-10-01 15:49:05 -07:00

49 lines
1.7 KiB
JavaScript

const { WATCH_DIRECTORY } = require("../constants");
const fs = require("fs");
const path = require("path");
const { pipeline } = require("stream/promises");
const { validURL } = require("../url");
/**
 * Download a remote file into the hotdir (WATCH_DIRECTORY).
 *
 * The local file name is the last segment of the URL's path, so query strings
 * and fragments never end up in the name (e.g. `/a/report.pdf?x=1/2` is saved
 * as `report.pdf`). When the path has no segment (e.g. `https://example.com/`)
 * the hostname is used instead.
 *
 * Failures while fetching or writing are reported through the returned object
 * rather than thrown. If writing fails after the target file was created, the
 * partially written file is removed again.
 *
 * @param {string} url - The URL of the file to download
 * @param {number} maxTimeout - Maximum time in milliseconds to wait for the
 *   response to arrive. The timer is cleared once `fetch` resolves, so it does
 *   not bound the time spent streaming the body to disk.
 * @returns {Promise<{success: boolean, fileLocation: string|null, reason: string|null}>}
 *   On success `fileLocation` is the path of the downloaded file and `reason`
 *   is null; on failure `fileLocation` is null and `reason` holds the message.
 */
async function downloadURIToFile(url, maxTimeout = 10_000) {
  if (!url || typeof url !== "string" || !validURL(url))
    return { success: false, reason: "Not a valid URL.", fileLocation: null };

  // Set only once this call has created/truncated the target file, so the
  // error path knows whether there is anything to clean up.
  let localFilePath = null;
  let writeStream = null;

  try {
    const abortController = new AbortController();
    const timeout = setTimeout(() => {
      abortController.abort();
      console.error(
        `Timeout ${maxTimeout}ms reached while downloading file for URL:`,
        url
      );
    }, maxTimeout);

    let res;
    try {
      res = await fetch(url, { signal: abortController.signal });
    } finally {
      clearTimeout(timeout);
    }
    if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`);

    // Use the parsed pathname rather than the raw URL string: the raw string
    // would let "?query" / "#fragment" (and any "/" inside them) decide the
    // file name.
    const { pathname, hostname } = new URL(url);
    const fileName = path.basename(pathname) || hostname;
    if (!fileName)
      throw new Error("Could not derive a file name from the URL.");

    const targetPath = path.join(WATCH_DIRECTORY, fileName);
    // Open explicitly so that a failure to open never triggers cleanup of a
    // file this call did not create.
    const handle = await fs.promises.open(targetPath, "w");
    localFilePath = targetPath;
    writeStream = handle.createWriteStream();

    await pipeline(res.body, writeStream);
    console.log(`[SUCCESS]: File ${localFilePath} downloaded to hotdir.`);
    return { success: true, fileLocation: localFilePath, reason: null };
  } catch (error) {
    console.error(`Error writing to hotdir: ${error} for URL: ${url}`);
    if (localFilePath) {
      // Best-effort cleanup of the truncated download; a failure to remove it
      // must not mask the original error.
      writeStream?.destroy();
      await fs.promises.rm(localFilePath, { force: true }).catch(() => {});
    }
    return { success: false, reason: error.message, fileLocation: null };
  }
}
// CommonJS export: downloadURIToFile is the only name this module exposes.
module.exports = {
downloadURIToFile,
};