* Add capability to the web scraping feature for document creation to download and parse statically hosted files
* Lint
* Remove unneeded comment
* Simplify the process by using the keys of ACCEPTED_MIMES to validate the response content type, which unlocks all supported file types
* Add TODO comments for future implementation of asDoc.js to handle standard MS Word files in constants.js
* Return captureAs argument to be exposed by scrapeGenericUrl and passed into getPageContent | Return explicit captureAs argument to scrapeGenericUrl in the processLink fn
* Return debug log for scrapeGenericUrl
* Change conditional to a guard clause
* Add error handling, validation, and JSDoc to the getContentType helper fn
* Remove unneeded comments
* Simplify URL validation by reusing the existing module
* Rename downloadFileToHotDir to downloadURIToFile and move it up to a global module | Add URL validation to downloadURIToFile
* Refactor
* Add support for webp; remove unused imports

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
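The bullets above mention a getContentType helper and an ACCEPTED_MIMES map whose keys are used to decide whether a scraped URL should be handed to the file-download path instead of the HTML scraping path. Those pieces are not part of the file shown here, so the snippet below is only a minimal sketch of that check under assumed names and shapes; the real ACCEPTED_MIMES entries, the helper's signature, and how scrapeGenericUrl wires them together may differ.

// Hypothetical sketch only: the actual ACCEPTED_MIMES map lives in constants.js
// and may contain different keys/values.
const ACCEPTED_MIMES = {
  "application/pdf": [".pdf"],
  "text/plain": [".txt"],
  "image/webp": [".webp"],
  // ...other supported MIME types keyed by Content-Type
};

/**
 * Extract and normalize the MIME type from a fetch Response (sketch).
 * @param {Response} response - The fetch Response to inspect
 * @returns {string|null} - The MIME type without parameters, or null if absent
 */
function getContentType(response) {
  if (!response || typeof response.headers?.get !== "function") return null;
  const header = response.headers.get("content-type");
  if (!header) return null;
  return header.split(";")[0].trim().toLowerCase(); // drop "; charset=..." etc.
}

// Guard-clause style check: only hand the URL to the file-download path when
// the response's MIME type is one of the accepted keys.
function isDownloadableFile(response) {
  const mime = getContentType(response);
  return mime !== null && Object.keys(ACCEPTED_MIMES).includes(mime);
}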
const { WATCH_DIRECTORY } = require("../constants");
const fs = require("fs");
const path = require("path");
const { pipeline } = require("stream/promises");
const { validURL } = require("../url");

/**
 * Download a file to the hotdir
 * @param {string} url - The URL of the file to download
 * @param {number} maxTimeout - The maximum timeout in milliseconds
 * @returns {Promise<{success: boolean, fileLocation: string|null, reason: string|null}>} - Result containing the local path of the downloaded file, or the reason the download failed
 */
async function downloadURIToFile(url, maxTimeout = 10_000) {
  if (!url || typeof url !== "string" || !validURL(url))
    return { success: false, reason: "Not a valid URL.", fileLocation: null };

  try {
    const abortController = new AbortController();
    const timeout = setTimeout(() => {
      abortController.abort();
      console.error(
        `Timeout ${maxTimeout}ms reached while downloading file for URL:`,
        url.toString()
      );
    }, maxTimeout);

    const res = await fetch(url, { signal: abortController.signal })
      .then((res) => {
        if (!res.ok) throw new Error(`HTTP ${res.status}: ${res.statusText}`);
        return res;
      })
      .finally(() => clearTimeout(timeout));

    const localFilePath = path.join(WATCH_DIRECTORY, path.basename(url));
    const writeStream = fs.createWriteStream(localFilePath);
    await pipeline(res.body, writeStream);

    console.log(`[SUCCESS]: File ${localFilePath} downloaded to hotdir.`);
    return { success: true, fileLocation: localFilePath, reason: null };
  } catch (error) {
    console.error(`Error writing to hotdir: ${error} for URL: ${url}`);
    return { success: false, reason: error.message, fileLocation: null };
  }
}

module.exports = {
  downloadURIToFile,
};
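A caller such as processLink or scrapeGenericUrl would consume the exported helper roughly as follows. This usage example is illustrative only: the require path, the example URL, and the surrounding function are assumptions, not code from the project.

// Illustrative usage; the module path "./files" is assumed for the example.
const { downloadURIToFile } = require("./files");

async function fetchRemoteDocument() {
  const { success, fileLocation, reason } = await downloadURIToFile(
    "https://example.com/report.pdf", // any statically hosted, supported file
    15_000 // allow a longer timeout than the 10s default for larger files
  );

  if (!success) {
    console.error(`Download failed: ${reason}`);
    return null;
  }

  // The file now sits in the hotdir (WATCH_DIRECTORY) ready to be parsed.
  return fileLocation;
}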