* Add capability to web scraping feature for document creation to download and parse statically hosted files * lint * Remove unneeded comment * Simplified process by using key of ACCEPTED_MIMES to validate the response content type, as a result unlocked all supported files * Add TODO comments for future implementation of asDoc.js to handle standard MS Word files in constants.js * Return captureAs argument to be exposed by scrapeGenericUrl and passed into getPageContent | Return explicit argument of captureAs into scrapeGenericUrl in processLink fn * Return debug log for scrapeGenericUrl * Change conditional to a guard clause. * Add error handling, validation, and JSDOC to getContentType helper fn * remove unneeded comments * Simplify URL validation by reusing module * Rename downloadFileToHotDir to downloadURIToFile and moved up to a global module | Add URL validation to downloadURIToFile * refactor * add support for webp remove unused imports --------- Co-authored-by: timothycarambat <rambat1010@gmail.com>
73 lines
1.9 KiB
JavaScript
73 lines
1.9 KiB
JavaScript
const { validURL } = require("../../utils/url");
|
|
|
|
/**
 * Get the content type of a resource.
 * - Sends a HEAD request to the URL and reads the Content-Type response header.
 * - The request is aborted if it has not completed within 5 seconds.
 * - The returned content type is lowercased and stripped of any parameters
 *   following the first ";" (e.g. "; charset=utf-8").
 * - Failures (invalid URL, non-2xx response, missing header, network error or
 *   timeout) are reported through the result object rather than thrown.
 * @param {string} url - The URL to get the content type of
 * @returns {Promise<{success: boolean, reason: string|null, contentType: string|null}>} - The content type of the resource
 */
async function getContentTypeFromURL(url) {
  try {
    if (!url || typeof url !== "string" || !validURL(url))
      return { success: false, reason: "Not a valid URL.", contentType: null };

    const abortController = new AbortController();
    const timeout = setTimeout(() => {
      abortController.abort();
      console.error("Timeout fetching content type for URL:", url);
    }, 5_000);

    let res;
    try {
      res = await fetch(url, {
        method: "HEAD",
        signal: abortController.signal,
      });
    } finally {
      // Always release the timer so a settled request never triggers a late abort/log.
      clearTimeout(timeout);
    }

    if (!res.ok)
      return {
        success: false,
        reason: `HTTP ${res.status}: ${res.statusText}`,
        contentType: null,
      };

    const contentType = res.headers.get("Content-Type")?.toLowerCase();
    const contentTypeWithoutCharset = contentType?.split(";")[0].trim();
    if (!contentTypeWithoutCharset)
      return {
        success: false,
        reason: "No Content-Type found.",
        contentType: null,
      };

    return {
      success: true,
      reason: null,
      contentType: contentTypeWithoutCharset,
    };
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances; fall back to
    // their string form so the reason is never "undefined" and the handler
    // itself cannot throw on null/undefined.
    return {
      success: false,
      reason: `Error: ${error?.message ?? String(error)}`,
      contentType: null,
    };
  }
}
|
|
|
|
/**
 * Build the result object handed back to the caller, shaped by `saveAsDocument`.
 * - When `saveAsDocument` is falsy, only `success` and `content` are returned.
 * - Otherwise `success`, `reason` and `documents` are returned.
 * @param {Object} params
 * @param {boolean} params.success - Whether the operation succeeded
 * @param {string|null} [params.reason] - Failure reason; only included in the document-shaped result
 * @param {Array} [params.documents] - Only included in the document-shaped result (presumably the saved documents; confirm against callers)
 * @param {*} [params.content] - Only included in the content-shaped result
 * @param {boolean} [params.saveAsDocument=true] - Selects which result shape is returned
 * @returns {{success: boolean, content: *}|{success: boolean, reason: (string|null), documents: Array}}
 */
function returnResult({
  success,
  reason,
  documents,
  content,
  saveAsDocument = true,
}) {
  if (!saveAsDocument) return { success, content };
  return { success, reason, documents };
}
|
|
|
|
module.exports = {
|
|
returnResult,
|
|
getContentTypeFromURL,
|
|
};
|