merlyn/collector/processLink/helpers/index.js
Marcello Fitton f7b90571be
Fetch, Parse, and Create Documents for Statically Hosted Files (#4398)
* Add capability to web scraping feature for document creation to download and parse statically hosted files

* lint

* Remove unneeded comment

* Simplified process by using key of ACCEPTED_MIMES to validate the response content type, as a result unlocked all supported files

* Add TODO comments for future implementation of asDoc.js to handle standard MS Word files in constants.js

* Return captureAs argument to be exposed by scrapeGenericUrl and passed into getPageContent | Return explicit argument of captureAs into scrapeGenericUrl in processLink fn

* Return debug log for scrapeGenericUrl

* Change conditional to a guard clause.

* Add error handling, validation, and JSDOC to getContentType helper fn

* remove unneeded comments

* Simplify URL validation by reusing module

* Rename downloadFileToHotDir to downloadURIToFile and moved up to a global module | Add URL validation to downloadURIToFile

* refactor

* add support for webp
remove unused imports

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
2025-10-01 15:49:05 -07:00

73 lines
1.9 KiB
JavaScript

const { validURL } = require("../../utils/url");
/**
 * Get the content type of a resource.
 * - Sends a HEAD request to the URL and reads the Content-Type response header.
 * - The request is aborted after `options.timeoutMs` milliseconds (5 seconds by default).
 * - The returned content type is lowercased and stripped of parameters such as `; charset=...`.
 * - Failures (invalid URL, non-2xx response, missing header, network error, timeout)
 *   are reported through `success: false` and `reason` instead of being thrown.
 * @param {string} url - The URL to get the content type of
 * @param {{timeoutMs?: number}} [options] - Optional settings
 * @param {number} [options.timeoutMs=5000] - Milliseconds to wait before aborting the request.
 *   Values that are not a positive finite number fall back to the default.
 * @returns {Promise<{success: boolean, reason: string|null, contentType: string|null}>} - The content type of the resource
 */
async function getContentTypeFromURL(url, options = {}) {
  const DEFAULT_TIMEOUT_MS = 5_000;
  const failure = (reason) => ({ success: false, reason, contentType: null });

  const requestedTimeout = options?.timeoutMs;
  const timeoutMs =
    typeof requestedTimeout === "number" &&
    Number.isFinite(requestedTimeout) &&
    requestedTimeout > 0
      ? requestedTimeout
      : DEFAULT_TIMEOUT_MS;

  let timer;
  let timedOut = false;
  try {
    if (!url || typeof url !== "string" || !validURL(url))
      return failure("Not a valid URL.");

    const abortController = new AbortController();
    timer = setTimeout(() => {
      // Remember that the abort came from our own timer so the catch block
      // can report a timeout instead of a generic abort message.
      timedOut = true;
      abortController.abort();
      console.error("Timeout fetching content type for URL:", url);
    }, timeoutMs);

    const res = await fetch(url, {
      method: "HEAD",
      signal: abortController.signal,
    });
    if (!res.ok) return failure(`HTTP ${res.status}: ${res.statusText}`);

    const contentType = res.headers.get("Content-Type")?.toLowerCase();
    const contentTypeWithoutCharset = contentType?.split(";")[0].trim();
    if (!contentTypeWithoutCharset) return failure("No Content-Type found.");

    return {
      success: true,
      reason: null,
      contentType: contentTypeWithoutCharset,
    };
  } catch (error) {
    if (timedOut)
      return failure(`Error: Request timed out after ${timeoutMs}ms`);

    // A rejection value is not guaranteed to be an Error instance, so never
    // dereference it blindly.
    const message = error?.message ?? String(error);
    // Some fetch implementations (e.g. Node's) report network failures as a
    // generic "fetch failed" and attach the underlying error as `cause`.
    const causeMessage = error?.cause?.message;
    return failure(
      causeMessage ? `Error: ${message} (${causeMessage})` : `Error: ${message}`
    );
  } finally {
    // Always release the timer, whichever path was taken above.
    clearTimeout(timer);
  }
}
/**
 * Build the result object handed back to the caller.
 * - When `saveAsDocument` is truthy (the default) the result carries
 *   `success`, `reason` and `documents`.
 * - When `saveAsDocument` is falsy the result carries only `success` and `content`.
 * @param {Object} params
 * @param {*} params.success - Passed through unchanged
 * @param {*} params.reason - Passed through only when saving as a document
 * @param {*} params.documents - Passed through only when saving as a document
 * @param {*} params.content - Passed through only when NOT saving as a document
 * @param {boolean} [params.saveAsDocument=true] - Selects which result shape is returned
 * @returns {{success: *, reason: *, documents: *}|{success: *, content: *}}
 */
function returnResult({
  success,
  reason,
  documents,
  content,
  saveAsDocument = true,
}) {
  return saveAsDocument
    ? { success, reason, documents }
    : { success, content };
}
// Public API of this module (CommonJS): both helper functions defined above.
module.exports = {
returnResult,
getContentTypeFromURL,
};