* Add capability to web scraping feature for document creation to download and parse statically hosted files * lint * Remove unneeded comment * Simplified process by using key of ACCEPTED_MIMES to validate the response content type, as a result unlocked all supported files * Add TODO comments for future implementation of asDoc.js to handle standard MS Word files in constants.js * Return captureAs argument to be exposed by scrapeGenericUrl and passed into getPageContent | Return explicit argument of captureAs into scrapeGenericUrl in processLink fn * Return debug log for scrapeGenericUrl * Change conditional to a guard clause. * Add error handling, validation, and JSDoc to getContentType helper fn * remove unneeded comments * Simplify URL validation by reusing module * Rename downloadFileToHotDir to downloadURIToFile and moved up to a global module | Add URL validation to downloadURIToFile * refactor * add support for webp remove unused imports --------- Co-authored-by: timothycarambat <rambat1010@gmail.com>
43 lines
1.5 KiB
JavaScript
43 lines
1.5 KiB
JavaScript
const { validURL } = require("../utils/url");
|
|
const { scrapeGenericUrl } = require("./convert/generic");
|
|
|
|
/**
 * Scrape a link as plain text and ask the scraper to persist the result as a
 * document (`saveAsDocument: true`) so it can be embedded later.
 *
 * The link is checked with `validURL` first; an invalid link short-circuits
 * with a failure object and the scraper is never invoked.
 *
 * @param {string} link - The link to process
 * @param {{[key: string]: string}} scraperHeaders - Custom headers forwarded to the scraper
 * @param {Object} metadata - Optional metadata forwarded to the scraper for the document
 * @returns {Promise<{success: boolean, content?: string, reason?: string}>} -
 *   `{ success: false, reason }` for an invalid URL, otherwise whatever
 *   `scrapeGenericUrl` resolves with (presumably `{ success, content }` — confirm
 *   against the scraper implementation).
 */
async function processLink(link, scraperHeaders = {}, metadata = {}) {
  const isScrapable = validURL(link);
  if (!isScrapable) {
    return { success: false, reason: "Not a valid URL." };
  }

  // Always captured as text and saved as a document for this entry point.
  const scrapeOptions = {
    link,
    captureAs: "text",
    scraperHeaders,
    metadata,
    saveAsDocument: true,
  };
  const scrapeResult = await scrapeGenericUrl(scrapeOptions);
  return scrapeResult;
}
|
|
|
|
/**
 * Fetch the content of a link without saving it as a document
 * (`saveAsDocument: false`). Mostly used by agentic flows / tool calls that
 * only need the page content.
 *
 * The link is checked with `validURL` first; an invalid link short-circuits
 * with a failure object and the scraper is never invoked.
 *
 * @param {string} link - The link to get the content of
 * @param {('html' | 'text' | 'json')} captureAs - The format to capture the page content as
 * @returns {Promise<{success: boolean, content?: string, reason?: string}>} -
 *   `{ success: false, reason }` for an invalid URL, otherwise whatever
 *   `scrapeGenericUrl` resolves with (presumably `{ success, content }` — confirm
 *   against the scraper implementation).
 */
async function getLinkText(link, captureAs = "text") {
  const isScrapable = validURL(link);
  if (!isScrapable) {
    return { success: false, reason: "Not a valid URL." };
  }

  // No headers or metadata are forwarded here; only the capture format varies.
  const scrapeOptions = {
    link,
    captureAs,
    saveAsDocument: false,
  };
  const scrapeResult = await scrapeGenericUrl(scrapeOptions);
  return scrapeResult;
}
|
|
|
|
// Public API of this module: the document-saving scraper entry point and the
// read-only variant that just returns page content.
module.exports = { processLink, getLinkText };
|