merlyn/collector/processLink/index.js
Timothy Carambat cf3fbcbf0f
Improve URL handler for collector processes (#4504)
* Improve URL handler for collector processes

* dev build
2025-10-07 11:03:27 -07:00

46 lines
1.6 KiB
JavaScript

const { validURL } = require("../utils/url");
const { scrapeGenericUrl } = require("./convert/generic");
const { validateURL } = require("../utils/url");
/**
 * Scrape a link as plain text and save it as a document so it can be used
 * for embedding later.
 *
 * The link is first passed through `validateURL`; the value it returns is what
 * gets checked by `validURL` and, when accepted, handed to `scrapeGenericUrl`.
 *
 * @param {string} link - The link to process
 * @param {{[key: string]: string}} scraperHeaders - Custom headers to apply when scraping the link
 * @param {Object} metadata - Optional metadata to attach to the document
 * @returns {Promise<{success: boolean, reason?: string, content?: string}>} -
 *   `{ success: false, reason: "Not a valid URL." }` when the link is rejected,
 *   otherwise whatever `scrapeGenericUrl` resolves with (the collector response).
 */
async function processLink(link, scraperHeaders = {}, metadata = {}) {
  const normalizedLink = validateURL(link);

  if (validURL(normalizedLink)) {
    // Always captured as text and persisted - this is the "save for embedding" path.
    const scrapeOptions = {
      link: normalizedLink,
      captureAs: "text",
      scraperHeaders,
      metadata,
      saveAsDocument: true,
    };
    return await scrapeGenericUrl(scrapeOptions);
  }

  return { success: false, reason: "Not a valid URL." };
}
/**
 * Fetch the content of a link without saving the link as a document.
 * Mostly used in agentic flows/tool calls that only need the page content.
 *
 * The link is first passed through `validateURL`; the value it returns is what
 * gets checked by `validURL` and, when accepted, handed to `scrapeGenericUrl`.
 *
 * @param {string} link - The link to get the text content of
 * @param {('html' | 'text' | 'json')} captureAs - The format to capture the page content as
 * @returns {Promise<{success: boolean, reason?: string, content?: string}>} -
 *   `{ success: false, reason: "Not a valid URL." }` when the link is rejected,
 *   otherwise whatever `scrapeGenericUrl` resolves with (the collector response).
 */
async function getLinkText(link, captureAs = "text") {
  const targetUrl = validateURL(link);
  const isUsable = validURL(targetUrl);
  if (!isUsable) {
    return { success: false, reason: "Not a valid URL." };
  }

  // saveAsDocument is false: the caller only wants the content back.
  const scraped = await scrapeGenericUrl({
    link: targetUrl,
    captureAs,
    saveAsDocument: false,
  });
  return scraped;
}
// Public API of this module: the document-saving entry point (processLink)
// and the read-only content fetch (getLinkText).
module.exports = {
processLink,
getLinkText,
};