merlyn/collector/processLink/index.js
Sean Hatfield 610bdd4673
Allow custom headers in upload-link endpoint (#3695)
* allow custom headers in upload-link endpoint

* override loader.scrape to allow for passing of headers in langchain puppeteer

* lint

* Rename some variables
move positional args to named args
update documentation to reflect arg changes and function sigs
validate header object before attempting to forward it to the request

* update header validation for custom headers

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
2025-04-22 12:47:12 -07:00

41 lines
1.4 KiB
JavaScript

const { validURL } = require("../utils/url");
const { scrapeGenericUrl } = require("./convert/generic");
/**
 * Scrape a link as text and ask the scraper to process it as a document
 * (`processAsDocument: true`) so it can be used for embedding later.
 *
 * The link is checked with `validURL` first; an invalid link short-circuits
 * and the scraper is never invoked.
 *
 * @param {string} link - The link to process
 * @param {{[key: string]: string}} scraperHeaders - Custom headers forwarded to the scraper (defaults to an empty object)
 * @returns {Promise<{success: boolean, reason?: string, content?: string}>} -
 *   `{ success: false, reason }` when the link is not a valid URL, otherwise
 *   the response resolved by `scrapeGenericUrl`
 */
async function processLink(link, scraperHeaders = {}) {
  const isScrapable = validURL(link);
  if (!isScrapable) {
    return { success: false, reason: "Not a valid URL." };
  }

  // Named options keep the call site readable; key order matches the call contract.
  const scrapeOptions = {
    link,
    captureAs: "text",
    processAsDocument: true,
    scraperHeaders,
  };
  const collectorResponse = await scrapeGenericUrl(scrapeOptions);
  return collectorResponse;
}
/**
 * Fetch the content of a link without processing it as a document
 * (`processAsDocument: false`). Mostly used by agentic flows / tool calls
 * that only need the page content.
 *
 * The link is checked with `validURL` first; an invalid link short-circuits
 * and the scraper is never invoked.
 *
 * @param {string} link - The link to get the text content of
 * @param {('html' | 'text' | 'json')} captureAs - The format to capture the page content as (defaults to "text")
 * @returns {Promise<{success: boolean, reason?: string, content?: string}>} -
 *   `{ success: false, reason }` when the link is not a valid URL, otherwise
 *   the response resolved by `scrapeGenericUrl`
 */
async function getLinkText(link, captureAs = "text") {
  const isScrapable = validURL(link);
  if (!isScrapable) {
    return { success: false, reason: "Not a valid URL." };
  }

  // No custom headers are forwarded here; the scraper's own defaults apply.
  const scrapeOptions = {
    link,
    captureAs,
    processAsDocument: false,
  };
  const collectorResponse = await scrapeGenericUrl(scrapeOptions);
  return collectorResponse;
}
// Public API of this module (CommonJS exports): the two link helpers defined above.
module.exports = {
processLink,
getLinkText,
};