merlyn/collector/processLink/index.js
Jonas Stawski b8d4cc3454
Added metadata parameter to document/upload, document/upload/{folderName}, and document/upload-link (#4342)
* Added the ability to pass in metadata to the /document/upload/{folderName} endpoint

* Added the ability to pass in metadata to the /document/upload-link endpoint

* feat: added metadata to document/upload api endpoint

* simplify optional metadata in document dev api endpoints

* lint

* patch handling of metadata in dev api

* Linting, small comments

---------

Co-authored-by: jstawskigmi <jstawski@getmyinterns.org>
Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2025-09-17 11:17:29 -07:00

43 lines
1.5 KiB
JavaScript

const { validURL } = require("../utils/url");
const { scrapeGenericUrl } = require("./convert/generic");
/**
 * Scrape a link as plain text and ask the scraper to process it as a document
 * (`processAsDocument: true`) so it can be used for embedding later.
 *
 * The link is checked with `validURL` first; the scraper is never invoked for
 * a link that fails that check.
 *
 * @param {string} link - The link to process
 * @param {{[key: string]: string}} scraperHeaders - Custom headers forwarded to the scraper
 * @param {Object} metadata - Optional metadata forwarded to the scraper for the document
 * @returns {Promise<{success: boolean, content?: string, reason?: string}>} -
 *   `{ success: false, reason: "Not a valid URL." }` when the link is rejected,
 *   otherwise whatever `scrapeGenericUrl` resolves to.
 */
async function processLink(link, scraperHeaders = {}, metadata = {}) {
  if (validURL(link)) {
    // Document ingestion always captures the page as text.
    const scrapeOptions = {
      link,
      captureAs: "text",
      processAsDocument: true,
      scraperHeaders,
      metadata,
    };
    return await scrapeGenericUrl(scrapeOptions);
  }

  return { success: false, reason: "Not a valid URL." };
}
/**
 * Fetch the content of a link without saving it as a document
 * (`processAsDocument: false`). Mostly used in agentic flows/tool calls that
 * only need the content of a page.
 *
 * The link is checked with `validURL` first; the scraper is never invoked for
 * a link that fails that check.
 *
 * @param {string} link - The link to get the content of
 * @param {('html' | 'text' | 'json')} captureAs - The format to capture the page content as
 * @returns {Promise<{success: boolean, content?: string, reason?: string}>} -
 *   `{ success: false, reason: "Not a valid URL." }` when the link is rejected,
 *   otherwise whatever `scrapeGenericUrl` resolves to.
 */
async function getLinkText(link, captureAs = "text") {
  const linkIsValid = validURL(link);
  if (!linkIsValid) {
    const failure = { success: false, reason: "Not a valid URL." };
    return failure;
  }

  const scrapeOptions = { link, captureAs, processAsDocument: false };
  const scraped = await scrapeGenericUrl(scrapeOptions);
  return scraped;
}
// Public API of this module: the two link helpers defined in this file.
module.exports = { processLink, getLinkText };