* remove summarization from web-scraping flow block * add option to enable/disable summarization in web-scraping block * change layout summarize default to true (was the normal prior) --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
111 lines
3.5 KiB
JavaScript
111 lines
3.5 KiB
JavaScript
const { CollectorApi } = require("../../collectorApi");
|
|
const { TokenManager } = require("../../helpers/tiktoken");
|
|
const Provider = require("../../agents/aibitat/providers/ai-provider");
|
|
const { summarizeContent } = require("../../agents/aibitat/utils/summarize");
|
|
|
|
/**
 * Execute a web scraping flow step.
 *
 * Fetches `config.url` through the collector, optionally narrows the result
 * with a CSS selector, and returns the content. When summarization is enabled
 * and the content does not fit within the default provider's context limit,
 * a summary is returned instead of the raw content.
 *
 * @param {Object} config Flow step configuration
 * @param {string} config.url URL to scrape (required)
 * @param {string} [config.captureAs="text"] Capture mode. "querySelector" is fetched as "html" and then filtered with `config.querySelector`; any other value is passed to the collector unchanged.
 * @param {string} [config.querySelector] CSS selector used when `captureAs` is "querySelector"
 * @param {boolean} [config.enableSummarization=true] When false the scraped content is always returned as-is
 * @param {Object} context Execution context with introspect function
 * @param {Function} context.introspect Receives progress messages for each step
 * @param {Function} context.logger Receives log messages
 * @param {Object} context.aibitat Its `defaultProvider.provider` and `defaultProvider.model` are used for token counting and summarization
 * @returns {Promise<string>} Scraped content
 * @throws {Error} When no URL is given, when the page could not be scraped, or when the scrape produced no content
 */
async function executeWebScraping(config, context) {
  const { url, captureAs = "text", enableSummarization = true } = config;
  const { introspect, logger, aibitat } = context;
  logger(
    `\x1b[43m[AgentFlowToolExecutor]\x1b[0m - executing Web Scraping block`
  );

  if (!url) {
    throw new Error("URL is required for web scraping");
  }

  // A "querySelector" capture is fetched as HTML and filtered afterwards.
  const captureMode = captureAs === "querySelector" ? "html" : captureAs;
  introspect(`Scraping the content of ${url} as ${captureAs}`);

  const response = await new CollectorApi().getLinkContent(url, captureMode);

  // Only run the selector over a scrape that actually produced content. A
  // failed scrape is passed through untouched so it is reported as a scrape
  // failure below instead of surfacing as an HTML parser error.
  const canApplySelector =
    captureAs === "querySelector" &&
    Boolean(response?.success) &&
    response.content != null;
  const result = canApplySelector
    ? parseHTMLwithSelector(response.content, config.querySelector, context)
    : response;

  // `?? {}` keeps a null/undefined result from throwing during destructuring.
  const { success, content } = result ?? {};

  if (!success) {
    introspect(`Could not scrape ${url}. Cannot use this page's content.`);
    throw new Error("URL could not be scraped and no content was found.");
  }

  introspect(`Successfully scraped content from ${url}`);
  if (!content || content.length === 0) {
    introspect("There was no content to be collected or read.");
    throw new Error("There was no content to be collected or read.");
  }

  if (!enableSummarization) {
    logger(`Returning raw content as summarization is disabled`);
    return content;
  }

  const { provider, model } = aibitat.defaultProvider;
  const tokenCount = new TokenManager(model).countFromString(content);
  const contextLimit = Provider.contextLimit(provider, model);

  // Content that fits in the model's context window needs no summary.
  if (tokenCount < contextLimit) {
    logger(
      `Content within token limit (${tokenCount}/${contextLimit}). Returning raw content.`
    );
    return content;
  }

  introspect(
    `This page's content is way too long (${tokenCount} tokens). I will summarize it right now.`
  );
  const summary = await summarizeContent({
    provider,
    model,
    content,
  });

  introspect(`Successfully summarized content`);
  return summary;
}
|
|
|
|
/**
 * Parse HTML with a CSS selector.
 *
 * Without a selector the HTML is returned untouched. With a selector, the
 * inner HTML of every matching element is collected; multiple matches are
 * joined with a newline.
 *
 * @param {string} html - The HTML to parse
 * @param {string|null} selector - The CSS selector to use (as text string)
 * @param {{introspect: Function}} context - The context object
 * @returns {{success: boolean, content: (string|null)}} `success` is false
 *   (with `content` null) when there is no HTML to search or nothing matches
 *   the selector; otherwise `content` holds the selected HTML.
 */
function parseHTMLwithSelector(html, selector = null, context) {
  if (!selector || selector.length === 0) {
    context.introspect("No selector provided. Returning the entire HTML.");
    return { success: true, content: html };
  }

  // Nothing to search: report a failed selection through the result object
  // rather than letting the HTML parser throw on a null/undefined document.
  if (html == null) {
    return { success: false, content: null };
  }

  // Loaded lazily so the parser is only required when a selector is used.
  const Cheerio = require("cheerio");
  const $ = Cheerio.load(html);
  const selectedElements = $(selector);

  if (selectedElements.length === 0) {
    return { success: false, content: null };
  }

  if (selectedElements.length === 1) {
    return { success: true, content: selectedElements.html() };
  }

  context.introspect(
    `Found ${selectedElements.length} elements matching selector: ${selector}`
  );
  const content = selectedElements
    .map((_, element) => $(element).html())
    .get()
    .join("\n");
  return { success: true, content };
}
|
|
|
|
// Expose the web-scraping step executor as this module's export.
module.exports = executeWebScraping;
|