const { v4 } = require("uuid");
const {
  PuppeteerWebBaseLoader,
} = require("langchain/document_loaders/web/puppeteer");
const { default: slugify } = require("slugify");
const { parse } = require("node-html-parser");
const { writeToServerDocuments, documentsFolder } = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const path = require("path");
const fs = require("fs");
const RuntimeSettings = require("../../runtimeSettings");

/**
 * Picks the Puppeteer `headless` launch value for the current environment.
 *
 * On MacOS 15.1, the headless=new option causes the browser to crash immediately.
 * It is not clear why this is the case, but it is reproducible. Since AnythingLLM
 * in production runs in a container, we can disable headless mode to workaround
 * the issue for development purposes.
 *
 * This may show a popup window when scraping a page in development mode.
 * This is expected behavior if seen in development mode on MacOS 15+
 *
 * @returns {"new" | false} `"new"` normally; boolean `false` on darwin in development.
 */
function resolveHeadlessMode() {
  if (process.platform === "darwin" && process.env.NODE_ENV === "development") {
    console.log(
      "Darwin Development Mode: Disabling headless mode to prevent Chromium from crashing."
    );
    // Must be the boolean `false`: the string "false" is truthy and would
    // leave the browser running headless.
    return false;
  }
  return "new";
}

/**
 * Breadth-first discovery of in-scope links starting from `startUrl`.
 *
 * Pages are visited level by level. At most `maxDepth` levels are fetched and
 * discovery stops as soon as `maxLinks` URLs (start URL included) are known.
 *
 * @param {string} startUrl - Absolute URL the crawl starts from.
 * @param {number} [maxDepth=1] - Number of link levels whose pages are fetched.
 * @param {number} [maxLinks=20] - Upper bound on the number of returned URLs.
 * @returns {Promise<string[]>} Discovered URLs, start URL first.
 * @throws {TypeError} If `startUrl` is not a valid absolute URL.
 */
async function discoverLinks(startUrl, maxDepth = 1, maxLinks = 20) {
  const baseUrl = new URL(startUrl);
  // Seed with the normalized href so the start page is not discovered a second
  // time under its normalized spelling (e.g. with a trailing slash).
  const discoveredLinks = new Set([baseUrl.href]);
  const scrapedUrls = new Set();
  let queue = [[baseUrl.href, 0]]; // [url, depth]

  for (let currentDepth = 0; currentDepth < maxDepth; currentDepth++) {
    const nextQueue = [];

    for (const [currentUrl, urlDepth] of queue) {
      if (discoveredLinks.size >= maxLinks) break;
      if (scrapedUrls.has(currentUrl)) continue;
      scrapedUrls.add(currentUrl);

      const newLinks = await getPageLinks(currentUrl, baseUrl);
      for (const link of newLinks) {
        if (discoveredLinks.size >= maxLinks) break;
        if (discoveredLinks.has(link)) continue;
        discoveredLinks.add(link);
        // Only queue pages that will still be fetched within maxDepth.
        if (urlDepth + 1 < maxDepth) nextQueue.push([link, urlDepth + 1]);
      }
    }

    queue = nextQueue;
    if (queue.length === 0 || discoveredLinks.size >= maxLinks) break;
  }

  return Array.from(discoveredLinks);
}

/**
 * Loads `url` in a browser and returns the in-scope links found on the page.
 * Best-effort: any failure is logged and yields an empty list.
 *
 * @param {string} url - Page to load.
 * @param {URL} baseUrl - Start URL of the crawl; defines which links are in scope.
 * @returns {Promise<string[]>} Absolute in-scope URLs found on the page.
 */
async function getPageLinks(url, baseUrl) {
  try {
    const runtimeSettings = new RuntimeSettings();
    const loader = new PuppeteerWebBaseLoader(url, {
      launchOptions: {
        headless: resolveHeadlessMode(),
        ignoreHTTPSErrors: true,
        args: runtimeSettings.get("browserLaunchArgs"),
      },
      gotoOptions: { waitUntil: "networkidle2" },
    });
    const docs = await loader.load();
    const html = docs[0].pageContent;
    // Relative hrefs must be resolved against the page they appear on.
    return extractLinks(html, baseUrl, url);
  } catch (error) {
    console.error(`Failed to get page links from ${url}.`, error);
    return [];
  }
}

/**
 * Extracts the unique in-scope links from an HTML document.
 *
 * A link is in scope when it has the same origin as `baseUrl` and its path is
 * the parent directory of `baseUrl`'s path or lies below it.
 *
 * @param {string} html - HTML source of the page.
 * @param {URL} baseUrl - Start URL of the crawl; defines the scope.
 * @param {string} [pageUrl=baseUrl.href] - URL of the page `html` came from,
 *   used to resolve relative hrefs.
 * @returns {string[]} Absolute, de-duplicated in-scope URLs.
 */
function extractLinks(html, baseUrl, pageUrl = baseUrl.href) {
  const root = parse(html);
  // Parent directory of the start URL's path ("" for a root-level start URL).
  const scopePath = baseUrl.pathname.split("/").slice(0, -1).join("/");
  const extractedLinks = new Set();

  for (const anchor of root.querySelectorAll("a")) {
    const href = anchor.getAttribute("href");
    if (!href) continue;

    let target;
    try {
      target = new URL(href, pageUrl);
    } catch (error) {
      // A single malformed href must not discard the page's other links.
      continue;
    }

    // Compare origins exactly: a string prefix test on the href would accept
    // hosts such as "example.com.evil.com" for "example.com".
    if (target.origin !== baseUrl.origin) continue;

    const isInScope =
      target.pathname === scopePath ||
      target.pathname.startsWith(`${scopePath}/`);
    if (isInScope) extractedLinks.add(target.href);
  }

  return Array.from(extractedLinks);
}

/**
 * Scrapes the visible text of every link and writes one document per page
 * into `outFolderPath`. Pages that fail or have no text are skipped.
 *
 * @param {string[]} links - Absolute URLs to scrape.
 * @param {string} outFolderPath - Destination folder for the written documents.
 * @returns {Promise<object[]>} The document records that were written.
 */
async function bulkScrapePages(links, outFolderPath) {
  const runtimeSettings = new RuntimeSettings();
  const headless = resolveHeadlessMode();
  const scrapedData = [];

  for (let i = 0; i < links.length; i++) {
    const link = links[i];
    console.log(`Scraping ${i + 1}/${links.length}: ${link}`);

    try {
      const loader = new PuppeteerWebBaseLoader(link, {
        launchOptions: {
          headless,
          ignoreHTTPSErrors: true,
          args: runtimeSettings.get("browserLaunchArgs"),
        },
        gotoOptions: { waitUntil: "networkidle2" },
        async evaluate(page, browser) {
          const result = await page.evaluate(() => document.body.innerText);
          await browser.close();
          return result;
        },
      });
      const docs = await loader.load();
      const content = docs[0].pageContent;

      if (!content.length) {
        console.warn(`Empty content for ${link}. Skipping.`);
        continue;
      }

      const url = new URL(link);
      const decodedPathname = decodeURIComponent(url.pathname);
      const filename = `${url.hostname}${decodedPathname.replace(/\//g, "_")}`;

      const data = {
        id: v4(),
        url: "file://" + slugify(filename) + ".html",
        title: slugify(filename) + ".html",
        docAuthor: "no author found",
        description: "No description found.",
        docSource: "URL link uploaded by the user.",
        chunkSource: `link://${link}`,
        published: new Date().toLocaleString(),
        wordCount: content.split(" ").length,
        pageContent: content,
        token_count_estimate: tokenizeString(content),
      };

      writeToServerDocuments({
        data,
        filename: data.title,
        destinationOverride: outFolderPath,
      });
      scrapedData.push(data);
      console.log(`Successfully scraped ${link}.`);
    } catch (error) {
      // Best-effort: one failing page must not abort the remaining pages.
      console.error(`Failed to scrape ${link}.`, error);
    }
  }

  return scrapedData;
}

/**
 * Crawls a website starting at `startUrl` and stores the text of every
 * discovered page in a new folder below `documentsFolder`.
 *
 * @param {string} startUrl - Absolute URL the crawl starts from.
 * @param {number} [depth=1] - Number of link levels to follow.
 * @param {number} [maxLinks=20] - Maximum number of pages to scrape.
 * @returns {Promise<object[]>} The document records that were written.
 * @throws {TypeError} If `startUrl` is not a valid absolute URL.
 */
async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
  const websiteName = new URL(startUrl).hostname;
  // Short random suffix keeps repeated scrapes of one site in separate folders.
  const outFolder = slugify(
    `${slugify(websiteName)}-${v4().slice(0, 4)}`
  ).toLowerCase();
  const outFolderPath = path.resolve(documentsFolder, outFolder);

  console.log("Discovering links...");
  const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
  console.log(`Found ${linksToScrape.length} links to scrape.`);

  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });

  console.log("Starting bulk scraping...");
  const scrapedData = await bulkScrapePages(linksToScrape, outFolderPath);
  console.log(`Scraped ${scrapedData.length} pages.`);

  return scrapedData;
}

module.exports = websiteScraper;