Normalize scraper runtime args for bulk-scraper (#5083)
Resolves #5078, closes #5079
This commit is contained in:
parent
70ee112522
commit
d58ff0ea3e
@@ -4,10 +4,11 @@ const {
 } = require("langchain/document_loaders/web/puppeteer");
 const { default: slugify } = require("slugify");
 const { parse } = require("node-html-parser");
-const { writeToServerDocuments } = require("../../files");
+const { writeToServerDocuments, documentsFolder } = require("../../files");
 const { tokenizeString } = require("../../tokenizer");
 const path = require("path");
 const fs = require("fs");
+const RuntimeSettings = require("../../runtimeSettings");
 
 async function discoverLinks(startUrl, maxDepth = 1, maxLinks = 20) {
   const baseUrl = new URL(startUrl);
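
For context: the new require pulls in a RuntimeSettings class whose get("browserLaunchArgs") call both scrapers now share. The module's contents are not part of this diff, so the sketch below is only an assumption about its shape (a simple keyed registry), and the --no-sandbox default is purely illustrative.

// Hypothetical sketch of ../../runtimeSettings; assumed shape only.
// Only the get() accessor is exercised by this diff.
class RuntimeSettings {
  constructor() {
    // "browserLaunchArgs" holds the Chromium flags shared by every scraper.
    // --no-sandbox is a common choice for containerized Chromium, shown
    // here only as an example value, not the project's actual default.
    this.settings = new Map([["browserLaunchArgs", ["--no-sandbox"]]]);
  }

  get(key) {
    return this.settings.get(key) ?? null;
  }
}

module.exports = RuntimeSettings;
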
@@ -46,8 +47,33 @@ async function discoverLinks(startUrl, maxDepth = 1, maxLinks = 20) {
 
 async function getPageLinks(url, baseUrl) {
   try {
+    const runtimeSettings = new RuntimeSettings();
+    /** @type {import('puppeteer').PuppeteerLaunchOptions} */
+    let launchConfig = { headless: "new" };
+
+    /* On macOS 15.1, the headless=new option causes the browser to crash immediately.
+     * It is not clear why this is the case, but it is reproducible. Since AnythingLLM
+     * in production runs in a container, we can disable headless mode to work around the issue for development purposes.
+     *
+     * This may show a popup window when scraping a page in development mode.
+     * This is expected behavior if seen in development mode on macOS 15+.
+     */
+    if (
+      process.platform === "darwin" &&
+      process.env.NODE_ENV === "development"
+    ) {
+      console.log(
+        "Darwin Development Mode: Disabling headless mode to prevent Chromium from crashing."
+      );
+      launchConfig.headless = false;
+    }
+
     const loader = new PuppeteerWebBaseLoader(url, {
-      launchOptions: { headless: "new" },
+      launchOptions: {
+        headless: launchConfig.headless,
+        ignoreHTTPSErrors: true,
+        args: runtimeSettings.get("browserLaunchArgs"),
+      },
       gotoOptions: { waitUntil: "networkidle2" },
     });
     const docs = await loader.load();
@@ -83,6 +109,24 @@ function extractLinks(html, baseUrl) {
 }
 
 async function bulkScrapePages(links, outFolderPath) {
+  const runtimeSettings = new RuntimeSettings();
+  /** @type {import('puppeteer').PuppeteerLaunchOptions} */
+  let launchConfig = { headless: "new" };
+
+  /* On macOS 15.1, the headless=new option causes the browser to crash immediately.
+   * It is not clear why this is the case, but it is reproducible. Since AnythingLLM
+   * in production runs in a container, we can disable headless mode to work around the issue for development purposes.
+   *
+   * This may show a popup window when scraping a page in development mode.
+   * This is expected behavior if seen in development mode on macOS 15+.
+   */
+  if (process.platform === "darwin" && process.env.NODE_ENV === "development") {
+    console.log(
+      "Darwin Development Mode: Disabling headless mode to prevent Chromium from crashing."
+    );
+    launchConfig.headless = false;
+  }
+
   const scrapedData = [];
 
   for (let i = 0; i < links.length; i++) {
@@ -91,7 +135,11 @@ async function bulkScrapePages(links, outFolderPath) {
 
     try {
       const loader = new PuppeteerWebBaseLoader(link, {
-        launchOptions: { headless: "new" },
+        launchOptions: {
+          headless: launchConfig.headless,
+          ignoreHTTPSErrors: true,
+          args: runtimeSettings.get("browserLaunchArgs"),
+        },
         gotoOptions: { waitUntil: "networkidle2" },
         async evaluate(page, browser) {
           const result = await page.evaluate(() => document.body.innerText);
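
Both call sites now assemble identical launch options. As a standalone sketch, the shared logic amounts to the helper below; the name buildLaunchOptions is illustrative and not part of this commit.

// Illustrative consolidation of the launch-option logic duplicated above.
// Assumes RuntimeSettings#get("browserLaunchArgs") returns an array of
// Chromium CLI flags, as both call sites in this diff expect.
const RuntimeSettings = require("../../runtimeSettings");

function buildLaunchOptions() {
  const runtimeSettings = new RuntimeSettings();
  /** @type {import('puppeteer').PuppeteerLaunchOptions} */
  const launchOptions = {
    headless: "new",
    ignoreHTTPSErrors: true,
    args: runtimeSettings.get("browserLaunchArgs"),
  };

  // macOS 15.1 crashes Chromium under headless=new, so development builds
  // fall back to headful mode (see the comment blocks in the diff).
  if (process.platform === "darwin" && process.env.NODE_ENV === "development") {
    launchOptions.headless = false;
  }
  return launchOptions;
}

module.exports = { buildLaunchOptions };
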
@@ -146,14 +194,7 @@ async function websiteScraper(startUrl, depth = 1, maxLinks = 20) {
   const outFolder = slugify(
     `${slugify(websiteName)}-${v4().slice(0, 4)}`
   ).toLowerCase();
-  const outFolderPath =
-    process.env.NODE_ENV === "development"
-      ? path.resolve(
-          __dirname,
-          `../../../../server/storage/documents/${outFolder}`
-        )
-      : path.resolve(process.env.STORAGE_DIR, `documents/${outFolder}`);
+  const outFolderPath = path.resolve(documentsFolder, outFolder);
 
   console.log("Discovering links...");
   const linksToScrape = await discoverLinks(startUrl, depth, maxLinks);
   console.log(`Found ${linksToScrape.length} links to scrape.`);
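
The documentsFolder export replaces the per-caller environment check removed above. Judging from the deleted ternary, ../../files presumably resolves it along these lines; this is a reconstruction, not the module's actual code, and the relative path depth depends on where files.js sits in the tree.

// Sketch of how ../../files could derive documentsFolder, reconstructed
// from the ternary this commit removes; the real export may differ.
const path = require("path");

const documentsFolder =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, "../../../server/storage/documents") // depth approximate
    : path.resolve(process.env.STORAGE_DIR, "documents");

module.exports = { documentsFolder };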