Improve URL handler for collector processes (#4504)

* Improve URL handler for collector processes

* dev build
This commit is contained in:
Timothy Carambat 2025-10-07 11:03:27 -07:00 committed by GitHub
parent 3259ede98a
commit cf3fbcbf0f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 161 additions and 6 deletions

View File

@ -6,7 +6,7 @@ concurrency:
on:
push:
branches: ['4499-tooltips'] # put your current branch to create a build. Core team only.
branches: ['improve-url-handler-collector'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'

View File

@ -0,0 +1,112 @@
/**
 * Unit tests for the URL helpers in ../../../utils/url:
 * - validURL: strict validation (protocol allow-list + private-IP blocking)
 * - validateURL: best-effort normalization of user-provided URLs
 */
const { validURL, validateURL } = require("../../../utils/url");
// Mock the RuntimeSettings module
// NOTE: jest.mock() is hoisted above the require() call, and the factory
// closes over a single mockInstance, so every `new RuntimeSettings()` —
// here and inside the module under test — returns the same object. That
// lets each test program `get` and have validURL observe it.
jest.mock("../../../utils/runtimeSettings", () => {
  const mockInstance = {
    get: jest.fn(),
    set: jest.fn(),
  };
  return jest.fn().mockImplementation(() => mockInstance);
});
describe("validURL", () => {
  let mockRuntimeSettings;
  beforeEach(() => {
    // Re-require the mocked constructor to grab the shared instance,
    // then clear recorded calls so each test starts clean.
    const RuntimeSettings = require("../../../utils/runtimeSettings");
    mockRuntimeSettings = new RuntimeSettings();
    jest.clearAllMocks();
  });
  it("should validate a valid URL", () => {
    // Default runtime behavior: private IPs blocked, warning suppressed.
    mockRuntimeSettings.get.mockImplementation((key) => {
      if (key === "allowAnyIp") return false;
      if (key === "seenAnyIpWarning") return true; // silence the warning for tests
      return false;
    });
    expect(validURL("https://www.google.com")).toBe(true);
    expect(validURL("http://www.google.com")).toBe(true);
    // JS URL does not require extensions, so in theory
    // these should be valid
    expect(validURL("https://random")).toBe(true);
    expect(validURL("http://123")).toBe(true);
    // missing protocols
    expect(validURL("www.google.com")).toBe(false);
    expect(validURL("google.com")).toBe(false);
    // invalid protocols
    expect(validURL("ftp://www.google.com")).toBe(false);
    expect(validURL("mailto://www.google.com")).toBe(false);
    expect(validURL("tel://www.google.com")).toBe(false);
    expect(validURL("data://www.google.com")).toBe(false);
  });
  it("should block private/local IPs when allowAnyIp is false (default behavior)", () => {
    mockRuntimeSettings.get.mockImplementation((key) => {
      if (key === "allowAnyIp") return false;
      if (key === "seenAnyIpWarning") return true; // silence the warning for tests
      return false;
    });
    // RFC1918 ranges must be rejected so the collector cannot be used
    // to probe internal network hosts (SSRF guard).
    expect(validURL("http://192.168.1.1")).toBe(false);
    expect(validURL("http://10.0.0.1")).toBe(false);
    expect(validURL("http://172.16.0.1")).toBe(false);
    // But localhost should still be allowed
    expect(validURL("http://127.0.0.1")).toBe(true);
    expect(validURL("http://0.0.0.0")).toBe(true);
  });
  it("should allow any IP when allowAnyIp is true", () => {
    // Opt-in escape hatch (COLLECTOR_ALLOW_ANY_IP) disables the SSRF guard.
    mockRuntimeSettings.get.mockImplementation((key) => {
      if (key === "allowAnyIp") return true;
      if (key === "seenAnyIpWarning") return true; // silence the warning for tests
      return false;
    });
    expect(validURL("http://192.168.1.1")).toBe(true);
    expect(validURL("http://10.0.0.1")).toBe(true);
    expect(validURL("http://172.16.0.1")).toBe(true);
  });
});
describe("validateURL", () => {
  // validateURL does not consult RuntimeSettings, so no mock setup is needed.
  it("should return the exact same URL if it's already valid", () => {
    expect(validateURL("https://www.google.com")).toBe("https://www.google.com");
    expect(validateURL("http://www.google.com")).toBe("http://www.google.com");
    expect(validateURL("https://random")).toBe("https://random");
    // With numbers as a url this will turn into an ip
    // (WHATWG URL parsing treats a bare numeric host as an IPv4 address)
    expect(validateURL("123")).toBe("https://0.0.0.123");
    expect(validateURL("123.123.123.123")).toBe("https://123.123.123.123");
    expect(validateURL("http://127.0.123.45")).toBe("http://127.0.123.45");
  });
  it("should assume https:// if the URL doesn't have a protocol", () => {
    expect(validateURL("www.google.com")).toBe("https://www.google.com");
    expect(validateURL("google.com")).toBe("https://google.com");
    // Non-http(s) protocols pass through untouched here; validURL is the
    // strict gate that rejects them afterwards.
    expect(validateURL("ftp://www.google.com")).toBe("ftp://www.google.com");
    expect(validateURL("mailto://www.google.com")).toBe("mailto://www.google.com");
    expect(validateURL("tel://www.google.com")).toBe("tel://www.google.com");
    expect(validateURL("data://www.google.com")).toBe("data://www.google.com");
  });
  it("should remove trailing slashes post-validation", () => {
    expect(validateURL("https://www.google.com/")).toBe("https://www.google.com");
    expect(validateURL("http://www.google.com/")).toBe("http://www.google.com");
    expect(validateURL("https://random/")).toBe("https://random");
  });
  it("should handle edge cases and bad data inputs", () => {
    // Non-strings fall out of the try (no .trim) and become "".
    expect(validateURL({})).toBe("");
    expect(validateURL(null)).toBe("");
    expect(validateURL(undefined)).toBe("");
    expect(validateURL(124512)).toBe("");
    expect(validateURL("")).toBe("");
    expect(validateURL(" ")).toBe("");
    // Unparseable strings are returned trimmed, not emptied.
    expect(validateURL(" look here! ")).toBe("look here!");
  });
});

View File

@ -2,7 +2,7 @@ const { setDataSigner } = require("../middleware/setDataSigner");
const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
const { resolveRepoLoader, resolveRepoLoaderFunction } = require("../utils/extensions/RepoLoader");
const { reqBody } = require("../utils/http");
const { validURL } = require("../utils/url");
const { validURL, validateURL } = require("../utils/url");
const RESYNC_METHODS = require("./resync");
const { loadObsidianVault } = require("../utils/extensions/ObsidianVault");
@ -119,6 +119,7 @@ function extensions(app) {
try {
const websiteDepth = require("../utils/extensions/WebsiteDepth");
// `url` is reassigned with its normalized form below, so it must be
// declared with `let` — destructuring into `const` and then assigning
// throws "TypeError: Assignment to constant variable" at runtime,
// breaking this endpoint for every request.
let { url, depth = 1, maxLinks = 20 } = reqBody(request);
url = validateURL(url);
if (!validURL(url)) throw new Error("Not a valid URL.");
const scrapedData = await websiteDepth(url, depth, maxLinks);
response.status(200).json({ success: true, data: scrapedData });

View File

@ -111,7 +111,7 @@ async function scrapeGenericUrl({
headers: scraperHeaders,
});
if (!content.length) {
if (!content || !content.length) {
console.error(`Resulting URL content was empty at ${link}.`);
return returnResult({
success: false,

View File

@ -1,5 +1,6 @@
const { validURL } = require("../utils/url");
const { scrapeGenericUrl } = require("./convert/generic");
const { validateURL } = require("../utils/url");
/**
* Process a link and return the text content. This util will save the link as a document
@ -10,6 +11,7 @@ const { scrapeGenericUrl } = require("./convert/generic");
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
*/
async function processLink(link, scraperHeaders = {}, metadata = {}) {
link = validateURL(link);
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
return await scrapeGenericUrl({
link,
@ -28,6 +30,7 @@ async function processLink(link, scraperHeaders = {}, metadata = {}) {
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
*/
async function getLinkText(link, captureAs = "text") {
link = validateURL(link);
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
return await scrapeGenericUrl({
link,

View File

@ -54,7 +54,7 @@ function isInvalidIp({ hostname }) {
}
/**
* Validates a URL
* Validates a URL strictly
* - Checks the URL forms a valid URL
* - Checks the URL is at least HTTP(S)
* - Checks the URL is not an internal IP - can be bypassed via COLLECTOR_ALLOW_ANY_IP
@ -71,6 +71,33 @@ function validURL(url) {
return false;
}
/**
 * Normalizes a user-provided URL so it can be strictly validated/fetched:
 * - Assumes `https://` when no protocol is present
 * - Round-trips the string through `new URL()` to canonicalize it
 *   (the URL constructor lowercases the protocol and hostname for us)
 * - Strips a single trailing slash from the result
 * Falls back to the trimmed input (or "" for non-string input) when the
 * value cannot be parsed as a URL at all.
 * @param {string} url
 * @returns {string}
 */
function validateURL(url) {
  try {
    const trimmed = url.trim();
    // Do NOT lowercase the whole string: paths and query strings are
    // case-sensitive (signed URLs, tokens), and WHATWG URL parsing already
    // canonicalizes the scheme and host to lowercase on its own.
    const destination = trimmed.includes("://")
      ? new URL(trimmed).toString() // protocol present - parse as-is
      : new URL(`https://${trimmed}`).toString(); // no protocol - assume https://
    // new URL().toString() appends "/" to bare origins - drop it.
    return destination.endsWith("/") ? destination.slice(0, -1) : destination;
  } catch {
    // .trim() threw (non-string) or the URL constructor rejected the input.
    if (typeof url !== "string") return "";
    return url.trim();
  }
}
module.exports = {
validURL,
validateURL,
};

View File

@ -45,6 +45,12 @@ const webScraping = {
if (url) return await this.scrape(url);
return "There is nothing we can do. This function call returns no information.";
} catch (error) {
this.super.handlerProps.log(
`Web Scraping Error: ${error.message}`
);
this.super.introspect(
`${this.caller}: Web Scraping Error: ${error.message}`
);
return `There was an error while calling the function. No data or response was found. Let the user know this was the error: ${error.message}`;
}
},
@ -78,15 +84,21 @@ const webScraping = {
}
const { TokenManager } = require("../../../helpers/tiktoken");
const tokenEstimate = new TokenManager(
this.super.model
).countFromString(content);
if (
new TokenManager(this.super.model).countFromString(content) <
tokenEstimate <
Provider.contextLimit(this.super.provider, this.super.model)
) {
this.super.introspect(
`${this.caller}: Looking over the content of the page. ~${tokenEstimate} tokens.`
);
return content;
}
this.super.introspect(
`${this.caller}: This page's content is way too long. I will summarize it right now.`
`${this.caller}: This page's content exceeds the model's context limit. Summarizing it right now.`
);
this.super.onAbort(() => {
this.super.handlerProps.log(