Improve URL handler for collector processes (#4504)

* Improve URL handler for collector processes

* dev build
This commit is contained in:
Timothy Carambat 2025-10-07 11:03:27 -07:00 committed by GitHub
parent 3259ede98a
commit cf3fbcbf0f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 161 additions and 6 deletions

View File

@ -6,7 +6,7 @@ concurrency:
on:
push:
branches: ['4499-tooltips'] # put your current branch to create a build. Core team only.
branches: ['improve-url-handler-collector'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'

View File

@ -0,0 +1,112 @@
/**
 * Unit tests for the URL helpers in ../../../utils/url:
 * - validURL: strict validation (protocol allow-list + private-IP blocking)
 * - validateURL: best-effort normalization of user-provided URLs
 */
const { validURL, validateURL } = require("../../../utils/url");
// Mock the RuntimeSettings module
// NOTE: jest.mock() is hoisted above the require() call, and the factory
// closes over a single mockInstance, so every `new RuntimeSettings()` —
// here and inside the module under test — returns the same object. That
// lets each test program `get` and have validURL observe it.
jest.mock("../../../utils/runtimeSettings", () => {
  const mockInstance = {
    get: jest.fn(),
    set: jest.fn(),
  };
  return jest.fn().mockImplementation(() => mockInstance);
});
describe("validURL", () => {
  let mockRuntimeSettings;
  beforeEach(() => {
    // Re-require the mocked constructor to grab the shared instance,
    // then clear recorded calls so each test starts clean.
    const RuntimeSettings = require("../../../utils/runtimeSettings");
    mockRuntimeSettings = new RuntimeSettings();
    jest.clearAllMocks();
  });
  it("should validate a valid URL", () => {
    // Default runtime behavior: private IPs blocked, warning suppressed.
    mockRuntimeSettings.get.mockImplementation((key) => {
      if (key === "allowAnyIp") return false;
      if (key === "seenAnyIpWarning") return true; // silence the warning for tests
      return false;
    });
    expect(validURL("https://www.google.com")).toBe(true);
    expect(validURL("http://www.google.com")).toBe(true);
    // JS URL does not require extensions, so in theory
    // these should be valid
    expect(validURL("https://random")).toBe(true);
    expect(validURL("http://123")).toBe(true);
    // missing protocols
    expect(validURL("www.google.com")).toBe(false);
    expect(validURL("google.com")).toBe(false);
    // invalid protocols
    expect(validURL("ftp://www.google.com")).toBe(false);
    expect(validURL("mailto://www.google.com")).toBe(false);
    expect(validURL("tel://www.google.com")).toBe(false);
    expect(validURL("data://www.google.com")).toBe(false);
  });
  it("should block private/local IPs when allowAnyIp is false (default behavior)", () => {
    mockRuntimeSettings.get.mockImplementation((key) => {
      if (key === "allowAnyIp") return false;
      if (key === "seenAnyIpWarning") return true; // silence the warning for tests
      return false;
    });
    // RFC1918 ranges must be rejected so the collector cannot be used
    // to probe internal network hosts (SSRF guard).
    expect(validURL("http://192.168.1.1")).toBe(false);
    expect(validURL("http://10.0.0.1")).toBe(false);
    expect(validURL("http://172.16.0.1")).toBe(false);
    // But localhost should still be allowed
    expect(validURL("http://127.0.0.1")).toBe(true);
    expect(validURL("http://0.0.0.0")).toBe(true);
  });
  it("should allow any IP when allowAnyIp is true", () => {
    // Opt-in escape hatch (COLLECTOR_ALLOW_ANY_IP) disables the SSRF guard.
    mockRuntimeSettings.get.mockImplementation((key) => {
      if (key === "allowAnyIp") return true;
      if (key === "seenAnyIpWarning") return true; // silence the warning for tests
      return false;
    });
    expect(validURL("http://192.168.1.1")).toBe(true);
    expect(validURL("http://10.0.0.1")).toBe(true);
    expect(validURL("http://172.16.0.1")).toBe(true);
  });
});
describe("validateURL", () => {
  // validateURL does not consult RuntimeSettings, so no mock setup is needed.
  it("should return the exact same URL if it's already valid", () => {
    expect(validateURL("https://www.google.com")).toBe("https://www.google.com");
    expect(validateURL("http://www.google.com")).toBe("http://www.google.com");
    expect(validateURL("https://random")).toBe("https://random");
    // With numbers as a url this will turn into an ip
    // (WHATWG URL parsing treats a bare numeric host as an IPv4 address)
    expect(validateURL("123")).toBe("https://0.0.0.123");
    expect(validateURL("123.123.123.123")).toBe("https://123.123.123.123");
    expect(validateURL("http://127.0.123.45")).toBe("http://127.0.123.45");
  });
  it("should assume https:// if the URL doesn't have a protocol", () => {
    expect(validateURL("www.google.com")).toBe("https://www.google.com");
    expect(validateURL("google.com")).toBe("https://google.com");
    // Non-http(s) protocols pass through untouched here; validURL is the
    // strict gate that rejects them afterwards.
    expect(validateURL("ftp://www.google.com")).toBe("ftp://www.google.com");
    expect(validateURL("mailto://www.google.com")).toBe("mailto://www.google.com");
    expect(validateURL("tel://www.google.com")).toBe("tel://www.google.com");
    expect(validateURL("data://www.google.com")).toBe("data://www.google.com");
  });
  it("should remove trailing slashes post-validation", () => {
    expect(validateURL("https://www.google.com/")).toBe("https://www.google.com");
    expect(validateURL("http://www.google.com/")).toBe("http://www.google.com");
    expect(validateURL("https://random/")).toBe("https://random");
  });
  it("should handle edge cases and bad data inputs", () => {
    // Non-strings fall out of the try (no .trim) and become "".
    expect(validateURL({})).toBe("");
    expect(validateURL(null)).toBe("");
    expect(validateURL(undefined)).toBe("");
    expect(validateURL(124512)).toBe("");
    expect(validateURL("")).toBe("");
    expect(validateURL(" ")).toBe("");
    // Unparseable strings are returned trimmed, not emptied.
    expect(validateURL(" look here! ")).toBe("look here!");
  });
});

View File

@ -2,7 +2,7 @@ const { setDataSigner } = require("../middleware/setDataSigner");
const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
const { resolveRepoLoader, resolveRepoLoaderFunction } = require("../utils/extensions/RepoLoader");
const { reqBody } = require("../utils/http");
const { validURL } = require("../utils/url");
const { validURL, validateURL } = require("../utils/url");
const RESYNC_METHODS = require("./resync");
const { loadObsidianVault } = require("../utils/extensions/ObsidianVault");
@ -119,6 +119,7 @@ function extensions(app) {
try {
const websiteDepth = require("../utils/extensions/WebsiteDepth");
// `url` is reassigned with its normalized form below, so it must be
// declared with `let` — destructuring into `const` and then assigning
// throws "TypeError: Assignment to constant variable" at runtime,
// breaking this endpoint for every request.
let { url, depth = 1, maxLinks = 20 } = reqBody(request);
url = validateURL(url);
if (!validURL(url)) throw new Error("Not a valid URL.");
const scrapedData = await websiteDepth(url, depth, maxLinks);
response.status(200).json({ success: true, data: scrapedData });

View File

@ -111,7 +111,7 @@ async function scrapeGenericUrl({
headers: scraperHeaders,
});
if (!content.length) {
if (!content || !content.length) {
console.error(`Resulting URL content was empty at ${link}.`);
return returnResult({
success: false,

View File

@ -1,5 +1,6 @@
const { validURL } = require("../utils/url");
const { scrapeGenericUrl } = require("./convert/generic");
const { validateURL } = require("../utils/url");
/**
* Process a link and return the text content. This util will save the link as a document
@ -10,6 +11,7 @@ const { scrapeGenericUrl } = require("./convert/generic");
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
*/
async function processLink(link, scraperHeaders = {}, metadata = {}) {
link = validateURL(link);
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
return await scrapeGenericUrl({
link,
@ -28,6 +30,7 @@ async function processLink(link, scraperHeaders = {}, metadata = {}) {
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
*/
async function getLinkText(link, captureAs = "text") {
link = validateURL(link);
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
return await scrapeGenericUrl({
link,

View File

@ -54,7 +54,7 @@ function isInvalidIp({ hostname }) {
}
/**
* Validates a URL
* Validates a URL strictly
* - Checks the URL forms a valid URL
* - Checks the URL is at least HTTP(S)
* - Checks the URL is not an internal IP - can be bypassed via COLLECTOR_ALLOW_ANY_IP
@ -71,6 +71,33 @@ function validURL(url) {
return false;
}
/**
 * Normalizes a user-provided URL so it can be strictly validated/fetched:
 * - Assumes `https://` when no protocol is present
 * - Round-trips the string through `new URL()` to canonicalize it
 *   (the URL constructor lowercases the protocol and hostname for us)
 * - Strips a single trailing slash from the result
 * Falls back to the trimmed input (or "" for non-string input) when the
 * value cannot be parsed as a URL at all.
 * @param {string} url
 * @returns {string}
 */
function validateURL(url) {
  try {
    const trimmed = url.trim();
    // Do NOT lowercase the whole string: paths and query strings are
    // case-sensitive (signed URLs, tokens), and WHATWG URL parsing already
    // canonicalizes the scheme and host to lowercase on its own.
    const destination = trimmed.includes("://")
      ? new URL(trimmed).toString() // protocol present - parse as-is
      : new URL(`https://${trimmed}`).toString(); // no protocol - assume https://
    // new URL().toString() appends "/" to bare origins - drop it.
    return destination.endsWith("/") ? destination.slice(0, -1) : destination;
  } catch {
    // .trim() threw (non-string) or the URL constructor rejected the input.
    if (typeof url !== "string") return "";
    return url.trim();
  }
}
module.exports = {
validURL,
validateURL,
};

View File

@ -45,6 +45,12 @@ const webScraping = {
if (url) return await this.scrape(url);
return "There is nothing we can do. This function call returns no information.";
} catch (error) {
this.super.handlerProps.log(
`Web Scraping Error: ${error.message}`
);
this.super.introspect(
`${this.caller}: Web Scraping Error: ${error.message}`
);
return `There was an error while calling the function. No data or response was found. Let the user know this was the error: ${error.message}`;
}
},
@ -78,15 +84,21 @@ const webScraping = {
}
const { TokenManager } = require("../../../helpers/tiktoken");
const tokenEstimate = new TokenManager(
this.super.model
).countFromString(content);
if (
new TokenManager(this.super.model).countFromString(content) <
tokenEstimate <
Provider.contextLimit(this.super.provider, this.super.model)
) {
this.super.introspect(
`${this.caller}: Looking over the content of the page. ~${tokenEstimate} tokens.`
);
return content;
}
this.super.introspect(
`${this.caller}: This page's content is way too long. I will summarize it right now.`
`${this.caller}: This page's content exceeds the model's context limit. Summarizing it right now.`
);
this.super.onAbort(() => {
this.super.handlerProps.log(