Improve URL handler for collector processes (#4504)
* Improve URL handler for collector processes * dev build
This commit is contained in:
parent
3259ede98a
commit
cf3fbcbf0f
2
.github/workflows/dev-build.yaml
vendored
2
.github/workflows/dev-build.yaml
vendored
@ -6,7 +6,7 @@ concurrency:
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ['4499-tooltips'] # put your current branch to create a build. Core team only.
|
||||
branches: ['improve-url-handler-collector'] # put your current branch to create a build. Core team only.
|
||||
paths-ignore:
|
||||
- '**.md'
|
||||
- 'cloud-deployments/*'
|
||||
|
||||
112
collector/__tests__/utils/url/index.test.js
Normal file
112
collector/__tests__/utils/url/index.test.js
Normal file
@ -0,0 +1,112 @@
|
||||
const { validURL, validateURL } = require("../../../utils/url");
|
||||
|
||||
// Mock the RuntimeSettings module
|
||||
jest.mock("../../../utils/runtimeSettings", () => {
|
||||
const mockInstance = {
|
||||
get: jest.fn(),
|
||||
set: jest.fn(),
|
||||
};
|
||||
return jest.fn().mockImplementation(() => mockInstance);
|
||||
});
|
||||
|
||||
describe("validURL", () => {
|
||||
let mockRuntimeSettings;
|
||||
|
||||
beforeEach(() => {
|
||||
const RuntimeSettings = require("../../../utils/runtimeSettings");
|
||||
mockRuntimeSettings = new RuntimeSettings();
|
||||
jest.clearAllMocks();
|
||||
});
|
||||
|
||||
it("should validate a valid URL", () => {
|
||||
mockRuntimeSettings.get.mockImplementation((key) => {
|
||||
if (key === "allowAnyIp") return false;
|
||||
if (key === "seenAnyIpWarning") return true; // silence the warning for tests
|
||||
return false;
|
||||
});
|
||||
|
||||
expect(validURL("https://www.google.com")).toBe(true);
|
||||
expect(validURL("http://www.google.com")).toBe(true);
|
||||
|
||||
// JS URL does not require extensions, so in theory
|
||||
// these should be valid
|
||||
expect(validURL("https://random")).toBe(true);
|
||||
expect(validURL("http://123")).toBe(true);
|
||||
|
||||
// missing protocols
|
||||
expect(validURL("www.google.com")).toBe(false);
|
||||
expect(validURL("google.com")).toBe(false);
|
||||
|
||||
// invalid protocols
|
||||
expect(validURL("ftp://www.google.com")).toBe(false);
|
||||
expect(validURL("mailto://www.google.com")).toBe(false);
|
||||
expect(validURL("tel://www.google.com")).toBe(false);
|
||||
expect(validURL("data://www.google.com")).toBe(false);
|
||||
});
|
||||
|
||||
it("should block private/local IPs when allowAnyIp is false (default behavior)", () => {
|
||||
mockRuntimeSettings.get.mockImplementation((key) => {
|
||||
if (key === "allowAnyIp") return false;
|
||||
if (key === "seenAnyIpWarning") return true; // silence the warning for tests
|
||||
return false;
|
||||
});
|
||||
|
||||
expect(validURL("http://192.168.1.1")).toBe(false);
|
||||
expect(validURL("http://10.0.0.1")).toBe(false);
|
||||
expect(validURL("http://172.16.0.1")).toBe(false);
|
||||
|
||||
// But localhost should still be allowed
|
||||
expect(validURL("http://127.0.0.1")).toBe(true);
|
||||
expect(validURL("http://0.0.0.0")).toBe(true);
|
||||
});
|
||||
|
||||
it("should allow any IP when allowAnyIp is true", () => {
|
||||
mockRuntimeSettings.get.mockImplementation((key) => {
|
||||
if (key === "allowAnyIp") return true;
|
||||
if (key === "seenAnyIpWarning") return true; // silence the warning for tests
|
||||
return false;
|
||||
});
|
||||
|
||||
expect(validURL("http://192.168.1.1")).toBe(true);
|
||||
expect(validURL("http://10.0.0.1")).toBe(true);
|
||||
expect(validURL("http://172.16.0.1")).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("validateURL", () => {
|
||||
it("should return the exact same URL if it's already valid", () => {
|
||||
expect(validateURL("https://www.google.com")).toBe("https://www.google.com");
|
||||
expect(validateURL("http://www.google.com")).toBe("http://www.google.com");
|
||||
expect(validateURL("https://random")).toBe("https://random");
|
||||
|
||||
// With numbers as a url this will turn into an ip
|
||||
expect(validateURL("123")).toBe("https://0.0.0.123");
|
||||
expect(validateURL("123.123.123.123")).toBe("https://123.123.123.123");
|
||||
expect(validateURL("http://127.0.123.45")).toBe("http://127.0.123.45");
|
||||
});
|
||||
|
||||
it("should assume https:// if the URL doesn't have a protocol", () => {
|
||||
expect(validateURL("www.google.com")).toBe("https://www.google.com");
|
||||
expect(validateURL("google.com")).toBe("https://google.com");
|
||||
expect(validateURL("ftp://www.google.com")).toBe("ftp://www.google.com");
|
||||
expect(validateURL("mailto://www.google.com")).toBe("mailto://www.google.com");
|
||||
expect(validateURL("tel://www.google.com")).toBe("tel://www.google.com");
|
||||
expect(validateURL("data://www.google.com")).toBe("data://www.google.com");
|
||||
});
|
||||
|
||||
it("should remove trailing slashes post-validation", () => {
|
||||
expect(validateURL("https://www.google.com/")).toBe("https://www.google.com");
|
||||
expect(validateURL("http://www.google.com/")).toBe("http://www.google.com");
|
||||
expect(validateURL("https://random/")).toBe("https://random");
|
||||
});
|
||||
|
||||
it("should handle edge cases and bad data inputs", () => {
|
||||
expect(validateURL({})).toBe("");
|
||||
expect(validateURL(null)).toBe("");
|
||||
expect(validateURL(undefined)).toBe("");
|
||||
expect(validateURL(124512)).toBe("");
|
||||
expect(validateURL("")).toBe("");
|
||||
expect(validateURL(" ")).toBe("");
|
||||
expect(validateURL(" look here! ")).toBe("look here!");
|
||||
});
|
||||
});
|
||||
@ -2,7 +2,7 @@ const { setDataSigner } = require("../middleware/setDataSigner");
|
||||
const { verifyPayloadIntegrity } = require("../middleware/verifyIntegrity");
|
||||
const { resolveRepoLoader, resolveRepoLoaderFunction } = require("../utils/extensions/RepoLoader");
|
||||
const { reqBody } = require("../utils/http");
|
||||
const { validURL } = require("../utils/url");
|
||||
const { validURL, validateURL } = require("../utils/url");
|
||||
const RESYNC_METHODS = require("./resync");
|
||||
const { loadObsidianVault } = require("../utils/extensions/ObsidianVault");
|
||||
|
||||
@ -119,6 +119,7 @@ function extensions(app) {
|
||||
try {
|
||||
const websiteDepth = require("../utils/extensions/WebsiteDepth");
|
||||
const { url, depth = 1, maxLinks = 20 } = reqBody(request);
|
||||
url = validateURL(url);
|
||||
if (!validURL(url)) throw new Error("Not a valid URL.");
|
||||
const scrapedData = await websiteDepth(url, depth, maxLinks);
|
||||
response.status(200).json({ success: true, data: scrapedData });
|
||||
|
||||
@ -111,7 +111,7 @@ async function scrapeGenericUrl({
|
||||
headers: scraperHeaders,
|
||||
});
|
||||
|
||||
if (!content.length) {
|
||||
if (!content || !content.length) {
|
||||
console.error(`Resulting URL content was empty at ${link}.`);
|
||||
return returnResult({
|
||||
success: false,
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
const { validURL } = require("../utils/url");
|
||||
const { scrapeGenericUrl } = require("./convert/generic");
|
||||
const { validateURL } = require("../utils/url");
|
||||
|
||||
/**
|
||||
* Process a link and return the text content. This util will save the link as a document
|
||||
@ -10,6 +11,7 @@ const { scrapeGenericUrl } = require("./convert/generic");
|
||||
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
|
||||
*/
|
||||
async function processLink(link, scraperHeaders = {}, metadata = {}) {
|
||||
link = validateURL(link);
|
||||
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
|
||||
return await scrapeGenericUrl({
|
||||
link,
|
||||
@ -28,6 +30,7 @@ async function processLink(link, scraperHeaders = {}, metadata = {}) {
|
||||
* @returns {Promise<{success: boolean, content: string}>} - Response from collector
|
||||
*/
|
||||
async function getLinkText(link, captureAs = "text") {
|
||||
link = validateURL(link);
|
||||
if (!validURL(link)) return { success: false, reason: "Not a valid URL." };
|
||||
return await scrapeGenericUrl({
|
||||
link,
|
||||
|
||||
@ -54,7 +54,7 @@ function isInvalidIp({ hostname }) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates a URL
|
||||
* Validates a URL strictly
|
||||
* - Checks the URL forms a valid URL
|
||||
* - Checks the URL is at least HTTP(S)
|
||||
* - Checks the URL is not an internal IP - can be bypassed via COLLECTOR_ALLOW_ANY_IP
|
||||
@ -71,6 +71,33 @@ function validURL(url) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Modifies a URL to be valid:
|
||||
* - Checks the URL is at least HTTP(S) so that protocol exists
|
||||
* - Checks the URL forms a valid URL
|
||||
* @param {string} url
|
||||
* @returns {string}
|
||||
*/
|
||||
function validateURL(url) {
|
||||
try {
|
||||
let destination = url.trim().toLowerCase();
|
||||
// If the URL has a protocol, just pass through
|
||||
if (destination.includes("://")) {
|
||||
destination = new URL(destination).toString();
|
||||
} else {
|
||||
// If the URL doesn't have a protocol, assume https://
|
||||
destination = new URL(`https://${destination.trim()}`).toString();
|
||||
}
|
||||
|
||||
// If the URL ends with a slash, remove it
|
||||
return destination.endsWith("/") ? destination.slice(0, -1) : destination;
|
||||
} catch {
|
||||
if (typeof url !== "string") return "";
|
||||
return url.trim();
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
validURL,
|
||||
validateURL,
|
||||
};
|
||||
|
||||
@ -45,6 +45,12 @@ const webScraping = {
|
||||
if (url) return await this.scrape(url);
|
||||
return "There is nothing we can do. This function call returns no information.";
|
||||
} catch (error) {
|
||||
this.super.handlerProps.log(
|
||||
`Web Scraping Error: ${error.message}`
|
||||
);
|
||||
this.super.introspect(
|
||||
`${this.caller}: Web Scraping Error: ${error.message}`
|
||||
);
|
||||
return `There was an error while calling the function. No data or response was found. Let the user know this was the error: ${error.message}`;
|
||||
}
|
||||
},
|
||||
@ -78,15 +84,21 @@ const webScraping = {
|
||||
}
|
||||
|
||||
const { TokenManager } = require("../../../helpers/tiktoken");
|
||||
const tokenEstimate = new TokenManager(
|
||||
this.super.model
|
||||
).countFromString(content);
|
||||
if (
|
||||
new TokenManager(this.super.model).countFromString(content) <
|
||||
tokenEstimate <
|
||||
Provider.contextLimit(this.super.provider, this.super.model)
|
||||
) {
|
||||
this.super.introspect(
|
||||
`${this.caller}: Looking over the content of the page. ~${tokenEstimate} tokens.`
|
||||
);
|
||||
return content;
|
||||
}
|
||||
|
||||
this.super.introspect(
|
||||
`${this.caller}: This page's content is way too long. I will summarize it right now.`
|
||||
`${this.caller}: This page's content exceeds the model's context limit. Summarizing it right now.`
|
||||
);
|
||||
this.super.onAbort(() => {
|
||||
this.super.handlerProps.log(
|
||||
|
||||
Loading…
Reference in New Issue
Block a user