Enable bypass of ip limitations via ENV in collector processing (#3652)

* Enable bypass of ip limitations via ENV in collector startup
resolves #3625
connect #3626

* dev build

* bump dockerx build action

* enable runtime setting config of collector requests

* comments and linting for option passing

* unset

* unset

* update docs link

* linting and docs
This commit is contained in:
Timothy Carambat 2025-04-21 11:10:41 -07:00 committed by GitHub
parent fd4929b4d2
commit 1601eb986c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 173 additions and 6 deletions

View File

@ -6,7 +6,7 @@ concurrency:
on:
push:
branches: ['na'] # put your current branch to create a build. Core team only.
branches: ['3625-bypass-ip-check'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'

View File

@ -1,9 +1,12 @@
const { CommunicationKey } = require("../utils/comKey");
const RuntimeSettings = require("../utils/runtimeSettings");
const runtimeSettings = new RuntimeSettings();
function verifyPayloadIntegrity(request, response, next) {
const comKey = new CommunicationKey();
if (process.env.NODE_ENV === "development") {
comKey.log('verifyPayloadIntegrity is skipped in development.')
comKey.log('verifyPayloadIntegrity is skipped in development.');
runtimeSettings.parseOptionsFromRequest(request);
next();
return;
}
@ -12,7 +15,9 @@ function verifyPayloadIntegrity(request, response, next) {
if (!signature) return response.status(400).json({ msg: 'Failed integrity signature check.' })
const validSignedPayload = comKey.verify(signature, request.body);
if (!validSignedPayload) return response.status(400).json({ msg: 'Failed integrity signature check.' })
if (!validSignedPayload) return response.status(400).json({ msg: 'Failed integrity signature check.' });
runtimeSettings.parseOptionsFromRequest(request);
next();
}

View File

@ -0,0 +1,83 @@
const { reqBody } = require("../http");
/**
* Runtime settings are used to configure the collector per-request.
* These settings are persisted across requests, but can be overridden per-request.
*
* The settings are passed in the request body via `options.runtimeSettings`
* which is set in the backend #attachOptions function in CollectorApi.
*
* We do this so that the collector and backend can share the same ENV variables
* but only pass the relevant settings to the collector per-request and be able to
* access them across the collector via a single instance of RuntimeSettings.
*
* TODO: We may want to set all options passed from backend to collector here,
* but for now - we are only setting the runtime settings specifically for backwards
* compatibility with existing CollectorApi usage.
*/
class RuntimeSettings {
  /** The single shared instance returned by every `new RuntimeSettings()` call. */
  static _instance = null;

  /** Values that have been explicitly set, keyed by setting name. */
  settings = {};

  // Any settings here will be persisted across requests
  // and must be explicitly defined here.
  settingConfigs = {
    allowAnyIp: {
      default: false,
      // The stored value is `true` only when the incoming value stringifies to
      // exactly "true"; every other value (including "false", null, "1") stores `false`.
      validate: (value) => String(value) === "true",
    },
  };

  /**
   * Returns the already-created instance when one exists so that every caller
   * shares the same settings object; otherwise registers and returns this one.
   */
  constructor() {
    if (RuntimeSettings._instance) return RuntimeSettings._instance;
    RuntimeSettings._instance = this;
    return this;
  }

  /**
   * Parse the runtime settings from the `options.runtimeSettings` object of the request body
   * see #attachOptions https://github.com/Mintplex-Labs/anything-llm/blob/ebf112007e0d579af3d2b43569db95bdfc59074b/server/utils/collectorApi/index.js#L18
   * - Keys that are not defined in `settingConfigs` are silently ignored
   * - Settings that are absent from the request keep their previously stored value
   * @param {import('express').Request} request
   * @returns {void}
   */
  parseOptionsFromRequest(request = {}) {
    const options = reqBody(request)?.options?.runtimeSettings || {};
    for (const [key, value] of Object.entries(options)) {
      if (!Object.prototype.hasOwnProperty.call(this.settingConfigs, key))
        continue;
      this.set(key, value);
    }
  }

  /**
   * Get a runtime setting
   * - Will throw an error if the setting requested is not a supported runtime setting key
   * - Will return the default value if the setting requested is not set at all
   * @param {string} key
   * @returns {any}
   * @throws {Error} When `key` is not defined in `settingConfigs`
   */
  get(key) {
    // Own-property check: a plain truthiness lookup would accept inherited
    // names such as "constructor" or "toString" as if they were settings.
    if (!Object.prototype.hasOwnProperty.call(this.settingConfigs, key))
      throw new Error(`Invalid runtime setting: ${key}`);
    return Object.prototype.hasOwnProperty.call(this.settings, key)
      ? this.settings[key]
      : this.settingConfigs[key].default;
  }

  /**
   * Set a runtime setting
   * - Will throw an error if the setting requested is not a supported runtime setting key
   * - Will store the result of the setting's validate function, not the raw value
   * @param {string} key
   * @param {any} value
   * @returns {void}
   * @throws {Error} When `key` is not defined in `settingConfigs`
   */
  set(key, value = null) {
    // Same own-property guard as `get` so inherited names are rejected with
    // the documented error instead of failing on a missing `validate`.
    if (!Object.prototype.hasOwnProperty.call(this.settingConfigs, key))
      throw new Error(`Invalid runtime setting: ${key}`);
    this.settings[key] = this.settingConfigs[key].validate(value);
  }
}
module.exports = RuntimeSettings;

View File

@ -1,3 +1,4 @@
const RuntimeSettings = require("../runtimeSettings");
/** ATTN: SECURITY RESEARCHERS
* To Security researchers about to submit an SSRF report CVE - please don't.
* We are aware that the code below does not defend against any of the thousands of ways
@ -13,15 +14,24 @@
const VALID_PROTOCOLS = ["https:", "http:"];
const INVALID_OCTETS = [192, 172, 10, 127];
const runtimeSettings = new RuntimeSettings();
/**
* If an IP address is passed in, the user is attempting to collect from some internal service running on an internal/private IP.
* This is not a security feature and simply just prevents the user from accidentally entering invalid IP addresses.
* Can be bypassed via COLLECTOR_ALLOW_ANY_IP environment variable.
* @param {URL} param0
* @param {URL['hostname']} param0.hostname
* @returns {boolean}
*/
function isInvalidIp({ hostname }) {
if (runtimeSettings.get("allowAnyIp")) {
console.log(
"\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m"
);
return false;
}
const IPRegex = new RegExp(
/^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$/gi
);
@ -40,6 +50,14 @@ function isInvalidIp({ hostname }) {
return INVALID_OCTETS.includes(Number(octetOne));
}
/**
* Validates a URL
* - Checks the URL forms a valid URL
* - Checks the URL is at least HTTP(S)
* - Checks the URL is not an internal IP - can be bypassed via COLLECTOR_ALLOW_ANY_IP
* @param {string} url
* @returns {boolean}
*/
function validURL(url) {
try {
const destination = new URL(url);

View File

@ -322,6 +322,10 @@ GID='1000'
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
# SIMPLE_SSO_ENABLED=1
# Allow scraping of any IP address in collector - must be string "true" to be enabled
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
# COLLECTOR_ALLOW_ANY_IP="true"
# Specify the target languages for when using OCR to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.

View File

@ -311,6 +311,10 @@ TTS_PROVIDER="native"
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
# SIMPLE_SSO_ENABLED=1
# Allow scraping of any IP address in collector - must be string "true" to be enabled
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
# COLLECTOR_ALLOW_ANY_IP="true"
# Specify the target languages for when using OCR to parse images and PDFs.
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.

View File

@ -1,5 +1,14 @@
const { EncryptionManager } = require("../EncryptionManager");
/**
* @typedef {Object} CollectorOptions
* @property {string} whisperProvider - The provider to use for whisper, defaults to "local"
* @property {string} WhisperModelPref - The model to use for whisper if set.
* @property {string} openAiKey - The API key to use for OpenAI interfacing, mostly passed to OAI Whisper provider.
* @property {Object} ocr - The OCR options
* @property {{allowAnyIp: string}} runtimeSettings - The runtime settings that are passed to the collector (allowAnyIp is the raw COLLECTOR_ALLOW_ANY_IP value, or "false" when unset). Persisted across requests.
*/
// When running locally will occupy the 0.0.0.0 hostname space but when deployed inside
// of docker this endpoint is not exposed so it is only on the Docker instances internal network
// so no additional security is needed on the endpoint directly. Auth is done however by the express
@ -15,6 +24,10 @@ class CollectorApi {
console.log(`\x1b[36m[CollectorApi]\x1b[0m ${text}`, ...args);
}
/**
* Attach options to the request passed to the collector API
* @returns {CollectorOptions}
*/
#attachOptions() {
return {
whisperProvider: process.env.WHISPER_PROVIDER || "local",
@ -23,6 +36,9 @@ class CollectorApi {
ocr: {
langList: process.env.TARGET_OCR_LANG || "eng",
},
runtimeSettings: {
allowAnyIp: process.env.COLLECTOR_ALLOW_ANY_IP ?? "false",
},
};
}
@ -45,6 +61,12 @@ class CollectorApi {
});
}
/**
* Process a document
* - Will append the options to the request body
* @param {string} filename - The filename of the document to process
* @returns {Promise<Object>} - The response from the collector API
*/
async processDocument(filename = "") {
if (!filename) return false;
@ -75,10 +97,16 @@ class CollectorApi {
});
}
/**
* Process a link
* - Will append the options to the request body
* @param {string} link - The link to process
* @returns {Promise<Object>} - The response from the collector API
*/
async processLink(link = "") {
if (!link) return false;
const data = JSON.stringify({ link });
const data = JSON.stringify({ link, options: this.#attachOptions() });
return await fetch(`${this.endpoint}/process-link`, {
method: "POST",
headers: {
@ -101,8 +129,19 @@ class CollectorApi {
});
}
/**
* Process raw text as a document for the collector
* - Will append the options to the request body
* @param {string} textContent - The text to process
* @param {Object} metadata - The metadata to process
* @returns {Promise<Object>} - The response from the collector API
*/
async processRawText(textContent = "", metadata = {}) {
const data = JSON.stringify({ textContent, metadata });
const data = JSON.stringify({
textContent,
metadata,
options: this.#attachOptions(),
});
return await fetch(`${this.endpoint}/process-raw-text`, {
method: "POST",
headers: {
@ -151,10 +190,21 @@ class CollectorApi {
});
}
/**
* Get the content of a link only in a specific format
* - Will append the options to the request body
* @param {string} link - The link to get the content of
* @param {"text"|"html"} captureAs - The format to capture the content as
* @returns {Promise<Object>} - The response from the collector API
*/
async getLinkContent(link = "", captureAs = "text") {
if (!link) return false;
const data = JSON.stringify({ link, captureAs });
const data = JSON.stringify({
link,
captureAs,
options: this.#attachOptions(),
});
return await fetch(`${this.endpoint}/util/get-link`, {
method: "POST",
headers: {

View File

@ -958,6 +958,9 @@ function dumpENV() {
// OCR Language Support
"TARGET_OCR_LANG",
// Collector API common ENV - allows bypassing URL validation checks
"COLLECTOR_ALLOW_ANY_IP",
];
// Simple sanitization of each value to prevent ENV injection via newline or quote escaping.