Enable bypass of ip limitations via ENV in collector processing (#3652)
* Enable bypass of ip limitations via ENV in collector startup resolves #3625 connect #3626 * dev build * bump dockerx build action * enable runtime setting config of collector requests * comments and linting for option passing * unset * unset * update docs link * linting and docs
This commit is contained in:
parent
fd4929b4d2
commit
1601eb986c
2
.github/workflows/dev-build.yaml
vendored
2
.github/workflows/dev-build.yaml
vendored
@ -6,7 +6,7 @@ concurrency:
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ['na'] # put your current branch to create a build. Core team only.
|
||||
branches: ['3625-bypass-ip-check'] # put your current branch to create a build. Core team only.
|
||||
paths-ignore:
|
||||
- '**.md'
|
||||
- 'cloud-deployments/*'
|
||||
|
||||
@ -1,9 +1,12 @@
|
||||
const { CommunicationKey } = require("../utils/comKey");
|
||||
const RuntimeSettings = require("../utils/runtimeSettings");
|
||||
const runtimeSettings = new RuntimeSettings();
|
||||
|
||||
function verifyPayloadIntegrity(request, response, next) {
|
||||
const comKey = new CommunicationKey();
|
||||
if (process.env.NODE_ENV === "development") {
|
||||
comKey.log('verifyPayloadIntegrity is skipped in development.')
|
||||
comKey.log('verifyPayloadIntegrity is skipped in development.');
|
||||
runtimeSettings.parseOptionsFromRequest(request);
|
||||
next();
|
||||
return;
|
||||
}
|
||||
@ -12,7 +15,9 @@ function verifyPayloadIntegrity(request, response, next) {
|
||||
if (!signature) return response.status(400).json({ msg: 'Failed integrity signature check.' })
|
||||
|
||||
const validSignedPayload = comKey.verify(signature, request.body);
|
||||
if (!validSignedPayload) return response.status(400).json({ msg: 'Failed integrity signature check.' })
|
||||
if (!validSignedPayload) return response.status(400).json({ msg: 'Failed integrity signature check.' });
|
||||
|
||||
runtimeSettings.parseOptionsFromRequest(request);
|
||||
next();
|
||||
}
|
||||
|
||||
|
||||
83
collector/utils/runtimeSettings/index.js
Normal file
83
collector/utils/runtimeSettings/index.js
Normal file
@ -0,0 +1,83 @@
|
||||
const { reqBody } = require("../http");
|
||||
|
||||
/**
|
||||
* Runtime settings are used to configure the collector per-request.
|
||||
* These settings are persisted across requests, but can be overridden per-request.
|
||||
*
|
||||
* The settings are passed in the request body via `options.runtimeSettings`
|
||||
* which is set in the backend #attachOptions function in CollectorApi.
|
||||
*
|
||||
* We do this so that the collector and backend can share the same ENV variables
|
||||
* but only pass the relevant settings to the collector per-request and be able to
|
||||
* access them across the collector via a single instance of RuntimeSettings.
|
||||
*
|
||||
* TODO: We may want to set all options passed from backend to collector here,
|
||||
* but for now - we are only setting the runtime settings specifically for backwards
|
||||
* compatibility with existing CollectorApi usage.
|
||||
*/
|
||||
class RuntimeSettings {
  /** The shared instance; once created, every `new RuntimeSettings()` returns it. */
  static _instance = null;

  /** Values stored via `set`, keyed by setting name. */
  settings = {};

  // Any settings here will be persisted across requests
  // and must be explicitly defined here.
  settingConfigs = {
    allowAnyIp: {
      default: false,
      // Stored as `true` only when the value stringifies to exactly "true";
      // anything else ("false", null, undefined, ...) is stored as `false`.
      validate: (value) => String(value) === "true",
    },
  };

  constructor() {
    // Singleton: hand back the existing instance instead of creating a second one.
    if (RuntimeSettings._instance) return RuntimeSettings._instance;
    RuntimeSettings._instance = this;
    return this;
  }

  /**
   * Parse the runtime settings from `options.runtimeSettings` in the request body.
   * Keys that are not declared in `settingConfigs` are ignored.
   * see #attachOptions https://github.com/Mintplex-Labs/anything-llm/blob/ebf112007e0d579af3d2b43569db95bdfc59074b/server/utils/collectorApi/index.js#L18
   * @param {import('express').Request} request
   * @returns {void}
   */
  parseOptionsFromRequest(request = {}) {
    const options = reqBody(request)?.options?.runtimeSettings || {};
    for (const [key, value] of Object.entries(options)) {
      if (!Object.prototype.hasOwnProperty.call(this.settingConfigs, key))
        continue;
      this.set(key, value);
    }
  }

  /**
   * Get a runtime setting
   * - Will throw an error if the setting requested is not a supported runtime setting key
   * - Will return the default value if the setting requested is not set at all
   * @param {string} key
   * @returns {any}
   * @throws {Error} when `key` is not declared in `settingConfigs`
   */
  get(key) {
    // Own-property check so inherited names ("toString", "constructor", ...)
    // are rejected like any other unsupported key.
    if (!Object.prototype.hasOwnProperty.call(this.settingConfigs, key))
      throw new Error(`Invalid runtime setting: ${key}`);
    return Object.prototype.hasOwnProperty.call(this.settings, key)
      ? this.settings[key]
      : this.settingConfigs[key].default;
  }

  /**
   * Set a runtime setting
   * - Will throw an error if the setting requested is not a supported runtime setting key
   * - Stores the result of the setting's validate function, not the raw value
   * @param {string} key
   * @param {any} value
   * @returns {void}
   * @throws {Error} when `key` is not declared in `settingConfigs`
   */
  set(key, value = null) {
    // Own-property check so inherited names never reach `.validate`.
    if (!Object.prototype.hasOwnProperty.call(this.settingConfigs, key))
      throw new Error(`Invalid runtime setting: ${key}`);
    this.settings[key] = this.settingConfigs[key].validate(value);
  }
}
|
||||
|
||||
module.exports = RuntimeSettings;
|
||||
@ -1,3 +1,4 @@
|
||||
const RuntimeSettings = require("../runtimeSettings");
|
||||
/** ATTN: SECURITY RESEARCHERS
|
||||
* To Security researchers about to submit an SSRF report CVE - please don't.
|
||||
* We are aware that the code below does not defend against any of the thousands of ways
|
||||
@ -13,15 +14,24 @@
|
||||
|
||||
const VALID_PROTOCOLS = ["https:", "http:"];
|
||||
const INVALID_OCTETS = [192, 172, 10, 127];
|
||||
const runtimeSettings = new RuntimeSettings();
|
||||
|
||||
/**
|
||||
* If an IP address is passed in, the user is attempting to collect from some internal service running on an internal/private IP.
|
||||
* This is not a security feature and simply just prevents the user from accidentally entering invalid IP addresses.
|
||||
* Can be bypassed via COLLECTOR_ALLOW_ANY_IP environment variable.
|
||||
* @param {URL} param0
|
||||
* @param {URL['hostname']} param0.hostname
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function isInvalidIp({ hostname }) {
|
||||
if (runtimeSettings.get("allowAnyIp")) {
|
||||
console.log(
|
||||
"\x1b[33mURL IP local address restrictions have been disabled by administrator!\x1b[0m"
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
const IPRegex = new RegExp(
|
||||
/^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$/gi
|
||||
);
|
||||
@ -40,6 +50,14 @@ function isInvalidIp({ hostname }) {
|
||||
return INVALID_OCTETS.includes(Number(octetOne));
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates a URL
|
||||
* - Checks the URL forms a valid URL
|
||||
* - Checks the URL is at least HTTP(S)
|
||||
* - Checks the URL is not an internal IP - can be bypassed via COLLECTOR_ALLOW_ANY_IP
|
||||
* @param {string} url
|
||||
* @returns {boolean}
|
||||
*/
|
||||
function validURL(url) {
|
||||
try {
|
||||
const destination = new URL(url);
|
||||
|
||||
@ -322,6 +322,10 @@ GID='1000'
|
||||
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
|
||||
# SIMPLE_SSO_ENABLED=1
|
||||
|
||||
# Allow scraping of any IP address in collector - must be string "true" to be enabled
|
||||
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
|
||||
# COLLECTOR_ALLOW_ANY_IP="true"
|
||||
|
||||
# Specify the target languages for when using OCR to parse images and PDFs.
|
||||
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
|
||||
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
|
||||
|
||||
@ -311,6 +311,10 @@ TTS_PROVIDER="native"
|
||||
# See https://docs.anythingllm.com/configuration#simple-sso-passthrough for more information.
|
||||
# SIMPLE_SSO_ENABLED=1
|
||||
|
||||
# Allow scraping of any IP address in collector - must be string "true" to be enabled
|
||||
# See https://docs.anythingllm.com/configuration#local-ip-address-scraping for more information.
|
||||
# COLLECTOR_ALLOW_ANY_IP="true"
|
||||
|
||||
# Specify the target languages for when using OCR to parse images and PDFs.
|
||||
# This is a comma separated list of language codes as a string. Unsupported languages will be ignored.
|
||||
# Default is English. See https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html for a list of valid language codes.
|
||||
|
||||
@ -1,5 +1,14 @@
|
||||
const { EncryptionManager } = require("../EncryptionManager");
|
||||
|
||||
/**
|
||||
* @typedef {Object} CollectorOptions
|
||||
* @property {string} whisperProvider - The provider to use for whisper, defaults to "local"
|
||||
* @property {string} WhisperModelPref - The model to use for whisper if set.
|
||||
* @property {string} openAiKey - The API key to use for OpenAI interfacing, mostly passed to OAI Whisper provider.
|
||||
* @property {Object} ocr - The OCR options
|
||||
* @property {{allowAnyIp: string}} runtimeSettings - The runtime settings that are passed to the collector. `allowAnyIp` is the raw COLLECTOR_ALLOW_ANY_IP value, or "false" when it is unset; only "true" enables it. Persisted across requests.
|
||||
*/
|
||||
|
||||
// When running locally will occupy the 0.0.0.0 hostname space but when deployed inside
|
||||
// of docker this endpoint is not exposed so it is only on the Docker instances internal network
|
||||
// so no additional security is needed on the endpoint directly. Auth is done however by the express
|
||||
@ -15,6 +24,10 @@ class CollectorApi {
|
||||
console.log(`\x1b[36m[CollectorApi]\x1b[0m ${text}`, ...args);
|
||||
}
|
||||
|
||||
/**
|
||||
* Attach options to the request passed to the collector API
|
||||
* @returns {CollectorOptions}
|
||||
*/
|
||||
#attachOptions() {
|
||||
return {
|
||||
whisperProvider: process.env.WHISPER_PROVIDER || "local",
|
||||
@ -23,6 +36,9 @@ class CollectorApi {
|
||||
ocr: {
|
||||
langList: process.env.TARGET_OCR_LANG || "eng",
|
||||
},
|
||||
runtimeSettings: {
|
||||
allowAnyIp: process.env.COLLECTOR_ALLOW_ANY_IP ?? "false",
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
@ -45,6 +61,12 @@ class CollectorApi {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a document
|
||||
* - Will append the options to the request body
|
||||
* @param {string} filename - The filename of the document to process
|
||||
* @returns {Promise<Object>} - The response from the collector API
|
||||
*/
|
||||
async processDocument(filename = "") {
|
||||
if (!filename) return false;
|
||||
|
||||
@ -75,10 +97,16 @@ class CollectorApi {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Process a link
|
||||
* - Will append the options to the request body
|
||||
* @param {string} link - The link to process
|
||||
* @returns {Promise<Object>} - The response from the collector API
|
||||
*/
|
||||
async processLink(link = "") {
|
||||
if (!link) return false;
|
||||
|
||||
const data = JSON.stringify({ link });
|
||||
const data = JSON.stringify({ link, options: this.#attachOptions() });
|
||||
return await fetch(`${this.endpoint}/process-link`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
@ -101,8 +129,19 @@ class CollectorApi {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Process raw text as a document for the collector
|
||||
* - Will append the options to the request body
|
||||
* @param {string} textContent - The text to process
|
||||
* @param {Object} metadata - The metadata to process
|
||||
* @returns {Promise<Object>} - The response from the collector API
|
||||
*/
|
||||
async processRawText(textContent = "", metadata = {}) {
|
||||
const data = JSON.stringify({ textContent, metadata });
|
||||
const data = JSON.stringify({
|
||||
textContent,
|
||||
metadata,
|
||||
options: this.#attachOptions(),
|
||||
});
|
||||
return await fetch(`${this.endpoint}/process-raw-text`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
@ -151,10 +190,21 @@ class CollectorApi {
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the content of a link only in a specific format
|
||||
* - Will append the options to the request body
|
||||
* @param {string} link - The link to get the content of
|
||||
* @param {"text"|"html"} captureAs - The format to capture the content as
|
||||
* @returns {Promise<Object>} - The response from the collector API
|
||||
*/
|
||||
async getLinkContent(link = "", captureAs = "text") {
|
||||
if (!link) return false;
|
||||
|
||||
const data = JSON.stringify({ link, captureAs });
|
||||
const data = JSON.stringify({
|
||||
link,
|
||||
captureAs,
|
||||
options: this.#attachOptions(),
|
||||
});
|
||||
return await fetch(`${this.endpoint}/util/get-link`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
|
||||
@ -958,6 +958,9 @@ function dumpENV() {
|
||||
|
||||
// OCR Language Support
|
||||
"TARGET_OCR_LANG",
|
||||
|
||||
// Collector API common ENV - allows bypassing URL validation checks
|
||||
"COLLECTOR_ALLOW_ANY_IP",
|
||||
];
|
||||
|
||||
// Simple sanitization of each value to prevent ENV injection via newline or quote escaping.
|
||||
|
||||
Loading…
Reference in New Issue
Block a user