merlyn/collector/utils/extensions/Confluence/ConfluenceLoader/index.js
Neha Prasad 3ecf218eea
feat: Add SSL certificate bypass support for self-hosted Confluence instances (#4219)
* Added bypassSSL parameter to constructor and implemented SSL bypass logic in fetchConfluenceData method

* Updated generateChunkSource function to include bypassSSL in the encrypted payload

* Updated the request body to include bypassSSL in the JSON payload sent to the backend

* Updated form submission to include bypassSSL parameter from the checkbox

* Added bypass_ssl: "Bypass SSL Certificate Validation" translation

* passed these parameters to fetchconfluencepage function for proper resync functionality

* allow ignore of SSL cert for Confluence

* add translations

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2025-11-25 14:32:10 -08:00

158 lines
4.7 KiB
JavaScript

/*
 * This is a custom implementation of the Confluence LangChain loader. There was an issue where
 * code blocks were not being extracted. This is a temporary fix until that issue is resolved.
 */
const { htmlToText } = require("html-to-text");
/**
 * Loads the pages of a Confluence space through the REST content API and converts each page
 * into a `{ pageContent, metadata }` document. Confluence "code" macros are rewritten into
 * fenced code blocks before the storage-format HTML is converted to plain text.
 */
class ConfluencePagesLoader {
  /**
   * @param {Object} config
   * @param {string} config.baseUrl - Base URL of the Confluence instance (no trailing "/wiki").
   * @param {string} config.spaceKey - Key of the space whose pages are loaded.
   * @param {string} [config.username] - Username for Basic auth (used together with `accessToken`).
   * @param {string} [config.accessToken] - Token/password for Basic auth.
   * @param {number} [config.limit=25] - Page size requested from the API per call.
   * @param {string} [config.expand="body.storage,version"] - Value of the `expand` query parameter.
   * @param {string} [config.personalAccessToken] - Bearer token; takes precedence over Basic auth.
   * @param {boolean} [config.cloud=true] - When true the REST path is prefixed with "/wiki".
   * @param {boolean} [config.bypassSSL=false] - When true TLS certificate validation is disabled
   *   for the duration of each request made by this loader.
   */
  constructor({
    baseUrl,
    spaceKey,
    username,
    accessToken,
    limit = 25,
    expand = "body.storage,version",
    personalAccessToken,
    cloud = true,
    bypassSSL = false,
  }) {
    this.baseUrl = baseUrl;
    this.spaceKey = spaceKey;
    this.username = username;
    this.accessToken = accessToken;
    this.limit = limit;
    this.expand = expand;
    this.personalAccessToken = personalAccessToken;
    this.cloud = cloud;
    this.bypassSSL = bypassSSL;
    this.log("Initialized Confluence Loader");
    if (this.bypassSSL)
      this.log("!!SSL bypass is enabled!! Use at your own risk!!");
  }

  /**
   * Marks one more in-flight request as needing TLS verification disabled.
   * The value `NODE_TLS_REJECT_UNAUTHORIZED` had before the first concurrent bypass is
   * remembered so it can be put back once the last bypassed request has finished.
   */
  static enableTlsBypass() {
    const state =
      ConfluencePagesLoader.tlsBypassState ||
      (ConfluencePagesLoader.tlsBypassState = {
        activeRequests: 0,
        previousValue: undefined,
      });
    if (state.activeRequests === 0) {
      state.previousValue = process.env.NODE_TLS_REJECT_UNAUTHORIZED;
    }
    state.activeRequests += 1;
    process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0";
  }

  /**
   * Releases one bypass acquired with `enableTlsBypass`. When no bypassed request remains,
   * the environment variable is restored to exactly what it was before (including "unset").
   */
  static disableTlsBypass() {
    const state = ConfluencePagesLoader.tlsBypassState;
    if (!state || state.activeRequests === 0) return;
    state.activeRequests -= 1;
    if (state.activeRequests > 0) return;

    if (state.previousValue === undefined) {
      // Assigning `undefined` would store the string "undefined"; the key must be removed.
      delete process.env.NODE_TLS_REJECT_UNAUTHORIZED;
    } else {
      process.env.NODE_TLS_REJECT_UNAUTHORIZED = state.previousValue;
    }
    state.previousValue = undefined;
  }

  /**
   * Writes a message to stdout prefixed with a cyan "[Confluence Loader]" tag.
   * @param {string} message
   * @param {...*} args - Extra values forwarded to `console.log`.
   */
  log(message, ...args) {
    console.log(`\x1b[36m[Confluence Loader]\x1b[0m ${message}`, ...args);
  }

  /**
   * Value for the `Authorization` header.
   * @returns {string|undefined} `Bearer …` when a personal access token is set, `Basic …` when
   *   both username and access token are set, otherwise `undefined`.
   */
  get authorizationHeader() {
    if (this.personalAccessToken) {
      return `Bearer ${this.personalAccessToken}`;
    }
    if (this.username && this.accessToken) {
      const authToken = Buffer.from(
        `${this.username}:${this.accessToken}`
      ).toString("base64");
      return `Basic ${authToken}`;
    }
    return undefined;
  }

  /**
   * Fetches every page of the space and converts each into a document.
   * Best-effort: any failure is logged and an empty array is returned instead of throwing.
   * @param {{start?: number, limit?: number}} [options]
   * @returns {Promise<Array<{pageContent: string, metadata: Object}>>}
   */
  async load(options) {
    try {
      const pages = await this.fetchAllPagesInSpace(
        options?.start,
        options?.limit
      );
      return pages.map((page) => this.createDocumentFromPage(page));
    } catch (error) {
      this.log("Error:", error);
      return [];
    }
  }

  /**
   * Performs an authenticated GET against `url` and returns the parsed JSON body.
   * @param {string} url
   * @returns {Promise<*>} Parsed JSON response.
   * @throws {Error} When the request fails or the response status is not OK. The message is
   *   that of the underlying failure, which is also attached as `cause`.
   */
  async fetchConfluenceData(url) {
    // Captured once so the acquire/release pair stays balanced even if `bypassSSL` is
    // changed on the instance while the request is in flight.
    const bypassing = Boolean(this.bypassSSL);
    // The environment variable is process-wide, so the bypass is reference-counted:
    // overlapping requests must not re-enable verification under each other, and the
    // operator's original setting must be restored afterwards rather than forced to "1".
    if (bypassing) ConfluencePagesLoader.enableTlsBypass();
    try {
      const headers = {
        "Content-Type": "application/json",
        Accept: "application/json",
      };
      const authHeader = this.authorizationHeader;
      if (authHeader) headers.Authorization = authHeader;

      const response = await fetch(url, { headers });
      if (!response.ok) {
        throw new Error(
          `Failed to fetch ${url} from Confluence: ${response.status}`
        );
      }
      return await response.json();
    } catch (error) {
      this.log("Error:", error);
      // Same message as before; `cause` keeps the original error and stack reachable.
      throw new Error(error.message, { cause: error });
    } finally {
      if (bypassing) ConfluencePagesLoader.disableTlsBypass();
    }
  }

  // https://developer.atlassian.com/cloud/confluence/rest/v2/intro/#auth
  /**
   * Collects all pages of the space by requesting consecutive batches until an empty batch
   * is returned.
   * @param {number} [start=0] - Offset of the first page to request.
   * @param {number} [limit=this.limit] - Batch size requested per call.
   * @returns {Promise<Object[]>} Raw page objects as returned in the API's `results` arrays.
   * @throws {Error} Propagates failures from `fetchConfluenceData`.
   */
  async fetchAllPagesInSpace(start = 0, limit = this.limit) {
    const wikiPrefix = this.cloud ? "/wiki" : "";
    const pages = [];
    let offset = start;

    for (;;) {
      const url = `${this.baseUrl}${wikiPrefix}/rest/api/content?spaceKey=${this.spaceKey}&limit=${limit}&start=${offset}&expand=${this.expand}`;
      const data = await this.fetchConfluenceData(url);
      const batch = Array.isArray(data?.results) ? data.results : [];
      // Fall back to the batch length so a response without `size` cannot turn the
      // next offset into NaN.
      const received = Number.isInteger(data?.size) ? data.size : batch.length;
      if (received <= 0) break;

      pages.push(...batch);
      offset += received;
    }
    return pages;
  }

  /**
   * Converts a raw Confluence page into a plain-text document with metadata.
   * @param {Object} page - Page object; `page.body.storage.value` must hold the storage-format HTML.
   * @returns {{pageContent: string, metadata: Object}}
   */
  createDocumentFromPage(page) {
    const codeBlockRegex =
      /<ac:structured-macro ac:name="code"[^>]*>[\s\S]*?<ac:plain-text-body><!\[CDATA\[([\s\S]*?)\]\]><\/ac:plain-text-body>[\s\S]*?<\/ac:structured-macro>/g;
    const languageRegex =
      /<ac:parameter ac:name="language">(.*?)<\/ac:parameter>/;

    // Replace each code macro with a fenced block; the CDATA body is the regex's capture group.
    const contentWithCodeBlocks = page.body.storage.value.replace(
      codeBlockRegex,
      (macro, code) => {
        const language = macro.match(languageRegex)?.[1] || "";
        return `\n\`\`\`${language}\n${code.trim()}\n\`\`\`\n`;
      }
    );

    const plainTextContent = htmlToText(contentWithCodeBlocks, {
      wordwrap: false,
      preserveNewlines: true,
    });
    // Collapse runs of three or more newlines down to a single blank line.
    const textWithPreservedStructure = plainTextContent.replace(
      /\n{3,}/g,
      "\n\n"
    );

    // NOTE(review): unlike the REST URL built in fetchAllPagesInSpace, no "/wiki" segment is
    // added for cloud instances here — confirm what consumers of `metadata.url` expect
    // before changing this form.
    const pageUrl = `${this.baseUrl}/spaces/${this.spaceKey}/pages/${page.id}`;

    return {
      pageContent: textWithPreservedStructure,
      metadata: {
        id: page.id,
        status: page.status,
        title: page.title,
        type: page.type,
        url: pageUrl,
        version: page.version?.number,
        updated_by: page.version?.by?.displayName,
        updated_at: page.version?.when,
      },
    };
  }
}
// Expose the loader class as a named property of the CommonJS module exports.
module.exports = { ConfluencePagesLoader };