diff --git a/collector/__tests__/utils/extensions/Confluence/ConfluenceLoader.test.js b/collector/__tests__/utils/extensions/Confluence/ConfluenceLoader.test.js new file mode 100644 index 00000000..6633f0ce --- /dev/null +++ b/collector/__tests__/utils/extensions/Confluence/ConfluenceLoader.test.js @@ -0,0 +1,125 @@ +/* eslint-env jest, node */ +process.env.STORAGE_DIR = "test-storage"; + +const { resolveConfluenceBaseUrl } = require("../../../../utils/extensions/Confluence"); +const { + ConfluencePagesLoader, +} = require("../../../../utils/extensions/Confluence/ConfluenceLoader"); + +describe("resolveConfluenceBaseUrl", () => { + test("cloud: strips path and returns origin only", () => { + expect( + resolveConfluenceBaseUrl("https://example.atlassian.net/wiki/spaces/SP", true) + ).toBe("https://example.atlassian.net"); + }); + + test("self-hosted: preserves context path, strips trailing slash", () => { + expect( + resolveConfluenceBaseUrl("https://my.domain.com/confluence/", false) + ).toBe("https://my.domain.com/confluence"); + }); + + test("self-hosted: returns origin when no context path", () => { + expect( + resolveConfluenceBaseUrl("https://my.domain.com/", false) + ).toBe("https://my.domain.com"); + }); +}); + +describe("ConfluencePagesLoader", () => { + afterEach(() => { + jest.restoreAllMocks(); + }); + + describe("cloud mode", () => { + test("API requests include /wiki prefix", async () => { + const fetchMock = jest.spyOn(global, "fetch").mockResolvedValue({ + ok: true, + json: jest.fn().mockResolvedValue({ size: 0, results: [] }), + }); + const loader = new ConfluencePagesLoader({ + baseUrl: resolveConfluenceBaseUrl("https://example.atlassian.net/wiki/spaces/SP", true), + spaceKey: "SP", + username: "user", + accessToken: "token", + cloud: true, + }); + + await loader.fetchAllPagesInSpace(); + + expect(fetchMock).toHaveBeenCalledWith( + "https://example.atlassian.net/wiki/rest/api/content?spaceKey=SP&limit=25&start=0&expand=body.storage,version", + expect.any(Object) + ); + }); + + test("page URLs include /wiki prefix", () => { + const loader = new ConfluencePagesLoader({ + baseUrl: resolveConfluenceBaseUrl("https://example.atlassian.net/wiki", true), + spaceKey: "SP", + username: "user", + accessToken: "token", + cloud: true, + }); + + const document = loader.createDocumentFromPage({ + id: "123", + status: "current", + title: "Cloud page", + type: "page", + body: { storage: { value: "

Hello

" } }, + version: { number: 1, by: { displayName: "User" }, when: "2026-01-01T00:00:00.000Z" }, + }); + + expect(document.metadata.url).toBe( + "https://example.atlassian.net/wiki/spaces/SP/pages/123" + ); + }); + }); + + describe("self-hosted mode", () => { + test("API requests use context path without /wiki", async () => { + const fetchMock = jest.spyOn(global, "fetch").mockResolvedValue({ + ok: true, + json: jest.fn().mockResolvedValue({ size: 0, results: [] }), + }); + const loader = new ConfluencePagesLoader({ + baseUrl: resolveConfluenceBaseUrl("https://my.domain.com/confluence/", false), + spaceKey: "SP", + username: "user", + accessToken: "token", + cloud: false, + }); + + await loader.fetchAllPagesInSpace(); + + expect(fetchMock).toHaveBeenCalledWith( + "https://my.domain.com/confluence/rest/api/content?spaceKey=SP&limit=25&start=0&expand=body.storage,version", + expect.any(Object) + ); + }); + + test("page URLs use context path without /wiki", () => { + const loader = new ConfluencePagesLoader({ + baseUrl: resolveConfluenceBaseUrl("https://my.domain.com/confluence/", false), + spaceKey: "SP", + username: "user", + accessToken: "token", + cloud: false, + }); + + const document = loader.createDocumentFromPage({ + id: "123", + status: "current", + title: "Self-hosted page", + type: "page", + body: { storage: { value: "

Hello

" } }, + version: { number: 1, by: { displayName: "User" }, when: "2026-01-01T00:00:00.000Z" }, + }); + + expect(document.metadata.url).toBe( + "https://my.domain.com/confluence/spaces/SP/pages/123" + ); + }); + }); +}); diff --git a/collector/utils/extensions/Confluence/index.js b/collector/utils/extensions/Confluence/index.js index 7db84d7c..b4a0b244 100644 --- a/collector/utils/extensions/Confluence/index.js +++ b/collector/utils/extensions/Confluence/index.js @@ -46,10 +46,11 @@ async function loadConfluence( }; } - const { origin, hostname } = new URL(baseUrl); - console.log(`-- Working Confluence ${origin} --`); + const normalizedBaseUrl = resolveConfluenceBaseUrl(baseUrl, cloud); + const { hostname } = new URL(normalizedBaseUrl); + console.log(`-- Working Confluence ${normalizedBaseUrl} --`); const loader = new ConfluencePagesLoader({ - baseUrl: origin, // Use the origin to avoid issues with subdomains, ports, protocols, etc. + baseUrl: normalizedBaseUrl, spaceKey, username, accessToken, @@ -98,13 +99,13 @@ async function loadConfluence( id: v4(), url: doc.metadata.url + ".page", title: doc.metadata.title || doc.metadata.source, - docAuthor: origin, + docAuthor: normalizedBaseUrl, description: doc.metadata.title, - docSource: `${origin} Confluence`, + docSource: `${normalizedBaseUrl} Confluence`, chunkSource: generateChunkSource( { doc, - baseUrl: origin, + baseUrl: normalizedBaseUrl, spaceKey, accessToken, username, @@ -182,8 +183,9 @@ async function fetchConfluencePage({ } console.log(`-- Working Confluence Page ${pageUrl} --`); + const normalizedBaseUrl = resolveConfluenceBaseUrl(baseUrl, cloud); const loader = new ConfluencePagesLoader({ - baseUrl, // Should be the origin of the baseUrl + baseUrl: normalizedBaseUrl, spaceKey, username, accessToken, @@ -243,6 +245,21 @@ function validBaseUrl(baseUrl) { } } +/** + * Resolves the Confluence base URL, preserving context paths for self-hosted deployments. + * @param {string} baseUrl + * @param {boolean} cloud + * @returns {string} + */ +function resolveConfluenceBaseUrl(baseUrl, cloud = true) { + const url = new URL(baseUrl); + // Cloud URLs use just the origin; self-hosted may have a context path like /confluence + if (cloud) return url.origin; + + const contextPath = url.pathname.replace(/\/+$/, ""); + return `${url.origin}${contextPath}`; +} + /** * Generate the full chunkSource for a specific Confluence page so that we can resync it later. * This data is encrypted into a single `payload` query param so we can replay credentials later @@ -271,4 +288,5 @@ function generateChunkSource( module.exports = { loadConfluence, fetchConfluencePage, + resolveConfluenceBaseUrl, };