fix: GitLab connector infinite loop and rate limit crash for large repos (#5021)

* Fix infinite loop and rate limit crashes

* simplify logic | add max-retries to fetchNextPage and fetchSingleFileContents

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
Marcello Fitton 2026-02-19 12:42:21 -08:00 committed by GitHub
parent d325b07182
commit c927eda18f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,4 +1,5 @@
const ignore = require("ignore");
const MAX_RETRIES = 3;
/**
* @typedef {Object} RepoLoaderArgs
@ -46,6 +47,10 @@ class GitLabRepoLoader {
this.branches = [];
}
#wait(ms) {
return new Promise((resolve) => setTimeout(resolve, ms));
}
#validGitlabUrl() {
const validPatterns = [
/https:\/\/gitlab\.com\/(?<author>[^\/]+)\/(?<project>.*)/,
@ -322,26 +327,37 @@ ${body}`
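* @param {string} sourceFilePath - The path to the file in the repository.
* @param {number} [retries=0] - Number of rate-limit (HTTP 429) retries already attempted; fetching gives up once this reaches MAX_RETRIES.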
* @returns {Promise<string|null>} The content of the file, or null if fetching fails.
*/
async fetchSingleFileContents(sourceFilePath) {
async fetchSingleFileContents(sourceFilePath, retries = 0) {
try {
const data = await fetch(
`${this.apiBase}/api/v4/projects/${
this.projectId
}/repository/files/${encodeURIComponent(sourceFilePath)}/raw?ref=${
this.branch
}`,
{
method: "GET",
headers: this.accessToken
? { "PRIVATE-TOKEN": this.accessToken }
: {},
}
).then((res) => {
if (res.ok) return res.text();
throw new Error(`Failed to fetch single file ${sourceFilePath}`);
const url = `${this.apiBase}/api/v4/projects/${
this.projectId
}/repository/files/${encodeURIComponent(sourceFilePath)}/raw?ref=${
this.branch
}`;
const response = await fetch(url, {
method: "GET",
headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
});
return data;
if (response.status === 429) {
if (retries >= MAX_RETRIES) {
console.warn(
`[Gitlab Loader]: Rate limit persists for ${sourceFilePath} after ${retries} retries. Skipping.`
);
return null;
}
const retryAfter = Number(response.headers.get("retry-after")) || 60;
console.warn(
`[Gitlab Loader]: Rate limit hit fetching ${sourceFilePath}. Waiting ${retryAfter}s...`
);
await this.#wait(retryAfter * 1000);
return this.fetchSingleFileContents(sourceFilePath, retries + 1);
}
if (!response.ok)
throw new Error(`Failed to fetch single file ${sourceFilePath}`);
return await response.text();
} catch (e) {
console.error(`RepoLoader.fetchSingleFileContents`, e);
return null;
@ -353,7 +369,7 @@ ${body}`
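* @param {Object} requestData - The request data.
* @param {number} [retries=0] - Number of rate-limit (HTTP 429) retries already attempted; fetching gives up once this reaches MAX_RETRIES.
* @returns {Promise<Array<Object>|null>} The next page of data (an empty array if the response body is not an array), or null if there are no more pages or the request is skipped (persistent rate limit, unauthorized, or unexpected status).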
*/
async fetchNextPage(requestData) {
async fetchNextPage(requestData, retries = 0) {
try {
if (requestData.page === -1) return null;
if (!requestData.page) requestData.page = 1;
@ -371,28 +387,52 @@ ${body}`
headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
});
// Rate limits get hit very often if no PAT is provided
if (response.status === 429) {
if (retries >= MAX_RETRIES) {
console.warn(
`[Gitlab Loader]: Rate limit persists for ${endpoint} after ${retries} retries. Skipping.`
);
return null;
}
const retryAfter = Number(response.headers.get("retry-after")) || 60;
console.warn(
`[Gitlab Loader]: Rate limit hit for ${endpoint}. Waiting ${retryAfter}s before retrying...`
);
await this.#wait(retryAfter * 1000);
return this.fetchNextPage(requestData, retries + 1);
}
if (response.status === 401) {
console.warn(`Rate limit hit for ${endpoint}. Skipping.`);
console.warn(
`[Gitlab Loader]: Unauthorized request for ${endpoint}. Skipping.`
);
return null;
}
if (!response.ok) {
console.warn(
`[Gitlab Loader]: Unexpected status ${response.status} for ${endpoint}. Skipping.`
);
return null;
}
const totalPages = Number(response.headers.get("x-total-pages"));
const data = await response.json();
if (!Array.isArray(data)) {
console.warn(`Unexpected response format for ${endpoint}:`, data);
return [];
}
// GitLab omits x-total-pages for large repos, so use x-next-page
// as the sole pagination signal — it's empty on the last page.
const nextPage = response.headers.get("x-next-page");
const totalPages = response.headers.get("x-total-pages");
console.log(
`Gitlab RepoLoader: fetched ${endpoint} page ${requestData.page}/${totalPages} with ${data.length} records.`
`Gitlab RepoLoader: fetched ${endpoint} page ${requestData.page}${
totalPages ? `/${totalPages}` : ""
} with ${data.length} records.`
);
if (totalPages === requestData.page) {
requestData.page = -1;
} else {
requestData.page = Number(response.headers.get("x-next-page"));
}
requestData.page = nextPage?.trim() ? Number(nextPage) : -1;
return data;
} catch (e) {