fix: GitLab connector infinite loop and rate limit crash for large repos (#5021)

* Fix infinite loop and rate limit crashes
* simplify logic | add max-retries to fetchNextPage and fetchSingleFileContents

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
parent
d325b07182
commit
c927eda18f
@ -1,4 +1,5 @@
|
|||||||
const ignore = require("ignore");
|
const ignore = require("ignore");
|
||||||
|
const MAX_RETRIES = 3;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @typedef {Object} RepoLoaderArgs
|
* @typedef {Object} RepoLoaderArgs
|
||||||
@ -46,6 +47,10 @@ class GitLabRepoLoader {
|
|||||||
this.branches = [];
|
this.branches = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#wait(ms) {
|
||||||
|
return new Promise((resolve) => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
||||||
#validGitlabUrl() {
|
#validGitlabUrl() {
|
||||||
const validPatterns = [
|
const validPatterns = [
|
||||||
/https:\/\/gitlab\.com\/(?<author>[^\/]+)\/(?<project>.*)/,
|
/https:\/\/gitlab\.com\/(?<author>[^\/]+)\/(?<project>.*)/,
|
||||||
@ -322,26 +327,37 @@ ${body}`
|
|||||||
* @param {string} sourceFilePath - The path to the file in the repository.
|
* @param {string} sourceFilePath - The path to the file in the repository.
|
||||||
* @returns {Promise<string|null>} The content of the file, or null if fetching fails.
|
* @returns {Promise<string|null>} The content of the file, or null if fetching fails.
|
||||||
*/
|
*/
|
||||||
async fetchSingleFileContents(sourceFilePath) {
|
async fetchSingleFileContents(sourceFilePath, retries = 0) {
|
||||||
try {
|
try {
|
||||||
const data = await fetch(
|
const url = `${this.apiBase}/api/v4/projects/${
|
||||||
`${this.apiBase}/api/v4/projects/${
|
this.projectId
|
||||||
this.projectId
|
}/repository/files/${encodeURIComponent(sourceFilePath)}/raw?ref=${
|
||||||
}/repository/files/${encodeURIComponent(sourceFilePath)}/raw?ref=${
|
this.branch
|
||||||
this.branch
|
}`;
|
||||||
}`,
|
const response = await fetch(url, {
|
||||||
{
|
method: "GET",
|
||||||
method: "GET",
|
headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
|
||||||
headers: this.accessToken
|
|
||||||
? { "PRIVATE-TOKEN": this.accessToken }
|
|
||||||
: {},
|
|
||||||
}
|
|
||||||
).then((res) => {
|
|
||||||
if (res.ok) return res.text();
|
|
||||||
throw new Error(`Failed to fetch single file ${sourceFilePath}`);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
return data;
|
if (response.status === 429) {
|
||||||
|
if (retries >= MAX_RETRIES) {
|
||||||
|
console.warn(
|
||||||
|
`[Gitlab Loader]: Rate limit persists for ${sourceFilePath} after ${retries} retries. Skipping.`
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
const retryAfter = Number(response.headers.get("retry-after")) || 60;
|
||||||
|
console.warn(
|
||||||
|
`[Gitlab Loader]: Rate limit hit fetching ${sourceFilePath}. Waiting ${retryAfter}s...`
|
||||||
|
);
|
||||||
|
await this.#wait(retryAfter * 1000);
|
||||||
|
return this.fetchSingleFileContents(sourceFilePath, retries + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!response.ok)
|
||||||
|
throw new Error(`Failed to fetch single file ${sourceFilePath}`);
|
||||||
|
|
||||||
|
return await response.text();
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error(`RepoLoader.fetchSingleFileContents`, e);
|
console.error(`RepoLoader.fetchSingleFileContents`, e);
|
||||||
return null;
|
return null;
|
||||||
@ -353,7 +369,7 @@ ${body}`
|
|||||||
* @param {Object} requestData - The request data.
|
* @param {Object} requestData - The request data.
|
||||||
* @returns {Promise<Array<Object>|null>} The next page of data, or null if no more pages.
|
* @returns {Promise<Array<Object>|null>} The next page of data, or null if no more pages.
|
||||||
*/
|
*/
|
||||||
async fetchNextPage(requestData) {
|
async fetchNextPage(requestData, retries = 0) {
|
||||||
try {
|
try {
|
||||||
if (requestData.page === -1) return null;
|
if (requestData.page === -1) return null;
|
||||||
if (!requestData.page) requestData.page = 1;
|
if (!requestData.page) requestData.page = 1;
|
||||||
@ -371,28 +387,52 @@ ${body}`
|
|||||||
headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
|
headers: this.accessToken ? { "PRIVATE-TOKEN": this.accessToken } : {},
|
||||||
});
|
});
|
||||||
|
|
||||||
// Rate limits get hit very often if no PAT is provided
|
if (response.status === 429) {
|
||||||
|
if (retries >= MAX_RETRIES) {
|
||||||
|
console.warn(
|
||||||
|
`[Gitlab Loader]: Rate limit persists for ${endpoint} after ${retries} retries. Skipping.`
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
const retryAfter = Number(response.headers.get("retry-after")) || 60;
|
||||||
|
console.warn(
|
||||||
|
`[Gitlab Loader]: Rate limit hit for ${endpoint}. Waiting ${retryAfter}s before retrying...`
|
||||||
|
);
|
||||||
|
await this.#wait(retryAfter * 1000);
|
||||||
|
return this.fetchNextPage(requestData, retries + 1);
|
||||||
|
}
|
||||||
|
|
||||||
if (response.status === 401) {
|
if (response.status === 401) {
|
||||||
console.warn(`Rate limit hit for ${endpoint}. Skipping.`);
|
console.warn(
|
||||||
|
`[Gitlab Loader]: Unauthorized request for ${endpoint}. Skipping.`
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!response.ok) {
|
||||||
|
console.warn(
|
||||||
|
`[Gitlab Loader]: Unexpected status ${response.status} for ${endpoint}. Skipping.`
|
||||||
|
);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const totalPages = Number(response.headers.get("x-total-pages"));
|
|
||||||
const data = await response.json();
|
const data = await response.json();
|
||||||
if (!Array.isArray(data)) {
|
if (!Array.isArray(data)) {
|
||||||
console.warn(`Unexpected response format for ${endpoint}:`, data);
|
console.warn(`Unexpected response format for ${endpoint}:`, data);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GitLab omits x-total-pages for large repos, so use x-next-page
|
||||||
|
// as the sole pagination signal — it's empty on the last page.
|
||||||
|
const nextPage = response.headers.get("x-next-page");
|
||||||
|
const totalPages = response.headers.get("x-total-pages");
|
||||||
console.log(
|
console.log(
|
||||||
`Gitlab RepoLoader: fetched ${endpoint} page ${requestData.page}/${totalPages} with ${data.length} records.`
|
`Gitlab RepoLoader: fetched ${endpoint} page ${requestData.page}${
|
||||||
|
totalPages ? `/${totalPages}` : ""
|
||||||
|
} with ${data.length} records.`
|
||||||
);
|
);
|
||||||
|
|
||||||
if (totalPages === requestData.page) {
|
requestData.page = nextPage?.trim() ? Number(nextPage) : -1;
|
||||||
requestData.page = -1;
|
|
||||||
} else {
|
|
||||||
requestData.page = Number(response.headers.get("x-next-page"));
|
|
||||||
}
|
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user