3416 file picker load times (#3833)

* wip

* implement conditional streaming

* no bench
Timothy Carambat 2025-05-15 13:54:14 -07:00 committed by GitHub
parent 9371bcc71e
commit 3bbceb82aa

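The change below implements a conditional read strategy: document JSON files under a size threshold are read into memory in one synchronous call, while larger files are streamed in and parsed once fully read, so a single huge file does not block the event loop for the entire read. A minimal sketch of the idea, using a hypothetical readDocumentJson helper that is not part of this commit:

const fs = require("fs");

// Hypothetical helper: parse a document JSON file, streaming only when it
// is too large to comfortably read into memory in one call.
async function readDocumentJson(pathToFile, thresholdBytes) {
  if (fs.statSync(pathToFile).size < thresholdBytes)
    return JSON.parse(fs.readFileSync(pathToFile, "utf8"));

  return new Promise((resolve, reject) => {
    let raw = "";
    fs.createReadStream(pathToFile, { encoding: "utf8" })
      .on("data", (chunk) => (raw += chunk))
      .on("end", () => {
        try {
          resolve(JSON.parse(raw)); // JSON.parse can throw on malformed files
        } catch (err) {
          reject(err);
        }
      })
      .on("error", reject);
  });
}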

@@ -26,6 +26,7 @@ async function fileData(filePath = null) {
 async function viewLocalFiles() {
   if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
+  const filePromises = [];
   const liveSyncAvailable = await DocumentSyncQueue.enabled();
   const directory = {
     name: "documents",
@@ -43,28 +44,26 @@ async function viewLocalFiles() {
       type: "folder",
       items: [],
     };
     const subfiles = fs.readdirSync(folderPath);
     const filenames = {};
-    for (const subfile of subfiles) {
-      if (path.extname(subfile) !== ".json") continue;
-      const filePath = path.join(folderPath, subfile);
-      const rawData = fs.readFileSync(filePath, "utf8");
-      const cachefilename = `${file}/${subfile}`;
-      const { pageContent, ...metadata } = JSON.parse(rawData);
-      subdocs.items.push({
-        name: subfile,
-        type: "file",
-        ...metadata,
-        cached: await cachedVectorInformation(cachefilename, true),
-        canWatch: liveSyncAvailable
-          ? DocumentSyncQueue.canWatch(metadata)
-          : false,
-        // pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
-        // watched: false, // boolean to indicate if this document is watched in ANY workspace
-      });
+    for (let i = 0; i < subfiles.length; i++) {
+      const subfile = subfiles[i];
+      const cachefilename = `${file}/${subfile}`;
+      if (path.extname(subfile) !== ".json") continue;
+      filePromises.push(
+        fileToPickerData({
+          pathToFile: path.join(folderPath, subfile),
+          liveSyncAvailable,
+        })
+      );
       filenames[cachefilename] = subfile;
     }
+    const results = await Promise.all(filePromises).then((results) =>
+      results.filter((i) => !!i)
+    ); // Filter out any null results
+    subdocs.items.push(...results);

     // Grab the pinned workspaces and watched documents for this folder's documents
     // at the time of the query so we don't have to re-query the database for each file
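The loop change above is a serial-await to Promise.all conversion: every fileToPickerData call is started first and awaited once, so folder load time is bounded by the slowest file rather than the sum of all reads. Stripped to the pattern, inside any async function (files and buildItem are hypothetical stand-ins):

// Before: reads ran one at a time - total time is the sum of every file read.
// After: all reads start immediately and are awaited once - total time is
// roughly that of the slowest single file.
const items = (await Promise.all(files.map((file) => buildItem(file)))).filter(
  (item) => item !== null
); // drop files that failed to parse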
@@ -335,6 +334,100 @@ function purgeEntireVectorCache() {
  return;
}
/**
 * File size threshold, in bytes, above which a file is considered too large
 * to read into memory in one call.
 *
 * If the file is larger than this, we stream it and parse it once fully read.
 * This prevents excessive memory use when parsing large files for the
 * file picker.
 * @TODO - When lazy loading for folders is implemented, raise this threshold
 * (e.g., 512MB), since readSync will always be faster than streaming the file
 * and parsing it in chunks.
 */
const FILE_READ_SIZE_THRESHOLD = 150 * (1024 * 1024);
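// 150 * (1024 * 1024) = 157,286,400 bytes (~150MB). Files at or above this
// size take the streaming path in fileToPickerData below.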
/**
 * Converts a document JSON file into its file picker entry.
 * @param {Object} args
 * @param {string} args.pathToFile - The path to the file to convert
 * @param {boolean} [args.liveSyncAvailable=false] - Whether live document sync is available
 * @returns {Promise<{name: string, type: string, cached: boolean, canWatch: boolean}|null>} - The picker data, or null if the file could not be parsed
 */
async function fileToPickerData({ pathToFile, liveSyncAvailable = false }) {
  let metadata = {};
  const filename = path.basename(pathToFile);
  const fileStats = fs.statSync(pathToFile);
  const cachedStatus = await cachedVectorInformation(pathToFile, true);

  // Files under the threshold are read synchronously - faster than streaming
  // for anything that comfortably fits in memory.
  if (fileStats.size < FILE_READ_SIZE_THRESHOLD) {
    const rawData = fs.readFileSync(pathToFile, "utf8");
    try {
      metadata = JSON.parse(rawData);
      // Remove the pageContent field from the metadata - it is large and not needed for the picker
      delete metadata.pageContent;
    } catch (err) {
      console.error("Error parsing file", err);
      return null;
    }

    return {
      name: filename,
      type: "file",
      ...metadata,
      cached: cachedStatus,
      // Compute canWatch from the parsed metadata - checking it against the
      // empty placeholder object would always yield false.
      canWatch: liveSyncAvailable ? DocumentSyncQueue.canWatch(metadata) : false,
      // pinnedWorkspaces: [], // This is the list of workspaceIds that have pinned this document
      // watched: false, // boolean to indicate if this document is watched in ANY workspace
    };
  }
  console.log(
    `Stream-parsing ${path.basename(pathToFile)} because it exceeds the ${FILE_READ_SIZE_THRESHOLD} byte limit.`
  );
  const stream = fs.createReadStream(pathToFile, { encoding: "utf8" });
  try {
    let fileContent = "";
    metadata = await new Promise((resolve, reject) => {
      stream
        .on("data", (chunk) => {
          fileContent += chunk;
        })
        .on("end", () => {
          try {
            const parsed = JSON.parse(fileContent);
            // Remove the pageContent field from the metadata - it is large and not needed for the picker
            delete parsed.pageContent;
            resolve(parsed);
          } catch (err) {
            // JSON.parse throws on malformed input - reject instead of letting
            // the exception escape the event handler uncaught.
            reject(err);
          }
        })
        .on("error", (err) => reject(err));
    });
  } catch (err) {
    console.error("Error parsing file", err);
    metadata = null;
  } finally {
    stream.destroy();
  }
  // If the metadata is empty or something went wrong, return null
  if (!metadata || !Object.keys(metadata).length) {
    console.log(`Stream-parsing failed for ${path.basename(pathToFile)}`);
    return null;
  }
  return {
    name: filename,
    type: "file",
    ...metadata,
    cached: cachedStatus,
    canWatch: liveSyncAvailable ? DocumentSyncQueue.canWatch(metadata) : false,
  };
}
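// Usage sketch (hypothetical caller, not part of this commit):
//   const entry = await fileToPickerData({
//     pathToFile: path.join(documentsPath, "custom-documents", "my-doc.json"),
//     liveSyncAvailable: await DocumentSyncQueue.enabled(),
//   });
// `entry` is the picker item for the file, or null if parsing failed.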
module.exports = {
  findDocumentInDocuments,
  cachedVectorInformation,