merlyn/collector/utils/files/index.js
Timothy Carambat 0fb33736da
Workspace Chat with documents overhaul (#4261)
* Create parse endpoint in collector (#4212)

* create parse endpoint in collector

* revert cleanup temp util call

* lint

* remove unused cleanupTempDocuments function

* revert slug change
minor change for destinations

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>

* Add parsed files table and parse server endpoints (#4222)

* add workspace_parsed_files table + parse endpoints/models

* remove dev api parse endpoint

* remove unneeded imports

* iterate over all files + remove unneeded update function + update telemetry debounce

* Upload UI/UX context window check + frontend alert (#4230)

* prompt user to embed if exceeds prompt window + handle embed + handle cancel

* add tokenCountEstimate to workspace_parsed_files + optimizations

* use util for path locations + use safeJsonParse

* add modal for user decision on overflow of context window

* lint

* dynamic fetching of provider/model combo + inject parsed documents

* remove unneeded comments

* popup ui for attaching/removing files + warning to embed + wip fetching states on update

* remove prop drilling, fetch files/limits directly in attach files popup

* rework ux of FE + BE optimizations

* fix ux of FE + BE optimizations

* Implement bidirectional sync for parsed file states
linting
small changes and comments

* move parse support to another endpoint file
simplify calls and loading of records

* button borders

* enable default users to upload parsed files but NOT embed

* delete cascade on user/workspace/thread deletion to remove parsedFileRecord

* enable bgworker with "always" jobs and optional document sync jobs
orphan document job: Will find any broken reference files to prevent overpollution of the storage folder. This will run 10s after boot and every 12hr after

* change run timeout for orphan job to 1m to allow settling before spawning a worker

* linting and cleanup pr

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>

* dev build

* fix tooltip hiding during embedding overflow files

* prevent crash log from ERRNO on parse files

* unused import

* update docs link

* Migrate parsed-files to GET endpoint
patch logic for grabbing models names from utils
better handling for undetermined context windows (null instead of POSITIVE_INFINITY)
UI placeholder for null context windows

* patch URL

---------

Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
2025-08-11 09:26:19 -07:00

227 lines
7.5 KiB
JavaScript

const fs = require("fs");
const path = require("path");
const { MimeDetector } = require("./mime");
/**
 * Absolute path of the folder where documents are stored once they have
 * been processed by the collector.
 *
 * Outside of development this is `documents` inside the directory named by
 * the STORAGE_DIR environment variable; in development it is resolved
 * relative to this file as `../../../server/storage/documents`.
 */
const documentsFolder =
  process.env.NODE_ENV !== "development"
    ? path.resolve(process.env.STORAGE_DIR, `documents`)
    : path.resolve(__dirname, `../../../server/storage/documents`);
/**
 * Absolute path of the folder where direct uploads are stored once they have
 * been processed by the collector. These are files that were drag-and-dropped
 * into the UI and are not to be embedded or selectable from the file picker.
 *
 * Outside of development this is `direct-uploads` inside the directory named
 * by the STORAGE_DIR environment variable; in development it is resolved
 * relative to this file as `../../../server/storage/direct-uploads`.
 */
const directUploadsFolder =
  process.env.NODE_ENV !== "development"
    ? path.resolve(process.env.STORAGE_DIR, `direct-uploads`)
    : path.resolve(__dirname, `../../../server/storage/direct-uploads`);
/**
 * Decides whether a file should be treated as text.
 *
 * The MIME type is consulted first. Only when the MIME lookup is inconclusive
 * (reason "generic") is the file content itself inspected, so files with an
 * unknown but text-compatible type are still accepted without adding a MIME
 * override for each of them.
 * @param {string} filepath - The path to the file.
 * @returns {boolean} - True if the file is text, false otherwise (including when the file does not exist).
 */
function isTextType(filepath) {
  if (!fs.existsSync(filepath)) return false;

  const { valid, reason } = isKnownTextMime(filepath);
  if (valid) return true;

  // A definite rejection (any reason other than "generic") is final; only an
  // inconclusive lookup falls back to buffer inspection.
  return reason === "generic" ? parseableAsText(filepath) : false;
}
/**
 * Classifies a file by its MIME type to decide whether it is a known text format.
 *
 * The result is an object, not a boolean. `reason` is one of:
 *  - "valid_mime": the MIME type is neither blocked nor of a non-text top-level type.
 *  - "bad_mime": the MIME type is listed in the detector's `badMimes`.
 *  - "non_text_mime": the top-level type (the part before "/") is listed in `nonTextTypes`.
 *  - "generic": no usable MIME type could be determined, or the lookup threw.
 * @param {string} filepath - The path to the file.
 * @returns {{valid: boolean, reason: string}} - `valid` is true only for "valid_mime".
 */
function isKnownTextMime(filepath) {
  try {
    const mimeLib = new MimeDetector();
    const mime = mimeLib.getType(filepath);
    if (mimeLib.badMimes.includes(mime))
      return { valid: false, reason: "bad_mime" };

    // A non-string result cannot be split into type/subtype. Report it as
    // inconclusive explicitly rather than relying on a TypeError reaching
    // the catch below. (Presumably getType yields null for unknown
    // extensions - confirm against MimeDetector.)
    if (typeof mime !== "string") return { valid: false, reason: "generic" };

    const [type] = mime.split("/");
    if (mimeLib.nonTextTypes.includes(type))
      return { valid: false, reason: "non_text_mime" };
    return { valid: true, reason: "valid_mime" };
  } catch {
    // Any failure while detecting is treated as "could not tell".
    return { valid: false, reason: "generic" };
  }
}
/**
 * Checks if a file is parseable as text by decoding the start of it as utf8
 * and measuring how much of it consists of control characters.
 *
 * Only the first 1KB is sampled. The file is rejected when the number of NUL
 * and other non-whitespace control characters reaches 10% of the bytes read.
 * NUL characters are matched by both patterns below and therefore count twice.
 * An empty file yields a threshold of 0 and is reported as not parseable.
 * @param {string} filepath - The path to the file.
 * @returns {boolean} - True if the sample looks like text; false if it looks binary or the file cannot be opened/read.
 */
function parseableAsText(filepath) {
  const sampleSize = 1024;
  let fd = null;
  try {
    fd = fs.openSync(filepath, "r");
    const buffer = Buffer.alloc(sampleSize);
    const bytesRead = fs.readSync(fd, buffer, 0, sampleSize, 0);
    const content = buffer.subarray(0, bytesRead).toString("utf8");
    const nullCount = (content.match(/\0/g) || []).length;
    const controlCount = (content.match(/[\x00-\x08\x0B\x0C\x0E-\x1F]/g) || [])
      .length;
    const threshold = bytesRead * 0.1;
    return nullCount + controlCount < threshold;
  } catch {
    return false;
  } finally {
    // Always release the descriptor, including when the read itself failed
    // (for example when filepath is a directory).
    if (fd !== null) {
      try {
        fs.closeSync(fd);
      } catch {
        // Nothing useful can be done if closing fails; the verdict stands.
      }
    }
  }
}
/**
 * Deletes a single file. Does nothing when the path does not exist, when it
 * is a directory, or when it cannot be stat'ed.
 * @param {string} filepath - The path of the file to remove.
 * @returns {void}
 */
function trashFile(filepath) {
  if (!fs.existsSync(filepath)) return;

  let stats;
  try {
    // lstat: the entry itself is inspected, symlinks are not followed
    stats = fs.lstatSync(filepath);
  } catch {
    return;
  }

  // Directories are never removed by this helper
  if (stats.isDirectory()) return;
  fs.rmSync(filepath);
}
/**
 * Returns the creation (birth) time of a file as a locale-formatted string.
 * @param {string} filepath - The path to the file.
 * @returns {string} - The result of `birthtime.toLocaleString()`, or "unknown" when the file cannot be stat'ed or reports a birth time of 0.
 */
function createdDate(filepath) {
  try {
    const stats = fs.statSync(filepath);
    // A birth time of 0 is treated as an unusable stat result
    return stats.birthtimeMs === 0
      ? "unknown"
      : stats.birthtime.toLocaleString();
  } catch {
    return "unknown";
  }
}
/**
 * Writes a document to the server documents folder as pretty-printed JSON.
 *
 * The destination folder is chosen in this order: `destinationOverride` if
 * provided, else the direct uploads folder when `options.parseOnly` is set,
 * else the `custom-documents` folder inside the documents folder. The folder
 * is created if missing and the file is written as `<filename>.json`.
 * @param {Object} params - The parameters for the function.
 * @param {Object} params.data - The data to write to the file. Must look like a document object.
 * @param {string} params.filename - The name of the file to write to (without the `.json` extension).
 * @param {string|null} params.destinationOverride - A forced destination to write to - will be honored if provided.
 * @param {Object} params.options - The options for the function.
 * @param {boolean} params.options.parseOnly - If true, the file will be written to the direct uploads folder instead of the documents folder. Will be ignored if destinationOverride is provided.
 * @returns {Object} - A copy of the data with `location` and `isDirectUpload` added.
 * @throws {Error} - If no filename is provided.
 */
function writeToServerDocuments({
  data = {},
  filename,
  destinationOverride = null,
  options = {},
}) {
  if (!filename) throw new Error("Filename is required!");

  let destination = null;
  if (destinationOverride) destination = path.resolve(destinationOverride);
  else if (options.parseOnly) destination = path.resolve(directUploadsFolder);
  else destination = path.resolve(documentsFolder, "custom-documents");

  if (!fs.existsSync(destination))
    fs.mkdirSync(destination, { recursive: true });
  const destinationFilePath = path.resolve(destination, filename) + ".json";

  fs.writeFileSync(destinationFilePath, JSON.stringify(data, null, 4), {
    encoding: "utf-8",
  });

  return {
    ...data,
    // relative location string that can be passed into the /update-embeddings api
    // that will work since we know the location exists and since we only allow
    // 1-level deep folders this will always work. This still works for integrations like GitHub and YouTube.
    // The resolved path uses the platform separator (backslash on Windows), so
    // split on path.sep and always re-join with "/" to get `folder/file.json`.
    location: destinationFilePath.split(path.sep).slice(-2).join("/"),
    isDirectUpload: options.parseOnly || false,
  };
}
/**
 * Wipes the collector hot directory (`../../hotdir`) and tmp storage
 * (`../../storage/tmp`), both resolved relative to this file. Useful when
 * large files failed processing and could not be removed at the time: calling
 * this (e.g. on reboot) force-removes whatever was left behind.
 *
 * The marker files `__HOTDIR__.md` and `.placeholder` are kept. The wipe is
 * best-effort: a directory that is missing or unreadable is skipped, and an
 * entry that cannot be removed (for example a sub-directory) is left in place.
 * @returns {Promise<void>}
 */
async function wipeCollectorStorage() {
  // Removes every entry of `directory` except the one named `keepFile`.
  const emptyDirectory = async (directory, keepFile) => {
    let files;
    try {
      files = await fs.promises.readdir(directory);
    } catch {
      // Nothing to clean if the directory cannot be listed. Returning here is
      // essential: iterating an undefined listing would throw.
      return;
    }

    for (const file of files) {
      if (file === keepFile) continue;
      try {
        fs.rmSync(path.join(directory, file));
      } catch {
        // Deliberately ignored - one stubborn entry must not stop the wipe.
      }
    }
  };

  await Promise.all([
    emptyDirectory(path.resolve(__dirname, "../../hotdir"), "__HOTDIR__.md"),
    emptyDirectory(path.resolve(__dirname, "../../storage/tmp"), ".placeholder"),
  ]);
  console.log(`Collector hot directory and tmp storage wiped!`);
  return;
}
/**
 * Checks if a given path is strictly within another path.
 *
 * The check is based on the relative path from `outer` to `inner`: `inner` is
 * inside `outer` only if that relative path does not climb out via a leading
 * `..` segment and is not absolute. A path is never within itself.
 * @param {string} outer - The outer path (should be resolved).
 * @param {string} inner - The inner path (should be resolved).
 * @returns {boolean} - Returns true if the inner path is within the outer path, false otherwise.
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;
  const rel = path.relative(outer, inner);

  // Same location spelled differently (e.g. a trailing separator)
  if (rel === "") return false;
  // An absolute result means `inner` cannot be expressed relative to `outer`
  // (path.relative does this on Windows for paths on different drives).
  if (path.isAbsolute(rel)) return false;

  // Use the platform separator so that "..\\x" on Windows is recognised as
  // escaping. A plain startsWith("..") would wrongly reject names like "..foo".
  return rel !== ".." && !rel.startsWith(`..${path.sep}`);
}
/**
 * Normalizes a relative path and strips any leading parent-directory (`..`)
 * segments so the result cannot start by climbing out of its base folder.
 *
 * The input is trimmed, passed through `path.normalize`, has every leading
 * `../` or `..\` (or a lone trailing `..`) removed, and is trimmed again.
 * @param {string} filepath - The path to normalize.
 * @returns {string} - The normalized path.
 * @throws {Error} - "Invalid path." when nothing usable remains: an empty result, `.`, `..`, `/`, or the platform separator.
 */
function normalizePath(filepath = "") {
  const result = path
    .normalize(filepath.trim())
    .replace(/^(\.\.(\/|\\|$))+/, "")
    .trim();

  // The strip above turns inputs made only of ".." segments into "", so the
  // empty string has to be rejected here as well - otherwise such inputs
  // would silently resolve to the base folder itself.
  if (["", "..", ".", "/", path.sep].includes(result))
    throw new Error("Invalid path.");
  return result;
}
/**
 * Removes characters that are not allowed in file names
 * (`< > : " / \ | ? *`) from the given name.
 * @param {string} fileName - The file name to clean.
 * @returns {string} - The cleaned name; falsy input is returned unchanged.
 */
function sanitizeFileName(fileName) {
  const forbiddenChars = /[<>:"\/\\|?*]/g;
  return fileName ? fileName.replace(forbiddenChars, "") : fileName;
}
// Public surface of this module: the file helpers defined above plus the two
// resolved storage folder paths.
module.exports = {
trashFile,
isTextType,
createdDate,
writeToServerDocuments,
wipeCollectorStorage,
normalizePath,
isWithin,
sanitizeFileName,
documentsFolder,
directUploadsFolder,
};