* Create parse endpoint in collector (#4212) * create parse endpoint in collector * revert cleanup temp util call * lint * remove unused cleanupTempDocuments function * revert slug change minor change for destinations --------- Co-authored-by: timothycarambat <rambat1010@gmail.com> * Add parsed files table and parse server endpoints (#4222) * add workspace_parsed_files table + parse endpoints/models * remove dev api parse endpoint * remove unneeded imports * iterate over all files + remove unneeded update function + update telemetry debounce * Upload UI/UX context window check + frontend alert (#4230) * prompt user to embed if exceeds prompt window + handle embed + handle cancel * add tokenCountEstimate to workspace_parsed_files + optimizations * use util for path locations + use safeJsonParse * add modal for user decision on overflow of context window * lint * dynamic fetching of provider/model combo + inject parsed documents * remove unneeded comments * popup ui for attaching/removing files + warning to embed + wip fetching states on update * remove prop drilling, fetch files/limits directly in attach files popup * rework ux of FE + BE optimizations * fix ux of FE + BE optimizations * Implement bidirectional sync for parsed file states linting small changes and comments * move parse support to another endpoint file simplify calls and loading of records * button borders * enable default users to upload parsed files but NOT embed * delete cascade on user/workspace/thread deletion to remove parsedFileRecord * enable bgworker with "always" jobs and optional document sync jobs orphan document job: Will find any broken reference files to prevent overpollution of the storage folder. 
This will run 10s after boot and every 12hr after * change run timeout for orphan job to 1m to allow settling before spawning a worker * linting and cleanup pr --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com> * dev build * fix tooltip hiding during embedding overflow files * prevent crash log from ERRNO on parse files * unused import * update docs link * Migrate parsed-files to GET endpoint patch logic for grabbing models names from utils better handling for undetermined context windows (null instead of Pos_INIFI) UI placeholder for null context windows * patch URL --------- Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
228 lines
6.3 KiB
JavaScript
228 lines
6.3 KiB
JavaScript
const prisma = require("../utils/prisma");
|
|
const { EventLogs } = require("./eventLogs");
|
|
const { Document } = require("./documents");
|
|
const { documentsPath, directUploadsPath } = require("../utils/files");
|
|
const { safeJsonParse } = require("../utils/http");
|
|
const fs = require("fs");
|
|
const path = require("path");
|
|
|
|
/**
 * Pulls the stored filename out of a parsed file's `metadata.location`.
 * The filename is the second "/"-separated segment of the location
 * (presumably `<folder>/<filename>` - confirm against whatever writes metadata).
 * @param {unknown} location - Raw `location` value from the record metadata.
 * @returns {string|null} The filename segment, or null when it cannot be derived.
 */
function parsedFileNameFromLocation(location) {
  if (typeof location !== "string") return null;
  const [, filename] = location.split("/");
  return filename || null;
}

/**
 * Model wrapper around the `workspace_parsed_files` Prisma table plus the
 * filesystem helpers that read parsed files or promote them to embedded documents.
 */
const WorkspaceParsedFiles = {
  /**
   * Inserts a parsed-file record and logs a `workspace_file_uploaded` event.
   * NOTE(review): the event is logged inside the same try block, so a logging
   * failure is reported as a creation failure even though the row was written.
   * @param {Object} params
   * @param {string} params.filename
   * @param {number|string} params.workspaceId - Coerced with parseInt.
   * @param {number|string|null} [params.userId=null] - Coerced with parseInt when truthy.
   * @param {number|string|null} [params.threadId=null] - Coerced with parseInt when truthy.
   * @param {string|null} [params.metadata=null] - Stored as given; read back elsewhere via safeJsonParse.
   * @param {number} [params.tokenCountEstimate=0]
   * @returns {Promise<{file: Object|null, error: string|null}>}
   */
  create: async function ({
    filename,
    workspaceId,
    userId = null,
    threadId = null,
    metadata = null,
    tokenCountEstimate = 0,
  }) {
    try {
      const file = await prisma.workspace_parsed_files.create({
        data: {
          filename,
          workspaceId: parseInt(workspaceId),
          userId: userId ? parseInt(userId) : null,
          threadId: threadId ? parseInt(threadId) : null,
          metadata,
          tokenCountEstimate,
        },
      });

      await EventLogs.logEvent(
        "workspace_file_uploaded",
        {
          filename,
          workspaceId,
        },
        userId
      );

      return { file, error: null };
    } catch (error) {
      console.error("FAILED TO CREATE PARSED FILE RECORD.", error.message);
      return { file: null, error: error.message };
    }
  },

  /**
   * Finds the first record matching `clause`.
   * @param {Object} [clause={}] - Prisma `where` filter.
   * @returns {Promise<Object|null>} The record, or null when none matches or the query fails.
   */
  get: async function (clause = {}) {
    try {
      const file = await prisma.workspace_parsed_files.findFirst({
        where: clause,
      });
      return file;
    } catch (error) {
      console.error(error.message);
      return null;
    }
  },

  /**
   * Finds every record matching `clause`.
   * @param {Object} [clause={}] - Prisma `where` filter.
   * @param {number|null} [limit=null] - Forwarded as `take` when not null.
   * @param {Object|null} [orderBy=null] - Forwarded as `orderBy` when not null.
   * @param {Object|null} [select=null] - Forwarded as `select` when not null.
   * @returns {Promise<Object[]>} Matching records, or an empty array when the query fails.
   */
  where: async function (
    clause = {},
    limit = null,
    orderBy = null,
    select = null
  ) {
    try {
      const files = await prisma.workspace_parsed_files.findMany({
        where: clause,
        ...(limit !== null ? { take: limit } : {}),
        ...(orderBy !== null ? { orderBy } : {}),
        ...(select !== null ? { select } : {}),
      });
      return files;
    } catch (error) {
      console.error(error.message);
      return [];
    }
  },

  /**
   * Deletes every record matching `clause` via `deleteMany`.
   * NOTE(review): the default clause is `{}`; confirm callers always pass a
   * filter, since an empty filter is forwarded to `deleteMany` unchanged.
   * @param {Object} [clause={}] - Prisma `where` filter.
   * @returns {Promise<boolean>} True when the delete call resolved, false when it threw.
   */
  delete: async function (clause = {}) {
    try {
      await prisma.workspace_parsed_files.deleteMany({
        where: clause,
      });
      return true;
    } catch (error) {
      console.error(error.message);
      return false;
    }
  },

  /**
   * Sums `tokenCountEstimate` over the records matching `clause`.
   * Unlike the other query helpers this does not catch errors: a failed
   * aggregate rejects the returned promise.
   * @param {Object} [clause={}] - Prisma `where` filter.
   * @returns {Promise<number>} The sum, or 0 when the aggregate yields a falsy value.
   */
  totalTokenCount: async function (clause = {}) {
    const { _sum } = await prisma.workspace_parsed_files.aggregate({
      where: clause,
      _sum: { tokenCountEstimate: true },
    });
    return _sum.tokenCountEstimate || 0;
  },

  /**
   * Moves a parsed file from the direct-uploads folder into
   * `custom-documents` and embeds it into `workspace` through
   * `Document.addDocuments`. The parsed-file record is deleted afterwards
   * whether or not the move/embed succeeded.
   * @param {number|string} fileId - Id of the parsed-file record (coerced with parseInt).
   * @param {Object} workspace - Workspace to embed into; its `id` is used to look up the new document.
   * @returns {Promise<{success: boolean, error: string|null, document: Object|null}>}
   */
  moveToDocumentsAndEmbed: async function (fileId, workspace) {
    try {
      const parsedFile = await this.get({ id: parseInt(fileId) });
      if (!parsedFile) throw new Error("File not found");

      // Get file location from metadata
      const metadata = safeJsonParse(parsedFile.metadata, {}) ?? {};
      if (!metadata.location) throw new Error("No file location in metadata");

      const filename = parsedFileNameFromLocation(metadata.location);
      if (!filename) throw new Error("Invalid file location in metadata");

      // Get file from metadata location
      const sourceFile = path.join(directUploadsPath, filename);
      if (!fs.existsSync(sourceFile)) throw new Error("Source file not found");

      // Move to custom-documents
      const customDocsPath = path.join(documentsPath, "custom-documents");
      if (!fs.existsSync(customDocsPath))
        fs.mkdirSync(customDocsPath, { recursive: true });

      // Copy the file to custom-documents, then remove the original
      fs.copyFileSync(sourceFile, path.join(customDocsPath, filename));
      fs.unlinkSync(sourceFile);

      const {
        failedToEmbed = [],
        errors = [],
        embedded = [],
      } = await Document.addDocuments(
        workspace,
        [`custom-documents/${filename}`],
        parsedFile.userId
      );

      if (failedToEmbed.length > 0)
        throw new Error(errors[0] || "Failed to embed document");

      // Nothing embedded and nothing reported as failed: looking up
      // `docpath: undefined` could match an unrelated document, so fail here.
      if (embedded.length === 0) throw new Error("Failed to embed document");

      const document = await Document.get({
        workspaceId: workspace.id,
        docpath: embedded[0],
      });
      return { success: true, error: null, document };
    } catch (error) {
      console.error("Failed to move and embed file:", error);
      return { success: false, error: error.message, document: null };
    } finally {
      // Always delete the file after processing
      await this.delete({ id: parseInt(fileId) });
    }
  },

  /**
   * Lists the parsed files attached to a workspace (optionally narrowed to a
   * thread and user) together with their summed token estimate.
   * When `thread` is omitted the query filters on `threadId: null`.
   * @param {Object} workspace - Required; `id` and `contextWindow` are read from it.
   * @param {Object|null} [thread=null]
   * @param {Object|null} [user=null] - When given, results are limited to this user's files.
   * @returns {Promise<{files: Object[], contextWindow: *, currentContextTokenCount: number}>}
   *   On any error: `{ files: [], contextWindow: Infinity, currentContextTokenCount: 0 }`.
   */
  getContextMetadataAndLimits: async function (
    workspace,
    thread = null,
    user = null
  ) {
    try {
      if (!workspace) throw new Error("Workspace is required");
      const files = await this.where({
        workspaceId: workspace.id,
        threadId: thread?.id || null,
        ...(user ? { userId: user.id } : {}),
      });

      const results = [];
      let totalTokens = 0;

      for (const file of files) {
        const metadata = safeJsonParse(file.metadata, {}) ?? {};
        totalTokens += file.tokenCountEstimate || 0;
        results.push({
          id: file.id,
          title: metadata.title || metadata.location,
          location: metadata.location,
          token_count_estimate: file.tokenCountEstimate,
        });
      }

      return {
        files: results,
        contextWindow: workspace.contextWindow,
        currentContextTokenCount: totalTokens,
      };
    } catch (error) {
      console.error("Failed to get context metadata:", error);
      return {
        files: [],
        contextWindow: Infinity,
        currentContextTokenCount: 0,
      };
    }
  },

  /**
   * Loads the content of every parsed file attached to a workspace
   * (optionally narrowed to a thread and user) from the direct-uploads folder.
   * Records are skipped individually when their location is missing or
   * malformed, the file is absent or unreadable, or it has no `pageContent`.
   * @param {Object} workspace - `id` is read from it.
   * @param {Object|null} [thread=null]
   * @param {Object|null} [user=null] - When given, results are limited to this user's files.
   * @returns {Promise<Object[]>} One entry per readable file: `pageContent`,
   *   `token_count_estimate`, then the record's metadata keys spread on top.
   *   Empty array when the record query itself fails.
   */
  getContextFiles: async function (workspace, thread = null, user = null) {
    try {
      const files = await this.where({
        workspaceId: workspace.id,
        threadId: thread?.id || null,
        ...(user ? { userId: user.id } : {}),
      });

      const results = [];
      for (const file of files) {
        const metadata = safeJsonParse(file.metadata, {}) ?? {};
        const filename = parsedFileNameFromLocation(metadata.location);
        if (!filename) continue;

        const sourceFile = path.join(directUploadsPath, filename);
        if (!fs.existsSync(sourceFile)) continue;

        // A single unreadable file must not discard the other context files.
        let data = null;
        try {
          data = safeJsonParse(fs.readFileSync(sourceFile, "utf-8"), null);
        } catch (error) {
          console.error(
            `Failed to read parsed file ${sourceFile}:`,
            error.message
          );
          continue;
        }
        if (!data?.pageContent) continue;

        results.push({
          pageContent: data.pageContent,
          token_count_estimate: file.tokenCountEstimate,
          ...metadata,
        });
      }

      return results;
    } catch (error) {
      console.error("Failed to get context files:", error);
      return [];
    }
  },
};
|
|
|
|
// CommonJS export: the model object is the only name exported here.
module.exports = { WorkspaceParsedFiles };
|