merlyn/collector/index.js
Timothy Carambat 0fb33736da
Workspace Chat with documents overhaul (#4261)
* Create parse endpoint in collector (#4212)

* create parse endpoint in collector

* revert cleanup temp util call

* lint

* remove unused cleanupTempDocuments function

* revert slug change
minor change for destinations

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>

* Add parsed files table and parse server endpoints (#4222)

* add workspace_parsed_files table + parse endpoints/models

* remove dev api parse endpoint

* remove unneeded imports

* iterate over all files + remove unneeded update function + update telemetry debounce

* Upload UI/UX context window check + frontend alert (#4230)

* prompt user to embed if exceeds prompt window + handle embed + handle cancel

* add tokenCountEstimate to workspace_parsed_files + optimizations

* use util for path locations + use safeJsonParse

* add modal for user decision on overflow of context window

* lint

* dynamic fetching of provider/model combo + inject parsed documents

* remove unneeded comments

* popup ui for attaching/removing files + warning to embed + wip fetching states on update

* remove prop drilling, fetch files/limits directly in attach files popup

* rework ux of FE + BE optimizations

* fix ux of FE + BE optimizations

* Implement bidirectional sync for parsed file states
linting
small changes and comments

* move parse support to another endpoint file
simplify calls and loading of records

* button borders

* enable default users to upload parsed files but NOT embed

* delete cascade on user/workspace/thread deletion to remove parsedFileRecord

* enable bgworker with "always" jobs and optional document sync jobs
orphan document job: Will find any broken reference files to prevent overpollution of the storage folder. This will run 10s after boot and every 12hr after

* change run timeout for orphan job to 1m to allow settling before spawning a worker

* linting and cleanup pr

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>

* dev build

* fix tooltip hiding during embedding overflow files

* prevent crash log from ERRNO on parse files

* unused import

* update docs link

* Migrate parsed-files to GET endpoint
patch logic for grabbing model names from utils
better handling for undetermined context windows (null instead of POSITIVE_INFINITY)
UI placeholder for null context windows

* patch URL

---------

Co-authored-by: Sean Hatfield <seanhatfield5@gmail.com>
2025-08-11 09:26:19 -07:00

189 lines
4.7 KiB
JavaScript

// Load environment variables: in development read `.env.development`,
// otherwise fall back to dotenv's default configuration.
if (process.env.NODE_ENV === "development") {
  require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` });
} else {
  require("dotenv").config();
}

// The logger module exports a function; invoke it once at startup.
require("./utils/logger")();
const express = require("express");
const bodyParser = require("body-parser");
const cors = require("cors");
const path = require("path");
const { ACCEPTED_MIMES } = require("./utils/constants");
const { reqBody } = require("./utils/http");
const { processSingleFile } = require("./processSingleFile");
const { processLink, getLinkText } = require("./processLink");
const { wipeCollectorStorage } = require("./utils/files");
const extensions = require("./extensions");
const { processRawText } = require("./processRawText");
const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
// Express application shared by every route registered below.
const app = express();

// Size limit handed to each body parser via its `limit` option.
const FILE_LIMIT = "3GB";

// Builds a fresh parser options object carrying the shared size limit.
const withFileLimit = (extra = {}) => ({ limit: FILE_LIMIT, ...extra });

app.use(cors({ origin: true }));

// Register text, JSON and urlencoded body parsers, in that order.
app.use(
  bodyParser.text(withFileLimit()),
  bodyParser.json(withFileLimit()),
  bodyParser.urlencoded(withFileLimit({ extended: true }))
);
/**
 * POST /process
 *
 * Reads `filename` and `options` from the request body, sanitizes the
 * filename, and hands it to `processSingleFile`. Always answers with HTTP 200;
 * failure is reported through `success: false` in the JSON payload.
 */
app.post("/process", [verifyPayloadIntegrity], async (request, response) => {
  const { filename, options = {} } = reqBody(request);

  try {
    // After normalization, drop any run of leading `..` segments
    // (each followed by `/`, `\`, or end of string).
    const normalized = path.normalize(filename);
    const targetFilename = normalized.replace(/^(\.\.(\/|\\|$))+/, "");

    const result = await processSingleFile(targetFilename, options);
    const { success, reason, documents = [] } = result;

    response
      .status(200)
      .json({ filename: targetFilename, success, reason, documents });
  } catch (e) {
    console.error(e);
    const failure = {
      filename: filename,
      success: false,
      reason: "A processing error occurred.",
      documents: [],
    };
    response.status(200).json(failure);
  }
});
/**
 * POST /parse
 *
 * Same flow as /process, except `processSingleFile` is called with the
 * caller's options plus `parseOnly: true`. Always answers with HTTP 200;
 * failure is reported through `success: false` in the JSON payload.
 */
app.post("/parse", [verifyPayloadIntegrity], async (request, response) => {
  const { filename, options = {} } = reqBody(request);

  try {
    // After normalization, drop any run of leading `..` segments
    // (each followed by `/`, `\`, or end of string).
    const normalized = path.normalize(filename);
    const targetFilename = normalized.replace(/^(\.\.(\/|\\|$))+/, "");

    // `parseOnly` is set last so it overrides any caller-supplied value.
    const parseOptions = { ...options, parseOnly: true };
    const result = await processSingleFile(targetFilename, parseOptions);
    const { success, reason, documents = [] } = result;

    response
      .status(200)
      .json({ filename: targetFilename, success, reason, documents });
  } catch (e) {
    console.error(e);
    const failure = {
      filename: filename,
      success: false,
      reason: "A processing error occurred.",
      documents: [],
    };
    response.status(200).json(failure);
  }
});
/**
 * POST /process-link
 *
 * Reads `link` and optional `scraperHeaders` from the request body and passes
 * them to `processLink`. Always answers with HTTP 200; failure is reported
 * through `success: false` in the JSON payload.
 */
app.post(
  "/process-link",
  [verifyPayloadIntegrity],
  async (request, response) => {
    const { link, scraperHeaders = {} } = reqBody(request);

    try {
      const result = await processLink(link, scraperHeaders);
      const { success, reason, documents = [] } = result;
      response.status(200).json({ url: link, success, reason, documents });
    } catch (e) {
      console.error(e);
      const failure = {
        url: link,
        success: false,
        reason: "A processing error occurred.",
        documents: [],
      };
      response.status(200).json(failure);
    }
  }
);
/**
 * POST /util/get-link
 *
 * Reads `link` and `captureAs` (default "text") from the request body and
 * returns whatever `getLinkText` yields as `content`. Always answers with
 * HTTP 200; on error the payload carries `success: false` and `content: null`.
 */
app.post(
  "/util/get-link",
  [verifyPayloadIntegrity],
  async (request, response) => {
    const { link, captureAs = "text" } = reqBody(request);

    try {
      const result = await getLinkText(link, captureAs);
      const { success, content = null } = result;
      response.status(200).json({ url: link, success, content });
    } catch (e) {
      console.error(e);
      const failure = { url: link, success: false, content: null };
      response.status(200).json(failure);
    }
  }
);
/**
 * POST /process-raw-text
 *
 * Reads `textContent` and `metadata` from the request body and passes them to
 * `processRawText`. Always answers with HTTP 200; failure is reported through
 * `success: false` in the JSON payload.
 *
 * On success `filename` echoes `metadata.title` (absent if there is none);
 * on error it falls back to "Unknown-doc.txt" when no title is available.
 */
app.post(
  "/process-raw-text",
  [verifyPayloadIntegrity],
  async function (request, response) {
    const { textContent, metadata } = reqBody(request);
    try {
      const {
        success,
        reason,
        documents = [],
      } = await processRawText(textContent, metadata);
      // Read the title defensively, matching the catch branch: a missing
      // `metadata` must not turn an already-successful result into a
      // reported processing failure.
      response
        .status(200)
        .json({ filename: metadata?.title, success, reason, documents });
    } catch (e) {
      console.error(e);
      response.status(200).json({
        filename: metadata?.title || "Unknown-doc.txt",
        success: false,
        reason: "A processing error occurred.",
        documents: [],
      });
    }
    return;
  }
);
// Let the extensions module attach its own routes to the app. This runs
// before the catch-all below so those routes are registered first.
extensions(app);

// GET /accepts: respond with the ACCEPTED_MIMES constant as JSON.
app.get("/accepts", (_request, response) => {
  response.status(200).json(ACCEPTED_MIMES);
});

// Catch-all: any other method/path is answered with a bare 200.
app.all("*", (_request, response) => {
  response.sendStatus(200);
});
// Start the HTTP server on port 8888. Once listening, wipe collector storage
// and announce readiness.
app
  .listen(8888, async () => {
    await wipeCollectorStorage();
    console.log(`Document processor app listening on port 8888`);
  })
  .on("error", function (error) {
    // Report the server error instead of discarding it, so a failed listen
    // (e.g. the port already being in use) is visible in the logs.
    console.error("Document processor server error:", error);

    // Signal handling is unchanged: re-send SIGUSR2 / SIGINT to this process
    // when they are received.
    // NOTE(review): SIGUSR2 is presumably for a dev restart tool such as
    // nodemon — confirm.
    process.once("SIGUSR2", function () {
      process.kill(process.pid, "SIGUSR2");
    });
    process.on("SIGINT", function () {
      process.kill(process.pid, "SIGINT");
    });
  });