// Document processor ("collector") service: parses files, links, and raw
// text into workspace documents for the main application.
process.env.NODE_ENV === "development"
  ? require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` })
  : require("dotenv").config();

require("./utils/logger")();
|
|
const express = require("express");
|
|
const bodyParser = require("body-parser");
|
|
const cors = require("cors");
|
|
const path = require("path");
|
|
const { ACCEPTED_MIMES } = require("./utils/constants");
|
|
const { reqBody } = require("./utils/http");
|
|
const { processSingleFile } = require("./processSingleFile");
|
|
const { processLink, getLinkText } = require("./processLink");
|
|
const { wipeCollectorStorage } = require("./utils/files");
|
|
const extensions = require("./extensions");
|
|
const { processRawText } = require("./processRawText");
|
|
const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
|
|
const app = express();
|
|
const FILE_LIMIT = "3GB";
|
|
|
|
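// CORS is wide open, and the body parsers allow very large payloads because
// raw file contents and scraped page text arrive as plain-text or JSON bodies.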
app.use(cors({ origin: true }));
app.use(
  bodyParser.text({ limit: FILE_LIMIT }),
  bodyParser.json({ limit: FILE_LIMIT }),
  bodyParser.urlencoded({
    limit: FILE_LIMIT,
    extended: true,
  })
);

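// POST /process — turn one stored file into workspace documents. A sketch of
// the expected payload (the exact storage layout is an assumption here):
//   { "filename": "my-report.pdf", "options": {} }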
app.post(
  "/process",
  [verifyPayloadIntegrity],
  async function (request, response) {
    const { filename, options = {} } = reqBody(request);
    try {
      // Strip any leading "../" (or "..\") segments to block path traversal.
      const targetFilename = path
        .normalize(filename)
        .replace(/^(\.\.(\/|\\|$))+/, "");
      const {
        success,
        reason,
        documents = [],
      } = await processSingleFile(targetFilename, options);
      response
        .status(200)
        .json({ filename: targetFilename, success, reason, documents });
    } catch (e) {
      console.error(e);
      response.status(200).json({
        filename: filename,
        success: false,
        reason: "A processing error occurred.",
        documents: [],
      });
    }
    return;
  }
);

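// POST /parse — same flow as /process, but parseOnly is forced on so the file
// is parsed and its documents returned without the rest of the processing
// pipeline (intent inferred from the option name).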
app.post(
  "/parse",
  [verifyPayloadIntegrity],
  async function (request, response) {
    const { filename, options = {} } = reqBody(request);
    try {
      const targetFilename = path
        .normalize(filename)
        .replace(/^(\.\.(\/|\\|$))+/, "");
      const {
        success,
        reason,
        documents = [],
      } = await processSingleFile(targetFilename, {
        ...options,
        parseOnly: true,
      });
      response
        .status(200)
        .json({ filename: targetFilename, success, reason, documents });
    } catch (e) {
      console.error(e);
      response.status(200).json({
        filename: filename,
        success: false,
        reason: "A processing error occurred.",
        documents: [],
      });
    }
    return;
  }
);

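// POST /process-link — scrape a URL into workspace documents; scraperHeaders
// lets callers forward extra HTTP headers (e.g. auth) to the scraper.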
app.post(
  "/process-link",
  [verifyPayloadIntegrity],
  async function (request, response) {
    const { link, scraperHeaders = {} } = reqBody(request);
    try {
      const {
        success,
        reason,
        documents = [],
      } = await processLink(link, scraperHeaders);
      response.status(200).json({ url: link, success, reason, documents });
    } catch (e) {
      console.error(e);
      response.status(200).json({
        url: link,
        success: false,
        reason: "A processing error occurred.",
        documents: [],
      });
    }
    return;
  }
);

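// POST /util/get-link — fetch a URL and return its content directly, with no
// documents created; captureAs selects the capture format (default "text").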
app.post(
  "/util/get-link",
  [verifyPayloadIntegrity],
  async function (request, response) {
    const { link, captureAs = "text" } = reqBody(request);
    try {
      const { success, content = null } = await getLinkText(link, captureAs);
      response.status(200).json({ url: link, success, content });
    } catch (e) {
      console.error(e);
      response.status(200).json({
        url: link,
        success: false,
        content: null,
      });
    }
    return;
  }
);

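// POST /process-raw-text — build documents from caller-supplied text;
// metadata.title doubles as the reported filename.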
app.post(
  "/process-raw-text",
  [verifyPayloadIntegrity],
  async function (request, response) {
    const { textContent, metadata } = reqBody(request);
    try {
      const {
        success,
        reason,
        documents = [],
      } = await processRawText(textContent, metadata);
      response
        .status(200)
        .json({ filename: metadata.title, success, reason, documents });
    } catch (e) {
      console.error(e);
      response.status(200).json({
        filename: metadata?.title || "Unknown-doc.txt",
        success: false,
        reason: "A processing error occurred.",
        documents: [],
      });
    }
    return;
  }
);

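// Mount any additional routes provided by the extensions module.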
extensions(app);

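// Report the MIME types the collector will accept for upload.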
app.get("/accepts", function (_, response) {
|
|
response.status(200).json(ACCEPTED_MIMES);
|
|
});
|
|
|
|
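// Catch-all: every other route simply responds 200 OK.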
app.all("*", function (_, response) {
|
|
response.sendStatus(200);
|
|
});
|
|
|
|
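// Start on port 8888; collector storage is wiped on boot so files left over
// from prior runs do not accumulate.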
app
  .listen(8888, async () => {
    await wipeCollectorStorage();
    console.log(`Document processor app listening on port 8888`);
  })
  .on("error", function (_) {
    // If the server fails to start, register signal passthroughs that re-emit
    // SIGUSR2 (used by nodemon restarts) and SIGINT (Ctrl-C) so the process
    // can still be terminated cleanly.
    process.once("SIGUSR2", function () {
      process.kill(process.pid, "SIGUSR2");
    });
    process.on("SIGINT", function () {
      process.kill(process.pid, "SIGINT");
    });
  });