merlyn/collector/index.js
Timothy Carambat 719521c307
Document Processor v2 (#442)
* wip: init refactor of document processor to JS

* add NodeJs PDF support

* wip: parity with python processor
feat: add pptx support

* fix: forgot files

* Remove python scripts totally

* wip:update docker to boot new collector

* add package.json support

* update dockerfile for new build

* update gitignore and linting

* add more protections on file lookup

* update package.json

* test build

* update docker commands to use cap-add=SYS_ADMIN so web scraper can run
update all scripts to reflect this
remove docker build for branch
2023-12-14 15:14:56 -08:00

79 lines
2.1 KiB
JavaScript

// Load environment variables via dotenv. When NODE_ENV is "development" the
// values come from `.env.development`; otherwise dotenv is called with no
// options.
if (process.env.NODE_ENV === "development") {
  require("dotenv").config({ path: `.env.${process.env.NODE_ENV}` });
} else {
  require("dotenv").config();
}
const express = require("express");
const bodyParser = require("body-parser");
const cors = require("cors");
const path = require("path");
const { ACCEPTED_MIMES } = require("./utils/constants");
const { reqBody } = require("./utils/http");
const { processSingleFile } = require("./processSingleFile");
const { processLink } = require("./processLink");
// Create the Express application and install request middleware.
const app = express();

// Enable CORS, reflecting the request origin (`origin: true`).
app.use(cors({ origin: true }));

// Body parsers, registered in this order: plain text, JSON, then
// URL-encoded forms with extended syntax enabled.
const bodyParsers = [
  bodyParser.text(),
  bodyParser.json(),
  bodyParser.urlencoded({ extended: true }),
];
app.use(...bodyParsers);
/**
 * POST /process
 * Runs the document processor on a single file named in the request body.
 *
 * Reads `filename` from the parsed body, normalizes it, strips any leading
 * `../` (or `..\`) segments, and passes the result to processSingleFile.
 * Responds with HTTP 200 and JSON `{ filename, success, reason }`; failures
 * are signalled through `success: false`, not through the status code.
 */
app.post("/process", async function (request, response) {
  // Declared outside the try so the catch block can echo it back.
  let filename;
  try {
    // Body parsing happens inside the try: this is an async handler, so an
    // exception thrown here would otherwise escape as a rejected promise
    // instead of producing the JSON failure response below.
    ({ filename } = reqBody(request));

    // Collapse the path and drop leading parent-directory references so the
    // name cannot start by climbing out of the intended directory.
    const targetFilename = path
      .normalize(filename)
      .replace(/^(\.\.(\/|\\|$))+/, "");

    const { success, reason } = await processSingleFile(targetFilename);
    response.status(200).json({ filename: targetFilename, success, reason });
  } catch (e) {
    console.error(e);
    response.status(200).json({
      filename,
      success: false,
      reason: "A processing error occurred.",
    });
  }
});
/**
 * POST /process-link
 * Runs the link processor on the `link` value from the request body.
 *
 * Responds with HTTP 200 and JSON `{ url, success, reason }`; failures are
 * signalled through `success: false`, not through the status code.
 */
app.post("/process-link", async function (request, response) {
  // Declared outside the try so the catch block can echo it back.
  let link;
  try {
    // Body parsing happens inside the try: this is an async handler, so an
    // exception thrown here would otherwise escape as a rejected promise
    // instead of producing the JSON failure response below.
    ({ link } = reqBody(request));

    const { success, reason } = await processLink(link);
    response.status(200).json({ url: link, success, reason });
  } catch (e) {
    console.error(e);
    response.status(200).json({
      url: link,
      success: false,
      reason: "A processing error occurred.",
    });
  }
});
// GET /accepts: reply with the ACCEPTED_MIMES constant serialized as JSON.
app.get("/accepts", (_req, res) => {
  res.status(200).json(ACCEPTED_MIMES);
});
// Fallback for any method and path not matched above: reply with status 200.
app.all("*", (_req, res) => {
  res.sendStatus(200);
});
// Start the HTTP server on port 8888 and log once it is accepting requests.
app
  .listen(8888, () => {
    console.log(`Document processor app listening on port 8888`);
  })
  .on("error", function (error) {
    // Surface the server error instead of discarding it, so a failed start
    // (for example, a port that cannot be bound) is visible in the logs.
    console.error(error);

    // Forward termination signals back to this process.
    // NOTE(review): SIGUSR2 forwarding looks like the nodemon restart
    // pattern; confirm that is the intent.
    process.once("SIGUSR2", function () {
      process.kill(process.pid, "SIGUSR2");
    });
    // `once`, not `on`: the listener must be removed before the signal is
    // re-sent, otherwise the re-sent SIGINT is delivered to this same
    // listener again and never reaches the default exit behaviour.
    process.once("SIGINT", function () {
      process.kill(process.pid, "SIGINT");
    });
  });