merlyn/collector/utils/extensions/PaperlessNgx/index.js
Sean Hatfield 05df4ac72b
Paperless ngx data connector (#4121)
* paperless ngx data connector

* wip resync paperless ngx

* fix generateChunkSource for resyncing paperless ngx

* lint

* Refactor Paperless-NGX connector
Fix issue with date rendering in tooltip + extended width
Move tooltip details to be column for more space

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2025-11-20 11:27:38 -08:00

129 lines
3.5 KiB
JavaScript

const fs = require("fs");
const path = require("path");
const { default: slugify } = require("slugify");
const { v4 } = require("uuid");
const {
writeToServerDocuments,
sanitizeFileName,
documentsFolder,
} = require("../../files");
const { tokenizeString } = require("../../tokenizer");
const { validBaseUrl } = require("../../http");
const PaperlessNgxLoader = require("./PaperlessNgxLoader");
/**
 * Load documents from a Paperless-ngx instance and store every document that
 * has page content as a server document inside a newly created folder under
 * `documentsFolder`.
 *
 * Only the origin of `baseUrl` is passed to the loader and embedded in the
 * stored metadata; any path or query string on `baseUrl` is ignored.
 *
 * @param {object} args - forwarded request body params
 * @param {string|null} [args.baseUrl] - URL of the Paperless-ngx instance
 * @param {string|null} [args.apiToken] - API token for the Paperless-ngx instance
 * @param {import("../../../middleware/setDataSigner").ResponseWithSigner} response - Express response object with encryptionWorker
 * @returns {Promise<{success: boolean, reason: string|null, data?: {files: number, destination: string}|null}>}
 * `data.files` is the number of documents actually written and
 * `data.destination` is the name of the folder they were written to.
 */
async function loadPaperlessNgx({ baseUrl = null, apiToken = null }, response) {
  if (!baseUrl || !validBaseUrl(baseUrl)) {
    return {
      success: false,
      reason: "Provided base URL is not a valid URL.",
    };
  }

  if (!apiToken) {
    return {
      success: false,
      reason:
        "You need to provide an API token to use the Paperless-ngx connector.",
    };
  }

  const { origin, hostname } = new URL(baseUrl);
  console.log(`-- Working Paperless-ngx ${origin} --`);
  const loader = new PaperlessNgxLoader({
    baseUrl: origin,
    apiToken,
  });

  let docs = [];
  let error = null;
  try {
    docs = await loader.load();
  } catch (e) {
    // Prefer the text right after the first "Error:" marker (without the
    // surrounding whitespace); otherwise fall back to the raw message.
    error = e?.message?.split("Error:")?.[1]?.trim() || e?.message;
  }

  // Documents without page content are never written, so they must not be
  // counted as saved files nor cause an empty folder to be created.
  const parseableDocs = docs.filter((doc) => doc.pageContent);
  if (error || parseableDocs.length === 0) {
    return {
      success: false,
      reason:
        error || "No parseable documents found in that Paperless-ngx instance.",
      data: null,
    };
  }

  // The short random suffix keeps repeated imports of the same host apart.
  const outFolder = slugify(
    `paperless-${hostname}-${v4().slice(0, 4)}`
  ).toLowerCase();
  const outFolderPath = path.resolve(documentsFolder, outFolder);
  if (!fs.existsSync(outFolderPath)) {
    fs.mkdirSync(outFolderPath, { recursive: true });
  }

  for (const doc of parseableDocs) {
    const { metadata, pageContent } = doc;
    // slugify() throws on non-string input, so always hand it a string.
    const title = String(metadata.title ?? "Untitled");
    const data = {
      id: v4(),
      url: metadata.url,
      title,
      docAuthor: metadata.correspondent || "Unknown",
      description: `A document from the Paperless-ngx instance at ${origin}`,
      docSource: `paperless-ngx`,
      chunkSource: generateChunkSource(
        { doc, baseUrl: origin, apiToken },
        response.locals.encryptionWorker
      ),
      published: metadata.created,
      wordCount: pageContent.split(" ").length,
      pageContent,
      token_count_estimate: tokenizeString(pageContent),
    };

    console.log(`[Paperless-ngx Loader]: Saving ${title} to ${outFolder}`);

    const fileName = sanitizeFileName(`${slugify(title)}-${data.id}`);
    writeToServerDocuments({
      data,
      filename: fileName,
      destinationOverride: outFolderPath,
    });
  }

  return {
    success: true,
    reason: null,
    data: {
      files: parseableDocs.length,
      destination: outFolder,
    },
  };
}
/**
 * Build the chunkSource string for one Paperless-ngx document so the document
 * can be resynced later. The connection details (base URL and API token) are
 * JSON-serialized, encrypted with the given worker and attached as the
 * `payload` query parameter of a `paperless-ngx://<document id>` URI.
 * @param {object} chunkSourceInformation
 * @param {object} chunkSourceInformation.doc - loaded document; `doc.metadata.id` identifies it
 * @param {string} chunkSourceInformation.baseUrl - base URL stored in the payload
 * @param {string} chunkSourceInformation.apiToken - API token stored in the payload as `token`
 * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker
 * @returns {string}
 */
function generateChunkSource({ doc, baseUrl, apiToken }, encryptionWorker) {
  const documentId = doc.metadata.id;
  const serializedCredentials = JSON.stringify({ baseUrl, token: apiToken });
  const encryptedPayload = encryptionWorker.encrypt(serializedCredentials);
  return `paperless-ngx://${documentId}?payload=${encryptedPayload}`;
}
// Public surface of this module: only the loader entry point is exported.
module.exports = {
loadPaperlessNgx,
};