const { htmlToText } = require("html-to-text");
const pdf = require("pdf-parse");

/**
 * Loads documents from a Paperless-ngx instance through its HTTP API and
 * converts each one into a `{ pageContent, metadata }` record.
 */
class PaperlessNgxLoader {
  /**
   * @param {Object} options
   * @param {string} options.baseUrl - URL of the Paperless-ngx instance. Only
   *   its origin (scheme + host + port) is kept; any path or query is dropped.
   * @param {string} options.apiToken - API token sent as `Authorization: Token <apiToken>`.
   * @throws {TypeError} If `baseUrl` cannot be parsed by `new URL()`.
   */
  constructor({ baseUrl, apiToken }) {
    this.baseUrl = new URL(baseUrl).origin;
    this.apiToken = apiToken;
    this.baseHeaders = {
      Authorization: `Token ${this.apiToken}`,
    };
  }

  /**
   * Fetches every document (with content) and maps it to the loader's output shape.
   * @returns {Promise<Array<{pageContent: string, metadata: Object}>>}
   * @throws {Error} Logs and rethrows any error raised while fetching.
   */
  async load() {
    try {
      const documents = await this.fetchAllDocuments();
      return documents.map((doc) => this.createDocumentFromPage(doc));
    } catch (error) {
      console.error("Error:", error);
      throw error;
    }
  }

  /**
   * Fetches all documents from Paperless-ngx, following the `next` link of each
   * page, then downloads the content of every document in parallel.
   * Documents whose content is empty (including failed downloads) are dropped.
   * A failure while fetching a page is logged and stops pagination; documents
   * collected so far are still returned.
   * @returns {Promise<Array<{[key: string]: any, content: string}>>} The documents with their content
   * @throws {Error} Wraps any unexpected failure; the original error is attached as `cause`.
   */
  async fetchAllDocuments() {
    try {
      const documents = [];
      let nextUrl = `${this.baseUrl}/api/documents/`;
      let page = 1;
      // Counted separately from `page` so the summary log stays accurate no
      // matter which branch terminates the loop.
      let pagesFetched = 0;

      while (nextUrl) {
        console.log(`Fetching documents page ${page} from Paperless-ngx`);
        try {
          const res = await fetch(nextUrl, {
            headers: {
              "Content-Type": "application/json",
              ...this.baseHeaders,
            },
          });
          if (!res.ok)
            throw new Error(
              `Failed to fetch documents from Paperless-ngx: ${res.status}`
            );
          const data = await res.json();

          const validResults = data.results.filter((doc) => doc?.id);
          if (!validResults.length) break;

          documents.push(...validResults);
          pagesFetched++;

          // Guard against a server that keeps pointing at the same page.
          if (data.next === nextUrl) break;
          nextUrl = data.next || null;
          page++;
        } catch (error) {
          console.error(
            `Error fetching page ${page} from Paperless-ngx:`,
            error
          );
          break;
        }
      }

      console.log(
        `Fetched ${documents.length} documents from Paperless-ngx (Pages: ${pagesFetched})`
      );

      const documentsWithContent = await Promise.all(
        documents.map(async (doc) => {
          const content = await this.fetchDocumentContent(doc.id);
          return { ...doc, content };
        })
      );

      return documentsWithContent.filter((doc) => !!doc.content);
    } catch (error) {
      throw new Error(
        `Failed to fetch documents from Paperless-ngx: ${error.message}`,
        { cause: error }
      );
    }
  }

  /**
   * Fetches the content of a document from Paperless-ngx.
   * PDF responses are run through the PDF parser; every other media type is
   * read as text.
   * @param {string|number} documentId - The ID of the document to fetch
   * @returns {Promise<string>} The content of the document, or "" if the
   *   download or parsing failed (the failure is logged, not thrown).
   */
  async fetchDocumentContent(documentId) {
    try {
      const response = await fetch(
        `${this.baseUrl}/api/documents/${documentId}/download/`,
        {
          headers: this.baseHeaders,
        }
      );

      if (!response.ok)
        throw new Error(`Failed to fetch document content: ${response.status}`);

      // The header may carry parameters ("application/pdf; charset=...") or
      // differ in case, so compare only the normalized media type.
      const rawContentType = response.headers.get("content-type") ?? "";
      const mediaType = rawContentType.split(";")[0].trim().toLowerCase();

      if (mediaType === "application/pdf") {
        const buffer = await response.arrayBuffer();
        return await this.parsePdfContent(buffer);
      }

      return await response.text();
    } catch (error) {
      console.error(
        `Failed to fetch content for document ${documentId}:`,
        error
      );
      return "";
    }
  }

  /**
   * Extracts the text of a PDF.
   * @param {ArrayBuffer} buffer - Raw PDF bytes.
   * @returns {Promise<string>} The extracted text, or "" if parsing failed
   *   (the failure is logged, not thrown).
   */
  async parsePdfContent(buffer) {
    try {
      const data = await pdf(Buffer.from(buffer));
      return data.text;
    } catch (error) {
      console.error("Failed to parse PDF content:", error);
      return "";
    }
  }

  /**
   * Converts a fetched document into the loader's output record.
   * The content is passed through `htmlToText` with word wrapping disabled
   * and newlines preserved.
   * @param {{[key: string]: any, content?: string}} doc - A document as returned by `fetchAllDocuments`.
   * @returns {{pageContent: string, metadata: Object}}
   */
  createDocumentFromPage(doc) {
    const content = doc.content || "";
    const plainTextContent = htmlToText(content, {
      wordwrap: false,
      preserveNewlines: true,
    });

    return {
      pageContent: plainTextContent,
      metadata: {
        id: doc.id,
        title: doc.original_file_name,
        created: doc.created,
        modified: doc.modified,
        added: doc.added,
        tags: doc.tags,
        correspondent: doc.correspondent,
        documentType: doc.document_type,
        url: `${this.baseUrl}/documents/${doc.id}`,
      },
    };
  }
}

module.exports = PaperlessNgxLoader;