const { v4 } = require("uuid");
const officeParser = require("officeparser");
const {
  createdDate,
  trashFile,
  writeToServerDocuments,
} = require("../../utils/files");
const { tokenizeString } = require("../../utils/tokenizer");
const { default: slugify } = require("slugify");

/**
 * Extracts the text of an office (or office-like) file via
 * `officeParser.parseOfficeAsync` and hands the resulting document record to
 * `writeToServerDocuments`.
 *
 * The source file is passed to `trashFile` on both the "no text" failure path
 * and the success path.
 *
 * @param {Object} params
 * @param {string} [params.fullFilePath=""] - Path of the file to parse.
 * @param {string} [params.filename=""] - Name used in log messages, as the
 *   fallback title, and (slugified) in the output document filename.
 * @param {Object} [params.options={}] - Only `options.parseOnly` is read here;
 *   it is forwarded unchanged to `writeToServerDocuments`.
 * @param {Object} [params.metadata={}] - Optional overrides for `title`,
 *   `docAuthor`, `description`, `docSource` and `chunkSource`; falsy values
 *   fall back to the built-in defaults.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success: false` with a reason and no documents when no text could be
 *   extracted; otherwise `success: true` with the single written document.
 */
async function asOfficeMime({
  fullFilePath = "",
  filename = "",
  options = {},
  metadata = {},
}) {
  console.log(`-- Working ${filename} --`);

  let content = "";
  try {
    content = await officeParser.parseOfficeAsync(fullFilePath);
  } catch (error) {
    // Parsing is best-effort: a failure leaves `content` empty so the guard
    // below reports it as a file without text.
    console.error(`Could not parse office or office-like file`, error);
  }

  // Only a non-blank string counts as usable text. This also covers a parser
  // that resolves with null/undefined (which would otherwise throw on
  // `.length`) and files whose extracted text is nothing but whitespace.
  const hasText = typeof content === "string" && content.trim().length > 0;
  if (!hasText) {
    console.error(`Resulting text content was empty for ${filename}.`);
    trashFile(fullFilePath);
    return {
      success: false,
      reason: `No text content found in ${filename}.`,
      documents: [],
    };
  }

  const data = {
    id: v4(),
    url: `file://${fullFilePath}`,
    title: metadata.title || filename,
    docAuthor: metadata.docAuthor || "no author found",
    description: metadata.description || "No description found.",
    docSource: metadata.docSource || "Office file uploaded by the user.",
    chunkSource: metadata.chunkSource || "",
    published: createdDate(fullFilePath),
    wordCount: content.split(" ").length,
    pageContent: content,
    token_count_estimate: tokenizeString(content),
  };

  const document = writeToServerDocuments({
    data,
    filename: `${slugify(filename)}-${data.id}`,
    options: { parseOnly: options.parseOnly },
  });

  trashFile(fullFilePath);
  console.log(`[SUCCESS]: ${filename} converted & ready for embedding.\n`);
  return { success: true, reason: null, documents: [document] };
}

module.exports = asOfficeMime;