const { v4 } = require("uuid"); const xlsx = require("node-xlsx").default; const path = require("path"); const fs = require("fs"); const { createdDate, trashFile, writeToServerDocuments, documentsFolder, } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); function convertToCSV(data) { return data .map((row) => row .map((cell) => { if (cell === null || cell === undefined) return ""; if (typeof cell === "string" && cell.includes(",")) return `"${cell}"`; return cell; }) .join(",") ) .join("\n"); } async function asXlsx({ fullFilePath = "", filename = "", options = {}, metadata = {}, }) { const documents = []; try { const workSheetsFromFile = xlsx.parse(fullFilePath); if (options.parseOnly) { const allSheetContents = []; let totalWordCount = 0; const sheetNames = []; for (const sheet of workSheetsFromFile) { const processed = processSheet(sheet); if (!processed) continue; const { name, content, wordCount } = processed; sheetNames.push(name); allSheetContents.push(`\nSheet: ${name}\n${content}`); totalWordCount += wordCount; } if (allSheetContents.length === 0) { console.log(`No valid sheets found in ${filename}.`); return { success: false, reason: `No valid sheets found in ${filename}.`, documents: [], }; } const combinedContent = allSheetContents.join("\n"); const sheetListText = sheetNames.length > 1 ? ` (Sheets: ${sheetNames.join(", ")})` : ` (Sheet: ${sheetNames[0]})`; const combinedData = { id: v4(), url: `file://${fullFilePath}`, title: metadata.title || `${filename}${sheetListText}`, docAuthor: metadata.docAuthor || "Unknown", description: metadata.description || `Spreadsheet data from ${filename} containing ${sheetNames.length} ${ sheetNames.length === 1 ? "sheet" : "sheets" }`, docSource: metadata.docSource || "an xlsx file uploaded by the user.", chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: totalWordCount, pageContent: combinedContent, token_count_estimate: tokenizeString(combinedContent), }; const document = writeToServerDocuments({ data: combinedData, filename: `${slugify(path.basename(filename))}-${combinedData.id}`, destinationOverride: null, options: { parseOnly: true }, }); documents.push(document); console.log(`[SUCCESS]: ${filename} converted & ready for embedding.`); } else { const folderName = slugify( `${path.basename(filename)}-${v4().slice(0, 4)}`, { lower: true, trim: true, } ); const outFolderPath = path.resolve(documentsFolder, folderName); if (!fs.existsSync(outFolderPath)) fs.mkdirSync(outFolderPath, { recursive: true }); for (const sheet of workSheetsFromFile) { const processed = processSheet(sheet); if (!processed) continue; const { name, content, wordCount } = processed; const sheetData = { id: v4(), url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`, title: metadata.title || `${filename} - Sheet:${name}`, docAuthor: metadata.docAuthor || "Unknown", description: metadata.description || `Spreadsheet data from sheet: ${name}`, docSource: metadata.docSource || "an xlsx file uploaded by the user.", chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), wordCount: wordCount, pageContent: content, token_count_estimate: tokenizeString(content), }; const document = writeToServerDocuments({ data: sheetData, filename: `sheet-${slugify(name)}`, destinationOverride: outFolderPath, options: { parseOnly: options.parseOnly }, }); documents.push(document); console.log( `[SUCCESS]: Sheet "${name}" converted & ready for embedding.` ); } } } catch (err) { console.error("Could not process xlsx file!", err); return { success: false, reason: `Error processing ${filename}: ${err.message}`, documents: [], }; } finally { if (!options.absolutePath) trashFile(fullFilePath); } if (documents.length === 0) { console.error(`No valid sheets found in ${filename}.`); return { success: false, reason: `No valid sheets found in ${filename}.`, documents: [], }; } console.log( `[SUCCESS]: ${filename} fully processed. Created ${documents.length} document(s).\n` ); return { success: true, reason: null, documents }; } /** * Processes a single sheet and returns its content and metadata * @param {{name: string, data: Array>}} sheet - Parsed sheet with name and 2D array of cell values * @returns {{name: string, content: string, wordCount: number}|null} - Object with name, CSV content, and word count, or null if sheet is empty */ function processSheet(sheet) { try { const { name, data } = sheet; const content = convertToCSV(data); if (!content?.length) { console.log(`Sheet "${name}" is empty. Skipping.`); return null; } console.log(`-- Processing sheet: ${name} --`); return { name, content, wordCount: content.split(/\s+/).length, }; } catch (err) { console.error(`Error processing sheet "${sheet.name}":`, err); return null; } } module.exports = asXlsx;