diff --git a/collector/processSingleFile/convert/asXlsx.js b/collector/processSingleFile/convert/asXlsx.js index c01bc866..4601ceff 100644 --- a/collector/processSingleFile/convert/asXlsx.js +++ b/collector/processSingleFile/convert/asXlsx.js @@ -7,7 +7,6 @@ const { trashFile, writeToServerDocuments, documentsFolder, - directUploadsFolder, } = require("../../utils/files"); const { tokenizeString } = require("../../utils/tokenizer"); const { default: slugify } = require("slugify"); @@ -34,30 +33,83 @@ async function asXlsx({ metadata = {}, }) { const documents = []; - const folderName = slugify(`${path.basename(filename)}-${v4().slice(0, 4)}`, { - lower: true, - trim: true, - }); - const outFolderPath = options.parseOnly - ? path.resolve(directUploadsFolder, folderName) - : path.resolve(documentsFolder, folderName); try { const workSheetsFromFile = xlsx.parse(fullFilePath); - if (!fs.existsSync(outFolderPath)) - fs.mkdirSync(outFolderPath, { recursive: true }); - for (const sheet of workSheetsFromFile) { - try { - const { name, data } = sheet; - const content = convertToCSV(data); + if (options.parseOnly) { + const allSheetContents = []; + let totalWordCount = 0; + const sheetNames = []; - if (!content?.length) { - console.warn(`Sheet "${name}" is empty. Skipping.`); - continue; + for (const sheet of workSheetsFromFile) { + const processed = processSheet(sheet); + if (!processed) continue; + + const { name, content, wordCount } = processed; + sheetNames.push(name); + allSheetContents.push(`\nSheet: ${name}\n${content}`); + totalWordCount += wordCount; + } + + if (allSheetContents.length === 0) { + console.log(`No valid sheets found in ${filename}.`); + return { + success: false, + reason: `No valid sheets found in ${filename}.`, + documents: [], + }; + } + + const combinedContent = allSheetContents.join("\n"); + const sheetListText = + sheetNames.length > 1 + ? ` (Sheets: ${sheetNames.join(", ")})` + : ` (Sheet: ${sheetNames[0]})`; + + const combinedData = { + id: v4(), + url: `file://${fullFilePath}`, + title: metadata.title || `${filename}${sheetListText}`, + docAuthor: metadata.docAuthor || "Unknown", + description: + metadata.description || + `Spreadsheet data from ${filename} containing ${sheetNames.length} ${ + sheetNames.length === 1 ? "sheet" : "sheets" + }`, + docSource: metadata.docSource || "an xlsx file uploaded by the user.", + chunkSource: metadata.chunkSource || "", + published: createdDate(fullFilePath), + wordCount: totalWordCount, + pageContent: combinedContent, + token_count_estimate: tokenizeString(combinedContent), + }; + + const document = writeToServerDocuments({ + data: combinedData, + filename: `${slugify(path.basename(filename))}-${combinedData.id}`, + destinationOverride: null, + options: { parseOnly: true }, + }); + documents.push(document); + console.log(`[SUCCESS]: ${filename} converted & ready for embedding.`); + } else { + const folderName = slugify( + `${path.basename(filename)}-${v4().slice(0, 4)}`, + { + lower: true, + trim: true, } + ); + const outFolderPath = path.resolve(documentsFolder, folderName); + if (!fs.existsSync(outFolderPath)) + fs.mkdirSync(outFolderPath, { recursive: true }); - console.log(`-- Processing sheet: ${name} --`); + for (const sheet of workSheetsFromFile) { + const processed = processSheet(sheet); + if (!processed) continue; + + const { name, content, wordCount } = processed; const sheetData = { id: v4(), url: `file://${path.join(outFolderPath, `${slugify(name)}.csv`)}`, @@ -68,7 +120,7 @@ async function asXlsx({ docSource: metadata.docSource || "an xlsx file uploaded by the user.", chunkSource: metadata.chunkSource || "", published: createdDate(fullFilePath), - wordCount: content.split(/\s+/).length, + wordCount: wordCount, pageContent: content, token_count_estimate: tokenizeString(content), }; @@ -83,9 +135,6 @@ async function asXlsx({ console.log( `[SUCCESS]: Sheet "${name}" converted & ready for embedding.` ); - } catch (err) { - console.error(`Error processing sheet "${name}":`, err); - continue; } } } catch (err) { @@ -114,4 +163,31 @@ async function asXlsx({ return { success: true, reason: null, documents }; } +/** + * Processes a single sheet and returns its content and metadata + * @param {{name: string, data: Array>}} sheet - Parsed sheet with name and 2D array of cell values + * @returns {{name: string, content: string, wordCount: number}|null} - Object with name, CSV content, and word count, or null if sheet is empty + */ +function processSheet(sheet) { + try { + const { name, data } = sheet; + const content = convertToCSV(data); + + if (!content?.length) { + console.log(`Sheet "${name}" is empty. Skipping.`); + return null; + } + + console.log(`-- Processing sheet: ${name} --`); + return { + name, + content, + wordCount: content.split(/\s+/).length, + }; + } catch (err) { + console.error(`Error processing sheet "${sheet.name}":`, err); + return null; + } +} + module.exports = asXlsx;