merlyn/collector/processRawText/index.js
Marcello Fitton 8f33203ade
chore: add ESLint to /collector (#5128)
* add eslint config to /collector

* prettier formatting

* fix unused

* fix undefined

* disable lines

* lockfile

---------

Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2026-03-05 16:25:23 -08:00

84 lines
2.7 KiB
JavaScript

const { v4 } = require("uuid");
const { writeToServerDocuments } = require("../utils/files");
const { tokenizeString } = require("../utils/tokenizer");
const { default: slugify } = require("slugify");
/**
 * Turns a title / file name into a lowercase slug.
 *
 * When `input` contains at least one ".", everything after the last "." is
 * dropped (treated as the extension) and the remaining dot-separated parts
 * are joined with "-" before being slugified. Otherwise the whole input is
 * slugified as-is.
 *
 * @param {string} input - Title or file name to convert.
 * @returns {string} Result of `slugify` called with `{ lower: true }`.
 */
function stripAndSlug(input) {
  const slugOptions = { lower: true };

  if (input.includes(".")) {
    const segments = input.split(".");
    segments.pop(); // discard the trailing ".extension" part
    return slugify(segments.join("-"), slugOptions);
  }

  return slugify(input, slugOptions);
}
/**
 * Resolvers for the metadata fields stored alongside a raw-text document.
 *
 * Every entry in `possible` receives the caller-supplied metadata object and
 * returns the string to store for that field, substituting a default when
 * the supplied value is missing or not usable.
 */
const METADATA_KEYS = {
  possible: {
    // `web://<lowercased url>.website` for http(s) URLs, otherwise a
    // `file://<slugged title>.txt` placeholder.
    url: ({ url, title }) => {
      let validUrl;
      try {
        const u = new URL(url);
        validUrl = ["https:", "http:"].includes(u.protocol);
      } catch {
        // `url` could not be parsed: leave `validUrl` unset so the file://
        // fallback below is used.
      }
      if (validUrl) return `web://${url.toLowerCase()}.website`;
      return `file://${stripAndSlug(title)}.txt`;
    },

    // Slugged title with a `.txt` suffix.
    title: ({ title }) => `${stripAndSlug(title)}.txt`,

    // Author as given, or a placeholder when it is not a string.
    docAuthor: ({ docAuthor }) => {
      return typeof docAuthor === "string" ? docAuthor : "no author specified";
    },

    // Description as given, or a placeholder when it is not a string.
    description: ({ description }) => {
      return typeof description === "string"
        ? description
        : "no description found";
    },

    // Source as given, or a placeholder when it is not a string.
    docSource: ({ docSource }) => {
      return typeof docSource === "string" ? docSource : "no source set";
    },

    // Chunk source as given, or the slugged title with `.txt` when it is not
    // a string.
    chunkSource: ({ chunkSource, title }) => {
      return typeof chunkSource === "string"
        ? chunkSource
        : `${stripAndSlug(title)}.txt`;
    },

    // Locale-formatted publish date. `published` is read as a millisecond
    // timestamp (number or numeric string); the current time is used when it
    // is missing, blank, non-numeric, or outside the range a Date can hold.
    published: ({ published }) => {
      // `Number(null)`, `Number("")` and `Number("  ")` are all 0, which
      // would silently become the 1970 epoch - treat those as "not provided".
      const isBlank =
        published == null ||
        (typeof published === "string" && published.trim() === "");
      const timestamp = isBlank ? NaN : Number(published);
      const parsed = new Date(timestamp);
      // An invalid Date (NaN or out-of-range timestamp) would otherwise be
      // stored as the literal string "Invalid Date".
      const date = Number.isNaN(parsed.getTime()) ? new Date() : parsed;
      return date.toLocaleString();
    },
  },
};
/**
 * Builds a document record from raw text plus caller-supplied metadata and
 * hands it to `writeToServerDocuments`.
 *
 * @param {string} textContent - Raw text of the document.
 * @param {object} metadata - Caller-supplied metadata. `title` is read
 *   directly; the remaining fields are resolved via `METADATA_KEYS.possible`.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   A failure result with no documents when `textContent` is empty, otherwise
 *   a success result wrapping the value returned by `writeToServerDocuments`.
 */
async function processRawText(textContent, metadata) {
  console.log(`-- Working Raw Text doc ${metadata.title} --`);

  const hasContent = Boolean(textContent) && textContent.length !== 0;
  if (!hasContent) {
    return {
      success: false,
      reason: "textContent was empty - nothing to process.",
      documents: [],
    };
  }

  // Fields are added in a fixed order so the stored record keeps the same
  // key order: id, resolved metadata, then the content-derived fields.
  const resolvers = METADATA_KEYS.possible;
  const resolvedFields = [
    "url",
    "title",
    "docAuthor",
    "description",
    "docSource",
    "chunkSource",
    "published",
  ];

  const data = { id: v4() };
  for (const field of resolvedFields) {
    data[field] = resolvers[field](metadata);
  }
  // Word count is the number of single-space separated segments.
  data.wordCount = textContent.split(" ").length;
  data.pageContent = textContent;
  data.token_count_estimate = tokenizeString(textContent);

  const filename = `raw-${stripAndSlug(metadata.title)}-${data.id}`;
  const document = writeToServerDocuments({ data, filename });

  console.log(
    `[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`
  );
  return { success: true, reason: null, documents: [document] };
}
// Only `processRawText` is exported from this module.
module.exports = { processRawText };