* add eslint config to /collector * prettier formatting * fix unused * fix undefined * disable lines * lockfile --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
84 lines
2.7 KiB
JavaScript
84 lines
2.7 KiB
JavaScript
const { v4 } = require("uuid");
|
|
const { writeToServerDocuments } = require("../utils/files");
|
|
const { tokenizeString } = require("../utils/tokenizer");
|
|
const { default: slugify } = require("slugify");
|
|
|
|
/**
 * Builds a lowercase slug from a title or filename.
 *
 * When the input contains at least one ".", everything after the last "." is
 * dropped (treated as a file extension) and any remaining "." separators are
 * replaced with "-" before the text is slugified.
 *
 * @param {string} input - Title or filename to convert. Non-string values are
 *   coerced with String(); null/undefined are treated as an empty string so a
 *   missing title does not throw.
 * @returns {string} The result of `slugify(text, { lower: true })`.
 */
function stripAndSlug(input) {
  // Normalize first: calling .includes()/.split() on undefined or a number throws.
  const text = input == null ? "" : String(input);
  if (!text.includes(".")) return slugify(text, { lower: true });

  // Drop only the last dot-separated segment; earlier dots become dashes.
  const withoutExtension = text.split(".").slice(0, -1).join("-");
  return slugify(withoutExtension, { lower: true });
}
|
|
|
|
/**
 * Normalizers for the metadata fields stored with a raw-text document.
 *
 * Each entry under `possible` receives the caller-supplied metadata object and
 * returns the string stored for that field, substituting a fallback when the
 * supplied value is missing or unusable.
 */
const METADATA_KEYS = {
  possible: {
    // http(s) URLs become `web://<lowercased url>.website`; anything else
    // falls back to a `file://` identifier derived from the title.
    url: ({ url, title }) => {
      let validUrl = false;
      try {
        const parsed = new URL(url);
        validUrl = ["https:", "http:"].includes(parsed.protocol);
      } catch {
        // Missing or unparseable URL: intentionally ignored so the file://
        // fallback below is used instead of failing the whole document.
      }

      if (validUrl) return `web://${url.toLowerCase()}.website`;
      return `file://${stripAndSlug(title)}.txt`;
    },
    title: ({ title }) => `${stripAndSlug(title)}.txt`,
    docAuthor: ({ docAuthor }) => {
      return typeof docAuthor === "string" ? docAuthor : "no author specified";
    },
    description: ({ description }) => {
      return typeof description === "string"
        ? description
        : "no description found";
    },
    docSource: ({ docSource }) => {
      return typeof docSource === "string" ? docSource : "no source set";
    },
    chunkSource: ({ chunkSource, title }) => {
      return typeof chunkSource === "string"
        ? chunkSource
        : `${stripAndSlug(title)}.txt`;
    },
    // `published` is read as a millisecond timestamp (number or numeric
    // string). Anything else falls back to the current time.
    published: ({ published }) => {
      // Only numbers and non-blank strings count as timestamps. Number() would
      // otherwise coerce null, "", false and [] to 0 and stamp the document
      // with the Unix epoch.
      const looksNumeric =
        typeof published === "number" ||
        (typeof published === "string" && published.trim() !== "");
      const parsed = new Date(looksNumeric ? Number(published) : NaN);

      // getTime() is NaN for non-numeric text and for values outside the
      // representable Date range, which would print as "Invalid Date".
      if (Number.isNaN(parsed.getTime())) return new Date().toLocaleString();
      return parsed.toLocaleString();
    },
  },
};
|
|
|
|
/**
 * Saves raw text together with normalized metadata as a server document
 * via `writeToServerDocuments`.
 *
 * @param {string} textContent - Raw text body of the document.
 * @param {object} metadata - Caller-supplied metadata. `title` must be a
 *   string; `url`, `docAuthor`, `description`, `docSource`, `chunkSource` and
 *   `published` are optional and receive fallbacks from METADATA_KEYS.
 * @returns {Promise<{success: boolean, reason: (string|null), documents: Array}>}
 *   `success: false` with a `reason` and no documents when the input is
 *   unusable; otherwise `success: true` with the single written document.
 */
async function processRawText(textContent, metadata) {
  // The title feeds the log line, the stored fields and the filename. Reject a
  // missing one up front instead of throwing a TypeError halfway through.
  if (!metadata || typeof metadata.title !== "string") {
    return {
      success: false,
      reason: "metadata.title must be a string - nothing to process.",
      documents: [],
    };
  }

  console.log(`-- Working Raw Text doc ${metadata.title} --`);
  if (!textContent || textContent.length === 0) {
    return {
      success: false,
      reason: "textContent was empty - nothing to process.",
      documents: [],
    };
  }

  // Split on any whitespace run so newlines, tabs and repeated spaces do not
  // skew the count; filter drops the empty token left by blank input.
  const words = textContent.trim().split(/\s+/).filter(Boolean);

  const data = {
    id: v4(),
    url: METADATA_KEYS.possible.url(metadata),
    title: METADATA_KEYS.possible.title(metadata),
    docAuthor: METADATA_KEYS.possible.docAuthor(metadata),
    description: METADATA_KEYS.possible.description(metadata),
    docSource: METADATA_KEYS.possible.docSource(metadata),
    chunkSource: METADATA_KEYS.possible.chunkSource(metadata),
    published: METADATA_KEYS.possible.published(metadata),
    wordCount: words.length,
    pageContent: textContent,
    token_count_estimate: tokenizeString(textContent),
  };

  const document = writeToServerDocuments({
    data,
    filename: `raw-${stripAndSlug(metadata.title)}-${data.id}`,
  });
  console.log(
    `[SUCCESS]: Raw text and metadata saved & ready for embedding.\n`
  );
  return { success: true, reason: null, documents: [document] };
}
|
|
|
|
// Only processRawText is exported; stripAndSlug and METADATA_KEYS stay
// private to this module.
module.exports = { processRawText };
|