const fs = require("fs"); const path = require("path"); const { default: slugify } = require("slugify"); const { v4 } = require("uuid"); const { writeToServerDocuments, sanitizeFileName, documentsFolder, isWithin, } = require("../../files"); const { tokenizeString } = require("../../tokenizer"); const { YoutubeLoader } = require("./YoutubeLoader"); const { validYoutubeVideoUrl } = require("../../url"); /** * Fetch the transcript content for a YouTube video * @param {string} url - The URL of the YouTube video * @returns {Promise<{success: boolean, reason: string|null, content: string|null, metadata: TranscriptMetadata}>} - The transcript content for the YouTube video */ async function fetchVideoTranscriptContent({ url }) { if (!validYoutubeVideoUrl(url)) { return { success: false, reason: "Invalid URL. Should be youtu.be or youtube.com/watch.", content: null, metadata: {}, }; } console.log(`-- Working YouTube ${url} --`); const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true }); const { docs, error } = await loader .load() .then((docs) => ({ docs, error: null })) .catch((e) => ({ docs: [], error: e.message?.split("Error:")?.[1] || e.message, })); if (!docs.length || !!error) { return { success: false, reason: error ?? "No transcript found for that YouTube video.", content: null, metadata: {}, }; } const metadata = docs[0].metadata; const content = docs[0].pageContent; if (!content.length) { return { success: false, reason: "No transcript could be parsed for that YouTube video.", content: null, metadata: {}, }; } return { success: true, reason: null, content, metadata, }; } /** * @typedef {Object} TranscriptMetadata * @property {string} title - The title of the video * @property {string} author - The author of the video * @property {string} description - The description of the video * @property {string} view_count - The view count of the video * @property {string} source - The source of the video (videoId) */ /** * @typedef {Object} TranscriptAsDocument * @property {boolean} success - Whether the transcript was successful * @property {string|null} reason - The reason for the transcript * @property {TranscriptMetadata} metadata - The metadata from the transcript */ /** * @typedef {Object} TranscriptAsContent * @property {boolean} success - Whether the transcript was successful * @property {string|null} reason - The reason for the transcript * @property {string|null} content - The content of the transcript * @property {Object[]} documents - The documents from the transcript * @property {boolean} saveAsDocument - Whether to save the transcript as a document */ /** * Load the transcript content for a YouTube video as well as save it to the server documents * @param {Object} params - The parameters for the YouTube transcript * @param {string} params.url - The URL of the YouTube video * @param {Object} options - The options for the YouTube transcript * @param {boolean} options.parseOnly - Whether to parse the transcript content only or save it to the server documents * @returns {Promise} - The transcript content for the YouTube video */ async function loadYouTubeTranscript({ url }, options = { parseOnly: false }) { const transcriptResults = await fetchVideoTranscriptContent({ url }); if (!transcriptResults.success) { return { success: false, reason: transcriptResults.reason || "An unknown error occurred during transcription retrieval", documents: [], content: null, saveAsDocument: options.parseOnly, data: {}, }; } const { content, metadata } = transcriptResults; if (options.parseOnly) { return { success: true, reason: null, content: buildTranscriptContentWithMetadata(content, metadata), documents: [], saveAsDocument: options.parseOnly, data: {}, }; } const outFolder = sanitizeFileName( slugify(`${metadata.author} YouTube transcripts`).toLowerCase() ); const outFolderPath = path.resolve(documentsFolder, outFolder); const uuid = v4(); const fileName = sanitizeFileName(`${slugify(metadata.title)}-${uuid}`); if (!isWithin(documentsFolder, path.resolve(outFolderPath, fileName))) { console.error( `[YouTube Loader]: Invalid file path ${path.resolve( outFolderPath, fileName )} is not within the documents folder ${documentsFolder}` ); return { success: false, reason: `[YouTube Loader]: Invalid file path ${path.resolve( outFolderPath, fileName )} is not within the documents folder ${documentsFolder}`, documents: [], data: {}, }; } if (!fs.existsSync(outFolderPath)) fs.mkdirSync(outFolderPath, { recursive: true }); const data = { id: uuid, url: url + ".youtube", title: metadata.title || url, docAuthor: metadata.author, description: metadata.description, docSource: url, chunkSource: `youtube://${url}`, published: new Date().toLocaleString(), wordCount: content.split(" ").length, pageContent: content, token_count_estimate: tokenizeString(content), }; console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`); const document = writeToServerDocuments({ data, filename: fileName, destinationOverride: outFolderPath, }); return { success: true, reason: null, documents: [document], data: { title: metadata.title, author: metadata.author, destination: outFolder, }, }; } /** * Generate the transcript content and metadata into a single string * * Why? For ephemeral documents where we just want the content, we want to include the metadata as keys in the content * so that the LLM has context about the video, this gives it a better understanding of the video * and allows it to use the metadata in the conversation if relevant. * Examples: * - How many views does have? * - Checkout and tell me the key points and if it is performing well * - Summarize this video ? -> description could have links and references * @param {string} content - The content of the transcript * @param {TranscriptMetadata} metadata - The metadata from the transcript * @returns {string} - The concatenated transcript content and metadata */ function buildTranscriptContentWithMetadata(content = "", metadata = {}) { const VALID_METADATA_KEYS = ["title", "author", "description", "view_count"]; if (!content || !metadata || Object.keys(metadata).length === 0) return content; let contentWithMetadata = ""; VALID_METADATA_KEYS.forEach((key) => { if (!metadata[key]) return; contentWithMetadata += `<${key}>${metadata[key]}`; }); return `${contentWithMetadata}\nTranscript:\n${content}`; } module.exports = { loadYouTubeTranscript, fetchVideoTranscriptContent, };