const { validYoutubeVideoUrl } = require("../../../url"); /* * This is just a custom implementation of the Langchain JS YouTubeLoader class * as the dependency for YoutubeTranscript is quite fickle and its a rat race to keep it up * and instead of waiting for patches we can just bring this simple script in-house and at least * be able to patch it since its so flaky. When we have more connectors we can kill this because * it will be a pain to maintain over time. */ class YoutubeLoader { #videoId; #language; #addVideoInfo; constructor({ videoId = null, language = null, addVideoInfo = false } = {}) { if (!videoId) throw new Error("Invalid video id!"); this.#videoId = videoId; this.#language = language; this.#addVideoInfo = addVideoInfo; } /** * Extracts the videoId from a YouTube video URL. * @param url The URL of the YouTube video. * @returns The videoId of the YouTube video. */ static getVideoID(url) { const videoId = validYoutubeVideoUrl(url, true); if (videoId) return videoId; throw new Error("Failed to get youtube video id from the url"); } /** * Creates a new instance of the YoutubeLoader class from a YouTube video * URL. * @param url The URL of the YouTube video. * @param config Optional configuration options for the YoutubeLoader instance, excluding the videoId. * @returns A new instance of the YoutubeLoader class. */ static createFromUrl(url, config = {}) { const videoId = YoutubeLoader.getVideoID(url); return new YoutubeLoader({ ...config, videoId }); } /** * Loads the transcript and video metadata from the specified YouTube * video. It uses the youtube-transcript library to fetch the transcript * and the youtubei.js library to fetch the video metadata. * @returns Langchain like doc that is 1 element with PageContent and */ async load() { let transcript; const metadata = { source: this.#videoId, }; try { const fetchTranscript = await import("youtube-transcript-plus").then( (module) => module.fetchTranscript ); const transcriptSegments = await fetchTranscript(this.#videoId, { lang: this.#language, }); if (!transcriptSegments || transcriptSegments.length === 0) throw new Error("Transcription not found"); transcript = this.#convertTranscriptSegmentsToText(transcriptSegments); if (this.#addVideoInfo) { const { Innertube } = require("youtubei.js"); const youtube = await Innertube.create(); const info = (await youtube.getBasicInfo(this.#videoId)).basic_info; metadata.description = info.short_description; metadata.title = info.title; metadata.view_count = info.view_count; metadata.author = info.author; } } catch (e) { throw new Error( `Failed to get YouTube video transcription: ${e?.message}` ); } return [ { pageContent: transcript, metadata, }, ]; } #convertTranscriptSegmentsToText(transcriptSegments) { return transcriptSegments .map((segment) => typeof segment === "string" ? segment : segment.text || "" ) .join(" ") .replace(/\s+/g, " ") .trim(); } } module.exports.YoutubeLoader = YoutubeLoader;