merlyn/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js
2026-01-02 15:41:22 -08:00

100 lines
3.3 KiB
JavaScript

const { validYoutubeVideoUrl } = require("../../../url");
/*
* This is just a custom implementation of the Langchain JS YouTubeLoader class,
* as the dependency for YoutubeTranscript is quite fickle and it's a rat race to keep it up.
* Instead of waiting for patches, we can just bring this simple script in-house and at least
* be able to patch it, since it's so flaky. When we have more connectors we can kill this because
* it will be a pain to maintain over time.
*/
/**
 * Loads the transcript of a single YouTube video (and, optionally, some basic
 * video metadata) into a one-element, Langchain-style document array.
 */
class YoutubeLoader {
  #videoId;
  #language;
  #addVideoInfo;

  /**
   * @param {Object} [config]
   * @param {string|null} [config.videoId] The id of the video to load. Required.
   * @param {string|null} [config.language] Forwarded as the `lang` option to the transcript fetcher.
   * @param {boolean} [config.addVideoInfo] When true, `load()` also adds description, title,
   * view_count and author to the document metadata.
   * @throws {Error} If no videoId is provided.
   */
  constructor({ videoId = null, language = null, addVideoInfo = false } = {}) {
    if (!videoId) throw new Error("Invalid video id!");
    this.#videoId = videoId;
    this.#language = language;
    this.#addVideoInfo = addVideoInfo;
  }

  /**
   * Extracts the videoId from a YouTube video URL.
   * @param url The URL of the YouTube video.
   * @returns The videoId of the YouTube video.
   * @throws {Error} If no video id could be derived from the url.
   */
  static getVideoID(url) {
    const videoId = validYoutubeVideoUrl(url, true);
    if (videoId) return videoId;
    throw new Error("Failed to get youtube video id from the url");
  }

  /**
   * Creates a new instance of the YoutubeLoader class from a YouTube video
   * URL.
   * @param url The URL of the YouTube video.
   * @param config Optional configuration options for the YoutubeLoader instance, excluding the videoId.
   * @returns A new instance of the YoutubeLoader class.
   */
  static createFromUrl(url, config = {}) {
    const videoId = YoutubeLoader.getVideoID(url);
    // videoId is spread last so it always wins over anything in `config`.
    return new YoutubeLoader({ ...config, videoId });
  }

  /**
   * Loads the transcript and (optionally) video metadata for the configured
   * YouTube video. It uses the youtube-transcript-plus library to fetch the
   * transcript and the youtubei.js library to fetch the video metadata.
   * @returns {Promise<Array<{pageContent: string, metadata: Object}>>} Langchain-like
   * doc array with exactly 1 element holding `pageContent` (the transcript text)
   * and `metadata` (always `source`; plus description, title, view_count and
   * author when `addVideoInfo` is enabled).
   * @throws {Error} If the transcript is missing/empty or any fetch fails. The
   * underlying error is attached as `cause`.
   */
  async load() {
    let transcript;
    const metadata = {
      source: this.#videoId,
    };

    try {
      // Loaded lazily via dynamic import() so the dependency is only resolved on use.
      const { fetchTranscript } = await import("youtube-transcript-plus");
      const transcriptSegments = await fetchTranscript(this.#videoId, {
        lang: this.#language,
      });

      if (!transcriptSegments || transcriptSegments.length === 0)
        throw new Error("Transcription not found");

      transcript = this.#convertTranscriptSegmentsToText(transcriptSegments);

      if (this.#addVideoInfo) {
        const { Innertube } = require("youtubei.js");
        const youtube = await Innertube.create();
        const info = (await youtube.getBasicInfo(this.#videoId)).basic_info;
        metadata.description = info.short_description;
        metadata.title = info.title;
        metadata.view_count = info.view_count;
        metadata.author = info.author;
      }
    } catch (e) {
      // Keep the original error reachable via `cause` so its stack/code are not
      // lost, and fall back to the raw value when a non-Error was thrown.
      throw new Error(
        `Failed to get YouTube video transcription: ${e?.message ?? e}`,
        { cause: e }
      );
    }

    return [
      {
        pageContent: transcript,
        metadata,
      },
    ];
  }

  /**
   * Flattens transcript segments into a single whitespace-normalized string.
   * @param {Array<string|{text?: string}>} transcriptSegments Segments that are
   * either plain strings or objects carrying a `text` property.
   * @returns {string} Segment texts joined by single spaces, with runs of
   * whitespace collapsed and leading/trailing whitespace removed.
   */
  #convertTranscriptSegmentsToText(transcriptSegments) {
    return transcriptSegments
      .map((segment) =>
        typeof segment === "string" ? segment : segment.text || ""
      )
      .join(" ")
      .replace(/\s+/g, " ")
      .trim();
  }
}
module.exports.YoutubeLoader = YoutubeLoader;