220 lines
6.9 KiB
JavaScript
220 lines
6.9 KiB
JavaScript
const fs = require("fs");
|
|
const path = require("path");
|
|
const { default: slugify } = require("slugify");
|
|
const { v4 } = require("uuid");
|
|
const {
|
|
writeToServerDocuments,
|
|
sanitizeFileName,
|
|
documentsFolder,
|
|
isWithin,
|
|
} = require("../../files");
|
|
const { tokenizeString } = require("../../tokenizer");
|
|
const { YoutubeLoader } = require("./YoutubeLoader");
|
|
const { validYoutubeVideoUrl } = require("../../url");
|
|
|
|
/**
|
|
* Fetch the transcript content for a YouTube video
|
|
* @param {string} url - The URL of the YouTube video
|
|
* @returns {Promise<{success: boolean, reason: string|null, content: string|null, metadata: TranscriptMetadata}>} - The transcript content for the YouTube video
|
|
*/
|
|
async function fetchVideoTranscriptContent({ url }) {
|
|
if (!validYoutubeVideoUrl(url)) {
|
|
return {
|
|
success: false,
|
|
reason: "Invalid URL. Should be youtu.be or youtube.com/watch.",
|
|
content: null,
|
|
metadata: {},
|
|
};
|
|
}
|
|
|
|
console.log(`-- Working YouTube ${url} --`);
|
|
const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });
|
|
const { docs, error } = await loader
|
|
.load()
|
|
.then((docs) => ({ docs, error: null }))
|
|
.catch((e) => ({
|
|
docs: [],
|
|
error: e.message?.split("Error:")?.[1] || e.message,
|
|
}));
|
|
|
|
if (!docs.length || !!error) {
|
|
return {
|
|
success: false,
|
|
reason: error ?? "No transcript found for that YouTube video.",
|
|
content: null,
|
|
metadata: {},
|
|
};
|
|
}
|
|
|
|
const metadata = docs[0].metadata;
|
|
const content = docs[0].pageContent;
|
|
if (!content.length) {
|
|
return {
|
|
success: false,
|
|
reason: "No transcript could be parsed for that YouTube video.",
|
|
content: null,
|
|
metadata: {},
|
|
};
|
|
}
|
|
|
|
return {
|
|
success: true,
|
|
reason: null,
|
|
content,
|
|
metadata,
|
|
};
|
|
}
|
|
|
|
/**
|
|
* @typedef {Object} TranscriptMetadata
|
|
* @property {string} title - The title of the video
|
|
* @property {string} author - The author of the video
|
|
* @property {string} description - The description of the video
|
|
* @property {string} view_count - The view count of the video
|
|
* @property {string} source - The source of the video (videoId)
|
|
*/
|
|
|
|
/**
|
|
* @typedef {Object} TranscriptAsDocument
|
|
* @property {boolean} success - Whether the transcript was successful
|
|
* @property {string|null} reason - The reason for the transcript
|
|
* @property {TranscriptMetadata} metadata - The metadata from the transcript
|
|
*/
|
|
|
|
/**
|
|
* @typedef {Object} TranscriptAsContent
|
|
* @property {boolean} success - Whether the transcript was successful
|
|
* @property {string|null} reason - The reason for the transcript
|
|
* @property {string|null} content - The content of the transcript
|
|
* @property {Object[]} documents - The documents from the transcript
|
|
* @property {boolean} saveAsDocument - Whether to save the transcript as a document
|
|
*/
|
|
|
|
/**
|
|
* Load the transcript content for a YouTube video as well as save it to the server documents
|
|
* @param {Object} params - The parameters for the YouTube transcript
|
|
* @param {string} params.url - The URL of the YouTube video
|
|
* @param {Object} options - The options for the YouTube transcript
|
|
* @param {boolean} options.parseOnly - Whether to parse the transcript content only or save it to the server documents
|
|
* @returns {Promise<TranscriptAsDocument | TranscriptAsContent>} - The transcript content for the YouTube video
|
|
*/
|
|
async function loadYouTubeTranscript({ url }, options = { parseOnly: false }) {
|
|
const transcriptResults = await fetchVideoTranscriptContent({ url });
|
|
if (!transcriptResults.success) {
|
|
return {
|
|
success: false,
|
|
reason:
|
|
transcriptResults.reason ||
|
|
"An unknown error occurred during transcription retrieval",
|
|
documents: [],
|
|
content: null,
|
|
saveAsDocument: options.parseOnly,
|
|
data: {},
|
|
};
|
|
}
|
|
|
|
const { content, metadata } = transcriptResults;
|
|
|
|
if (options.parseOnly) {
|
|
return {
|
|
success: true,
|
|
reason: null,
|
|
content: buildTranscriptContentWithMetadata(content, metadata),
|
|
documents: [],
|
|
saveAsDocument: options.parseOnly,
|
|
data: {},
|
|
};
|
|
}
|
|
|
|
const outFolder = sanitizeFileName(
|
|
slugify(`${metadata.author} YouTube transcripts`).toLowerCase()
|
|
);
|
|
const outFolderPath = path.resolve(documentsFolder, outFolder);
|
|
const uuid = v4();
|
|
const fileName = sanitizeFileName(`${slugify(metadata.title)}-${uuid}`);
|
|
|
|
if (!isWithin(documentsFolder, path.resolve(outFolderPath, fileName))) {
|
|
console.error(
|
|
`[YouTube Loader]: Invalid file path ${path.resolve(
|
|
outFolderPath,
|
|
fileName
|
|
)} is not within the documents folder ${documentsFolder}`
|
|
);
|
|
return {
|
|
success: false,
|
|
reason: `[YouTube Loader]: Invalid file path ${path.resolve(
|
|
outFolderPath,
|
|
fileName
|
|
)} is not within the documents folder ${documentsFolder}`,
|
|
documents: [],
|
|
data: {},
|
|
};
|
|
}
|
|
|
|
if (!fs.existsSync(outFolderPath))
|
|
fs.mkdirSync(outFolderPath, { recursive: true });
|
|
const data = {
|
|
id: uuid,
|
|
url: url + ".youtube",
|
|
title: metadata.title || url,
|
|
docAuthor: metadata.author,
|
|
description: metadata.description,
|
|
docSource: url,
|
|
chunkSource: `youtube://${url}`,
|
|
published: new Date().toLocaleString(),
|
|
wordCount: content.split(" ").length,
|
|
pageContent: content,
|
|
token_count_estimate: tokenizeString(content),
|
|
};
|
|
|
|
console.log(`[YouTube Loader]: Saving ${metadata.title} to ${outFolder}`);
|
|
const document = writeToServerDocuments({
|
|
data,
|
|
filename: fileName,
|
|
destinationOverride: outFolderPath,
|
|
});
|
|
|
|
return {
|
|
success: true,
|
|
reason: null,
|
|
documents: [document],
|
|
data: {
|
|
title: metadata.title,
|
|
author: metadata.author,
|
|
destination: outFolder,
|
|
},
|
|
};
|
|
}
|
|
|
|
/**
|
|
* Generate the transcript content and metadata into a single string
|
|
*
|
|
* Why? For ephemeral documents where we just want the content, we want to include the metadata as keys in the content
|
|
* so that the LLM has context about the video, this gives it a better understanding of the video
|
|
* and allows it to use the metadata in the conversation if relevant.
|
|
* Examples:
|
|
* - How many views does <LINK> have?
|
|
* - Checkout <LINK> and tell me the key points and if it is performing well
|
|
* - Summarize this video <LINK>? -> description could have links and references
|
|
* @param {string} content - The content of the transcript
|
|
* @param {TranscriptMetadata} metadata - The metadata from the transcript
|
|
* @returns {string} - The concatenated transcript content and metadata
|
|
*/
|
|
function buildTranscriptContentWithMetadata(content = "", metadata = {}) {
|
|
const VALID_METADATA_KEYS = ["title", "author", "description", "view_count"];
|
|
if (!content || !metadata || Object.keys(metadata).length === 0)
|
|
return content;
|
|
|
|
let contentWithMetadata = "";
|
|
VALID_METADATA_KEYS.forEach((key) => {
|
|
if (!metadata[key]) return;
|
|
contentWithMetadata += `<${key}>${metadata[key]}</${key}>`;
|
|
});
|
|
return `${contentWithMetadata}\nTranscript:\n${content}`;
|
|
}
|
|
|
|
module.exports = {
|
|
loadYouTubeTranscript,
|
|
fetchVideoTranscriptContent,
|
|
};
|