Add ability to auto-handle YT video URLs in uploader & chat (#4547)
* Add ability to auto-handle YT video URLs in uploader & chat
* Move YT validator to URL utils
* Update comment
This commit is contained in:
parent
be82f91fc3
commit
5edc1bea42
@ -1,3 +1,4 @@
|
|||||||
|
process.env.STORAGE_DIR = "test-storage"; // needed for tests to run
|
||||||
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
|
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
|
||||||
|
|
||||||
describe("YoutubeTranscript", () => {
|
describe("YoutubeTranscript", () => {
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
const { validURL, validateURL } = require("../../../utils/url");
|
process.env.STORAGE_DIR = "test-storage"; // needed for tests to run
|
||||||
|
const { validURL, validateURL, validYoutubeVideoUrl } = require("../../../utils/url");
|
||||||
|
|
||||||
// Mock the RuntimeSettings module
|
// Mock the RuntimeSettings module
|
||||||
jest.mock("../../../utils/runtimeSettings", () => {
|
jest.mock("../../../utils/runtimeSettings", () => {
|
||||||
@ -127,3 +128,70 @@ describe("validateURL", () => {
|
|||||||
.toBe("https://example.com/PATH/To/Resource?q2=Value&q1=UPPER");
|
.toBe("https://example.com/PATH/To/Resource?q2=Value&q1=UPPER");
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
||||||
|
describe("validYoutubeVideoUrl", () => {
  const ID = "dQw4w9WgXcQ"; // 11-char valid video id

  // Asserts the boolean (validity-only) result for each link, in order.
  const expectValidity = (links, expected) => {
    for (const link of links) {
      expect(validYoutubeVideoUrl(link)).toBe(expected);
    }
  };

  // Asserts the extracted-id result (second argument `true`) for each link, in order.
  const expectExtractedId = (links, expected) => {
    for (const link of links) {
      expect(validYoutubeVideoUrl(link, true)).toBe(expected);
    }
  };

  it("returns true for youtube watch URLs with v param", () => {
    expectValidity(
      [
        `https://www.youtube.com/watch?v=${ID}`,
        `https://youtube.com/watch?v=${ID}&t=10s`,
        `https://m.youtube.com/watch?v=${ID}`,
        `youtube.com/watch?v=${ID}`,
      ],
      true
    );
  });

  it("returns true for youtu.be short URLs", () => {
    expectValidity(
      [
        `https://youtu.be/${ID}`,
        `https://youtu.be/${ID}?si=abc`,
        // extra path segments after id should still validate the id component
        `https://youtu.be/${ID}/extra`,
      ],
      true
    );
  });

  it("returns true for embed and v path formats", () => {
    expectValidity(
      [`https://www.youtube.com/embed/${ID}`, `https://youtube.com/v/${ID}`],
      true
    );
  });

  it("returns false for non-YouTube hosts", () => {
    expectValidity(
      ["https://example.com/watch?v=dQw4w9WgXcQ", "https://vimeo.com/123456"],
      false
    );
  });

  it("returns false for unrelated YouTube paths without a video id", () => {
    expectValidity(
      ["https://www.youtube.com/user/somechannel", "https://www.youtube.com/"],
      false
    );
  });

  it("returns false for empty or bad inputs", () => {
    expectValidity(["", null, undefined], false);
  });

  it("returns the video ID for valid YouTube video URLs", () => {
    expectExtractedId(
      [
        `https://www.youtube.com/watch?v=${ID}`,
        `https://youtube.com/watch?v=${ID}&t=10s`,
        `https://m.youtube.com/watch?v=${ID}`,
        `youtube.com/watch?v=${ID}`,
        `https://youtu.be/${ID}`,
        `https://youtu.be/${ID}?si=abc`,
        `https://youtu.be/${ID}/extra`,
        `https://www.youtube.com/embed/${ID}`,
        `https://youtube.com/v/${ID}`,
      ],
      ID
    );

    // invalid video IDs
    expectExtractedId(
      [
        `https://www.youtube.com/watch?v=invalid`,
        `https://youtube.com/watch?v=invalid`,
        `https://m.youtube.com/watch?v=invalid`,
        `youtube.com/watch`,
        `https://youtu.be/invalid`,
        `https://youtu.be/invalid?si=abc`,
      ],
      null
    );
  });
});
|
||||||
|
|||||||
@ -1,15 +1,18 @@
|
|||||||
const { v4 } = require("uuid");
|
const { v4 } = require("uuid");
|
||||||
const path = require("path");
|
|
||||||
const {
|
const {
|
||||||
PuppeteerWebBaseLoader,
|
PuppeteerWebBaseLoader,
|
||||||
} = require("langchain/document_loaders/web/puppeteer");
|
} = require("langchain/document_loaders/web/puppeteer");
|
||||||
const { writeToServerDocuments } = require("../../utils/files");
|
const { writeToServerDocuments } = require("../../utils/files");
|
||||||
const { tokenizeString } = require("../../utils/tokenizer");
|
const { tokenizeString } = require("../../utils/tokenizer");
|
||||||
const { default: slugify } = require("slugify");
|
const { default: slugify } = require("slugify");
|
||||||
const { getContentTypeFromURL, returnResult } = require("../helpers");
|
const {
|
||||||
const { processSingleFile } = require("../../processSingleFile");
|
returnResult,
|
||||||
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
|
determineContentType,
|
||||||
const { ACCEPTED_MIMES } = require("../../utils/constants");
|
processAsFile,
|
||||||
|
} = require("../helpers");
|
||||||
|
const {
|
||||||
|
loadYouTubeTranscript,
|
||||||
|
} = require("../../utils/extensions/YoutubeTranscript");
|
||||||
const RuntimeSettings = require("../../utils/runtimeSettings");
|
const RuntimeSettings = require("../../utils/runtimeSettings");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -29,80 +32,23 @@ async function scrapeGenericUrl({
|
|||||||
metadata = {},
|
metadata = {},
|
||||||
saveAsDocument = true,
|
saveAsDocument = true,
|
||||||
}) {
|
}) {
|
||||||
/** @type {'web' | 'file'} */
|
/** @type {'web' | 'file' | 'youtube'} */
|
||||||
let processVia = "web";
|
|
||||||
console.log(`-- Working URL ${link} => (captureAs: ${captureAs}) --`);
|
console.log(`-- Working URL ${link} => (captureAs: ${captureAs}) --`);
|
||||||
|
let { contentType, processVia } = await determineContentType(link);
|
||||||
const contentType = await getContentTypeFromURL(link)
|
|
||||||
.then((result) => {
|
|
||||||
// If there is a reason, log it, but continue with the process
|
|
||||||
if (!!result.reason) console.error(result.reason);
|
|
||||||
return result.contentType;
|
|
||||||
})
|
|
||||||
.catch((error) => {
|
|
||||||
console.error("Error getting content type from URL", error);
|
|
||||||
return null;
|
|
||||||
});
|
|
||||||
|
|
||||||
// If the content is unlikely to be a webpage, assume it is a file and process it as a file
|
|
||||||
if (
|
|
||||||
!["text/html", "text/plain"].includes(contentType) &&
|
|
||||||
contentType in ACCEPTED_MIMES
|
|
||||||
)
|
|
||||||
processVia = "file";
|
|
||||||
|
|
||||||
console.log(`-- URL determined to be ${contentType} (${processVia}) --`);
|
console.log(`-- URL determined to be ${contentType} (${processVia}) --`);
|
||||||
// If the content type is a file, download the file to the hotdir and process it
|
|
||||||
// Then return the content of the file as a document or whatever the captureAs dictates.
|
|
||||||
if (processVia === "file") {
|
|
||||||
const fileContentResult = await downloadURIToFile(link);
|
|
||||||
if (!fileContentResult.success)
|
|
||||||
return returnResult({
|
|
||||||
success: false,
|
|
||||||
reason: fileContentResult.reason,
|
|
||||||
documents: [],
|
|
||||||
content: null,
|
|
||||||
saveAsDocument,
|
|
||||||
});
|
|
||||||
|
|
||||||
const fileFilePath = fileContentResult.fileLocation;
|
/**
|
||||||
const targetFilename = path.basename(fileFilePath);
|
* When the content is a file or a YouTube video, we can use the existing processing functions
|
||||||
|
* These are self-contained and will return the correct response based on the saveAsDocument flag already
|
||||||
/**
|
* so we can return the content immediately.
|
||||||
* If the saveAsDocument is false, we are only interested in the text content
|
*/
|
||||||
* and can ignore the file as a document by using `parseOnly` in the options.
|
if (processVia === "file")
|
||||||
* This will send the file to the Direct Uploads folder instead of the Documents folder.
|
return await processAsFile({ uri: link, saveAsDocument });
|
||||||
* that will be deleted by the cleanup-orphan-documents job that runs frequently. The trade off
|
else if (processVia === "youtube")
|
||||||
* is that since it still is in FS we can debug its output or even potentially reuse it for other purposes.
|
return await loadYouTubeTranscript(
|
||||||
*
|
{ url: link },
|
||||||
* TODO: Improve this process via a new option that will instantly delete the file after processing
|
{ parseOnly: saveAsDocument === false }
|
||||||
* if we find we dont need this file ever after processing.
|
);
|
||||||
*/
|
|
||||||
const processSingleFileResult = await processSingleFile(targetFilename, {
|
|
||||||
parseOnly: saveAsDocument === false,
|
|
||||||
});
|
|
||||||
if (!processSingleFileResult.success) {
|
|
||||||
return returnResult({
|
|
||||||
success: false,
|
|
||||||
reason: processSingleFileResult.reason,
|
|
||||||
documents: [],
|
|
||||||
content: null,
|
|
||||||
saveAsDocument,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// If we intend to return only the text content, return the content from the file
|
|
||||||
// and then delete the file - otherwise it will be saved as a document
|
|
||||||
if (!saveAsDocument) {
|
|
||||||
return returnResult({
|
|
||||||
success: true,
|
|
||||||
content: processSingleFileResult.documents[0].pageContent,
|
|
||||||
saveAsDocument,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
return processSingleFileResult;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Otherwise, assume the content is a webpage and scrape the content from the webpage
|
// Otherwise, assume the content is a webpage and scrape the content from the webpage
|
||||||
const content = await getPageContent({
|
const content = await getPageContent({
|
||||||
@ -110,7 +56,6 @@ async function scrapeGenericUrl({
|
|||||||
captureAs,
|
captureAs,
|
||||||
headers: scraperHeaders,
|
headers: scraperHeaders,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!content || !content.length) {
|
if (!content || !content.length) {
|
||||||
console.error(`Resulting URL content was empty at ${link}.`);
|
console.error(`Resulting URL content was empty at ${link}.`);
|
||||||
return returnResult({
|
return returnResult({
|
||||||
@ -124,13 +69,12 @@ async function scrapeGenericUrl({
|
|||||||
|
|
||||||
// If the captureAs is text, return the content as a string immediately
|
// If the captureAs is text, return the content as a string immediately
|
||||||
// so that we dont save the content as a document
|
// so that we dont save the content as a document
|
||||||
if (!saveAsDocument) {
|
if (!saveAsDocument)
|
||||||
return returnResult({
|
return returnResult({
|
||||||
success: true,
|
success: true,
|
||||||
content,
|
content,
|
||||||
saveAsDocument,
|
saveAsDocument,
|
||||||
});
|
});
|
||||||
}
|
|
||||||
|
|
||||||
// Save the content as a document from the URL
|
// Save the content as a document from the URL
|
||||||
const url = new URL(link);
|
const url = new URL(link);
|
||||||
|
|||||||
@ -1,4 +1,9 @@
|
|||||||
|
const path = require("path");
|
||||||
const { validURL } = require("../../utils/url");
|
const { validURL } = require("../../utils/url");
|
||||||
|
const { processSingleFile } = require("../../processSingleFile");
|
||||||
|
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
|
||||||
|
const { ACCEPTED_MIMES } = require("../../utils/constants");
|
||||||
|
const { validYoutubeVideoUrl } = require("../../utils/url");
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the content type of a resource
|
* Get the content type of a resource
|
||||||
@ -51,13 +56,23 @@ async function getContentTypeFromURL(url) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize the result object based on the saveAsDocument flag
|
||||||
|
* @param {Object} result - The result object to normalize
|
||||||
|
* @param {boolean} result.success - Whether the result is successful
|
||||||
|
* @param {string|null} result.reason - The reason for the result
|
||||||
|
* @param {Object[]} result.documents - The documents from the result
|
||||||
|
* @param {string|null} result.content - The content of the result
|
||||||
|
* @param {boolean} result.saveAsDocument - Whether to save the content as a document. Default is true
|
||||||
|
* @returns {{success: boolean, reason: string|null, documents: Object[], content: string|null}} - The normalized result object
|
||||||
|
*/
|
||||||
function returnResult({
|
function returnResult({
|
||||||
success,
|
success,
|
||||||
reason,
|
reason,
|
||||||
documents,
|
documents,
|
||||||
content,
|
content,
|
||||||
saveAsDocument = true,
|
saveAsDocument = true,
|
||||||
}) {
|
} = {}) {
|
||||||
if (!saveAsDocument) {
|
if (!saveAsDocument) {
|
||||||
return {
|
return {
|
||||||
success,
|
success,
|
||||||
@ -66,7 +81,98 @@ function returnResult({
|
|||||||
} else return { success, reason, documents };
|
} else return { success, reason, documents };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Determine the content type of a link - should be a URL - and which
 * pipeline should handle it.
 *
 * - YouTube video URLs short-circuit to the `youtube` pipeline without any
 *   content-type lookup.
 * - A content type that is not `text/html`/`text/plain` and is an own key of
 *   ACCEPTED_MIMES is handled by the `file` pipeline.
 * - Everything else, including lookup failures, falls back to `web`.
 *
 * @param {string} uri - The link to determine the content type of
 * @returns {Promise<{contentType: string|null, processVia: 'web' | 'file' | 'youtube'}>} - The content type of the link
 */
async function determineContentType(uri) {
  // Dont check for content type if it is a YouTube video URL
  if (validYoutubeVideoUrl(uri))
    return { contentType: "text/html", processVia: "youtube" };

  try {
    const result = await getContentTypeFromURL(uri);

    // If there is a reason, log it, but continue with the process
    if (!!result.reason) console.error(result.reason);

    const { contentType } = result;

    // Own-property lookup on purpose: the `in` operator would also accept
    // inherited keys such as "toString" or "constructor" as accepted MIME types.
    const isAcceptedFile =
      !!contentType &&
      !["text/html", "text/plain"].includes(contentType) &&
      Object.prototype.hasOwnProperty.call(ACCEPTED_MIMES, contentType);

    return { contentType, processVia: isAcceptedFile ? "file" : "web" };
  } catch (error) {
    console.error("Error getting content type from URL", error);
    return { contentType: null, processVia: "web" };
  }
}
|
||||||
|
|
||||||
|
/**
 * Process a link as a file: download it, then run the downloaded file through
 * `processSingleFile`.
 *
 * Every failure - download failed, processing failed, or processing produced
 * no document while only the text content was requested - is reported through
 * `returnResult` instead of throwing.
 *
 * @param {Object} params
 * @param {string} params.uri - The link to process as a file
 * @param {boolean} params.saveAsDocument - Whether to save the content as a document. Default is true
 * @returns {Promise<{success: boolean, reason: string|null, documents: Object[], content: string|null, saveAsDocument: boolean}>} - The content of the file
 */
async function processAsFile({ uri, saveAsDocument = true }) {
  // Single place that builds the normalized failure payload.
  const failWith = (reason) =>
    returnResult({
      success: false,
      reason,
      documents: [],
      content: null,
      saveAsDocument,
    });

  const fileContentResult = await downloadURIToFile(uri);
  if (!fileContentResult.success) return failWith(fileContentResult.reason);

  const targetFilename = path.basename(fileContentResult.fileLocation);

  /**
   * If the saveAsDocument is false, we are only interested in the text content
   * and can ignore the file as a document by using `parseOnly` in the options.
   * This will send the file to the Direct Uploads folder instead of the Documents folder
   * that will be deleted by the cleanup-orphan-documents job that runs frequently. The trade off
   * is that since it still is in FS we can debug its output or even potentially reuse it for other purposes.
   *
   * TODO: Improve this process via a new option that will instantly delete the file after processing
   * if we find we dont need this file ever after processing.
   */
  const processSingleFileResult = await processSingleFile(targetFilename, {
    parseOnly: saveAsDocument === false,
  });
  if (!processSingleFileResult.success)
    return failWith(processSingleFileResult.reason);

  // Saving as a document: the processing result is already in the final shape.
  if (saveAsDocument) return processSingleFileResult;

  // Only the text content is wanted. Nothing is deleted here - see the note above.
  // Guard against a "successful" parse that produced no document, which would
  // otherwise throw a TypeError on `documents[0].pageContent`.
  const [firstDocument] = processSingleFileResult.documents ?? [];
  if (!firstDocument)
    return failWith("No content could be parsed from the downloaded file.");

  return returnResult({
    success: true,
    content: firstDocument.pageContent,
    saveAsDocument,
  });
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
returnResult,
|
returnResult,
|
||||||
getContentTypeFromURL,
|
getContentTypeFromURL,
|
||||||
|
determineContentType,
|
||||||
|
processAsFile,
|
||||||
};
|
};
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
const { validYoutubeVideoUrl } = require("../../../url");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This is just a custom implementation of the Langchain JS YouTubeLoader class
|
* This is just a custom implementation of the Langchain JS YouTubeLoader class
|
||||||
* as the dependency for YoutubeTranscript is quite fickle and its a rat race to keep it up
|
* as the dependency for YoutubeTranscript is quite fickle and its a rat race to keep it up
|
||||||
@ -23,14 +25,9 @@ class YoutubeLoader {
|
|||||||
* @returns The videoId of the YouTube video.
|
* @returns The videoId of the YouTube video.
|
||||||
*/
|
*/
|
||||||
static getVideoID(url) {
|
static getVideoID(url) {
|
||||||
const match = url.match(
|
const videoId = validYoutubeVideoUrl(url, true);
|
||||||
/.*(?:youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=)([^#&?]*).*/
|
if (videoId) return videoId;
|
||||||
);
|
throw new Error("Failed to get youtube video id from the url");
|
||||||
if (match !== null && match[1].length === 11) {
|
|
||||||
return match[1];
|
|
||||||
} else {
|
|
||||||
throw new Error("Failed to get youtube video id from the url");
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -1,3 +1,5 @@
|
|||||||
|
const { validYoutubeVideoUrl } = require("../../../url");
|
||||||
|
|
||||||
class YoutubeTranscriptError extends Error {
|
class YoutubeTranscriptError extends Error {
|
||||||
constructor(message) {
|
constructor(message) {
|
||||||
super(`[YoutubeTranscript] ${message}`);
|
super(`[YoutubeTranscript] ${message}`);
|
||||||
@ -229,13 +231,9 @@ class YoutubeTranscript {
|
|||||||
* @returns {string} YouTube video ID
|
* @returns {string} YouTube video ID
|
||||||
*/
|
*/
|
||||||
static retrieveVideoId(videoId) {
|
static retrieveVideoId(videoId) {
|
||||||
if (videoId.length === 11) return videoId;
|
if (videoId.length === 11) return videoId; // already a valid ID most likely
|
||||||
|
const matchedId = validYoutubeVideoUrl(videoId, true);
|
||||||
const RE_YOUTUBE =
|
if (matchedId) return matchedId;
|
||||||
/(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
|
|
||||||
const matchId = videoId.match(RE_YOUTUBE);
|
|
||||||
|
|
||||||
if (matchId?.[1]) return matchId[1];
|
|
||||||
throw new YoutubeTranscriptError(
|
throw new YoutubeTranscriptError(
|
||||||
"Impossible to retrieve Youtube video ID."
|
"Impossible to retrieve Youtube video ID."
|
||||||
);
|
);
|
||||||
|
|||||||
@ -9,27 +9,13 @@ const {
|
|||||||
} = require("../../files");
|
} = require("../../files");
|
||||||
const { tokenizeString } = require("../../tokenizer");
|
const { tokenizeString } = require("../../tokenizer");
|
||||||
const { YoutubeLoader } = require("./YoutubeLoader");
|
const { YoutubeLoader } = require("./YoutubeLoader");
|
||||||
|
const { validYoutubeVideoUrl } = require("../../url");
|
||||||
|
|
||||||
function validYoutubeVideoUrl(link) {
|
/**
|
||||||
const UrlPattern = require("url-pattern");
|
* Fetch the transcript content for a YouTube video
|
||||||
const opts = new URL(link);
|
* @param {string} url - The URL of the YouTube video
|
||||||
const url = `${opts.protocol}//${opts.host}${opts.pathname}${
|
* @returns {Promise<{success: boolean, reason: string|null, content: string|null, metadata: Object}>} - The transcript content for the YouTube video
|
||||||
opts.searchParams.has("v") ? `?v=${opts.searchParams.get("v")}` : ""
|
*/
|
||||||
}`;
|
|
||||||
|
|
||||||
const shortPatternMatch = new UrlPattern(
|
|
||||||
"https\\://(www.)youtu.be/(:videoId)"
|
|
||||||
).match(url);
|
|
||||||
const fullPatternMatch = new UrlPattern(
|
|
||||||
"https\\://(www.)youtube.com/watch?v=(:videoId)"
|
|
||||||
).match(url);
|
|
||||||
const videoId =
|
|
||||||
shortPatternMatch?.videoId || fullPatternMatch?.videoId || null;
|
|
||||||
if (!!videoId) return true;
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function fetchVideoTranscriptContent({ url }) {
|
async function fetchVideoTranscriptContent({ url }) {
|
||||||
if (!validYoutubeVideoUrl(url)) {
|
if (!validYoutubeVideoUrl(url)) {
|
||||||
return {
|
return {
|
||||||
@ -44,15 +30,11 @@ async function fetchVideoTranscriptContent({ url }) {
|
|||||||
const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });
|
const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });
|
||||||
const { docs, error } = await loader
|
const { docs, error } = await loader
|
||||||
.load()
|
.load()
|
||||||
.then((docs) => {
|
.then((docs) => ({ docs, error: null }))
|
||||||
return { docs, error: null };
|
.catch((e) => ({
|
||||||
})
|
docs: [],
|
||||||
.catch((e) => {
|
error: e.message?.split("Error:")?.[1] || e.message,
|
||||||
return {
|
}));
|
||||||
docs: [],
|
|
||||||
error: e.message?.split("Error:")?.[1] || e.message,
|
|
||||||
};
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!docs.length || !!error) {
|
if (!docs.length || !!error) {
|
||||||
return {
|
return {
|
||||||
@ -82,7 +64,31 @@ async function fetchVideoTranscriptContent({ url }) {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
async function loadYouTubeTranscript({ url }) {
|
/**
|
||||||
|
* @typedef {Object} TranscriptAsDocument
|
||||||
|
* @property {boolean} success - Whether the transcript was successful
|
||||||
|
* @property {string|null} reason - The reason for the transcript
|
||||||
|
* @property {{title: string, author: string, destination: string}} data - The data from the transcript
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @typedef {Object} TranscriptAsContent
|
||||||
|
* @property {boolean} success - Whether the transcript was successful
|
||||||
|
* @property {string|null} reason - The reason for the transcript
|
||||||
|
* @property {string|null} content - The content of the transcript
|
||||||
|
* @property {Object[]} documents - The documents from the transcript
|
||||||
|
* @property {boolean} saveAsDocument - Whether to save the transcript as a document
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Load the transcript content for a YouTube video as well as save it to the server documents
|
||||||
|
* @param {Object} params - The parameters for the YouTube transcript
|
||||||
|
* @param {string} params.url - The URL of the YouTube video
|
||||||
|
* @param {Object} options - The options for the YouTube transcript
|
||||||
|
* @param {boolean} options.parseOnly - Whether to parse the transcript content only or save it to the server documents
|
||||||
|
* @returns {Promise<TranscriptAsDocument | TranscriptAsContent>} - The transcript content for the YouTube video
|
||||||
|
*/
|
||||||
|
async function loadYouTubeTranscript({ url }, options = { parseOnly: false }) {
|
||||||
const transcriptResults = await fetchVideoTranscriptContent({ url });
|
const transcriptResults = await fetchVideoTranscriptContent({ url });
|
||||||
if (!transcriptResults.success) {
|
if (!transcriptResults.success) {
|
||||||
return {
|
return {
|
||||||
@ -90,9 +96,25 @@ async function loadYouTubeTranscript({ url }) {
|
|||||||
reason:
|
reason:
|
||||||
transcriptResults.reason ||
|
transcriptResults.reason ||
|
||||||
"An unknown error occurred during transcription retrieval",
|
"An unknown error occurred during transcription retrieval",
|
||||||
|
documents: [],
|
||||||
|
content: null,
|
||||||
|
saveAsDocument: options.parseOnly,
|
||||||
|
data: {},
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const { content, metadata } = transcriptResults;
|
const { content, metadata } = transcriptResults;
|
||||||
|
if (options.parseOnly) {
|
||||||
|
return {
|
||||||
|
success: true,
|
||||||
|
reason: null,
|
||||||
|
content,
|
||||||
|
documents: [],
|
||||||
|
saveAsDocument: options.parseOnly,
|
||||||
|
data: {},
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
const outFolder = sanitizeFileName(
|
const outFolder = sanitizeFileName(
|
||||||
slugify(`${metadata.author} YouTube transcripts`).toLowerCase()
|
slugify(`${metadata.author} YouTube transcripts`).toLowerCase()
|
||||||
);
|
);
|
||||||
@ -100,7 +122,6 @@ async function loadYouTubeTranscript({ url }) {
|
|||||||
|
|
||||||
if (!fs.existsSync(outFolderPath))
|
if (!fs.existsSync(outFolderPath))
|
||||||
fs.mkdirSync(outFolderPath, { recursive: true });
|
fs.mkdirSync(outFolderPath, { recursive: true });
|
||||||
|
|
||||||
const data = {
|
const data = {
|
||||||
id: v4(),
|
id: v4(),
|
||||||
url: url + ".youtube",
|
url: url + ".youtube",
|
||||||
@ -124,7 +145,7 @@ async function loadYouTubeTranscript({ url }) {
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
success: true,
|
success: true,
|
||||||
reason: "test",
|
reason: null,
|
||||||
data: {
|
data: {
|
||||||
title: metadata.title,
|
title: metadata.title,
|
||||||
author: metadata.author,
|
author: metadata.author,
|
||||||
|
|||||||
@ -95,7 +95,43 @@ function validateURL(url) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Validate if a link is a valid YouTube video URL
 * - Checks youtu.be, youtube.com, m.youtube.com, music.youtube.com
 * - Embed video URLs (`/embed/<id>`, `/v/<id>`)
 * - Short URLs (`youtu.be/<id>`, `/shorts/<id>`)
 * - Live URLs (`/live/<id>`)
 * - Regular watch URLs
 * - Optional query parameters (including ?v parameter)
 *
 * Links without an http(s) scheme are prefixed with `https://` and normalized
 * through `URL` before matching. A video id is exactly 11 characters of
 * `[A-Za-z0-9_-]` directly following one of the recognized path forms.
 *
 * Can be used to extract the video ID from a YouTube video URL via the returnVideoId parameter.
 * @param {string} link - The link to validate
 * @param {boolean} returnVideoId - Whether to return the video ID if the link is a valid YouTube video URL
 * @returns {boolean|string|null} - Whether the link is a valid YouTube video URL. When returnVideoId is true,
 * the video ID, or null for every non-matching or invalid input.
 */
function validYoutubeVideoUrl(link, returnVideoId = false) {
  // Value for "not a YouTube video", consistent across every exit path.
  const noMatch = returnVideoId ? null : false;

  try {
    if (!link || typeof link !== "string") return noMatch;
    let urlToValidate = link;

    if (!link.startsWith("http://") && !link.startsWith("https://")) {
      // `new URL` throws on unparsable input; handled by the catch below.
      urlToValidate = new URL(`https://${link}`).toString();
    }

    // `live/` must be present for live URLs. Leaving it optional lets any
    // `youtube.com/<11+ word chars>` path (e.g. /subscriptions) pass as a video.
    const regex =
      /^(?:https?:\/\/)?(?:www\.|m\.|music\.)?(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?(?:.*&)?v=|live\/|shorts\/))([\w-]{11})(?:\S+)?$/;
    const videoId = urlToValidate.match(regex)?.[1] ?? null;

    return returnVideoId ? videoId : videoId !== null;
  } catch (error) {
    console.error("Error validating YouTube video URL", error);
    return noMatch;
  }
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
validURL,
|
validURL,
|
||||||
validateURL,
|
validateURL,
|
||||||
|
validYoutubeVideoUrl,
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user