Add ability to auto-handle YT video URLs in uploader & chat (#4547)

* Add ability to auto-handle YT video URLs in uploader & chat
* move YT validator to URL utils
* update comment
2025-10-15 12:18:57 -07:00 · 2025-10-15 12:18:57 -07:00 · 5edc1bea42
commit 5edc1bea42
parent be82f91fc3
8 changed files with 299 additions and 128 deletions
--- a/collector/tests/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js
+++ b/collector/tests/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js
@ -1,3 +1,4 @@
 process.env.STORAGE_DIR = "test-storage"; // needed for tests to run
 const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
 describe("YoutubeTranscript", () => {
--- a/collector/tests/utils/url/index.test.js
+++ b/collector/tests/utils/url/index.test.js
@ -1,4 +1,5 @@
-const { validURL, validateURL } = require("../../../utils/url");
+process.env.STORAGE_DIR = "test-storage"; // needed for tests to run
 const { validURL, validateURL, validYoutubeVideoUrl } = require("../../../utils/url");
 // Mock the RuntimeSettings module
 jest.mock("../../../utils/runtimeSettings", () => {
@ -127,3 +128,70 @@ describe("validateURL", () => {
      .toBe("https://example.com/PATH/To/Resource?q2=Value&q1=UPPER");
  });
 });
 describe("validYoutubeVideoUrl", () => {
  // Sample video id used throughout: exactly 11 characters
  const ID = "dQw4w9WgXcQ";

  // Boolean mode: every input in the list must produce exactly `expected`
  const expectValidity = (inputs, expected) => {
    for (const input of inputs) {
      expect(validYoutubeVideoUrl(input)).toBe(expected);
    }
  };

  // Extraction mode (second argument `true`): every url must produce exactly `expected`
  const expectExtraction = (urls, expected) => {
    for (const url of urls) {
      expect(validYoutubeVideoUrl(url, true)).toBe(expected);
    }
  };

  it("returns true for youtube watch URLs with v param", () => {
    expectValidity(
      [
        `https://www.youtube.com/watch?v=${ID}`,
        `https://youtube.com/watch?v=${ID}&t=10s`,
        `https://m.youtube.com/watch?v=${ID}`,
        `youtube.com/watch?v=${ID}`,
      ],
      true
    );
  });

  it("returns true for youtu.be short URLs", () => {
    expectValidity(
      [
        `https://youtu.be/${ID}`,
        `https://youtu.be/${ID}?si=abc`,
        // extra path segments after id should still validate the id component
        `https://youtu.be/${ID}/extra`,
      ],
      true
    );
  });

  it("returns true for embed and v path formats", () => {
    expectValidity(
      [`https://www.youtube.com/embed/${ID}`, `https://youtube.com/v/${ID}`],
      true
    );
  });

  it("returns false for non-YouTube hosts", () => {
    expectValidity(
      ["https://example.com/watch?v=dQw4w9WgXcQ", "https://vimeo.com/123456"],
      false
    );
  });

  it("returns false for unrelated YouTube paths without a video id", () => {
    expectValidity(
      ["https://www.youtube.com/user/somechannel", "https://www.youtube.com/"],
      false
    );
  });

  it("returns false for empty or bad inputs", () => {
    expectValidity(["", null, undefined], false);
  });

  it("returns the video ID for valid YouTube video URLs", () => {
    expectExtraction(
      [
        `https://www.youtube.com/watch?v=${ID}`,
        `https://youtube.com/watch?v=${ID}&t=10s`,
        `https://m.youtube.com/watch?v=${ID}`,
        `youtube.com/watch?v=${ID}`,
        `https://youtu.be/${ID}`,
        `https://youtu.be/${ID}?si=abc`,
        `https://youtu.be/${ID}/extra`,
        `https://www.youtube.com/embed/${ID}`,
        `https://youtube.com/v/${ID}`,
      ],
      ID
    );

    // invalid video IDs
    expectExtraction(
      [
        `https://www.youtube.com/watch?v=invalid`,
        `https://youtube.com/watch?v=invalid`,
        `https://m.youtube.com/watch?v=invalid`,
        `youtube.com/watch`,
        `https://youtu.be/invalid`,
        `https://youtu.be/invalid?si=abc`,
      ],
      null
    );
  });
 });
--- a/collector/processLink/convert/generic.js
+++ b/collector/processLink/convert/generic.js
@ -1,15 +1,18 @@
 const { v4 } = require("uuid");
 const path = require("path");
 const {
  PuppeteerWebBaseLoader,
 } = require("langchain/document_loaders/web/puppeteer");
 const { writeToServerDocuments } = require("../../utils/files");
 const { tokenizeString } = require("../../utils/tokenizer");
 const { default: slugify } = require("slugify");
-const { getContentTypeFromURL, returnResult } = require("../helpers");
+const {
-const { processSingleFile } = require("../../processSingleFile");
+  returnResult,
-const { downloadURIToFile } = require("../../utils/downloadURIToFile");
+  determineContentType,
-const { ACCEPTED_MIMES } = require("../../utils/constants");
+  processAsFile,
 } = require("../helpers");
 const {
  loadYouTubeTranscript,
 } = require("../../utils/extensions/YoutubeTranscript");
 const RuntimeSettings = require("../../utils/runtimeSettings");
 /**
@ -29,80 +32,23 @@ async function scrapeGenericUrl({
  metadata = {},
  saveAsDocument = true,
 }) {
-  /** @type {'web' | 'file'} */
+  /** @type {'web' | 'file' | 'youtube'} */
  let processVia = "web";
  console.log(`-- Working URL ${link} => (captureAs: ${captureAs}) --`);
-
+  let { contentType, processVia } = await determineContentType(link);
  const contentType = await getContentTypeFromURL(link)
    .then((result) => {
      // If there is a reason, log it, but continue with the process
      if (!!result.reason) console.error(result.reason);
      return result.contentType;
    })
    .catch((error) => {
      console.error("Error getting content type from URL", error);
      return null;
    });
  // If the content is unlikely to be a webpage, assume it is a file and process it as a file
  if (
    !["text/html", "text/plain"].includes(contentType) &&
    contentType in ACCEPTED_MIMES
  )
    processVia = "file";
  console.log(`-- URL determined to be ${contentType} (${processVia}) --`);
  // If the content type is a file, download the file to the hotdir and process it
  // Then return the content of the file as a document or whatever the captureAs dictates.
  if (processVia === "file") {
    const fileContentResult = await downloadURIToFile(link);
    if (!fileContentResult.success)
      return returnResult({
        success: false,
        reason: fileContentResult.reason,
        documents: [],
        content: null,
        saveAsDocument,
      });
-    const fileFilePath = fileContentResult.fileLocation;
+  /**
-    const targetFilename = path.basename(fileFilePath);
+   * When the content is a file or a YouTube video, we can use the existing processing functions
-
+   * These are self-contained and will return the correct response based on the saveAsDocument flag already
-    /**
+   * so we can return the content immediately.
-     * If the saveAsDocument is false, we are only interested in the text content
+   */
-     * and can ignore the file as a document by using `parseOnly` in the options.
+  if (processVia === "file")
-     * This will send the file to the Direct Uploads folder instead of the Documents folder.
+    return await processAsFile({ uri: link, saveAsDocument });
-     * that will be deleted by the cleanup-orphan-documents job that runs frequently. The trade off
+  else if (processVia === "youtube")
-     * is that since it still is in FS we can debug its output or even potentially reuse it for other purposes.
+    return await loadYouTubeTranscript(
-     *
+      { url: link },
-     * TODO: Improve this process via a new option that will instantly delete the file after processing
+      { parseOnly: saveAsDocument === false }
-     * if we find we dont need this file ever after processing.
+    );
     */
    const processSingleFileResult = await processSingleFile(targetFilename, {
      parseOnly: saveAsDocument === false,
    });
    if (!processSingleFileResult.success) {
      return returnResult({
        success: false,
        reason: processSingleFileResult.reason,
        documents: [],
        content: null,
        saveAsDocument,
      });
    }
    // If we intend to return only the text content, return the content from the file
    // and then delete the file - otherwise it will be saved as a document
    if (!saveAsDocument) {
      return returnResult({
        success: true,
        content: processSingleFileResult.documents[0].pageContent,
        saveAsDocument,
      });
    }
    return processSingleFileResult;
  }
  // Otherwise, assume the content is a webpage and scrape the content from the webpage
  const content = await getPageContent({
@ -110,7 +56,6 @@ async function scrapeGenericUrl({
    captureAs,
    headers: scraperHeaders,
  });
  if (!content || !content.length) {
    console.error(`Resulting URL content was empty at ${link}.`);
    return returnResult({
@ -124,13 +69,12 @@ async function scrapeGenericUrl({
  // If the captureAs is text, return the content as a string immediately
  // so that we dont save the content as a document
-  if (!saveAsDocument) {
+  if (!saveAsDocument)
    return returnResult({
      success: true,
      content,
      saveAsDocument,
    });
  }
  // Save the content as a document from the URL
  const url = new URL(link);
--- a/collector/processLink/helpers/index.js
+++ b/collector/processLink/helpers/index.js
@ -1,4 +1,9 @@
 const path = require("path");
 const { validURL } = require("../../utils/url");
 const { processSingleFile } = require("../../processSingleFile");
 const { downloadURIToFile } = require("../../utils/downloadURIToFile");
 const { ACCEPTED_MIMES } = require("../../utils/constants");
 const { validYoutubeVideoUrl } = require("../../utils/url");
 /**
 * Get the content type of a resource
@ -51,13 +56,23 @@ async function getContentTypeFromURL(url) {
  }
 }
 /**
 * Normalize the result object based on the saveAsDocument flag
 * @param {Object} result - The result object to normalize
 * @param {boolean} result.success - Whether the result is successful
 * @param {string|null} result.reason - The reason for the result
 * @param {Object[]} result.documents - The documents from the result
 * @param {string|null} result.content - The content of the result
 * @param {boolean} result.saveAsDocument - Whether to save the content as a document. Default is true
 * @returns {{success: boolean, reason: string|null, documents: Object[], content: string|null}} - The normalized result object
 */
 function returnResult({
  success,
  reason,
  documents,
  content,
  saveAsDocument = true,
-}) {
+} = {}) {
  if (!saveAsDocument) {
    return {
      success,
@ -66,7 +81,98 @@ function returnResult({
  } else return { success, reason, documents };
 }
 /**
 * Decide how a link should be processed: as a web page, a downloadable file, or a YouTube video.
 * @param {string} uri - The link to classify
 * @returns {Promise<{contentType: string|null, processVia: 'web' | 'file' | 'youtube'}>} - The detected content type (null when the lookup throws) and the processing route
 */
 async function determineContentType(uri) {
  // YouTube video links are routed directly; no content-type lookup is made for them
  if (validYoutubeVideoUrl(uri))
    return { contentType: "text/html", processVia: "youtube" };

  const pageLikeTypes = ["text/html", "text/plain"];
  try {
    const lookup = await getContentTypeFromURL(uri);
    // A reason is only logged; it does not stop classification
    if (!!lookup.reason) console.error(lookup.reason);

    // Anything that is not page-like but is a key of ACCEPTED_MIMES is treated as a file
    const mime = lookup.contentType;
    const treatAsFile =
      !!mime && !pageLikeTypes.includes(mime) && mime in ACCEPTED_MIMES;
    return { contentType: mime, processVia: treatAsFile ? "file" : "web" };
  } catch (error) {
    console.error("Error getting content type from URL", error);
    return { contentType: null, processVia: "web" };
  }
 }
 /**
 * Download a link and run the downloaded file through processSingleFile.
 * @param {Object} params
 * @param {string} params.uri - The link to download and process
 * @param {boolean} params.saveAsDocument - When false the file is processed with `parseOnly` and only its text is returned. Default is true
 * @returns {Promise<Object>} - The processSingleFile result when saving as a document; otherwise whatever returnResult produces for the failure or text-only payload
 */
 async function processAsFile({ uri, saveAsDocument = true }) {
  // Both failure paths share the same payload shape; only the reason differs
  const failWith = (reason) =>
    returnResult({
      success: false,
      reason,
      documents: [],
      content: null,
      saveAsDocument,
    });

  const download = await downloadURIToFile(uri);
  if (!download.success) return failWith(download.reason);

  // processSingleFile is given the bare filename of the download, not its full path.
  // `parseOnly` is set whenever the caller does not want a saved document.
  const filename = path.basename(download.fileLocation);
  const processed = await processSingleFile(filename, {
    parseOnly: saveAsDocument === false,
  });
  if (!processed.success) return failWith(processed.reason);

  // Saving as a document: hand back the processing result untouched
  if (saveAsDocument) return processed;

  // Text-only: return just the page content of the first document
  return returnResult({
    success: true,
    content: processed.documents[0].pageContent,
    saveAsDocument,
  });
 }
 module.exports = {
  returnResult,
  getContentTypeFromURL,
  determineContentType,
  processAsFile,
 };
--- a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js
+++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js
@ -1,3 +1,5 @@
 const { validYoutubeVideoUrl } = require("../../../url");
 /*
 * This is just a custom implementation of the Langchain JS YouTubeLoader class
 * as the dependency for YoutubeTranscript is quite fickle and its a rat race to keep it up
@ -23,14 +25,9 @@ class YoutubeLoader {
   * @returns The videoId of the YouTube video.
   */
  static getVideoID(url) {
-    const match = url.match(
+    const videoId = validYoutubeVideoUrl(url, true);
-      /.*(?:youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=)([^#&?]*).*/
+    if (videoId) return videoId;
-    );
+    throw new Error("Failed to get youtube video id from the url");
    if (match !== null && match[1].length === 11) {
      return match[1];
    } else {
      throw new Error("Failed to get youtube video id from the url");
    }
  }
  /**
--- a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
+++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
@ -1,3 +1,5 @@
 const { validYoutubeVideoUrl } = require("../../../url");
 class YoutubeTranscriptError extends Error {
  constructor(message) {
    super(`[YoutubeTranscript] ${message}`);
@ -229,13 +231,9 @@ class YoutubeTranscript {
   * @returns {string} YouTube video ID
   */
  static retrieveVideoId(videoId) {
-    if (videoId.length === 11) return videoId;
+    if (videoId.length === 11) return videoId; // already a valid ID most likely
-
+    const matchedId = validYoutubeVideoUrl(videoId, true);
-    const RE_YOUTUBE =
+    if (matchedId) return matchedId;
      /(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
    const matchId = videoId.match(RE_YOUTUBE);
    if (matchId?.[1]) return matchId[1];
    throw new YoutubeTranscriptError(
      "Impossible to retrieve Youtube video ID."
    );
--- a/collector/utils/extensions/YoutubeTranscript/index.js
+++ b/collector/utils/extensions/YoutubeTranscript/index.js
@ -9,27 +9,13 @@ const {
 } = require("../../files");
 const { tokenizeString } = require("../../tokenizer");
 const { YoutubeLoader } = require("./YoutubeLoader");
 const { validYoutubeVideoUrl } = require("../../url");
-function validYoutubeVideoUrl(link) {
+/**
-  const UrlPattern = require("url-pattern");
+ * Fetch the transcript content for a YouTube video
-  const opts = new URL(link);
+ * @param {string} url - The URL of the YouTube video
-  const url = `${opts.protocol}//${opts.host}${opts.pathname}${
+ * @returns {Promise<{success: boolean, reason: string|null, content: string|null, metadata: Object}>} - The transcript content for the YouTube video
-    opts.searchParams.has("v") ? `?v=${opts.searchParams.get("v")}` : ""
+ */
  }`;
  const shortPatternMatch = new UrlPattern(
    "https\\://(www.)youtu.be/(:videoId)"
  ).match(url);
  const fullPatternMatch = new UrlPattern(
    "https\\://(www.)youtube.com/watch?v=(:videoId)"
  ).match(url);
  const videoId =
    shortPatternMatch?.videoId || fullPatternMatch?.videoId || null;
  if (!!videoId) return true;
  return false;
 }
 async function fetchVideoTranscriptContent({ url }) {
  if (!validYoutubeVideoUrl(url)) {
    return {
@ -44,15 +30,11 @@ async function fetchVideoTranscriptContent({ url }) {
  const loader = YoutubeLoader.createFromUrl(url, { addVideoInfo: true });
  const { docs, error } = await loader
    .load()
-    .then((docs) => {
+    .then((docs) => ({ docs, error: null }))
-      return { docs, error: null };
+    .catch((e) => ({
-    })
+      docs: [],
-    .catch((e) => {
+      error: e.message?.split("Error:")?.[1] || e.message,
-      return {
+    }));
        docs: [],
        error: e.message?.split("Error:")?.[1] || e.message,
      };
    });
  if (!docs.length || !!error) {
    return {
@ -82,7 +64,31 @@ async function fetchVideoTranscriptContent({ url }) {
  };
 }
-async function loadYouTubeTranscript({ url }) {
+/**
 * @typedef {Object} TranscriptAsDocument
 * @property {boolean} success - Whether the transcript was successful
 * @property {string|null} reason - The reason for the transcript
 * @property {{title: string, author: string, destination: string}} data - The data from the transcript
 */
 /**
 * @typedef {Object} TranscriptAsContent
 * @property {boolean} success - Whether the transcript was successful
 * @property {string|null} reason - The reason for the transcript
 * @property {string|null} content - The content of the transcript
 * @property {Object[]} documents - The documents from the transcript
 * @property {boolean} saveAsDocument - Whether to save the transcript as a document
 */
 /**
 * Load the transcript content for a YouTube video as well as save it to the server documents
 * @param {Object} params - The parameters for the YouTube transcript
 * @param {string} params.url - The URL of the YouTube video
 * @param {Object} options - The options for the YouTube transcript
 * @param {boolean} options.parseOnly - Whether to parse the transcript content only or save it to the server documents
 * @returns {Promise<TranscriptAsDocument | TranscriptAsContent>} - The transcript content for the YouTube video
 */
 async function loadYouTubeTranscript({ url }, options = { parseOnly: false }) {
  const transcriptResults = await fetchVideoTranscriptContent({ url });
  if (!transcriptResults.success) {
    return {
@ -90,9 +96,25 @@ async function loadYouTubeTranscript({ url }) {
      reason:
        transcriptResults.reason ||
        "An unknown error occurred during transcription retrieval",
      documents: [],
      content: null,
      saveAsDocument: options.parseOnly,
      data: {},
    };
  }
  const { content, metadata } = transcriptResults;
  if (options.parseOnly) {
    return {
      success: true,
      reason: null,
      content,
      documents: [],
      saveAsDocument: options.parseOnly,
      data: {},
    };
  }
  const outFolder = sanitizeFileName(
    slugify(`${metadata.author} YouTube transcripts`).toLowerCase()
  );
@ -100,7 +122,6 @@ async function loadYouTubeTranscript({ url }) {
  if (!fs.existsSync(outFolderPath))
    fs.mkdirSync(outFolderPath, { recursive: true });
  const data = {
    id: v4(),
    url: url + ".youtube",
@ -124,7 +145,7 @@ async function loadYouTubeTranscript({ url }) {
  return {
    success: true,
-    reason: "test",
+    reason: null,
    data: {
      title: metadata.title,
      author: metadata.author,
--- a/collector/utils/url/index.js
+++ b/collector/utils/url/index.js
@ -95,7 +95,43 @@ function validateURL(url) {
  }
 }
 /**
 * Validate if a link is a YouTube video URL, optionally extracting its video ID.
 * The scheme is optional; a link without one is prefixed with https:// and normalized through URL first.
 * - Hosts: youtu.be, youtube.com and its www./m./music. subdomains
 * - Short URLs (youtu.be/<id>)
 * - Regular watch URLs (watch?v=<id>, with the v parameter in any position)
 * - Embed (/embed/<id>), legacy (/v/<id>), shorts (/shorts/<id>) and live (/live/<id>) URLs
 * - Anything may follow the ID (query string, extra path segments)
 *
 * The ID must be exactly 11 characters of [A-Za-z0-9_-]. A longer token is rejected rather than
 * truncated, so paths such as youtube.com/GoogleDevelopers are not mistaken for videos.
 *
 * NOTE(review): the `live/` prefix is optional in the pattern, so a bare youtube.com/<11 chars>
 * path is still accepted — confirm that this is intended.
 *
 * @param {string} link - The link to validate
 * @param {boolean} returnVideoId - Whether to return the video ID instead of a boolean
 * @returns {boolean|string|null} - true/false when returnVideoId is false; otherwise the video ID, or null when the link is not a valid video URL
 */
 function validYoutubeVideoUrl(link, returnVideoId = false) {
  // Every "not a video" outcome reports the same value for the requested mode
  const notAVideo = returnVideoId ? null : false;
  try {
    if (!link || typeof link !== "string") return notAVideo;

    let urlToValidate = link;
    if (!link.startsWith("http://") && !link.startsWith("https://")) {
      // new URL() throws on unparsable input; that is handled by the catch below
      urlToValidate = new URL("https://" + link).toString();
    }

    // (?![\w-]) after the capture makes the ID end on a boundary instead of
    // silently taking the first 11 characters of a longer token.
    const regex =
      /^(?:https?:\/\/)?(?:www\.|m\.|music\.)?(?:youtu\.be\/|youtube\.com\/(?:embed\/|v\/|watch\?(?:.*&)?v=|(?:live\/)?|shorts\/))([\w-]{11})(?![\w-])(?:\S+)?$/;
    const videoId = urlToValidate.match(regex)?.[1] ?? null;
    return returnVideoId ? videoId : videoId !== null;
  } catch (error) {
    console.error("Error validating YouTube video URL", error);
    return notAVideo;
  }
 }
 module.exports = {
  validURL,
  validateURL,
  validYoutubeVideoUrl,
 };