Fix broken YT scraping with YT API (#4005)

* Fix broken YT scraping with YT API
* Refactor YoutubeTranscript class and add JSDoc comments
* Fix test

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: timothycarambat <rambat1010@gmail.com>
2025-07-07 16:06:18 -04:00 · 2025-07-07 16:06:18 -04:00 · d0978fa363
commit d0978fa363
parent 0d7a7551b8
2 changed files with 141 additions and 77 deletions
--- a/collector/tests/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js
+++ b/collector/tests/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js
@ -0,0 +1,18 @@
 const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
 // Test for YoutubeTranscript.fetchTranscript. No mocks are set up in the
 // visible test, and the timeout comment below refers to a network request,
 // so this presumably needs outbound network access to pass.
 describe("YoutubeTranscript", () => {
  it("should fetch transcript from YouTube video", async () => {
    // Hard-coded 11-character video ID used as the fixture for this test.
    const videoId = "BJjsfNO5JTo";
    const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
      lang: "en",
    });
    // Only the shape of the result is checked (a non-empty string), not its
    // exact content.
    expect(transcript).toBeDefined();
    expect(typeof transcript).toBe("string");
    expect(transcript.length).toBeGreaterThan(0);
    // Log the results for debugging purposes
    console.log("Success! Transcript length:", transcript.length);
    console.log("First 200 characters:", transcript.substring(0, 200) + "...");
  }, 30000); // 30 second timeout for network request
 });
--- a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
+++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
@ -1,9 +1,3 @@
 const { parse } = require("node-html-parser");
 // Matches several YouTube URL forms (youtube.com paths such as /v/ and
 // /embed/, a `v=` query parameter, and youtu.be short links) and captures the
 // 11-character video ID in group 1. Case-insensitive.
 const RE_YOUTUBE =
  /(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
 // Value sent as the "User-Agent" request header; it is a desktop Chrome
 // browser string.
 const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";
 class YoutubeTranscriptError extends Error {
  constructor(message) {
    super(`[YoutubeTranscript] ${message}`);
@ -11,100 +5,152 @@ class YoutubeTranscriptError extends Error {
 }
 /**
- * Class to retrieve transcript if exist
+ * Handles fetching and parsing YouTube video transcripts
 */
 class YoutubeTranscript {
  /**
-   * Fetch transcript from YTB Video
+   * Encodes a string as a protobuf field
-   * @param videoId Video url or video identifier
+   * @param {number} fieldNumber - The protobuf field number
-   * @param config Object with lang param (eg: en, es, hk, uk) format.
+   * @param {string} str - The string to encode
-   * Will just grab the first caption if it can find one, so no special lang caption support.
+   * @returns {Buffer} Encoded protobuf field
   */
  static #encodeProtobufString(fieldNumber, str) {
    const utf8Bytes = Buffer.from(str, "utf8");
    const tag = (fieldNumber << 3) | 2; // wire type 2 for string
    const lengthBytes = this.#encodeVarint(utf8Bytes.length);
    return Buffer.concat([
      Buffer.from([tag]),
      Buffer.from(lengthBytes),
      utf8Bytes,
    ]);
  }
  /**
   * Encodes a number as a protobuf varint
   * @param {number} value - The number to encode
   * @returns {number[]} Encoded varint bytes
   */
  static #encodeVarint(value) {
    const bytes = [];
    while (value >= 0x80) {
      bytes.push((value & 0x7f) | 0x80);
      value >>>= 7;
    }
    bytes.push(value);
    return bytes;
  }
  /**
   * Creates a base64 encoded protobuf message
   * @param {Object} param - The parameters to encode
   * @param {string} param.param1 - First parameter
   * @param {string} param.param2 - Second parameter
   * @returns {string} Base64 encoded protobuf
   */
  static #getBase64Protobuf({ param1, param2 }) {
    const field1 = this.#encodeProtobufString(1, param1);
    const field2 = this.#encodeProtobufString(2, param2);
    return Buffer.concat([field1, field2]).toString("base64");
  }
  /**
   * Extracts transcript text from YouTube API response
   * @param {Object} responseData - The YouTube API response
   * @returns {string} Combined transcript text
   */
  static #extractTranscriptFromResponse(responseData) {
    const transcriptRenderer =
      responseData.actions?.[0]?.updateEngagementPanelAction?.content
        ?.transcriptRenderer;
    if (!transcriptRenderer) {
      throw new Error("No transcript data found in response");
    }
    const segments =
      transcriptRenderer.content?.transcriptSearchPanelRenderer?.body
        ?.transcriptSegmentListRenderer?.initialSegments;
    if (!segments) {
      throw new Error("Transcript segments not found in response");
    }
    return segments
      .map((segment) => {
        const runs = segment.transcriptSegmentRenderer?.snippet?.runs;
        return runs ? runs.map((run) => run.text).join("") : "";
      })
      .filter((text) => text)
      .join(" ")
      .trim()
      .replace(/\s+/g, " ");
  }
  /**
   * Fetch transcript from YouTube video
   * @param {string} videoId - Video URL or video identifier
   * @param {Object} config - Configuration options
   * @param {string} [config.lang='en'] - Language code (e.g., 'en', 'es', 'fr')
   * @returns {Promise<string>} Video transcript text
   */
  static async fetchTranscript(videoId, config = {}) {
    // NOTE(review): the lines below carry diff markers (-/+) and mix two
    // revisions of this method (an HTML/XML scraping flow and a
    // get_transcript POST flow) — confirm which lines are live.
    // Normalize a URL or bare ID into the video ID; retrieveVideoId throws a
    // YoutubeTranscriptError when no ID can be extracted.
    const identifier = this.retrieveVideoId(videoId);
    // Fall back to "en" when no language is configured.
    const lang = config?.lang ?? "en";
    try {
-      const transcriptUrl = await fetch(
+      const innerProto = this.#getBase64Protobuf({
-        `https://www.youtube.com/watch?v=${identifier}`,
+        param1: "asr",
        param2: lang,
      });
      // The outer message carries the video ID plus the already
      // base64-encoded inner message.
      const params = this.#getBase64Protobuf({
        param1: identifier,
        param2: innerProto,
      });
      // POST a JSON body (client context + params) to the get_transcript endpoint.
      const response = await fetch(
        "https://www.youtube.com/youtubei/v1/get_transcript",
        {
          method: "POST",
          headers: {
-            "User-Agent": USER_AGENT,
+            "Content-Type": "application/json",
            "User-Agent":
              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)",
          },
          body: JSON.stringify({
            context: {
              client: {
                clientName: "WEB",
                clientVersion: "2.20240826.01.00",
              },
            },
            params,
          }),
        }
      )
        .then((res) => res.text())
        .then((html) => parse(html))
        .then((html) => this.#parseTranscriptEndpoint(html, lang));
      if (!transcriptUrl)
        throw new Error("Failed to locate a transcript for this video!");
      // Result is hopefully some XML.
      const transcriptXML = await fetch(transcriptUrl)
        .then((res) => res.text())
        .then((xml) => parse(xml));
      let transcript = "";
      const chunks = transcriptXML.getElementsByTagName("text");
      for (const chunk of chunks) {
        // Add space after each text chunk
        transcript += chunk.textContent + " ";
      }
      // Trim extra whitespace
      return transcript.trim().replace(/\s+/g, " ");
    } catch (e) {
      // Re-throw any failure from the block above as a YoutubeTranscriptError.
      throw new YoutubeTranscriptError(e);
    }
  }
  /**
   * Picks a caption track URL out of a parsed document.
   * Finds the script element whose text contains
   * "var ytInitialPlayerResponse = {", reads
   * `captions.playerCaptionsTracklistRenderer.captionTracks` from the parsed
   * data, and prefers a track whose `languageCode` includes `langCode`,
   * falling back to the first track.
   * NOTE(review): this span also contains diff-marked (-/+) lines that use
   * `response` / `responseData`, which are not defined in this method — they
   * look like they belong to a different revision; confirm which lines are live.
   * @param {Object} document - Parsed document exposing getElementsByTagName
   * @param {string|null} [langCode=null] - Preferred caption language code
   * @returns {string|undefined|null} `baseUrl` of the chosen caption track, if any
   */
  static #parseTranscriptEndpoint(document, langCode = null) {
    try {
      // Get all script tags on document page
      const scripts = document.getElementsByTagName("script");
      // find the player data script.
      const playerScript = scripts.find((script) =>
        script.textContent.includes("var ytInitialPlayerResponse = {")
      );
-      const dataString =
+      if (!response.ok) {
-        playerScript.textContent
+        throw new Error(`HTTP error! status: ${response.status}`);
-          ?.split("var ytInitialPlayerResponse = ")?.[1] //get the start of the object {....
+      }
          ?.split("};")?.[0] + // chunk off any code after object closure.
        "}"; // add back that curly brace we just cut.
-      const data = JSON.parse(dataString.trim()); // Attempt a JSON parse
+      const responseData = await response.json();
-      const availableCaptions =
+      return this.#extractTranscriptFromResponse(responseData);
        data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
      // If languageCode was specified then search for its code, otherwise get the first.
      let captionTrack = availableCaptions?.[0];
      if (langCode)
        captionTrack =
          availableCaptions.find((track) =>
            track.languageCode.includes(langCode)
          ) ?? availableCaptions?.[0];
      return captionTrack?.baseUrl;
    } catch (e) {
-      console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
+      throw new YoutubeTranscriptError(e.message || e);
      return null;
    }
  }
  /**
-   * Retrieve video id from url or string
+   * Extract video ID from a YouTube URL or verify an existing ID
-   * @param videoId video url or video id
+   * @param {string} videoId - Video URL or ID
   * @returns {string} YouTube video ID
   */
  static retrieveVideoId(videoId) {
-    if (videoId.length === 11) {
+    if (videoId.length === 11) return videoId;
-      return videoId;
+
-    }
+    const RE_YOUTUBE =
      /(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
    const matchId = videoId.match(RE_YOUTUBE);
-    if (matchId && matchId.length) {
+
-      return matchId[1];
+    if (matchId?.[1]) return matchId[1];
    }
    throw new YoutubeTranscriptError(
      "Impossible to retrieve Youtube video ID."
    );