Fix broken YT scraping with YT API (#4005)

* Fix broken YT scraping with YT API
* Refactor YouTube transcript class and add JSDoc comments
* Fix test

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: timothycarambat <rambat1010@gmail.com>
2025-07-07 16:06:18 -04:00 · 2025-07-07 16:06:18 -04:00 · d0978fa363
commit d0978fa363
parent 0d7a7551b8
2 changed files with 141 additions and 77 deletions
--- a/collector/tests/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js
+++ b/collector/tests/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js
@ -0,0 +1,18 @@
+const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
+
+describe("YoutubeTranscript", () => { // NOTE: live integration test - fetchTranscript issues a real request to youtube.com, so network access is required
+  it("should fetch transcript from YouTube video", async () => {
+    const videoId = "BJjsfNO5JTo";
+    const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
+      lang: "en",
+    });
+
+    expect(transcript).toBeDefined(); // only shape is asserted (non-empty string), not the transcript content
+    expect(typeof transcript).toBe("string");
+    expect(transcript.length).toBeGreaterThan(0);
+
+    // Log the transcript length and a 200-character preview for debugging purposes
+    console.log("Success! Transcript length:", transcript.length);
+    console.log("First 200 characters:", transcript.substring(0, 200) + "...");
+  }, 30000); // 30 second timeout for network request
+});
--- a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
+++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
@ -1,9 +1,3 @@
-const { parse } = require("node-html-parser");
-const RE_YOUTUBE =
-  /(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
-const USER_AGENT =
-  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";
-
 class YoutubeTranscriptError extends Error {
  constructor(message) {
    super(`[YoutubeTranscript] ${message}`);
@ -11,100 +5,152 @@ class YoutubeTranscriptError extends Error {
 }

 /**
- * Class to retrieve transcript if exist
+ * Handles fetching and parsing YouTube video transcripts
 */
 class YoutubeTranscript {
  /**
-   * Fetch transcript from YTB Video
-   * @param videoId Video url or video identifier
-   * @param config Object with lang param (eg: en, es, hk, uk) format.
-   * Will just the grab first caption if it can find one, so no special lang caption support.
+   * Encodes a string as a length-delimited protobuf field: one tag byte,
+   * then the varint-encoded UTF-8 byte length, then the UTF-8 bytes.
+   * The tag is written as a single byte rather than a varint, so the output
+   * is only a valid protobuf encoding for field numbers below 16.
+   * @param {number} fieldNumber - The protobuf field number
+   * @param {string} str - The string to encode
+   * @returns {Buffer} Encoded protobuf field
+   */
+  static #encodeProtobufString(fieldNumber, str) {
+    const utf8Bytes = Buffer.from(str, "utf8");
+    const tag = (fieldNumber << 3) | 2; // wire type 2 for string
+    const lengthBytes = this.#encodeVarint(utf8Bytes.length); // length is counted in bytes, not characters
+
+    return Buffer.concat([
+      Buffer.from([tag]),
+      Buffer.from(lengthBytes),
+      utf8Bytes,
+    ]);
+  }
+
+  /**
+   * Encodes a number as a protobuf varint: 7 bits per byte, least-significant
+   * group first, with the high bit (0x80) set on every byte except the last.
+   * NOTE(review): uses the 32-bit bitwise operators `&` and `>>>`, so values
+   * of 2^32 or more would not encode correctly; the visible caller only
+   * passes a buffer length.
+   * @param {number} value - The non-negative integer to encode
+   * @returns {number[]} Encoded varint bytes
+   */
+  static #encodeVarint(value) {
+    const bytes = [];
+    while (value >= 0x80) { // more than 7 bits remain
+      bytes.push((value & 0x7f) | 0x80); // low 7 bits plus continuation flag
+      value >>>= 7;
+    }
+    bytes.push(value); // final byte, continuation flag clear
+    return bytes;
+  }
+
+  /**
+   * Creates a base64 encoded protobuf message made of two string fields:
+   * `param1` is written as field 1 and `param2` as field 2, in that order.
+   * @param {Object} param - The parameters to encode
+   * @param {string} param.param1 - Value encoded as protobuf field 1
+   * @param {string} param.param2 - Value encoded as protobuf field 2
+   * @returns {string} Base64 encoded protobuf
+   */
+  static #getBase64Protobuf({ param1, param2 }) {
+    const field1 = this.#encodeProtobufString(1, param1);
+    const field2 = this.#encodeProtobufString(2, param2);
+    return Buffer.concat([field1, field2]).toString("base64");
+  }
+
+  /**
+   * Extracts transcript text from YouTube API response.
+   * Reads the transcript renderer from the first entry of `actions`, then
+   * concatenates the `runs` text of each initial segment. Segments that
+   * yield no text are dropped; the rest are joined with single spaces and
+   * runs of whitespace are collapsed to one space.
+   * @param {Object} responseData - The YouTube API response
+   * @returns {string} Combined transcript text
+   * @throws {Error} If the transcript renderer or its segment list is missing
+   */
+  static #extractTranscriptFromResponse(responseData) {
+    const transcriptRenderer =
+      responseData.actions?.[0]?.updateEngagementPanelAction?.content
+        ?.transcriptRenderer;
+    if (!transcriptRenderer) {
+      throw new Error("No transcript data found in response");
+    }
+
+    const segments =
+      transcriptRenderer.content?.transcriptSearchPanelRenderer?.body
+        ?.transcriptSegmentListRenderer?.initialSegments;
+    if (!segments) {
+      throw new Error("Transcript segments not found in response");
+    }
+
+    return segments
+      .map((segment) => {
+        const runs = segment.transcriptSegmentRenderer?.snippet?.runs;
+        return runs ? runs.map((run) => run.text).join("") : ""; // segments without runs become ""
+      })
+      .filter((text) => text) // drop segments that produced no text
+      .join(" ")
+      .trim()
+      .replace(/\s+/g, " "); // collapse whitespace runs (including newlines) to a single space
+  }
+
+  /**
+   * Fetch transcript from YouTube video
+   * @param {string} videoId - Video URL or video identifier
+   * @param {Object} config - Configuration options
+   * @param {string} [config.lang='en'] - Language code (e.g., 'en', 'es', 'fr')
+   * @returns {Promise<string>} Video transcript text
   */
  static async fetchTranscript(videoId, config = {}) {
    const identifier = this.retrieveVideoId(videoId);
    const lang = config?.lang ?? "en";
+
    try {
-      const transcriptUrl = await fetch(
-        `https://www.youtube.com/watch?v=${identifier}`,
+      const innerProto = this.#getBase64Protobuf({
+        param1: "asr",
+        param2: lang,
+      });
+      const params = this.#getBase64Protobuf({
+        param1: identifier,
+        param2: innerProto,
+      });
+
+      const response = await fetch(
+        "https://www.youtube.com/youtubei/v1/get_transcript",
        {
+          method: "POST",
          headers: {
-            "User-Agent": USER_AGENT,
+            "Content-Type": "application/json",
+            "User-Agent":
+              "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)",
          },
+          body: JSON.stringify({
+            context: {
+              client: {
+                clientName: "WEB",
+                clientVersion: "2.20240826.01.00",
+              },
+            },
+            params,
+          }),
        }
-      )
-        .then((res) => res.text())
-        .then((html) => parse(html))
-        .then((html) => this.#parseTranscriptEndpoint(html, lang));
-
-      if (!transcriptUrl)
-        throw new Error("Failed to locate a transcript for this video!");
-
-      // Result is hopefully some XML.
-      const transcriptXML = await fetch(transcriptUrl)
-        .then((res) => res.text())
-        .then((xml) => parse(xml));
-
-      let transcript = "";
-      const chunks = transcriptXML.getElementsByTagName("text");
-      for (const chunk of chunks) {
-        // Add space after each text chunk
-        transcript += chunk.textContent + " ";
-      }
-
-      // Trim extra whitespace
-      return transcript.trim().replace(/\s+/g, " ");
-    } catch (e) {
-      throw new YoutubeTranscriptError(e);
-    }
-  }
-
-  static #parseTranscriptEndpoint(document, langCode = null) {
-    try {
-      // Get all script tags on document page
-      const scripts = document.getElementsByTagName("script");
-
-      // find the player data script.
-      const playerScript = scripts.find((script) =>
-        script.textContent.includes("var ytInitialPlayerResponse = {")
      );

-      const dataString =
-        playerScript.textContent
-          ?.split("var ytInitialPlayerResponse = ")?.[1] //get the start of the object {....
-          ?.split("};")?.[0] + // chunk off any code after object closure.
-        "}"; // add back that curly brace we just cut.
+      if (!response.ok) {
+        throw new Error(`HTTP error! status: ${response.status}`);
+      }

-      const data = JSON.parse(dataString.trim()); // Attempt a JSON parse
-      const availableCaptions =
-        data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
-
-      // If languageCode was specified then search for it's code, otherwise get the first.
-      let captionTrack = availableCaptions?.[0];
-      if (langCode)
-        captionTrack =
-          availableCaptions.find((track) =>
-            track.languageCode.includes(langCode)
-          ) ?? availableCaptions?.[0];
-
-      return captionTrack?.baseUrl;
+      const responseData = await response.json();
+      return this.#extractTranscriptFromResponse(responseData);
    } catch (e) {
-      console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
-      return null;
+      throw new YoutubeTranscriptError(e.message || e);
    }
  }

  /**
-   * Retrieve video id from url or string
-   * @param videoId video url or video id
+   * Extract video ID from a YouTube URL or verify an existing ID
+   * @param {string} videoId - Video URL or ID
+   * @returns {string} YouTube video ID
   */
  static retrieveVideoId(videoId) {
-    if (videoId.length === 11) {
-      return videoId;
-    }
+    if (videoId.length === 11) return videoId;
+
+    const RE_YOUTUBE =
+      /(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
    const matchId = videoId.match(RE_YOUTUBE);
-    if (matchId && matchId.length) {
-      return matchId[1];
-    }
+
+    if (matchId?.[1]) return matchId[1];
    throw new YoutubeTranscriptError(
      "Impossible to retrieve Youtube video ID."
    );