fix: YouTube transcript collector does not work well with non-English or non-ASR captions (#4442)

* fix: YouTube transcript collector does not work well with non-English or non-ASR captions * stub YT test in GitHub Actions --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
2025-09-30 04:22:50 +08:00 · 2025-09-30 04:22:50 +08:00 · 8fc1f24d1b
commit 8fc1f24d1b
parent c8f13d5f27
2 changed files with 115 additions and 14 deletions
--- a/collector/tests/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js
+++ b/collector/tests/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js
@ -1,16 +1,32 @@
 const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");

 describe("YoutubeTranscript", () => {
-  it("should fetch transcript from YouTube video", async () => {
-    const videoId = "BJjsfNO5JTo";
-    const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
-      lang: "en",
-    });
+  if (process.env.GITHUB_ACTIONS) {
+    console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve.");
+    it('is stubbed in GitHub Actions', () => expect(true).toBe(true));
+  } else {
+    it("should fetch transcript from YouTube video", async () => {
+      const videoId = "BJjsfNO5JTo";
+      const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
+        lang: "en",
+      });

-    expect(transcript).toBeDefined();
-    expect(typeof transcript).toBe("string");
-    expect(transcript.length).toBeGreaterThan(0);
-    // console.log("Success! Transcript length:", transcript.length);
-    // console.log("First 200 characters:", transcript.substring(0, 200) + "...");
-  }, 30000);
+      expect(transcript).toBeDefined();
+      expect(typeof transcript).toBe("string");
+      expect(transcript.length).toBeGreaterThan(0);
+      console.log("First 200 characters:", transcript.substring(0, 200) + "...");
+    }, 30000);
+
+    it("should fetch non asr transcript from YouTube video", async () => {
+      const videoId = "D111ao6wWH0";
+      const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
+        lang: "zh-HK",
+      });
+
+      expect(transcript).toBeDefined();
+      expect(typeof transcript).toBe("string");
+      expect(transcript.length).toBeGreaterThan(0);
+      console.log("First 200 characters:", transcript.substring(0, 200) + "...");
+    }, 30000);
+  }
 });
--- a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
+++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js
@ -85,6 +85,85 @@ class YoutubeTranscript {
      .replace(/\s+/g, " ");
  }

+  /**
+   * Calculates a preference score for a caption track to determine the best match
+   * @param {Object} track - The caption track object from YouTube
+   * @param {string} track.languageCode - ISO language code (e.g., 'zh-HK', 'en', 'es')
+   * @param {string} track.kind - Track type ('asr' for auto-generated, "" for human-transcribed)
+   * @param {string[]} preferredLanguages - Array of language codes in preference order (e.g., ['zh-HK', 'en'])
+   * @returns {number} Preference score (lower is better)
+   */
+  static #calculatePreferenceScore(track, preferredLanguages) {
+    // Language preference: index in preferredLanguages array (0 = most preferred)
+    const languagePreference = preferredLanguages.indexOf(track.languageCode);
+    const languageScore = languagePreference === -1 ? 9999 : languagePreference;
+
+    // Kind bonus: prefer human-transcribed (undefined) over auto-generated ('asr')
+    const kindBonus = track.kind === "asr" ? 0.5 : 0;
+
+    return languageScore + kindBonus;
+  }
+
+  /**
+   * Finds the most suitable caption track based on preferred languages
+   * @param {string} videoBody - The raw HTML response from YouTube
+   * @param {string[]} preferredLanguages - Array of language codes in preference order
+   * @returns {Object|null} The selected caption track or null if none found
+   */
+  static #findPreferredCaptionTrack(videoBody, preferredLanguages) {
+    const captionsConfigJson = videoBody.match(
+      /"captions":(.*?),"videoDetails":/s
+    );
+
+    const captionsConfig = captionsConfigJson?.[1]
+      ? JSON.parse(captionsConfigJson[1])
+      : null;
+
+    const captionTracks = captionsConfig
+      ? captionsConfig.playerCaptionsTracklistRenderer.captionTracks
+      : null;
+
+    if (!captionTracks || captionTracks.length === 0) {
+      return null;
+    }
+
+    const sortedTracks = [...captionTracks].sort((a, b) => {
+      const scoreA = this.#calculatePreferenceScore(a, preferredLanguages);
+      const scoreB = this.#calculatePreferenceScore(b, preferredLanguages);
+      return scoreA - scoreB;
+    });
+
+    return sortedTracks[0];
+  }
+
+  /**
+   * Fetches video page content and finds the preferred caption track
+   * @param {string} videoId - YouTube video ID
+   * @param {string[]} preferredLanguages - Array of preferred language codes
+   * @returns {Promise<Object>} The preferred caption track
+   * @throws {YoutubeTranscriptError} If no suitable caption track is found
+   */
+  static async #getPreferredCaptionTrack(videoId, preferredLanguages) {
+    const videoResponse = await fetch(
+      `https://www.youtube.com/watch?v=${videoId}`,
+      { credentials: "omit" }
+    );
+    const videoBody = await videoResponse.text();
+
+    const preferredCaptionTrack = this.#findPreferredCaptionTrack(
+      videoBody,
+      preferredLanguages
+    );
+
+    if (!preferredCaptionTrack) {
+      throw new YoutubeTranscriptError(
+        "No suitable caption track found for the video"
+      );
+    }
+
+    return preferredCaptionTrack;
+  }
+
  /**
   * Fetch transcript from YouTube video
   * @param {string} videoId - Video URL or video identifier
@ -93,14 +172,20 @@ class YoutubeTranscript {
   * @returns {Promise<string>} Video transcript text
   */
  static async fetchTranscript(videoId, config = {}) {
+    const preferredLanguages = config?.lang ? [config?.lang, "en"] : ["en"];
    const identifier = this.retrieveVideoId(videoId);
-    const lang = config?.lang ?? "en";

    try {
+      const preferredCaptionTrack = await this.#getPreferredCaptionTrack(
+        identifier,
+        preferredLanguages
+      );
+
      const innerProto = this.#getBase64Protobuf({
-        param1: "asr",
-        param2: lang,
+        param1: preferredCaptionTrack.kind || "",
+        param2: preferredCaptionTrack.languageCode,
      });
+
      const params = this.#getBase64Protobuf({
        param1: identifier,
        param2: innerProto,