fix: YouTube transcript collector does not work well with non-English or non-ASR captions (#4442)
* fix: YouTube transcript collector does not work well with non-English or non-ASR captions * stub YouTube test in GitHub Actions --------- Co-authored-by: Timothy Carambat <rambat1010@gmail.com>
This commit is contained in:
parent
c8f13d5f27
commit
8fc1f24d1b
@ -1,16 +1,32 @@
|
||||
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
|
||||
|
||||
describe("YoutubeTranscript", () => {
|
||||
it("should fetch transcript from YouTube video", async () => {
|
||||
const videoId = "BJjsfNO5JTo";
|
||||
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
|
||||
lang: "en",
|
||||
});
|
||||
if (process.env.GITHUB_ACTIONS) {
|
||||
console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve.");
|
||||
it('is stubbed in GitHub Actions', () => expect(true).toBe(true));
|
||||
} else {
|
||||
it("should fetch transcript from YouTube video", async () => {
|
||||
const videoId = "BJjsfNO5JTo";
|
||||
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
|
||||
lang: "en",
|
||||
});
|
||||
|
||||
expect(transcript).toBeDefined();
|
||||
expect(typeof transcript).toBe("string");
|
||||
expect(transcript.length).toBeGreaterThan(0);
|
||||
// console.log("Success! Transcript length:", transcript.length);
|
||||
// console.log("First 200 characters:", transcript.substring(0, 200) + "...");
|
||||
}, 30000);
|
||||
expect(transcript).toBeDefined();
|
||||
expect(typeof transcript).toBe("string");
|
||||
expect(transcript.length).toBeGreaterThan(0);
|
||||
console.log("First 200 characters:", transcript.substring(0, 200) + "...");
|
||||
}, 30000);
|
||||
|
||||
it("should fetch non asr transcript from YouTube video", async () => {
|
||||
const videoId = "D111ao6wWH0";
|
||||
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
|
||||
lang: "zh-HK",
|
||||
});
|
||||
|
||||
expect(transcript).toBeDefined();
|
||||
expect(typeof transcript).toBe("string");
|
||||
expect(transcript.length).toBeGreaterThan(0);
|
||||
console.log("First 200 characters:", transcript.substring(0, 200) + "...");
|
||||
}, 30000);
|
||||
}
|
||||
});
|
||||
|
||||
@ -85,6 +85,85 @@ class YoutubeTranscript {
|
||||
.replace(/\s+/g, " ");
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates a preference score for a caption track to determine the best match
|
||||
* @param {Object} track - The caption track object from YouTube
|
||||
* @param {string} track.languageCode - ISO language code (e.g., 'zh-HK', 'en', 'es')
|
||||
* @param {string} track.kind - Track type ('asr' for auto-generated, "" for human-transcribed)
|
||||
* @param {string[]} preferredLanguages - Array of language codes in preference order (e.g., ['zh-HK', 'en'])
|
||||
* @returns {number} Preference score (lower is better)
|
||||
*/
|
||||
static #calculatePreferenceScore(track, preferredLanguages) {
|
||||
// Language preference: index in preferredLanguages array (0 = most preferred)
|
||||
const languagePreference = preferredLanguages.indexOf(track.languageCode);
|
||||
const languageScore = languagePreference === -1 ? 9999 : languagePreference;
|
||||
|
||||
// Kind bonus: prefer human-transcribed (undefined) over auto-generated ('asr')
|
||||
const kindBonus = track.kind === "asr" ? 0.5 : 0;
|
||||
|
||||
return languageScore + kindBonus;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finds the most suitable caption track based on preferred languages
|
||||
* @param {string} videoBody - The raw HTML response from YouTube
|
||||
* @param {string[]} preferredLanguages - Array of language codes in preference order
|
||||
* @returns {Object|null} The selected caption track or null if none found
|
||||
*/
|
||||
static #findPreferredCaptionTrack(videoBody, preferredLanguages) {
|
||||
const captionsConfigJson = videoBody.match(
|
||||
/"captions":(.*?),"videoDetails":/s
|
||||
);
|
||||
|
||||
const captionsConfig = captionsConfigJson?.[1]
|
||||
? JSON.parse(captionsConfigJson[1])
|
||||
: null;
|
||||
|
||||
const captionTracks = captionsConfig
|
||||
? captionsConfig.playerCaptionsTracklistRenderer.captionTracks
|
||||
: null;
|
||||
|
||||
if (!captionTracks || captionTracks.length === 0) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const sortedTracks = [...captionTracks].sort((a, b) => {
|
||||
const scoreA = this.#calculatePreferenceScore(a, preferredLanguages);
|
||||
const scoreB = this.#calculatePreferenceScore(b, preferredLanguages);
|
||||
return scoreA - scoreB;
|
||||
});
|
||||
|
||||
return sortedTracks[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetches video page content and finds the preferred caption track
|
||||
* @param {string} videoId - YouTube video ID
|
||||
* @param {string[]} preferredLanguages - Array of preferred language codes
|
||||
* @returns {Promise<Object>} The preferred caption track
|
||||
* @throws {YoutubeTranscriptError} If no suitable caption track is found
|
||||
*/
|
||||
static async #getPreferredCaptionTrack(videoId, preferredLanguages) {
|
||||
const videoResponse = await fetch(
|
||||
`https://www.youtube.com/watch?v=${videoId}`,
|
||||
{ credentials: "omit" }
|
||||
);
|
||||
const videoBody = await videoResponse.text();
|
||||
|
||||
const preferredCaptionTrack = this.#findPreferredCaptionTrack(
|
||||
videoBody,
|
||||
preferredLanguages
|
||||
);
|
||||
|
||||
if (!preferredCaptionTrack) {
|
||||
throw new YoutubeTranscriptError(
|
||||
"No suitable caption track found for the video"
|
||||
);
|
||||
}
|
||||
|
||||
return preferredCaptionTrack;
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch transcript from YouTube video
|
||||
* @param {string} videoId - Video URL or video identifier
|
||||
@ -93,14 +172,20 @@ class YoutubeTranscript {
|
||||
* @returns {Promise<string>} Video transcript text
|
||||
*/
|
||||
static async fetchTranscript(videoId, config = {}) {
|
||||
const preferredLanguages = config?.lang ? [config?.lang, "en"] : ["en"];
|
||||
const identifier = this.retrieveVideoId(videoId);
|
||||
const lang = config?.lang ?? "en";
|
||||
|
||||
try {
|
||||
const preferredCaptionTrack = await this.#getPreferredCaptionTrack(
|
||||
identifier,
|
||||
preferredLanguages
|
||||
);
|
||||
|
||||
const innerProto = this.#getBase64Protobuf({
|
||||
param1: "asr",
|
||||
param2: lang,
|
||||
param1: preferredCaptionTrack.kind || "",
|
||||
param2: preferredCaptionTrack.languageCode,
|
||||
});
|
||||
|
||||
const params = this.#getBase64Protobuf({
|
||||
param1: identifier,
|
||||
param2: innerProto,
|
||||
|
||||
Loading…
Reference in New Issue
Block a user