Fix broken YT scraping with YT API (#4005)
* Fix broken YT scraping with YT API
* refactor youtube transcript class/add jsdoc comments
* fix test
---------
Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
0d7a7551b8
commit
d0978fa363
@ -0,0 +1,18 @@
|
|||||||
|
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
|
||||||
|
|
||||||
|
describe("YoutubeTranscript", () => {
|
||||||
|
it("should fetch transcript from YouTube video", async () => {
|
||||||
|
const videoId = "BJjsfNO5JTo";
|
||||||
|
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
|
||||||
|
lang: "en",
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(transcript).toBeDefined();
|
||||||
|
expect(typeof transcript).toBe("string");
|
||||||
|
expect(transcript.length).toBeGreaterThan(0);
|
||||||
|
|
||||||
|
// Log the results for debugging purposes
|
||||||
|
console.log("Success! Transcript length:", transcript.length);
|
||||||
|
console.log("First 200 characters:", transcript.substring(0, 200) + "...");
|
||||||
|
}, 30000); // 30 second timeout for network request
|
||||||
|
});
|
||||||
@ -1,9 +1,3 @@
|
|||||||
const { parse } = require("node-html-parser");
|
|
||||||
const RE_YOUTUBE =
|
|
||||||
/(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
|
|
||||||
const USER_AGENT =
|
|
||||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)";
|
|
||||||
|
|
||||||
class YoutubeTranscriptError extends Error {
|
class YoutubeTranscriptError extends Error {
|
||||||
constructor(message) {
|
constructor(message) {
|
||||||
super(`[YoutubeTranscript] ${message}`);
|
super(`[YoutubeTranscript] ${message}`);
|
||||||
@ -11,100 +5,152 @@ class YoutubeTranscriptError extends Error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class to retrieve transcript if exist
|
* Handles fetching and parsing YouTube video transcripts
|
||||||
*/
|
*/
|
||||||
class YoutubeTranscript {
|
class YoutubeTranscript {
|
||||||
/**
|
/**
|
||||||
* Fetch transcript from YTB Video
|
* Encodes a string as a protobuf field
|
||||||
* @param videoId Video url or video identifier
|
* @param {number} fieldNumber - The protobuf field number
|
||||||
* @param config Object with lang param (eg: en, es, hk, uk) format.
|
* @param {string} str - The string to encode
|
||||||
* Will just the grab first caption if it can find one, so no special lang caption support.
|
* @returns {Buffer} Encoded protobuf field
|
||||||
|
*/
|
||||||
|
static #encodeProtobufString(fieldNumber, str) {
|
||||||
|
const utf8Bytes = Buffer.from(str, "utf8");
|
||||||
|
const tag = (fieldNumber << 3) | 2; // wire type 2 for string
|
||||||
|
const lengthBytes = this.#encodeVarint(utf8Bytes.length);
|
||||||
|
|
||||||
|
return Buffer.concat([
|
||||||
|
Buffer.from([tag]),
|
||||||
|
Buffer.from(lengthBytes),
|
||||||
|
utf8Bytes,
|
||||||
|
]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Encodes a number as a protobuf varint
|
||||||
|
* @param {number} value - The number to encode
|
||||||
|
* @returns {number[]} Encoded varint bytes
|
||||||
|
*/
|
||||||
|
static #encodeVarint(value) {
|
||||||
|
const bytes = [];
|
||||||
|
while (value >= 0x80) {
|
||||||
|
bytes.push((value & 0x7f) | 0x80);
|
||||||
|
value >>>= 7;
|
||||||
|
}
|
||||||
|
bytes.push(value);
|
||||||
|
return bytes;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a base64 encoded protobuf message
|
||||||
|
* @param {Object} param - The parameters to encode
|
||||||
|
* @param {string} param.param1 - First parameter
|
||||||
|
* @param {string} param.param2 - Second parameter
|
||||||
|
* @returns {string} Base64 encoded protobuf
|
||||||
|
*/
|
||||||
|
static #getBase64Protobuf({ param1, param2 }) {
|
||||||
|
const field1 = this.#encodeProtobufString(1, param1);
|
||||||
|
const field2 = this.#encodeProtobufString(2, param2);
|
||||||
|
return Buffer.concat([field1, field2]).toString("base64");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Extracts transcript text from YouTube API response
|
||||||
|
* @param {Object} responseData - The YouTube API response
|
||||||
|
* @returns {string} Combined transcript text
|
||||||
|
*/
|
||||||
|
static #extractTranscriptFromResponse(responseData) {
|
||||||
|
const transcriptRenderer =
|
||||||
|
responseData.actions?.[0]?.updateEngagementPanelAction?.content
|
||||||
|
?.transcriptRenderer;
|
||||||
|
if (!transcriptRenderer) {
|
||||||
|
throw new Error("No transcript data found in response");
|
||||||
|
}
|
||||||
|
|
||||||
|
const segments =
|
||||||
|
transcriptRenderer.content?.transcriptSearchPanelRenderer?.body
|
||||||
|
?.transcriptSegmentListRenderer?.initialSegments;
|
||||||
|
if (!segments) {
|
||||||
|
throw new Error("Transcript segments not found in response");
|
||||||
|
}
|
||||||
|
|
||||||
|
return segments
|
||||||
|
.map((segment) => {
|
||||||
|
const runs = segment.transcriptSegmentRenderer?.snippet?.runs;
|
||||||
|
return runs ? runs.map((run) => run.text).join("") : "";
|
||||||
|
})
|
||||||
|
.filter((text) => text)
|
||||||
|
.join(" ")
|
||||||
|
.trim()
|
||||||
|
.replace(/\s+/g, " ");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fetch transcript from YouTube video
|
||||||
|
* @param {string} videoId - Video URL or video identifier
|
||||||
|
* @param {Object} config - Configuration options
|
||||||
|
* @param {string} [config.lang='en'] - Language code (e.g., 'en', 'es', 'fr')
|
||||||
|
* @returns {Promise<string>} Video transcript text
|
||||||
*/
|
*/
|
||||||
static async fetchTranscript(videoId, config = {}) {
|
static async fetchTranscript(videoId, config = {}) {
|
||||||
const identifier = this.retrieveVideoId(videoId);
|
const identifier = this.retrieveVideoId(videoId);
|
||||||
const lang = config?.lang ?? "en";
|
const lang = config?.lang ?? "en";
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const transcriptUrl = await fetch(
|
const innerProto = this.#getBase64Protobuf({
|
||||||
`https://www.youtube.com/watch?v=${identifier}`,
|
param1: "asr",
|
||||||
|
param2: lang,
|
||||||
|
});
|
||||||
|
const params = this.#getBase64Protobuf({
|
||||||
|
param1: identifier,
|
||||||
|
param2: innerProto,
|
||||||
|
});
|
||||||
|
|
||||||
|
const response = await fetch(
|
||||||
|
"https://www.youtube.com/youtubei/v1/get_transcript",
|
||||||
{
|
{
|
||||||
|
method: "POST",
|
||||||
headers: {
|
headers: {
|
||||||
"User-Agent": USER_AGENT,
|
"Content-Type": "application/json",
|
||||||
|
"User-Agent":
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36,gzip(gfe)",
|
||||||
},
|
},
|
||||||
|
body: JSON.stringify({
|
||||||
|
context: {
|
||||||
|
client: {
|
||||||
|
clientName: "WEB",
|
||||||
|
clientVersion: "2.20240826.01.00",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
params,
|
||||||
|
}),
|
||||||
}
|
}
|
||||||
)
|
|
||||||
.then((res) => res.text())
|
|
||||||
.then((html) => parse(html))
|
|
||||||
.then((html) => this.#parseTranscriptEndpoint(html, lang));
|
|
||||||
|
|
||||||
if (!transcriptUrl)
|
|
||||||
throw new Error("Failed to locate a transcript for this video!");
|
|
||||||
|
|
||||||
// Result is hopefully some XML.
|
|
||||||
const transcriptXML = await fetch(transcriptUrl)
|
|
||||||
.then((res) => res.text())
|
|
||||||
.then((xml) => parse(xml));
|
|
||||||
|
|
||||||
let transcript = "";
|
|
||||||
const chunks = transcriptXML.getElementsByTagName("text");
|
|
||||||
for (const chunk of chunks) {
|
|
||||||
// Add space after each text chunk
|
|
||||||
transcript += chunk.textContent + " ";
|
|
||||||
}
|
|
||||||
|
|
||||||
// Trim extra whitespace
|
|
||||||
return transcript.trim().replace(/\s+/g, " ");
|
|
||||||
} catch (e) {
|
|
||||||
throw new YoutubeTranscriptError(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static #parseTranscriptEndpoint(document, langCode = null) {
|
|
||||||
try {
|
|
||||||
// Get all script tags on document page
|
|
||||||
const scripts = document.getElementsByTagName("script");
|
|
||||||
|
|
||||||
// find the player data script.
|
|
||||||
const playerScript = scripts.find((script) =>
|
|
||||||
script.textContent.includes("var ytInitialPlayerResponse = {")
|
|
||||||
);
|
);
|
||||||
|
|
||||||
const dataString =
|
if (!response.ok) {
|
||||||
playerScript.textContent
|
throw new Error(`HTTP error! status: ${response.status}`);
|
||||||
?.split("var ytInitialPlayerResponse = ")?.[1] //get the start of the object {....
|
}
|
||||||
?.split("};")?.[0] + // chunk off any code after object closure.
|
|
||||||
"}"; // add back that curly brace we just cut.
|
|
||||||
|
|
||||||
const data = JSON.parse(dataString.trim()); // Attempt a JSON parse
|
const responseData = await response.json();
|
||||||
const availableCaptions =
|
return this.#extractTranscriptFromResponse(responseData);
|
||||||
data?.captions?.playerCaptionsTracklistRenderer?.captionTracks || [];
|
|
||||||
|
|
||||||
// If languageCode was specified then search for it's code, otherwise get the first.
|
|
||||||
let captionTrack = availableCaptions?.[0];
|
|
||||||
if (langCode)
|
|
||||||
captionTrack =
|
|
||||||
availableCaptions.find((track) =>
|
|
||||||
track.languageCode.includes(langCode)
|
|
||||||
) ?? availableCaptions?.[0];
|
|
||||||
|
|
||||||
return captionTrack?.baseUrl;
|
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.error(`YoutubeTranscript.#parseTranscriptEndpoint ${e.message}`);
|
throw new YoutubeTranscriptError(e.message || e);
|
||||||
return null;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieve video id from url or string
|
* Extract video ID from a YouTube URL or verify an existing ID
|
||||||
* @param videoId video url or video id
|
* @param {string} videoId - Video URL or ID
|
||||||
|
* @returns {string} YouTube video ID
|
||||||
*/
|
*/
|
||||||
static retrieveVideoId(videoId) {
|
static retrieveVideoId(videoId) {
|
||||||
if (videoId.length === 11) {
|
if (videoId.length === 11) return videoId;
|
||||||
return videoId;
|
|
||||||
}
|
const RE_YOUTUBE =
|
||||||
|
/(?:youtube\.com\/(?:[^\/]+\/.+\/|(?:v|e(?:mbed)?)\/|.*[?&]v=)|youtu\.be\/)([^"&?\/\s]{11})/i;
|
||||||
const matchId = videoId.match(RE_YOUTUBE);
|
const matchId = videoId.match(RE_YOUTUBE);
|
||||||
if (matchId && matchId.length) {
|
|
||||||
return matchId[1];
|
if (matchId?.[1]) return matchId[1];
|
||||||
}
|
|
||||||
throw new YoutubeTranscriptError(
|
throw new YoutubeTranscriptError(
|
||||||
"Impossible to retrieve Youtube video ID."
|
"Impossible to retrieve Youtube video ID."
|
||||||
);
|
);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user