Upgrade YT Scraper (#4820)

This commit is contained in:
Timothy Carambat 2026-01-02 15:41:22 -08:00 committed by GitHub
parent b2f49b6036
commit 092b1b45f8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 23 additions and 38 deletions

View File

@ -6,7 +6,7 @@ concurrency:
on: on:
push: push:
branches: ['fix-scraper-esm-bug'] # put your current branch to create a build. Core team only. branches: ['upgrade-yt-scraper'] # put your current branch to create a build. Core team only.
paths-ignore: paths-ignore:
- '**.md' - '**.md'
- 'cloud-deployments/*' - 'cloud-deployments/*'

View File

@ -1,33 +0,0 @@
// Jest suite for the YoutubeTranscript fetcher: each case requests a
// transcript for a fixed video id/language and checks that a non-empty
// string comes back.
process.env.STORAGE_DIR = "test-storage"; // needed for tests to run
const {
  YoutubeTranscript,
} = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");

// Per-test timeout (ms) handed to Jest for each transcript fetch.
const FETCH_TIMEOUT_MS = 30000;

// One entry per test; the assertions are the same for every case.
const TRANSCRIPT_CASES = [
  {
    title: "should fetch transcript from YouTube video",
    videoId: "BJjsfNO5JTo",
    lang: "en",
  },
  {
    title: "should fetch non asr transcript from YouTube video",
    videoId: "D111ao6wWH0",
    lang: "zh-HK",
  },
];

describe("YoutubeTranscript", () => {
  // In GitHub Actions only a placeholder test is registered so the suite is
  // not empty; the real fetches are skipped there.
  if (process.env.GITHUB_ACTIONS) {
    console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve.");
    it('is stubbed in GitHub Actions', () => expect(true).toBe(true));
    return;
  }

  for (const { title, videoId, lang } of TRANSCRIPT_CASES) {
    it(title, async () => {
      const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
        lang,
      });

      expect(transcript).toBeDefined();
      expect(typeof transcript).toBe("string");
      expect(transcript.length).toBeGreaterThan(0);
      console.log("First 200 characters:", transcript.substring(0, 200) + "...");
    }, FETCH_TIMEOUT_MS);
  }
});

View File

@ -44,6 +44,7 @@
"uuid": "^9.0.0", "uuid": "^9.0.0",
"wavefile": "^11.0.0", "wavefile": "^11.0.0",
"winston": "^3.13.0", "winston": "^3.13.0",
"youtube-transcript-plus": "^1.1.2",
"youtubei.js": "^9.1.0" "youtubei.js": "^9.1.0"
}, },
"devDependencies": { "devDependencies": {

View File

@ -54,13 +54,15 @@ class YoutubeLoader {
source: this.#videoId, source: this.#videoId,
}; };
try { try {
const { YoutubeTranscript } = require("./youtube-transcript"); const fetchTranscript = await import("youtube-transcript-plus").then(
transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, { (module) => module.fetchTranscript
);
const transcriptSegments = await fetchTranscript(this.#videoId, {
lang: this.#language, lang: this.#language,
}); });
if (!transcript) { if (!transcriptSegments || transcriptSegments.length === 0)
throw new Error("Transcription not found"); throw new Error("Transcription not found");
} transcript = this.#convertTranscriptSegmentsToText(transcriptSegments);
if (this.#addVideoInfo) { if (this.#addVideoInfo) {
const { Innertube } = require("youtubei.js"); const { Innertube } = require("youtubei.js");
const youtube = await Innertube.create(); const youtube = await Innertube.create();
@ -82,6 +84,16 @@ class YoutubeLoader {
}, },
]; ];
} }
#convertTranscriptSegmentsToText(transcriptSegments) {
return transcriptSegments
.map((segment) =>
typeof segment === "string" ? segment : segment.text || ""
)
.join(" ")
.replace(/\s+/g, " ")
.trim();
}
} }
module.exports.YoutubeLoader = YoutubeLoader; module.exports.YoutubeLoader = YoutubeLoader;

View File

@ -3908,6 +3908,11 @@ yauzl@^2.10.0, yauzl@^2.4.2:
buffer-crc32 "~0.2.3" buffer-crc32 "~0.2.3"
fd-slicer "~1.1.0" fd-slicer "~1.1.0"
youtube-transcript-plus@^1.1.2:
version "1.1.2"
resolved "https://registry.yarnpkg.com/youtube-transcript-plus/-/youtube-transcript-plus-1.1.2.tgz#f86851852a056088c11f4f6523ab0f8dba7d9711"
integrity sha512-bLlqkA6gVVUorZpcc+THuECXyAwOpnHqW2lOav9g6gGovxAP3FCD8s9GBFVjmSl3cWWwwPPXtG/zY1nD+GvQ7A==
youtubei.js@^9.1.0: youtubei.js@^9.1.0:
version "9.4.0" version "9.4.0"
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.4.0.tgz#ccccaf4a295b96e3e17134a66730bbc82461594b" resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.4.0.tgz#ccccaf4a295b96e3e17134a66730bbc82461594b"