Upgrade YT Scraper (#4820)

This commit is contained in:
Timothy Carambat 2026-01-02 15:41:22 -08:00 committed by GitHub
parent b2f49b6036
commit 092b1b45f8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 23 additions and 38 deletions

View File

@ -6,7 +6,7 @@ concurrency:
on:
push:
branches: ['fix-scraper-esm-bug'] # put your current branch to create a build. Core team only.
branches: ['upgrade-yt-scraper'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'

View File

@ -1,33 +0,0 @@
// Required for the tests to run; assigned before the module under test is loaded.
process.env.STORAGE_DIR = "test-storage";
const {
  YoutubeTranscript,
} = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");

/**
 * Transcript-fetching tests for YoutubeTranscript.
 * When GITHUB_ACTIONS is set, a single stub test is registered instead,
 * because (per the log message below) the URLs will not resolve there.
 */
describe("YoutubeTranscript", () => {
  if (process.env.GITHUB_ACTIONS) {
    console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve.");
    it('is stubbed in GitHub Actions', () => expect(true).toBe(true));
    return;
  }

  // Per-test timeout in milliseconds.
  const TIMEOUT_MS = 30000;

  // One entry per video/language combination to exercise.
  const scenarios = [
    {
      title: "should fetch transcript from YouTube video",
      videoId: "BJjsfNO5JTo",
      lang: "en",
    },
    {
      title: "should fetch non asr transcript from YouTube video",
      videoId: "D111ao6wWH0",
      lang: "zh-HK",
    },
  ];

  for (const { title, videoId, lang } of scenarios) {
    it(title, async () => {
      const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
        lang,
      });

      // The transcript must be a non-empty string.
      expect(transcript).toBeDefined();
      expect(typeof transcript).toBe("string");
      expect(transcript.length).toBeGreaterThan(0);

      const preview = transcript.substring(0, 200);
      console.log("First 200 characters:", `${preview}...`);
    }, TIMEOUT_MS);
  }
});

View File

@ -44,6 +44,7 @@
"uuid": "^9.0.0",
"wavefile": "^11.0.0",
"winston": "^3.13.0",
"youtube-transcript-plus": "^1.1.2",
"youtubei.js": "^9.1.0"
},
"devDependencies": {

View File

@ -54,13 +54,15 @@ class YoutubeLoader {
source: this.#videoId,
};
try {
const { YoutubeTranscript } = require("./youtube-transcript");
transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, {
const fetchTranscript = await import("youtube-transcript-plus").then(
(module) => module.fetchTranscript
);
const transcriptSegments = await fetchTranscript(this.#videoId, {
lang: this.#language,
});
if (!transcript) {
if (!transcriptSegments || transcriptSegments.length === 0)
throw new Error("Transcription not found");
}
transcript = this.#convertTranscriptSegmentsToText(transcriptSegments);
if (this.#addVideoInfo) {
const { Innertube } = require("youtubei.js");
const youtube = await Innertube.create();
@ -82,6 +84,16 @@ class YoutubeLoader {
},
];
}
#convertTranscriptSegmentsToText(transcriptSegments) {
return transcriptSegments
.map((segment) =>
typeof segment === "string" ? segment : segment.text || ""
)
.join(" ")
.replace(/\s+/g, " ")
.trim();
}
}
module.exports.YoutubeLoader = YoutubeLoader;

View File

@ -3908,6 +3908,11 @@ yauzl@^2.10.0, yauzl@^2.4.2:
buffer-crc32 "~0.2.3"
fd-slicer "~1.1.0"
youtube-transcript-plus@^1.1.2:
version "1.1.2"
resolved "https://registry.yarnpkg.com/youtube-transcript-plus/-/youtube-transcript-plus-1.1.2.tgz#f86851852a056088c11f4f6523ab0f8dba7d9711"
integrity sha512-bLlqkA6gVVUorZpcc+THuECXyAwOpnHqW2lOav9g6gGovxAP3FCD8s9GBFVjmSl3cWWwwPPXtG/zY1nD+GvQ7A==
youtubei.js@^9.1.0:
version "9.4.0"
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.4.0.tgz#ccccaf4a295b96e3e17134a66730bbc82461594b"