diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml index 29c49b08..8c077a9c 100644 --- a/.github/workflows/dev-build.yaml +++ b/.github/workflows/dev-build.yaml @@ -6,7 +6,7 @@ concurrency: on: push: - branches: ['fix-scraper-esm-bug'] # put your current branch to create a build. Core team only. + branches: ['upgrade-yt-scraper'] # put your current branch to create a build. Core team only. paths-ignore: - '**.md' - 'cloud-deployments/*' diff --git a/collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js b/collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js deleted file mode 100644 index ed2e5f20..00000000 --- a/collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js +++ /dev/null @@ -1,33 +0,0 @@ -process.env.STORAGE_DIR = "test-storage"; // needed for tests to run -const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js"); - -describe("YoutubeTranscript", () => { - if (process.env.GITHUB_ACTIONS) { - console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve."); - it('is stubbed in GitHub Actions', () => expect(true).toBe(true)); - } else { - it("should fetch transcript from YouTube video", async () => { - const videoId = "BJjsfNO5JTo"; - const transcript = await YoutubeTranscript.fetchTranscript(videoId, { - lang: "en", - }); - - expect(transcript).toBeDefined(); - expect(typeof transcript).toBe("string"); - expect(transcript.length).toBeGreaterThan(0); - console.log("First 200 characters:", transcript.substring(0, 200) + "..."); - }, 30000); - - it("should fetch non asr transcript from YouTube video", async () => { - const videoId = "D111ao6wWH0"; - const transcript = await YoutubeTranscript.fetchTranscript(videoId, { - lang: "zh-HK", - }); - - expect(transcript).toBeDefined(); - expect(typeof transcript).toBe("string"); - expect(transcript.length).toBeGreaterThan(0); - console.log("First 200 characters:", transcript.substring(0, 200) + "..."); - }, 30000); - } -}); diff --git a/collector/package.json b/collector/package.json index 6ce51151..745d7685 100644 --- a/collector/package.json +++ b/collector/package.json @@ -44,6 +44,7 @@ "uuid": "^9.0.0", "wavefile": "^11.0.0", "winston": "^3.13.0", + "youtube-transcript-plus": "^1.1.2", "youtubei.js": "^9.1.0" }, "devDependencies": { diff --git a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js index 45376449..14a46396 100644 --- a/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js +++ b/collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js @@ -54,13 +54,15 @@ class YoutubeLoader { source: this.#videoId, }; try { - const { YoutubeTranscript } = require("./youtube-transcript"); - transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, { + const fetchTranscript = await import("youtube-transcript-plus").then( + (module) => module.fetchTranscript + ); + const transcriptSegments = await fetchTranscript(this.#videoId, { lang: this.#language, }); - if (!transcript) { + if (!transcriptSegments || transcriptSegments.length === 0) throw new Error("Transcription not found"); - } + transcript = this.#convertTranscriptSegmentsToText(transcriptSegments); if (this.#addVideoInfo) { const { Innertube } = require("youtubei.js"); const youtube = await Innertube.create(); @@ -82,6 +84,16 @@ class YoutubeLoader { }, ]; } + + #convertTranscriptSegmentsToText(transcriptSegments) { + return transcriptSegments + .map((segment) => + typeof segment === "string" ? segment : segment.text || "" + ) + .join(" ") + .replace(/\s+/g, " ") + .trim(); + } } module.exports.YoutubeLoader = YoutubeLoader; diff --git a/collector/yarn.lock b/collector/yarn.lock index e8b7cb61..82ec934e 100644 --- a/collector/yarn.lock +++ b/collector/yarn.lock @@ -3908,6 +3908,11 @@ yauzl@^2.10.0, yauzl@^2.4.2: buffer-crc32 "~0.2.3" fd-slicer "~1.1.0" +youtube-transcript-plus@^1.1.2: + version "1.1.2" + resolved "https://registry.yarnpkg.com/youtube-transcript-plus/-/youtube-transcript-plus-1.1.2.tgz#f86851852a056088c11f4f6523ab0f8dba7d9711" + integrity sha512-bLlqkA6gVVUorZpcc+THuECXyAwOpnHqW2lOav9g6gGovxAP3FCD8s9GBFVjmSl3cWWwwPPXtG/zY1nD+GvQ7A== + youtubei.js@^9.1.0: version "9.4.0" resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.4.0.tgz#ccccaf4a295b96e3e17134a66730bbc82461594b"