Upgrade YT Scraper (#4820)
This commit is contained in:
parent
b2f49b6036
commit
092b1b45f8
2
.github/workflows/dev-build.yaml
vendored
2
.github/workflows/dev-build.yaml
vendored
@ -6,7 +6,7 @@ concurrency:
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ['fix-scraper-esm-bug'] # put your current branch to create a build. Core team only.
|
||||
branches: ['upgrade-yt-scraper'] # put your current branch to create a build. Core team only.
|
||||
paths-ignore:
|
||||
- '**.md'
|
||||
- 'cloud-deployments/*'
|
||||
|
||||
@ -1,33 +0,0 @@
|
||||
process.env.STORAGE_DIR = "test-storage"; // needed for tests to run
|
||||
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
|
||||
|
||||
// Integration suite for YoutubeTranscript.fetchTranscript.
// When the GITHUB_ACTIONS environment variable is set, only a placeholder
// test is registered; otherwise two transcript fetches are exercised.
describe("YoutubeTranscript", () => {
  const insideGithubActions = Boolean(process.env.GITHUB_ACTIONS);

  if (insideGithubActions) {
    console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve.");
    it('is stubbed in GitHub Actions', () => expect(true).toBe(true));
    return;
  }

  // Per-test timeout in milliseconds, passed as the third argument to `it`.
  const TIMEOUT_MS = 30000;

  /**
   * Fetches the transcript for a video and checks that it is a non-empty
   * string, then logs a short preview of it.
   *
   * @param {string} videoId - Video identifier handed to fetchTranscript.
   * @param {string} lang - Value for the `lang` option of fetchTranscript.
   * @returns {Promise<void>}
   */
  const expectNonEmptyTranscript = async (videoId, lang) => {
    const text = await YoutubeTranscript.fetchTranscript(videoId, { lang });

    expect(text).toBeDefined();
    expect(typeof text).toBe("string");
    expect(text.length).toBeGreaterThan(0);
    console.log("First 200 characters:", text.substring(0, 200) + "...");
  };

  it(
    "should fetch transcript from YouTube video",
    () => expectNonEmptyTranscript("BJjsfNO5JTo", "en"),
    TIMEOUT_MS
  );

  it(
    "should fetch non asr transcript from YouTube video",
    () => expectNonEmptyTranscript("D111ao6wWH0", "zh-HK"),
    TIMEOUT_MS
  );
});
|
||||
@ -44,6 +44,7 @@
|
||||
"uuid": "^9.0.0",
|
||||
"wavefile": "^11.0.0",
|
||||
"winston": "^3.13.0",
|
||||
"youtube-transcript-plus": "^1.1.2",
|
||||
"youtubei.js": "^9.1.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
||||
@ -54,13 +54,15 @@ class YoutubeLoader {
|
||||
source: this.#videoId,
|
||||
};
|
||||
try {
|
||||
const { YoutubeTranscript } = require("./youtube-transcript");
|
||||
transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, {
|
||||
const fetchTranscript = await import("youtube-transcript-plus").then(
|
||||
(module) => module.fetchTranscript
|
||||
);
|
||||
const transcriptSegments = await fetchTranscript(this.#videoId, {
|
||||
lang: this.#language,
|
||||
});
|
||||
if (!transcript) {
|
||||
if (!transcriptSegments || transcriptSegments.length === 0)
|
||||
throw new Error("Transcription not found");
|
||||
}
|
||||
transcript = this.#convertTranscriptSegmentsToText(transcriptSegments);
|
||||
if (this.#addVideoInfo) {
|
||||
const { Innertube } = require("youtubei.js");
|
||||
const youtube = await Innertube.create();
|
||||
@ -82,6 +84,16 @@ class YoutubeLoader {
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
#convertTranscriptSegmentsToText(transcriptSegments) {
|
||||
return transcriptSegments
|
||||
.map((segment) =>
|
||||
typeof segment === "string" ? segment : segment.text || ""
|
||||
)
|
||||
.join(" ")
|
||||
.replace(/\s+/g, " ")
|
||||
.trim();
|
||||
}
|
||||
}
|
||||
|
||||
module.exports.YoutubeLoader = YoutubeLoader;
|
||||
|
||||
@ -3908,6 +3908,11 @@ yauzl@^2.10.0, yauzl@^2.4.2:
|
||||
buffer-crc32 "~0.2.3"
|
||||
fd-slicer "~1.1.0"
|
||||
|
||||
youtube-transcript-plus@^1.1.2:
|
||||
version "1.1.2"
|
||||
resolved "https://registry.yarnpkg.com/youtube-transcript-plus/-/youtube-transcript-plus-1.1.2.tgz#f86851852a056088c11f4f6523ab0f8dba7d9711"
|
||||
integrity sha512-bLlqkA6gVVUorZpcc+THuECXyAwOpnHqW2lOav9g6gGovxAP3FCD8s9GBFVjmSl3cWWwwPPXtG/zY1nD+GvQ7A==
|
||||
|
||||
youtubei.js@^9.1.0:
|
||||
version "9.4.0"
|
||||
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.4.0.tgz#ccccaf4a295b96e3e17134a66730bbc82461594b"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user