Upgrade YT Scraper (#4820)
This commit is contained in:
parent
b2f49b6036
commit
092b1b45f8
2
.github/workflows/dev-build.yaml
vendored
2
.github/workflows/dev-build.yaml
vendored
@ -6,7 +6,7 @@ concurrency:
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches: ['fix-scraper-esm-bug'] # put your current branch to create a build. Core team only.
|
branches: ['upgrade-yt-scraper'] # put your current branch to create a build. Core team only.
|
||||||
paths-ignore:
|
paths-ignore:
|
||||||
- '**.md'
|
- '**.md'
|
||||||
- 'cloud-deployments/*'
|
- 'cloud-deployments/*'
|
||||||
|
|||||||
@ -1,33 +0,0 @@
|
|||||||
process.env.STORAGE_DIR = "test-storage"; // needed for tests to run
|
|
||||||
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
|
|
||||||
|
|
||||||
describe("YoutubeTranscript", () => {
|
|
||||||
if (process.env.GITHUB_ACTIONS) {
|
|
||||||
console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve.");
|
|
||||||
it('is stubbed in GitHub Actions', () => expect(true).toBe(true));
|
|
||||||
} else {
|
|
||||||
it("should fetch transcript from YouTube video", async () => {
|
|
||||||
const videoId = "BJjsfNO5JTo";
|
|
||||||
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
|
|
||||||
lang: "en",
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(transcript).toBeDefined();
|
|
||||||
expect(typeof transcript).toBe("string");
|
|
||||||
expect(transcript.length).toBeGreaterThan(0);
|
|
||||||
console.log("First 200 characters:", transcript.substring(0, 200) + "...");
|
|
||||||
}, 30000);
|
|
||||||
|
|
||||||
it("should fetch non asr transcript from YouTube video", async () => {
|
|
||||||
const videoId = "D111ao6wWH0";
|
|
||||||
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
|
|
||||||
lang: "zh-HK",
|
|
||||||
});
|
|
||||||
|
|
||||||
expect(transcript).toBeDefined();
|
|
||||||
expect(typeof transcript).toBe("string");
|
|
||||||
expect(transcript.length).toBeGreaterThan(0);
|
|
||||||
console.log("First 200 characters:", transcript.substring(0, 200) + "...");
|
|
||||||
}, 30000);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
@ -44,6 +44,7 @@
|
|||||||
"uuid": "^9.0.0",
|
"uuid": "^9.0.0",
|
||||||
"wavefile": "^11.0.0",
|
"wavefile": "^11.0.0",
|
||||||
"winston": "^3.13.0",
|
"winston": "^3.13.0",
|
||||||
|
"youtube-transcript-plus": "^1.1.2",
|
||||||
"youtubei.js": "^9.1.0"
|
"youtubei.js": "^9.1.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
|
|||||||
@ -54,13 +54,15 @@ class YoutubeLoader {
|
|||||||
source: this.#videoId,
|
source: this.#videoId,
|
||||||
};
|
};
|
||||||
try {
|
try {
|
||||||
const { YoutubeTranscript } = require("./youtube-transcript");
|
const fetchTranscript = await import("youtube-transcript-plus").then(
|
||||||
transcript = await YoutubeTranscript.fetchTranscript(this.#videoId, {
|
(module) => module.fetchTranscript
|
||||||
|
);
|
||||||
|
const transcriptSegments = await fetchTranscript(this.#videoId, {
|
||||||
lang: this.#language,
|
lang: this.#language,
|
||||||
});
|
});
|
||||||
if (!transcript) {
|
if (!transcriptSegments || transcriptSegments.length === 0)
|
||||||
throw new Error("Transcription not found");
|
throw new Error("Transcription not found");
|
||||||
}
|
transcript = this.#convertTranscriptSegmentsToText(transcriptSegments);
|
||||||
if (this.#addVideoInfo) {
|
if (this.#addVideoInfo) {
|
||||||
const { Innertube } = require("youtubei.js");
|
const { Innertube } = require("youtubei.js");
|
||||||
const youtube = await Innertube.create();
|
const youtube = await Innertube.create();
|
||||||
@ -82,6 +84,16 @@ class YoutubeLoader {
|
|||||||
},
|
},
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#convertTranscriptSegmentsToText(transcriptSegments) {
|
||||||
|
return transcriptSegments
|
||||||
|
.map((segment) =>
|
||||||
|
typeof segment === "string" ? segment : segment.text || ""
|
||||||
|
)
|
||||||
|
.join(" ")
|
||||||
|
.replace(/\s+/g, " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports.YoutubeLoader = YoutubeLoader;
|
module.exports.YoutubeLoader = YoutubeLoader;
|
||||||
|
|||||||
@ -3908,6 +3908,11 @@ yauzl@^2.10.0, yauzl@^2.4.2:
|
|||||||
buffer-crc32 "~0.2.3"
|
buffer-crc32 "~0.2.3"
|
||||||
fd-slicer "~1.1.0"
|
fd-slicer "~1.1.0"
|
||||||
|
|
||||||
|
youtube-transcript-plus@^1.1.2:
|
||||||
|
version "1.1.2"
|
||||||
|
resolved "https://registry.yarnpkg.com/youtube-transcript-plus/-/youtube-transcript-plus-1.1.2.tgz#f86851852a056088c11f4f6523ab0f8dba7d9711"
|
||||||
|
integrity sha512-bLlqkA6gVVUorZpcc+THuECXyAwOpnHqW2lOav9g6gGovxAP3FCD8s9GBFVjmSl3cWWwwPPXtG/zY1nD+GvQ7A==
|
||||||
|
|
||||||
youtubei.js@^9.1.0:
|
youtubei.js@^9.1.0:
|
||||||
version "9.4.0"
|
version "9.4.0"
|
||||||
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.4.0.tgz#ccccaf4a295b96e3e17134a66730bbc82461594b"
|
resolved "https://registry.yarnpkg.com/youtubei.js/-/youtubei.js-9.4.0.tgz#ccccaf4a295b96e3e17134a66730bbc82461594b"
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user