Fix scraping failed bug in link/bulk link scrapers (#2807)
* fix "scraping failed" bug in link/bulk link scrapers
* reset submodule
* swap to networkidle2 as a safe mix for SPA and API-loaded pages, but also not hang on request-heavy pages
* lint

---------

Co-authored-by: timothycarambat <rambat1010@gmail.com>
This commit is contained in:
parent
6bc21860e4
commit
9bc01afa7d
@@ -61,7 +61,7 @@ async function getPageContent(link) {
         ignoreHTTPSErrors: true,
       },
       gotoOptions: {
-        waitUntil: "domcontentloaded",
+        waitUntil: "networkidle2",
       },
       async evaluate(page, browser) {
         const result = await page.evaluate(() => document.body.innerText);
@@ -48,7 +48,7 @@ async function getPageLinks(url, baseUrl) {
   try {
     const loader = new PuppeteerWebBaseLoader(url, {
       launchOptions: { headless: "new" },
-      gotoOptions: { waitUntil: "domcontentloaded" },
+      gotoOptions: { waitUntil: "networkidle2" },
     });
     const docs = await loader.load();
     const html = docs[0].pageContent;
@@ -92,7 +92,7 @@ async function bulkScrapePages(links, outFolderPath) {
   try {
     const loader = new PuppeteerWebBaseLoader(link, {
       launchOptions: { headless: "new" },
-      gotoOptions: { waitUntil: "domcontentloaded" },
+      gotoOptions: { waitUntil: "networkidle2" },
       async evaluate(page, browser) {
         const result = await page.evaluate(() => document.body.innerText);
         await browser.close();
Loading…
Reference in New Issue
Block a user