feat: handle pagination on server side
This commit is contained in:
@@ -291,9 +291,15 @@ export default class Interpreter extends EventEmitter {
|
|||||||
await this.options.serializableCallback(scrapeResult);
|
await this.options.serializableCallback(scrapeResult);
|
||||||
},
|
},
|
||||||
|
|
||||||
|
// scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
|
||||||
|
// await this.ensureScriptsLoaded(page);
|
||||||
|
// const scrapeResults: Record<string, any>[] = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||||
|
// await this.options.serializableCallback(scrapeResults);
|
||||||
|
// },
|
||||||
|
|
||||||
scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
|
scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
|
||||||
await this.ensureScriptsLoaded(page);
|
await this.ensureScriptsLoaded(page);
|
||||||
const scrapeResults: Record<string, any>[] = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
const scrapeResults: Record<string, any>[] = await this.handlePagination(page, config);
|
||||||
await this.options.serializableCallback(scrapeResults);
|
await this.options.serializableCallback(scrapeResults);
|
||||||
},
|
},
|
||||||
|
|
||||||
@@ -357,6 +363,63 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Scrapes a list repeatedly, advancing the page between scrapes according to
 * `config.pagination.type`, and returns the accumulated rows.
 *
 * Supported pagination types:
 *  - `scrollDown` / `scrollUp`: calls the browser-side `window.scrollDown` /
 *    `window.scrollUp` helper with the list selector and limit.
 *  - `clickNext` / `clickLoadMore`: clicks the element matching
 *    `config.pagination.selector`.
 *  - anything else (or no pagination config): a single scrape, no paging.
 *
 * Pagination stops when any of the following holds:
 *  - `config.limit` rows have been collected (the result is truncated to it);
 *  - the next / load-more element is not found;
 *  - for scroll and load-more modes, the number of elements matching
 *    `config.listSelector` did not grow after the action;
 *  - for `clickNext`, the scrape returned exactly the same rows as the
 *    previous iteration (the click did not advance the page).
 *
 * NOTE(review): rows from every iteration are concatenated as-is. If
 * `window.scrapeList` returns every item currently in the DOM, scroll and
 * load-more modes will contain duplicates — confirm against the browser-side
 * scraper.
 *
 * @param page - Page to scrape; the browser-side scraper helpers are expected
 *   to be available on `window` already.
 * @param config - List selector, field definitions, optional row limit and
 *   pagination settings. Passed to `window.scrapeList` unchanged.
 * @returns The rows collected across all iterations.
 */
private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }): Promise<Record<string, any>[]> {
  // Pause after each pagination action so requested content can render
  // before it is counted or scraped.
  const settleDelayMs = 1000;
  const paginationType = config.pagination ? config.pagination.type : undefined;

  // Functions given to page.evaluate run inside the browser and cannot see
  // Node-side variables, so everything they need is passed as one argument.
  const scrollArgs = { listSelector: config.listSelector, limit: config.limit };
  const countListItems = (): Promise<number> =>
    page.evaluate((selector) => document.querySelectorAll(selector).length, config.listSelector);

  let allResults: Record<string, any>[] = [];
  let previousPageSignature: string | null = null;

  while (true) {
    // Scrape whatever the page currently shows.
    const pageResults: Record<string, any>[] = await page.evaluate((cfg) => window.scrapeList(cfg), config);

    if (paginationType === 'clickNext') {
      // A "next" control that stays clickable on the last page would loop
      // forever; an identical scrape means the click did not advance.
      const signature = JSON.stringify(pageResults);
      if (signature === previousPageSignature) {
        break;
      }
      previousPageSignature = signature;
    }

    allResults = allResults.concat(pageResults);

    if (config.limit && allResults.length >= config.limit) {
      allResults = allResults.slice(0, config.limit);
      break;
    }

    // Baseline for the growth check: items in the DOM before the action.
    const itemCountBefore = await countListItems();

    switch (paginationType) {
      case 'scrollDown':
        await page.evaluate(({ listSelector, limit }) => window.scrollDown(listSelector, limit), scrollArgs);
        break;
      case 'scrollUp':
        await page.evaluate(({ listSelector, limit }) => window.scrollUp(listSelector, limit), scrollArgs);
        break;
      case 'clickNext':
      case 'clickLoadMore': {
        const control = await page.$(config.pagination.selector);
        if (!control) {
          return allResults; // No more pages / no more items to load
        }
        await control.click();
        break;
      }
      default:
        return allResults; // No pagination or unknown type
    }

    await page.waitForTimeout(settleDelayMs);

    // A fresh page after "next" may hold the same number of items as the
    // previous one, so the growth check only applies to modes that append
    // items to the current document.
    if (paginationType !== 'clickNext') {
      const itemCountAfter = await countListItems();
      if (itemCountAfter <= itemCountBefore) {
        break; // No new items, end pagination
      }
    }
  }

  return allResults;
}
|
||||||
|
|
||||||
private async runLoop(p: Page, workflow: Workflow) {
|
private async runLoop(p: Page, workflow: Workflow) {
|
||||||
const usedActions: string[] = [];
|
const usedActions: string[] = [];
|
||||||
let lastAction = null;
|
let lastAction = null;
|
||||||
@@ -429,7 +492,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private async ensureScriptsLoaded(page: Page) {
|
private async ensureScriptsLoaded(page: Page) {
|
||||||
const isScriptLoaded = await page.evaluate(() => typeof window.scrape === 'function' && typeof window.scrapeSchema === 'function' && typeof window.scrapeList === 'function' && typeof window.scrapeListAuto === 'function');
|
const isScriptLoaded = await page.evaluate(() => typeof window.scrape === 'function' && typeof window.scrapeSchema === 'function' && typeof window.scrapeList === 'function' && typeof window.scrapeListAuto === 'function' && typeof window.scrollDown === 'function' && typeof window.scrollUp === 'function');
|
||||||
if (!isScriptLoaded) {
|
if (!isScriptLoaded) {
|
||||||
await page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
|
await page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user