feat: add logic to scrape multiple nested shadow DOM elements

This commit is contained in:
RohitR311
2024-12-31 01:52:38 +05:30
parent b757d9c4f8
commit 4b4074b70d

View File

@@ -246,7 +246,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return currentElements; return currentElements;
} }
// Helper function to extract value from element based on attribute
function getElementValue(element, attribute) { function getElementValue(element, attribute) {
if (!element) return null; if (!element) return null;
@@ -294,12 +293,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}); });
} }
// Main scraping logic // First try the MBE approach
const seedName = getSeedKey(lists); const seedName = getSeedKey(lists);
const seedElements = findAllElements(lists[seedName]); const seedElements = findAllElements(lists[seedName]);
const MBEs = getMBEs(seedElements); const MBEs = getMBEs(seedElements);
return MBEs.map((mbe) => omap( const mbeResults = MBEs.map((mbe) => omap(
lists, lists,
(config) => { (config) => {
const elem = findAllElements(config) const elem = findAllElements(config)
@@ -309,6 +308,33 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}, },
(key) => key (key) => key
)) || []; )) || [];
// If MBE approach didn't find all elements, try independent scraping
if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) {
// Fall back to independent scraping
const results = [];
const foundElements = new Map();
// Find all elements for each selector
Object.entries(lists).forEach(([key, config]) => {
const elements = findAllElements(config);
foundElements.set(key, elements);
});
// Create result objects for each found element
foundElements.forEach((elements, key) => {
elements.forEach((element, index) => {
if (!results[index]) {
results[index] = {};
}
results[index][key] = getElementValue(element, lists[key].attribute);
});
});
return results.filter(result => Object.keys(result).length > 0);
}
return mbeResults;
}; };
/** /**