feat: pass object instead of str for key-val pair
This commit is contained in:
@@ -126,7 +126,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
// wrap inside an IIFE to avoid polluting the global scope: https://github.com/microsoft/playwright/issues/31864
|
/**
|
||||||
|
* Returns a "scrape" result from the current page.
|
||||||
|
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
|
||||||
|
*/
|
||||||
|
// Wrap the entire function in an IIFE (Immediately Invoked Function Expression)
|
||||||
|
// and attach it to the window object
|
||||||
(function(window) {
|
(function(window) {
|
||||||
/**
|
/**
|
||||||
* Returns a "scrape" result from the current page.
|
* Returns a "scrape" result from the current page.
|
||||||
@@ -180,7 +185,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
/**
|
/**
|
||||||
* Given an object with named lists of elements,
|
* Given an object with named lists of elements,
|
||||||
* groups the elements by their distance in the DOM tree.
|
* groups the elements by their distance in the DOM tree.
|
||||||
* @param {Object.<string, object[]>} lists The named lists of HTML elements.
|
* @param {Object.<string, {selector: string, tag: string}>} lists The named lists of HTML elements.
|
||||||
* @returns {Array.<Object.<string, string>>}
|
* @returns {Array.<Object.<string, string>>}
|
||||||
*/
|
*/
|
||||||
window.scrapeSchema = function (lists) {
|
window.scrapeSchema = function (lists) {
|
||||||
@@ -199,8 +204,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
}
|
}
|
||||||
|
|
||||||
function getSeedKey(listObj) {
|
function getSeedKey(listObj) {
|
||||||
const maxLength = Math.max(...Object.values(omap(listObj, (x) => x.length)));
|
const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length)));
|
||||||
return Object.keys(ofilter(listObj, (_, v) => v.length === maxLength))[0];
|
return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
function getMBEs(elements) {
|
function getMBEs(elements) {
|
||||||
@@ -219,11 +224,16 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
}
|
}
|
||||||
|
|
||||||
const seedName = getSeedKey(lists);
|
const seedName = getSeedKey(lists);
|
||||||
const MBEs = getMBEs(lists[seedName]);
|
const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector));
|
||||||
|
const MBEs = getMBEs(seedElements);
|
||||||
|
|
||||||
return MBEs.map((mbe) => omap(
|
return MBEs.map((mbe) => omap(
|
||||||
lists,
|
lists,
|
||||||
(listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText,
|
({ selector }) => {
|
||||||
|
const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem));
|
||||||
|
return elem ? elem.innerText : undefined;
|
||||||
|
},
|
||||||
|
(key) => lists[key].selector // Use the selector as the key in the output
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user