Merge pull request #675 from getmaxun/smart-list

feat: better, faster, smarter capture list
This commit is contained in:
Rohit
2025-07-07 01:23:53 +05:30
committed by GitHub
9 changed files with 3079 additions and 2305 deletions

View File

@@ -423,44 +423,149 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list * @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/ */
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
// Enhanced query function to handle iframe, frame and shadow DOM // XPath evaluation functions
const queryElement = (rootElement, selector) => { const evaluateXPath = (rootElement, xpath) => {
if (!selector.includes('>>') && !selector.includes(':>>')) { try {
return rootElement.querySelector(selector); const ownerDoc =
rootElement.nodeType === Node.DOCUMENT_NODE
? rootElement
: rootElement.ownerDocument;
if (!ownerDoc) return null;
const result = ownerDoc.evaluate(
xpath,
rootElement,
null,
XPathResult.FIRST_ORDERED_NODE_TYPE,
null
);
return result.singleNodeValue;
} catch (error) {
console.warn("XPath evaluation failed:", xpath, error);
return null;
}
};
const evaluateXPathAll = (rootElement, xpath) => {
try {
const ownerDoc =
rootElement.nodeType === Node.DOCUMENT_NODE
? rootElement
: rootElement.ownerDocument;
if (!ownerDoc) return [];
const result = ownerDoc.evaluate(
xpath,
rootElement,
null,
XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
null
);
const elements = [];
for (let i = 0; i < result.snapshotLength; i++) {
const node = result.snapshotItem(i);
if (node && node.nodeType === Node.ELEMENT_NODE) {
elements.push(node);
}
} }
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim()); return elements;
} catch (error) {
console.warn("XPath evaluation failed:", xpath, error);
return [];
}
};
/**
 * Decide whether a selector string should be evaluated as XPath rather than CSS.
 *
 * A selector is treated as XPath when, after trimming surrounding whitespace,
 * it begins with one of:
 *   - "/"   (absolute path, which also covers the "//" descendant shorthand)
 *   - "./"  (path relative to the context node)
 *   - "../" (path relative to the parent of the context node)
 *   - "("   (grouped expression such as "(//li)[2]/a")
 * None of these prefixes can start a valid CSS selector, so CSS inputs are
 * never misclassified.
 *
 * @param {string} selector - Selector text to classify.
 * @returns {boolean} true for XPath, false for CSS or for non-string input.
 */
const isXPathSelector = (selector) => {
  // A missing or non-string selector used to throw on `.startsWith`; report
  // "not XPath" instead so callers fall through to their normal handling.
  if (typeof selector !== "string") return false;
  return /^(?:\/|\.\.?\/|\()/.test(selector.trim());
};
// Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
const queryElement = (rootElement, selector) => {
if (!selector.includes(">>") && !selector.includes(":>>")) {
// Check if it's an XPath selector
if (isXPathSelector(selector)) {
return evaluateXPath(rootElement, selector);
} else {
return rootElement.querySelector(selector);
}
}
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
let currentElement = rootElement; let currentElement = rootElement;
for (let i = 0; i < parts.length; i++) { for (let i = 0; i < parts.length; i++) {
if (!currentElement) return null; if (!currentElement) return null;
// Handle iframe and frame traversal // Handle iframe and frame traversal
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') { if (
currentElement.tagName === "IFRAME" ||
currentElement.tagName === "FRAME"
) {
try { try {
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document; const frameDoc =
currentElement.contentDocument ||
currentElement.contentWindow.document;
if (!frameDoc) return null;
if (isXPathSelector(parts[i])) {
currentElement = evaluateXPath(frameDoc, parts[i]);
} else {
currentElement = frameDoc.querySelector(parts[i]); currentElement = frameDoc.querySelector(parts[i]);
}
continue; continue;
} catch (e) { } catch (e) {
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e); console.warn(
`Cannot access ${currentElement.tagName.toLowerCase()} content:`,
e
);
return null; return null;
} }
} }
let nextElement = null;
// Try regular DOM first // Try regular DOM first
let nextElement = currentElement.querySelector(parts[i]); if ("querySelector" in currentElement) {
if (isXPathSelector(parts[i])) {
nextElement = evaluateXPath(currentElement, parts[i]);
} else {
nextElement = currentElement.querySelector(parts[i]);
}
}
// Try shadow DOM if not found // Try shadow DOM if not found
if (!nextElement && currentElement.shadowRoot) { if (
!nextElement &&
"shadowRoot" in currentElement &&
currentElement.shadowRoot
) {
if (isXPathSelector(parts[i])) {
nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
} else {
nextElement = currentElement.shadowRoot.querySelector(parts[i]); nextElement = currentElement.shadowRoot.querySelector(parts[i]);
} }
}
// Check children's shadow roots if still not found // Check children's shadow roots if still not found
if (!nextElement) { if (!nextElement && "children" in currentElement) {
const children = Array.from(currentElement.children || []); const children = Array.from(currentElement.children || []);
for (const child of children) { for (const child of children) {
if (child.shadowRoot) { if (child.shadowRoot) {
if (isXPathSelector(parts[i])) {
nextElement = evaluateXPath(child.shadowRoot, parts[i]);
} else {
nextElement = child.shadowRoot.querySelector(parts[i]); nextElement = child.shadowRoot.querySelector(parts[i]);
}
if (nextElement) break; if (nextElement) break;
} }
} }
@@ -474,11 +579,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Enhanced query all function for both contexts // Enhanced query all function for both contexts
const queryElementAll = (rootElement, selector) => { const queryElementAll = (rootElement, selector) => {
if (!selector.includes('>>') && !selector.includes(':>>')) { if (!selector.includes(">>") && !selector.includes(":>>")) {
return rootElement.querySelectorAll(selector); if (isXPathSelector(selector)) {
return evaluateXPathAll(rootElement, selector);
} else {
return Array.from(rootElement.querySelectorAll(selector));
}
} }
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim()); const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
let currentElements = [rootElement]; let currentElements = [rootElement];
for (const part of parts) { for (const part of parts) {
@@ -486,30 +595,64 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
for (const element of currentElements) { for (const element of currentElements) {
// Handle iframe and frame traversal // Handle iframe and frame traversal
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') { if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
try { try {
const frameDoc = element.contentDocument || element.contentWindow.document; const frameDoc =
nextElements.push(...frameDoc.querySelectorAll(part)); element.contentDocument || element.contentWindow.document;
if (frameDoc) {
if (isXPathSelector(part)) {
nextElements.push(...evaluateXPathAll(frameDoc, part));
} else {
nextElements.push(
...Array.from(frameDoc.querySelectorAll(part))
);
}
}
} catch (e) { } catch (e) {
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e); console.warn(
`Cannot access ${element.tagName.toLowerCase()} content:`,
e
);
continue; continue;
} }
} else { } else {
// Regular DOM elements // Regular DOM elements
if (element.querySelectorAll) { if (element.querySelectorAll) {
nextElements.push(...element.querySelectorAll(part)); if (isXPathSelector(part)) {
nextElements.push(...evaluateXPathAll(element, part));
} else {
nextElements.push(
...Array.from(element.querySelectorAll(part))
);
}
} }
// Shadow DOM elements // Shadow DOM elements
if (element.shadowRoot) { if (element.shadowRoot) {
nextElements.push(...element.shadowRoot.querySelectorAll(part)); if (isXPathSelector(part)) {
nextElements.push(
...evaluateXPathAll(element.shadowRoot, part)
);
} else {
nextElements.push(
...Array.from(element.shadowRoot.querySelectorAll(part))
);
}
} }
// Check children's shadow roots // Check children's shadow roots
const children = Array.from(element.children || []); const children = Array.from(element.children || []);
for (const child of children) { for (const child of children) {
if (child.shadowRoot) { if (child.shadowRoot) {
nextElements.push(...child.shadowRoot.querySelectorAll(part)); if (isXPathSelector(part)) {
nextElements.push(
...evaluateXPathAll(child.shadowRoot, part)
);
} else {
nextElements.push(
...Array.from(child.shadowRoot.querySelectorAll(part))
);
}
} }
} }
} }
@@ -522,11 +665,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}; };
// Enhanced value extraction with context awareness // Enhanced value extraction with context awareness
function extractValue(element, attribute) { const extractValue = (element, attribute) => {
if (!element) return null; if (!element) return null;
// Get context-aware base URL // Get context-aware base URL
const baseURL = element.ownerDocument?.location?.href || window.location.origin; const baseURL =
element.ownerDocument?.location?.href || window.location.origin;
// Check shadow root first // Check shadow root first
if (element.shadowRoot) { if (element.shadowRoot) {
@@ -536,15 +680,37 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
} }
if (attribute === 'innerText') { if (attribute === "innerText") {
return element.innerText.trim(); // First try standard innerText/textContent
} else if (attribute === 'innerHTML') { let textContent =
return element.innerHTML.trim(); element.innerText?.trim() || element.textContent?.trim();
} else if (attribute === 'src' || attribute === 'href') {
if (attribute === 'href' && element.tagName !== 'A') { // If empty, check for common data attributes that might contain the text
if (!textContent) {
const dataAttributes = [
"data-600",
"data-text",
"data-label",
"data-value",
"data-content",
];
for (const attr of dataAttributes) {
const dataValue = element.getAttribute(attr);
if (dataValue && dataValue.trim()) {
textContent = dataValue.trim();
break;
}
}
}
return textContent || null;
} else if (attribute === "innerHTML") {
return element.innerHTML?.trim() || null;
} else if (attribute === "src" || attribute === "href") {
if (attribute === "href" && element.tagName !== "A") {
const parentElement = element.parentElement; const parentElement = element.parentElement;
if (parentElement && parentElement.tagName === 'A') { if (parentElement && parentElement.tagName === "A") {
const parentHref = parentElement.getAttribute('href'); const parentHref = parentElement.getAttribute("href");
if (parentHref) { if (parentHref) {
try { try {
return new URL(parentHref, baseURL).href; return new URL(parentHref, baseURL).href;
@@ -556,13 +722,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
const attrValue = element.getAttribute(attribute); const attrValue = element.getAttribute(attribute);
const dataAttr = attrValue || element.getAttribute('data-' + attribute); const dataAttr = attrValue || element.getAttribute("data-" + attribute);
if (!dataAttr || dataAttr.trim() === '') { if (!dataAttr || dataAttr.trim() === "") {
if (attribute === 'src') { if (attribute === "src") {
const style = window.getComputedStyle(element); const style = window.getComputedStyle(element);
const bgImage = style.backgroundImage; const bgImage = style.backgroundImage;
if (bgImage && bgImage !== 'none') { if (bgImage && bgImage !== "none") {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null; return matches ? new URL(matches[1], baseURL).href : null;
} }
@@ -573,15 +739,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
try { try {
return new URL(dataAttr, baseURL).href; return new URL(dataAttr, baseURL).href;
} catch (e) { } catch (e) {
console.warn('Error creating URL from', dataAttr, e); console.warn("Error creating URL from", dataAttr, e);
return dataAttr; // Return the original value if URL construction fails return dataAttr;
} }
} }
return element.getAttribute(attribute); return element.getAttribute(attribute);
} };
// Enhanced table ancestor finding with context support // Enhanced table ancestor finding with context support
function findTableAncestor(element) { const findTableAncestor = (element) => {
let currentElement = element; let currentElement = element;
const MAX_DEPTH = 5; const MAX_DEPTH = 5;
let depth = 0; let depth = 0;
@@ -593,14 +759,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
continue; continue;
} }
if (currentElement.tagName === 'TD') { if (currentElement.tagName === "TD") {
return { type: 'TD', element: currentElement }; return { type: "TD", element: currentElement };
} else if (currentElement.tagName === 'TR') { } else if (currentElement.tagName === "TR") {
return { type: 'TR', element: currentElement }; return { type: "TR", element: currentElement };
} }
// Handle iframe and frame crossing // Handle iframe and frame crossing
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') { if (
currentElement.tagName === "IFRAME" ||
currentElement.tagName === "FRAME"
) {
try { try {
currentElement = currentElement.contentDocument.body; currentElement = currentElement.contentDocument.body;
} catch (e) { } catch (e) {
@@ -612,26 +781,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
depth++; depth++;
} }
return null; return null;
} };
// Helper function to get cell index // Helper function to get cell index
function getCellIndex(td) { const getCellIndex = (td) => {
if (td.getRootNode() instanceof ShadowRoot) { if (td.getRootNode() instanceof ShadowRoot) {
const shadowRoot = td.getRootNode(); const shadowRoot = td.getRootNode();
const allCells = Array.from(shadowRoot.querySelectorAll('td')); const allCells = Array.from(shadowRoot.querySelectorAll("td"));
return allCells.indexOf(td); return allCells.indexOf(td);
} }
let index = 0; let index = 0;
let sibling = td; let sibling = td;
while (sibling = sibling.previousElementSibling) { while ((sibling = sibling.previousElementSibling)) {
index++; index++;
} }
return index; return index;
} };
// Helper function to check for TH elements // Helper function to check for TH elements
function hasThElement(row, tableFields) { const hasThElement = (row, tableFields) => {
for (const [_, { selector }] of Object.entries(tableFields)) { for (const [_, { selector }] of Object.entries(tableFields)) {
const element = queryElement(row, selector); const element = queryElement(row, selector);
if (element) { if (element) {
@@ -642,9 +811,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
continue; continue;
} }
if (current.tagName === 'TH') return true; if (current.tagName === "TH") return true;
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') { if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
try { try {
current = current.contentDocument.body; current = current.contentDocument.body;
} catch (e) { } catch (e) {
@@ -657,35 +826,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
} }
return false; return false;
} };
// Helper function to filter rows // Helper function to filter rows
function filterRowsBasedOnTag(rows, tableFields) { const filterRowsBasedOnTag = (rows, tableFields) => {
for (const row of rows) { for (const row of rows) {
if (hasThElement(row, tableFields)) { if (hasThElement(row, tableFields)) {
return rows; return rows;
} }
} }
// Include shadow DOM in TH search return rows.filter((row) => {
return rows.filter(row => { const directTH = row.getElementsByTagName("TH").length === 0;
const directTH = row.getElementsByTagName('TH').length === 0; const shadowTH = row.shadowRoot
const shadowTH = row.shadowRoot ? ? row.shadowRoot.querySelector("th") === null
row.shadowRoot.querySelector('th') === null : true; : true;
return directTH && shadowTH; return directTH && shadowTH;
}); });
} };
// Class similarity comparison functions // Class similarity comparison functions
function calculateClassSimilarity(classList1, classList2) { const calculateClassSimilarity = (classList1, classList2) => {
const set1 = new Set(classList1); const set1 = new Set(classList1);
const set2 = new Set(classList2); const set2 = new Set(classList2);
const intersection = new Set([...set1].filter(x => set2.has(x))); const intersection = new Set([...set1].filter((x) => set2.has(x)));
const union = new Set([...set1, ...set2]); const union = new Set([...set1, ...set2]);
return intersection.size / union.size; return intersection.size / union.size;
} };
// Enhanced similar elements finding with context support // Enhanced similar elements finding with context support
function findSimilarElements(baseElement, similarityThreshold = 0.7) { const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
const baseClasses = Array.from(baseElement.classList); const baseClasses = Array.from(baseElement.classList);
if (baseClasses.length === 0) return []; if (baseClasses.length === 0) return [];
@@ -697,25 +866,33 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Get elements from shadow DOM // Get elements from shadow DOM
if (baseElement.getRootNode() instanceof ShadowRoot) { if (baseElement.getRootNode() instanceof ShadowRoot) {
const shadowHost = baseElement.getRootNode().host; const shadowHost = baseElement.getRootNode().host;
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName)); allElements.push(
...shadowHost.getElementsByTagName(baseElement.tagName)
);
} }
// Get elements from iframes and frames // Get elements from iframes and frames
const frames = [ const frames = [
...Array.from(document.getElementsByTagName('iframe')), ...Array.from(document.getElementsByTagName("iframe")),
...Array.from(document.getElementsByTagName('frame')) ...Array.from(document.getElementsByTagName("frame")),
]; ];
for (const frame of frames) { for (const frame of frames) {
try { try {
const frameDoc = frame.contentDocument || frame.contentWindow.document; const frameDoc =
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName)); frame.contentDocument || frame.contentWindow.document;
allElements.push(
...frameDoc.getElementsByTagName(baseElement.tagName)
);
} catch (e) { } catch (e) {
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e); console.warn(
`Cannot access ${frame.tagName.toLowerCase()} content:`,
e
);
} }
} }
return allElements.filter(element => { return allElements.filter((element) => {
if (element === baseElement) return false; if (element === baseElement) return false;
const similarity = calculateClassSimilarity( const similarity = calculateClassSimilarity(
baseClasses, baseClasses,
@@ -723,45 +900,92 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
); );
return similarity >= similarityThreshold; return similarity >= similarityThreshold;
}); });
} };
function tryFallbackSelector(rootElement, originalSelector) { const tryFallbackSelector = (rootElement, originalSelector) => {
let element = queryElement(rootElement, originalSelector); let element = queryElement(rootElement, originalSelector);
if (!element && originalSelector.includes('nth-child')) { if (!element && originalSelector.includes("nth-child")) {
const match = originalSelector.match(/nth-child\((\d+)\)/); const match = originalSelector.match(/nth-child\((\d+)\)/);
if (match) { if (match) {
const position = parseInt(match[1], 10); const position = parseInt(match[1], 10);
for (let i = position - 1; i >= 1; i--) { for (let i = position - 1; i >= 1; i--) {
const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`); const fallbackSelector = originalSelector.replace(
/nth-child\(\d+\)/,
`nth-child(${i})`
);
element = queryElement(rootElement, fallbackSelector); element = queryElement(rootElement, fallbackSelector);
if (element) break; if (element) break;
} }
if (!element) { if (!element) {
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, ''); const baseSelector = originalSelector.replace(
/\:nth-child\(\d+\)/,
""
);
element = queryElement(rootElement, baseSelector); element = queryElement(rootElement, baseSelector);
} }
} }
} }
return element; return element;
} };
/**
 * Build an XPath that targets a field inside one specific list container.
 *
 * The list selector is wrapped as `(listSelector)[containerIndex]` so that the
 * resulting expression resolves against the N-th container only.
 *
 * @param {string} childSelector - XPath of the field to extract.
 * @param {string} listSelector - XPath that matches every list container.
 * @param {number} containerIndex - 1-based position of the container (XPath
 *   positions start at 1).
 * @returns {string} XPath scoped to the requested container.
 */
const createIndexedXPath = (
  childSelector,
  listSelector,
  containerIndex
) => {
  const indexedListSelector = `(${listSelector})[${containerIndex}]`;
  const listPattern = listSelector.replace("//", "");

  // Try the "//"-anchored form first (the original behaviour), then the list
  // selector verbatim. The second candidate covers list selectors that do not
  // begin with "//" (e.g. "/html/body/ul/li"); previously the replacement was
  // a no-op for those and the un-indexed child selector was returned, so every
  // container resolved to the same node.
  for (const needle of [`//${listPattern}`, listSelector]) {
    if (childSelector.includes(needle)) {
      // A replacer function keeps "$&", "$1", ... inside the selector literal;
      // a plain string replacement would expand them.
      return childSelector.replace(needle, () => indexedListSelector);
    }
  }

  // The child selector does not embed the list selector: treat it as a path
  // relative to the indexed container ("//x" becomes "/x").
  return `${indexedListSelector}${childSelector.replace("//", "/")}`;
};
// Main scraping logic with unified support for both CSS and XPath
console.log("🚀 Starting unified list data extraction");
console.log("List Selector:", listSelector);
console.log("Fields:", fields);
// Main scraping logic with context support
let containers = queryElementAll(document, listSelector); let containers = queryElementAll(document, listSelector);
containers = Array.from(containers); containers = Array.from(containers);
if (containers.length === 0) return []; if (containers.length === 0) {
console.warn("❌ No containers found for listSelector:", listSelector);
return [];
}
if (limit > 1 && containers.length === 1) { console.log(`📦 Found ${containers.length} list containers`);
// For CSS selectors, try to find similar containers if needed
if (
!isXPathSelector(listSelector) &&
limit > 1 &&
containers.length === 1
) {
const baseContainer = containers[0]; const baseContainer = containers[0];
const similarContainers = findSimilarElements(baseContainer); const similarContainers = findSimilarElements(baseContainer);
if (similarContainers.length > 0) { if (similarContainers.length > 0) {
const newContainers = similarContainers.filter(container => const newContainers = similarContainers.filter(
!container.matches(listSelector) (container) => !container.matches(listSelector)
); );
containers = [...containers, ...newContainers]; containers = [...containers, ...newContainers];
} }
@@ -769,10 +993,60 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const containerFields = containers.map(() => ({ const containerFields = containers.map(() => ({
tableFields: {}, tableFields: {},
nonTableFields: {} nonTableFields: {},
})); }));
// Classify fields // For XPath selectors, use the new approach
if (isXPathSelector(listSelector)) {
const extractedData = [];
const containersToProcess = Math.min(containers.length, limit);
for (
let containerIndex = 0;
containerIndex < containersToProcess;
containerIndex++
) {
const record = {};
for (const [label, field] of Object.entries(fields)) {
let element = null;
if (isXPathSelector(field.selector)) {
// Create indexed absolute XPath
const indexedSelector = createIndexedXPath(
field.selector,
listSelector,
containerIndex + 1
);
element = evaluateXPath(document, indexedSelector);
} else {
// Fallback for CSS selectors within XPath containers
const container = containers[containerIndex];
element = queryElement(container, field.selector);
}
if (element) {
const value = extractValue(element, field.attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
record[label] = "";
}
} else {
record[label] = "";
}
}
if (Object.values(record).some((value) => value !== "")) {
extractedData.push(record);
}
}
console.log(`📊 Total records extracted: ${extractedData.length}`);
return extractedData;
}
// For CSS selectors, use the original table-aware approach
containers.forEach((container, containerIndex) => { containers.forEach((container, containerIndex) => {
for (const [label, field] of Object.entries(fields)) { for (const [label, field] of Object.entries(fields)) {
const sampleElement = queryElement(container, field.selector); const sampleElement = queryElement(container, field.selector);
@@ -783,7 +1057,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
containerFields[containerIndex].tableFields[label] = { containerFields[containerIndex].tableFields[label] = {
...field, ...field,
tableContext: ancestor.type, tableContext: ancestor.type,
cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 cellIndex:
ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1,
}; };
} else { } else {
containerFields[containerIndex].nonTableFields[label] = field; containerFields[containerIndex].nonTableFields[label] = field;
@@ -798,7 +1073,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const nonTableData = []; const nonTableData = [];
// Process table data with support for iframes, frames, and shadow DOM // Process table data with support for iframes, frames, and shadow DOM
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
const container = containers[containerIndex]; const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex]; const { tableFields } = containerFields[containerIndex];
@@ -808,13 +1087,20 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
let tableContext = firstElement; let tableContext = firstElement;
// Find table context including iframe, frame and shadow DOM // Find table context including iframe, frame and shadow DOM
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { while (
tableContext &&
tableContext.tagName !== "TABLE" &&
tableContext !== container
) {
if (tableContext.getRootNode() instanceof ShadowRoot) { if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = tableContext.getRootNode().host; tableContext = tableContext.getRootNode().host;
continue; continue;
} }
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') { if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try { try {
tableContext = tableContext.contentDocument.body; tableContext = tableContext.contentDocument.body;
} catch (e) { } catch (e) {
@@ -830,30 +1116,45 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const rows = []; const rows = [];
// Get rows from regular DOM // Get rows from regular DOM
rows.push(...tableContext.getElementsByTagName('TR')); rows.push(...tableContext.getElementsByTagName("TR"));
// Get rows from shadow DOM // Get rows from shadow DOM
if (tableContext.shadowRoot) { if (tableContext.shadowRoot) {
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR')); rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
} }
// Get rows from iframes and frames // Get rows from iframes and frames
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') { if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try { try {
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document; const frameDoc =
rows.push(...frameDoc.getElementsByTagName('TR')); tableContext.contentDocument ||
tableContext.contentWindow.document;
rows.push(...frameDoc.getElementsByTagName("TR"));
} catch (e) { } catch (e) {
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e); console.warn(
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
e
);
} }
} }
const processedRows = filterRowsBasedOnTag(rows, tableFields); const processedRows = filterRowsBasedOnTag(rows, tableFields);
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { for (
let rowIndex = 0;
rowIndex < Math.min(processedRows.length, limit);
rowIndex++
) {
const record = {}; const record = {};
const currentRow = processedRows[rowIndex]; const currentRow = processedRows[rowIndex];
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { for (const [
label,
{ selector, attribute, cellIndex },
] of Object.entries(tableFields)) {
let element = null; let element = null;
if (cellIndex >= 0) { if (cellIndex >= 0) {
@@ -871,18 +1172,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
if (td) { if (td) {
element = queryElement(td, selector); element = queryElement(td, selector);
if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) { if (
!element &&
selector
.split(/(?:>>|:>>)/)
.pop()
.includes("td:nth-child")
) {
element = td; element = td;
} }
if (!element) { if (!element) {
const tagOnlySelector = selector.split('.')[0]; const tagOnlySelector = selector.split(".")[0];
element = queryElement(td, tagOnlySelector); element = queryElement(td, tagOnlySelector);
} }
if (!element) { if (!element) {
let currentElement = td; let currentElement = td;
while (currentElement && currentElement.children.length > 0) { while (
currentElement &&
currentElement.children.length > 0
) {
let foundContentChild = false; let foundContentChild = false;
for (const child of currentElement.children) { for (const child of currentElement.children) {
if (extractValue(child, attribute)) { if (extractValue(child, attribute)) {
@@ -914,7 +1224,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
// Process non-table data with all contexts support // Process non-table data with all contexts support
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
if (nonTableData.length >= limit) break; if (nonTableData.length >= limit) break;
const container = containers[containerIndex]; const container = containers[containerIndex];
@@ -923,7 +1237,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
if (Object.keys(nonTableFields).length > 0) { if (Object.keys(nonTableFields).length > 0) {
const record = {}; const record = {};
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { for (const [label, { selector, attribute }] of Object.entries(
nonTableFields
)) {
// Get the last part of the selector after any context delimiter // Get the last part of the selector after any context delimiter
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0]; const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
const element = tryFallbackSelector(container, relativeSelector); const element = tryFallbackSelector(container, relativeSelector);
@@ -941,6 +1257,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Merge and limit the results // Merge and limit the results
const scrapedData = [...tableData, ...nonTableData]; const scrapedData = [...tableData, ...nonTableData];
console.log(`📊 Total records extracted: ${scrapedData.length}`);
return scrapedData; return scrapedData;
}; };

View File

@@ -1287,28 +1287,42 @@ export class RemoteBrowser {
*/ */
public registerEditorEvents = (): void => { public registerEditorEvents = (): void => {
// For each event, include userId to make sure events are handled for the correct browser // For each event, include userId to make sure events are handled for the correct browser
logger.log('debug', `Registering editor events for user: ${this.userId}`); logger.log("debug", `Registering editor events for user: ${this.userId}`);
this.socket.on(`captureDirectScreenshot:${this.userId}`, async (settings) => { this.socket.on(
logger.debug(`Direct screenshot capture requested for user ${this.userId}`); `captureDirectScreenshot:${this.userId}`,
async (settings) => {
logger.debug(
`Direct screenshot capture requested for user ${this.userId}`
);
await this.captureDirectScreenshot(settings); await this.captureDirectScreenshot(settings);
}); }
);
// For backward compatibility // For backward compatibility
this.socket.on('captureDirectScreenshot', async (settings) => { this.socket.on("captureDirectScreenshot", async (settings) => {
await this.captureDirectScreenshot(settings); await this.captureDirectScreenshot(settings);
}); });
// Listen for specific events for this user // Listen for specific events for this user
this.socket.on(`rerender:${this.userId}`, async () => { this.socket.on(`rerender:${this.userId}`, async () => {
logger.debug(`Rerender event received for user ${this.userId}`); logger.debug(`Rerender event received for user ${this.userId}`);
if (this.renderingMode === "dom") {
await this.makeAndEmitDOMSnapshot();
} else {
await this.makeAndEmitScreenshot(); await this.makeAndEmitScreenshot();
}
}); });
// For backward compatibility, also listen to the general event this.socket.on("rerender", async () => {
this.socket.on('rerender', async () => { logger.debug(
logger.debug(`General rerender event received, checking if for user ${this.userId}`); `General rerender event received, checking if for user ${this.userId}`
);
if (this.renderingMode === "dom") {
await this.makeAndEmitDOMSnapshot();
} else {
await this.makeAndEmitScreenshot(); await this.makeAndEmitScreenshot();
}
}); });
this.socket.on(`settings:${this.userId}`, (settings) => { this.socket.on(`settings:${this.userId}`, (settings) => {
@@ -1317,19 +1331,25 @@ export class RemoteBrowser {
}); });
this.socket.on(`changeTab:${this.userId}`, async (tabIndex) => { this.socket.on(`changeTab:${this.userId}`, async (tabIndex) => {
logger.debug(`Tab change to ${tabIndex} requested for user ${this.userId}`); logger.debug(
`Tab change to ${tabIndex} requested for user ${this.userId}`
);
await this.changeTab(tabIndex); await this.changeTab(tabIndex);
}); });
this.socket.on(`addTab:${this.userId}`, async () => { this.socket.on(`addTab:${this.userId}`, async () => {
logger.debug(`New tab requested for user ${this.userId}`); logger.debug(`New tab requested for user ${this.userId}`);
await this.currentPage?.context().newPage(); await this.currentPage?.context().newPage();
const lastTabIndex = this.currentPage ? this.currentPage.context().pages().length - 1 : 0; const lastTabIndex = this.currentPage
? this.currentPage.context().pages().length - 1
: 0;
await this.changeTab(lastTabIndex); await this.changeTab(lastTabIndex);
}); });
this.socket.on(`closeTab:${this.userId}`, async (tabInfo) => { this.socket.on(`closeTab:${this.userId}`, async (tabInfo) => {
logger.debug(`Close tab ${tabInfo.index} requested for user ${this.userId}`); logger.debug(
`Close tab ${tabInfo.index} requested for user ${this.userId}`
);
const page = this.currentPage?.context().pages()[tabInfo.index]; const page = this.currentPage?.context().pages()[tabInfo.index];
if (page) { if (page) {
if (tabInfo.isCurrent) { if (tabInfo.isCurrent) {
@@ -1343,34 +1363,58 @@ export class RemoteBrowser {
} }
await page.close(); await page.close();
logger.log( logger.log(
'debug', "debug",
`Tab ${tabInfo.index} was closed for user ${this.userId}, new tab count: ${this.currentPage?.context().pages().length}` `Tab ${tabInfo.index} was closed for user ${
this.userId
}, new tab count: ${this.currentPage?.context().pages().length}`
); );
} else { } else {
logger.log('error', `Tab index ${tabInfo.index} out of range for user ${this.userId}`); logger.log(
"error",
`Tab index ${tabInfo.index} out of range for user ${this.userId}`
);
} }
}); });
this.socket.on(`setViewportSize:${this.userId}`, async (data: { width: number, height: number }) => { this.socket.on(
`setViewportSize:${this.userId}`,
async (data: { width: number; height: number }) => {
const { width, height } = data; const { width, height } = data;
logger.log('debug', `Viewport size change to width=${width}, height=${height} requested for user ${this.userId}`); logger.log(
"debug",
`Viewport size change to width=${width}, height=${height} requested for user ${this.userId}`
);
// Update the browser context's viewport dynamically // Update the browser context's viewport dynamically
if (this.context && this.browser) { if (this.context && this.browser) {
this.context = await this.browser.newContext({ viewport: { width, height } }); this.context = await this.browser.newContext({
logger.log('debug', `Viewport size updated to width=${width}, height=${height} for user ${this.userId}`); viewport: { width, height },
}
}); });
logger.log(
"debug",
`Viewport size updated to width=${width}, height=${height} for user ${this.userId}`
);
}
}
);
// For backward compatibility, also register the standard events // For backward compatibility, also register the standard events
this.socket.on('settings', (settings) => this.interpreterSettings = settings); this.socket.on(
this.socket.on('changeTab', async (tabIndex) => await this.changeTab(tabIndex)); "settings",
this.socket.on('addTab', async () => { (settings) => (this.interpreterSettings = settings)
);
this.socket.on(
"changeTab",
async (tabIndex) => await this.changeTab(tabIndex)
);
this.socket.on("addTab", async () => {
await this.currentPage?.context().newPage(); await this.currentPage?.context().newPage();
const lastTabIndex = this.currentPage ? this.currentPage.context().pages().length - 1 : 0; const lastTabIndex = this.currentPage
? this.currentPage.context().pages().length - 1
: 0;
await this.changeTab(lastTabIndex); await this.changeTab(lastTabIndex);
}); });
this.socket.on('closeTab', async (tabInfo) => { this.socket.on("closeTab", async (tabInfo) => {
const page = this.currentPage?.context().pages()[tabInfo.index]; const page = this.currentPage?.context().pages()[tabInfo.index];
if (page) { if (page) {
if (tabInfo.isCurrent) { if (tabInfo.isCurrent) {
@@ -1383,18 +1427,25 @@ export class RemoteBrowser {
await page.close(); await page.close();
} }
}); });
this.socket.on('setViewportSize', async (data: { width: number, height: number }) => { this.socket.on(
"setViewportSize",
async (data: { width: number; height: number }) => {
const { width, height } = data; const { width, height } = data;
if (this.context && this.browser) { if (this.context && this.browser) {
this.context = await this.browser.newContext({ viewport: { width, height } }); this.context = await this.browser.newContext({
} viewport: { width, height },
}); });
}
}
);
this.socket.on('extractListData', async (data: { this.socket.on(
listSelector: string, "extractListData",
fields: Record<string, any>, async (data: {
currentListId: number, listSelector: string;
pagination: any fields: Record<string, any>;
currentListId: number;
pagination: any;
}) => { }) => {
if (this.currentPage) { if (this.currentPage) {
const extractedData = await this.extractListData( const extractedData = await this.extractListData(
@@ -1403,12 +1454,13 @@ export class RemoteBrowser {
data.fields data.fields
); );
this.socket.emit('listDataExtracted', { this.socket.emit("listDataExtracted", {
currentListId: data.currentListId, currentListId: data.currentListId,
data: extractedData data: extractedData,
}); });
} }
}); }
);
}; };
/** /**
* Subscribes the remote browser for a screencast session * Subscribes the remote browser for a screencast session
@@ -1481,10 +1533,7 @@ export class RemoteBrowser {
* CDP-based DOM snapshot creation using captured network resources * CDP-based DOM snapshot creation using captured network resources
*/ */
public async makeAndEmitDOMSnapshot(): Promise<void> { public async makeAndEmitDOMSnapshot(): Promise<void> {
if ( if (!this.currentPage || !this.isDOMStreamingActive) {
!this.currentPage ||
!this.isDOMStreamingActive
) {
return; return;
} }
@@ -1537,6 +1586,7 @@ export class RemoteBrowser {
if (typeof window.rrwebSnapshot === "undefined") { if (typeof window.rrwebSnapshot === "undefined") {
throw new Error("rrweb-snapshot library not available"); throw new Error("rrweb-snapshot library not available");
} }
return window.rrwebSnapshot.snapshot(document, { return window.rrwebSnapshot.snapshot(document, {
inlineImages: true, inlineImages: true,
collectFonts: true, collectFonts: true,
@@ -1557,10 +1607,12 @@ export class RemoteBrowser {
this.emitRRWebSnapshot(enhancedSnapshot); this.emitRRWebSnapshot(enhancedSnapshot);
} catch (error) { } catch (error) {
// Handle navigation context destruction gracefully // Handle navigation context destruction gracefully
if (error instanceof Error && if (
error instanceof Error &&
(error.message.includes("Execution context was destroyed") || (error.message.includes("Execution context was destroyed") ||
error.message.includes("most likely because of a navigation") || error.message.includes("most likely because of a navigation") ||
error.message.includes("Target closed"))) { error.message.includes("Target closed"))
) {
logger.debug("DOM snapshot skipped due to page navigation or closure"); logger.debug("DOM snapshot skipped due to page navigation or closure");
return; // Don't emit error for navigation - this is expected return; // Don't emit error for navigation - this is expected
} }
@@ -1772,6 +1824,7 @@ export class RemoteBrowser {
const page = this.currentPage?.context().pages()[tabIndex]; const page = this.currentPage?.context().pages()[tabIndex];
if (page) { if (page) {
await this.stopScreencast(); await this.stopScreencast();
await this.stopDOM();
this.currentPage = page; this.currentPage = page;
await this.setupPageEventListeners(this.currentPage); await this.setupPageEventListeners(this.currentPage);
@@ -1783,8 +1836,13 @@ export class RemoteBrowser {
url: this.currentPage.url(), url: this.currentPage.url(),
userId: this.userId userId: this.userId
}); });
if (this.isDOMStreamingActive) {
await this.makeAndEmitDOMSnapshot();
await this.subscribeToDOM();
} else {
await this.makeAndEmitScreenshot(); await this.makeAndEmitScreenshot();
await this.subscribeToScreencast(); await this.subscribeToScreencast();
}
} else { } else {
logger.log('error', `${tabIndex} index out of range of pages`) logger.log('error', `${tabIndex} index out of range of pages`)
} }

View File

@@ -464,7 +464,6 @@ export class WorkflowGenerator {
public onClick = async (coordinates: Coordinates, page: Page) => { public onClick = async (coordinates: Coordinates, page: Page) => {
let where: WhereWhatPair["where"] = { url: this.getBestUrl(page.url()) }; let where: WhereWhatPair["where"] = { url: this.getBestUrl(page.url()) };
const selector = await this.generateSelector(page, coordinates, ActionType.Click); const selector = await this.generateSelector(page, coordinates, ActionType.Click);
console.log("COOORDINATES: ", coordinates);
logger.log('debug', `Element's selector: ${selector}`); logger.log('debug', `Element's selector: ${selector}`);
const elementInfo = await getElementInformation(page, coordinates, '', false); const elementInfo = await getElementInformation(page, coordinates, '', false);
@@ -999,6 +998,7 @@ export class WorkflowGenerator {
rect, rect,
selector: displaySelector, selector: displaySelector,
elementInfo, elementInfo,
isDOMMode: this.isDOMMode,
// Include shadow DOM specific information // Include shadow DOM specific information
shadowInfo: elementInfo?.isShadowRoot ? { shadowInfo: elementInfo?.isShadowRoot ? {
mode: elementInfo.shadowRootMode, mode: elementInfo.shadowRootMode,

View File

@@ -11,7 +11,7 @@ import { useTranslation } from 'react-i18next';
import { AuthContext } from '../../context/auth'; import { AuthContext } from '../../context/auth';
import { coordinateMapper } from '../../helpers/coordinateMapper'; import { coordinateMapper } from '../../helpers/coordinateMapper';
import { useBrowserDimensionsStore } from '../../context/browserDimensions'; import { useBrowserDimensionsStore } from '../../context/browserDimensions';
import { clientSelectorGenerator } from "../../helpers/clientSelectorGenerator"; import { clientSelectorGenerator, ElementFingerprint } from "../../helpers/clientSelectorGenerator";
import DatePicker from "../pickers/DatePicker"; import DatePicker from "../pickers/DatePicker";
import Dropdown from "../pickers/Dropdown"; import Dropdown from "../pickers/Dropdown";
import TimePicker from "../pickers/TimePicker"; import TimePicker from "../pickers/TimePicker";
@@ -147,15 +147,14 @@ export const BrowserWindow = () => {
const { browserWidth, browserHeight } = useBrowserDimensionsStore(); const { browserWidth, browserHeight } = useBrowserDimensionsStore();
const [canvasRef, setCanvasReference] = useState<React.RefObject<HTMLCanvasElement> | undefined>(undefined); const [canvasRef, setCanvasReference] = useState<React.RefObject<HTMLCanvasElement> | undefined>(undefined);
const [screenShot, setScreenShot] = useState<string>(""); const [screenShot, setScreenShot] = useState<string>("");
const [highlighterData, setHighlighterData] = useState<{ rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[] } | null>(null); const [highlighterData, setHighlighterData] = useState<{ rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[], groupElements?: Array<{ element: HTMLElement; rect: DOMRect } >} | null>(null);
const [showAttributeModal, setShowAttributeModal] = useState(false); const [showAttributeModal, setShowAttributeModal] = useState(false);
const [attributeOptions, setAttributeOptions] = useState<AttributeOption[]>([]); const [attributeOptions, setAttributeOptions] = useState<AttributeOption[]>([]);
const [selectedElement, setSelectedElement] = useState<{ selector: string, info: ElementInfo | null } | null>(null); const [selectedElement, setSelectedElement] = useState<{ selector: string, info: ElementInfo | null } | null>(null);
const [currentListId, setCurrentListId] = useState<number | null>(null); const [currentListId, setCurrentListId] = useState<number | null>(null);
const [viewportInfo, setViewportInfo] = useState<ViewportInfo>({ width: browserWidth, height: browserHeight }); const [viewportInfo, setViewportInfo] = useState<ViewportInfo>({ width: browserWidth, height: browserHeight });
const [isDOMMode, setIsDOMMode] = useState(false);
const [currentSnapshot, setCurrentSnapshot] = useState<ProcessedSnapshot | null>(null);
const [isLoading, setIsLoading] = useState(false); const [isLoading, setIsLoading] = useState(false);
const [cachedChildSelectors, setCachedChildSelectors] = useState<string[]>([]);
const [listSelector, setListSelector] = useState<string | null>(null); const [listSelector, setListSelector] = useState<string | null>(null);
const [fields, setFields] = useState<Record<string, TextStep>>({}); const [fields, setFields] = useState<Record<string, TextStep>>({});
@@ -164,10 +163,16 @@ export const BrowserWindow = () => {
const highlighterUpdateRef = useRef<number>(0); const highlighterUpdateRef = useRef<number>(0);
const { socket } = useSocketStore(); const { socket } = useSocketStore();
const { notify, currentTextActionId, currentListActionId } = useGlobalInfoStore(); const { notify, currentTextActionId, currentListActionId, updateDOMMode, isDOMMode, currentSnapshot } = useGlobalInfoStore();
const { getText, getList, paginationMode, paginationType, limitMode, captureStage } = useActionContext(); const { getText, getList, paginationMode, paginationType, limitMode, captureStage } = useActionContext();
const { addTextStep, addListStep, updateListStepData } = useBrowserSteps(); const { addTextStep, addListStep, updateListStepData } = useBrowserSteps();
const [currentGroupInfo, setCurrentGroupInfo] = useState<{
isGroupElement: boolean;
groupSize: number;
groupElements: HTMLElement[];
} | null>(null);
const { state } = useContext(AuthContext); const { state } = useContext(AuthContext);
const { user } = state; const { user } = state;
@@ -243,51 +248,47 @@ export const BrowserWindow = () => {
(data: RRWebDOMCastData) => { (data: RRWebDOMCastData) => {
if (!data.userId || data.userId === user?.id) { if (!data.userId || data.userId === user?.id) {
if (data.snapshotData && data.snapshotData.snapshot) { if (data.snapshotData && data.snapshotData.snapshot) {
setCurrentSnapshot(data.snapshotData); updateDOMMode(true, data.snapshotData);
setIsDOMMode(true);
socket?.emit("dom-mode-enabled"); socket?.emit("dom-mode-enabled");
setIsLoading(false); setIsLoading(false);
} else { } else {
setIsLoading(false); setIsLoading(false);
} }
} }
}, },
[user?.id, socket] [user?.id, socket, updateDOMMode]
); );
const domModeHandler = useCallback( const domModeHandler = useCallback(
(data: any) => { (data: any) => {
if (!data.userId || data.userId === user?.id) { if (!data.userId || data.userId === user?.id) {
setIsDOMMode(true); updateDOMMode(true);
socket?.emit("dom-mode-enabled"); socket?.emit("dom-mode-enabled");
setIsLoading(false); setIsLoading(false);
} }
}, },
[user?.id, socket] [user?.id, socket, updateDOMMode]
); );
const screenshotModeHandler = useCallback( const screenshotModeHandler = useCallback(
(data: any) => { (data: any) => {
if (!data.userId || data.userId === user?.id) { if (!data.userId || data.userId === user?.id) {
setIsDOMMode(false); updateDOMMode(false);
socket?.emit("screenshot-mode-enabled"); socket?.emit("screenshot-mode-enabled");
setCurrentSnapshot(null);
setIsLoading(false); setIsLoading(false);
} }
}, },
[user?.id] [user?.id, updateDOMMode]
); );
const domModeErrorHandler = useCallback( const domModeErrorHandler = useCallback(
(data: any) => { (data: any) => {
if (!data.userId || data.userId === user?.id) { if (!data.userId || data.userId === user?.id) {
setIsDOMMode(false); updateDOMMode(false);
setCurrentSnapshot(null);
setIsLoading(false); setIsLoading(false);
} }
}, },
[user?.id] [user?.id, updateDOMMode]
); );
useEffect(() => { useEffect(() => {
@@ -304,8 +305,23 @@ export const BrowserWindow = () => {
socket?.emit("listSelector", { selector: listSelector }); socket?.emit("listSelector", { selector: listSelector });
clientSelectorGenerator.setListSelector(listSelector); clientSelectorGenerator.setListSelector(listSelector);
setCachedChildSelectors([]);
if (currentSnapshot) {
const iframeElement = document.querySelector(
"#dom-browser-iframe"
) as HTMLIFrameElement;
if (iframeElement?.contentDocument) {
const childSelectors = clientSelectorGenerator.getChildSelectors(
iframeElement.contentDocument,
listSelector
);
setCachedChildSelectors(childSelectors);
} }
}, [isDOMMode, listSelector, socket, getList]); }
}
}, [isDOMMode, listSelector, socket, getList, currentSnapshot]);
useEffect(() => { useEffect(() => {
coordinateMapper.updateDimensions(dimensions.width, dimensions.height, viewportInfo.width, viewportInfo.height); coordinateMapper.updateDimensions(dimensions.width, dimensions.height, viewportInfo.width, viewportInfo.height);
@@ -345,6 +361,7 @@ export const BrowserWindow = () => {
setListSelector(null); setListSelector(null);
setFields({}); setFields({});
setCurrentListId(null); setCurrentListId(null);
setCachedChildSelectors([]);
}, []); }, []);
useEffect(() => { useEffect(() => {
@@ -372,7 +389,7 @@ export const BrowserWindow = () => {
socket.on("screencast", screencastHandler); socket.on("screencast", screencastHandler);
socket.on("domcast", rrwebSnapshotHandler); socket.on("domcast", rrwebSnapshotHandler);
socket.on("dom-mode-enabled", domModeHandler); socket.on("dom-mode-enabled", domModeHandler);
socket.on("screenshot-mode-enabled", screenshotModeHandler); // socket.on("screenshot-mode-enabled", screenshotModeHandler);
socket.on("dom-mode-error", domModeErrorHandler); socket.on("dom-mode-error", domModeErrorHandler);
} }
@@ -386,7 +403,7 @@ export const BrowserWindow = () => {
socket.off("screencast", screencastHandler); socket.off("screencast", screencastHandler);
socket.off("domcast", rrwebSnapshotHandler); socket.off("domcast", rrwebSnapshotHandler);
socket.off("dom-mode-enabled", domModeHandler); socket.off("dom-mode-enabled", domModeHandler);
socket.off("screenshot-mode-enabled", screenshotModeHandler); // socket.off("screenshot-mode-enabled", screenshotModeHandler);
socket.off("dom-mode-error", domModeErrorHandler); socket.off("dom-mode-error", domModeErrorHandler);
} }
}; };
@@ -398,7 +415,7 @@ export const BrowserWindow = () => {
screencastHandler, screencastHandler,
rrwebSnapshotHandler, rrwebSnapshotHandler,
domModeHandler, domModeHandler,
screenshotModeHandler, // screenshotModeHandler,
domModeErrorHandler, domModeErrorHandler,
]); ]);
@@ -408,8 +425,19 @@ export const BrowserWindow = () => {
selector: string; selector: string;
elementInfo: ElementInfo | null; elementInfo: ElementInfo | null;
childSelectors?: string[]; childSelectors?: string[];
groupInfo?: {
isGroupElement: boolean;
groupSize: number;
groupElements: HTMLElement[];
groupFingerprint: ElementFingerprint;
};
isDOMMode?: boolean; isDOMMode?: boolean;
}) => { }) => {
if (!getText && !getList) {
setHighlighterData(null);
return;
}
if (!isDOMMode || !currentSnapshot) { if (!isDOMMode || !currentSnapshot) {
return; return;
} }
@@ -424,15 +452,6 @@ export const BrowserWindow = () => {
) as HTMLIFrameElement; ) as HTMLIFrameElement;
} }
if (!iframeElement) {
const browserWindow = document.querySelector("#browser-window");
if (browserWindow) {
iframeElement = browserWindow.querySelector(
"iframe"
) as HTMLIFrameElement;
}
}
if (!iframeElement) { if (!iframeElement) {
console.error("Could not find iframe element for DOM highlighting"); console.error("Could not find iframe element for DOM highlighting");
return; return;
@@ -441,6 +460,12 @@ export const BrowserWindow = () => {
const iframeRect = iframeElement.getBoundingClientRect(); const iframeRect = iframeElement.getBoundingClientRect();
const IFRAME_BODY_PADDING = 16; const IFRAME_BODY_PADDING = 16;
if (data.groupInfo) {
setCurrentGroupInfo(data.groupInfo);
} else {
setCurrentGroupInfo(null);
}
const absoluteRect = new DOMRect( const absoluteRect = new DOMRect(
data.rect.x + iframeRect.left - IFRAME_BODY_PADDING, data.rect.x + iframeRect.left - IFRAME_BODY_PADDING,
data.rect.y + iframeRect.top - IFRAME_BODY_PADDING, data.rect.y + iframeRect.top - IFRAME_BODY_PADDING,
@@ -451,12 +476,36 @@ export const BrowserWindow = () => {
const mappedData = { const mappedData = {
...data, ...data,
rect: absoluteRect, rect: absoluteRect,
childSelectors: data.childSelectors || cachedChildSelectors,
}; };
if (getList === true) { if (getList === true) {
if (listSelector) { if (!listSelector && data.groupInfo?.isGroupElement) {
socket?.emit("listSelector", { selector: listSelector }); const updatedGroupElements = data.groupInfo.groupElements.map(
const hasValidChildSelectors = (element) => {
const elementRect = element.getBoundingClientRect();
return {
element,
rect: new DOMRect(
elementRect.x + iframeRect.left - IFRAME_BODY_PADDING,
elementRect.y + iframeRect.top - IFRAME_BODY_PADDING,
elementRect.width,
elementRect.height
),
};
}
);
const mappedData = {
...data,
rect: absoluteRect,
groupElements: updatedGroupElements,
childSelectors: data.childSelectors || cachedChildSelectors,
};
setHighlighterData(mappedData);
} else if (listSelector) {
const hasChildSelectors =
Array.isArray(mappedData.childSelectors) && Array.isArray(mappedData.childSelectors) &&
mappedData.childSelectors.length > 0; mappedData.childSelectors.length > 0;
@@ -471,62 +520,8 @@ export const BrowserWindow = () => {
} else { } else {
setHighlighterData(null); setHighlighterData(null);
} }
} else if ( } else if (hasChildSelectors) {
mappedData.childSelectors &&
mappedData.childSelectors.includes(mappedData.selector)
) {
setHighlighterData(mappedData); setHighlighterData(mappedData);
} else if (
mappedData.elementInfo?.isIframeContent &&
mappedData.childSelectors
) {
const isIframeChild = mappedData.childSelectors.some(
(childSelector) =>
mappedData.selector.includes(":>>") &&
childSelector
.split(":>>")
.some((part) => mappedData.selector.includes(part.trim()))
);
setHighlighterData(isIframeChild ? mappedData : null);
} else if (
mappedData.selector.includes(":>>") &&
hasValidChildSelectors
) {
const selectorParts = mappedData.selector
.split(":>>")
.map((part) => part.trim());
const isValidMixedSelector = selectorParts.some((part) =>
mappedData.childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
setHighlighterData(isValidMixedSelector ? mappedData : null);
} else if (
mappedData.elementInfo?.isShadowRoot &&
mappedData.childSelectors
) {
const isShadowChild = mappedData.childSelectors.some(
(childSelector) =>
mappedData.selector.includes(">>") &&
childSelector
.split(">>")
.some((part) => mappedData.selector.includes(part.trim()))
);
setHighlighterData(isShadowChild ? mappedData : null);
} else if (
mappedData.selector.includes(">>") &&
hasValidChildSelectors
) {
const selectorParts = mappedData.selector
.split(">>")
.map((part) => part.trim());
const isValidMixedSelector = selectorParts.some((part) =>
mappedData.childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
setHighlighterData(isValidMixedSelector ? mappedData : null);
} else { } else {
setHighlighterData(null); setHighlighterData(null);
} }
@@ -534,23 +529,29 @@ export const BrowserWindow = () => {
setHighlighterData(mappedData); setHighlighterData(mappedData);
} }
} else { } else {
// getText mode
setHighlighterData(mappedData); setHighlighterData(mappedData);
} }
}, },
[ [
isDOMMode, isDOMMode,
currentSnapshot, currentSnapshot,
getText,
getList, getList,
socket, socket,
listSelector, listSelector,
paginationMode, paginationMode,
paginationType, paginationType,
limitMode, limitMode,
cachedChildSelectors,
] ]
); );
const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[] }) => { const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[], isDOMMode?: boolean; }) => {
if (isDOMMode || data.isDOMMode) {
domHighlighterHandler(data);
return;
}
const now = performance.now(); const now = performance.now();
if (now - highlighterUpdateRef.current < 16) { if (now - highlighterUpdateRef.current < 16) {
return; return;
@@ -652,6 +653,20 @@ export const BrowserWindow = () => {
}; };
}, [socket, highlighterHandler, onMouseMove, getList, listSelector]); }, [socket, highlighterHandler, onMouseMove, getList, listSelector]);
useEffect(() => {
document.addEventListener("mousemove", onMouseMove, false);
if (socket) {
socket.off("highlighter", highlighterHandler);
socket.on("highlighter", highlighterHandler);
}
return () => {
document.removeEventListener("mousemove", onMouseMove);
if (socket) {
socket.off("highlighter", highlighterHandler);
}
};
}, [socket, highlighterHandler, getList, listSelector]);
useEffect(() => { useEffect(() => {
if (socket && listSelector) { if (socket && listSelector) {
console.log('Syncing list selector with server:', listSelector); console.log('Syncing list selector with server:', listSelector);
@@ -673,11 +688,205 @@ export const BrowserWindow = () => {
selector: string; selector: string;
elementInfo: ElementInfo | null; elementInfo: ElementInfo | null;
childSelectors?: string[]; childSelectors?: string[];
groupInfo?: {
isGroupElement: boolean;
groupSize: number;
groupElements: HTMLElement[];
};
}) => { }) => {
setShowAttributeModal(false); setShowAttributeModal(false);
setSelectedElement(null); setSelectedElement(null);
setAttributeOptions([]); setAttributeOptions([]);
if (paginationMode && getList) {
if (
paginationType !== "" &&
paginationType !== "scrollDown" &&
paginationType !== "scrollUp" &&
paginationType !== "none"
) {
setPaginationSelector(highlighterData.selector);
notify(
`info`,
t(
"browser_window.attribute_modal.notifications.pagination_select_success"
)
);
addListStep(
listSelector!,
fields,
currentListId || 0,
currentListActionId || `list-${crypto.randomUUID()}`,
{ type: paginationType, selector: highlighterData.selector }
);
socket?.emit("setPaginationMode", { pagination: false });
}
return;
}
if (
getList === true &&
!listSelector &&
highlighterData.groupInfo?.isGroupElement
) {
let cleanedSelector = highlighterData.selector;
setListSelector(cleanedSelector);
notify(
`info`,
t(
"browser_window.attribute_modal.notifications.list_select_success",
{
count: highlighterData.groupInfo.groupSize,
}
) ||
`Selected group with ${highlighterData.groupInfo.groupSize} similar elements`
);
setCurrentListId(Date.now());
setFields({});
socket?.emit("setGetList", { getList: true });
socket?.emit("listSelector", { selector: cleanedSelector });
return;
}
if (getList === true && listSelector && currentListId) {
const options = getAttributeOptions(
highlighterData.elementInfo?.tagName || "",
highlighterData.elementInfo
);
if (options.length === 1) {
const attribute = options[0].value;
let currentSelector = highlighterData.selector;
const data =
attribute === "href"
? highlighterData.elementInfo?.url || ""
: attribute === "src"
? highlighterData.elementInfo?.imageUrl || ""
: highlighterData.elementInfo?.innerText || "";
const newField: TextStep = {
id: Date.now(),
type: "text",
label: `Label ${Object.keys(fields).length + 1}`,
data: data,
selectorObj: {
selector: currentSelector,
tag: highlighterData.elementInfo?.tagName,
shadow: highlighterData.elementInfo?.isShadowRoot,
attribute,
},
};
const updatedFields = {
...fields,
[newField.id]: newField,
};
setFields(updatedFields);
if (listSelector) {
addListStep(
listSelector,
updatedFields,
currentListId,
currentListActionId || `list-${crypto.randomUUID()}`,
{ type: "", selector: paginationSelector }
);
}
} else {
setAttributeOptions(options);
setSelectedElement({
selector: highlighterData.selector,
info: highlighterData.elementInfo,
});
setShowAttributeModal(true);
}
return;
}
if (getText === true) {
const options = getAttributeOptions(
highlighterData.elementInfo?.tagName || "",
highlighterData.elementInfo
);
if (options.length === 1) {
const attribute = options[0].value;
const data =
attribute === "href"
? highlighterData.elementInfo?.url || ""
: attribute === "src"
? highlighterData.elementInfo?.imageUrl || ""
: highlighterData.elementInfo?.innerText || "";
addTextStep(
"",
data,
{
selector: highlighterData.selector,
tag: highlighterData.elementInfo?.tagName,
shadow: highlighterData.elementInfo?.isShadowRoot,
attribute,
},
currentTextActionId || `text-${crypto.randomUUID()}`
);
} else {
setAttributeOptions(options);
setSelectedElement({
selector: highlighterData.selector,
info: highlighterData.elementInfo,
});
setShowAttributeModal(true);
}
}
},
[
getText,
getList,
listSelector,
paginationMode,
paginationType,
limitMode,
fields,
currentListId,
currentTextActionId,
currentListActionId,
addTextStep,
addListStep,
notify,
socket,
t,
paginationSelector,
]
);
const handleClick = (e: React.MouseEvent<HTMLDivElement>) => {
if (highlighterData) {
let shouldProcessClick = false;
if (!isDOMMode && canvasRef?.current) {
const canvasRect = canvasRef.current.getBoundingClientRect();
const clickX = e.clientX - canvasRect.left;
const clickY = e.clientY - canvasRect.top;
const highlightRect = highlighterData.rect;
const mappedRect =
coordinateMapper.mapBrowserRectToCanvas(highlightRect);
shouldProcessClick =
clickX >= mappedRect.left &&
clickX <= mappedRect.right &&
clickY >= mappedRect.top &&
clickY <= mappedRect.bottom;
} else {
shouldProcessClick = true;
}
if (shouldProcessClick) {
const options = getAttributeOptions( const options = getAttributeOptions(
highlighterData.elementInfo?.tagName || "", highlighterData.elementInfo?.tagName || "",
highlighterData.elementInfo highlighterData.elementInfo
@@ -742,47 +951,24 @@ export const BrowserWindow = () => {
if (getList === true && !listSelector) { if (getList === true && !listSelector) {
let cleanedSelector = highlighterData.selector; let cleanedSelector = highlighterData.selector;
if (cleanedSelector.includes("nth-child")) { if (
cleanedSelector = cleanedSelector.replace(/:nth-child\(\d+\)/g, ""); cleanedSelector.includes("[") &&
cleanedSelector.match(/\[\d+\]/)
) {
cleanedSelector = cleanedSelector.replace(/\[\d+\]/g, "");
} }
setListSelector(cleanedSelector); setListSelector(cleanedSelector);
notify( notify(
`info`, `info`,
t("browser_window.attribute_modal.notifications.list_select_success") t(
"browser_window.attribute_modal.notifications.list_select_success"
)
); );
setCurrentListId(Date.now()); setCurrentListId(Date.now());
setFields({}); setFields({});
socket?.emit("setGetList", { getList: true });
socket?.emit("listSelector", { selector: cleanedSelector });
} else if (getList === true && listSelector && currentListId) { } else if (getList === true && listSelector && currentListId) {
if (options.length === 1) {
const attribute = options[0].value; const attribute = options[0].value;
let currentSelector = highlighterData.selector;
if (currentSelector.includes(">")) {
const [firstPart, ...restParts] = currentSelector
.split(">")
.map((p) => p.trim());
const listSelectorRightPart = listSelector
.split(">")
.pop()
?.trim()
.replace(/:nth-child\(\d+\)/g, "");
if (
firstPart.includes("nth-child") &&
firstPart.replace(/:nth-child\(\d+\)/g, "") ===
listSelectorRightPart
) {
currentSelector = `${firstPart.replace(
/:nth-child\(\d+\)/g,
""
)} > ${restParts.join(" > ")}`;
}
}
const data = const data =
attribute === "href" attribute === "href"
? highlighterData.elementInfo?.url || "" ? highlighterData.elementInfo?.url || ""
@@ -790,6 +976,22 @@ export const BrowserWindow = () => {
? highlighterData.elementInfo?.imageUrl || "" ? highlighterData.elementInfo?.imageUrl || ""
: highlighterData.elementInfo?.innerText || ""; : highlighterData.elementInfo?.innerText || "";
if (options.length === 1) {
let currentSelector = highlighterData.selector;
if (currentSelector.includes("/")) {
const xpathParts = currentSelector
.split("/")
.filter((part) => part);
const cleanedParts = xpathParts.map((part) => {
return part.replace(/\[\d+\]/g, "");
});
if (cleanedParts.length > 0) {
currentSelector = "//" + cleanedParts.join("/");
}
}
const newField: TextStep = { const newField: TextStep = {
id: Date.now(), id: Date.now(),
type: "text", type: "text",
@@ -828,150 +1030,6 @@ export const BrowserWindow = () => {
setShowAttributeModal(true); setShowAttributeModal(true);
} }
} }
},
[
getText,
getList,
listSelector,
paginationMode,
paginationType,
fields,
currentListId,
currentTextActionId,
currentListActionId,
addTextStep,
addListStep,
notify,
socket,
t,
paginationSelector,
]
);
const handleClick = (e: React.MouseEvent<HTMLDivElement>) => {
if (highlighterData && canvasRef?.current) {
const canvasRect = canvasRef.current.getBoundingClientRect();
const clickX = e.clientX - canvasRect.left;
const clickY = e.clientY - canvasRect.top;
const highlightRect = highlighterData.rect;
const mappedRect = coordinateMapper.mapBrowserRectToCanvas(highlightRect);
if (
clickX >= mappedRect.left &&
clickX <= mappedRect.right &&
clickY >= mappedRect.top &&
clickY <= mappedRect.bottom
) {
const options = getAttributeOptions(highlighterData.elementInfo?.tagName || '', highlighterData.elementInfo);
if (getText === true) {
if (options.length === 1) {
// Directly use the available attribute if only one option is present
const attribute = options[0].value;
const data = attribute === 'href' ? highlighterData.elementInfo?.url || '' :
attribute === 'src' ? highlighterData.elementInfo?.imageUrl || '' :
highlighterData.elementInfo?.innerText || '';
addTextStep('', data, {
selector: highlighterData.selector,
tag: highlighterData.elementInfo?.tagName,
shadow: highlighterData.elementInfo?.isShadowRoot,
attribute,
}, currentTextActionId || `text-${crypto.randomUUID()}`);
} else {
// Show the modal if there are multiple options
setAttributeOptions(options);
setSelectedElement({
selector: highlighterData.selector,
info: highlighterData.elementInfo,
});
setShowAttributeModal(true);
}
}
if (paginationMode && getList) {
// Only allow selection in pagination mode if type is not empty, 'scrollDown', or 'scrollUp'
if (paginationType !== '' && paginationType !== 'scrollDown' && paginationType !== 'scrollUp' && paginationType !== 'none') {
setPaginationSelector(highlighterData.selector);
notify(`info`, t('browser_window.attribute_modal.notifications.pagination_select_success'));
addListStep(listSelector!, fields, currentListId || 0, currentListActionId || `list-${crypto.randomUUID()}`, { type: paginationType, selector: highlighterData.selector });
socket?.emit('setPaginationMode', { pagination: false });
}
return;
}
if (getList === true && !listSelector) {
let cleanedSelector = highlighterData.selector;
if (cleanedSelector.includes('nth-child')) {
cleanedSelector = cleanedSelector.replace(/:nth-child\(\d+\)/g, '');
}
setListSelector(cleanedSelector);
notify(`info`, t('browser_window.attribute_modal.notifications.list_select_success'));
setCurrentListId(Date.now());
setFields({});
} else if (getList === true && listSelector && currentListId) {
const attribute = options[0].value;
const data = attribute === 'href' ? highlighterData.elementInfo?.url || '' :
attribute === 'src' ? highlighterData.elementInfo?.imageUrl || '' :
highlighterData.elementInfo?.innerText || '';
// Add fields to the list
if (options.length === 1) {
const attribute = options[0].value;
let currentSelector = highlighterData.selector;
if (currentSelector.includes('>')) {
const [firstPart, ...restParts] = currentSelector.split('>').map(p => p.trim());
const listSelectorRightPart = listSelector.split('>').pop()?.trim().replace(/:nth-child\(\d+\)/g, '');
if (firstPart.includes('nth-child') &&
firstPart.replace(/:nth-child\(\d+\)/g, '') === listSelectorRightPart) {
currentSelector = `${firstPart.replace(/:nth-child\(\d+\)/g, '')} > ${restParts.join(' > ')}`;
}
}
const newField: TextStep = {
id: Date.now(),
type: 'text',
label: `Label ${Object.keys(fields).length + 1}`,
data: data,
selectorObj: {
selector: currentSelector,
tag: highlighterData.elementInfo?.tagName,
shadow: highlighterData.elementInfo?.isShadowRoot,
attribute
}
};
const updatedFields = {
...fields,
[newField.id]: newField
};
setFields(updatedFields);
if (listSelector) {
addListStep(
listSelector,
updatedFields,
currentListId,
currentListActionId || `list-${crypto.randomUUID()}`,
{ type: '', selector: paginationSelector }
);
}
} else {
setAttributeOptions(options);
setSelectedElement({
selector: highlighterData.selector,
info: highlighterData.elementInfo
});
setShowAttributeModal(true);
}
}
} }
} }
}; };
@@ -1150,6 +1208,10 @@ export const BrowserWindow = () => {
{isDOMMode && highlighterData && ( {isDOMMode && highlighterData && (
<> <>
{/* Individual element highlight (for non-group or hovered element) */}
{(!getList ||
listSelector ||
!currentGroupInfo?.isGroupElement) && (
<div <div
style={{ style={{
position: "absolute", position: "absolute",
@@ -1172,6 +1234,59 @@ export const BrowserWindow = () => {
transition: "all 0.1s ease-out", transition: "all 0.1s ease-out",
}} }}
/> />
)}
{/* Group elements highlighting with real-time coordinates */}
{getList &&
!listSelector &&
currentGroupInfo?.isGroupElement &&
highlighterData.groupElements &&
highlighterData.groupElements.map((groupElement, index) => (
<React.Fragment key={index}>
{/* Highlight box */}
<div
style={{
position: "absolute",
left: Math.max(0, groupElement.rect.x),
top: Math.max(0, groupElement.rect.y),
width: Math.min(
groupElement.rect.width,
dimensions.width
),
height: Math.min(
groupElement.rect.height,
dimensions.height
),
background: "rgba(255, 0, 195, 0.15)",
border: "2px dashed #ff00c3",
borderRadius: "3px",
pointerEvents: "none",
zIndex: 1000,
boxShadow: "0 0 0 1px rgba(255, 255, 255, 0.8)",
transition: "all 0.1s ease-out",
}}
/>
<div
style={{
position: "absolute",
left: Math.max(0, groupElement.rect.x),
top: Math.max(0, groupElement.rect.y - 20),
background: "#ff00c3",
color: "white",
padding: "2px 6px",
fontSize: "10px",
fontWeight: "bold",
borderRadius: "2px",
pointerEvents: "none",
zIndex: 1001,
whiteSpace: "nowrap",
}}
>
List item {index + 1}
</div>
</React.Fragment>
))}
</> </>
)} )}
</> </>
@@ -1186,6 +1301,7 @@ export const BrowserWindow = () => {
getList={getList} getList={getList}
getText={getText} getText={getText}
listSelector={listSelector} listSelector={listSelector}
cachedChildSelectors={cachedChildSelectors}
paginationMode={paginationMode} paginationMode={paginationMode}
paginationType={paginationType} paginationType={paginationType}
limitMode={limitMode} limitMode={limitMode}

View File

@@ -98,6 +98,7 @@ interface RRWebDOMBrowserRendererProps {
getList?: boolean; getList?: boolean;
getText?: boolean; getText?: boolean;
listSelector?: string | null; listSelector?: string | null;
cachedChildSelectors?: string[];
paginationMode?: boolean; paginationMode?: boolean;
paginationType?: string; paginationType?: string;
limitMode?: boolean; limitMode?: boolean;
@@ -106,12 +107,14 @@ interface RRWebDOMBrowserRendererProps {
selector: string; selector: string;
elementInfo: ElementInfo | null; elementInfo: ElementInfo | null;
childSelectors?: string[]; childSelectors?: string[];
groupInfo?: any;
}) => void; }) => void;
onElementSelect?: (data: { onElementSelect?: (data: {
rect: DOMRect; rect: DOMRect;
selector: string; selector: string;
elementInfo: ElementInfo | null; elementInfo: ElementInfo | null;
childSelectors?: string[]; childSelectors?: string[];
groupInfo?: any;
}) => void; }) => void;
onShowDatePicker?: (info: { onShowDatePicker?: (info: {
coordinates: { x: number; y: number }; coordinates: { x: number; y: number };
@@ -144,6 +147,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
getList = false, getList = false,
getText = false, getText = false,
listSelector = null, listSelector = null,
cachedChildSelectors = [],
paginationMode = false, paginationMode = false,
paginationType = "", paginationType = "",
limitMode = false, limitMode = false,
@@ -205,11 +209,24 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
const handleDOMHighlighting = useCallback( const handleDOMHighlighting = useCallback(
(x: number, y: number, iframeDoc: Document) => { (x: number, y: number, iframeDoc: Document) => {
try { try {
if (!getText && !getList) {
setCurrentHighlight(null);
if (onHighlight) {
onHighlight({
rect: new DOMRect(0, 0, 0, 0),
selector: "",
elementInfo: null,
});
}
return;
}
const highlighterData = const highlighterData =
clientSelectorGenerator.generateDataForHighlighter( clientSelectorGenerator.generateDataForHighlighter(
{ x, y }, { x, y },
iframeDoc, iframeDoc,
true true,
cachedChildSelectors
); );
if (!highlighterData) { if (!highlighterData) {
@@ -224,70 +241,40 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
return; return;
} }
const { rect, selector, elementInfo, childSelectors } = highlighterData; const { rect, selector, elementInfo, childSelectors, groupInfo } =
highlighterData;
let shouldHighlight = false; let shouldHighlight = false;
if (getList) { if (getList) {
if (listSelector) { // First phase: Allow any group to be highlighted for selection
const hasValidChildSelectors = if (!listSelector && groupInfo?.isGroupElement) {
Array.isArray(childSelectors) && childSelectors.length > 0; shouldHighlight = true;
}
// Second phase: Show valid children within selected group
else if (listSelector) {
if (limitMode) { if (limitMode) {
shouldHighlight = false; shouldHighlight = false;
} else if (paginationMode) { } else if (
if ( paginationMode &&
paginationType !== "" && paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType) !["none", "scrollDown", "scrollUp"].includes(paginationType)
) { ) {
shouldHighlight = true; shouldHighlight = true;
} else { } else if (childSelectors && childSelectors.length > 0) {
shouldHighlight = false; console.log("✅ Child selectors present, highlighting enabled");
}
} else if (childSelectors && childSelectors.includes(selector)) {
shouldHighlight = true; shouldHighlight = true;
} else if (elementInfo?.isIframeContent && childSelectors) {
const isIframeChild = childSelectors.some(
(childSelector: string) =>
selector.includes(":>>") &&
childSelector
.split(":>>")
.some((part) => selector.includes(part.trim()))
);
shouldHighlight = isIframeChild;
} else if (selector.includes(":>>") && hasValidChildSelectors) {
const selectorParts = selector
.split(":>>")
.map((part: string) => part.trim());
const isValidMixedSelector = selectorParts.some((part: any) =>
childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
} else if (elementInfo?.isShadowRoot && childSelectors) {
const isShadowChild = childSelectors.some(
(childSelector: string) =>
selector.includes(">>") &&
childSelector
.split(">>")
.some((part) => selector.includes(part.trim()))
);
} else if (selector.includes(">>") && hasValidChildSelectors) {
const selectorParts = selector
.split(">>")
.map((part: string) => part.trim());
const isValidMixedSelector = selectorParts.some((part: any) =>
childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
} else { } else {
console.log("❌ No child selectors available");
shouldHighlight = false; shouldHighlight = false;
} }
} else { }
// No list selector - show regular highlighting
else {
shouldHighlight = true; shouldHighlight = true;
} }
} else { } else {
// getText mode - always highlight
shouldHighlight = true; shouldHighlight = true;
} }
@@ -316,6 +303,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
}, },
selector, selector,
childSelectors, childSelectors,
groupInfo,
}); });
} }
} }
@@ -335,9 +323,11 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
} }
}, },
[ [
getText,
getList, getList,
listSelector, listSelector,
paginationMode, paginationMode,
cachedChildSelectors,
paginationType, paginationType,
limitMode, limitMode,
onHighlight, onHighlight,
@@ -363,6 +353,10 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
return; return;
} }
if (!isInCaptureMode) {
return;
}
const now = performance.now(); const now = performance.now();
if (now - lastMouseMoveTime.current < MOUSE_MOVE_THROTTLE) { if (now - lastMouseMoveTime.current < MOUSE_MOVE_THROTTLE) {
return; return;
@@ -401,11 +395,24 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
e.stopPropagation(); e.stopPropagation();
if (currentHighlight && onElementSelect) { if (currentHighlight && onElementSelect) {
// Get the group info for the current highlight
const highlighterData =
clientSelectorGenerator.generateDataForHighlighter(
{ x: iframeX, y: iframeY },
iframeDoc,
true,
cachedChildSelectors
);
onElementSelect({ onElementSelect({
rect: currentHighlight.rect, rect: currentHighlight.rect,
selector: currentHighlight.selector, selector: currentHighlight.selector,
elementInfo: currentHighlight.elementInfo, elementInfo: currentHighlight.elementInfo,
childSelectors: currentHighlight.childSelectors || [], childSelectors:
cachedChildSelectors.length > 0
? cachedChildSelectors
: highlighterData?.childSelectors || [],
groupInfo: highlighterData?.groupInfo,
}); });
} }
notifyLastAction("select element"); notifyLastAction("select element");
@@ -790,11 +797,40 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
rebuiltHTML = "<!DOCTYPE html>\n" + rebuiltHTML; rebuiltHTML = "<!DOCTYPE html>\n" + rebuiltHTML;
const additionalCSS = [];
if (snapshotData.resources.fonts?.length > 0) {
const fontCSS = snapshotData.resources.fonts
.map((font) => {
const format = font.format || "woff2";
return `
@font-face {
font-family: 'ProxiedFont-${
font.url.split("/").pop()?.split(".")[0] || "unknown"
}';
src: url("${font.dataUrl}") format("${format}");
font-display: swap;
}
`;
})
.join("\n");
additionalCSS.push(fontCSS);
}
if (snapshotData.resources.stylesheets?.length > 0) {
const externalCSS = snapshotData.resources.stylesheets
.map((stylesheet) => stylesheet.content)
.join("\n\n");
additionalCSS.push(externalCSS);
}
const enhancedCSS = ` const enhancedCSS = `
/* rrweb rebuilt content styles */ /* rrweb rebuilt content styles */
html, body { html, body {
margin: 0 !important; margin: 0 !important;
padding: 8px !important; padding: 8px !important;
font-family: system-ui, -apple-system, BlinkMacSystemFont, sans-serif !important;
background: white !important;
overflow-x: hidden !important; overflow-x: hidden !important;
} }
@@ -819,12 +855,22 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
-ms-overflow-style: none !important; /* Internet Explorer 10+ */ -ms-overflow-style: none !important; /* Internet Explorer 10+ */
} }
img {
max-width: 100% !important;
height: auto !important;
}
/* Make everything interactive */ /* Make everything interactive */
* { * {
cursor: "pointer" !important; cursor: "pointer" !important;
} }
/* Additional CSS from resources */
${additionalCSS.join("\n\n")}
`; `;
const headTagRegex = /<head[^>]*>/i; const headTagRegex = /<head[^>]*>/i;
const cssInjection = ` const cssInjection = `
<meta charset="utf-8"> <meta charset="utf-8">

View File

@@ -22,6 +22,7 @@ import { useThemeMode } from '../../context/theme-provider';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
import { useBrowserDimensionsStore } from '../../context/browserDimensions'; import { useBrowserDimensionsStore } from '../../context/browserDimensions';
import { clientListExtractor } from '../../helpers/clientListExtractor'; import { clientListExtractor } from '../../helpers/clientListExtractor';
import { clientSelectorGenerator } from '../../helpers/clientSelectorGenerator';
const fetchWorkflow = (id: string, callback: (response: WorkflowFile) => void) => { const fetchWorkflow = (id: string, callback: (response: WorkflowFile) => void) => {
getActiveWorkflow(id).then( getActiveWorkflow(id).then(
@@ -52,10 +53,8 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
const [isCaptureTextConfirmed, setIsCaptureTextConfirmed] = useState(false); const [isCaptureTextConfirmed, setIsCaptureTextConfirmed] = useState(false);
const [isCaptureListConfirmed, setIsCaptureListConfirmed] = useState(false); const [isCaptureListConfirmed, setIsCaptureListConfirmed] = useState(false);
const { panelHeight } = useBrowserDimensionsStore(); const { panelHeight } = useBrowserDimensionsStore();
const [isDOMMode, setIsDOMMode] = useState(false);
const [currentSnapshot, setCurrentSnapshot] = useState<any>(null);
const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId } = useGlobalInfoStore(); const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId, updateDOMMode, currentSnapshot, isDOMMode } = useGlobalInfoStore();
const { const {
getText, startGetText, stopGetText, getText, startGetText, stopGetText,
getList, startGetList, stopGetList, getList, startGetList, stopGetList,
@@ -86,22 +85,20 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
if (socket) { if (socket) {
const domModeHandler = (data: any) => { const domModeHandler = (data: any) => {
if (!data.userId || data.userId === id) { if (!data.userId || data.userId === id) {
setIsDOMMode(true); updateDOMMode(true);
} }
}; };
const screenshotModeHandler = (data: any) => { const screenshotModeHandler = (data: any) => {
if (!data.userId || data.userId === id) { if (!data.userId || data.userId === id) {
setIsDOMMode(false); updateDOMMode(false);
setCurrentSnapshot(null);
} }
}; };
const domcastHandler = (data: any) => { const domcastHandler = (data: any) => {
if (!data.userId || data.userId === id) { if (!data.userId || data.userId === id) {
if (data.snapshotData && data.snapshotData.snapshot) { if (data.snapshotData && data.snapshotData.snapshot) {
setCurrentSnapshot(data.snapshotData); updateDOMMode(true, data.snapshotData);
setIsDOMMode(true);
} }
} }
}; };
@@ -116,7 +113,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
socket.off("domcast", domcastHandler); socket.off("domcast", domcastHandler);
}; };
} }
}, [socket, id]); }, [socket, id, updateDOMMode]);
useEffect(() => { useEffect(() => {
if (socket) { if (socket) {
@@ -214,7 +211,6 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
) => { ) => {
if (isDOMMode && currentSnapshot) { if (isDOMMode && currentSnapshot) {
try { try {
// Find the DOM iframe element
let iframeElement = document.querySelector( let iframeElement = document.querySelector(
"#dom-browser-iframe" "#dom-browser-iframe"
) as HTMLIFrameElement; ) as HTMLIFrameElement;
@@ -247,22 +243,42 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
return; return;
} }
// Use client-side extraction Object.entries(fields).forEach(([key, field]) => {
if (field.selectorObj?.selector) {
const isFieldXPath =
field.selectorObj.selector.startsWith("//") ||
field.selectorObj.selector.startsWith("/");
console.log(
`Field "${key}" selector:`,
field.selectorObj.selector,
`(XPath: ${isFieldXPath})`
);
}
});
const extractedData = clientListExtractor.extractListData( const extractedData = clientListExtractor.extractListData(
iframeDoc, iframeDoc,
listSelector, listSelector,
fields, fields,
5 // limit for preview 5
); );
updateListStepData(currentListId, extractedData); updateListStepData(currentListId, extractedData);
console.log("✅ UI extraction completed:");
if (extractedData.length === 0) {
console.warn(
"⚠️ No data extracted - this might indicate selector issues"
);
notify(
"warning",
"No data was extracted. Please verify your selections."
);
}
} catch (error) { } catch (error) {
console.error("Error in client-side data extraction:", error); console.error("Error in client-side data extraction:", error);
notify("error", "Failed to extract data client-side"); notify("error", "Failed to extract data client-side");
} }
} else { } else {
// Fallback to socket-based extraction for screenshot mode
if (!socket) { if (!socket) {
console.error("Socket not available for backend extraction"); console.error("Socket not available for backend extraction");
return; return;
@@ -275,8 +291,6 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
currentListId, currentListId,
pagination: { type: "", selector: "" }, pagination: { type: "", selector: "" },
}); });
console.log("📤 Sent extraction request to server");
} catch (error) { } catch (error) {
console.error("Error in backend data extraction:", error); console.error("Error in backend data extraction:", error);
} }
@@ -443,6 +457,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
resetInterpretationLog(); resetInterpretationLog();
finishAction('text'); finishAction('text');
onFinishCapture(); onFinishCapture();
clientSelectorGenerator.cleanup();
}, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps, resetInterpretationLog, finishAction, notify, onFinishCapture, t]); }, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps, resetInterpretationLog, finishAction, notify, onFinishCapture, t]);
const getListSettingsObject = useCallback(() => { const getListSettingsObject = useCallback(() => {
@@ -495,6 +510,8 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
const stopCaptureAndEmitGetListSettings = useCallback(() => { const stopCaptureAndEmitGetListSettings = useCallback(() => {
const settings = getListSettingsObject(); const settings = getListSettingsObject();
console.log("rrwebSnapshotHandler", settings);
const latestListStep = getLatestListStep(browserSteps); const latestListStep = getLatestListStep(browserSteps);
if (latestListStep && settings) { if (latestListStep && settings) {
extractDataClientSide(latestListStep.listSelector!, latestListStep.fields, latestListStep.id); extractDataClientSide(latestListStep.listSelector!, latestListStep.fields, latestListStep.id);
@@ -509,6 +526,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
resetInterpretationLog(); resetInterpretationLog();
finishAction('list'); finishAction('list');
onFinishCapture(); onFinishCapture();
clientSelectorGenerator.cleanup();
}, [getListSettingsObject, socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide]); }, [getListSettingsObject, socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide]);
const hasUnconfirmedListTextFields = browserSteps.some(step => const hasUnconfirmedListTextFields = browserSteps.some(step =>
@@ -638,6 +656,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
setCurrentTextActionId(''); setCurrentTextActionId('');
setIsCaptureTextConfirmed(false); setIsCaptureTextConfirmed(false);
clientSelectorGenerator.cleanup();
notify('error', t('right_panel.errors.capture_text_discarded')); notify('error', t('right_panel.errors.capture_text_discarded'));
}, [currentTextActionId, browserSteps, stopGetText, deleteStepsByActionId, notify, t]); }, [currentTextActionId, browserSteps, stopGetText, deleteStepsByActionId, notify, t]);
@@ -668,6 +687,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
setCaptureStage('initial'); setCaptureStage('initial');
setCurrentListActionId(''); setCurrentListActionId('');
setIsCaptureListConfirmed(false); setIsCaptureListConfirmed(false);
clientSelectorGenerator.cleanup();
notify('error', t('right_panel.errors.capture_list_discarded')); notify('error', t('right_panel.errors.capture_list_discarded'));
}, [currentListActionId, browserSteps, stopGetList, deleteStepsByActionId, resetListState, setShowPaginationOptions, setShowLimitOptions, setCaptureStage, notify, t]); }, [currentListActionId, browserSteps, stopGetList, deleteStepsByActionId, resetListState, setShowPaginationOptions, setShowLimitOptions, setCaptureStage, notify, t]);
@@ -686,6 +706,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
stopGetScreenshot(); stopGetScreenshot();
resetInterpretationLog(); resetInterpretationLog();
finishAction('screenshot'); finishAction('screenshot');
clientSelectorGenerator.cleanup();
onFinishCapture(); onFinishCapture();
}; };

View File

@@ -27,6 +27,41 @@ interface ScheduleConfig {
cronExpression?: string; cronExpression?: string;
} }
/**
 * Shape of the snapshot object held as `currentSnapshot` in the global info
 * store, together with the resources collected alongside it.
 *
 * `snapshot` and `processingStats` are intentionally left untyped here; their
 * structure is not described by this file.
 */
interface ProcessedSnapshot {
  snapshot: any;
  /** Resources captured with the snapshot, grouped by kind. */
  resources: {
    stylesheets: { href: string; content: string; media?: string }[];
    images: { src: string; dataUrl: string; alt?: string }[];
    fonts: { url: string; dataUrl: string; format?: string }[];
    scripts: { src: string; content: string; type?: string }[];
    media: { src: string; dataUrl: string; type: string }[];
  };
  baseUrl: string;
  viewport: { width: number; height: number };
  timestamp: number;
  processingStats: any;
}
export interface RobotSettings { export interface RobotSettings {
id: string; id: string;
userId?: number; userId?: number;
@@ -86,6 +121,11 @@ interface GlobalInfo {
setCurrentListActionId: (actionId: string) => void; setCurrentListActionId: (actionId: string) => void;
currentScreenshotActionId: string; currentScreenshotActionId: string;
setCurrentScreenshotActionId: (actionId: string) => void; setCurrentScreenshotActionId: (actionId: string) => void;
isDOMMode: boolean;
setIsDOMMode: (isDOMMode: boolean) => void;
currentSnapshot: ProcessedSnapshot | null;
setCurrentSnapshot: (snapshot: ProcessedSnapshot | null) => void;
updateDOMMode: (isDOMMode: boolean, snapshot?: ProcessedSnapshot | null) => void;
}; };
class GlobalInfoStore implements Partial<GlobalInfo> { class GlobalInfoStore implements Partial<GlobalInfo> {
@@ -115,6 +155,8 @@ class GlobalInfoStore implements Partial<GlobalInfo> {
currentTextActionId = ''; currentTextActionId = '';
currentListActionId = ''; currentListActionId = '';
currentScreenshotActionId = ''; currentScreenshotActionId = '';
isDOMMode = false;
currentSnapshot = null;
}; };
const globalInfoStore = new GlobalInfoStore(); const globalInfoStore = new GlobalInfoStore();
@@ -141,6 +183,8 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
const [currentTextActionId, setCurrentTextActionId] = useState<string>(''); const [currentTextActionId, setCurrentTextActionId] = useState<string>('');
const [currentListActionId, setCurrentListActionId] = useState<string>(''); const [currentListActionId, setCurrentListActionId] = useState<string>('');
const [currentScreenshotActionId, setCurrentScreenshotActionId] = useState<string>(''); const [currentScreenshotActionId, setCurrentScreenshotActionId] = useState<string>('');
const [isDOMMode, setIsDOMMode] = useState<boolean>(globalInfoStore.isDOMMode);
const [currentSnapshot, setCurrentSnapshot] = useState<ProcessedSnapshot | null>(globalInfoStore.currentSnapshot);
const notify = (severity: 'error' | 'warning' | 'info' | 'success', message: string) => { const notify = (severity: 'error' | 'warning' | 'info' | 'success', message: string) => {
setNotification({ severity, message, isOpen: true }); setNotification({ severity, message, isOpen: true });
@@ -165,6 +209,18 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
}, 100); }, 100);
} }
/**
 * Sets the DOM-mode flag and, optionally, the current snapshot in one call.
 *
 * @param mode - New value for the DOM-mode flag.
 * @param snapshot - When passed (including an explicit `null`), it is stored
 *   as the current snapshot; when omitted, the snapshot is left untouched
 *   unless `mode` is `false`.
 *
 * Turning DOM mode off always ends by clearing the snapshot, even if one was
 * supplied in the same call.
 */
const updateDOMMode = (mode: boolean, snapshot?: ProcessedSnapshot | null) => {
  setIsDOMMode(mode);

  if (typeof snapshot !== "undefined") {
    setCurrentSnapshot(snapshot);
  }

  if (mode) {
    return;
  }
  // Leaving DOM mode: any stored snapshot is discarded.
  setCurrentSnapshot(null);
};
return ( return (
<globalInfoContext.Provider <globalInfoContext.Provider
value={{ value={{
@@ -205,6 +261,11 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
setCurrentListActionId, setCurrentListActionId,
currentScreenshotActionId, currentScreenshotActionId,
setCurrentScreenshotActionId, setCurrentScreenshotActionId,
isDOMMode,
setIsDOMMode,
currentSnapshot,
setCurrentSnapshot,
updateDOMMode,
}} }}
> >
{children} {children}

View File

@@ -15,31 +15,90 @@ interface ExtractedListData {
[key: string]: string; [key: string]: string;
} }
interface TableField { interface Field {
selector: string; selector: string;
attribute: string; attribute: string;
tableContext?: string;
cellIndex?: number;
}
interface NonTableField {
selector: string;
attribute: string;
}
interface ContainerFields {
tableFields: Record<string, TableField>;
nonTableFields: Record<string, NonTableField>;
} }
class ClientListExtractor { class ClientListExtractor {
/**
 * Evaluates an XPath expression with `rootElement` as the context node and
 * returns the first result in document order, provided it is an element.
 *
 * @param rootElement - Context node for the evaluation; either a document or
 *   an element inside one.
 * @param xpath - XPath expression to evaluate.
 * @returns The first matching element, or `null` when there is no owning
 *   document, nothing matches, the first match is not an element (for
 *   example a text or attribute node), or evaluation throws.
 */
private evaluateXPath = (
  rootElement: Element | Document,
  xpath: string
): Element | null => {
  try {
    // `evaluate` is a Document method, so resolve the document that owns the
    // context node (or use the node itself when it already is a document).
    const ownerDoc =
      rootElement.nodeType === Node.DOCUMENT_NODE
        ? (rootElement as Document)
        : rootElement.ownerDocument;

    if (!ownerDoc) return null;

    const result = ownerDoc.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    );

    const node = result.singleNodeValue;
    // `singleNodeValue` is typed as Node: an expression such as `//a/text()`
    // yields a non-element node. Only hand back real elements so the declared
    // return type holds, mirroring the ELEMENT_NODE filter in evaluateXPathAll.
    // A nodeType comparison is used instead of `instanceof Element` because
    // the node may belong to a different document than this script's window.
    if (!node || node.nodeType !== Node.ELEMENT_NODE) {
      return null;
    }
    return node as Element;
  } catch (error) {
    console.warn("XPath evaluation failed:", xpath, error);
    return null;
  }
};
/**
 * Evaluates an XPath expression with `rootElement` as the context node and
 * collects every matching element in document order.
 *
 * @param rootElement - Context node for the evaluation; either a document or
 *   an element inside one.
 * @param xpath - XPath expression to evaluate.
 * @returns All matched element nodes; non-element results are dropped. An
 *   empty array is returned when there is no owning document or when
 *   evaluation throws.
 */
private evaluateXPathAll = (
  rootElement: Element | Document,
  xpath: string
): Element[] => {
  try {
    const contextIsDocument = rootElement.nodeType === Node.DOCUMENT_NODE;
    const doc = contextIsDocument
      ? (rootElement as Document)
      : rootElement.ownerDocument;
    if (!doc) {
      return [];
    }

    const snapshot = doc.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
      null
    );

    // Materialize the snapshot, then keep only element nodes.
    return Array.from({ length: snapshot.snapshotLength }, (_, position) =>
      snapshot.snapshotItem(position)
    ).filter(
      (candidate): candidate is Element =>
        Boolean(candidate) && candidate!.nodeType === Node.ELEMENT_NODE
    );
  } catch (error) {
    console.warn("XPath evaluation failed:", xpath, error);
    return [];
  }
};
private queryElement = ( private queryElement = (
rootElement: Element | Document, rootElement: Element | Document,
selector: string selector: string
): Element | null => { ): Element | null => {
if (!selector.includes(">>") && !selector.includes(":>>")) { if (!selector.includes(">>") && !selector.includes(":>>")) {
// Check if it's an XPath selector (starts with // or / or ./)
if (
selector.startsWith("//") ||
selector.startsWith("/") ||
selector.startsWith("./")
) {
return this.evaluateXPath(rootElement, selector);
} else {
return rootElement.querySelector(selector); return rootElement.querySelector(selector);
} }
}
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
let currentElement: Element | Document | null = rootElement; let currentElement: Element | Document | null = rootElement;
@@ -59,7 +118,17 @@ class ClientListExtractor {
frameElement.contentDocument || frameElement.contentDocument ||
frameElement.contentWindow?.document; frameElement.contentWindow?.document;
if (!frameDoc) return null; if (!frameDoc) return null;
// Handle XPath in iframe context
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
currentElement = this.evaluateXPath(frameDoc, parts[i]);
} else {
currentElement = frameDoc.querySelector(parts[i]); currentElement = frameDoc.querySelector(parts[i]);
}
continue; continue;
} catch (e) { } catch (e) {
console.warn( console.warn(
@@ -75,18 +144,38 @@ class ClientListExtractor {
let nextElement: Element | null = null; let nextElement: Element | null = null;
if ("querySelector" in currentElement) { if ("querySelector" in currentElement) {
// Handle XPath vs CSS selector
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(currentElement, parts[i]);
} else {
nextElement = currentElement.querySelector(parts[i]); nextElement = currentElement.querySelector(parts[i]);
} }
}
if ( if (
!nextElement && !nextElement &&
"shadowRoot" in currentElement && "shadowRoot" in currentElement &&
(currentElement as Element).shadowRoot (currentElement as Element).shadowRoot
) { ) {
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(
(currentElement as Element).shadowRoot as unknown as Document,
parts[i]
);
} else {
nextElement = (currentElement as Element).shadowRoot!.querySelector( nextElement = (currentElement as Element).shadowRoot!.querySelector(
parts[i] parts[i]
); );
} }
}
if (!nextElement && "children" in currentElement) { if (!nextElement && "children" in currentElement) {
const children: any = Array.from( const children: any = Array.from(
@@ -94,7 +183,18 @@ class ClientListExtractor {
); );
for (const child of children) { for (const child of children) {
if (child.shadowRoot) { if (child.shadowRoot) {
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(
child.shadowRoot as unknown as Document,
parts[i]
);
} else {
nextElement = child.shadowRoot.querySelector(parts[i]); nextElement = child.shadowRoot.querySelector(parts[i]);
}
if (nextElement) break; if (nextElement) break;
} }
} }
@@ -111,8 +211,13 @@ class ClientListExtractor {
selector: string selector: string
): Element[] => { ): Element[] => {
if (!selector.includes(">>") && !selector.includes(":>>")) { if (!selector.includes(">>") && !selector.includes(":>>")) {
// Check if it's an XPath selector (starts with // or /)
if (selector.startsWith("//") || selector.startsWith("/")) {
return this.evaluateXPathAll(rootElement, selector);
} else {
return Array.from(rootElement.querySelectorAll(selector)); return Array.from(rootElement.querySelectorAll(selector));
} }
}
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
let currentElements: (Element | Document)[] = [rootElement]; let currentElements: (Element | Document)[] = [rootElement];
@@ -133,7 +238,14 @@ class ClientListExtractor {
frameElement.contentDocument || frameElement.contentDocument ||
frameElement.contentWindow?.document; frameElement.contentWindow?.document;
if (frameDoc) { if (frameDoc) {
nextElements.push(...Array.from(frameDoc.querySelectorAll(part))); // Handle XPath in iframe context
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(...this.evaluateXPathAll(frameDoc, part));
} else {
nextElements.push(
...Array.from(frameDoc.querySelectorAll(part))
);
}
} }
} catch (e) { } catch (e) {
console.warn( console.warn(
@@ -146,21 +258,43 @@ class ClientListExtractor {
} }
} else { } else {
if ("querySelectorAll" in element) { if ("querySelectorAll" in element) {
// Handle XPath vs CSS selector
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(...this.evaluateXPathAll(element, part));
} else {
nextElements.push(...Array.from(element.querySelectorAll(part))); nextElements.push(...Array.from(element.querySelectorAll(part)));
} }
}
if ("shadowRoot" in element && (element as Element).shadowRoot) { if ("shadowRoot" in element && (element as Element).shadowRoot) {
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(
...this.evaluateXPathAll(
(element as Element).shadowRoot as unknown as Document,
part
)
);
} else {
nextElements.push( nextElements.push(
...Array.from( ...Array.from(
(element as Element).shadowRoot!.querySelectorAll(part) (element as Element).shadowRoot!.querySelectorAll(part)
) )
); );
} }
}
if ("children" in element) { if ("children" in element) {
const children = Array.from((element as Element).children || []); const children = Array.from((element as Element).children || []);
for (const child of children) { for (const child of children) {
if (child.shadowRoot) { if (child.shadowRoot) {
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(
...this.evaluateXPathAll(
child.shadowRoot as unknown as Document,
part
)
);
} else {
nextElements.push( nextElements.push(
...Array.from(child.shadowRoot.querySelectorAll(part)) ...Array.from(child.shadowRoot.querySelectorAll(part))
); );
@@ -169,6 +303,7 @@ class ClientListExtractor {
} }
} }
} }
}
currentElements = nextElements; currentElements = nextElements;
} }
@@ -193,36 +328,67 @@ class ClientListExtractor {
} }
if (attribute === "innerText") { if (attribute === "innerText") {
return (element as HTMLElement).innerText?.trim() || null; // First try standard innerText/textContent
} else if (attribute === "innerHTML") { let textContent =
return element.innerHTML?.trim() || null; (element as HTMLElement).innerText?.trim() ||
} else if (attribute === "src" || attribute === "href") { (element as HTMLElement).textContent?.trim();
if (attribute === "href" && element.tagName !== "A") {
const parentElement = element.parentElement; // If empty, check for common data attributes that might contain the text
if (parentElement && parentElement.tagName === "A") { if (!textContent) {
const parentHref = parentElement.getAttribute("href"); // Check for data-* attributes that commonly contain text values
if (parentHref) { const dataAttributes = [
try { "data-600",
return new URL(parentHref, baseURL).href; "data-text",
} catch (e) { "data-label",
return parentHref; "data-value",
} "data-content",
];
for (const attr of dataAttributes) {
const dataValue = element.getAttribute(attr);
if (dataValue && dataValue.trim()) {
textContent = dataValue.trim();
break;
} }
} }
} }
return textContent || null;
} else if (attribute === "innerHTML") {
return element.innerHTML?.trim() || null;
} else if (attribute === "href") {
// For href, we need to find the anchor tag if the current element isn't one
let anchorElement = element;
// If current element is not an anchor, look for parent anchor
if (element.tagName !== "A") {
anchorElement =
element.closest("a") ||
element.parentElement?.closest("a") ||
element;
}
const hrefValue = anchorElement.getAttribute("href");
if (!hrefValue || hrefValue.trim() === "") {
return null;
}
try {
return new URL(hrefValue, baseURL).href;
} catch (e) {
console.warn("Error creating URL from", hrefValue, e);
return hrefValue;
}
} else if (attribute === "src") {
const attrValue = element.getAttribute(attribute); const attrValue = element.getAttribute(attribute);
const dataAttr = attrValue || element.getAttribute("data-" + attribute); const dataAttr = attrValue || element.getAttribute("data-" + attribute);
if (!dataAttr || dataAttr.trim() === "") { if (!dataAttr || dataAttr.trim() === "") {
if (attribute === "src") {
const style = window.getComputedStyle(element as HTMLElement); const style = window.getComputedStyle(element as HTMLElement);
const bgImage = style.backgroundImage; const bgImage = style.backgroundImage;
if (bgImage && bgImage !== "none") { if (bgImage && bgImage !== "none") {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null; return matches ? new URL(matches[1], baseURL).href : null;
} }
}
return null; return null;
} }
@@ -236,187 +402,8 @@ class ClientListExtractor {
return element.getAttribute(attribute); return element.getAttribute(attribute);
}; };
private findTableAncestor = ( private convertFields = (fields: any): Record<string, Field> => {
element: Element const convertedFields: Record<string, Field> = {};
): { type: string; element: Element } | null => {
let currentElement: Element | null = element;
const MAX_DEPTH = 5;
let depth = 0;
while (currentElement && depth < MAX_DEPTH) {
if (currentElement.getRootNode() instanceof ShadowRoot) {
currentElement = (currentElement.getRootNode() as ShadowRoot).host;
continue;
}
if (currentElement.tagName === "TD") {
return { type: "TD", element: currentElement };
} else if (currentElement.tagName === "TR") {
return { type: "TR", element: currentElement };
}
if (
currentElement.tagName === "IFRAME" ||
currentElement.tagName === "FRAME"
) {
try {
const frameElement = currentElement as
| HTMLIFrameElement
| HTMLFrameElement;
currentElement = frameElement.contentDocument?.body || null;
} catch (e) {
return null;
}
} else {
currentElement = currentElement.parentElement;
}
depth++;
}
return null;
};
/**
 * Computes the positional index of a table cell.
 *
 * @param td - Cell element to locate.
 * @returns For a cell whose root node is a shadow root, its index among all
 *   `td` elements in that shadow root (`-1` if not found there); otherwise
 *   the number of element siblings that precede it.
 */
private getCellIndex = (td: Element): number => {
  const root = td.getRootNode();
  if (root instanceof ShadowRoot) {
    return Array.from(root.querySelectorAll("td")).indexOf(
      td as HTMLTableCellElement
    );
  }

  let precedingSiblings = 0;
  for (
    let cursor = td.previousElementSibling;
    cursor;
    cursor = cursor.previousElementSibling
  ) {
    precedingSiblings += 1;
  }
  return precedingSiblings;
};
/**
 * Reports whether any table field resolves, inside `row`, to an element that
 * is a TH or has a TH above it before the upward walk reaches `row`.
 *
 * @param row - Element to search within; the upward walk stops at it.
 * @param tableFields - Field definitions; only each field's `selector` is read.
 * @returns `true` as soon as a TH is encountered, otherwise `false`.
 */
private hasThElement = (
row: Element,
tableFields: Record<string, TableField>
): boolean => {
// The field label is not needed here, only the selector.
for (const [_, { selector }] of Object.entries(tableFields)) {
const element = this.queryElement(row, selector);
if (element) {
let current: Element | ShadowRoot | Document | null = element;
// Walk from the matched element until `row` or a null parent is reached.
while (current && current !== row) {
// A node inside a shadow tree jumps straight to the shadow host; the node
// itself is not tested for TH on this iteration.
if (current.getRootNode() instanceof ShadowRoot) {
current = (current.getRootNode() as ShadowRoot).host;
continue;
}
if ((current as Element).tagName === "TH") return true;
if (
(current as Element).tagName === "IFRAME" ||
(current as Element).tagName === "FRAME"
) {
// For a frame element the walk continues from the frame document's body
// (or ends if there is none) rather than from the frame's parent.
try {
const frameElement = current as
| HTMLIFrameElement
| HTMLFrameElement;
current = frameElement.contentDocument?.body || null;
} catch (e) {
// Frame content could not be read; give up on this field.
break;
}
} else {
current = (current as Element).parentElement;
}
}
}
}
return false;
};
/**
 * Drops header rows unless the selected fields themselves live in header cells.
 *
 * @param rows - Candidate row elements.
 * @param tableFields - Field definitions used to probe each row for TH cells.
 * @returns `rows` unchanged when any row has a field under a TH; otherwise
 *   only the rows containing no TH, either as descendants or inside the row's
 *   own shadow root.
 */
private filterRowsBasedOnTag = (
  rows: Element[],
  tableFields: Record<string, TableField>
): Element[] => {
  const fieldsTargetHeaders = rows.some((candidate) =>
    this.hasThElement(candidate, tableFields)
  );
  if (fieldsTargetHeaders) {
    return rows;
  }

  return rows.filter((candidate) => {
    if (candidate.getElementsByTagName("TH").length !== 0) {
      return false;
    }
    const shadow = candidate.shadowRoot;
    return shadow ? shadow.querySelector("th") === null : true;
  });
};
/**
 * Computes the Jaccard similarity of two class-name lists: the number of
 * distinct names present in both, divided by the number of distinct names
 * present in either.
 *
 * @param classList1 - First list of class names (duplicates are ignored).
 * @param classList2 - Second list of class names (duplicates are ignored).
 * @returns A value in `[0, 1]`. Two empty lists yield `0` rather than the
 *   `NaN` that `0 / 0` would produce, so threshold comparisons stay
 *   well-defined.
 */
private calculateClassSimilarity = (
  classList1: string[],
  classList2: string[]
): number => {
  const set1 = new Set(classList1);
  const set2 = new Set(classList2);

  const unionSize = new Set([...set1, ...set2]).size;
  // Guard the empty/empty case: nothing to compare means no similarity.
  if (unionSize === 0) return 0;

  const sharedCount = [...set1].filter((name) => set2.has(name)).length;
  return sharedCount / unionSize;
};
/**
 * Finds elements with the same tag name as `baseElement` whose class lists
 * are sufficiently similar to it.
 *
 * Candidates are gathered from the given document, from the shadow host and
 * shadow tree of `baseElement` (when it lives in one), and from every
 * accessible iframe/frame document. Each candidate is considered once.
 *
 * @param baseElement - Reference element; it is never part of the result.
 * @param document - Document to search.
 * @param similarityThreshold - Minimum class similarity (inclusive) for a
 *   candidate to be returned.
 * @returns Matching elements in discovery order; empty when `baseElement`
 *   has no classes.
 */
private findSimilarElements = (
  baseElement: Element,
  document: Document,
  similarityThreshold: number = 0.7
): Element[] => {
  const baseClasses = Array.from(baseElement.classList);
  if (baseClasses.length === 0) return [];

  const tagName = baseElement.tagName;

  // A Set keeps insertion order while dropping elements reached by more than
  // one route (e.g. a shadow host's light DOM is also part of `document`).
  const candidates = new Set<Element>(
    Array.from(document.getElementsByTagName(tagName))
  );
  const addCandidates = (elements: Element[]): void => {
    for (const element of elements) candidates.add(element);
  };

  const rootNode = baseElement.getRootNode();
  if (rootNode instanceof ShadowRoot) {
    addCandidates(Array.from(rootNode.host.getElementsByTagName(tagName)));
    // Siblings of baseElement live in the shadow tree itself, which neither
    // document- nor host-level lookups reach. ShadowRoot has no
    // getElementsByTagName, so filter by tagName instead of building a
    // selector from it.
    addCandidates(
      Array.from(rootNode.querySelectorAll("*")).filter(
        (element) => element.tagName === tagName
      )
    );
  }

  const frames = [
    ...Array.from(document.getElementsByTagName("iframe")),
    ...Array.from(document.getElementsByTagName("frame")),
  ];
  for (const frame of frames) {
    try {
      const frameElement = frame as HTMLIFrameElement | HTMLFrameElement;
      const frameDoc =
        frameElement.contentDocument || frameElement.contentWindow?.document;
      if (frameDoc) {
        addCandidates(Array.from(frameDoc.getElementsByTagName(tagName)));
      }
    } catch (e) {
      // Cross-origin frames throw on access; skip them but leave a trace.
      console.warn(
        `Cannot access ${frame.tagName.toLowerCase()} content:`,
        e
      );
    }
  }

  return Array.from(candidates).filter((element) => {
    if (element === baseElement) return false;
    const similarity = this.calculateClassSimilarity(
      baseClasses,
      Array.from(element.classList)
    );
    return similarity >= similarityThreshold;
  });
};
private convertFields = (
fields: any
): Record<string, { selector: string; attribute: string }> => {
const convertedFields: Record<
string,
{ selector: string; attribute: string }
> = {};
for (const [key, field] of Object.entries(fields)) { for (const [key, field] of Object.entries(fields)) {
const typedField = field as TextStep; const typedField = field as TextStep;
@@ -439,285 +426,134 @@ class ClientListExtractor {
// Convert fields to the format expected by the extraction logic // Convert fields to the format expected by the extraction logic
const convertedFields = this.convertFields(fields); const convertedFields = this.convertFields(fields);
// Get all container elements matching the list selector // Step 1: Get all container elements matching the list selector
let containers = this.queryElementAll(iframeDocument, listSelector); const containers = this.queryElementAll(iframeDocument, listSelector);
if (containers.length === 0) { if (containers.length === 0) {
console.warn("No containers found for listSelector:", listSelector); console.warn("No containers found for listSelector:", listSelector);
return []; return [];
} }
// Enhanced container discovery: find similar elements if we need more containers // Step 2: Extract data from each container up to the limit
if (limit > 1 && containers.length === 1) { const extractedData: ExtractedListData[] = [];
const baseContainer = containers[0]; const containersToProcess = Math.min(containers.length, limit);
const similarContainers = this.findSimilarElements(
baseContainer,
iframeDocument,
0.7
);
if (similarContainers.length > 0) {
const newContainers = similarContainers.filter(
(container) => !container.matches(listSelector)
);
containers = [...containers, ...newContainers];
}
}
// Analyze fields for table vs non-table context
const containerFields: ContainerFields[] = containers.map(() => ({
tableFields: {},
nonTableFields: {},
}));
containers.forEach((container, containerIndex) => {
for (const [label, field] of Object.entries(convertedFields)) {
const sampleElement = this.queryElement(container, field.selector);
if (sampleElement) {
const ancestor = this.findTableAncestor(sampleElement);
if (ancestor) {
containerFields[containerIndex].tableFields[label] = {
...field,
tableContext: ancestor.type,
cellIndex:
ancestor.type === "TD"
? this.getCellIndex(ancestor.element)
: -1,
};
} else {
containerFields[containerIndex].nonTableFields[label] = field;
}
} else {
containerFields[containerIndex].nonTableFields[label] = field;
}
}
});
// Extract table data
const tableData: ExtractedListData[] = [];
for ( for (
let containerIndex = 0; let containerIndex = 0;
containerIndex < containers.length; containerIndex < containersToProcess;
containerIndex++ containerIndex++
) { ) {
const container = containers[containerIndex]; const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex];
if (Object.keys(tableFields).length > 0) {
const firstField = Object.values(tableFields)[0];
const firstElement = this.queryElement(
container,
firstField.selector
);
let tableContext: Element | null = firstElement;
// Find the table context
while (
tableContext &&
tableContext.tagName !== "TABLE" &&
tableContext !== container
) {
if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = (tableContext.getRootNode() as ShadowRoot).host;
continue;
}
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
const frameElement = tableContext as
| HTMLIFrameElement
| HTMLFrameElement;
tableContext = frameElement.contentDocument?.body || null;
} catch (e) {
break;
}
} else {
tableContext = tableContext.parentElement;
}
}
if (tableContext) {
const rows: Element[] = [];
rows.push(...Array.from(tableContext.getElementsByTagName("TR")));
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
const frameElement = tableContext as
| HTMLIFrameElement
| HTMLFrameElement;
const frameDoc =
frameElement.contentDocument ||
frameElement.contentWindow?.document;
if (frameDoc) {
rows.push(...Array.from(frameDoc.getElementsByTagName("TR")));
}
} catch (e) {
console.warn(
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
e
);
}
}
const processedRows = this.filterRowsBasedOnTag(rows, tableFields);
for (
let rowIndex = 0;
rowIndex < Math.min(processedRows.length, limit);
rowIndex++
) {
const record: ExtractedListData = {}; const record: ExtractedListData = {};
const currentRow = processedRows[rowIndex];
for (const [ // Step 3: For each field, extract data from the current container
label, for (const [label, { selector, attribute }] of Object.entries(
{ selector, attribute, cellIndex }, convertedFields
] of Object.entries(tableFields)) { )) {
let element: Element | null = null; let element: Element | null = null;
if (cellIndex !== undefined && cellIndex >= 0) { // CORRECT APPROACH: Create indexed absolute XPath
let td: Element | null = if (selector.startsWith("//")) {
currentRow.children[cellIndex] || null; // Convert the absolute selector to target the specific container instance
const indexedSelector = this.createIndexedXPath(
selector,
listSelector,
containerIndex + 1
);
if (!td && currentRow.shadowRoot) { element = this.evaluateXPathSingle(iframeDocument, indexedSelector);
const shadowCells = currentRow.shadowRoot.children;
if (shadowCells && shadowCells.length > cellIndex) {
td = shadowCells[cellIndex];
}
}
if (td) {
element = this.queryElement(td, selector);
if (
!element &&
selector
.split(/(?:>>|:>>)/)
.pop()
?.includes("td:nth-child")
) {
element = td;
}
if (!element) {
const tagOnlySelector = selector.split(".")[0];
element = this.queryElement(td, tagOnlySelector);
}
if (!element) {
let currentElement: Element | null = td;
while (
currentElement &&
currentElement.children.length > 0
) {
let foundContentChild = false;
for (const child of Array.from(
currentElement.children
)) {
if (this.extractValue(child, attribute)) {
currentElement = child;
foundContentChild = true;
break;
}
}
if (!foundContentChild) break;
}
element = currentElement;
}
}
} else { } else {
element = this.queryElement(currentRow, selector); // Fallback for non-XPath selectors
element = this.queryElement(container, selector);
} }
// Step 4: Extract the value from the found element
if (element) { if (element) {
const value = this.extractValue(element, attribute); const value = this.extractValue(element, attribute);
if (value !== null && value !== "") { if (value !== null && value !== "") {
record[label] = value; record[label] = value;
} else { } else {
console.warn( console.warn(` ⚠️ Empty value for "${label}"`);
`❌ No value for ${label} in row ${rowIndex + 1}`
);
record[label] = ""; record[label] = "";
} }
} else { } else {
console.warn( console.warn(` ❌ Element not found for "${label}"`);
`❌ Element not found for ${label} with selector:`,
selector
);
record[label] = ""; record[label] = "";
} }
} }
// Step 5: Add record if it has any non-empty values
if (Object.values(record).some((value) => value !== "")) { if (Object.values(record).some((value) => value !== "")) {
tableData.push(record); extractedData.push(record);
}
}
}
}
}
// Extract non-table data
const nonTableData: ExtractedListData[] = [];
for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
if (nonTableData.length >= limit) break;
const container = containers[containerIndex];
const { nonTableFields } = containerFields[containerIndex];
if (Object.keys(nonTableFields).length > 0) {
const record: ExtractedListData = {};
for (const [label, { selector, attribute }] of Object.entries(
nonTableFields
)) {
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
const element = this.queryElement(container, relativeSelector);
if (element) {
const value = this.extractValue(element, attribute);
if (value !== null && value !== "") {
record[label] = value;
} else { } else {
console.warn( console.warn(
`❌ No value for ${label} in container ${containerIndex + 1}` ` ⚠️ Skipping empty record for container ${containerIndex + 1}`
); );
record[label] = "";
}
} else {
console.warn(
`❌ Element not found for ${label} with selector:`,
selector
);
record[label] = "";
} }
} }
if (Object.values(record).some((value) => value !== "")) {
nonTableData.push(record);
}
}
}
// Combine and limit results
const extractedData = [...tableData, ...nonTableData].slice(0, limit);
return extractedData; return extractedData;
} catch (error) { } catch (error) {
console.error("Error in client-side extractListData:", error); console.error("💥 Error in client-side extractListData:", error);
return []; return [];
} }
}; };
// Create indexed XPath for specific container instance
/**
 * Rewrites a field XPath so that it targets one specific list container.
 *
 * When `childSelector` contains the list selector (in its `//…` form), the
 * first such occurrence is replaced with `(listSelector)[containerIndex]`.
 * Otherwise a fallback is built by anchoring `childSelector` (with its first
 * `//` reduced to `/`) under the indexed list selector.
 *
 * @param childSelector - XPath of the field to extract.
 * @param listSelector - XPath that matches every list container.
 * @param containerIndex - 1-based position of the container to target.
 * @returns An XPath scoped to the requested container.
 */
private createIndexedXPath(
  childSelector: string,
  listSelector: string,
  containerIndex: number
): string {
  const listPattern = listSelector.replace("//", "");
  const indexedListSelector = `(${listSelector})[${containerIndex}]`;
  const anchoredPattern = `//${listPattern}`;

  // Test for exactly the text that gets replaced. Checking only
  // `listPattern` lets selectors through where the replace is a no-op,
  // which silently returns an un-indexed selector.
  if (childSelector.includes(anchoredPattern)) {
    // A replacer function inserts the text verbatim; a plain replacement
    // string would expand `$&`, `$'`, etc. if the list selector contains `$`.
    return childSelector.replace(anchoredPattern, () => indexedListSelector);
  }

  // If pattern doesn't match, create a more generic indexed selector.
  console.warn(` ⚠️ Pattern doesn't match, using fallback approach`);
  return `${indexedListSelector}${childSelector.replace("//", "/")}`;
}
// Helper method for single XPath evaluation
/**
 * Evaluates an XPath expression against a whole document and returns the
 * first matching node.
 *
 * @param document - Document used both to evaluate and as the context node.
 * @param xpath - XPath expression to evaluate.
 * @returns The first match in document order, or `null` when nothing matches
 *   (a warning is logged) or evaluation throws (an error is logged).
 */
private evaluateXPathSingle = (
  document: Document,
  xpath: string
): Element | null => {
  try {
    const { singleNodeValue } = document.evaluate(
      xpath,
      document,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    );
    const match = singleNodeValue as Element | null;
    if (match) {
      return match;
    }

    console.warn(`❌ XPath found no element for: ${xpath}`);
    return match;
  } catch (error) {
    console.error("❌ XPath evaluation failed:", xpath, error);
    return null;
  }
};
} }
export const clientListExtractor = new ClientListExtractor(); export const clientListExtractor = new ClientListExtractor();

File diff suppressed because it is too large Load Diff