Merge branch 'develop' into perfect-ui

This commit is contained in:
Rohit
2025-07-07 01:24:45 +05:30
committed by GitHub
10 changed files with 3074 additions and 3013 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -492,7 +492,8 @@ const handleChangeUrl = async (activeBrowser: RemoteBrowser, page: Page, url: st
await generator.onChangeUrl(url, page);
try {
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
await page.waitForTimeout(2000);
logger.log("debug", `Went to ${url}`);
} catch (e) {
const { message } = e as Error;

View File

@@ -464,7 +464,6 @@ export class WorkflowGenerator {
public onClick = async (coordinates: Coordinates, page: Page) => {
let where: WhereWhatPair["where"] = { url: this.getBestUrl(page.url()) };
const selector = await this.generateSelector(page, coordinates, ActionType.Click);
console.log("COOORDINATES: ", coordinates);
logger.log('debug', `Element's selector: ${selector}`);
const elementInfo = await getElementInformation(page, coordinates, '', false);
@@ -999,6 +998,7 @@ export class WorkflowGenerator {
rect,
selector: displaySelector,
elementInfo,
isDOMMode: this.isDOMMode,
// Include shadow DOM specific information
shadowInfo: elementInfo?.isShadowRoot ? {
mode: elementInfo.shadowRootMode,

File diff suppressed because it is too large Load Diff

View File

@@ -98,6 +98,7 @@ interface RRWebDOMBrowserRendererProps {
getList?: boolean;
getText?: boolean;
listSelector?: string | null;
cachedChildSelectors?: string[];
paginationMode?: boolean;
paginationType?: string;
limitMode?: boolean;
@@ -106,12 +107,14 @@ interface RRWebDOMBrowserRendererProps {
selector: string;
elementInfo: ElementInfo | null;
childSelectors?: string[];
groupInfo?: any;
}) => void;
onElementSelect?: (data: {
rect: DOMRect;
selector: string;
elementInfo: ElementInfo | null;
childSelectors?: string[];
groupInfo?: any;
}) => void;
onShowDatePicker?: (info: {
coordinates: { x: number; y: number };
@@ -144,6 +147,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
getList = false,
getText = false,
listSelector = null,
cachedChildSelectors = [],
paginationMode = false,
paginationType = "",
limitMode = false,
@@ -205,11 +209,24 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
const handleDOMHighlighting = useCallback(
(x: number, y: number, iframeDoc: Document) => {
try {
if (!getText && !getList) {
setCurrentHighlight(null);
if (onHighlight) {
onHighlight({
rect: new DOMRect(0, 0, 0, 0),
selector: "",
elementInfo: null,
});
}
return;
}
const highlighterData =
clientSelectorGenerator.generateDataForHighlighter(
{ x, y },
iframeDoc,
true
true,
cachedChildSelectors
);
if (!highlighterData) {
@@ -224,70 +241,40 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
return;
}
const { rect, selector, elementInfo, childSelectors } = highlighterData;
const { rect, selector, elementInfo, childSelectors, groupInfo } =
highlighterData;
let shouldHighlight = false;
if (getList) {
if (listSelector) {
const hasValidChildSelectors =
Array.isArray(childSelectors) && childSelectors.length > 0;
// First phase: Allow any group to be highlighted for selection
if (!listSelector && groupInfo?.isGroupElement) {
shouldHighlight = true;
}
// Second phase: Show valid children within selected group
else if (listSelector) {
if (limitMode) {
shouldHighlight = false;
} else if (paginationMode) {
if (
paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType)
) {
shouldHighlight = true;
} else {
shouldHighlight = false;
}
} else if (childSelectors && childSelectors.includes(selector)) {
} else if (
paginationMode &&
paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType)
) {
shouldHighlight = true;
} else if (childSelectors && childSelectors.length > 0) {
console.log("✅ Child selectors present, highlighting enabled");
shouldHighlight = true;
} else if (elementInfo?.isIframeContent && childSelectors) {
const isIframeChild = childSelectors.some(
(childSelector: string) =>
selector.includes(":>>") &&
childSelector
.split(":>>")
.some((part) => selector.includes(part.trim()))
);
shouldHighlight = isIframeChild;
} else if (selector.includes(":>>") && hasValidChildSelectors) {
const selectorParts = selector
.split(":>>")
.map((part: string) => part.trim());
const isValidMixedSelector = selectorParts.some((part: any) =>
childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
} else if (elementInfo?.isShadowRoot && childSelectors) {
const isShadowChild = childSelectors.some(
(childSelector: string) =>
selector.includes(">>") &&
childSelector
.split(">>")
.some((part) => selector.includes(part.trim()))
);
} else if (selector.includes(">>") && hasValidChildSelectors) {
const selectorParts = selector
.split(">>")
.map((part: string) => part.trim());
const isValidMixedSelector = selectorParts.some((part: any) =>
childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
} else {
console.log("❌ No child selectors available");
shouldHighlight = false;
}
} else {
}
// No list selector - show regular highlighting
else {
shouldHighlight = true;
}
} else {
// getText mode - always highlight
shouldHighlight = true;
}
@@ -316,6 +303,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
},
selector,
childSelectors,
groupInfo,
});
}
}
@@ -335,9 +323,11 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
}
},
[
getText,
getList,
listSelector,
paginationMode,
cachedChildSelectors,
paginationType,
limitMode,
onHighlight,
@@ -363,6 +353,10 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
return;
}
if (!isInCaptureMode) {
return;
}
const now = performance.now();
if (now - lastMouseMoveTime.current < MOUSE_MOVE_THROTTLE) {
return;
@@ -401,11 +395,24 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
e.stopPropagation();
if (currentHighlight && onElementSelect) {
// Get the group info for the current highlight
const highlighterData =
clientSelectorGenerator.generateDataForHighlighter(
{ x: iframeX, y: iframeY },
iframeDoc,
true,
cachedChildSelectors
);
onElementSelect({
rect: currentHighlight.rect,
selector: currentHighlight.selector,
elementInfo: currentHighlight.elementInfo,
childSelectors: currentHighlight.childSelectors || [],
childSelectors:
cachedChildSelectors.length > 0
? cachedChildSelectors
: highlighterData?.childSelectors || [],
groupInfo: highlighterData?.groupInfo,
});
}
notifyLastAction("select element");

View File

@@ -22,6 +22,7 @@ import { useThemeMode } from '../../context/theme-provider';
import { useTranslation } from 'react-i18next';
import { useBrowserDimensionsStore } from '../../context/browserDimensions';
import { clientListExtractor } from '../../helpers/clientListExtractor';
import { clientSelectorGenerator } from '../../helpers/clientSelectorGenerator';
const fetchWorkflow = (id: string, callback: (response: WorkflowFile) => void) => {
getActiveWorkflow(id).then(
@@ -52,10 +53,8 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
const [isCaptureTextConfirmed, setIsCaptureTextConfirmed] = useState(false);
const [isCaptureListConfirmed, setIsCaptureListConfirmed] = useState(false);
const { panelHeight } = useBrowserDimensionsStore();
const [isDOMMode, setIsDOMMode] = useState(false);
const [currentSnapshot, setCurrentSnapshot] = useState<any>(null);
const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId } = useGlobalInfoStore();
const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId, updateDOMMode, currentSnapshot, isDOMMode } = useGlobalInfoStore();
const {
getText, startGetText, stopGetText,
getList, startGetList, stopGetList,
@@ -86,22 +85,20 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
if (socket) {
const domModeHandler = (data: any) => {
if (!data.userId || data.userId === id) {
setIsDOMMode(true);
updateDOMMode(true);
}
};
const screenshotModeHandler = (data: any) => {
if (!data.userId || data.userId === id) {
setIsDOMMode(false);
setCurrentSnapshot(null);
updateDOMMode(false);
}
};
const domcastHandler = (data: any) => {
if (!data.userId || data.userId === id) {
if (data.snapshotData && data.snapshotData.snapshot) {
setCurrentSnapshot(data.snapshotData);
setIsDOMMode(true);
updateDOMMode(true, data.snapshotData);
}
}
};
@@ -116,7 +113,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
socket.off("domcast", domcastHandler);
};
}
}, [socket, id]);
}, [socket, id, updateDOMMode]);
useEffect(() => {
if (socket) {
@@ -214,7 +211,6 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
) => {
if (isDOMMode && currentSnapshot) {
try {
// Find the DOM iframe element
let iframeElement = document.querySelector(
"#dom-browser-iframe"
) as HTMLIFrameElement;
@@ -247,22 +243,42 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
return;
}
// Use client-side extraction
Object.entries(fields).forEach(([key, field]) => {
if (field.selectorObj?.selector) {
const isFieldXPath =
field.selectorObj.selector.startsWith("//") ||
field.selectorObj.selector.startsWith("/");
console.log(
`Field "${key}" selector:`,
field.selectorObj.selector,
`(XPath: ${isFieldXPath})`
);
}
});
const extractedData = clientListExtractor.extractListData(
iframeDoc,
listSelector,
fields,
5 // limit for preview
5
);
updateListStepData(currentListId, extractedData);
console.log("✅ UI extraction completed:");
if (extractedData.length === 0) {
console.warn(
"⚠️ No data extracted - this might indicate selector issues"
);
notify(
"warning",
"No data was extracted. Please verify your selections."
);
}
} catch (error) {
console.error("Error in client-side data extraction:", error);
notify("error", "Failed to extract data client-side");
}
} else {
// Fallback to socket-based extraction for screenshot mode
if (!socket) {
console.error("Socket not available for backend extraction");
return;
@@ -275,8 +291,6 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
currentListId,
pagination: { type: "", selector: "" },
});
console.log("📤 Sent extraction request to server");
} catch (error) {
console.error("Error in backend data extraction:", error);
}
@@ -443,6 +457,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
resetInterpretationLog();
finishAction('text');
onFinishCapture();
clientSelectorGenerator.cleanup();
}, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps, resetInterpretationLog, finishAction, notify, onFinishCapture, t]);
const getListSettingsObject = useCallback(() => {
@@ -494,6 +509,8 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
const stopCaptureAndEmitGetListSettings = useCallback(() => {
const settings = getListSettingsObject();
console.log("rrwebSnapshotHandler", settings);
const latestListStep = getLatestListStep(browserSteps);
if (latestListStep && settings) {
@@ -509,6 +526,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
resetInterpretationLog();
finishAction('list');
onFinishCapture();
clientSelectorGenerator.cleanup();
}, [getListSettingsObject, socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide]);
const hasUnconfirmedListTextFields = browserSteps.some(step =>
@@ -638,6 +656,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
setCurrentTextActionId('');
setIsCaptureTextConfirmed(false);
clientSelectorGenerator.cleanup();
notify('error', t('right_panel.errors.capture_text_discarded'));
}, [currentTextActionId, browserSteps, stopGetText, deleteStepsByActionId, notify, t]);
@@ -668,6 +687,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
setCaptureStage('initial');
setCurrentListActionId('');
setIsCaptureListConfirmed(false);
clientSelectorGenerator.cleanup();
notify('error', t('right_panel.errors.capture_list_discarded'));
}, [currentListActionId, browserSteps, stopGetList, deleteStepsByActionId, resetListState, setShowPaginationOptions, setShowLimitOptions, setCaptureStage, notify, t]);
@@ -686,6 +706,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
stopGetScreenshot();
resetInterpretationLog();
finishAction('screenshot');
clientSelectorGenerator.cleanup();
onFinishCapture();
};

View File

@@ -27,6 +27,41 @@ interface ScheduleConfig {
cronExpression?: string;
}
interface ProcessedSnapshot {
snapshot: any;
resources: {
stylesheets: Array<{
href: string;
content: string;
media?: string;
}>;
images: Array<{
src: string;
dataUrl: string;
alt?: string;
}>;
fonts: Array<{
url: string;
dataUrl: string;
format?: string;
}>;
scripts: Array<{
src: string;
content: string;
type?: string;
}>;
media: Array<{
src: string;
dataUrl: string;
type: string;
}>;
};
baseUrl: string;
viewport: { width: number; height: number };
timestamp: number;
processingStats: any;
}
export interface RobotSettings {
id: string;
userId?: number;
@@ -86,6 +121,11 @@ interface GlobalInfo {
setCurrentListActionId: (actionId: string) => void;
currentScreenshotActionId: string;
setCurrentScreenshotActionId: (actionId: string) => void;
isDOMMode: boolean;
setIsDOMMode: (isDOMMode: boolean) => void;
currentSnapshot: ProcessedSnapshot | null;
setCurrentSnapshot: (snapshot: ProcessedSnapshot | null) => void;
updateDOMMode: (isDOMMode: boolean, snapshot?: ProcessedSnapshot | null) => void;
};
class GlobalInfoStore implements Partial<GlobalInfo> {
@@ -115,6 +155,8 @@ class GlobalInfoStore implements Partial<GlobalInfo> {
currentTextActionId = '';
currentListActionId = '';
currentScreenshotActionId = '';
isDOMMode = false;
currentSnapshot = null;
};
const globalInfoStore = new GlobalInfoStore();
@@ -141,6 +183,8 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
const [currentTextActionId, setCurrentTextActionId] = useState<string>('');
const [currentListActionId, setCurrentListActionId] = useState<string>('');
const [currentScreenshotActionId, setCurrentScreenshotActionId] = useState<string>('');
const [isDOMMode, setIsDOMMode] = useState<boolean>(globalInfoStore.isDOMMode);
const [currentSnapshot, setCurrentSnapshot] = useState<ProcessedSnapshot | null>(globalInfoStore.currentSnapshot);
const notify = (severity: 'error' | 'warning' | 'info' | 'success', message: string) => {
setNotification({ severity, message, isOpen: true });
@@ -165,6 +209,18 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
}, 100);
}
const updateDOMMode = (mode: boolean, snapshot?: ProcessedSnapshot | null) => {
setIsDOMMode(mode);
if (snapshot !== undefined) {
setCurrentSnapshot(snapshot);
}
if (!mode) {
setCurrentSnapshot(null);
}
}
return (
<globalInfoContext.Provider
value={{
@@ -205,6 +261,11 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
setCurrentListActionId,
currentScreenshotActionId,
setCurrentScreenshotActionId,
isDOMMode,
setIsDOMMode,
currentSnapshot,
setCurrentSnapshot,
updateDOMMode,
}}
>
{children}

View File

@@ -15,30 +15,89 @@ interface ExtractedListData {
[key: string]: string;
}
interface TableField {
interface Field {
selector: string;
attribute: string;
tableContext?: string;
cellIndex?: number;
}
interface NonTableField {
selector: string;
attribute: string;
}
interface ContainerFields {
tableFields: Record<string, TableField>;
nonTableFields: Record<string, NonTableField>;
}
class ClientListExtractor {
private evaluateXPath = (
rootElement: Element | Document,
xpath: string
): Element | null => {
try {
const ownerDoc =
rootElement.nodeType === Node.DOCUMENT_NODE
? (rootElement as Document)
: rootElement.ownerDocument;
if (!ownerDoc) return null;
const result = ownerDoc.evaluate(
xpath,
rootElement,
null,
XPathResult.FIRST_ORDERED_NODE_TYPE,
null
);
return result.singleNodeValue as Element | null;
} catch (error) {
console.warn("XPath evaluation failed:", xpath, error);
return null;
}
};
private evaluateXPathAll = (
rootElement: Element | Document,
xpath: string
): Element[] => {
try {
const ownerDoc =
rootElement.nodeType === Node.DOCUMENT_NODE
? (rootElement as Document)
: rootElement.ownerDocument;
if (!ownerDoc) return [];
const result = ownerDoc.evaluate(
xpath,
rootElement,
null,
XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
null
);
const elements: Element[] = [];
for (let i = 0; i < result.snapshotLength; i++) {
const node = result.snapshotItem(i);
if (node && node.nodeType === Node.ELEMENT_NODE) {
elements.push(node as Element);
}
}
return elements;
} catch (error) {
console.warn("XPath evaluation failed:", xpath, error);
return [];
}
};
private queryElement = (
rootElement: Element | Document,
selector: string
): Element | null => {
if (!selector.includes(">>") && !selector.includes(":>>")) {
return rootElement.querySelector(selector);
// Check if it's an XPath selector (starts with // or / or ./)
if (
selector.startsWith("//") ||
selector.startsWith("/") ||
selector.startsWith("./")
) {
return this.evaluateXPath(rootElement, selector);
} else {
return rootElement.querySelector(selector);
}
}
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
@@ -59,7 +118,17 @@ class ClientListExtractor {
frameElement.contentDocument ||
frameElement.contentWindow?.document;
if (!frameDoc) return null;
currentElement = frameDoc.querySelector(parts[i]);
// Handle XPath in iframe context
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
currentElement = this.evaluateXPath(frameDoc, parts[i]);
} else {
currentElement = frameDoc.querySelector(parts[i]);
}
continue;
} catch (e) {
console.warn(
@@ -75,7 +144,16 @@ class ClientListExtractor {
let nextElement: Element | null = null;
if ("querySelector" in currentElement) {
nextElement = currentElement.querySelector(parts[i]);
// Handle XPath vs CSS selector
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(currentElement, parts[i]);
} else {
nextElement = currentElement.querySelector(parts[i]);
}
}
if (
@@ -83,9 +161,20 @@ class ClientListExtractor {
"shadowRoot" in currentElement &&
(currentElement as Element).shadowRoot
) {
nextElement = (currentElement as Element).shadowRoot!.querySelector(
parts[i]
);
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(
(currentElement as Element).shadowRoot as unknown as Document,
parts[i]
);
} else {
nextElement = (currentElement as Element).shadowRoot!.querySelector(
parts[i]
);
}
}
if (!nextElement && "children" in currentElement) {
@@ -94,7 +183,18 @@ class ClientListExtractor {
);
for (const child of children) {
if (child.shadowRoot) {
nextElement = child.shadowRoot.querySelector(parts[i]);
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(
child.shadowRoot as unknown as Document,
parts[i]
);
} else {
nextElement = child.shadowRoot.querySelector(parts[i]);
}
if (nextElement) break;
}
}
@@ -111,7 +211,12 @@ class ClientListExtractor {
selector: string
): Element[] => {
if (!selector.includes(">>") && !selector.includes(":>>")) {
return Array.from(rootElement.querySelectorAll(selector));
// Check if it's an XPath selector (starts with // or /)
if (selector.startsWith("//") || selector.startsWith("/")) {
return this.evaluateXPathAll(rootElement, selector);
} else {
return Array.from(rootElement.querySelectorAll(selector));
}
}
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
@@ -133,7 +238,14 @@ class ClientListExtractor {
frameElement.contentDocument ||
frameElement.contentWindow?.document;
if (frameDoc) {
nextElements.push(...Array.from(frameDoc.querySelectorAll(part)));
// Handle XPath in iframe context
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(...this.evaluateXPathAll(frameDoc, part));
} else {
nextElements.push(
...Array.from(frameDoc.querySelectorAll(part))
);
}
}
} catch (e) {
console.warn(
@@ -146,24 +258,47 @@ class ClientListExtractor {
}
} else {
if ("querySelectorAll" in element) {
nextElements.push(...Array.from(element.querySelectorAll(part)));
// Handle XPath vs CSS selector
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(...this.evaluateXPathAll(element, part));
} else {
nextElements.push(...Array.from(element.querySelectorAll(part)));
}
}
if ("shadowRoot" in element && (element as Element).shadowRoot) {
nextElements.push(
...Array.from(
(element as Element).shadowRoot!.querySelectorAll(part)
)
);
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(
...this.evaluateXPathAll(
(element as Element).shadowRoot as unknown as Document,
part
)
);
} else {
nextElements.push(
...Array.from(
(element as Element).shadowRoot!.querySelectorAll(part)
)
);
}
}
if ("children" in element) {
const children = Array.from((element as Element).children || []);
for (const child of children) {
if (child.shadowRoot) {
nextElements.push(
...Array.from(child.shadowRoot.querySelectorAll(part))
);
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(
...this.evaluateXPathAll(
child.shadowRoot as unknown as Document,
part
)
);
} else {
nextElements.push(
...Array.from(child.shadowRoot.querySelectorAll(part))
);
}
}
}
}
@@ -193,35 +328,66 @@ class ClientListExtractor {
}
if (attribute === "innerText") {
return (element as HTMLElement).innerText?.trim() || null;
} else if (attribute === "innerHTML") {
return element.innerHTML?.trim() || null;
} else if (attribute === "src" || attribute === "href") {
if (attribute === "href" && element.tagName !== "A") {
const parentElement = element.parentElement;
if (parentElement && parentElement.tagName === "A") {
const parentHref = parentElement.getAttribute("href");
if (parentHref) {
try {
return new URL(parentHref, baseURL).href;
} catch (e) {
return parentHref;
}
// First try standard innerText/textContent
let textContent =
(element as HTMLElement).innerText?.trim() ||
(element as HTMLElement).textContent?.trim();
// If empty, check for common data attributes that might contain the text
if (!textContent) {
// Check for data-* attributes that commonly contain text values
const dataAttributes = [
"data-600",
"data-text",
"data-label",
"data-value",
"data-content",
];
for (const attr of dataAttributes) {
const dataValue = element.getAttribute(attr);
if (dataValue && dataValue.trim()) {
textContent = dataValue.trim();
break;
}
}
}
return textContent || null;
} else if (attribute === "innerHTML") {
return element.innerHTML?.trim() || null;
} else if (attribute === "href") {
// For href, we need to find the anchor tag if the current element isn't one
let anchorElement = element;
// If current element is not an anchor, look for parent anchor
if (element.tagName !== "A") {
anchorElement =
element.closest("a") ||
element.parentElement?.closest("a") ||
element;
}
const hrefValue = anchorElement.getAttribute("href");
if (!hrefValue || hrefValue.trim() === "") {
return null;
}
try {
return new URL(hrefValue, baseURL).href;
} catch (e) {
console.warn("Error creating URL from", hrefValue, e);
return hrefValue;
}
} else if (attribute === "src") {
const attrValue = element.getAttribute(attribute);
const dataAttr = attrValue || element.getAttribute("data-" + attribute);
if (!dataAttr || dataAttr.trim() === "") {
if (attribute === "src") {
const style = window.getComputedStyle(element as HTMLElement);
const bgImage = style.backgroundImage;
if (bgImage && bgImage !== "none") {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null;
}
const style = window.getComputedStyle(element as HTMLElement);
const bgImage = style.backgroundImage;
if (bgImage && bgImage !== "none") {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null;
}
return null;
}
@@ -236,187 +402,8 @@ class ClientListExtractor {
return element.getAttribute(attribute);
};
private findTableAncestor = (
element: Element
): { type: string; element: Element } | null => {
let currentElement: Element | null = element;
const MAX_DEPTH = 5;
let depth = 0;
while (currentElement && depth < MAX_DEPTH) {
if (currentElement.getRootNode() instanceof ShadowRoot) {
currentElement = (currentElement.getRootNode() as ShadowRoot).host;
continue;
}
if (currentElement.tagName === "TD") {
return { type: "TD", element: currentElement };
} else if (currentElement.tagName === "TR") {
return { type: "TR", element: currentElement };
}
if (
currentElement.tagName === "IFRAME" ||
currentElement.tagName === "FRAME"
) {
try {
const frameElement = currentElement as
| HTMLIFrameElement
| HTMLFrameElement;
currentElement = frameElement.contentDocument?.body || null;
} catch (e) {
return null;
}
} else {
currentElement = currentElement.parentElement;
}
depth++;
}
return null;
};
private getCellIndex = (td: Element): number => {
if (td.getRootNode() instanceof ShadowRoot) {
const shadowRoot = td.getRootNode() as ShadowRoot;
const allCells = Array.from(shadowRoot.querySelectorAll("td"));
return allCells.indexOf(td as HTMLTableCellElement);
}
let index = 0;
let sibling = td;
while ((sibling = sibling.previousElementSibling as Element)) {
index++;
}
return index;
};
private hasThElement = (
row: Element,
tableFields: Record<string, TableField>
): boolean => {
for (const [_, { selector }] of Object.entries(tableFields)) {
const element = this.queryElement(row, selector);
if (element) {
let current: Element | ShadowRoot | Document | null = element;
while (current && current !== row) {
if (current.getRootNode() instanceof ShadowRoot) {
current = (current.getRootNode() as ShadowRoot).host;
continue;
}
if ((current as Element).tagName === "TH") return true;
if (
(current as Element).tagName === "IFRAME" ||
(current as Element).tagName === "FRAME"
) {
try {
const frameElement = current as
| HTMLIFrameElement
| HTMLFrameElement;
current = frameElement.contentDocument?.body || null;
} catch (e) {
break;
}
} else {
current = (current as Element).parentElement;
}
}
}
}
return false;
};
private filterRowsBasedOnTag = (
rows: Element[],
tableFields: Record<string, TableField>
): Element[] => {
for (const row of rows) {
if (this.hasThElement(row, tableFields)) {
return rows;
}
}
return rows.filter((row) => {
const directTH = row.getElementsByTagName("TH").length === 0;
const shadowTH = row.shadowRoot
? row.shadowRoot.querySelector("th") === null
: true;
return directTH && shadowTH;
});
};
private calculateClassSimilarity = (
classList1: string[],
classList2: string[]
): number => {
const set1 = new Set(classList1);
const set2 = new Set(classList2);
const intersection = new Set([...set1].filter((x) => set2.has(x)));
const union = new Set([...set1, ...set2]);
return intersection.size / union.size;
};
private findSimilarElements = (
baseElement: Element,
document: Document,
similarityThreshold: number = 0.7
): Element[] => {
const baseClasses = Array.from(baseElement.classList);
if (baseClasses.length === 0) return [];
const allElements: Element[] = [];
allElements.push(
...Array.from(document.getElementsByTagName(baseElement.tagName))
);
if (baseElement.getRootNode() instanceof ShadowRoot) {
const shadowHost = (baseElement.getRootNode() as ShadowRoot).host;
allElements.push(
...Array.from(shadowHost.getElementsByTagName(baseElement.tagName))
);
}
const frames = [
...Array.from(document.getElementsByTagName("iframe")),
...Array.from(document.getElementsByTagName("frame")),
];
for (const frame of frames) {
try {
const frameElement = frame as HTMLIFrameElement | HTMLFrameElement;
const frameDoc =
frameElement.contentDocument || frameElement.contentWindow?.document;
if (frameDoc) {
allElements.push(
...Array.from(frameDoc.getElementsByTagName(baseElement.tagName))
);
}
} catch (e) {
console.warn(
`Cannot access ${frame.tagName.toLowerCase()} content:`,
e
);
}
}
return allElements.filter((element) => {
if (element === baseElement) return false;
const similarity = this.calculateClassSimilarity(
baseClasses,
Array.from(element.classList)
);
return similarity >= similarityThreshold;
});
};
private convertFields = (
fields: any
): Record<string, { selector: string; attribute: string }> => {
const convertedFields: Record<
string,
{ selector: string; attribute: string }
> = {};
private convertFields = (fields: any): Record<string, Field> => {
const convertedFields: Record<string, Field> = {};
for (const [key, field] of Object.entries(fields)) {
const typedField = field as TextStep;
@@ -439,285 +426,134 @@ class ClientListExtractor {
// Convert fields to the format expected by the extraction logic
const convertedFields = this.convertFields(fields);
// Get all container elements matching the list selector
let containers = this.queryElementAll(iframeDocument, listSelector);
// Step 1: Get all container elements matching the list selector
const containers = this.queryElementAll(iframeDocument, listSelector);
if (containers.length === 0) {
console.warn("No containers found for listSelector:", listSelector);
console.warn("No containers found for listSelector:", listSelector);
return [];
}
// Enhanced container discovery: find similar elements if we need more containers
if (limit > 1 && containers.length === 1) {
const baseContainer = containers[0];
const similarContainers = this.findSimilarElements(
baseContainer,
iframeDocument,
0.7
);
// Step 2: Extract data from each container up to the limit
const extractedData: ExtractedListData[] = [];
const containersToProcess = Math.min(containers.length, limit);
if (similarContainers.length > 0) {
const newContainers = similarContainers.filter(
(container) => !container.matches(listSelector)
);
containers = [...containers, ...newContainers];
}
}
for (
let containerIndex = 0;
containerIndex < containersToProcess;
containerIndex++
) {
const container = containers[containerIndex];
const record: ExtractedListData = {};
// Analyze fields for table vs non-table context
const containerFields: ContainerFields[] = containers.map(() => ({
tableFields: {},
nonTableFields: {},
}));
// Step 3: For each field, extract data from the current container
for (const [label, { selector, attribute }] of Object.entries(
convertedFields
)) {
let element: Element | null = null;
containers.forEach((container, containerIndex) => {
for (const [label, field] of Object.entries(convertedFields)) {
const sampleElement = this.queryElement(container, field.selector);
// CORRECT APPROACH: Create indexed absolute XPath
if (selector.startsWith("//")) {
// Convert the absolute selector to target the specific container instance
const indexedSelector = this.createIndexedXPath(
selector,
listSelector,
containerIndex + 1
);
if (sampleElement) {
const ancestor = this.findTableAncestor(sampleElement);
if (ancestor) {
containerFields[containerIndex].tableFields[label] = {
...field,
tableContext: ancestor.type,
cellIndex:
ancestor.type === "TD"
? this.getCellIndex(ancestor.element)
: -1,
};
} else {
containerFields[containerIndex].nonTableFields[label] = field;
}
element = this.evaluateXPathSingle(iframeDocument, indexedSelector);
} else {
containerFields[containerIndex].nonTableFields[label] = field;
// Fallback for non-XPath selectors
element = this.queryElement(container, selector);
}
}
});
// Extract table data
const tableData: ExtractedListData[] = [];
for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex];
if (Object.keys(tableFields).length > 0) {
const firstField = Object.values(tableFields)[0];
const firstElement = this.queryElement(
container,
firstField.selector
);
let tableContext: Element | null = firstElement;
// Find the table context
while (
tableContext &&
tableContext.tagName !== "TABLE" &&
tableContext !== container
) {
if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = (tableContext.getRootNode() as ShadowRoot).host;
continue;
}
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
const frameElement = tableContext as
| HTMLIFrameElement
| HTMLFrameElement;
tableContext = frameElement.contentDocument?.body || null;
} catch (e) {
break;
}
// Step 4: Extract the value from the found element
if (element) {
const value = this.extractValue(element, attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
tableContext = tableContext.parentElement;
}
}
if (tableContext) {
const rows: Element[] = [];
rows.push(...Array.from(tableContext.getElementsByTagName("TR")));
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
const frameElement = tableContext as
| HTMLIFrameElement
| HTMLFrameElement;
const frameDoc =
frameElement.contentDocument ||
frameElement.contentWindow?.document;
if (frameDoc) {
rows.push(...Array.from(frameDoc.getElementsByTagName("TR")));
}
} catch (e) {
console.warn(
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
e
);
}
}
const processedRows = this.filterRowsBasedOnTag(rows, tableFields);
for (
let rowIndex = 0;
rowIndex < Math.min(processedRows.length, limit);
rowIndex++
) {
const record: ExtractedListData = {};
const currentRow = processedRows[rowIndex];
for (const [
label,
{ selector, attribute, cellIndex },
] of Object.entries(tableFields)) {
let element: Element | null = null;
if (cellIndex !== undefined && cellIndex >= 0) {
let td: Element | null =
currentRow.children[cellIndex] || null;
if (!td && currentRow.shadowRoot) {
const shadowCells = currentRow.shadowRoot.children;
if (shadowCells && shadowCells.length > cellIndex) {
td = shadowCells[cellIndex];
}
}
if (td) {
element = this.queryElement(td, selector);
if (
!element &&
selector
.split(/(?:>>|:>>)/)
.pop()
?.includes("td:nth-child")
) {
element = td;
}
if (!element) {
const tagOnlySelector = selector.split(".")[0];
element = this.queryElement(td, tagOnlySelector);
}
if (!element) {
let currentElement: Element | null = td;
while (
currentElement &&
currentElement.children.length > 0
) {
let foundContentChild = false;
for (const child of Array.from(
currentElement.children
)) {
if (this.extractValue(child, attribute)) {
currentElement = child;
foundContentChild = true;
break;
}
}
if (!foundContentChild) break;
}
element = currentElement;
}
}
} else {
element = this.queryElement(currentRow, selector);
}
if (element) {
const value = this.extractValue(element, attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
console.warn(
`❌ No value for ${label} in row ${rowIndex + 1}`
);
record[label] = "";
}
} else {
console.warn(
`❌ Element not found for ${label} with selector:`,
selector
);
record[label] = "";
}
}
if (Object.values(record).some((value) => value !== "")) {
tableData.push(record);
}
}
}
}
}
// Extract non-table data
const nonTableData: ExtractedListData[] = [];
for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
if (nonTableData.length >= limit) break;
const container = containers[containerIndex];
const { nonTableFields } = containerFields[containerIndex];
if (Object.keys(nonTableFields).length > 0) {
const record: ExtractedListData = {};
for (const [label, { selector, attribute }] of Object.entries(
nonTableFields
)) {
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
const element = this.queryElement(container, relativeSelector);
if (element) {
const value = this.extractValue(element, attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
console.warn(
`❌ No value for ${label} in container ${containerIndex + 1}`
);
record[label] = "";
}
} else {
console.warn(
`❌ Element not found for ${label} with selector:`,
selector
);
console.warn(` ⚠️ Empty value for "${label}"`);
record[label] = "";
}
}
if (Object.values(record).some((value) => value !== "")) {
nonTableData.push(record);
} else {
console.warn(` ❌ Element not found for "${label}"`);
record[label] = "";
}
}
}
// Combine and limit results
const extractedData = [...tableData, ...nonTableData].slice(0, limit);
// Step 5: Add record if it has any non-empty values
if (Object.values(record).some((value) => value !== "")) {
extractedData.push(record);
} else {
console.warn(
` ⚠️ Skipping empty record for container ${containerIndex + 1}`
);
}
}
return extractedData;
} catch (error) {
console.error("Error in client-side extractListData:", error);
console.error("💥 Error in client-side extractListData:", error);
return [];
}
};
// Create indexed XPath for specific container instance
private createIndexedXPath(
childSelector: string,
listSelector: string,
containerIndex: number
): string {
// Check if the child selector contains the list selector pattern
if (childSelector.includes(listSelector.replace("//", ""))) {
// Replace the list selector part with indexed version
const listPattern = listSelector.replace("//", "");
const indexedListSelector = `(${listSelector})[${containerIndex}]`;
const indexedSelector = childSelector.replace(
`//${listPattern}`,
indexedListSelector
);
return indexedSelector;
} else {
// If pattern doesn't match, create a more generic indexed selector
// This is a fallback approach
console.warn(` ⚠️ Pattern doesn't match, using fallback approach`);
return `(${listSelector})[${containerIndex}]${childSelector.replace(
"//",
"/"
)}`;
}
}
// Helper method for single XPath evaluation
private evaluateXPathSingle = (
document: Document,
xpath: string
): Element | null => {
try {
const result = document.evaluate(
xpath,
document,
null,
XPathResult.FIRST_ORDERED_NODE_TYPE,
null
);
const element = result.singleNodeValue as Element | null;
if (!element) {
console.warn(`❌ XPath found no element for: ${xpath}`);
}
return element;
} catch (error) {
console.error("❌ XPath evaluation failed:", xpath, error);
return null;
}
};
}
export const clientListExtractor = new ClientListExtractor();

File diff suppressed because it is too large Load Diff