feat: add hover action support (#3994)
Co-authored-by: LawyZheng <lawyzheng1106@gmail.com>
This commit is contained in:
@@ -12,6 +12,7 @@ class ActionType(StrEnum):
|
||||
SELECT_OPTION = "select_option"
|
||||
CHECKBOX = "checkbox"
|
||||
WAIT = "wait"
|
||||
HOVER = "hover"
|
||||
NULL_ACTION = "null_action"
|
||||
SOLVE_CAPTCHA = "solve_captcha"
|
||||
TERMINATE = "terminate"
|
||||
@@ -37,11 +38,13 @@ class ActionType(StrEnum):
|
||||
ActionType.DOWNLOAD_FILE,
|
||||
ActionType.SELECT_OPTION,
|
||||
ActionType.CHECKBOX,
|
||||
ActionType.HOVER,
|
||||
]
|
||||
|
||||
|
||||
POST_ACTION_EXECUTION_ACTION_TYPES = [
|
||||
ActionType.CLICK,
|
||||
ActionType.HOVER,
|
||||
ActionType.INPUT_TEXT,
|
||||
ActionType.UPLOAD_FILE,
|
||||
ActionType.DOWNLOAD_FILE,
|
||||
|
||||
@@ -282,6 +282,11 @@ class WaitAction(Action):
|
||||
seconds: int = 20
|
||||
|
||||
|
||||
class HoverAction(WebAction):
    """Web action that hovers the pointer over a target element."""

    action_type: ActionType = ActionType.HOVER
    # Seconds to keep waiting after the hover has been performed; the default
    # of 0.0 means the action finishes as soon as the hover completes.
    hold_seconds: float = 0.0
|
||||
|
||||
|
||||
class TerminateAction(DecisiveAction):
|
||||
action_type: ActionType = ActionType.TERMINATE
|
||||
|
||||
|
||||
@@ -32,6 +32,7 @@ from skyvern.exceptions import (
|
||||
ErrFoundSelectableElement,
|
||||
FailedToFetchSecret,
|
||||
FailToClick,
|
||||
FailToHover,
|
||||
FailToSelectByIndex,
|
||||
FailToSelectByLabel,
|
||||
FailToSelectByValue,
|
||||
@@ -1998,6 +1999,68 @@ async def handle_wait_action(
|
||||
return [ActionFailure(exception=Exception("Wait action is treated as a failure"))]
|
||||
|
||||
|
||||
@TraceManager.traced_async(ignore_inputs=["scraped_page", "page"])
async def handle_hover_action(
    action: actions.HoverAction,
    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
) -> list[ActionResult]:
    """Hover over the element referenced by ``action`` and report the outcome.

    Steps:
      1. Resolve ``action.element_id`` to a Skyvern element.
      2. Try to reveal it via ``hover_to_reveal()``, scroll it into view and
         hover it, each bounded by ``settings.BROWSER_ACTION_TIMEOUT_MS``.
      3. Best effort: store the element's absolute page Y position and id on
         the current Skyvern context so the position can be scrolled back to
         after re-scraping.
      4. Optionally wait ``action.hold_seconds`` before returning.

    :param action: hover action carrying the target element id and hold time
    :param page: page the hover is performed on
    :param scraped_page: scrape result used to resolve the element id
    :param task: task being executed; only its ``workflow_run_id`` is logged
    :param step: current step (not used by this handler)
    :return: ``[ActionSuccess()]`` on success; otherwise a single
        ``ActionFailure`` wrapping the resolution error or a ``FailToHover``
    """
    dom = DomUtil(scraped_page=scraped_page, page=page)
    try:
        skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
    except Exception as exc:
        LOG.warning(
            "Failed to resolve element for hover action",
            action=action,
            workflow_run_id=task.workflow_run_id,
            exc_info=True,
        )
        return [ActionFailure(exception=exc)]

    try:
        await skyvern_element.hover_to_reveal()
        # Bound the scroll with the same timeout as the other browser calls so
        # a detached or never-stable element cannot stall on Playwright's
        # default timeout.
        await skyvern_element.get_locator().scroll_into_view_if_needed(
            timeout=settings.BROWSER_ACTION_TIMEOUT_MS
        )
        await skyvern_element.get_locator().hover(timeout=settings.BROWSER_ACTION_TIMEOUT_MS)

        # Save the absolute page position of the hovered element.
        # This allows us to scroll back to this position after re-scraping.
        # Failures here are logged and must never fail the hover itself.
        try:
            bounding_box = await skyvern_element.get_locator().bounding_box(
                timeout=settings.BROWSER_ACTION_TIMEOUT_MS
            )
            if bounding_box:
                # Get current scroll position
                scroll_y = await page.evaluate("window.scrollY")
                # Calculate absolute page Y = viewport Y + scroll offset
                absolute_page_y = bounding_box["y"] + scroll_y

                context = skyvern_context.current()
                if context:
                    context.last_hovered_element_page_y = absolute_page_y
                    context.last_hovered_element_id = action.element_id
                    LOG.info(
                        "Saved hovered element absolute position",
                        element_id=action.element_id,
                        viewport_y=bounding_box["y"],
                        scroll_y=scroll_y,
                        absolute_page_y=absolute_page_y,
                    )
        except Exception:
            LOG.warning("Failed to save hovered element position", exc_info=True)

        # Keep the pointer in place for the requested time; zero, negative or
        # missing values mean no hold.
        if action.hold_seconds and action.hold_seconds > 0:
            await asyncio.sleep(action.hold_seconds)
        return [ActionSuccess()]
    except Exception as exc:
        LOG.warning(
            "Hover action failed",
            action=action,
            workflow_run_id=task.workflow_run_id,
            exc_info=True,
        )
        return [ActionFailure(FailToHover(skyvern_element.get_id(), msg=str(exc)))]
|
||||
|
||||
|
||||
@TraceManager.traced_async(ignore_inputs=["scraped_page", "page"])
|
||||
async def handle_terminate_action(
|
||||
action: actions.TerminateAction,
|
||||
@@ -2205,6 +2268,7 @@ ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_ac
|
||||
ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
|
||||
ActionHandler.register_action_type(ActionType.SELECT_OPTION, handle_select_option_action)
|
||||
ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
|
||||
ActionHandler.register_action_type(ActionType.HOVER, handle_hover_action)
|
||||
ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
|
||||
ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
|
||||
ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action)
|
||||
@@ -2284,6 +2348,7 @@ async def chain_click(
|
||||
:param css: css of the element to click
|
||||
"""
|
||||
try:
|
||||
await skyvern_element.hover_to_reveal()
|
||||
if not await skyvern_element.navigate_to_a_href(page=page):
|
||||
await locator.click(timeout=timeout)
|
||||
LOG.info("Chain click: main element click succeeded", action=action, locator=locator)
|
||||
|
||||
@@ -27,6 +27,7 @@ from skyvern.webeye.actions.actions import (
|
||||
DownloadFileAction,
|
||||
DragAction,
|
||||
GotoUrlAction,
|
||||
HoverAction,
|
||||
InputOrSelectContext,
|
||||
InputTextAction,
|
||||
KeypressAction,
|
||||
@@ -166,6 +167,9 @@ def parse_action(
|
||||
if action_type == ActionType.WAIT:
|
||||
return WaitAction(**base_action_dict)
|
||||
|
||||
if action_type == ActionType.HOVER:
|
||||
return HoverAction(**base_action_dict, hold_seconds=action.get("hold_seconds", 0) or 0)
|
||||
|
||||
if action_type == ActionType.COMPLETE:
|
||||
return CompleteAction(
|
||||
**base_action_dict,
|
||||
|
||||
@@ -411,6 +411,44 @@ function hasASPClientControl() {
|
||||
return typeof ASPxClientControl !== "undefined";
|
||||
}
|
||||
|
||||
// Heuristically check whether an element is likely to be revealed only on
// hover (e.g., hover-only buttons). Detection is based purely on class names:
//   1. the element or its direct parent carries a "hover-" / "-hover" class;
//   2. one of up to `maxDepth` ancestors (stopping before document.body) has a
//      class containing "hover", "card" or "item".
// NOTE: the substring checks in (2) are intentionally broad and can produce
// false positives (any class containing "card" or "item" matches).
function isHoverOnlyElement(element) {
  // Read the raw `class` attribute instead of `className`: on SVG elements
  // `className` is an SVGAnimatedString whose toString() is
  // "[object SVGAnimatedString]", which would hide the real class list.
  const classOf = (node) => node?.getAttribute?.("class") ?? "";
  const hasHoverAffix = (cls) =>
    cls.includes("hover-") || cls.includes("-hover");

  // Common hover-only class patterns on the element or its direct parent
  if (
    hasHoverAffix(classOf(element)) ||
    hasHoverAffix(classOf(element.parentElement))
  ) {
    return true;
  }

  // Check if an ancestor has hover-related classes that might reveal this element.
  // Cap the walk to avoid traversing the entire tree and bloating prompts.
  const maxDepth = 5;
  const containerHints = ["hover", "card", "item"];
  let parent = element.parentElement;
  for (
    let depth = 0;
    parent && parent !== document.body && depth < maxDepth;
    depth += 1
  ) {
    const parentClass = classOf(parent);
    if (containerHints.some((hint) => parentClass.includes(hint))) {
      // This element might be revealed on parent hover
      return true;
    }
    parent = parent.parentElement;
  }

  return false;
}
|
||||
|
||||
// from playwright: https://github.com/microsoft/playwright/blob/1b65f26f0287c0352e76673bc5f85bc36c934b55/packages/playwright-core/src/server/injected/domUtils.ts#L100-L119
|
||||
// NOTE: According to this logic, some elements with aria-hidden won't be considered invisible, and the results show they are indeed interactable.
|
||||
function isElementVisible(element) {
|
||||
@@ -450,6 +488,10 @@ function isElementVisible(element) {
|
||||
if (!isElementStyleVisibilityVisible(element, style)) return false;
|
||||
const rect = element.getBoundingClientRect();
|
||||
if (rect.width <= 0 || rect.height <= 0) {
|
||||
// Check if this element might be visible on hover before marking as invisible
|
||||
if (isHoverOnlyElement(element)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -824,7 +866,12 @@ function isInteractable(element, hoverStylesMap) {
|
||||
// https://developer.mozilla.org/en-US/docs/Web/CSS/pointer-events#none
|
||||
const elementPointerEvent = getElementComputedStyle(element)?.pointerEvents;
|
||||
if (elementPointerEvent === "none" && !element.disabled) {
|
||||
return false;
|
||||
// Some CTAs stay hidden until the parent is hovered
|
||||
// When we can infer that the element is revealed on hover, keep it interactable so the agent
|
||||
// has a chance to hover the parent before clicking.
|
||||
if (!isHoverOnlyElement(element)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (isInteractableInput(element, hoverStylesMap)) {
|
||||
@@ -1569,6 +1616,7 @@ async function buildElementObject(
|
||||
frame: frame,
|
||||
frame_index: window.GlobalSkyvernFrameIndex,
|
||||
interactable: interactable,
|
||||
hoverOnly: isHoverOnlyElement(element),
|
||||
tagName: elementTagNameLower,
|
||||
attributes: attrs,
|
||||
beforePseudoText: getPseudoContent(element, "::before"),
|
||||
|
||||
@@ -2,6 +2,7 @@ import asyncio
|
||||
import copy
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
import structlog
|
||||
from playwright._impl._errors import TimeoutError
|
||||
@@ -92,6 +93,14 @@ def load_js_script() -> str:
|
||||
JS_FUNCTION_DEFS = load_js_script()
|
||||
|
||||
|
||||
# function to convert JSON element to HTML
|
||||
def build_attribute(key: str, value: Any) -> str:
    """Render one attribute as text for an HTML tag.

    * ``bool`` / ``int`` values are always rendered as ``key="value"`` with the
      value lower-cased, so ``True`` becomes ``"true"`` and ``0`` is kept as
      ``"0"`` rather than being treated as empty.
    * Any other truthy value is rendered as ``key="<str(value)>"``.
    * Any other falsy value (``""``, ``None``, ...) is rendered as the bare
      ``key``.

    The value is inserted verbatim; no HTML escaping is performed.

    :param key: attribute name
    :param value: attribute value
    :return: the rendered attribute
    """
    # bool is a subclass of int, so a single isinstance check covers both.
    if isinstance(value, int):
        return f'{key}="{str(value).lower()}"'

    return f'{key}="{value!s}"' if value else key
|
||||
|
||||
|
||||
def clean_element_before_hashing(element: dict) -> dict:
|
||||
def clean_nested(element: dict) -> dict:
|
||||
element_cleaned = {key: value for key, value in element.items() if key not in {"id", "rect", "frame_index"}}
|
||||
@@ -125,7 +134,7 @@ def build_element_dict(
|
||||
|
||||
for element in elements:
|
||||
element_id: str = element.get("id", "")
|
||||
# get_interactable_element_tree marks each interactable element with a unique_id attribute
|
||||
# get_interactable_element_tree marks each interactable element with a SKYVERN_ID_ATTR attribute
|
||||
id_to_css_dict[element_id] = f"[{SKYVERN_ID_ATTR}='{element_id}']"
|
||||
id_to_element_dict[element_id] = element
|
||||
id_to_frame_dict[element_id] = element["frame"]
|
||||
@@ -409,16 +418,18 @@ async def add_frame_interactable_elements(
|
||||
# it will get stuck when we `frame.evaluate()` on an invisible iframe
|
||||
if not await frame_element.is_visible():
|
||||
return elements, element_tree
|
||||
unique_id = await frame_element.get_attribute("unique_id")
|
||||
if not unique_id:
|
||||
skyvern_id = await frame_element.get_attribute(SKYVERN_ID_ATTR)
|
||||
if not skyvern_id:
|
||||
LOG.info(
|
||||
"No unique_id found for frame, skipping",
|
||||
"No Skyvern id found for frame, skipping",
|
||||
frame_index=frame_index,
|
||||
attr=SKYVERN_ID_ATTR,
|
||||
)
|
||||
return elements, element_tree
|
||||
except Exception:
|
||||
LOG.warning(
|
||||
"Unable to get unique_id from frame_element",
|
||||
"Unable to get Skyvern id from frame_element",
|
||||
attr=SKYVERN_ID_ATTR,
|
||||
exc_info=True,
|
||||
)
|
||||
return elements, element_tree
|
||||
@@ -427,11 +438,11 @@ async def add_frame_interactable_elements(
|
||||
await skyvern_frame.safe_wait_for_animation_end()
|
||||
|
||||
frame_elements, frame_element_tree = await skyvern_frame.build_tree_from_body(
|
||||
frame_name=unique_id, frame_index=frame_index
|
||||
frame_name=skyvern_id, frame_index=frame_index
|
||||
)
|
||||
|
||||
for element in elements:
|
||||
if element["id"] == unique_id:
|
||||
if element["id"] == skyvern_id:
|
||||
element["children"] = frame_element_tree
|
||||
|
||||
elements = elements + frame_elements
|
||||
@@ -638,6 +649,9 @@ def _should_keep_unique_id(element: dict) -> bool:
|
||||
# 1. no readonly attr and not disable attr and no interactable
|
||||
# 2. readonly=false and disable=false and interactable=false
|
||||
|
||||
if element.get("hoverOnly"):
|
||||
return True
|
||||
|
||||
attributes = element.get("attributes", {})
|
||||
if (
|
||||
"disabled" not in attributes
|
||||
|
||||
@@ -133,6 +133,7 @@ class SkyvernElement:
|
||||
self._id_cache = static_element.get("id", "")
|
||||
self._tag_name = static_element.get("tagName", "")
|
||||
self._selectable = static_element.get("isSelectable", False)
|
||||
self._hover_only = static_element.get("hoverOnly", False)
|
||||
self._frame_id = static_element.get("frame", "")
|
||||
self._attributes = static_element.get("attributes", {})
|
||||
self._rect: FloatRect | None = None
|
||||
@@ -401,6 +402,49 @@ class SkyvernElement:
|
||||
def get_attributes(self) -> dict:
    """Return the attribute dict stored on this element (not a copy)."""
    return self._attributes
|
||||
|
||||
def requires_hover(self) -> bool:
    """Tell whether this element was flagged as hover-only.

    :return: ``True`` when the stored ``_hover_only`` flag is truthy
    """
    return True if self._hover_only else False
|
||||
|
||||
async def hover_to_reveal(
    self,
    max_depth: int = 4,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    settle_delay_s: float = 0.15,
) -> bool:
    """Try to make a hover-only element visible by hovering it or its ancestors.

    Starting at the element's own locator, each attempt scrolls the current
    hover target into view, hovers it, waits ``settle_delay_s`` and then checks
    whether the element itself is visible. If it is not (or the attempt
    raised), the next attempt uses the parent of the current target.

    :param max_depth: maximum number of hover attempts (the element itself
        plus its ancestors)
    :param timeout: timeout passed to each Playwright call; the default comes
        from ``settings.BROWSER_ACTION_TIMEOUT_MS``
    :param settle_delay_s: seconds to wait after each hover before checking
        visibility
    :return: ``True`` if the element became visible after a hover; ``False``
        if the element does not require hover or every attempt was exhausted
    """
    if not self.requires_hover():
        return False

    hover_target = self.get_locator()
    for depth in range(max_depth):
        try:
            # Honour the caller's timeout for the scroll too; otherwise every
            # attempt could block for Playwright's default timeout.
            await hover_target.scroll_into_view_if_needed(timeout=timeout)
            await hover_target.hover(timeout=timeout)
            await asyncio.sleep(settle_delay_s)
            if await self.get_locator().is_visible(timeout=timeout):
                LOG.debug("Hover reveal succeeded", element_id=self.get_id(), depth=depth)
                return True
        except Exception:
            # Best effort: a failed attempt only moves us on to the parent.
            LOG.debug(
                "Hover attempt failed while trying to reveal element",
                exc_info=True,
                element_id=self.get_id(),
                depth=depth,
            )

        # ".." selects the parent of the current hover target.
        parent_locator = hover_target.locator("..")
        try:
            if await parent_locator.count() != 1:
                break
        except Exception:
            LOG.debug(
                "Unable to evaluate parent locator during hover reveal", exc_info=True, element_id=self.get_id()
            )
            break
        hover_target = parent_locator

    LOG.debug("Hover reveal attempts exhausted", element_id=self.get_id())
    return False
|
||||
|
||||
def get_options(self) -> list[SkyvernOptionType]:
|
||||
options = self.__static_element.get("options", None)
|
||||
if options is None:
|
||||
|
||||
Reference in New Issue
Block a user