feat: add hover action support (#3994)

Co-authored-by: LawyZheng <lawyzheng1106@gmail.com>
This commit is contained in:
Mohamed Khalil
2025-12-09 17:27:26 +02:00
committed by GitHub
parent 0e8d667959
commit f49b07f30d
22 changed files with 281 additions and 13 deletions

View File

@@ -12,6 +12,7 @@ class ActionType(StrEnum):
SELECT_OPTION = "select_option"
CHECKBOX = "checkbox"
WAIT = "wait"
HOVER = "hover"
NULL_ACTION = "null_action"
SOLVE_CAPTCHA = "solve_captcha"
TERMINATE = "terminate"
@@ -37,11 +38,13 @@ class ActionType(StrEnum):
ActionType.DOWNLOAD_FILE,
ActionType.SELECT_OPTION,
ActionType.CHECKBOX,
ActionType.HOVER,
]
POST_ACTION_EXECUTION_ACTION_TYPES = [
ActionType.CLICK,
ActionType.HOVER,
ActionType.INPUT_TEXT,
ActionType.UPLOAD_FILE,
ActionType.DOWNLOAD_FILE,

View File

@@ -282,6 +282,11 @@ class WaitAction(Action):
seconds: int = 20
class HoverAction(WebAction):
    # Web action that hovers over a target element instead of clicking it.
    action_type: ActionType = ActionType.HOVER
    # Seconds to keep waiting after the hover has been performed.
    # The hover handler only sleeps when this value is greater than zero.
    hold_seconds: float = 0.0
class TerminateAction(DecisiveAction):
action_type: ActionType = ActionType.TERMINATE

View File

@@ -32,6 +32,7 @@ from skyvern.exceptions import (
ErrFoundSelectableElement,
FailedToFetchSecret,
FailToClick,
FailToHover,
FailToSelectByIndex,
FailToSelectByLabel,
FailToSelectByValue,
@@ -1998,6 +1999,68 @@ async def handle_wait_action(
return [ActionFailure(exception=Exception("Wait action is treated as a failure"))]
@TraceManager.traced_async(ignore_inputs=["scraped_page", "page"])
async def handle_hover_action(
    action: actions.HoverAction,
    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
) -> list[ActionResult]:
    """Hover over the element referenced by ``action`` and remember its page position.

    Steps:
      1. Resolve ``action.element_id`` to a Skyvern element through ``DomUtil``.
      2. Call ``hover_to_reveal()``, scroll the element into view and hover it.
      3. Best effort: store the element's absolute page Y (viewport Y plus
         ``window.scrollY``) and its id on the current Skyvern context.
      4. Sleep for ``action.hold_seconds`` when it is greater than zero.

    Args:
        action: the hover action to execute.
        page: page used to build the ``DomUtil`` and to read the scroll offset.
        scraped_page: scraped page used to build the ``DomUtil``.
        task: only its ``workflow_run_id`` is read, for logging.
        step: not referenced in this handler body.

    Returns:
        ``[ActionSuccess()]`` when the hover went through; otherwise a
        single-item list holding an ``ActionFailure`` (the original exception
        when the element cannot be resolved, ``FailToHover`` for any later error).
    """
    dom = DomUtil(scraped_page=scraped_page, page=page)
    try:
        skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
    except Exception as exc:
        LOG.warning(
            "Failed to resolve element for hover action",
            action=action,
            workflow_run_id=task.workflow_run_id,
            exc_info=True,
        )
        return [ActionFailure(exception=exc)]
    try:
        await skyvern_element.hover_to_reveal()
        # NOTE(review): no timeout is passed to this scroll, unlike the hover on the
        # next line - confirm that relying on the Playwright default is intended.
        await skyvern_element.get_locator().scroll_into_view_if_needed()
        await skyvern_element.get_locator().hover(timeout=settings.BROWSER_ACTION_TIMEOUT_MS)
        # Save the absolute page position of the hovered element
        # This allows us to scroll back to this position after re-scraping
        # Any error while saving the position is only logged; it never fails the hover.
        try:
            bounding_box = await skyvern_element.get_locator().bounding_box(timeout=settings.BROWSER_ACTION_TIMEOUT_MS)
            if bounding_box:
                # Get current scroll position
                scroll_y = await page.evaluate("window.scrollY")
                # Calculate absolute page Y = viewport Y + scroll offset
                absolute_page_y = bounding_box["y"] + scroll_y
                context = skyvern_context.current()
                if context:
                    context.last_hovered_element_page_y = absolute_page_y
                    context.last_hovered_element_id = action.element_id
                    LOG.info(
                        "Saved hovered element absolute position",
                        element_id=action.element_id,
                        viewport_y=bounding_box["y"],
                        scroll_y=scroll_y,
                        absolute_page_y=absolute_page_y,
                    )
        except Exception:
            LOG.warning("Failed to save hovered element position", exc_info=True)
        # Optional dwell time after the hover, taken from the action itself.
        if action.hold_seconds and action.hold_seconds > 0:
            await asyncio.sleep(action.hold_seconds)
        return [ActionSuccess()]
    except Exception as exc:
        LOG.warning(
            "Hover action failed",
            action=action,
            workflow_run_id=task.workflow_run_id,
            exc_info=True,
        )
        return [ActionFailure(FailToHover(skyvern_element.get_id(), msg=str(exc)))]
@TraceManager.traced_async(ignore_inputs=["scraped_page", "page"])
async def handle_terminate_action(
action: actions.TerminateAction,
@@ -2205,6 +2268,7 @@ ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_ac
ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
ActionHandler.register_action_type(ActionType.SELECT_OPTION, handle_select_option_action)
ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
ActionHandler.register_action_type(ActionType.HOVER, handle_hover_action)
ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action)
@@ -2284,6 +2348,7 @@ async def chain_click(
:param css: css of the element to click
"""
try:
await skyvern_element.hover_to_reveal()
if not await skyvern_element.navigate_to_a_href(page=page):
await locator.click(timeout=timeout)
LOG.info("Chain click: main element click succeeded", action=action, locator=locator)

View File

@@ -27,6 +27,7 @@ from skyvern.webeye.actions.actions import (
DownloadFileAction,
DragAction,
GotoUrlAction,
HoverAction,
InputOrSelectContext,
InputTextAction,
KeypressAction,
@@ -166,6 +167,9 @@ def parse_action(
if action_type == ActionType.WAIT:
return WaitAction(**base_action_dict)
if action_type == ActionType.HOVER:
return HoverAction(**base_action_dict, hold_seconds=action.get("hold_seconds", 0) or 0)
if action_type == ActionType.COMPLETE:
return CompleteAction(
**base_action_dict,

View File

@@ -411,6 +411,44 @@ function hasASPClientControl() {
return typeof ASPxClientControl !== "undefined";
}
// Check if element is only visible on hover (e.g., hover-only buttons)
/**
 * Heuristically decide whether an element is likely to be shown only while it
 * (or one of its ancestors) is hovered.
 *
 * Two class-name based checks are applied, in order:
 *   1. the element itself or its direct parent has a class containing
 *      "hover-" or "-hover";
 *   2. one of up to five ancestors (stopping at document.body) has a class
 *      containing "hover", "card" or "item".
 *
 * @param {Element} element - element to inspect
 * @returns {boolean} true when either check matches, false otherwise
 */
function isHoverOnlyElement(element) {
  // className may be a non-string object (e.g. on SVG nodes), hence toString()
  const classText = (node) => node?.className?.toString() ?? "";

  // Check 1: explicit hover markers on the element or its direct parent
  const directMarkers = ["hover-", "-hover"];
  const ownClass = classText(element);
  const directParentClass = classText(element.parentElement);
  const hasDirectMarker = directMarkers.some(
    (marker) => ownClass.includes(marker) || directParentClass.includes(marker),
  );
  if (hasDirectMarker) {
    return true;
  }

  // Check 2: an ancestor whose class hints that hovering it may reveal this element.
  // The walk is capped so we never climb the entire tree and bloat prompts.
  const ancestorMarkers = ["hover", "card", "item"];
  const maxDepth = 5;
  let ancestor = element.parentElement;
  for (
    let depth = 0;
    ancestor && ancestor !== document.body && depth < maxDepth;
    depth += 1
  ) {
    const ancestorClass = classText(ancestor);
    if (ancestorMarkers.some((marker) => ancestorClass.includes(marker))) {
      // This element might be revealed on ancestor hover
      return true;
    }
    ancestor = ancestor.parentElement;
  }

  return false;
}
// from playwright: https://github.com/microsoft/playwright/blob/1b65f26f0287c0352e76673bc5f85bc36c934b55/packages/playwright-core/src/server/injected/domUtils.ts#L100-L119
// NOTE: According to this logic, some elements with aria-hidden won't be considered invisible. In practice, they are indeed interactable.
function isElementVisible(element) {
@@ -450,6 +488,10 @@ function isElementVisible(element) {
if (!isElementStyleVisibilityVisible(element, style)) return false;
const rect = element.getBoundingClientRect();
if (rect.width <= 0 || rect.height <= 0) {
// Check if this element might be visible on hover before marking as invisible
if (isHoverOnlyElement(element)) {
return true;
}
return false;
}
@@ -824,7 +866,12 @@ function isInteractable(element, hoverStylesMap) {
// https://developer.mozilla.org/en-US/docs/Web/CSS/pointer-events#none
const elementPointerEvent = getElementComputedStyle(element)?.pointerEvents;
if (elementPointerEvent === "none" && !element.disabled) {
return false;
// Some CTAs stay hidden until the parent is hovered
// When we can infer that the element is revealed on hover, keep it interactable so the agent
// has a chance to hover the parent before clicking.
if (!isHoverOnlyElement(element)) {
return false;
}
}
if (isInteractableInput(element, hoverStylesMap)) {
@@ -1569,6 +1616,7 @@ async function buildElementObject(
frame: frame,
frame_index: window.GlobalSkyvernFrameIndex,
interactable: interactable,
hoverOnly: isHoverOnlyElement(element),
tagName: elementTagNameLower,
attributes: attrs,
beforePseudoText: getPseudoContent(element, "::before"),

View File

@@ -2,6 +2,7 @@ import asyncio
import copy
import json
from collections import defaultdict
from typing import Any
import structlog
from playwright._impl._errors import TimeoutError
@@ -92,6 +93,14 @@ def load_js_script() -> str:
JS_FUNCTION_DEFS = load_js_script()
# function to convert JSON element to HTML
def build_attribute(key: str, value: Any) -> str:
    """Render a single attribute as HTML text.

    Booleans and integers are always written as ``key="<value>"`` with the
    value lower-cased (so ``True`` becomes ``"true"`` and ``0`` stays ``"0"``).
    Any other truthy value is written as ``key="<str(value)>"``; any other
    falsy value (``""``, ``None``, ``0.0``, ...) yields the bare ``key``.
    """
    # bool is listed explicitly for readability even though it is an int subclass.
    if isinstance(value, (bool, int)):
        return f'{key}="{str(value).lower()}"'
    if not value:
        return key
    return f'{key}="{value!s}"'
def clean_element_before_hashing(element: dict) -> dict:
def clean_nested(element: dict) -> dict:
element_cleaned = {key: value for key, value in element.items() if key not in {"id", "rect", "frame_index"}}
@@ -125,7 +134,7 @@ def build_element_dict(
for element in elements:
element_id: str = element.get("id", "")
# get_interactable_element_tree marks each interactable element with a unique_id attribute
# get_interactable_element_tree marks each interactable element with a SKYVERN_ID_ATTR attribute
id_to_css_dict[element_id] = f"[{SKYVERN_ID_ATTR}='{element_id}']"
id_to_element_dict[element_id] = element
id_to_frame_dict[element_id] = element["frame"]
@@ -409,16 +418,18 @@ async def add_frame_interactable_elements(
# it will get stuck when we `frame.evaluate()` on an invisible iframe
if not await frame_element.is_visible():
return elements, element_tree
unique_id = await frame_element.get_attribute("unique_id")
if not unique_id:
skyvern_id = await frame_element.get_attribute(SKYVERN_ID_ATTR)
if not skyvern_id:
LOG.info(
"No unique_id found for frame, skipping",
"No Skyvern id found for frame, skipping",
frame_index=frame_index,
attr=SKYVERN_ID_ATTR,
)
return elements, element_tree
except Exception:
LOG.warning(
"Unable to get unique_id from frame_element",
"Unable to get Skyvern id from frame_element",
attr=SKYVERN_ID_ATTR,
exc_info=True,
)
return elements, element_tree
@@ -427,11 +438,11 @@ async def add_frame_interactable_elements(
await skyvern_frame.safe_wait_for_animation_end()
frame_elements, frame_element_tree = await skyvern_frame.build_tree_from_body(
frame_name=unique_id, frame_index=frame_index
frame_name=skyvern_id, frame_index=frame_index
)
for element in elements:
if element["id"] == unique_id:
if element["id"] == skyvern_id:
element["children"] = frame_element_tree
elements = elements + frame_elements
@@ -638,6 +649,9 @@ def _should_keep_unique_id(element: dict) -> bool:
# 1. no readonly attr and not disable attr and no interactable
# 2. readonly=false and disable=false and interactable=false
if element.get("hoverOnly"):
return True
attributes = element.get("attributes", {})
if (
"disabled" not in attributes

View File

@@ -133,6 +133,7 @@ class SkyvernElement:
self._id_cache = static_element.get("id", "")
self._tag_name = static_element.get("tagName", "")
self._selectable = static_element.get("isSelectable", False)
self._hover_only = static_element.get("hoverOnly", False)
self._frame_id = static_element.get("frame", "")
self._attributes = static_element.get("attributes", {})
self._rect: FloatRect | None = None
@@ -401,6 +402,49 @@ class SkyvernElement:
def get_attributes(self) -> dict:
    """Return the attribute mapping stored on this element (the same dict object, not a copy)."""
    return self._attributes
def requires_hover(self) -> bool:
    """Tell whether this element carries a truthy hover-only flag."""
    return True if self._hover_only else False
async def hover_to_reveal(
    self,
    max_depth: int = 4,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    settle_delay_s: float = 0.15,
) -> bool:
    """Try to make a hover-only element visible by hovering it or its ancestors.

    Starting with the element's own locator, each attempt scrolls the current
    target into view, hovers it, waits ``settle_delay_s`` seconds and checks
    whether the element itself is now visible. When an attempt does not
    reveal the element (or raises), the target moves one level up via the
    ``..`` locator and the next attempt runs against that parent.

    Args:
        max_depth: maximum number of targets to hover, counting the element itself.
        timeout: timeout forwarded to the Playwright scroll, hover and
            visibility calls (defaults to ``settings.BROWSER_ACTION_TIMEOUT_MS``).
        settle_delay_s: seconds to wait after each hover before checking visibility.

    Returns:
        ``True`` as soon as the element is visible after a hover; ``False`` when
        the element does not require a hover, when no single parent can be
        resolved, or when every attempt has been used up.
    """
    if not self.requires_hover():
        return False

    hover_target = self.get_locator()
    for depth in range(max_depth):
        try:
            # Bound the scroll with the same timeout as the hover. Without it the
            # call waits for Playwright's default timeout when the target is still
            # hidden, delaying the fall-through to the parent.
            await hover_target.scroll_into_view_if_needed(timeout=timeout)
            await hover_target.hover(timeout=timeout)
            # Give hover-triggered styles/transitions a moment to apply.
            await asyncio.sleep(settle_delay_s)
            if await self.get_locator().is_visible(timeout=timeout):
                LOG.debug("Hover reveal succeeded", element_id=self.get_id(), depth=depth)
                return True
        except Exception:
            # A failed attempt is not fatal: fall through and try the parent.
            LOG.debug(
                "Hover attempt failed while trying to reveal element",
                exc_info=True,
                element_id=self.get_id(),
                depth=depth,
            )

        parent_locator = hover_target.locator("..")
        try:
            # Stop climbing when the parent cannot be resolved to exactly one node.
            if await parent_locator.count() != 1:
                break
        except Exception:
            LOG.debug(
                "Unable to evaluate parent locator during hover reveal", exc_info=True, element_id=self.get_id()
            )
            break
        hover_target = parent_locator

    LOG.debug("Hover reveal attempts exhausted", element_id=self.get_id())
    return False
def get_options(self) -> list[SkyvernOptionType]:
options = self.__static_element.get("options", None)
if options is None: