add actions db model and caching V0 (#980)
This commit is contained in:
@@ -1,14 +1,17 @@
|
||||
from enum import StrEnum
|
||||
from typing import Annotated, Any, Dict
|
||||
from typing import Annotated, Any, Dict, Type, TypeVar
|
||||
|
||||
import structlog
|
||||
from deprecation import deprecated
|
||||
from litellm import ConfigDict
|
||||
from pydantic import BaseModel, Field, ValidationError
|
||||
|
||||
from skyvern.exceptions import UnsupportedActionType
|
||||
from skyvern.forge.sdk.schemas.tasks import Task
|
||||
from skyvern.webeye.scraper.scraper import ScrapedPage
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
T = TypeVar("T", bound="Action")
|
||||
|
||||
|
||||
class ActionType(StrEnum):
|
||||
@@ -27,6 +30,23 @@ class ActionType(StrEnum):
|
||||
TERMINATE = "terminate"
|
||||
COMPLETE = "complete"
|
||||
|
||||
def is_web_action(self) -> bool:
    """Return True when this action type is one of the web action types.

    The web action types are CLICK, INPUT_TEXT, UPLOAD_FILE, DOWNLOAD_FILE,
    SELECT_OPTION and CHECKBOX; every other action type yields False.
    """
    web_action_types = (
        ActionType.CLICK,
        ActionType.INPUT_TEXT,
        ActionType.UPLOAD_FILE,
        ActionType.DOWNLOAD_FILE,
        ActionType.SELECT_OPTION,
        ActionType.CHECKBOX,
    )
    return self in web_action_types
|
||||
|
||||
|
||||
class ActionStatus(StrEnum):
|
||||
pending = "pending"
|
||||
skipped = "skipped"
|
||||
failed = "failed"
|
||||
completed = "completed"
|
||||
|
||||
|
||||
class UserDefinedError(BaseModel):
|
||||
error_code: str
|
||||
@@ -53,11 +73,26 @@ class InputOrSelectContext(BaseModel):
|
||||
|
||||
|
||||
class Action(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
action_type: ActionType
|
||||
status: ActionStatus = ActionStatus.pending
|
||||
action_id: str | None = None
|
||||
source_action_id: str | None = None
|
||||
organization_id: str | None = None
|
||||
workflow_run_id: str | None = None
|
||||
task_id: str | None = None
|
||||
step_id: str | None = None
|
||||
step_order: int | None = None
|
||||
action_order: int | None = None
|
||||
confidence_float: float | None = None
|
||||
description: str | None = None
|
||||
reasoning: str | None = None
|
||||
intention: str | None = None
|
||||
response: str | None = None
|
||||
element_id: Annotated[str, Field(coerce_numbers_to_str=True)] | None = None
|
||||
skyvern_element_hash: str | None = None
|
||||
skyvern_element_data: dict[str, Any] | None = None
|
||||
|
||||
# DecisiveAction (CompleteAction, TerminateAction) fields
|
||||
errors: list[UserDefinedError] | None = None
|
||||
@@ -72,6 +107,38 @@ class Action(BaseModel):
|
||||
option: SelectOption | None = None
|
||||
is_checked: bool | None = None
|
||||
|
||||
@classmethod
def validate(cls: Type[T], value: Any) -> T:
    """Build the concrete action model described by a raw dict.

    The ``action_type`` entry of ``value`` selects which action class
    validates the payload via ``model_validate``.

    Args:
        value: A dict holding at least an ``action_type`` key. The type may
            be an ``ActionType`` member or its plain string value (e.g.
            ``"click"``), which is what a deserialized payload contains.

    Returns:
        An instance of the action class matching ``action_type``.

    Raises:
        KeyError: ``value`` is a dict without an ``action_type`` key.
        ValueError: ``value`` is not a dict, or ``action_type`` does not map
            to one of the action classes handled here.
    """
    if not isinstance(value, dict):
        raise ValueError("Invalid action data")

    raw_action_type = value["action_type"]
    try:
        # Coerce instead of comparing with `is`: an identity check only
        # matches the enum member object itself, so a plain string such as
        # "click" would never match any branch and be rejected.
        action_type = ActionType(raw_action_type)
    except ValueError:
        raise ValueError(f"Unsupported action type: {raw_action_type}") from None

    # Built at call time because the action classes are referenced by name
    # only when the method runs, exactly as the original branches did.
    action_classes = {
        ActionType.CLICK: ClickAction,
        ActionType.INPUT_TEXT: InputTextAction,
        ActionType.UPLOAD_FILE: UploadFileAction,
        ActionType.DOWNLOAD_FILE: DownloadFileAction,
        ActionType.NULL_ACTION: NullAction,
        ActionType.TERMINATE: TerminateAction,
        ActionType.COMPLETE: CompleteAction,
        ActionType.SELECT_OPTION: SelectOptionAction,
        ActionType.CHECKBOX: CheckboxAction,
        ActionType.WAIT: WaitAction,
        ActionType.SOLVE_CAPTCHA: SolveCaptchaAction,
    }
    action_class = action_classes.get(action_type)
    if action_class is None:
        raise ValueError(f"Unsupported action type: {action_type}")
    return action_class.model_validate(value)
|
||||
|
||||
|
||||
class WebAction(Action):
|
||||
element_id: Annotated[str, Field(coerce_numbers_to_str=True)]
|
||||
@@ -159,7 +226,7 @@ class CompleteAction(DecisiveAction):
|
||||
data_extraction_goal: str | None = None
|
||||
|
||||
|
||||
def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None) -> Action:
|
||||
def parse_action(action: Dict[str, Any], scraped_page: ScrapedPage, data_extraction_goal: str | None = None) -> Action:
|
||||
if "id" in action:
|
||||
element_id = action["id"]
|
||||
elif "element_id" in action:
|
||||
@@ -167,57 +234,58 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
|
||||
else:
|
||||
element_id = None
|
||||
|
||||
skyvern_element_hash = scraped_page.id_to_element_hash.get(element_id) if element_id else None
|
||||
skyvern_element_data = scraped_page.id_to_element_dict.get(element_id) if element_id else None
|
||||
|
||||
reasoning = action["reasoning"] if "reasoning" in action else None
|
||||
confidence_float = action["confidence_float"] if "confidence_float" in action else None
|
||||
# TODO: currently action intention and response are only used for Q&A actions, like input_text
|
||||
# When we start supporting click action, intention will be the reasoning for the click action (why take the action)
|
||||
intention = action["user_detail_query"] if "user_detail_query" in action else None
|
||||
response = action["user_detail_answer"] if "user_detail_answer" in action else None
|
||||
|
||||
base_action_dict = {
|
||||
"element_id": element_id,
|
||||
"skyvern_element_hash": skyvern_element_hash,
|
||||
"skyvern_element_data": skyvern_element_data,
|
||||
"reasoning": reasoning,
|
||||
"confidence_float": confidence_float,
|
||||
"intention": intention,
|
||||
"response": response,
|
||||
}
|
||||
|
||||
if "action_type" not in action or action["action_type"] is None:
|
||||
return NullAction(reasoning=reasoning, confidence_float=confidence_float)
|
||||
return NullAction(**base_action_dict)
|
||||
|
||||
# `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
|
||||
action_type = ActionType[action["action_type"].upper()]
|
||||
|
||||
if not action_type.is_web_action():
|
||||
# LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
|
||||
# That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not
|
||||
# set for non-web actions.
|
||||
base_action_dict["element_id"] = None
|
||||
|
||||
if action_type == ActionType.TERMINATE:
|
||||
return TerminateAction(
|
||||
reasoning=reasoning,
|
||||
confidence_float=confidence_float,
|
||||
errors=action["errors"] if "errors" in action else [],
|
||||
)
|
||||
return TerminateAction(**base_action_dict, errors=action["errors"] if "errors" in action else [])
|
||||
|
||||
if action_type == ActionType.CLICK:
|
||||
file_url = action["file_url"] if "file_url" in action else None
|
||||
return ClickAction(
|
||||
element_id=element_id,
|
||||
reasoning=reasoning,
|
||||
confidence_float=confidence_float,
|
||||
file_url=file_url,
|
||||
download=action.get("download", False),
|
||||
)
|
||||
return ClickAction(**base_action_dict, file_url=file_url, download=action.get("download", False))
|
||||
|
||||
if action_type == ActionType.INPUT_TEXT:
|
||||
return InputTextAction(
|
||||
element_id=element_id,
|
||||
text=action["text"],
|
||||
reasoning=reasoning,
|
||||
confidence_float=confidence_float,
|
||||
)
|
||||
return InputTextAction(**base_action_dict, text=action["text"])
|
||||
|
||||
if action_type == ActionType.UPLOAD_FILE:
|
||||
# TODO: see if the element is a file input element. if it's not, convert this action into a click action
|
||||
return UploadFileAction(
|
||||
element_id=element_id,
|
||||
confidence_float=confidence_float,
|
||||
**base_action_dict,
|
||||
file_url=action["file_url"],
|
||||
reasoning=reasoning,
|
||||
)
|
||||
|
||||
# This action is not used in the current implementation. Click actions are used instead.
|
||||
if action_type == ActionType.DOWNLOAD_FILE:
|
||||
return DownloadFileAction(
|
||||
element_id=element_id,
|
||||
file_name=action["file_name"],
|
||||
reasoning=reasoning,
|
||||
confidence_float=confidence_float,
|
||||
)
|
||||
return DownloadFileAction(**base_action_dict, file_name=action["file_name"])
|
||||
|
||||
if action_type == ActionType.SELECT_OPTION:
|
||||
option = action["option"]
|
||||
@@ -229,49 +297,54 @@ def parse_action(action: Dict[str, Any], data_extraction_goal: str | None = None
|
||||
if label is None and value is None and index is None:
|
||||
raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption")
|
||||
return SelectOptionAction(
|
||||
element_id=element_id,
|
||||
**base_action_dict,
|
||||
option=SelectOption(
|
||||
label=label,
|
||||
value=value,
|
||||
index=index,
|
||||
),
|
||||
reasoning=reasoning,
|
||||
confidence_float=confidence_float,
|
||||
)
|
||||
|
||||
if action_type == ActionType.CHECKBOX:
|
||||
return CheckboxAction(
|
||||
element_id=element_id,
|
||||
**base_action_dict,
|
||||
is_checked=action["is_checked"],
|
||||
reasoning=reasoning,
|
||||
confidence_float=confidence_float,
|
||||
)
|
||||
|
||||
if action_type == ActionType.WAIT:
|
||||
return WaitAction(reasoning=reasoning, confidence_float=confidence_float)
|
||||
return WaitAction(**base_action_dict)
|
||||
|
||||
if action_type == ActionType.COMPLETE:
|
||||
return CompleteAction(
|
||||
reasoning=reasoning,
|
||||
confidence_float=confidence_float,
|
||||
**base_action_dict,
|
||||
data_extraction_goal=data_extraction_goal,
|
||||
errors=action["errors"] if "errors" in action else [],
|
||||
)
|
||||
|
||||
if action_type == "null":
|
||||
return NullAction(reasoning=reasoning, confidence_float=confidence_float)
|
||||
return NullAction(**base_action_dict)
|
||||
|
||||
if action_type == ActionType.SOLVE_CAPTCHA:
|
||||
return SolveCaptchaAction(reasoning=reasoning, confidence_float=confidence_float)
|
||||
return SolveCaptchaAction(**base_action_dict)
|
||||
|
||||
raise UnsupportedActionType(action_type=action_type)
|
||||
|
||||
|
||||
def parse_actions(task: Task, json_response: list[Dict[str, Any]]) -> list[Action]:
|
||||
def parse_actions(
|
||||
task: Task, step_id: str, step_order: int, scraped_page: ScrapedPage, json_response: list[Dict[str, Any]]
|
||||
) -> list[Action]:
|
||||
actions: list[Action] = []
|
||||
for action in json_response:
|
||||
for idx, action in enumerate(json_response):
|
||||
try:
|
||||
action_instance = parse_action(action=action, data_extraction_goal=task.data_extraction_goal)
|
||||
action_instance = parse_action(
|
||||
action=action, scraped_page=scraped_page, data_extraction_goal=task.data_extraction_goal
|
||||
)
|
||||
action_instance.organization_id = task.organization_id
|
||||
action_instance.workflow_run_id = task.workflow_run_id
|
||||
action_instance.task_id = task.task_id
|
||||
action_instance.step_id = step_id
|
||||
action_instance.step_order = step_order
|
||||
action_instance.action_order = idx
|
||||
if isinstance(action_instance, TerminateAction):
|
||||
LOG.warning(
|
||||
"Agent decided to terminate",
|
||||
@@ -303,6 +376,23 @@ def parse_actions(task: Task, json_response: list[Dict[str, Any]]) -> list[Actio
|
||||
raw_action=action,
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
############################ This part of code might not be needed ############################
|
||||
# Reason #1. validation can be done in action handler but not in parser
|
||||
# Reason #2. no need to validate whether the element_id has a hash.
|
||||
# If there's no hash, we can fall back to normal operation
|
||||
all_element_ids = [action.element_id for action in actions if action.element_id]
|
||||
missing_element_ids = [
|
||||
element_id for element_id in all_element_ids if element_id not in scraped_page.id_to_element_hash
|
||||
]
|
||||
if missing_element_ids:
|
||||
LOG.warning(
|
||||
"Missing elements in scraped page",
|
||||
task_id=task.task_id,
|
||||
missing_element_ids=missing_element_ids,
|
||||
all_element_ids=all_element_ids,
|
||||
)
|
||||
############################ This part of code might not be needed ############################
|
||||
return actions
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user