Sync cloud skyvern to oss skyvern (#55)

2024-03-12 22:28:16 -07:00
parent 647ea2ac0f
commit 15d78d7b08
25 changed files with 554 additions and 163 deletions
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -3,7 +3,7 @@ from enum import StrEnum
 from typing import Any, Dict, List

 import structlog
-from pydantic import BaseModel
+from pydantic import BaseModel, Field

 from skyvern.forge.sdk.schemas.tasks import Task

@@ -34,6 +34,16 @@ class WebAction(Action, abc.ABC):
    element_id: int


+class UserDefinedError(BaseModel):  # one entry of the `errors` list declared on DecisiveAction
+    error_code: str  # required; presumably a code from the task's error_code_mapping -- TODO confirm
+    reasoning: str  # required string; no constraints are declared on it
+    confidence_float: float = Field(..., ge=0, le=1)  # required (`...`); validation rejects values outside [0, 1]
+
+
+class DecisiveAction(Action, abc.ABC):  # shared base of TerminateAction and CompleteAction
+    errors: List[UserDefinedError] = []  # user-defined errors attached to the action; empty when none are given
+
+
 class ClickAction(WebAction):
    action_type: ActionType = ActionType.CLICK
    file_url: str | None = None
@@ -102,11 +112,11 @@ class WaitAction(Action):
    action_type: ActionType = ActionType.WAIT


-class TerminateAction(Action):
+class TerminateAction(DecisiveAction):
    action_type: ActionType = ActionType.TERMINATE


-class CompleteAction(Action):
+class CompleteAction(DecisiveAction):
    action_type: ActionType = ActionType.COMPLETE
    data_extraction_goal: str | None = None

@@ -129,7 +139,7 @@ def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Actio
                reasoning=reasoning,
                actions=actions,
            )
-            actions.append(TerminateAction(reasoning=reasoning))
+            actions.append(TerminateAction(reasoning=reasoning, errors=action["errors"] if "errors" in action else []))
        elif action_type == ActionType.CLICK:
            file_url = action["file_url"] if "file_url" in action else None
            actions.append(ClickAction(element_id=element_id, reasoning=reasoning, file_url=file_url))
@@ -165,7 +175,13 @@ def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Actio
                    actions=actions,
                    llm_response=json_response,
                )
-            return [CompleteAction(reasoning=reasoning, data_extraction_goal=task.data_extraction_goal)]
+            return [
+                CompleteAction(
+                    reasoning=reasoning,
+                    data_extraction_goal=task.data_extraction_goal,
+                    errors=action["errors"] if "errors" in action else [],
+                )
+            ]
        elif action_type == "null":
            actions.append(NullAction(reasoning=reasoning))
        elif action_type == ActionType.SOLVE_CAPTCHA:
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -1,6 +1,7 @@
 import asyncio
+import json
 import re
-from typing import Awaitable, Callable, List
+from typing import Any, Awaitable, Callable, List

 import structlog
 from playwright.async_api import Locator, Page
@@ -82,7 +83,9 @@ async def handle_click_action(
 ) -> list[ActionResult]:
    xpath = await validate_actions_in_dom(action, page, scraped_page)
    await asyncio.sleep(0.3)
-    return await chain_click(page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+    return await chain_click(
+        task, page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
+    )


 async def handle_input_text_action(
@@ -91,7 +94,8 @@ async def handle_input_text_action(
    xpath = await validate_actions_in_dom(action, page, scraped_page)
    locator = page.locator(f"xpath={xpath}")
    await locator.clear()
-    await locator.fill(action.text, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+    text = get_actual_value_of_parameter_if_secret(task, action.text)
+    await locator.fill(text, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)

    # This is a hack that gets dropdowns to select the "best" option based on what's typed
    # Fixes situations like tsk_228671423990405776 where the location isn't being autocompleted
@@ -100,7 +104,7 @@ async def handle_input_text_action(
    if not input_value:
        LOG.info("Failed to input the text, trying to press sequentially with an enter click", action=action)
        await locator.clear()
-        await locator.press_sequentially(action.text, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        await locator.press_sequentially(text, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
        await locator.press("Enter", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
        input_value = await locator.input_value(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
        LOG.info("Input value", input_value=input_value, action=action)
@@ -114,7 +118,12 @@ async def handle_upload_file_action(
    if not action.file_url:
        LOG.warning("InputFileAction has no file_url", action=action)
        return [ActionFailure(MissingFileUrl())]
-    if action.file_url not in str(task.navigation_payload):
+    # ************************************************************************************************************** #
+    # After this point if the file_url is a secret, it will be replaced with the actual value
+    # In order to make sure we don't log the secret value, we log the action with the original value action.file_url
+    # ************************************************************************************************************** #
+    file_url = get_actual_value_of_parameter_if_secret(task, action.file_url)
+    if file_url not in str(task.navigation_payload):
        LOG.warning(
            "LLM might be imagining the file url, which is not in navigation payload",
            action=action,
@@ -122,7 +131,7 @@ async def handle_upload_file_action(
        )
        return [ActionFailure(ImaginaryFileUrl(action.file_url))]
    xpath = await validate_actions_in_dom(action, page, scraped_page)
-    file_path = download_file(action.file_url)
+    file_path = download_file(file_url)
    locator = page.locator(f"xpath={xpath}")
    is_file_input = await is_file_input_element(locator)
    if is_file_input:
@@ -141,7 +150,9 @@ async def handle_upload_file_action(
        LOG.info("Taking UploadFileAction. Found non file input tag", action=action)
        # treat it as a click action
        action.is_upload_file_tag = False
-        return await chain_click(page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        return await chain_click(
+            task, page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
+        )


 async def handle_null_action(
@@ -189,7 +200,7 @@ async def handle_select_option_action(
                child_anchor_xpath=child_anchor_xpath,
            )
            click_action = ClickAction(element_id=action.element_id)
-            return await chain_click(page, click_action, child_anchor_xpath)
+            return await chain_click(task, page, click_action, child_anchor_xpath)
        return [ActionFailure(Exception("No anchor tag found for the label for SelectOptionAction"))]
    elif tag_name == "a":
        # turn the SelectOptionAction into a ClickAction
@@ -198,7 +209,7 @@ async def handle_select_option_action(
            action=action,
        )
        click_action = ClickAction(element_id=action.element_id)
-        action_result = await chain_click(page, click_action, xpath)
+        action_result = await chain_click(task, page, click_action, xpath)
        return action_result
    elif tag_name == "ul" or tag_name == "div" or tag_name == "li":
        # if the role is listbox, find the option with the "label" or "value" and click that option element
@@ -234,7 +245,7 @@ async def handle_select_option_action(
            )
            # click the option element
            click_action = ClickAction(element_id=action.element_id)
-            return await chain_click(page, click_action, xpath)
+            return await chain_click(task, page, click_action, xpath)
        else:
            LOG.error(
                "SelectOptionAction on a non-listbox element. Cannot handle this action",
@@ -349,6 +360,22 @@ ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action
 ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)


+def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
+    """
+    Resolve `parameter` to the original secret value it stands for, if any.
+
+    Returns `parameter` unchanged when the task has no workflow run, or when the workflow run context
+    reports no secret for it. Used when typing text, uploading files, and clicking with a file_url.
+    """
+    run_id = task.workflow_run_id
+    if run_id is None:
+        return parameter
+
+    run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(run_id)
+    resolved = run_context.get_original_secret_value_or_none(parameter)
+    return parameter if resolved is None else resolved
+
+
 async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
    xpath = scraped_page.id_to_xpath_dict[action.element_id]
    locator = page.locator(xpath)
@@ -371,6 +398,7 @@ async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: S


 async def chain_click(
+    task: Task,
    page: Page,
    action: ClickAction | UploadFileAction,
    xpath: str,
@@ -384,7 +412,8 @@ async def chain_click(
    LOG.info("Chain click starts", action=action, xpath=xpath)
    file: list[str] | str = []
    if action.file_url:
-        file = download_file(action.file_url) or []
+        file_url = get_actual_value_of_parameter_if_secret(task, action.file_url)
+        file = download_file(file_url) or []

    fc_func = lambda fc: fc.set_files(files=file)
    page.on("filechooser", fc_func)
@@ -535,11 +564,13 @@ async def extract_information_for_navigation_goal(
    extract_information_prompt = prompt_engine.load_prompt(
        prompt_template,
        navigation_goal=task.navigation_goal,
+        navigation_payload=task.navigation_payload,
        elements=scraped_page.element_tree,
        data_extraction_goal=task.data_extraction_goal,
        extracted_information_schema=task.extracted_information_schema,
        current_url=scraped_page.url,
        extracted_text=scraped_page.extracted_text,
+        error_code_mapping_str=json.dumps(task.error_code_mapping) if task.error_code_mapping else None,
    )

    json_response = await app.OPENAI_CLIENT.chat_completion(
--- a/skyvern/webeye/actions/models.py
+++ b/skyvern/webeye/actions/models.py
@@ -5,7 +5,7 @@ from typing import Any
 from pydantic import BaseModel

 from skyvern.forge.sdk.settings_manager import SettingsManager
-from skyvern.webeye.actions.actions import Action, ActionTypeUnion
+from skyvern.webeye.actions.actions import Action, ActionTypeUnion, DecisiveAction, UserDefinedError
 from skyvern.webeye.actions.responses import ActionResult
 from skyvern.webeye.scraper.scraper import ScrapedPage

@@ -19,6 +19,7 @@ class AgentStepOutput(BaseModel):
    action_results: list[ActionResult] | None = None
    # Nullable for backwards compatibility, once backfill is done, this won't be nullable anymore
    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None = None
+    errors: list[UserDefinedError] = []

    def __repr__(self) -> str:
        return f"AgentStepOutput({self.model_dump()})"
@@ -51,8 +52,17 @@ class DetailedAgentStepOutput(BaseModel):
    def __str__(self) -> str:
        return self.__repr__()

+    def extract_errors(self) -> list[UserDefinedError]:
+        """Collect, in order, the errors carried by every DecisiveAction in actions_and_results."""
+        collected: list[UserDefinedError] = []
+        for action, _results in self.actions_and_results or []:
+            if isinstance(action, DecisiveAction):
+                collected += action.errors
+        return collected
+
    def to_agent_step_output(self) -> AgentStepOutput:
        return AgentStepOutput(
            action_results=self.action_results if self.action_results else [],
            actions_and_results=self.actions_and_results if self.actions_and_results else [],
+            errors=self.extract_errors(),
        )
--- a/skyvern/webeye/browser_factory.py
+++ b/skyvern/webeye/browser_factory.py
@@ -6,10 +6,17 @@ from datetime import datetime
 from typing import Any, Awaitable, Protocol

 import structlog
+from playwright._impl._errors import TimeoutError
 from playwright.async_api import BrowserContext, Error, Page, Playwright, async_playwright
 from pydantic import BaseModel

-from skyvern.exceptions import FailedToNavigateToUrl, UnknownBrowserType, UnknownErrorWhileCreatingBrowserContext
+from skyvern.exceptions import (
+    FailedToNavigateToUrl,
+    FailedToTakeScreenshot,
+    MissingBrowserStatePage,
+    UnknownBrowserType,
+    UnknownErrorWhileCreatingBrowserContext,
+)
 from skyvern.forge.sdk.core.skyvern_context import current
 from skyvern.forge.sdk.settings_manager import SettingsManager

@@ -58,9 +65,14 @@ class BrowserContextFactory:

    @staticmethod
    def build_browser_artifacts(
-        video_path: str | None = None, har_path: str | None = None, video_artifact_id: str | None = None
+        video_path: str | None = None,
+        har_path: str | None = None,
+        video_artifact_id: str | None = None,
+        traces_dir: str | None = None,
    ) -> BrowserArtifacts:
-        return BrowserArtifacts(video_path=video_path, har_path=har_path, video_artifact_id=video_artifact_id)
+        return BrowserArtifacts(
+            video_path=video_path, har_path=har_path, video_artifact_id=video_artifact_id, traces_dir=traces_dir
+        )

    @classmethod
    def register_type(cls, browser_type: str, creator: BrowserContextCreator) -> None:
@@ -86,6 +98,7 @@ class BrowserArtifacts(BaseModel):
    video_path: str | None = None
    video_artifact_id: str | None = None
    har_path: str | None = None
+    traces_dir: str | None = None


 async def _create_headless_chromium(playwright: Playwright, **kwargs: dict) -> tuple[BrowserContext, BrowserArtifacts]:
@@ -180,3 +193,26 @@ class BrowserState:
            LOG.info("Stopping playwright")
            await self.pw.stop()
            LOG.info("Playwright is stopped")
+
+    async def take_screenshot(self, full_page: bool = False, file_path: str | None = None) -> bytes:
+        """
+        Capture a screenshot of this browser state's page.
+
+        When file_path is given it is passed to Playwright as the screenshot path; otherwise the capture is
+        taken with animations disabled. Raises MissingBrowserStatePage if there is no page, and wraps every
+        failure of the capture itself (timeouts included) in FailedToTakeScreenshot.
+        """
+        if not self.page:
+            LOG.error("BrowserState has no page")
+            raise MissingBrowserStatePage()
+        try:
+            timeout_ms = SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS
+            if not file_path:
+                return await self.page.screenshot(full_page=full_page, timeout=timeout_ms, animations="disabled")
+            return await self.page.screenshot(path=file_path, full_page=full_page, timeout=timeout_ms)
+        except TimeoutError as timeout_err:
+            LOG.exception(f"Timeout error while taking screenshot: {str(timeout_err)}", exc_info=True)
+            raise FailedToTakeScreenshot(error_message=str(timeout_err)) from timeout_err
+        except Exception as err:
+            LOG.exception(f"Unknown error while taking screenshot: {str(err)}", exc_info=True)
+            raise FailedToTakeScreenshot(error_message=str(err)) from err
--- a/skyvern/webeye/browser_manager.py
+++ b/skyvern/webeye/browser_manager.py
@@ -50,6 +50,8 @@ class BrowserManager:
        await browser_state.get_or_create_page(task.url)

        self.pages[task.task_id] = browser_state
+        if task.workflow_run_id:
+            self.pages[task.workflow_run_id] = browser_state
        return browser_state

    async def get_or_create_for_workflow_run(self, workflow_run: WorkflowRun, url: str | None = None) -> BrowserState:
@@ -95,8 +97,11 @@ class BrowserManager:
        if browser_state:
            path = browser_state.browser_artifacts.video_path
            if path:
-                with open(path, "rb") as f:
-                    return f.read()
+                try:
+                    with open(path, "rb") as f:
+                        return f.read()
+                except FileNotFoundError:
+                    pass
        LOG.warning(
            "Video data not found for task", task_id=task_id, workflow_id=workflow_id, workflow_run_id=workflow_run_id
        )
@@ -135,18 +140,32 @@ class BrowserManager:
        LOG.info("Cleaning up for task")
        browser_state_to_close = self.pages.pop(task_id, None)
        if browser_state_to_close:
+            # Stop tracing before closing the browser if tracing is enabled
+            if browser_state_to_close.browser_context and browser_state_to_close.browser_artifacts.traces_dir:
+                trace_path = f"{browser_state_to_close.browser_artifacts.traces_dir}/{task_id}.zip"
+                await browser_state_to_close.browser_context.tracing.stop(path=trace_path)
+                LOG.info("Stopped tracing", trace_path=trace_path)
+
            await browser_state_to_close.close(close_browser_on_completion=close_browser_on_completion)
        LOG.info("Task is cleaned up")

        return browser_state_to_close

    async def cleanup_for_workflow_run(
-        self, workflow_run_id: str, close_browser_on_completion: bool = True
+        self, workflow_run_id: str, task_ids: list[str], close_browser_on_completion: bool = True
    ) -> BrowserState | None:
        LOG.info("Cleaning up for workflow run")
        browser_state_to_close = self.pages.pop(workflow_run_id, None)
        if browser_state_to_close:
+            # Stop tracing before closing the browser if tracing is enabled
+            if browser_state_to_close.browser_context and browser_state_to_close.browser_artifacts.traces_dir:
+                trace_path = f"{browser_state_to_close.browser_artifacts.traces_dir}/{workflow_run_id}.zip"
+                await browser_state_to_close.browser_context.tracing.stop(path=trace_path)
+                LOG.info("Stopped tracing", trace_path=trace_path)
+
            await browser_state_to_close.close(close_browser_on_completion=close_browser_on_completion)
+        for task_id in task_ids:
+            self.pages.pop(task_id, None)
        LOG.info("Workflow run is cleaned up")

        return browser_state_to_close
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -170,7 +170,7 @@ async def scrape_web_unsafe(
    scroll_y_px = await scroll_to_top(page, drow_boxes=True)
    # Checking max number of screenshots to prevent infinite loop
    while scroll_y_px_old != scroll_y_px and len(screenshots) < SettingsManager.get_settings().MAX_NUM_SCREENSHOTS:
-        screenshot = await page.screenshot(full_page=False)
+        screenshot = await browser_state.take_screenshot(full_page=False)
        screenshots.append(screenshot)
        scroll_y_px_old = scroll_y_px
        LOG.info("Scrolling to next page", url=url, num_screenshots=len(screenshots))
@@ -348,9 +348,10 @@ def _build_element_links(elements: list[dict]) -> None:
        listbox_text = element["text"] if "text" in element else ""

        # WARNING: If a listbox has really little commont content (yes/no, etc.),
-        #   it might have conflict and will connect to wrong element. If so, code should be added to prevent that:
+        #   it might have conflict and will connect to wrong element
        # if len(listbox_text) < 10:
-        #     # do not support small listbox text as it's error proning. larger text match is more reliable
+        #     # do not support small listbox text for now as it's error proning. larger text match is more reliable
+        #     LOG.info("Skip because too short listbox text", listbox_text=listbox_text)
        #     continue

        for text, linked_elements in text_to_elements_map.items():
@@ -369,7 +370,6 @@ def _build_element_links(elements: list[dict]) -> None:
        for context, linked_elements in context_to_elements_map.items():
            if listbox_text in context:
                for linked_element in linked_elements:
-                    # if _ensure_nearby_rects(element["rect"], linked_element["rect"]):
                    if linked_element["id"] != element["id"]:
                        LOG.info(
                            "Match listbox to target element context",