resize screenshots to make them compatible with anthropic CUA (#2255)
This commit is contained in:
@@ -385,7 +385,7 @@ class ForgeAgent:
|
|||||||
# llm_caller = LLMCaller(llm_key="BEDROCK_ANTHROPIC_CLAUDE3.5_SONNET_INFERENCE_PROFILE")
|
# llm_caller = LLMCaller(llm_key="BEDROCK_ANTHROPIC_CLAUDE3.5_SONNET_INFERENCE_PROFILE")
|
||||||
llm_caller = LLMCallerManager.get_llm_caller(task.task_id)
|
llm_caller = LLMCallerManager.get_llm_caller(task.task_id)
|
||||||
if not llm_caller:
|
if not llm_caller:
|
||||||
llm_caller = LLMCaller(llm_key=settings.ANTHROPIC_CUA_LLM_KEY)
|
llm_caller = LLMCaller(llm_key=settings.ANTHROPIC_CUA_LLM_KEY, screenshot_scaling_enabled=True)
|
||||||
LLMCallerManager.set_llm_caller(task.task_id, llm_caller)
|
LLMCallerManager.set_llm_caller(task.task_id, llm_caller)
|
||||||
step, detailed_output = await self.agent_step(
|
step, detailed_output = await self.agent_step(
|
||||||
task,
|
task,
|
||||||
@@ -1450,7 +1450,13 @@ class ForgeAgent:
|
|||||||
assistant_content = llm_response["content"]
|
assistant_content = llm_response["content"]
|
||||||
llm_caller.message_history.append({"role": "assistant", "content": assistant_content})
|
llm_caller.message_history.append({"role": "assistant", "content": assistant_content})
|
||||||
|
|
||||||
actions = await parse_anthropic_actions(task, step, assistant_content)
|
actions = await parse_anthropic_actions(
|
||||||
|
task,
|
||||||
|
step,
|
||||||
|
assistant_content,
|
||||||
|
llm_caller.browser_window_dimension,
|
||||||
|
llm_caller.screenshot_resize_target_dimension,
|
||||||
|
)
|
||||||
return actions
|
return actions
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ from skyvern.forge.sdk.core import skyvern_context
|
|||||||
from skyvern.forge.sdk.models import Step
|
from skyvern.forge.sdk.models import Step
|
||||||
from skyvern.forge.sdk.schemas.ai_suggestions import AISuggestion
|
from skyvern.forge.sdk.schemas.ai_suggestions import AISuggestion
|
||||||
from skyvern.forge.sdk.schemas.task_v2 import TaskV2, Thought
|
from skyvern.forge.sdk.schemas.task_v2 import TaskV2, Thought
|
||||||
|
from skyvern.utils.image_resizer import Resolution, get_resize_target_dimension, resize_screenshots
|
||||||
|
|
||||||
LOG = structlog.get_logger()
|
LOG = structlog.get_logger()
|
||||||
|
|
||||||
@@ -454,12 +455,22 @@ class LLMCaller:
|
|||||||
An LLMCaller instance defines the LLM configs and keeps the chat history if needed.
|
An LLMCaller instance defines the LLM configs and keeps the chat history if needed.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
    self,
    llm_key: str,
    screenshot_scaling_enabled: bool = False,
    base_parameters: dict[str, Any] | None = None,
):
    """Set up an LLM caller for one conversation.

    Args:
        llm_key: Registry key identifying which LLM configuration to use.
        screenshot_scaling_enabled: When True, screenshots sent to the model
            are resized to a CUA-compatible resolution derived from the
            browser window dimensions.
        base_parameters: Optional extra parameters merged into LLM requests.
    """
    self.llm_key = llm_key
    self.llm_config = LLMConfigRegistry.get_config(llm_key)
    self.base_parameters = base_parameters
    self.message_history: list[dict[str, Any]] = []
    self.current_tool_results: list[dict[str, Any]] = []
    self.screenshot_scaling_enabled = screenshot_scaling_enabled
    # The live browser viewport size, taken from global settings.
    self.browser_window_dimension = Resolution(width=settings.BROWSER_WIDTH, height=settings.BROWSER_HEIGHT)
    # With scaling disabled the "target" equals the window, i.e. no resize.
    self.screenshot_resize_target_dimension = (
        get_resize_target_dimension(self.browser_window_dimension)
        if screenshot_scaling_enabled
        else self.browser_window_dimension
    )
||||||
def add_tool_result(self, tool_result: dict[str, Any]) -> None:
|
def add_tool_result(self, tool_result: dict[str, Any]) -> None:
|
||||||
self.current_tool_results.append(tool_result)
|
self.current_tool_results.append(tool_result)
|
||||||
@@ -504,6 +515,9 @@ class LLMCaller:
|
|||||||
ai_suggestion=ai_suggestion,
|
ai_suggestion=ai_suggestion,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if screenshots and self.screenshot_scaling_enabled:
|
||||||
|
screenshots = resize_screenshots(screenshots, self.screenshot_resize_target_dimension)
|
||||||
|
|
||||||
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
||||||
data=prompt.encode("utf-8") if prompt else b"",
|
data=prompt.encode("utf-8") if prompt else b"",
|
||||||
artifact_type=ArtifactType.LLM_PROMPT,
|
artifact_type=ArtifactType.LLM_PROMPT,
|
||||||
|
|||||||
59
skyvern/utils/image_resizer.py
Normal file
59
skyvern/utils/image_resizer.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
import io
|
||||||
|
from typing import TypedDict
|
||||||
|
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
|
|
||||||
|
class Resolution(TypedDict):
    """Pixel dimensions of an image or browser window."""

    width: int
    height: int


# Resolutions Anthropic's computer-use model works best with, keyed by the
# common display-standard name. Insertion order matters: the first entry whose
# aspect ratio matches the window is chosen.
MAX_SCALING_TARGETS_ANTHROPIC_CUA: dict[str, Resolution] = {
    "XGA": Resolution(width=1024, height=768),  # 4:3
    "WXGA": Resolution(width=1280, height=800),  # 16:10
    "FWXGA": Resolution(width=1366, height=768),  # ~16:9
}


def get_resize_target_dimension(
    window_size: Resolution,
    max_scaling_targets: dict[str, Resolution] = MAX_SCALING_TARGETS_ANTHROPIC_CUA,
    *,
    ratio_tolerance: float = 0.02,
) -> Resolution:
    """Pick the scaling target whose aspect ratio matches ``window_size``.

    Iterates the candidates in insertion order and returns the first one whose
    width/height ratio is within ``ratio_tolerance`` of the window's ratio.
    Falls back to ``window_size`` itself (i.e. no resize) when nothing matches.

    Args:
        window_size: The browser window / screenshot dimensions.
        max_scaling_targets: Candidate target resolutions to choose from.
        ratio_tolerance: Maximum allowed aspect-ratio difference for a match
            (keyword-only; defaults to the previous hard-coded 0.02).

    Returns:
        The matching target resolution, or ``window_size`` unchanged.
    """
    ratio = window_size["width"] / window_size["height"]
    for dimension in max_scaling_targets.values():
        if abs(dimension["width"] / dimension["height"] - ratio) < ratio_tolerance:
            return dimension
    # No aspect-ratio match: keep the original dimensions (no resizing).
    return window_size
|
||||||
|
|
||||||
|
|
||||||
|
def resize_screenshots(screenshots: list[bytes], target_dimension: Resolution) -> list[bytes]:
    """Resize each screenshot to ``target_dimension`` and re-encode as PNG.

    The image scaling logic is originated from anthropic's quickstart guide:
    https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L49-L60

    Args:
        screenshots: Raw encoded image bytes (any format PIL can open).
        target_dimension: The width/height every output image should have.

    Returns:
        A new list of PNG-encoded bytes, one per input screenshot.
    """
    target_size = (target_dimension["width"], target_dimension["height"])
    resized_screenshots: list[bytes] = []
    for raw in screenshots:
        # Decode, scale with LANCZOS (high-quality downsampling), re-encode.
        image = Image.open(io.BytesIO(raw))
        scaled = image.resize(target_size, Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        scaled.save(buffer, format="PNG")
        resized_screenshots.append(buffer.getvalue())
    return resized_screenshots
|
||||||
|
|
||||||
|
|
||||||
|
def scale_coordinates(
    current_coordinates: tuple[int, int],
    current_dimension: Resolution,
    target_dimension: Resolution,
) -> tuple[int, int]:
    """Map a point from one resolution space to another.

    Used to translate coordinates the model emits against a resized
    screenshot back into real browser-window coordinates (and vice versa).
    Results are truncated to ints via ``int()``, matching the original
    multiply-then-divide evaluation order exactly.
    """
    x, y = current_coordinates
    # Keep the exact `value * target / current` order so float truncation
    # behaves identically to the previous implementation.
    scaled_x = int(x * target_dimension["width"] / current_dimension["width"])
    scaled_y = int(y * target_dimension["height"] / current_dimension["height"])
    return (scaled_x, scaled_y)
|
||||||
@@ -9,6 +9,7 @@ from skyvern.forge import app
|
|||||||
from skyvern.forge.prompts import prompt_engine
|
from skyvern.forge.prompts import prompt_engine
|
||||||
from skyvern.forge.sdk.models import Step
|
from skyvern.forge.sdk.models import Step
|
||||||
from skyvern.forge.sdk.schemas.tasks import Task
|
from skyvern.forge.sdk.schemas.tasks import Task
|
||||||
|
from skyvern.utils.image_resizer import Resolution, scale_coordinates
|
||||||
from skyvern.webeye.actions.actions import (
|
from skyvern.webeye.actions.actions import (
|
||||||
Action,
|
Action,
|
||||||
ActionType,
|
ActionType,
|
||||||
@@ -454,6 +455,8 @@ async def parse_anthropic_actions(
|
|||||||
task: Task,
|
task: Task,
|
||||||
step: Step,
|
step: Step,
|
||||||
assistant_content: list[dict[str, Any]],
|
assistant_content: list[dict[str, Any]],
|
||||||
|
browser_window_dimension: Resolution,
|
||||||
|
screenshot_resize_target_dimension: Resolution,
|
||||||
) -> list[Action]:
|
) -> list[Action]:
|
||||||
tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
|
tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
|
||||||
idx = 0
|
idx = 0
|
||||||
@@ -468,7 +471,11 @@ async def parse_anthropic_actions(
|
|||||||
continue
|
continue
|
||||||
action = tool_call_input["action"]
|
action = tool_call_input["action"]
|
||||||
if action == "mouse_move":
|
if action == "mouse_move":
|
||||||
x, y = tool_call_input["coordinate"]
|
original_x, original_y = tool_call_input["coordinate"]
|
||||||
|
# (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension.
|
||||||
|
x, y = scale_coordinates(
|
||||||
|
(original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension
|
||||||
|
)
|
||||||
actions.append(
|
actions.append(
|
||||||
MoveAction(
|
MoveAction(
|
||||||
x=x,
|
x=x,
|
||||||
@@ -497,7 +504,10 @@ async def parse_anthropic_actions(
|
|||||||
)
|
)
|
||||||
idx += 1
|
idx += 1
|
||||||
continue
|
continue
|
||||||
x, y = coordinate
|
original_x, original_y = coordinate
|
||||||
|
x, y = scale_coordinates(
|
||||||
|
(original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension
|
||||||
|
)
|
||||||
actions.append(
|
actions.append(
|
||||||
ClickAction(
|
ClickAction(
|
||||||
element_id="",
|
element_id="",
|
||||||
|
|||||||
Reference in New Issue
Block a user