support volcengine + migrate ui tars to volcengine (#2705)
.env.example
@@ -43,14 +43,12 @@ ENABLE_NOVITA=false
 # NOVITA_API_KEY: Your Novita AI API key.
 NOVITA_API_KEY=""
 
-# ENABLE_UI_TARS: Set to true to enable UI-TARS (Seed1.5-VL) as a language model provider.
-ENABLE_UI_TARS=false
-# UI_TARS_API_KEY: Your ByteDance Doubao API key for accessing UI-TARS models.
-UI_TARS_API_KEY=""
-# UI_TARS_API_BASE: The base URL for ByteDance Doubao API.
-UI_TARS_API_BASE="https://ark.cn-beijing.volces.com/api/v3"
-# UI_TARS_MODEL: Your UI-TARS model endpoint ID from ByteDance Doubao.
-UI_TARS_MODEL="doubao-1-5-thinking-vision-pro-250428"
+# ENABLE_VOLCENGINE: Set to true to enable Volcengine(ByteDance Doubao) as a language model provider.
+ENABLE_VOLCENGINE=false
+# VOLCENGINE_API_KEY: Your Volcengine(ByteDance Doubao) API key.
+VOLCENGINE_API_KEY=""
+# VOLCENGINE_API_BASE: The base URL for Volcengine(ByteDance Doubao) API.
+VOLCENGINE_API_BASE="https://ark.cn-beijing.volces.com/api/v3"
 
 # LLM_KEY: The chosen language model to use. This should be one of the models
 # provided by the enabled LLM providers (e.g., OPENAI_GPT4_TURBO, OPENAI_GPT4V, ANTHROPIC_CLAUDE3, AZURE_OPENAI_GPT4V).
@@ -160,30 +160,26 @@ def setup_llm_providers() -> None:
     else:
         update_or_add_env_var("ENABLE_NOVITA", "false")
 
-    console.print("\n[bold blue]--- UI-TARS Configuration ---[/bold blue]")
-    console.print("To enable UI-TARS (Seed1.5-VL), you must have a ByteDance Doubao API key.")
-    console.print("UI-TARS now uses direct VolcEngine API calls for improved compatibility.")
-    enable_ui_tars = Confirm.ask("Do you want to enable UI-TARS?")
-    if enable_ui_tars:
-        ui_tars_api_key = Prompt.ask("Enter your ByteDance Doubao API key", password=True)
-        if not ui_tars_api_key:
-            console.print("[red]Error: UI-TARS API key is required. UI-TARS will not be enabled.[/red]")
+    console.print("\n[bold blue]--- VolcEngine Configuration ---[/bold blue]")
+    console.print("To enable VolcEngine, you must have a ByteDance Doubao API key.")
+    enable_volcengine = Confirm.ask("Do you want to enable VolcEngine?")
+    if enable_volcengine:
+        volcengine_api_key = Prompt.ask("Enter your VolcEngine(ByteDance Doubao) API key", password=True)
+        if not volcengine_api_key:
+            console.print("[red]Error: VolcEngine key is required. VolcEngine will not be enabled.[/red]")
         else:
-            update_or_add_env_var("UI_TARS_API_KEY", ui_tars_api_key)
-            update_or_add_env_var("ENABLE_UI_TARS", "true")
-
-            # Optional: Allow customizing model endpoint
-            custom_model = Confirm.ask(
-                "Do you want to use a custom model endpoint? (default: doubao-1-5-thinking-vision-pro-250428)"
+            update_or_add_env_var("VOLCENGINE_API_KEY", volcengine_api_key)
+            update_or_add_env_var("ENABLE_VOLCENGINE", "true")
+
+            model_options.extend(
+                [
+                    "VOLCENGINE_DOUBAO_SEED_1_6",
+                    "VOLCENGINE_DOUBAO_SEED_1_6_FLASH",
+                    "VOLCENGINE_DOUBAO_1_5_THINKING_VISION_PRO",
+                ]
             )
-            if custom_model:
-                ui_tars_model = Prompt.ask("Enter your UI-TARS model endpoint ID")
-                if ui_tars_model:
-                    update_or_add_env_var("UI_TARS_MODEL", ui_tars_model)
-
-            model_options.append("UI_TARS_SEED1_5_VL")
     else:
-        update_or_add_env_var("ENABLE_UI_TARS", "false")
+        update_or_add_env_var("ENABLE_VOLCENGINE", "false")
 
     console.print("\n[bold blue]--- OpenAI-Compatible Provider Configuration ---[/bold blue]")
     console.print("To enable an OpenAI-compatible provider, you must have a model name, API key, and API base URL.")
@@ -134,12 +134,11 @@ class Settings(BaseSettings):
     ANTHROPIC_API_KEY: str | None = None
     ANTHROPIC_CUA_LLM_KEY: str = "ANTHROPIC_CLAUDE3.7_SONNET"
 
-    # UI-TARS (Seed1.5-VL via Doubao)
-    UI_TARS_API_KEY: str | None = None
-    UI_TARS_API_BASE: str = "https://ark.cn-beijing.volces.com/api/v3"
-    UI_TARS_MODEL: str = "doubao-1-5-thinking-vision-pro-250428"
-    UI_TARS_LLM_KEY: str = "UI_TARS_SEED1_5_VL"
-    ENABLE_UI_TARS: bool = False
+    # VOLCENGINE (Doubao)
+    ENABLE_VOLCENGINE: bool = False
+    VOLCENGINE_API_KEY: str | None = None
+    VOLCENGINE_API_BASE: str = "https://ark.cn-beijing.volces.com/api/v3"
+    VOLCENGINE_CUA_LLM_KEY: str = "VOLCENGINE_DOUBAO_1_5_THINKING_VISION_PRO"
 
     # OPENAI COMPATIBLE
     OPENAI_COMPATIBLE_MODEL_NAME: str | None = None
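
Since Settings extends BaseSettings, each of these fields loads straight from the matching .env key added above. A condensed, illustrative sketch (a standalone pydantic-settings model, not Skyvern's actual class):

    from pydantic_settings import BaseSettings


    class VolcengineSettings(BaseSettings):
        # Mirrors the fields added in this hunk; values load from the
        # environment or a .env file.
        ENABLE_VOLCENGINE: bool = False
        VOLCENGINE_API_KEY: str | None = None
        VOLCENGINE_API_BASE: str = "https://ark.cn-beijing.volces.com/api/v3"
        VOLCENGINE_CUA_LLM_KEY: str = "VOLCENGINE_DOUBAO_1_5_THINKING_VISION_PRO"


    # With ENABLE_VOLCENGINE=true exported, this parses to True.
    print(VolcengineSettings().ENABLE_VOLCENGINE)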
@@ -404,7 +404,7 @@ class ForgeAgent:
         llm_caller = LLMCallerManager.get_llm_caller(task.task_id)
         if not llm_caller:
             # create a new UI-TARS llm_caller
-            llm_key = task.llm_key or settings.UI_TARS_LLM_KEY
+            llm_key = task.llm_key or settings.VOLCENGINE_CUA_LLM_KEY
             llm_caller = UITarsLLMCaller(llm_key=llm_key, screenshot_scaling_enabled=True)
             llm_caller.initialize_conversation(task)
 
@@ -48,10 +48,10 @@ if SettingsManager.get_settings().ENABLE_BEDROCK_ANTHROPIC:
 
 # Add UI-TARS client setup
 UI_TARS_CLIENT = None
-if SettingsManager.get_settings().ENABLE_UI_TARS:
+if SettingsManager.get_settings().ENABLE_VOLCENGINE:
     UI_TARS_CLIENT = AsyncOpenAI(
-        api_key=SettingsManager.get_settings().UI_TARS_API_KEY,
-        base_url=SettingsManager.get_settings().UI_TARS_API_BASE,
+        api_key=SettingsManager.get_settings().VOLCENGINE_API_KEY,
+        base_url=SettingsManager.get_settings().VOLCENGINE_API_BASE,
     )
 
 SECONDARY_LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(
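
The Ark base URL serves an OpenAI-compatible chat-completions API, which is why a stock AsyncOpenAI client is all this setup needs. A minimal usage sketch (the key placeholder and prompt are illustrative; the model ID is one of the endpoints used in this PR):

    import asyncio

    from openai import AsyncOpenAI

    client = AsyncOpenAI(
        api_key="<VOLCENGINE_API_KEY>",
        base_url="https://ark.cn-beijing.volces.com/api/v3",
    )


    async def main() -> None:
        # Plain chat-completions call against the Doubao endpoint.
        response = await client.chat.completions.create(
            model="doubao-1-5-thinking-vision-pro-250428",
            messages=[{"role": "user", "content": "ping"}],
        )
        print(response.choices[0].message.content)


    asyncio.run(main())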
@@ -198,6 +198,7 @@ class LLMAPIHandlerFactory:
             )
             if step or thought:
                 try:
+                    # FIXME: volcengine doesn't support litellm cost calculation.
                     llm_cost = litellm.completion_cost(completion_response=response)
                 except Exception as e:
                     LOG.debug("Failed to calculate LLM cost", error=str(e), exc_info=True)
@@ -401,6 +402,7 @@ class LLMAPIHandlerFactory:
 
         if step or thought:
             try:
+                # FIXME: volcengine doesn't support litellm cost calculation.
                 llm_cost = litellm.completion_cost(completion_response=response)
             except Exception as e:
                 LOG.debug("Failed to calculate LLM cost", error=str(e), exc_info=True)
@@ -746,7 +748,7 @@ class LLMCaller:
         tools: list | None = None,
         timeout: float = settings.LLM_CONFIG_TIMEOUT,
         **active_parameters: dict[str, Any],
-    ) -> ModelResponse | CustomStreamWrapper | AnthropicMessage | Any:
+    ) -> ModelResponse | CustomStreamWrapper | AnthropicMessage | UITarsResponse:
         if self.llm_key and "ANTHROPIC" in self.llm_key:
             return await self._call_anthropic(messages, tools, timeout, **active_parameters)
 
@@ -802,14 +804,14 @@ class LLMCaller:
         tools: list | None = None,
         timeout: float = settings.LLM_CONFIG_TIMEOUT,
         **active_parameters: dict[str, Any],
-    ) -> Any:
+    ) -> UITarsResponse:
         """Custom UI-TARS API call using OpenAI client with VolcEngine endpoint."""
         max_tokens = active_parameters.get("max_completion_tokens") or active_parameters.get("max_tokens") or 400
-        model_name = self.llm_config.model_name
+        model_name = self.llm_config.model_name.replace("volcengine/", "")
 
         if not app.UI_TARS_CLIENT:
             raise ValueError(
-                "UI_TARS_CLIENT not initialized. Please ensure ENABLE_UI_TARS=true and UI_TARS_API_KEY is set."
+                "UI_TARS_CLIENT not initialized. Please ensure ENABLE_VOLCENGINE=true and VOLCENGINE_API_KEY is set."
             )
 
         LOG.info(
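
The new .replace("volcengine/", "") strip is needed because the registry entries added later in this diff use litellm-style volcengine/<model> names, while the raw Ark endpoint expects the bare model ID. The transformation in isolation:

    # litellm-style name, as registered in the config registry
    model_name = "volcengine/doubao-seed-1.6-250615"

    # Ark's OpenAI-compatible endpoint wants the bare model ID
    assert model_name.replace("volcengine/", "") == "doubao-seed-1.6-250615"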
@@ -851,39 +853,18 @@ class LLMCaller:
         return response
 
     async def get_call_stats(
-        self, response: ModelResponse | CustomStreamWrapper | AnthropicMessage | dict[str, Any] | Any
+        self, response: ModelResponse | CustomStreamWrapper | AnthropicMessage | UITarsResponse
     ) -> LLMCallStats:
         empty_call_stats = LLMCallStats()
 
         # Handle UI-TARS response (UITarsResponse object from _call_ui_tars)
-        if hasattr(response, "usage") and hasattr(response, "choices") and hasattr(response, "model"):
-            usage = response.usage
-            # Use Doubao pricing: ¥0.8/1M input, ¥2/1M output (convert to USD: ~$0.11/$0.28)
-            input_token_cost = (0.11 / 1000000) * usage.get("prompt_tokens", 0)
-            output_token_cost = (0.28 / 1000000) * usage.get("completion_tokens", 0)
-            llm_cost = input_token_cost + output_token_cost
-
+        if isinstance(response, UITarsResponse):
+            ui_tars_usage = response.usage
             return LLMCallStats(
-                llm_cost=llm_cost,
-                input_tokens=usage.get("prompt_tokens", 0),
-                output_tokens=usage.get("completion_tokens", 0),
-                cached_tokens=0,  # UI-TARS doesn't have cached tokens
-                reasoning_tokens=0,
-            )
-
-        # Handle UI-TARS response (dict format - fallback)
-        if isinstance(response, dict) and "choices" in response and "usage" in response:
-            usage = response["usage"]
-            # Use Doubao pricing: ¥0.8/1M input, ¥2/1M output (convert to USD: ~$0.11/$0.28)
-            input_token_cost = (0.11 / 1000000) * usage.get("prompt_tokens", 0)
-            output_token_cost = (0.28 / 1000000) * usage.get("completion_tokens", 0)
-            llm_cost = input_token_cost + output_token_cost
-
-            return LLMCallStats(
-                llm_cost=llm_cost,
-                input_tokens=usage.get("prompt_tokens", 0),
-                output_tokens=usage.get("completion_tokens", 0),
-                cached_tokens=0,  # UI-TARS doesn't have cached tokens
+                llm_cost=0,  # TODO: calculate the cost according to the price: https://www.volcengine.com/docs/82379/1544106
+                input_tokens=ui_tars_usage.get("prompt_tokens", 0),
+                output_tokens=ui_tars_usage.get("completion_tokens", 0),
+                cached_tokens=0,  # only part of model support cached tokens
                 reasoning_tokens=0,
             )
 
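
The hard-coded Doubao pricing is dropped here; cost now reports 0 pending the linked price list. If the old numbers were still wanted as a stopgap, the removed arithmetic amounts to this sketch (the ~$0.11/$0.28 per 1M token figures come from the deleted comment and may be stale):

    def doubao_cost_estimate(usage: dict[str, int]) -> float:
        # Stopgap estimate using the pricing quoted in the removed code:
        # ~$0.11 per 1M input tokens, ~$0.28 per 1M output tokens.
        input_cost = (0.11 / 1_000_000) * usage.get("prompt_tokens", 0)
        output_cost = (0.28 / 1_000_000) * usage.get("completion_tokens", 0)
        return input_cost + output_cost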
@@ -568,16 +568,46 @@ if settings.ENABLE_AZURE_O3:
             max_completion_tokens=100000,
         ),
     )
-if settings.ENABLE_UI_TARS:
+if settings.ENABLE_VOLCENGINE:
     LLMConfigRegistry.register_config(
-        "UI_TARS_SEED1_5_VL",
+        "VOLCENGINE_DOUBAO_SEED_1_6",
         LLMConfig(
-            settings.UI_TARS_MODEL,
-            ["UI_TARS_API_KEY"],
+            "volcengine/doubao-seed-1.6-250615",
+            ["VOLCENGINE_API_KEY"],
+            litellm_params=LiteLLMParams(
+                api_base=settings.VOLCENGINE_API_BASE,
+                api_key=settings.VOLCENGINE_API_KEY,
+            ),
+            supports_vision=True,
+            add_assistant_prefix=False,
+        ),
+    )
+
+    LLMConfigRegistry.register_config(
+        "VOLCENGINE_DOUBAO_SEED_1_6_FLASH",
+        LLMConfig(
+            "volcengine/doubao-seed-1.6-flash-250615",
+            ["VOLCENGINE_API_KEY"],
+            litellm_params=LiteLLMParams(
+                api_base=settings.VOLCENGINE_API_BASE,
+                api_key=settings.VOLCENGINE_API_KEY,
+            ),
+            supports_vision=True,
+            add_assistant_prefix=False,
+        ),
+    )
+
+    LLMConfigRegistry.register_config(
+        "VOLCENGINE_DOUBAO_1_5_THINKING_VISION_PRO",
+        LLMConfig(
+            "volcengine/doubao-1-5-thinking-vision-pro-250428",
+            ["VOLCENGINE_API_KEY"],
+            litellm_params=LiteLLMParams(
+                api_base=settings.VOLCENGINE_API_BASE,
+                api_key=settings.VOLCENGINE_API_KEY,
+            ),
             supports_vision=True,
             add_assistant_prefix=False,
-            max_tokens=400,
-            temperature=0.0,
         ),
     )
 
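
With these three configs registered, a model is selected by key, e.g. LLM_KEY=VOLCENGINE_DOUBAO_SEED_1_6 in .env. The interactive UI_TARS_MODEL override is gone; pointing at a different Ark endpoint ID would mean one more registration in the same shape as above (the key and model ID below are hypothetical; names as in this hunk):

    LLMConfigRegistry.register_config(
        "VOLCENGINE_CUSTOM",  # hypothetical registry key
        LLMConfig(
            "volcengine/<your-ark-endpoint-id>",  # hypothetical endpoint ID
            ["VOLCENGINE_API_KEY"],
            litellm_params=LiteLLMParams(
                api_base=settings.VOLCENGINE_API_BASE,
                api_key=settings.VOLCENGINE_API_KEY,
            ),
            supports_vision=True,
            add_assistant_prefix=False,
        ),
    )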
@@ -62,7 +62,7 @@ class UITarsLLMCaller(LLMCaller):
         # Handle None case for navigation_goal
         instruction = task.navigation_goal or "Default navigation task"
         system_prompt = _build_system_prompt(instruction)
-        self.message_history = [{"role": "user", "content": system_prompt}]
+        self.message_history: list = [{"role": "user", "content": system_prompt}]
         self._conversation_initialized = True
         LOG.debug("Initialized UI-TARS conversation", task_id=task.task_id)
 
@@ -3,21 +3,25 @@
 import json
 from typing import Any
 
+from anthropic import BaseModel
+
+
+class Message:
+    def __init__(self, content: str):
+        self.content = content
+        self.role = "assistant"
+
+
+class Choice:
+    def __init__(self, content: str):
+        self.message = Message(content)
+
 
-class UITarsResponse:
+class UITarsResponse(BaseModel):
     """A response object that mimics the ModelResponse interface for UI-TARS API responses."""
 
     def __init__(self, content: str, model: str):
         # Create choice objects with proper nested structure for parse_api_response
-        class Message:
-            def __init__(self, content: str):
-                self.content = content
-                self.role = "assistant"
-
-        class Choice:
-            def __init__(self, content: str):
-                self.message = Message(content)
-
         self.choices = [Choice(content)]
         self.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
         self.model = model
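
The now module-level Message/Choice pair is what gives UITarsResponse the nested shape that parse_api_response walks (response.choices[0].message.content). A quick sketch of the helpers in isolation (the content string is illustrative):

    choice = Choice("example model output")
    assert choice.message.role == "assistant"
    print(choice.message.content)  # "example model output"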