From d6b07a8a555a5dd4c6e60ea3de16396017316d81 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 11 Mar 2026 20:52:05 +0000 Subject: [PATCH 1/2] Revert "feat: auto-detect pre-built Docker images across all benchmarks (#456)" This reverts commit 2bfcc6c6e19e81bdc727127fd019d5e0dd964826. The auto-detect feature introduced in #456 is causing slow image builds and timeouts. Reverting it brings back the previous image-building behavior so that benchmark builds return to their earlier performance. Fixes #502 Co-authored-by: openhands --- benchmarks/commit0/run_infer.py | 48 ++-- benchmarks/gaia/run_infer.py | 25 +- benchmarks/multiswebench/run_infer.py | 67 ++++-- benchmarks/swebench/run_infer.py | 79 +++++-- benchmarks/swebenchmultimodal/run_infer.py | 66 ++++-- benchmarks/swefficiency/run_infer.py | 71 ++++-- benchmarks/swtbench/run_infer.py | 70 ++++-- benchmarks/utils/build_utils.py | 48 +--- benchmarks/utils/image_utils.py | 71 +----- tests/test_image_utils.py | 257 --------------------- tests/test_llm_config.py | 10 +- vendor/software-agent-sdk | 2 +- 12 files changed, 315 insertions(+), 499 deletions(-) delete mode 100644 tests/test_image_utils.py diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index b9a08102a..03ddecb85 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -1,6 +1,7 @@ import json import os from collections import Counter +from pathlib import Path from typing import Any, List from commit0.harness.constants import SPLIT @@ -12,7 +13,7 @@ get_base_docker_image, ) from benchmarks.commit0.config import INFER_DEFAULTS -from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser +from benchmarks.utils.args_parser import get_parser from benchmarks.utils.console_logging import summarize_instance from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -23,19 +24,19 @@ construct_eval_output_dir, 
get_default_on_result_writer, ) -from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.image_utils import image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import IMAGE_TAG_PREFIX +from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import Agent, Conversation, Tool, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace +from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace logger = get_logger(__name__) @@ -187,16 +188,15 @@ def prepare_workspace( logger.info(f"Using base docker image: {base_docker_image}") if self.metadata.workspace_type == "docker": - custom_tag = extract_custom_tag(base_docker_image) - suffix = f"-{build_target}" if build_target != "binary" else "" - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" - ) - workspace = create_docker_workspace( - agent_server_image=agent_server_image, + # Build agent-server image from base commit0 image + workspace = DockerDevWorkspace( base_image=base_docker_image, - build_target=build_target, - forward_env=forward_env, + working_dir="/workspace", + target=build_target, + forward_env=forward_env or [], + ) + logger.info( + f"Building workspace from {base_docker_image}. This may take a while..." 
) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") @@ -205,13 +205,14 @@ def prepare_workspace( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) custom_tag = extract_custom_tag(base_docker_image) suffix = f"-{build_target}" if build_target != "binary" else "" agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" ) - if not remote_image_exists(agent_server_image): + if not image_exists(agent_server_image): raise RuntimeError( f"Agent server image {agent_server_image} does not exist in container registry. " "Run 'benchmarks/commit0/build_images.py --push' to build and push it first." @@ -219,7 +220,7 @@ def prepare_workspace( logger.info( f"Using remote workspace with image {agent_server_image} " - f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" + f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -591,8 +592,21 @@ def evaluate_instance( def main() -> None: + prompt_dir = (Path(__file__).parent / "prompts").resolve() + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + default_prompt_path = prompt_dir / "default.j2" + assert default_prompt_path.exists(), ( + f"Default prompt {default_prompt_path} not found" + ) + parser = get_parser() - add_prompt_path_argument(parser, __file__) + parser.add_argument( + "--prompt-path", + type=str, + default=str(default_prompt_path), + choices=choices, + help="Path to prompt template file", + ) parser.add_argument( "--repo-split", type=str, diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index a100a2cfb..cfbf0682c 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -27,10 
+27,10 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.image_utils import image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput -from benchmarks.utils.version import IMAGE_TAG_PREFIX +from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import ( Agent, Conversation, @@ -47,7 +47,7 @@ from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace +from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace logger = get_logger(__name__) @@ -156,14 +156,11 @@ def prepare_workspace( logger.info(f"Preparing workspace for instance {instance.id}") if self.metadata.workspace_type == "docker": - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary" - ) - workspace = create_docker_workspace( - agent_server_image=agent_server_image, + # Use DockerDevWorkspace with base image (same as main branch) + workspace = DockerDevWorkspace( base_image="nikolaik/python-nodejs:python3.12-nodejs22", - build_target="binary", - forward_env=forward_env, + working_dir="/workspace", + forward_env=forward_env or [], ) elif self.metadata.workspace_type == "remote": # For workflow, use APIRemoteWorkspace with pre-built GAIA image @@ -177,11 +174,12 @@ def prepare_workspace( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary" + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary" ) - if not remote_image_exists(agent_server_image): + if not 
image_exists(agent_server_image): raise RuntimeError( f"Agent server image {agent_server_image} does not exist in container registry. " f"Run 'benchmarks/gaia/build_images.py --push' to build and push it first." @@ -189,7 +187,7 @@ def prepare_workspace( logger.info( f"Using remote workspace with GAIA image {agent_server_image} " - f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" + f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -592,7 +590,6 @@ def main() -> None: max_attempts=args.max_attempts, critic=critic, selected_instances_file=args.select, - max_retries=args.max_retries, workspace_type=args.workspace, enable_delegation=args.enable_delegation, ) diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py index 9d47d87e9..20b926a61 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -1,5 +1,6 @@ import json import os +from pathlib import Path from typing import List, cast import pandas as pd @@ -12,8 +13,8 @@ ) from benchmarks.multiswebench.download_dataset import download_and_concat_dataset from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference -from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser -from benchmarks.utils.build_utils import ensure_local_image +from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.build_utils import build_image from benchmarks.utils.console_logging import summarize_instance from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -25,14 +26,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import remote_image_exists +from 
benchmarks.utils.image_utils import image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import IMAGE_TAG_PREFIX +from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import Agent, Conversation, Tool, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool @@ -209,14 +210,38 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) - ensure_local_image( - agent_server_image=agent_server_image, - base_image=official_docker_image, - custom_tag=custom_tag, - target=build_target, + SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in ( + "1", + "true", + "yes", ) + logger.info(f"MULTI_SWE_BENCH_SKIP_BUILD={SKIP_BUILD}") + if not SKIP_BUILD: + logger.info( + f"Building workspace from {official_docker_image} " + f"for instance {instance.id}. " + "This may take a while...\n" + "You can run benchmarks/multiswebench/build_images.py and set " + "MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " + "agent-server image." 
+ ) + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, + ) + logger.info(f"Image build output: {output}") + assert output.error is None, f"Image build failed: {output.error}" + if agent_server_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{agent_server_image}" + ) + workspace = DockerWorkspace( server_image=agent_server_image, working_dir="/workspace", @@ -224,22 +249,23 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" ) - if not remote_image_exists(agent_server_image): + if not image_exists(agent_server_image): raise RuntimeError( f"Agent server image {agent_server_image} does not exist in container registry, " "make sure to build, push it, and make it public accessible before using remote workspace." 
) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" + f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -402,8 +428,21 @@ def evaluate_instance( def main() -> None: + prompt_dir = (Path(__file__).parent / "prompts").resolve() + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + default_prompt_path = prompt_dir / "default.j2" + assert default_prompt_path.exists(), ( + f"Default prompt {default_prompt_path} not found" + ) + parser = get_parser() - add_prompt_path_argument(parser, __file__) + parser.add_argument( + "--prompt-path", + type=str, + default=str(default_prompt_path), + choices=choices, + help="Path to prompt template file", + ) parser.add_argument( "--lang", type=str, diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 065c3aecd..e4e6af9f4 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -1,5 +1,6 @@ import json import os +from pathlib import Path from typing import List from jinja2 import Environment, FileSystemLoader @@ -12,8 +13,8 @@ wrap_image, ) from benchmarks.swebench.config import INFER_DEFAULTS -from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser -from benchmarks.utils.build_utils import ensure_local_image +from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.build_utils import build_image from benchmarks.utils.console_logging import summarize_instance from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -25,7 +26,7 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import 
remote_image_exists +from benchmarks.utils.image_utils import image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -33,7 +34,7 @@ EvalOutput, ToolPresetType, ) -from benchmarks.utils.version import IMAGE_TAG_PREFIX +from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import Agent, Conversation, Tool, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool @@ -153,30 +154,44 @@ def prepare_workspace( f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else "" ) base_agent_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) wrap_needed = should_wrap_instance_id(instance.id) agent_server_image = base_agent_image if self.metadata.workspace_type == "docker": - built = ensure_local_image( - agent_server_image=base_agent_image, - base_image=official_docker_image, - custom_tag=custom_tag, - target=build_target, - ) - if built and wrap_needed: - wrapped_result = wrap_image(base_agent_image, push=False) - if wrapped_result.error: - raise RuntimeError( - "Wrapped image build failed: " - f"{wrapped_result.error}; log={wrapped_result.log_path}" - ) - elif not built and wrap_needed: + SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") + logger.info(f"SKIP_BUILD={SKIP_BUILD}") + if not SKIP_BUILD: logger.info( - f"Using pre-built image {base_agent_image} " - "(assumed already wrapped)" + f"Building workspace from {official_docker_image} " + f"for instance {instance.id}. " + "This may take a while...\n" + "You can run benchmarks/swebench/build_images.py and set " + "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " + "agent-server image." 
) + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, + ) + logger.info(f"Image build output: {output}") + assert output.error is None, f"Image build failed: {output.error}" + if base_agent_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{base_agent_image}" + ) + if wrap_needed: + wrapped_result = wrap_image(base_agent_image, push=False) + if wrapped_result.error: + raise RuntimeError( + "Wrapped image build failed: " + f"{wrapped_result.error}; log={wrapped_result.log_path}" + ) workspace = DockerWorkspace( server_image=agent_server_image, @@ -185,22 +200,23 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" ) - if not remote_image_exists(agent_server_image): + if not image_exists(agent_server_image): raise RuntimeError( f"Agent server image {agent_server_image} does not exist in container registry, " "make sure to build, push it, and make it public accessible before using remote workspace." 
) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" + f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) startup_timeout = float( os.getenv( @@ -347,8 +363,21 @@ def evaluate_instance( def main() -> None: + prompt_dir = (Path(__file__).parent / "prompts").resolve() + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + default_prompt_path = prompt_dir / "default.j2" + assert default_prompt_path.exists(), ( + f"Default prompt {default_prompt_path} not found" + ) + parser = get_parser() - add_prompt_path_argument(parser, __file__) + parser.add_argument( + "--prompt-path", + type=str, + default=str(default_prompt_path), + choices=choices, + help="Path to prompt template file", + ) parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 7eb3d348f..295b3fd13 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -1,5 +1,6 @@ import json import os +from pathlib import Path from typing import List import requests @@ -10,8 +11,8 @@ get_official_docker_image, ) from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS -from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser -from benchmarks.utils.build_utils import ensure_local_image +from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.build_utils import build_image from benchmarks.utils.console_logging import summarize_instance from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -23,14 +24,14 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import remote_image_exists +from 
benchmarks.utils.image_utils import image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import IMAGE_TAG_PREFIX +from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import ( Agent, Conversation, @@ -162,14 +163,35 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" - ) - ensure_local_image( - agent_server_image=agent_server_image, - base_image=official_docker_image, - custom_tag=custom_tag, - target=build_target, + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) + SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") + logger.info(f"SKIP_BUILD={SKIP_BUILD}") + if not SKIP_BUILD: + logger.info( + f"Building workspace from {official_docker_image} " + f"for instance {instance.id}. " + "This may take a while...\n" + "You can run benchmarks/swebenchmultimodal/build_images.py and set " + "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " + "agent-server image." 
+ ) + + output = build_image( + base_image=official_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, + ) + logger.info(f"Image build output: {output}") + assert output.error is None, f"Image build failed: {output.error}" + if agent_server_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{agent_server_image}" + ) + workspace = DockerWorkspace( server_image=agent_server_image, working_dir="/workspace", @@ -177,22 +199,23 @@ def prepare_workspace( ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" ) - if not remote_image_exists(agent_server_image): + if not image_exists(agent_server_image): raise RuntimeError( f"Agent server image {agent_server_image} does not exist in container registry, " "make sure to build, push it, and make it public accessible before using remote workspace." 
) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" + f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -400,8 +423,21 @@ def evaluate_instance( def main() -> None: + prompt_dir = (Path(__file__).parent / "prompts").resolve() + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + default_prompt_path = prompt_dir / "default.j2" + assert default_prompt_path.exists(), ( + f"Default prompt {default_prompt_path} not found" + ) + parser = get_parser() - add_prompt_path_argument(parser, __file__) + parser.add_argument( + "--prompt-path", + type=str, + default=str(default_prompt_path), + choices=choices, + help="Path to prompt template file", + ) # Apply INFER_DEFAULTS from config (matches evaluation repository values.yaml) parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() diff --git a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py index bb3efd908..1f418a323 100644 --- a/benchmarks/swefficiency/run_infer.py +++ b/benchmarks/swefficiency/run_infer.py @@ -1,6 +1,7 @@ import json import multiprocessing import os +from pathlib import Path from typing import Any, List from jinja2 import Environment, FileSystemLoader @@ -9,8 +10,8 @@ from benchmarks.swefficiency import constants from benchmarks.swefficiency.config import DOCKER_DEFAULTS, INFER_DEFAULTS from benchmarks.swefficiency.workspace import ResourceLimitedDockerWorkspace -from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser -from benchmarks.utils.build_utils import ensure_local_image +from benchmarks.utils.args_parser import get_parser +from benchmarks.utils.build_utils import build_image from benchmarks.utils.conversation import build_event_persistence_callback from benchmarks.utils.critics import 
create_critic from benchmarks.utils.dataset import get_dataset @@ -20,13 +21,13 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.image_utils import image_exists from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import IMAGE_TAG_PREFIX +from benchmarks.utils.version import SDK_SHORT_SHA from openhands.sdk import LLM, Agent, Conversation, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.preset.default import get_default_tools @@ -200,19 +201,37 @@ def prepare_workspace( # Build agent server image tag suffix = f"-{build_target}" if build_target != "binary" else "" agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) logger.info(f"Base image: {base_docker_image}") logger.info(f"Agent server image: {agent_server_image}") if self.metadata.workspace_type == "docker": - ensure_local_image( - agent_server_image=agent_server_image, - base_image=base_docker_image, - custom_tag=custom_tag, - target=build_target, - ) + # Build agent-server image from base swefficiency image + SKIP_BUILD = os.getenv("SKIP_BUILD", "0").lower() in ("1", "true", "yes") + logger.info(f"SKIP_BUILD={SKIP_BUILD}") + + if not SKIP_BUILD: + logger.info( + f"Building workspace from {base_docker_image} " + f"for instance {instance.id}. " + "This may take a while..." 
+ ) + output = build_image( + base_image=base_docker_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=build_target, + push=False, + ) + logger.info(f"Image build output: {output}") + assert output.error is None, f"Image build failed: {output.error}" + if agent_server_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{agent_server_image}" + ) # Get CPU group for resource limiting cpu_group = self._acquire_cpu_group() @@ -237,20 +256,25 @@ def prepare_workspace( elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) - if not remote_image_exists(agent_server_image): + # For remote, use SDK_SHORT_SHA from env if available + remote_agent_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" + ) + if not image_exists(remote_agent_image): raise RuntimeError( - f"Agent server image {agent_server_image} does not exist in container registry, " + f"Agent server image {remote_agent_image} does not exist in container registry, " "make sure to build, push it, and make it public accessible before using remote workspace." 
) logger.info( - f"Using remote workspace with image {agent_server_image} " - f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" + f"Using remote workspace with image {remote_agent_image} " + f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) workspace = APIRemoteWorkspace( @@ -258,7 +282,7 @@ def prepare_workspace( "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev" ), runtime_api_key=runtime_api_key, - server_image=agent_server_image, + server_image=remote_agent_image, target_type="source", forward_env=forward_env or [], resource_factor=resource_factor, @@ -389,8 +413,21 @@ def evaluate_instance( def main() -> None: + prompt_dir = (Path(__file__).parent / "prompts").resolve() + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + default_prompt_path = prompt_dir / "default.j2" + assert default_prompt_path.exists(), ( + f"Default prompt {default_prompt_path} not found" + ) + parser = get_parser() - add_prompt_path_argument(parser, __file__) + parser.add_argument( + "--prompt-path", + type=str, + default=str(default_prompt_path), + choices=choices, + help="Path to prompt template file", + ) parser.add_argument( "--num-cpus-per-worker", type=int, diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index bdfb7b13e..f795a1a18 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -1,11 +1,12 @@ import json import os +from pathlib import Path from typing import List from jinja2 import Environment, FileSystemLoader from benchmarks.swtbench.config import INFER_DEFAULTS -from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser +from benchmarks.utils.args_parser import get_parser from benchmarks.utils.console_logging import summarize_instance from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -17,19 +18,20 @@ 
get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.image_utils import image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, EvalMetadata, EvalOutput, ) -from benchmarks.utils.version import IMAGE_TAG_PREFIX +from benchmarks.utils.version import SDK_SHORT_SHA +from openhands.agent_server.docker.build import _base_slug from openhands.sdk import Agent, Conversation, Tool, __version__, get_logger from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace +from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace logger = get_logger(__name__) @@ -54,10 +56,6 @@ def get_agent_server_docker_image( target: str = "source-minimal", ) -> str: """Get the agent server Docker image for an instance.""" - # Importing here because openhands.agent_server.docker.build runs git checks - # which fails when installed as a package outside the git repo - from openhands.agent_server.docker.build import _base_slug - official_image_name = get_official_docker_image(instance_id, docker_image_prefix) return ( "ghcr.io/all-hands-ai/agent-server" @@ -171,32 +169,51 @@ def prepare_workspace( if self.metadata.workspace_type == "docker": agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" - ) - workspace = create_docker_workspace( - agent_server_image=agent_server_image, - base_image=official_docker_image, - build_target=build_target, - forward_env=forward_env, + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) + SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") + 
logger.info(f"SKIP_BUILD={SKIP_BUILD}") + if not SKIP_BUILD: + logger.info( + f"Building workspace from {official_docker_image} " + f"for instance {instance.id}. " + "This may take a while...\n" + "You can run benchmarks/swtbench/build_images.py and set " + "SKIP_BUILD=1 to skip building and use pre-built " + "agent-server image." + ) + # For SWT-bench, we use DockerDevWorkspace with base_image + workspace = DockerDevWorkspace( + base_image=official_docker_image, + working_dir="/workspace", + target=build_target, + forward_env=forward_env or [], + ) + else: + workspace = DockerWorkspace( + server_image=agent_server_image, + working_dir="/workspace", + forward_env=forward_env or [], + ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") + sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) if not runtime_api_key: raise ValueError( "RUNTIME_API_KEY environment variable is not set for remote workspace" ) agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}" ) - if not remote_image_exists(agent_server_image): + if not image_exists(agent_server_image): raise RuntimeError( f"Agent server image {agent_server_image} does not exist in container registry, " "make sure to build, push it, and make it public accessible before using remote workspace." 
) logger.info( f"Using remote workspace with image {agent_server_image} " - f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})" + f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})" ) startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600")) workspace = APIRemoteWorkspace( @@ -339,8 +356,21 @@ def evaluate_instance( def main() -> None: """Main entry point for SWT-bench evaluation.""" + prompt_dir = (Path(__file__).parent / "prompts").resolve() + choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")] + default_prompt_path = prompt_dir / "default.j2" + assert default_prompt_path.exists(), ( + f"Default prompt {default_prompt_path} not found" + ) + parser = get_parser() - add_prompt_path_argument(parser, __file__) + parser.add_argument( + "--prompt-path", + type=str, + default=str(default_prompt_path), + choices=choices, + help="Path to prompt template file", + ) parser.set_defaults(**INFER_DEFAULTS) args = parser.parse_args() @@ -360,7 +390,7 @@ def main() -> None: dataset_name=dataset_description, model_name=llm.model, max_iterations=args.max_iterations, - eval_note=f"SWT-{args.note}" if args.note else None, + eval_note="SWT-" + args.note, ) critic = create_critic(args) diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index b4e56b9ad..9c700f1d8 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -20,7 +20,6 @@ from pydantic import BaseModel, Field from tqdm.auto import tqdm -from benchmarks.swebench.constants import TargetType from benchmarks.utils.args_parser import get_parser from benchmarks.utils.buildx_utils import ( buildkit_disk_usage, @@ -28,7 +27,8 @@ maybe_reset_buildkit, ) from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE -from benchmarks.utils.image_utils import local_image_exists, remote_image_exists +from benchmarks.utils.image_utils import image_exists +from openhands.agent_server.docker.build import 
BuildOptions, TargetType, build from openhands.sdk import get_logger @@ -282,10 +282,6 @@ def build_image( target: TargetType = "source-minimal", push: bool = False, ) -> BuildOutput: - # Importing here because openhands.agent_server.docker.build runs git checks - # which fails when installed as a package outside the git repo - from openhands.agent_server.docker.build import BuildOptions, build - # Get SDK info from submodule to ensure tags use the correct SDK SHA git_ref, git_sha, sdk_version = _get_sdk_submodule_info() @@ -304,51 +300,13 @@ def build_image( ) for t in opts.all_tags: # Check if image exists or not - if remote_image_exists(t): + if image_exists(t): logger.info("Image %s already exists. Skipping build.", t) return BuildOutput(base_image=base_image, tags=[t], error=None) tags = build(opts) return BuildOutput(base_image=base_image, tags=tags, error=None) -def ensure_local_image( - agent_server_image: str, - base_image: str, - custom_tag: str, - target: TargetType = "source-minimal", -) -> bool: - """Build an agent-server image locally if it doesn't already exist. - - Returns True if a build occurred, False if the image already existed. - Set FORCE_BUILD=1 to skip auto-detection and always rebuild. 
- """ - force_build = os.getenv("FORCE_BUILD", "0").lower() in ("1", "true", "yes") - if not force_build and local_image_exists(agent_server_image): - logger.info(f"Using pre-built image {agent_server_image}") - return False - - if force_build: - logger.info(f"FORCE_BUILD set, building image from {base_image}...") - else: - logger.info(f"Building image from {base_image}...") - output = build_image( - base_image=base_image, - target_image=EVAL_AGENT_SERVER_IMAGE, - custom_tag=custom_tag, - target=target, - push=False, - ) - logger.info(f"Image build output: {output}") - if output.error is not None: - raise RuntimeError(f"Image build failed: {output.error}") - if agent_server_image not in output.tags: - raise RuntimeError( - f"Built image tags {output.tags} do not include expected tag " - f"{agent_server_image}" - ) - return True - - def _build_with_logging( log_dir: Path, base_image: str, diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py index 467074cb9..a463f3b4f 100644 --- a/benchmarks/utils/image_utils.py +++ b/benchmarks/utils/image_utils.py @@ -1,24 +1,9 @@ #!/usr/bin/env python3 -from __future__ import annotations - import base64 -import os -import subprocess import sys -from typing import TYPE_CHECKING - - -if TYPE_CHECKING: - from openhands.sdk.workspace import TargetType - from openhands.workspace import DockerDevWorkspace, DockerWorkspace import requests -from openhands.sdk import get_logger - - -logger = get_logger(__name__) - ACCEPT = ",".join( [ @@ -69,64 +54,12 @@ def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None: return None -def local_image_exists(image: str) -> bool: - """Check if a Docker image exists in the local Docker daemon.""" - try: - result = subprocess.run( - ["docker", "image", "inspect", image], - capture_output=True, - check=False, - timeout=5, - ) - return result.returncode == 0 - except (subprocess.TimeoutExpired, FileNotFoundError) as e: - logger.warning(f"Failed to check if 
image {image} exists: {e}") - return False - - -def create_docker_workspace( - agent_server_image: str, - base_image: str, - build_target: TargetType, - working_dir: str = "/workspace", - forward_env: list[str] | None = None, -) -> DockerWorkspace | DockerDevWorkspace: - """Create a Docker workspace, building the image only if not already available. - - Returns DockerWorkspace when a pre-built image is found locally, - DockerDevWorkspace otherwise (which builds on-the-fly). - Set FORCE_BUILD=1 to skip auto-detection and always build. - """ - from openhands.workspace import DockerDevWorkspace, DockerWorkspace - - force_build = os.getenv("FORCE_BUILD", "0").lower() in ("1", "true", "yes") - if not force_build and local_image_exists(agent_server_image): - logger.info(f"Using pre-built image {agent_server_image}") - return DockerWorkspace( - server_image=agent_server_image, - working_dir=working_dir, - forward_env=forward_env or [], - ) - else: - if force_build: - logger.info(f"FORCE_BUILD set, building workspace from {base_image}...") - else: - logger.info(f"Building workspace from {base_image}...") - return DockerDevWorkspace( - base_image=base_image, - working_dir=working_dir, - target=build_target, - forward_env=forward_env or [], - ) - - -def remote_image_exists( +def image_exists( image_ref: str, gh_username: str | None = None, gh_pat: str | None = None, # GitHub PAT with read:packages for private GHCR docker_token: str | None = None, # Docker Hub JWT if you already have one ) -> bool: - """Check if a Docker image exists in a remote registry.""" registry, repo, ref = _parse(image_ref) headers = {"Accept": ACCEPT} @@ -168,5 +101,5 @@ def remote_image_exists( gh_user = sys.argv[2] if len(sys.argv) > 2 else None gh_pat = sys.argv[3] if len(sys.argv) > 3 else None - ok = remote_image_exists(image, gh_username=gh_user, gh_pat=gh_pat) + ok = image_exists(image, gh_username=gh_user, gh_pat=gh_pat) print(f"{image} -> {'✅ exists' if ok else '❌ not found or 
unauthorized'}") diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py deleted file mode 100644 index c46830cb6..000000000 --- a/tests/test_image_utils.py +++ /dev/null @@ -1,257 +0,0 @@ -"""Tests for image_utils and build_utils helper functions. - -Tests cover local_image_exists(), create_docker_workspace(), and ensure_local_image() -which centralize Docker image detection and build logic across all benchmarks. -""" - -import os -import subprocess -from unittest.mock import MagicMock, patch - -import pytest - -from benchmarks.utils.build_utils import BuildOutput - - -class TestLocalImageExists: - """Tests for local_image_exists().""" - - @patch("benchmarks.utils.image_utils.subprocess.run") - def test_image_exists(self, mock_run): - from benchmarks.utils.image_utils import local_image_exists - - mock_run.return_value = MagicMock(returncode=0) - assert local_image_exists("myimage:latest") is True - mock_run.assert_called_once_with( - ["docker", "image", "inspect", "myimage:latest"], - capture_output=True, - check=False, - timeout=5, - ) - - @patch("benchmarks.utils.image_utils.subprocess.run") - def test_image_not_found(self, mock_run): - from benchmarks.utils.image_utils import local_image_exists - - mock_run.return_value = MagicMock(returncode=1) - assert local_image_exists("noimage:latest") is False - - @patch("benchmarks.utils.image_utils.subprocess.run") - def test_timeout_returns_false(self, mock_run): - from benchmarks.utils.image_utils import local_image_exists - - mock_run.side_effect = subprocess.TimeoutExpired(cmd="docker", timeout=5) - assert local_image_exists("myimage:latest") is False - - @patch("benchmarks.utils.image_utils.subprocess.run") - def test_docker_not_installed_returns_false(self, mock_run): - from benchmarks.utils.image_utils import local_image_exists - - mock_run.side_effect = FileNotFoundError("docker not found") - assert local_image_exists("myimage:latest") is False - - -class TestCreateDockerWorkspace: - """Tests for 
create_docker_workspace(). - - These tests mock the Docker daemon interaction (local_image_exists) and - workspace constructors (which connect to Docker), but verify the actual - branching logic and argument forwarding. - """ - - @patch("benchmarks.utils.image_utils.local_image_exists", return_value=True) - def test_returns_docker_workspace_when_image_exists(self, _mock_exists): - from benchmarks.utils.image_utils import create_docker_workspace - from openhands.workspace import DockerWorkspace - - with patch("openhands.workspace.DockerWorkspace", wraps=DockerWorkspace) as spy: - # wraps=DockerWorkspace would call the real constructor which needs Docker, - # so we set a return_value to avoid that while still checking isinstance - sentinel = MagicMock(spec=DockerWorkspace) - spy.return_value = sentinel - ws = create_docker_workspace( - agent_server_image="server:v1", - base_image="base:latest", - build_target="binary", - ) - spy.assert_called_once_with( - server_image="server:v1", - working_dir="/workspace", - forward_env=[], - ) - assert ws is sentinel - - @patch("benchmarks.utils.image_utils.local_image_exists", return_value=False) - def test_returns_docker_dev_workspace_when_image_missing(self, _mock_exists): - from benchmarks.utils.image_utils import create_docker_workspace - from openhands.workspace import DockerDevWorkspace - - sentinel = MagicMock(spec=DockerDevWorkspace) - with patch( - "openhands.workspace.DockerDevWorkspace", return_value=sentinel - ) as spy: - ws = create_docker_workspace( - agent_server_image="server:v1", - base_image="base:latest", - build_target="source-minimal", - forward_env=["FOO"], - ) - spy.assert_called_once_with( - base_image="base:latest", - working_dir="/workspace", - target="source-minimal", - forward_env=["FOO"], - ) - assert ws is sentinel - - @patch.dict(os.environ, {"FORCE_BUILD": "1"}) - @patch("benchmarks.utils.image_utils.local_image_exists", return_value=True) - def test_force_build_skips_detection(self, mock_exists): 
- from benchmarks.utils.image_utils import create_docker_workspace - from openhands.workspace import DockerDevWorkspace - - sentinel = MagicMock(spec=DockerDevWorkspace) - with patch("openhands.workspace.DockerDevWorkspace", return_value=sentinel): - ws = create_docker_workspace( - agent_server_image="server:v1", - base_image="base:latest", - build_target="binary", - ) - # Should build even though image exists locally - assert ws is sentinel - # local_image_exists should NOT have been called when FORCE_BUILD=1 - mock_exists.assert_not_called() - - @patch("benchmarks.utils.image_utils.local_image_exists", return_value=True) - def test_custom_working_dir_and_forward_env(self, _mock_exists): - """Verify custom parameters are forwarded correctly.""" - from benchmarks.utils.image_utils import create_docker_workspace - - with patch("openhands.workspace.DockerWorkspace") as MockDW: - create_docker_workspace( - agent_server_image="server:v1", - base_image="base:latest", - build_target="binary", - working_dir="/custom", - forward_env=["API_KEY", "TOKEN"], - ) - MockDW.assert_called_once_with( - server_image="server:v1", - working_dir="/custom", - forward_env=["API_KEY", "TOKEN"], - ) - - -class TestEnsureLocalImage: - """Tests for ensure_local_image(). - - Uses real BuildOutput objects (not mocks) so validation logic in - ensure_local_image is exercised against actual data structures. 
- """ - - @patch("benchmarks.utils.build_utils.local_image_exists", return_value=True) - @patch("benchmarks.utils.build_utils.build_image") - def test_returns_false_when_image_exists(self, mock_build, _mock_exists): - from benchmarks.utils.build_utils import ensure_local_image - - result = ensure_local_image( - agent_server_image="server:v1", - base_image="base:latest", - custom_tag="mytag", - ) - assert result is False - mock_build.assert_not_called() - - @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False) - @patch("benchmarks.utils.build_utils.build_image") - def test_returns_true_when_build_occurs(self, mock_build, _mock_exists): - from benchmarks.utils.build_utils import ensure_local_image - - mock_build.return_value = BuildOutput( - base_image="base:latest", - tags=["server:v1"], - error=None, - ) - result = ensure_local_image( - agent_server_image="server:v1", - base_image="base:latest", - custom_tag="mytag", - ) - assert result is True - mock_build.assert_called_once() - - @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False) - @patch("benchmarks.utils.build_utils.build_image") - def test_raises_on_build_failure(self, mock_build, _mock_exists): - from benchmarks.utils.build_utils import ensure_local_image - - mock_build.return_value = BuildOutput( - base_image="base:latest", - tags=[], - error="build exploded", - ) - with pytest.raises(RuntimeError, match="Image build failed"): - ensure_local_image( - agent_server_image="server:v1", - base_image="base:latest", - custom_tag="mytag", - ) - - @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False) - @patch("benchmarks.utils.build_utils.build_image") - def test_raises_on_tag_mismatch(self, mock_build, _mock_exists): - from benchmarks.utils.build_utils import ensure_local_image - - mock_build.return_value = BuildOutput( - base_image="base:latest", - tags=["server:wrong-tag"], - error=None, - ) - with pytest.raises(RuntimeError, match="do not 
include expected tag"): - ensure_local_image( - agent_server_image="server:v1", - base_image="base:latest", - custom_tag="mytag", - ) - - @patch.dict(os.environ, {"FORCE_BUILD": "1"}) - @patch("benchmarks.utils.build_utils.local_image_exists", return_value=True) - @patch("benchmarks.utils.build_utils.build_image") - def test_force_build_skips_detection(self, mock_build, mock_exists): - from benchmarks.utils.build_utils import ensure_local_image - - mock_build.return_value = BuildOutput( - base_image="base:latest", - tags=["server:v1"], - error=None, - ) - result = ensure_local_image( - agent_server_image="server:v1", - base_image="base:latest", - custom_tag="mytag", - ) - assert result is True - mock_build.assert_called_once() - # local_image_exists should NOT have been called when FORCE_BUILD=1 - mock_exists.assert_not_called() - - @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False) - @patch("benchmarks.utils.build_utils.build_image") - def test_passes_target_to_build_image(self, mock_build, _mock_exists): - """Verify the target parameter flows through to build_image.""" - from benchmarks.utils.build_utils import ensure_local_image - - mock_build.return_value = BuildOutput( - base_image="base:latest", - tags=["server:v1"], - error=None, - ) - ensure_local_image( - agent_server_image="server:v1", - base_image="base:latest", - custom_tag="mytag", - target="binary", - ) - _, kwargs = mock_build.call_args - assert kwargs["target"] == "binary" - assert kwargs["push"] is False diff --git a/tests/test_llm_config.py b/tests/test_llm_config.py index a244ff811..5d6cd348d 100644 --- a/tests/test_llm_config.py +++ b/tests/test_llm_config.py @@ -20,14 +20,14 @@ class TestLoadLLMConfigValidConfigs: def test_minimal_valid_config(self, tmp_path: Path) -> None: """Minimal config with only required 'model' field loads correctly.""" - config = {"model": "gpt-4o"} + config = {"model": "gpt-4"} config_path = tmp_path / "config.json" 
config_path.write_text(json.dumps(config)) llm = load_llm_config(config_path) assert isinstance(llm, LLM) - assert llm.model == "gpt-4o" + assert llm.model == "gpt-4" def test_full_valid_config(self, tmp_path: Path) -> None: """Config with all common fields loads correctly.""" @@ -211,7 +211,7 @@ def test_unreadable_file_raises_permission_error(self, tmp_path: Path) -> None: def test_config_with_extra_fields_loads(self, tmp_path: Path) -> None: """Config with unknown extra fields should still load (pydantic default).""" config = { - "model": "gpt-4o", + "model": "gpt-4", "unknown_field": "value", "another_unknown": 123, } @@ -220,11 +220,11 @@ def test_config_with_extra_fields_loads(self, tmp_path: Path) -> None: # Should not raise - pydantic by default ignores extra fields llm = load_llm_config(config_path) - assert llm.model == "gpt-4o" + assert llm.model == "gpt-4" def test_unicode_in_config(self, tmp_path: Path) -> None: """Config with unicode characters loads correctly.""" - config = {"model": "gpt-4o", "api_key": "key-with-émojis-🔑"} + config = {"model": "gpt-4", "api_key": "key-with-émojis-🔑"} config_path = tmp_path / "config.json" config_path.write_text(json.dumps(config, ensure_ascii=False)) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index bde715c12..b498a6990 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit bde715c12bce8fb112980529d5ad162f6b81a7f1 +Subproject commit b498a69908f7d06feb3921ffe05ff7e781a6f108 From 721ed01d7e4551b63868b34106ee5015dc316e71 Mon Sep 17 00:00:00 2001 From: openhands Date: Wed, 11 Mar 2026 23:04:14 +0000 Subject: [PATCH 2/2] fix: update image_exists imports after revert The revert of 2bfcc6c correctly removed the functions added in that commit, but didn't account for the subsequent rename of image_exists to remote_image_exists in commit 92efb47 (#471). 
Since we're reverting to the pre-2bfcc6c state, the function should be called image_exists, not remote_image_exists. Changes: - benchmarks/swebench/build_images.py: import and use image_exists - benchmarks/gaia/build_images.py: import and use image_exists - benchmarks/swtbench/build_eval_env_images.py: use alias (as it was pre-2bfcc6c) This fixes the ImportError that was causing SWE-bench builds to fail. Co-authored-by: openhands --- benchmarks/gaia/build_images.py | 4 ++-- benchmarks/swebench/build_images.py | 4 ++-- benchmarks/swtbench/build_eval_env_images.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/gaia/build_images.py b/benchmarks/gaia/build_images.py index 45aee7550..8d3969532 100644 --- a/benchmarks/gaia/build_images.py +++ b/benchmarks/gaia/build_images.py @@ -21,7 +21,7 @@ get_build_parser, run_docker_build_layer, ) -from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.image_utils import image_exists from openhands.sdk import get_logger @@ -81,7 +81,7 @@ def tag_fn(_base: str) -> str: # inflating the image and causing runtime OOM crashes. _, git_sha, _ = _get_sdk_submodule_info() base_gaia_image = f"{args.image}:{git_sha[:7]}-gaia-{args.target}" - if not args.dry_run and remote_image_exists(base_gaia_image): + if not args.dry_run and image_exists(base_gaia_image): logger.info("Image %s already exists. 
Skipping build.", base_gaia_image) return 0 diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py index 5ace5419b..cae96b871 100644 --- a/benchmarks/swebench/build_images.py +++ b/benchmarks/swebench/build_images.py @@ -22,7 +22,7 @@ run_docker_build_layer, ) from benchmarks.utils.dataset import get_dataset -from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.image_utils import image_exists from openhands.sdk import get_logger @@ -94,7 +94,7 @@ def wrap_image(agent_image: str, push: bool = False) -> BuildOutput: For pushes, verify the base tag exists in the registry. For local builds, assume the tag is available locally or resolvable by Docker during buildx. """ - if push and not remote_image_exists(agent_image): + if push and not image_exists(agent_image): return BuildOutput( base_image=agent_image, tags=[], diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py index 2f0ea9862..fde30ed9c 100644 --- a/benchmarks/swtbench/build_eval_env_images.py +++ b/benchmarks/swtbench/build_eval_env_images.py @@ -12,7 +12,7 @@ from benchmarks.swtbench.config import EVAL_DEFAULTS from benchmarks.swtbench.image_utils import ensure_swt_bench_repo from benchmarks.utils.dataset import get_dataset -from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.image_utils import image_exists as remote_image_exists from openhands.sdk import get_logger