From d6b07a8a555a5dd4c6e60ea3de16396017316d81 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 11 Mar 2026 20:52:05 +0000
Subject: [PATCH 1/2] Revert "feat: auto-detect pre-built Docker images across
 all benchmarks (#456)"

This reverts commit 2bfcc6c6e19e81bdc727127fd019d5e0dd964826.

The auto-detect feature introduced in #456 causes slow image builds
and timeouts. This revert brings back the previous image-building
behavior in order to restore benchmark build performance.

Fixes #502

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/commit0/run_infer.py            |  48 ++--
 benchmarks/gaia/run_infer.py               |  25 +-
 benchmarks/multiswebench/run_infer.py      |  67 ++++--
 benchmarks/swebench/run_infer.py           |  79 +++++--
 benchmarks/swebenchmultimodal/run_infer.py |  66 ++++--
 benchmarks/swefficiency/run_infer.py       |  71 ++++--
 benchmarks/swtbench/run_infer.py           |  70 ++++--
 benchmarks/utils/build_utils.py            |  48 +---
 benchmarks/utils/image_utils.py            |  71 +-----
 tests/test_image_utils.py                  | 257 ---------------------
 tests/test_llm_config.py                   |  10 +-
 vendor/software-agent-sdk                  |   2 +-
 12 files changed, 315 insertions(+), 499 deletions(-)
 delete mode 100644 tests/test_image_utils.py

diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index b9a08102a..03ddecb85 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -1,6 +1,7 @@
 import json
 import os
 from collections import Counter
+from pathlib import Path
 from typing import Any, List
 
 from commit0.harness.constants import SPLIT
@@ -12,7 +13,7 @@
     get_base_docker_image,
 )
 from benchmarks.commit0.config import INFER_DEFAULTS
-from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
+from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -23,19 +24,19 @@
     construct_eval_output_dir,
     get_default_on_result_writer,
 )
-from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.image_utils import image_exists
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
     EvalOutput,
 )
-from benchmarks.utils.version import IMAGE_TAG_PREFIX
+from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import Agent, Conversation, Tool, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
-from openhands.workspace import APIRemoteWorkspace
+from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
 
 
 logger = get_logger(__name__)
@@ -187,16 +188,15 @@ def prepare_workspace(
         logger.info(f"Using base docker image: {base_docker_image}")
 
         if self.metadata.workspace_type == "docker":
-            custom_tag = extract_custom_tag(base_docker_image)
-            suffix = f"-{build_target}" if build_target != "binary" else ""
-            agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
-            )
-            workspace = create_docker_workspace(
-                agent_server_image=agent_server_image,
+            # Build agent-server image from base commit0 image
+            workspace = DockerDevWorkspace(
                 base_image=base_docker_image,
-                build_target=build_target,
-                forward_env=forward_env,
+                working_dir="/workspace",
+                target=build_target,
+                forward_env=forward_env or [],
+            )
+            logger.info(
+                f"Building workspace from {base_docker_image}. This may take a while..."
             )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
@@ -205,13 +205,14 @@ def prepare_workspace(
                     "RUNTIME_API_KEY environment variable is not set for remote workspace"
                 )
 
+            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
             custom_tag = extract_custom_tag(base_docker_image)
             suffix = f"-{build_target}" if build_target != "binary" else ""
             agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+                f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
             )
 
-            if not remote_image_exists(agent_server_image):
+            if not image_exists(agent_server_image):
                 raise RuntimeError(
                     f"Agent server image {agent_server_image} does not exist in container registry. "
                     "Run 'benchmarks/commit0/build_images.py --push' to build and push it first."
@@ -219,7 +220,7 @@ def prepare_workspace(
 
             logger.info(
                 f"Using remote workspace with image {agent_server_image} "
-                f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
+                f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
             startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
             workspace = APIRemoteWorkspace(
@@ -591,8 +592,21 @@ def evaluate_instance(
 
 
 def main() -> None:
+    prompt_dir = (Path(__file__).parent / "prompts").resolve()
+    choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
+    default_prompt_path = prompt_dir / "default.j2"
+    assert default_prompt_path.exists(), (
+        f"Default prompt {default_prompt_path} not found"
+    )
+
     parser = get_parser()
-    add_prompt_path_argument(parser, __file__)
+    parser.add_argument(
+        "--prompt-path",
+        type=str,
+        default=str(default_prompt_path),
+        choices=choices,
+        help="Path to prompt template file",
+    )
     parser.add_argument(
         "--repo-split",
         type=str,
diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index a100a2cfb..cfbf0682c 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -27,10 +27,10 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.image_utils import image_exists
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
-from benchmarks.utils.version import IMAGE_TAG_PREFIX
+from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import (
     Agent,
     Conversation,
@@ -47,7 +47,7 @@
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
-from openhands.workspace import APIRemoteWorkspace
+from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
 
 
 logger = get_logger(__name__)
@@ -156,14 +156,11 @@ def prepare_workspace(
         logger.info(f"Preparing workspace for instance {instance.id}")
 
         if self.metadata.workspace_type == "docker":
-            agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
-            )
-            workspace = create_docker_workspace(
-                agent_server_image=agent_server_image,
+            # Use DockerDevWorkspace with base image (same as main branch)
+            workspace = DockerDevWorkspace(
                 base_image="nikolaik/python-nodejs:python3.12-nodejs22",
-                build_target="binary",
-                forward_env=forward_env,
+                working_dir="/workspace",
+                forward_env=forward_env or [],
             )
         elif self.metadata.workspace_type == "remote":
             # For workflow, use APIRemoteWorkspace with pre-built GAIA image
@@ -177,11 +174,12 @@ def prepare_workspace(
                     "RUNTIME_API_KEY environment variable is not set for remote workspace"
                 )
 
+            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
             agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
+                f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary"
             )
 
-            if not remote_image_exists(agent_server_image):
+            if not image_exists(agent_server_image):
                 raise RuntimeError(
                     f"Agent server image {agent_server_image} does not exist in container registry. "
                     f"Run 'benchmarks/gaia/build_images.py --push' to build and push it first."
@@ -189,7 +187,7 @@ def prepare_workspace(
 
             logger.info(
                 f"Using remote workspace with GAIA image {agent_server_image} "
-                f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
+                f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
             startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
             workspace = APIRemoteWorkspace(
@@ -592,7 +590,6 @@ def main() -> None:
         max_attempts=args.max_attempts,
         critic=critic,
         selected_instances_file=args.select,
-        max_retries=args.max_retries,
         workspace_type=args.workspace,
         enable_delegation=args.enable_delegation,
     )
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index 9d47d87e9..20b926a61 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -1,5 +1,6 @@
 import json
 import os
+from pathlib import Path
 from typing import List, cast
 
 import pandas as pd
@@ -12,8 +13,8 @@
 )
 from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
 from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference
-from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
-from benchmarks.utils.build_utils import ensure_local_image
+from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -25,14 +26,14 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import image_exists
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
     EvalOutput,
 )
-from benchmarks.utils.version import IMAGE_TAG_PREFIX
+from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import Agent, Conversation, Tool, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
@@ -209,14 +210,38 @@ def prepare_workspace(
 
         if self.metadata.workspace_type == "docker":
             agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+                f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
             )
-            ensure_local_image(
-                agent_server_image=agent_server_image,
-                base_image=official_docker_image,
-                custom_tag=custom_tag,
-                target=build_target,
+            SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in (
+                "1",
+                "true",
+                "yes",
             )
+            logger.info(f"MULTI_SWE_BENCH_SKIP_BUILD={SKIP_BUILD}")
+            if not SKIP_BUILD:
+                logger.info(
+                    f"Building workspace from {official_docker_image} "
+                    f"for instance {instance.id}. "
+                    "This may take a while...\n"
+                    "You can run benchmarks/multiswebench/build_images.py and set "
+                    "MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
+                    "agent-server image."
+                )
+                output = build_image(
+                    base_image=official_docker_image,
+                    target_image=EVAL_AGENT_SERVER_IMAGE,
+                    custom_tag=custom_tag,
+                    target=build_target,
+                    push=False,
+                )
+                logger.info(f"Image build output: {output}")
+                assert output.error is None, f"Image build failed: {output.error}"
+                if agent_server_image not in output.tags:
+                    raise RuntimeError(
+                        f"Built image tags {output.tags} do not include expected tag "
+                        f"{agent_server_image}"
+                    )
+
             workspace = DockerWorkspace(
                 server_image=agent_server_image,
                 working_dir="/workspace",
@@ -224,22 +249,23 @@ def prepare_workspace(
             )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
+            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
             if not runtime_api_key:
                 raise ValueError(
                     "RUNTIME_API_KEY environment variable is not set for remote workspace"
                 )
 
             agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+                f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
             )
-            if not remote_image_exists(agent_server_image):
+            if not image_exists(agent_server_image):
                 raise RuntimeError(
                     f"Agent server image {agent_server_image} does not exist in container registry, "
                     "make sure to build, push it, and make it public accessible before using remote workspace."
                 )
             logger.info(
                 f"Using remote workspace with image {agent_server_image} "
-                f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
+                f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
             startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
             workspace = APIRemoteWorkspace(
@@ -402,8 +428,21 @@ def evaluate_instance(
 
 
 def main() -> None:
+    prompt_dir = (Path(__file__).parent / "prompts").resolve()
+    choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
+    default_prompt_path = prompt_dir / "default.j2"
+    assert default_prompt_path.exists(), (
+        f"Default prompt {default_prompt_path} not found"
+    )
+
     parser = get_parser()
-    add_prompt_path_argument(parser, __file__)
+    parser.add_argument(
+        "--prompt-path",
+        type=str,
+        default=str(default_prompt_path),
+        choices=choices,
+        help="Path to prompt template file",
+    )
     parser.add_argument(
         "--lang",
         type=str,
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 065c3aecd..e4e6af9f4 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -1,5 +1,6 @@
 import json
 import os
+from pathlib import Path
 from typing import List
 
 from jinja2 import Environment, FileSystemLoader
@@ -12,8 +13,8 @@
     wrap_image,
 )
 from benchmarks.swebench.config import INFER_DEFAULTS
-from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
-from benchmarks.utils.build_utils import ensure_local_image
+from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -25,7 +26,7 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import image_exists
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -33,7 +34,7 @@
     EvalOutput,
     ToolPresetType,
 )
-from benchmarks.utils.version import IMAGE_TAG_PREFIX
+from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import Agent, Conversation, Tool, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
@@ -153,30 +154,44 @@ def prepare_workspace(
             f"-{build_target}" if build_target != constants.BUILD_TARGET_BINARY else ""
         )
         base_agent_image = (
-            f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+            f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
         )
         wrap_needed = should_wrap_instance_id(instance.id)
         agent_server_image = base_agent_image
 
         if self.metadata.workspace_type == "docker":
-            built = ensure_local_image(
-                agent_server_image=base_agent_image,
-                base_image=official_docker_image,
-                custom_tag=custom_tag,
-                target=build_target,
-            )
-            if built and wrap_needed:
-                wrapped_result = wrap_image(base_agent_image, push=False)
-                if wrapped_result.error:
-                    raise RuntimeError(
-                        "Wrapped image build failed: "
-                        f"{wrapped_result.error}; log={wrapped_result.log_path}"
-                    )
-            elif not built and wrap_needed:
+            SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
+            logger.info(f"SKIP_BUILD={SKIP_BUILD}")
+            if not SKIP_BUILD:
                 logger.info(
-                    f"Using pre-built image {base_agent_image} "
-                    "(assumed already wrapped)"
+                    f"Building workspace from {official_docker_image} "
+                    f"for instance {instance.id}. "
+                    "This may take a while...\n"
+                    "You can run benchmarks/swebench/build_images.py and set "
+                    "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
+                    "agent-server image."
                 )
+                output = build_image(
+                    base_image=official_docker_image,
+                    target_image=EVAL_AGENT_SERVER_IMAGE,
+                    custom_tag=custom_tag,
+                    target=build_target,
+                    push=False,
+                )
+                logger.info(f"Image build output: {output}")
+                assert output.error is None, f"Image build failed: {output.error}"
+                if base_agent_image not in output.tags:
+                    raise RuntimeError(
+                        f"Built image tags {output.tags} do not include expected tag "
+                        f"{base_agent_image}"
+                    )
+                if wrap_needed:
+                    wrapped_result = wrap_image(base_agent_image, push=False)
+                    if wrapped_result.error:
+                        raise RuntimeError(
+                            "Wrapped image build failed: "
+                            f"{wrapped_result.error}; log={wrapped_result.log_path}"
+                        )
 
             workspace = DockerWorkspace(
                 server_image=agent_server_image,
@@ -185,22 +200,23 @@ def prepare_workspace(
             )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
+            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
             if not runtime_api_key:
                 raise ValueError(
                     "RUNTIME_API_KEY environment variable is not set for remote workspace"
                 )
 
             agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+                f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
             )
-            if not remote_image_exists(agent_server_image):
+            if not image_exists(agent_server_image):
                 raise RuntimeError(
                     f"Agent server image {agent_server_image} does not exist in container registry, "
                     "make sure to build, push it, and make it public accessible before using remote workspace."
                 )
             logger.info(
                 f"Using remote workspace with image {agent_server_image} "
-                f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
+                f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
             startup_timeout = float(
                 os.getenv(
@@ -347,8 +363,21 @@ def evaluate_instance(
 
 
 def main() -> None:
+    prompt_dir = (Path(__file__).parent / "prompts").resolve()
+    choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
+    default_prompt_path = prompt_dir / "default.j2"
+    assert default_prompt_path.exists(), (
+        f"Default prompt {default_prompt_path} not found"
+    )
+
     parser = get_parser()
-    add_prompt_path_argument(parser, __file__)
+    parser.add_argument(
+        "--prompt-path",
+        type=str,
+        default=str(default_prompt_path),
+        choices=choices,
+        help="Path to prompt template file",
+    )
     parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 7eb3d348f..295b3fd13 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -1,5 +1,6 @@
 import json
 import os
+from pathlib import Path
 from typing import List
 
 import requests
@@ -10,8 +11,8 @@
     get_official_docker_image,
 )
 from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS
-from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
-from benchmarks.utils.build_utils import ensure_local_image
+from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -23,14 +24,14 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import image_exists
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
     EvalOutput,
 )
-from benchmarks.utils.version import IMAGE_TAG_PREFIX
+from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import (
     Agent,
     Conversation,
@@ -162,14 +163,35 @@ def prepare_workspace(
 
         if self.metadata.workspace_type == "docker":
             agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
-            )
-            ensure_local_image(
-                agent_server_image=agent_server_image,
-                base_image=official_docker_image,
-                custom_tag=custom_tag,
-                target=build_target,
+                f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
             )
+            SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
+            logger.info(f"SKIP_BUILD={SKIP_BUILD}")
+            if not SKIP_BUILD:
+                logger.info(
+                    f"Building workspace from {official_docker_image} "
+                    f"for instance {instance.id}. "
+                    "This may take a while...\n"
+                    "You can run benchmarks/swebenchmultimodal/build_images.py and set "
+                    "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
+                    "agent-server image."
+                )
+
+                output = build_image(
+                    base_image=official_docker_image,
+                    target_image=EVAL_AGENT_SERVER_IMAGE,
+                    custom_tag=custom_tag,
+                    target=build_target,
+                    push=False,
+                )
+                logger.info(f"Image build output: {output}")
+                assert output.error is None, f"Image build failed: {output.error}"
+                if agent_server_image not in output.tags:
+                    raise RuntimeError(
+                        f"Built image tags {output.tags} do not include expected tag "
+                        f"{agent_server_image}"
+                    )
+
             workspace = DockerWorkspace(
                 server_image=agent_server_image,
                 working_dir="/workspace",
@@ -177,22 +199,23 @@ def prepare_workspace(
             )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
+            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
             if not runtime_api_key:
                 raise ValueError(
                     "RUNTIME_API_KEY environment variable is not set for remote workspace"
                 )
 
             agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+                f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
             )
-            if not remote_image_exists(agent_server_image):
+            if not image_exists(agent_server_image):
                 raise RuntimeError(
                     f"Agent server image {agent_server_image} does not exist in container registry, "
                     "make sure to build, push it, and make it public accessible before using remote workspace."
                 )
             logger.info(
                 f"Using remote workspace with image {agent_server_image} "
-                f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
+                f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
             startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
             workspace = APIRemoteWorkspace(
@@ -400,8 +423,21 @@ def evaluate_instance(
 
 
 def main() -> None:
+    prompt_dir = (Path(__file__).parent / "prompts").resolve()
+    choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
+    default_prompt_path = prompt_dir / "default.j2"
+    assert default_prompt_path.exists(), (
+        f"Default prompt {default_prompt_path} not found"
+    )
+
     parser = get_parser()
-    add_prompt_path_argument(parser, __file__)
+    parser.add_argument(
+        "--prompt-path",
+        type=str,
+        default=str(default_prompt_path),
+        choices=choices,
+        help="Path to prompt template file",
+    )
     # Apply INFER_DEFAULTS from config (matches evaluation repository values.yaml)
     parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
diff --git a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py
index bb3efd908..1f418a323 100644
--- a/benchmarks/swefficiency/run_infer.py
+++ b/benchmarks/swefficiency/run_infer.py
@@ -1,6 +1,7 @@
 import json
 import multiprocessing
 import os
+from pathlib import Path
 from typing import Any, List
 
 from jinja2 import Environment, FileSystemLoader
@@ -9,8 +10,8 @@
 from benchmarks.swefficiency import constants
 from benchmarks.swefficiency.config import DOCKER_DEFAULTS, INFER_DEFAULTS
 from benchmarks.swefficiency.workspace import ResourceLimitedDockerWorkspace
-from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
-from benchmarks.utils.build_utils import ensure_local_image
+from benchmarks.utils.args_parser import get_parser
+from benchmarks.utils.build_utils import build_image
 from benchmarks.utils.conversation import build_event_persistence_callback
 from benchmarks.utils.critics import create_critic
 from benchmarks.utils.dataset import get_dataset
@@ -20,13 +21,13 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import image_exists
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
     EvalOutput,
 )
-from benchmarks.utils.version import IMAGE_TAG_PREFIX
+from benchmarks.utils.version import SDK_SHORT_SHA
 from openhands.sdk import LLM, Agent, Conversation, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.preset.default import get_default_tools
@@ -200,19 +201,37 @@ def prepare_workspace(
         # Build agent server image tag
         suffix = f"-{build_target}" if build_target != "binary" else ""
         agent_server_image = (
-            f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+            f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
         )
 
         logger.info(f"Base image: {base_docker_image}")
         logger.info(f"Agent server image: {agent_server_image}")
 
         if self.metadata.workspace_type == "docker":
-            ensure_local_image(
-                agent_server_image=agent_server_image,
-                base_image=base_docker_image,
-                custom_tag=custom_tag,
-                target=build_target,
-            )
+            # Build agent-server image from base swefficiency image
+            SKIP_BUILD = os.getenv("SKIP_BUILD", "0").lower() in ("1", "true", "yes")
+            logger.info(f"SKIP_BUILD={SKIP_BUILD}")
+
+            if not SKIP_BUILD:
+                logger.info(
+                    f"Building workspace from {base_docker_image} "
+                    f"for instance {instance.id}. "
+                    "This may take a while..."
+                )
+                output = build_image(
+                    base_image=base_docker_image,
+                    target_image=EVAL_AGENT_SERVER_IMAGE,
+                    custom_tag=custom_tag,
+                    target=build_target,
+                    push=False,
+                )
+                logger.info(f"Image build output: {output}")
+                assert output.error is None, f"Image build failed: {output.error}"
+                if agent_server_image not in output.tags:
+                    raise RuntimeError(
+                        f"Built image tags {output.tags} do not include expected tag "
+                        f"{agent_server_image}"
+                    )
 
             # Get CPU group for resource limiting
             cpu_group = self._acquire_cpu_group()
@@ -237,20 +256,25 @@ def prepare_workspace(
 
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
+            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
             if not runtime_api_key:
                 raise ValueError(
                     "RUNTIME_API_KEY environment variable is not set for remote workspace"
                 )
 
-            if not remote_image_exists(agent_server_image):
+            # For remote, use SDK_SHORT_SHA from env if available
+            remote_agent_image = (
+                f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
+            )
+            if not image_exists(remote_agent_image):
                 raise RuntimeError(
-                    f"Agent server image {agent_server_image} does not exist in container registry, "
+                    f"Agent server image {remote_agent_image} does not exist in container registry, "
                     "make sure to build, push it, and make it public accessible before using remote workspace."
                 )
 
             logger.info(
-                f"Using remote workspace with image {agent_server_image} "
-                f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
+                f"Using remote workspace with image {remote_agent_image} "
+                f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
 
             workspace = APIRemoteWorkspace(
@@ -258,7 +282,7 @@ def prepare_workspace(
                     "RUNTIME_API_URL", "https://runtime.eval.all-hands.dev"
                 ),
                 runtime_api_key=runtime_api_key,
-                server_image=agent_server_image,
+                server_image=remote_agent_image,
                 target_type="source",
                 forward_env=forward_env or [],
                 resource_factor=resource_factor,
@@ -389,8 +413,21 @@ def evaluate_instance(
 
 
 def main() -> None:
+    prompt_dir = (Path(__file__).parent / "prompts").resolve()
+    # relpath (unlike Path.relative_to) also works when cwd is outside the repo
+    choices = [os.path.relpath(p) for p in prompt_dir.glob("*.j2")]
+    default_prompt_path = prompt_dir / "default.j2"
+    if not default_prompt_path.exists():
+        raise FileNotFoundError(f"Default prompt {default_prompt_path} not found")
+
     parser = get_parser()
-    add_prompt_path_argument(parser, __file__)
+    parser.add_argument(
+        "--prompt-path",
+        type=str,
+        default=str(default_prompt_path),
+        choices=choices,
+        help="Path to prompt template file",
+    )
     parser.add_argument(
         "--num-cpus-per-worker",
         type=int,
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index bdfb7b13e..f795a1a18 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -1,11 +1,12 @@
 import json
 import os
+from pathlib import Path
 from typing import List
 
 from jinja2 import Environment, FileSystemLoader
 
 from benchmarks.swtbench.config import INFER_DEFAULTS
-from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
+from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.console_logging import summarize_instance
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
 from benchmarks.utils.conversation import build_event_persistence_callback
@@ -17,19 +18,20 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.image_utils import image_exists
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
     EvalOutput,
 )
-from benchmarks.utils.version import IMAGE_TAG_PREFIX
+from benchmarks.utils.version import SDK_SHORT_SHA
+from openhands.agent_server.docker.build import _base_slug
 from openhands.sdk import Agent, Conversation, Tool, __version__, get_logger
 from openhands.sdk.workspace import RemoteWorkspace
 from openhands.tools.delegate import DelegateTool
 from openhands.tools.preset.default import get_default_tools
-from openhands.workspace import APIRemoteWorkspace
+from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace
 
 
 logger = get_logger(__name__)
@@ -54,10 +56,6 @@ def get_agent_server_docker_image(
     target: str = "source-minimal",
 ) -> str:
     """Get the agent server Docker image for an instance."""
-    # Importing here because openhands.agent_server.docker.build runs git checks
-    # which fails when installed as a package outside the git repo
-    from openhands.agent_server.docker.build import _base_slug
-
     official_image_name = get_official_docker_image(instance_id, docker_image_prefix)
     return (
         "ghcr.io/all-hands-ai/agent-server"
@@ -171,32 +169,51 @@ def prepare_workspace(
 
         if self.metadata.workspace_type == "docker":
             agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
-            )
-            workspace = create_docker_workspace(
-                agent_server_image=agent_server_image,
-                base_image=official_docker_image,
-                build_target=build_target,
-                forward_env=forward_env,
+                f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
             )
+            SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
+            logger.info(f"SKIP_BUILD={SKIP_BUILD}")
+            if not SKIP_BUILD:
+                logger.info(
+                    f"Building workspace from {official_docker_image} "
+                    f"for instance {instance.id}. "
+                    "This may take a while...\n"
+                    "You can run benchmarks/swtbench/build_images.py and set "
+                    "SKIP_BUILD=1 to skip building and use pre-built "
+                    "agent-server image."
+                )
+                # For SWT-bench, we use DockerDevWorkspace with base_image
+                workspace = DockerDevWorkspace(
+                    base_image=official_docker_image,
+                    working_dir="/workspace",
+                    target=build_target,
+                    forward_env=forward_env or [],
+                )
+            else:
+                workspace = DockerWorkspace(
+                    server_image=agent_server_image,
+                    working_dir="/workspace",
+                    forward_env=forward_env or [],
+                )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
+            sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
             if not runtime_api_key:
                 raise ValueError(
                     "RUNTIME_API_KEY environment variable is not set for remote workspace"
                 )
 
             agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+                f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
             )
-            if not remote_image_exists(agent_server_image):
+            if not image_exists(agent_server_image):
                 raise RuntimeError(
                     f"Agent server image {agent_server_image} does not exist in container registry, "
                     "make sure to build, push it, and make it public accessible before using remote workspace."
                 )
             logger.info(
                 f"Using remote workspace with image {agent_server_image} "
-                f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
+                f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
             )
             startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
             workspace = APIRemoteWorkspace(
@@ -339,8 +356,21 @@ def evaluate_instance(
 
 def main() -> None:
     """Main entry point for SWT-bench evaluation."""
+    prompt_dir = (Path(__file__).parent / "prompts").resolve()
+    # relpath (unlike Path.relative_to) also works when cwd is outside the repo
+    choices = [os.path.relpath(p) for p in prompt_dir.glob("*.j2")]
+    default_prompt_path = prompt_dir / "default.j2"
+    if not default_prompt_path.exists():
+        raise FileNotFoundError(f"Default prompt {default_prompt_path} not found")
+
     parser = get_parser()
-    add_prompt_path_argument(parser, __file__)
+    parser.add_argument(
+        "--prompt-path",
+        type=str,
+        default=str(default_prompt_path),
+        choices=choices,
+        help="Path to prompt template file",
+    )
     parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
@@ -360,7 +390,7 @@ def main() -> None:
         dataset_name=dataset_description,
         model_name=llm.model,
         max_iterations=args.max_iterations,
-        eval_note=f"SWT-{args.note}" if args.note else None,
+        eval_note=f"SWT-{args.note}" if args.note else None,
     )
 
     critic = create_critic(args)
diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
index b4e56b9ad..9c700f1d8 100644
--- a/benchmarks/utils/build_utils.py
+++ b/benchmarks/utils/build_utils.py
@@ -20,7 +20,6 @@
 from pydantic import BaseModel, Field
 from tqdm.auto import tqdm
 
-from benchmarks.swebench.constants import TargetType
 from benchmarks.utils.args_parser import get_parser
 from benchmarks.utils.buildx_utils import (
     buildkit_disk_usage,
@@ -28,7 +27,8 @@
     maybe_reset_buildkit,
 )
 from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
-from benchmarks.utils.image_utils import local_image_exists, remote_image_exists
+from benchmarks.utils.image_utils import image_exists
+from openhands.agent_server.docker.build import BuildOptions, TargetType, build
 from openhands.sdk import get_logger
 
 
@@ -282,10 +282,6 @@ def build_image(
     target: TargetType = "source-minimal",
     push: bool = False,
 ) -> BuildOutput:
-    # Importing here because openhands.agent_server.docker.build runs git checks
-    # which fails when installed as a package outside the git repo
-    from openhands.agent_server.docker.build import BuildOptions, build
-
     # Get SDK info from submodule to ensure tags use the correct SDK SHA
     git_ref, git_sha, sdk_version = _get_sdk_submodule_info()
 
@@ -304,51 +300,13 @@ def build_image(
     )
     for t in opts.all_tags:
         # Check if image exists or not
-        if remote_image_exists(t):
+        if image_exists(t):
             logger.info("Image %s already exists. Skipping build.", t)
             return BuildOutput(base_image=base_image, tags=[t], error=None)
     tags = build(opts)
     return BuildOutput(base_image=base_image, tags=tags, error=None)
 
 
-def ensure_local_image(
-    agent_server_image: str,
-    base_image: str,
-    custom_tag: str,
-    target: TargetType = "source-minimal",
-) -> bool:
-    """Build an agent-server image locally if it doesn't already exist.
-
-    Returns True if a build occurred, False if the image already existed.
-    Set FORCE_BUILD=1 to skip auto-detection and always rebuild.
-    """
-    force_build = os.getenv("FORCE_BUILD", "0").lower() in ("1", "true", "yes")
-    if not force_build and local_image_exists(agent_server_image):
-        logger.info(f"Using pre-built image {agent_server_image}")
-        return False
-
-    if force_build:
-        logger.info(f"FORCE_BUILD set, building image from {base_image}...")
-    else:
-        logger.info(f"Building image from {base_image}...")
-    output = build_image(
-        base_image=base_image,
-        target_image=EVAL_AGENT_SERVER_IMAGE,
-        custom_tag=custom_tag,
-        target=target,
-        push=False,
-    )
-    logger.info(f"Image build output: {output}")
-    if output.error is not None:
-        raise RuntimeError(f"Image build failed: {output.error}")
-    if agent_server_image not in output.tags:
-        raise RuntimeError(
-            f"Built image tags {output.tags} do not include expected tag "
-            f"{agent_server_image}"
-        )
-    return True
-
-
 def _build_with_logging(
     log_dir: Path,
     base_image: str,
diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py
index 467074cb9..a463f3b4f 100644
--- a/benchmarks/utils/image_utils.py
+++ b/benchmarks/utils/image_utils.py
@@ -1,24 +1,9 @@
 #!/usr/bin/env python3
-from __future__ import annotations
-
 import base64
-import os
-import subprocess
 import sys
-from typing import TYPE_CHECKING
-
-
-if TYPE_CHECKING:
-    from openhands.sdk.workspace import TargetType
-    from openhands.workspace import DockerDevWorkspace, DockerWorkspace
 
 import requests
 
-from openhands.sdk import get_logger
-
-
-logger = get_logger(__name__)
-
 
 ACCEPT = ",".join(
     [
@@ -69,64 +54,12 @@ def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None:
     return None
 
 
-def local_image_exists(image: str) -> bool:
-    """Check if a Docker image exists in the local Docker daemon."""
-    try:
-        result = subprocess.run(
-            ["docker", "image", "inspect", image],
-            capture_output=True,
-            check=False,
-            timeout=5,
-        )
-        return result.returncode == 0
-    except (subprocess.TimeoutExpired, FileNotFoundError) as e:
-        logger.warning(f"Failed to check if image {image} exists: {e}")
-        return False
-
-
-def create_docker_workspace(
-    agent_server_image: str,
-    base_image: str,
-    build_target: TargetType,
-    working_dir: str = "/workspace",
-    forward_env: list[str] | None = None,
-) -> DockerWorkspace | DockerDevWorkspace:
-    """Create a Docker workspace, building the image only if not already available.
-
-    Returns DockerWorkspace when a pre-built image is found locally,
-    DockerDevWorkspace otherwise (which builds on-the-fly).
-    Set FORCE_BUILD=1 to skip auto-detection and always build.
-    """
-    from openhands.workspace import DockerDevWorkspace, DockerWorkspace
-
-    force_build = os.getenv("FORCE_BUILD", "0").lower() in ("1", "true", "yes")
-    if not force_build and local_image_exists(agent_server_image):
-        logger.info(f"Using pre-built image {agent_server_image}")
-        return DockerWorkspace(
-            server_image=agent_server_image,
-            working_dir=working_dir,
-            forward_env=forward_env or [],
-        )
-    else:
-        if force_build:
-            logger.info(f"FORCE_BUILD set, building workspace from {base_image}...")
-        else:
-            logger.info(f"Building workspace from {base_image}...")
-        return DockerDevWorkspace(
-            base_image=base_image,
-            working_dir=working_dir,
-            target=build_target,
-            forward_env=forward_env or [],
-        )
-
-
-def remote_image_exists(
+def image_exists(
     image_ref: str,
     gh_username: str | None = None,
     gh_pat: str | None = None,  # GitHub PAT with read:packages for private GHCR
     docker_token: str | None = None,  # Docker Hub JWT if you already have one
 ) -> bool:
-    """Check if a Docker image exists in a remote registry."""
     registry, repo, ref = _parse(image_ref)
     headers = {"Accept": ACCEPT}
 
@@ -168,5 +101,5 @@ def remote_image_exists(
     gh_user = sys.argv[2] if len(sys.argv) > 2 else None
     gh_pat = sys.argv[3] if len(sys.argv) > 3 else None
 
-    ok = remote_image_exists(image, gh_username=gh_user, gh_pat=gh_pat)
+    ok = image_exists(image, gh_username=gh_user, gh_pat=gh_pat)
     print(f"{image} -> {'✅ exists' if ok else '❌ not found or unauthorized'}")
diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py
deleted file mode 100644
index c46830cb6..000000000
--- a/tests/test_image_utils.py
+++ /dev/null
@@ -1,257 +0,0 @@
-"""Tests for image_utils and build_utils helper functions.
-
-Tests cover local_image_exists(), create_docker_workspace(), and ensure_local_image()
-which centralize Docker image detection and build logic across all benchmarks.
-"""
-
-import os
-import subprocess
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from benchmarks.utils.build_utils import BuildOutput
-
-
-class TestLocalImageExists:
-    """Tests for local_image_exists()."""
-
-    @patch("benchmarks.utils.image_utils.subprocess.run")
-    def test_image_exists(self, mock_run):
-        from benchmarks.utils.image_utils import local_image_exists
-
-        mock_run.return_value = MagicMock(returncode=0)
-        assert local_image_exists("myimage:latest") is True
-        mock_run.assert_called_once_with(
-            ["docker", "image", "inspect", "myimage:latest"],
-            capture_output=True,
-            check=False,
-            timeout=5,
-        )
-
-    @patch("benchmarks.utils.image_utils.subprocess.run")
-    def test_image_not_found(self, mock_run):
-        from benchmarks.utils.image_utils import local_image_exists
-
-        mock_run.return_value = MagicMock(returncode=1)
-        assert local_image_exists("noimage:latest") is False
-
-    @patch("benchmarks.utils.image_utils.subprocess.run")
-    def test_timeout_returns_false(self, mock_run):
-        from benchmarks.utils.image_utils import local_image_exists
-
-        mock_run.side_effect = subprocess.TimeoutExpired(cmd="docker", timeout=5)
-        assert local_image_exists("myimage:latest") is False
-
-    @patch("benchmarks.utils.image_utils.subprocess.run")
-    def test_docker_not_installed_returns_false(self, mock_run):
-        from benchmarks.utils.image_utils import local_image_exists
-
-        mock_run.side_effect = FileNotFoundError("docker not found")
-        assert local_image_exists("myimage:latest") is False
-
-
-class TestCreateDockerWorkspace:
-    """Tests for create_docker_workspace().
-
-    These tests mock the Docker daemon interaction (local_image_exists) and
-    workspace constructors (which connect to Docker), but verify the actual
-    branching logic and argument forwarding.
-    """
-
-    @patch("benchmarks.utils.image_utils.local_image_exists", return_value=True)
-    def test_returns_docker_workspace_when_image_exists(self, _mock_exists):
-        from benchmarks.utils.image_utils import create_docker_workspace
-        from openhands.workspace import DockerWorkspace
-
-        with patch("openhands.workspace.DockerWorkspace", wraps=DockerWorkspace) as spy:
-            # wraps=DockerWorkspace would call the real constructor which needs Docker,
-            # so we set a return_value to avoid that while still checking isinstance
-            sentinel = MagicMock(spec=DockerWorkspace)
-            spy.return_value = sentinel
-            ws = create_docker_workspace(
-                agent_server_image="server:v1",
-                base_image="base:latest",
-                build_target="binary",
-            )
-            spy.assert_called_once_with(
-                server_image="server:v1",
-                working_dir="/workspace",
-                forward_env=[],
-            )
-            assert ws is sentinel
-
-    @patch("benchmarks.utils.image_utils.local_image_exists", return_value=False)
-    def test_returns_docker_dev_workspace_when_image_missing(self, _mock_exists):
-        from benchmarks.utils.image_utils import create_docker_workspace
-        from openhands.workspace import DockerDevWorkspace
-
-        sentinel = MagicMock(spec=DockerDevWorkspace)
-        with patch(
-            "openhands.workspace.DockerDevWorkspace", return_value=sentinel
-        ) as spy:
-            ws = create_docker_workspace(
-                agent_server_image="server:v1",
-                base_image="base:latest",
-                build_target="source-minimal",
-                forward_env=["FOO"],
-            )
-            spy.assert_called_once_with(
-                base_image="base:latest",
-                working_dir="/workspace",
-                target="source-minimal",
-                forward_env=["FOO"],
-            )
-            assert ws is sentinel
-
-    @patch.dict(os.environ, {"FORCE_BUILD": "1"})
-    @patch("benchmarks.utils.image_utils.local_image_exists", return_value=True)
-    def test_force_build_skips_detection(self, mock_exists):
-        from benchmarks.utils.image_utils import create_docker_workspace
-        from openhands.workspace import DockerDevWorkspace
-
-        sentinel = MagicMock(spec=DockerDevWorkspace)
-        with patch("openhands.workspace.DockerDevWorkspace", return_value=sentinel):
-            ws = create_docker_workspace(
-                agent_server_image="server:v1",
-                base_image="base:latest",
-                build_target="binary",
-            )
-            # Should build even though image exists locally
-            assert ws is sentinel
-            # local_image_exists should NOT have been called when FORCE_BUILD=1
-            mock_exists.assert_not_called()
-
-    @patch("benchmarks.utils.image_utils.local_image_exists", return_value=True)
-    def test_custom_working_dir_and_forward_env(self, _mock_exists):
-        """Verify custom parameters are forwarded correctly."""
-        from benchmarks.utils.image_utils import create_docker_workspace
-
-        with patch("openhands.workspace.DockerWorkspace") as MockDW:
-            create_docker_workspace(
-                agent_server_image="server:v1",
-                base_image="base:latest",
-                build_target="binary",
-                working_dir="/custom",
-                forward_env=["API_KEY", "TOKEN"],
-            )
-            MockDW.assert_called_once_with(
-                server_image="server:v1",
-                working_dir="/custom",
-                forward_env=["API_KEY", "TOKEN"],
-            )
-
-
-class TestEnsureLocalImage:
-    """Tests for ensure_local_image().
-
-    Uses real BuildOutput objects (not mocks) so validation logic in
-    ensure_local_image is exercised against actual data structures.
-    """
-
-    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=True)
-    @patch("benchmarks.utils.build_utils.build_image")
-    def test_returns_false_when_image_exists(self, mock_build, _mock_exists):
-        from benchmarks.utils.build_utils import ensure_local_image
-
-        result = ensure_local_image(
-            agent_server_image="server:v1",
-            base_image="base:latest",
-            custom_tag="mytag",
-        )
-        assert result is False
-        mock_build.assert_not_called()
-
-    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False)
-    @patch("benchmarks.utils.build_utils.build_image")
-    def test_returns_true_when_build_occurs(self, mock_build, _mock_exists):
-        from benchmarks.utils.build_utils import ensure_local_image
-
-        mock_build.return_value = BuildOutput(
-            base_image="base:latest",
-            tags=["server:v1"],
-            error=None,
-        )
-        result = ensure_local_image(
-            agent_server_image="server:v1",
-            base_image="base:latest",
-            custom_tag="mytag",
-        )
-        assert result is True
-        mock_build.assert_called_once()
-
-    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False)
-    @patch("benchmarks.utils.build_utils.build_image")
-    def test_raises_on_build_failure(self, mock_build, _mock_exists):
-        from benchmarks.utils.build_utils import ensure_local_image
-
-        mock_build.return_value = BuildOutput(
-            base_image="base:latest",
-            tags=[],
-            error="build exploded",
-        )
-        with pytest.raises(RuntimeError, match="Image build failed"):
-            ensure_local_image(
-                agent_server_image="server:v1",
-                base_image="base:latest",
-                custom_tag="mytag",
-            )
-
-    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False)
-    @patch("benchmarks.utils.build_utils.build_image")
-    def test_raises_on_tag_mismatch(self, mock_build, _mock_exists):
-        from benchmarks.utils.build_utils import ensure_local_image
-
-        mock_build.return_value = BuildOutput(
-            base_image="base:latest",
-            tags=["server:wrong-tag"],
-            error=None,
-        )
-        with pytest.raises(RuntimeError, match="do not include expected tag"):
-            ensure_local_image(
-                agent_server_image="server:v1",
-                base_image="base:latest",
-                custom_tag="mytag",
-            )
-
-    @patch.dict(os.environ, {"FORCE_BUILD": "1"})
-    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=True)
-    @patch("benchmarks.utils.build_utils.build_image")
-    def test_force_build_skips_detection(self, mock_build, mock_exists):
-        from benchmarks.utils.build_utils import ensure_local_image
-
-        mock_build.return_value = BuildOutput(
-            base_image="base:latest",
-            tags=["server:v1"],
-            error=None,
-        )
-        result = ensure_local_image(
-            agent_server_image="server:v1",
-            base_image="base:latest",
-            custom_tag="mytag",
-        )
-        assert result is True
-        mock_build.assert_called_once()
-        # local_image_exists should NOT have been called when FORCE_BUILD=1
-        mock_exists.assert_not_called()
-
-    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False)
-    @patch("benchmarks.utils.build_utils.build_image")
-    def test_passes_target_to_build_image(self, mock_build, _mock_exists):
-        """Verify the target parameter flows through to build_image."""
-        from benchmarks.utils.build_utils import ensure_local_image
-
-        mock_build.return_value = BuildOutput(
-            base_image="base:latest",
-            tags=["server:v1"],
-            error=None,
-        )
-        ensure_local_image(
-            agent_server_image="server:v1",
-            base_image="base:latest",
-            custom_tag="mytag",
-            target="binary",
-        )
-        _, kwargs = mock_build.call_args
-        assert kwargs["target"] == "binary"
-        assert kwargs["push"] is False
diff --git a/tests/test_llm_config.py b/tests/test_llm_config.py
index a244ff811..5d6cd348d 100644
--- a/tests/test_llm_config.py
+++ b/tests/test_llm_config.py
@@ -20,14 +20,14 @@ class TestLoadLLMConfigValidConfigs:
 
     def test_minimal_valid_config(self, tmp_path: Path) -> None:
         """Minimal config with only required 'model' field loads correctly."""
-        config = {"model": "gpt-4o"}
+        config = {"model": "gpt-4"}
         config_path = tmp_path / "config.json"
         config_path.write_text(json.dumps(config))
 
         llm = load_llm_config(config_path)
 
         assert isinstance(llm, LLM)
-        assert llm.model == "gpt-4o"
+        assert llm.model == "gpt-4"
 
     def test_full_valid_config(self, tmp_path: Path) -> None:
         """Config with all common fields loads correctly."""
@@ -211,7 +211,7 @@ def test_unreadable_file_raises_permission_error(self, tmp_path: Path) -> None:
     def test_config_with_extra_fields_loads(self, tmp_path: Path) -> None:
         """Config with unknown extra fields should still load (pydantic default)."""
         config = {
-            "model": "gpt-4o",
+            "model": "gpt-4",
             "unknown_field": "value",
             "another_unknown": 123,
         }
@@ -220,11 +220,11 @@ def test_config_with_extra_fields_loads(self, tmp_path: Path) -> None:
 
         # Should not raise - pydantic by default ignores extra fields
         llm = load_llm_config(config_path)
-        assert llm.model == "gpt-4o"
+        assert llm.model == "gpt-4"
 
     def test_unicode_in_config(self, tmp_path: Path) -> None:
         """Config with unicode characters loads correctly."""
-        config = {"model": "gpt-4o", "api_key": "key-with-émojis-🔑"}
+        config = {"model": "gpt-4", "api_key": "key-with-émojis-🔑"}
         config_path = tmp_path / "config.json"
         config_path.write_text(json.dumps(config, ensure_ascii=False))
 
diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk
index bde715c12..b498a6990 160000
--- a/vendor/software-agent-sdk
+++ b/vendor/software-agent-sdk
@@ -1 +1 @@
-Subproject commit bde715c12bce8fb112980529d5ad162f6b81a7f1
+Subproject commit b498a69908f7d06feb3921ffe05ff7e781a6f108

From 721ed01d7e4551b63868b34106ee5015dc316e71 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Wed, 11 Mar 2026 23:04:14 +0000
Subject: [PATCH 2/2] fix: update image_exists imports after revert

The revert of 2bfcc6c correctly removed the functions added in that commit,
but didn't account for the subsequent rename of image_exists to remote_image_exists
in commit 92efb47 (#471).

Since we're reverting to the pre-2bfcc6c state, the function should be called
image_exists, not remote_image_exists.

Changes:
- benchmarks/swebench/build_images.py: import and use image_exists
- benchmarks/gaia/build_images.py: import and use image_exists
- benchmarks/swtbench/build_eval_env_images.py: use alias (as it was pre-2bfcc6c)

This fixes the ImportError that was causing SWE-bench builds to fail.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 benchmarks/gaia/build_images.py              | 4 ++--
 benchmarks/swebench/build_images.py          | 4 ++--
 benchmarks/swtbench/build_eval_env_images.py | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmarks/gaia/build_images.py b/benchmarks/gaia/build_images.py
index 45aee7550..8d3969532 100644
--- a/benchmarks/gaia/build_images.py
+++ b/benchmarks/gaia/build_images.py
@@ -21,7 +21,7 @@
     get_build_parser,
     run_docker_build_layer,
 )
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import image_exists
 from openhands.sdk import get_logger
 
 
@@ -81,7 +81,7 @@ def tag_fn(_base: str) -> str:
     # inflating the image and causing runtime OOM crashes.
     _, git_sha, _ = _get_sdk_submodule_info()
     base_gaia_image = f"{args.image}:{git_sha[:7]}-gaia-{args.target}"
-    if not args.dry_run and remote_image_exists(base_gaia_image):
+    if not args.dry_run and image_exists(base_gaia_image):
         logger.info("Image %s already exists. Skipping build.", base_gaia_image)
         return 0
 
diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py
index 5ace5419b..cae96b871 100644
--- a/benchmarks/swebench/build_images.py
+++ b/benchmarks/swebench/build_images.py
@@ -22,7 +22,7 @@
     run_docker_build_layer,
 )
 from benchmarks.utils.dataset import get_dataset
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import image_exists
 from openhands.sdk import get_logger
 
 
@@ -94,7 +94,7 @@ def wrap_image(agent_image: str, push: bool = False) -> BuildOutput:
     For pushes, verify the base tag exists in the registry. For local builds,
     assume the tag is available locally or resolvable by Docker during buildx.
     """
-    if push and not remote_image_exists(agent_image):
+    if push and not image_exists(agent_image):
         return BuildOutput(
             base_image=agent_image,
             tags=[],
diff --git a/benchmarks/swtbench/build_eval_env_images.py b/benchmarks/swtbench/build_eval_env_images.py
index 2f0ea9862..fde30ed9c 100644
--- a/benchmarks/swtbench/build_eval_env_images.py
+++ b/benchmarks/swtbench/build_eval_env_images.py
@@ -12,7 +12,7 @@
 from benchmarks.swtbench.config import EVAL_DEFAULTS
 from benchmarks.swtbench.image_utils import ensure_swt_bench_repo
 from benchmarks.utils.dataset import get_dataset
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import image_exists as remote_image_exists
 from openhands.sdk import get_logger