From 79d308adb921dd1de7ed97165ab1e1ca04cfae98 Mon Sep 17 00:00:00 2001 From: openhands Date: Thu, 12 Mar 2026 22:01:59 +0000 Subject: [PATCH 1/3] Clarify Apptainer support in benchmark docs Co-authored-by: openhands --- AGENTS.md | 5 +++++ README.md | 15 ++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 0206a51d1..7c5dda499 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -100,4 +100,9 @@ When converting between OpenHands format and benchmark-specific formats: - Handle missing/optional fields gracefully - Log conversion warnings for debugging - Validate output format before evaluation + +# Workspace runtimes +- The benchmark CLI currently accepts only `docker` and `remote` for `--workspace` +- The vendored SDK also includes `openhands.workspace.ApptainerWorkspace`, but the benchmark repo does not yet wire it into `run_infer.py` +- On Docker-restricted systems, document Apptainer as an SDK capability rather than a benchmark CLI option unless code support is added diff --git a/README.md b/README.md index 1dcb9fcbf..5c1b75832 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ Inputs (forwarded to the SDK `run-eval.yml` workflow): ## Workspace Types -Benchmarks support two workspace types for running evaluations: +Benchmarks currently expose two workspace types in their CLIs: ### Docker Workspace (Default) @@ -191,6 +191,19 @@ Uses a [remote runtime API](https://openhands.dev/blog/evaluation-of-llms-as-cod - **Cons**: Requires pre-built images and API access - **Use case**: Large-scale evaluations, benchmarking runs +### Apptainer on Docker-Restricted Systems + +The vendored SDK includes `openhands.workspace.ApptainerWorkspace`, which can run a pre-built agent-server image without a local Docker daemon. It converts OCI/Docker images to Apptainer SIF files with `apptainer pull docker://...`, so it is a good fit for HPC or university environments where Docker is unavailable. + +However, the benchmark repo does **not** currently expose `apptainer` as a supported `--workspace` value. Today, the benchmark CLIs and metadata models only accept `docker` and `remote`. + +If your machine cannot run Docker, the supported paths today are: + +1. Use `--workspace remote` and point the benchmark at a runtime API. +2. Add a local integration that swaps benchmark `DockerWorkspace` usage for `ApptainerWorkspace` in the relevant `run_infer.py` implementation, using pre-built agent-server images. + +In other words: Apptainer is supported by the underlying SDK, but it is not yet a turnkey benchmark workspace option in this repository. + #### How Remote Runtime Works 1. **Pre-build Agent Images**: Agent-server images must be pre-built for a specific SDK commit (SHA) and pushed to a public container registry (e.g., `ghcr.io/openhands/eval-agent-server`) From 8d6113efd41f17e32aab28c8743021e25f408eec Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 13 Mar 2026 00:10:41 +0000 Subject: [PATCH 2/3] Add benchmark-side Apptainer workspace support Co-authored-by: openhands --- README.md | 36 +++++++---- benchmarks/commit0/run_infer.py | 22 +++++-- benchmarks/gaia/run_infer.py | 16 +++-- benchmarks/multiswebench/README.md | 14 +++++ benchmarks/multiswebench/run_infer.py | 17 ++++-- benchmarks/openagentsafety/run_infer.py | 53 ++++++++++------ benchmarks/swebench/README.md | 15 +++++ benchmarks/swebench/run_infer.py | 10 ++- benchmarks/swebenchmultimodal/README.md | 2 +- benchmarks/swebenchmultimodal/run_infer.py | 17 ++++-- benchmarks/swefficiency/README.md | 16 ++++- benchmarks/swefficiency/run_infer.py | 10 ++- benchmarks/swtbench/run_infer.py | 20 ++++-- benchmarks/utils/args_parser.py | 4 +- benchmarks/utils/image_utils.py | 53 +++++++++++++++- benchmarks/utils/models.py | 4 +- tests/test_image_utils.py | 71 +++++++++++++++++++++- tests/test_workspace_types.py | 9 +++ 18 files changed, 322 insertions(+), 67 deletions(-) create mode 100644 tests/test_workspace_types.py diff --git a/README.md b/README.md index 5c1b75832..d37faa7a4 100644 --- a/README.md +++ b/README.md @@ -173,7 +173,7 @@ Inputs (forwarded to the SDK `run-eval.yml` workflow): ## Workspace Types -Benchmarks currently expose two workspace types in their CLIs: +Benchmarks expose three workspace types in their CLIs: ### Docker Workspace (Default) @@ -183,26 +183,36 @@ Uses local Docker containers to run agent evaluations. Images are built locally - **Cons**: Resource-intensive on local machine, slower for large-scale evaluations - **Use case**: Development, testing, small-scale evaluations -### Remote Workspace +### Apptainer Workspace -Uses a [remote runtime API](https://openhands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) to provision containers in a cloud environment, enabling massive parallelization. +Uses `openhands.workspace.ApptainerWorkspace` from the vendored SDK to run a pre-built agent-server image without a local Docker daemon. The workspace pulls OCI/Docker images with `apptainer pull docker://...`, so it is a good fit for HPC or university environments where Docker is unavailable. -- **Pros**: Scalable to hundreds of parallel workers, no local resource constraints -- **Cons**: Requires pre-built images and API access -- **Use case**: Large-scale evaluations, benchmarking runs +- **Pros**: No Docker daemon required, works on many shared/HPC systems +- **Cons**: Requires a pre-built agent-server image in a registry; unlike Docker mode, it cannot build from a base image on the fly +- **Use case**: Local benchmark runs on Docker-restricted machines -### Apptainer on Docker-Restricted Systems +Example: -The vendored SDK includes `openhands.workspace.ApptainerWorkspace`, which can run a pre-built agent-server image without a local Docker daemon. It converts OCI/Docker images to Apptainer SIF files with `apptainer pull docker://...`, so it is a good fit for HPC or university environments where Docker is unavailable. +```bash +uv run swebench-infer path/to/llm_config.json \ + --dataset princeton-nlp/SWE-bench_Verified \ + --split test \ + --workspace apptainer +``` -However, the benchmark repo does **not** currently expose `apptainer` as a supported `--workspace` value. Today, the benchmark CLIs and metadata models only accept `docker` and `remote`. +Useful environment variables: +- `APPTAINER_CACHE_DIR`: Override the SIF/cache directory +- `APPTAINER_HOST_PORT`: Pin the local port used by the agent server +- `APPTAINER_USE_FAKEROOT=0`: Disable fakeroot if your cluster does not support it +- `APPTAINER_ENABLE_DOCKER_COMPAT=0`: Disable `--compat` for custom Apptainer behavior -If your machine cannot run Docker, the supported paths today are: +### Remote Workspace -1. Use `--workspace remote` and point the benchmark at a runtime API. -2. Add a local integration that swaps benchmark `DockerWorkspace` usage for `ApptainerWorkspace` in the relevant `run_infer.py` implementation, using pre-built agent-server images. +Uses a [remote runtime API](https://openhands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) to provision containers in a cloud environment, enabling massive parallelization. -In other words: Apptainer is supported by the underlying SDK, but it is not yet a turnkey benchmark workspace option in this repository. +- **Pros**: Scalable to hundreds of parallel workers, no local resource constraints +- **Cons**: Requires pre-built images and API access +- **Use case**: Large-scale evaluations, benchmarking runs #### How Remote Runtime Works diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index b9a08102a..e68014693 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -23,7 +23,11 @@ construct_eval_output_dir, get_default_on_result_writer, ) -from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.image_utils import ( + create_apptainer_workspace, + create_docker_workspace, + remote_image_exists, +) from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -186,18 +190,24 @@ def prepare_workspace( build_target = "source-minimal" logger.info(f"Using base docker image: {base_docker_image}") + custom_tag = extract_custom_tag(base_docker_image) + suffix = f"-{build_target}" if build_target != "binary" else "" + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + ) + if self.metadata.workspace_type == "docker": - custom_tag = extract_custom_tag(base_docker_image) - suffix = f"-{build_target}" if build_target != "binary" else "" - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" - ) workspace = create_docker_workspace( agent_server_image=agent_server_image, base_image=base_docker_image, build_target=build_target, forward_env=forward_env, ) + elif self.metadata.workspace_type == "apptainer": + workspace = create_apptainer_workspace( + agent_server_image=agent_server_image, + forward_env=forward_env, + ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") if not runtime_api_key: diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index a100a2cfb..c4229ac9a 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -27,7 +27,11 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.image_utils import ( + create_apptainer_workspace, + create_docker_workspace, + remote_image_exists, +) from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from benchmarks.utils.version import IMAGE_TAG_PREFIX @@ -155,16 +159,20 @@ def prepare_workspace( """ logger.info(f"Preparing workspace for instance {instance.id}") + agent_server_image = f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary" + if self.metadata.workspace_type == "docker": - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary" - ) workspace = create_docker_workspace( agent_server_image=agent_server_image, base_image="nikolaik/python-nodejs:python3.12-nodejs22", build_target="binary", forward_env=forward_env, ) + elif self.metadata.workspace_type == "apptainer": + workspace = create_apptainer_workspace( + agent_server_image=agent_server_image, + forward_env=forward_env, + ) elif self.metadata.workspace_type == "remote": # For workflow, use APIRemoteWorkspace with pre-built GAIA image # GAIA uses a universal agent server image (one image for all instances) diff --git a/benchmarks/multiswebench/README.md b/benchmarks/multiswebench/README.md index f259a64d1..20045a519 100644 --- a/benchmarks/multiswebench/README.md +++ b/benchmarks/multiswebench/README.md @@ -83,6 +83,20 @@ LANGUAGE=java uv run multi-swebench-infer path/to/llm_config.json \ --workspace docker ``` +### Apptainer Workspace (Local Evaluation without Docker) + +If Docker is unavailable, you can run against pre-built agent-server images with Apptainer: + +```bash +LANGUAGE=java uv run multi-swebench-infer path/to/llm_config.json \ + --dataset bytedance-research/Multi-SWE-Bench \ + --split java_verified \ + --workspace apptainer +``` + +Apptainer mode requires the agent-server images to already exist in a registry; it does not build them locally from base images. + + ### Remote Workspace (Scalable Cloud Evaluation) Remote workspace enables running evaluations at scale by using a cloud-based runtime API to provision containers. This is ideal for large-scale benchmark runs with high parallelization. diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py index 9d47d87e9..e74490c94 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -25,7 +25,10 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.image_utils import ( + create_apptainer_workspace, + remote_image_exists, +) from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -207,10 +210,11 @@ def prepare_workspace( # For non-binary targets, append target suffix suffix = f"-{build_target}" if build_target != "binary" else "" + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + ) + if self.metadata.workspace_type == "docker": - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" - ) ensure_local_image( agent_server_image=agent_server_image, base_image=official_docker_image, @@ -222,6 +226,11 @@ def prepare_workspace( working_dir="/workspace", forward_env=forward_env or [], ) + elif self.metadata.workspace_type == "apptainer": + workspace = create_apptainer_workspace( + agent_server_image=agent_server_image, + forward_env=forward_env, + ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") if not runtime_api_key: diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py index 3c55b5310..aa837c640 100644 --- a/benchmarks/openagentsafety/run_infer.py +++ b/benchmarks/openagentsafety/run_infer.py @@ -25,6 +25,7 @@ from benchmarks.utils.evaluation import Evaluation from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response +from benchmarks.utils.image_utils import create_apptainer_workspace from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from openhands.sdk import Agent, Conversation, Tool, get_logger @@ -212,7 +213,7 @@ def cleanup_docker_containers(): def setup_host_mapping(workspace): """Add the-agent-company.com host mapping inside the container.""" try: - gateway_ip = "172.17.0.1" + gateway_ip = os.getenv("THE_AGENT_COMPANY_HOST_IP", "172.17.0.1") logger.info(f"Adding host mapping: {gateway_ip} the-agent-company.com") workspace.execute_command( f"echo '{gateway_ip} the-agent-company.com' >> /etc/hosts" @@ -388,32 +389,44 @@ def prepare_workspace( resource_factor: int = 1, forward_env: list[str] | None = None, ) -> RemoteWorkspace: - """Create a fresh Docker workspace for this instance. + """Create a fresh workspace for this instance. Args: instance: The evaluation instance to prepare workspace for. resource_factor: Resource factor for runtime allocation (default: 1). forward_env: Environment variables to forward into the workspace. """ - # Try to build image on-the-fly, fall back to pre-built if build fails - try: - server_image = build_workspace_image() - except (subprocess.CalledProcessError, RuntimeError) as e: - logger.warning(f"On-the-fly build failed: {e}") + if self.metadata.workspace_type == "docker": + # Try to build image on-the-fly, fall back to pre-built if build fails + try: + server_image = build_workspace_image() + except (subprocess.CalledProcessError, RuntimeError) as e: + logger.warning(f"On-the-fly build failed: {e}") + server_image = get_image_name() + + if not check_image_exists(server_image): + raise RuntimeError( + f"On-the-fly build failed and pre-built image {server_image} does not exist" + ) + logger.info(f"Using pre-built image {server_image}") + + workspace = DockerWorkspace( + server_image=server_image, + platform="linux/amd64", + extra_ports=True, + forward_env=forward_env or [], + ) + elif self.metadata.workspace_type == "apptainer": server_image = get_image_name() - - if not check_image_exists(server_image): - raise RuntimeError( - f"On-the-fly build failed and pre-built image {server_image} does not exist" - ) - logger.info(f"Using pre-built image {server_image}") - - workspace = DockerWorkspace( - server_image=server_image, - platform="linux/amd64", - extra_ports=True, - forward_env=forward_env or [], - ) + workspace = create_apptainer_workspace( + agent_server_image=server_image, + forward_env=forward_env, + extra_ports=True, + ) + else: + raise ValueError( + f"Unsupported workspace_type: {self.metadata.workspace_type}" + ) # Setup host mapping for The Agent Company services setup_host_mapping(workspace) diff --git a/benchmarks/swebench/README.md b/benchmarks/swebench/README.md index a045aa3d6..9b3ef4355 100644 --- a/benchmarks/swebench/README.md +++ b/benchmarks/swebench/README.md @@ -58,6 +58,21 @@ uv run swebench-infer path/to/llm_config.json \ --workspace docker ``` +### Apptainer Workspace (Local Evaluation without Docker) + +If Docker is unavailable, you can use the same pre-built agent-server images with Apptainer: + +```bash +uv run swebench-infer path/to/llm_config.json \ + --dataset princeton-nlp/SWE-bench_Verified \ + --split test \ + --max-iterations 100 \ + --workspace apptainer +``` + +Unlike Docker mode, Apptainer mode cannot build images from base images on the fly. Build and push the agent-server images first, then run inference with `--workspace apptainer`. + + ### Remote Workspace (Scalable Cloud Evaluation) Remote workspace enables running evaluations at scale by using a cloud-based runtime API to provision containers. This is ideal for large-scale benchmark runs with high parallelization. diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index 065c3aecd..9cc94a5f2 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -25,7 +25,10 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.image_utils import ( + create_apptainer_workspace, + remote_image_exists, +) from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -183,6 +186,11 @@ def prepare_workspace( working_dir="/workspace", forward_env=forward_env or [], ) + elif self.metadata.workspace_type == "apptainer": + workspace = create_apptainer_workspace( + agent_server_image=agent_server_image, + forward_env=forward_env, + ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") if not runtime_api_key: diff --git a/benchmarks/swebenchmultimodal/README.md b/benchmarks/swebenchmultimodal/README.md index 08acf1c67..7b9540c8c 100644 --- a/benchmarks/swebenchmultimodal/README.md +++ b/benchmarks/swebenchmultimodal/README.md @@ -65,7 +65,7 @@ The benchmark uses the same configuration options as regular SWE-Bench: - `--split`: Dataset split (e.g., `test`, `dev`) - `--llm-config`: Path to LLM configuration file - `--max-iterations`: Maximum number of agent iterations -- `--workspace-type`: Either `docker` or `remote` +- `--workspace`: One of `docker`, `apptainer`, or `remote` - `--num-workers`: Number of parallel workers ## Environment Variables diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 7eb3d348f..0220c928e 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -23,7 +23,10 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.image_utils import ( + create_apptainer_workspace, + remote_image_exists, +) from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -160,10 +163,11 @@ def prepare_workspace( # For non-binary targets, append target suffix suffix = f"-{build_target}" if build_target != "binary" else "" + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + ) + if self.metadata.workspace_type == "docker": - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" - ) ensure_local_image( agent_server_image=agent_server_image, base_image=official_docker_image, @@ -175,6 +179,11 @@ def prepare_workspace( working_dir="/workspace", forward_env=forward_env or [], ) + elif self.metadata.workspace_type == "apptainer": + workspace = create_apptainer_workspace( + agent_server_image=agent_server_image, + forward_env=forward_env, + ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") if not runtime_api_key: diff --git a/benchmarks/swefficiency/README.md b/benchmarks/swefficiency/README.md index 0a207ea5c..bf4551705 100644 --- a/benchmarks/swefficiency/README.md +++ b/benchmarks/swefficiency/README.md @@ -54,6 +54,20 @@ uv run swefficiency-infer path/to/llm_config.json \ --workspace docker ``` + +### Apptainer Workspace (Local Evaluation without Docker) + +If Docker is unavailable, you can run SWE-fficiency with Apptainer against pre-built agent-server images: + +```bash +uv run swefficiency-infer path/to/llm_config.json \ + --dataset swefficiency/swefficiency \ + --split test \ + --workspace apptainer +``` + +Apptainer mode does not apply the Docker-only CPU and memory limits above; it expects the image to already be published in a registry. + ### Remote Workspace (Scalable Cloud Evaluation) ```bash @@ -75,7 +89,7 @@ After running inference, use the official SWE-fficiency benchmark evaluation too |--------|-------------|---------| | `--dataset` | HuggingFace dataset name | `swefficiency/swefficiency` | | `--split` | Dataset split | `test` | -| `--workspace` | Workspace type (`docker` or `remote`) | `docker` | +| `--workspace` | Workspace type (`docker`, `apptainer`, or `remote`) | `docker` | | `--num-workers` | Number of parallel workers | `4` | | `--max-iterations` | Maximum agent iterations | `500` | | `--num-cpus-per-worker` | CPUs per Docker container | `4` | diff --git a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py index bb3efd908..82bea0b3b 100644 --- a/benchmarks/swefficiency/run_infer.py +++ b/benchmarks/swefficiency/run_infer.py @@ -20,7 +20,10 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import remote_image_exists +from benchmarks.utils.image_utils import ( + create_apptainer_workspace, + remote_image_exists, +) from benchmarks.utils.models import ( EvalInstance, EvalMetadata, @@ -235,6 +238,11 @@ def prepare_workspace( workspace._cpu_group = cpu_group workspace._cpu_groups_queue = self.cpu_groups_queue + elif self.metadata.workspace_type == "apptainer": + workspace = create_apptainer_workspace( + agent_server_image=agent_server_image, + forward_env=forward_env, + ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") if not runtime_api_key: diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index bdfb7b13e..8ffdb0d38 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -17,7 +17,11 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists +from benchmarks.utils.image_utils import ( + create_apptainer_workspace, + create_docker_workspace, + remote_image_exists, +) from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -151,7 +155,7 @@ def prepare_workspace( forward_env: list[str] | None = None, ) -> RemoteWorkspace: """ - Create workspace based on workspace_type (docker or remote). + Create workspace based on workspace_type (docker, apptainer, or remote). Args: instance: The evaluation instance to prepare workspace for. @@ -169,16 +173,22 @@ def prepare_workspace( # For non-binary targets, append target suffix suffix = f"-{build_target}" if build_target != "binary" else "" + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" + ) + if self.metadata.workspace_type == "docker": - agent_server_image = ( - f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}" - ) workspace = create_docker_workspace( agent_server_image=agent_server_image, base_image=official_docker_image, build_target=build_target, forward_env=forward_env, ) + elif self.metadata.workspace_type == "apptainer": + workspace = create_apptainer_workspace( + agent_server_image=agent_server_image, + forward_env=forward_env, + ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") if not runtime_api_key: diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py index 698f2fd4f..8ade8f018 100644 --- a/benchmarks/utils/args_parser.py +++ b/benchmarks/utils/args_parser.py @@ -41,8 +41,8 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser: "--workspace", type=str, default="remote", - choices=["docker", "remote"], - help="Type of workspace to use (default: remote)", + choices=["docker", "apptainer", "remote"], + help="Type of workspace to use: docker, apptainer, or remote (default: remote)", ) parser.add_argument( "--max-iterations", diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py index 467074cb9..231094679 100644 --- a/benchmarks/utils/image_utils.py +++ b/benchmarks/utils/image_utils.py @@ -10,7 +10,11 @@ if TYPE_CHECKING: from openhands.sdk.workspace import TargetType - from openhands.workspace import DockerDevWorkspace, DockerWorkspace + from openhands.workspace import ( + ApptainerWorkspace, + DockerDevWorkspace, + DockerWorkspace, + ) import requests @@ -84,6 +88,53 @@ def local_image_exists(image: str) -> bool: return False +def _env_flag(name: str, default: bool) -> bool: + value = os.getenv(name) + if value is None: + return default + return value.lower() in ("1", "true", "yes", "on") + + +def create_apptainer_workspace( + agent_server_image: str, + working_dir: str = "/workspace", + forward_env: list[str] | None = None, + extra_ports: bool = False, +) -> ApptainerWorkspace: + """Create an Apptainer workspace from a pre-built agent-server image. + + Unlike DockerDevWorkspace, ApptainerWorkspace cannot build images from a + base image on the fly. The image must already exist in a container registry + that `apptainer pull docker://...` can access. + """ + from openhands.workspace import ApptainerWorkspace + + if not remote_image_exists(agent_server_image): + raise RuntimeError( + f"Agent server image {agent_server_image} does not exist in container registry. " + "Apptainer workspace requires a pre-built image that can be pulled " + "with Apptainer." + ) + + logger.info(f"Using Apptainer workspace with image {agent_server_image}") + + host_port = os.getenv("APPTAINER_HOST_PORT") + cache_dir = os.getenv("APPTAINER_CACHE_DIR") + mount_dir = os.getenv("APPTAINER_MOUNT_DIR") + + return ApptainerWorkspace( + server_image=agent_server_image, + working_dir=working_dir, + forward_env=forward_env or [], + extra_ports=extra_ports, + host_port=int(host_port) if host_port else None, + cache_dir=cache_dir or None, + mount_dir=mount_dir or None, + use_fakeroot=_env_flag("APPTAINER_USE_FAKEROOT", True), + enable_docker_compat=_env_flag("APPTAINER_ENABLE_DOCKER_COMPAT", True), + ) + + def create_docker_workspace( agent_server_image: str, base_image: str, diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index b1845e049..165d5566b 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -52,9 +52,9 @@ class EvalMetadata(BaseModel): ge=0, description="Maximum number of retries for instances that throw exceptions", ) - workspace_type: Literal["docker", "remote"] = Field( + workspace_type: Literal["docker", "apptainer", "remote"] = Field( default="docker", - description="Type of workspace to use, e.g., 'docker' or 'remote'", + description="Type of workspace to use, e.g., 'docker', 'apptainer', or 'remote'", ) base_resource_factor: int = Field( default=1, diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py index c46830cb6..3daca9553 100644 --- a/tests/test_image_utils.py +++ b/tests/test_image_utils.py @@ -1,7 +1,8 @@ """Tests for image_utils and build_utils helper functions. -Tests cover local_image_exists(), create_docker_workspace(), and ensure_local_image() -which centralize Docker image detection and build logic across all benchmarks. +Tests cover local_image_exists(), create_docker_workspace(), +create_apptainer_workspace(), and ensure_local_image() which centralize +container image detection and workspace creation across benchmarks. """ import os @@ -142,6 +143,72 @@ def test_custom_working_dir_and_forward_env(self, _mock_exists): ) +class TestCreateApptainerWorkspace: + """Tests for create_apptainer_workspace().""" + + @patch("benchmarks.utils.image_utils.remote_image_exists", return_value=True) + def test_returns_apptainer_workspace_when_image_exists(self, _mock_exists): + from benchmarks.utils.image_utils import create_apptainer_workspace + from openhands.workspace import ApptainerWorkspace + + sentinel = MagicMock(spec=ApptainerWorkspace) + with patch( + "openhands.workspace.ApptainerWorkspace", return_value=sentinel + ) as spy: + ws = create_apptainer_workspace( + agent_server_image="ghcr.io/example/agent-server:v1", + forward_env=["API_KEY"], + extra_ports=True, + ) + spy.assert_called_once_with( + server_image="ghcr.io/example/agent-server:v1", + working_dir="/workspace", + forward_env=["API_KEY"], + extra_ports=True, + host_port=None, + cache_dir=None, + mount_dir=None, + use_fakeroot=True, + enable_docker_compat=True, + ) + assert ws is sentinel + + @patch("benchmarks.utils.image_utils.remote_image_exists", return_value=False) + def test_raises_when_image_missing_from_registry(self, _mock_exists): + from benchmarks.utils.image_utils import create_apptainer_workspace + + with pytest.raises(RuntimeError, match="pre-built image"): + create_apptainer_workspace("ghcr.io/example/agent-server:missing") + + @patch.dict( + os.environ, + { + "APPTAINER_HOST_PORT": "8123", + "APPTAINER_CACHE_DIR": "/tmp/apptainer-cache", + "APPTAINER_MOUNT_DIR": "/tmp/workspace-mount", + "APPTAINER_USE_FAKEROOT": "0", + "APPTAINER_ENABLE_DOCKER_COMPAT": "false", + }, + ) + @patch("benchmarks.utils.image_utils.remote_image_exists", return_value=True) + def test_forwards_apptainer_env_configuration(self, _mock_exists): + from benchmarks.utils.image_utils import create_apptainer_workspace + + with patch("openhands.workspace.ApptainerWorkspace") as mock_workspace: + create_apptainer_workspace("ghcr.io/example/agent-server:v1") + mock_workspace.assert_called_once_with( + server_image="ghcr.io/example/agent-server:v1", + working_dir="/workspace", + forward_env=[], + extra_ports=False, + host_port=8123, + cache_dir="/tmp/apptainer-cache", + mount_dir="/tmp/workspace-mount", + use_fakeroot=False, + enable_docker_compat=False, + ) + + class TestEnsureLocalImage: """Tests for ensure_local_image(). diff --git a/tests/test_workspace_types.py b/tests/test_workspace_types.py new file mode 100644 index 000000000..bab8f27ce --- /dev/null +++ b/tests/test_workspace_types.py @@ -0,0 +1,9 @@ +"""Tests for shared workspace type configuration.""" + +from benchmarks.utils.args_parser import get_parser + + +def test_parser_accepts_apptainer_workspace() -> None: + parser = get_parser(add_llm_config=False) + args = parser.parse_args(["--workspace", "apptainer"]) + assert args.workspace == "apptainer" From 2f2385388abc5f6fee5e735c8d94ce0de685403c Mon Sep 17 00:00:00 2001 From: openhands Date: Mon, 16 Mar 2026 13:57:15 +0000 Subject: [PATCH 3/3] Improve Apptainer image guidance and cache reuse Co-authored-by: openhands --- README.md | 9 ++++- benchmarks/swebench/README.md | 23 ++++++++++-- benchmarks/utils/image_utils.py | 64 +++++++++++++++++++++++---------- tests/test_image_utils.py | 34 ++++++++++++++++-- 4 files changed, 106 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index d37faa7a4..9cf03737f 100644 --- a/README.md +++ b/README.md @@ -188,9 +188,16 @@ Uses local Docker containers to run agent evaluations. Images are built locally Uses `openhands.workspace.ApptainerWorkspace` from the vendored SDK to run a pre-built agent-server image without a local Docker daemon. The workspace pulls OCI/Docker images with `apptainer pull docker://...`, so it is a good fit for HPC or university environments where Docker is unavailable. - **Pros**: No Docker daemon required, works on many shared/HPC systems -- **Cons**: Requires a pre-built agent-server image in a registry; unlike Docker mode, it cannot build from a base image on the fly +- **Cons**: Requires a pre-built agent-server image in a registry or a cached SIF file; unlike Docker mode, it cannot build from a base image on the fly - **Use case**: Local benchmark runs on Docker-restricted machines +Typical flow: +1. Run the benchmark's `build_images.py` script with `--push` from a Docker-capable machine or CI runner. +2. Run the corresponding `*-infer` command with `--workspace apptainer` from the Docker-restricted machine. +3. Reuse the cached SIF in `APPTAINER_CACHE_DIR` on subsequent runs. + +If you build without `--push`, the images only exist in the local container daemon and Apptainer will not be able to use them. + Example: ```bash diff --git a/benchmarks/swebench/README.md b/benchmarks/swebench/README.md index 9b3ef4355..4eacd0137 100644 --- a/benchmarks/swebench/README.md +++ b/benchmarks/swebench/README.md @@ -60,7 +60,26 @@ uv run swebench-infer path/to/llm_config.json \ ### Apptainer Workspace (Local Evaluation without Docker) -If Docker is unavailable, you can use the same pre-built agent-server images with Apptainer: +If Docker is unavailable, you can run SWE-Bench with Apptainer, but the required agent-server images must already be available to Apptainer. + +#### Step 1: Build and push the agent-server images + +Build the images from a Docker-capable machine or CI runner, and include `--push` so they end up in a registry Apptainer can pull from: + +```bash +uv run python -m benchmarks.swebench.build_images \ + --dataset princeton-nlp/SWE-bench_Verified \ + --split test \ + --image ghcr.io/openhands/eval-agent-server \ + --target source-minimal \ + --push +``` + +If you run `build_images.py` without `--push`, the resulting images only exist in the local container daemon and Apptainer cannot use them. + +If you build the images from a different checkout or SDK revision than the one used for inference, set `IMAGE_TAG_PREFIX` during inference so it matches the tag prefix used during the build. + +#### Step 2: Run inference with Apptainer ```bash uv run swebench-infer path/to/llm_config.json \ @@ -70,7 +89,7 @@ uv run swebench-infer path/to/llm_config.json \ --workspace apptainer ``` -Unlike Docker mode, Apptainer mode cannot build images from base images on the fly. Build and push the agent-server images first, then run inference with `--workspace apptainer`. +Apptainer can either pull the image from the registry on first use or reuse the cached SIF in `APPTAINER_CACHE_DIR` on subsequent runs. ### Remote Workspace (Scalable Cloud Evaluation) diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py index 231094679..396b024b8 100644 --- a/benchmarks/utils/image_utils.py +++ b/benchmarks/utils/image_utils.py @@ -5,6 +5,7 @@ import os import subprocess import sys +from pathlib import Path from typing import TYPE_CHECKING @@ -95,6 +96,20 @@ def _env_flag(name: str, default: bool) -> bool: return value.lower() in ("1", "true", "yes", "on") +def get_apptainer_cache_dir() -> str: + """Return the Apptainer cache directory used by the workspace.""" + return os.getenv("APPTAINER_CACHE_DIR") or str(Path.home() / ".apptainer_cache") + + +def get_apptainer_sif_path( + agent_server_image: str, cache_dir: str | None = None +) -> str: + """Return the cached SIF path ApptainerWorkspace would use for an image.""" + resolved_cache_dir = cache_dir or get_apptainer_cache_dir() + sif_name = agent_server_image.replace(":", "_").replace("/", "_") + ".sif" + return str(Path(resolved_cache_dir) / sif_name) + + def create_apptainer_workspace( agent_server_image: str, working_dir: str = "/workspace", @@ -105,34 +120,45 @@ def create_apptainer_workspace( Unlike DockerDevWorkspace, ApptainerWorkspace cannot build images from a base image on the fly. The image must already exist in a container registry - that `apptainer pull docker://...` can access. + that `apptainer pull docker://...` can access, or the corresponding SIF file + must already be present in the configured Apptainer cache. """ from openhands.workspace import ApptainerWorkspace + host_port = os.getenv("APPTAINER_HOST_PORT") + cache_dir = get_apptainer_cache_dir() + mount_dir = os.getenv("APPTAINER_MOUNT_DIR") + sif_path = get_apptainer_sif_path(agent_server_image, cache_dir) + + workspace_kwargs = { + "working_dir": working_dir, + "forward_env": forward_env or [], + "extra_ports": extra_ports, + "host_port": int(host_port) if host_port else None, + "cache_dir": cache_dir, + "mount_dir": mount_dir or None, + "use_fakeroot": _env_flag("APPTAINER_USE_FAKEROOT", True), + "enable_docker_compat": _env_flag("APPTAINER_ENABLE_DOCKER_COMPAT", True), + } + + if Path(sif_path).exists(): + logger.info( + "Using cached Apptainer SIF %s for image %s", sif_path, agent_server_image + ) + return ApptainerWorkspace(sif_file=sif_path, **workspace_kwargs) + if not remote_image_exists(agent_server_image): raise RuntimeError( f"Agent server image {agent_server_image} does not exist in container registry. " - "Apptainer workspace requires a pre-built image that can be pulled " - "with Apptainer." + "Apptainer can only use a registry-pullable image or an existing cached SIF file. " + "If you built images with a benchmark build_images.py script, re-run it with --push " + "from a Docker-capable machine or CI; local-only builds are not enough. " + "If the images were built from a different checkout, make sure IMAGE_TAG_PREFIX " + "matches the tag prefix used during the build." ) logger.info(f"Using Apptainer workspace with image {agent_server_image}") - - host_port = os.getenv("APPTAINER_HOST_PORT") - cache_dir = os.getenv("APPTAINER_CACHE_DIR") - mount_dir = os.getenv("APPTAINER_MOUNT_DIR") - - return ApptainerWorkspace( - server_image=agent_server_image, - working_dir=working_dir, - forward_env=forward_env or [], - extra_ports=extra_ports, - host_port=int(host_port) if host_port else None, - cache_dir=cache_dir or None, - mount_dir=mount_dir or None, - use_fakeroot=_env_flag("APPTAINER_USE_FAKEROOT", True), - enable_docker_compat=_env_flag("APPTAINER_ENABLE_DOCKER_COMPAT", True), - ) + return ApptainerWorkspace(server_image=agent_server_image, **workspace_kwargs) def create_docker_workspace( diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py index 3daca9553..684d1f6a2 100644 --- a/tests/test_image_utils.py +++ b/tests/test_image_utils.py @@ -7,6 +7,7 @@ import os import subprocess +from pathlib import Path from unittest.mock import MagicMock, patch import pytest @@ -166,18 +167,47 @@ def test_returns_apptainer_workspace_when_image_exists(self, _mock_exists): forward_env=["API_KEY"], extra_ports=True, host_port=None, - cache_dir=None, + cache_dir=str(Path.home() / ".apptainer_cache"), mount_dir=None, use_fakeroot=True, enable_docker_compat=True, ) assert ws is sentinel + def test_uses_cached_sif_without_registry_lookup(self, tmp_path, monkeypatch): + from benchmarks.utils.image_utils import ( + create_apptainer_workspace, + get_apptainer_sif_path, + ) + + image = "ghcr.io/example/agent-server:v1" + monkeypatch.setenv("APPTAINER_CACHE_DIR", str(tmp_path)) + sif_path = get_apptainer_sif_path(image, str(tmp_path)) + Path(sif_path).write_text("cached") + + with ( + patch("benchmarks.utils.image_utils.remote_image_exists") as mock_exists, + patch("openhands.workspace.ApptainerWorkspace") as mock_workspace, + ): + create_apptainer_workspace(image) + mock_exists.assert_not_called() + mock_workspace.assert_called_once_with( + sif_file=sif_path, + working_dir="/workspace", + forward_env=[], + extra_ports=False, + host_port=None, + cache_dir=str(tmp_path), + mount_dir=None, + use_fakeroot=True, + enable_docker_compat=True, + ) + @patch("benchmarks.utils.image_utils.remote_image_exists", return_value=False) def test_raises_when_image_missing_from_registry(self, _mock_exists): from benchmarks.utils.image_utils import create_apptainer_workspace - with pytest.raises(RuntimeError, match="pre-built image"): + with pytest.raises(RuntimeError, match="local-only builds are not enough"): create_apptainer_workspace("ghcr.io/example/agent-server:missing") @patch.dict(