From 79d308adb921dd1de7ed97165ab1e1ca04cfae98 Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Thu, 12 Mar 2026 22:01:59 +0000
Subject: [PATCH 1/3] Clarify Apptainer support in benchmark docs

Co-authored-by: openhands <openhands@all-hands.dev>
---
 AGENTS.md |  5 +++++
 README.md | 15 ++++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/AGENTS.md b/AGENTS.md
index 0206a51d1..7c5dda499 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -100,4 +100,9 @@ When converting between OpenHands format and benchmark-specific formats:
 - Handle missing/optional fields gracefully
 - Log conversion warnings for debugging
 - Validate output format before evaluation
+
+# Workspace runtimes
+- The benchmark CLI currently accepts only `docker` and `remote` for `--workspace`
+- The vendored SDK also includes `openhands.workspace.ApptainerWorkspace`, but the benchmark repo does not yet wire it into `run_infer.py`
+- On Docker-restricted systems, document Apptainer as an SDK capability rather than a benchmark CLI option unless code support is added
 </BENCHMARK_SPECIFIC>
diff --git a/README.md b/README.md
index 1dcb9fcbf..5c1b75832 100644
--- a/README.md
+++ b/README.md
@@ -173,7 +173,7 @@ Inputs (forwarded to the SDK `run-eval.yml` workflow):
 
 ## Workspace Types
 
-Benchmarks support two workspace types for running evaluations:
+Benchmarks currently expose two workspace types in their CLIs:
 
 ### Docker Workspace (Default)
 
@@ -191,6 +191,19 @@ Uses a [remote runtime API](https://openhands.dev/blog/evaluation-of-llms-as-cod
 - **Cons**: Requires pre-built images and API access
 - **Use case**: Large-scale evaluations, benchmarking runs
 
+### Apptainer on Docker-Restricted Systems
+
+The vendored SDK includes `openhands.workspace.ApptainerWorkspace`, which can run a pre-built agent-server image without a local Docker daemon. It converts OCI/Docker images to Apptainer SIF files with `apptainer pull docker://...`, so it is a good fit for HPC or university environments where Docker is unavailable.
+
+However, the benchmark repo does **not** currently expose `apptainer` as a supported `--workspace` value. Today, the benchmark CLIs and metadata models only accept `docker` and `remote`.
+
+If your machine cannot run Docker, the supported paths today are:
+
+1. Use `--workspace remote` and point the benchmark at a runtime API.
+2. Add a local integration that swaps benchmark `DockerWorkspace` usage for `ApptainerWorkspace` in the relevant `run_infer.py` implementation, using pre-built agent-server images.
+
+In other words: Apptainer is supported by the underlying SDK, but it is not yet a turnkey benchmark workspace option in this repository.
+
 #### How Remote Runtime Works
 
 1. **Pre-build Agent Images**: Agent-server images must be pre-built for a specific SDK commit (SHA) and pushed to a public container registry (e.g., `ghcr.io/openhands/eval-agent-server`)

From 8d6113efd41f17e32aab28c8743021e25f408eec Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Fri, 13 Mar 2026 00:10:41 +0000
Subject: [PATCH 2/3] Add benchmark-side Apptainer workspace support

Co-authored-by: openhands <openhands@all-hands.dev>
---
 README.md                                  | 36 +++++++----
 benchmarks/commit0/run_infer.py            | 22 +++++--
 benchmarks/gaia/run_infer.py               | 16 +++--
 benchmarks/multiswebench/README.md         | 14 +++++
 benchmarks/multiswebench/run_infer.py      | 17 ++++--
 benchmarks/openagentsafety/run_infer.py    | 53 ++++++++++------
 benchmarks/swebench/README.md              | 15 +++++
 benchmarks/swebench/run_infer.py           | 10 ++-
 benchmarks/swebenchmultimodal/README.md    |  2 +-
 benchmarks/swebenchmultimodal/run_infer.py | 17 ++++--
 benchmarks/swefficiency/README.md          | 16 ++++-
 benchmarks/swefficiency/run_infer.py       | 10 ++-
 benchmarks/swtbench/run_infer.py           | 20 ++++--
 benchmarks/utils/args_parser.py            |  4 +-
 benchmarks/utils/image_utils.py            | 53 +++++++++++++++-
 benchmarks/utils/models.py                 |  4 +-
 tests/test_image_utils.py                  | 71 +++++++++++++++++++++-
 tests/test_workspace_types.py              |  9 +++
 18 files changed, 322 insertions(+), 67 deletions(-)
 create mode 100644 tests/test_workspace_types.py

diff --git a/README.md b/README.md
index 5c1b75832..d37faa7a4 100644
--- a/README.md
+++ b/README.md
@@ -173,7 +173,7 @@ Inputs (forwarded to the SDK `run-eval.yml` workflow):
 
 ## Workspace Types
 
-Benchmarks currently expose two workspace types in their CLIs:
+Benchmarks expose three workspace types in their CLIs:
 
 ### Docker Workspace (Default)
 
@@ -183,26 +183,36 @@ Uses local Docker containers to run agent evaluations. Images are built locally
 - **Cons**: Resource-intensive on local machine, slower for large-scale evaluations
 - **Use case**: Development, testing, small-scale evaluations
 
-### Remote Workspace
+### Apptainer Workspace
 
-Uses a [remote runtime API](https://openhands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) to provision containers in a cloud environment, enabling massive parallelization.
+Uses `openhands.workspace.ApptainerWorkspace` from the vendored SDK to run a pre-built agent-server image without a local Docker daemon. The workspace pulls OCI/Docker images with `apptainer pull docker://...`, so it is a good fit for HPC or university environments where Docker is unavailable.
 
-- **Pros**: Scalable to hundreds of parallel workers, no local resource constraints
-- **Cons**: Requires pre-built images and API access
-- **Use case**: Large-scale evaluations, benchmarking runs
+- **Pros**: No Docker daemon required, works on many shared/HPC systems
+- **Cons**: Requires a pre-built agent-server image in a registry; unlike Docker mode, it cannot build from a base image on the fly
+- **Use case**: Local benchmark runs on Docker-restricted machines
 
-### Apptainer on Docker-Restricted Systems
+Example:
 
-The vendored SDK includes `openhands.workspace.ApptainerWorkspace`, which can run a pre-built agent-server image without a local Docker daemon. It converts OCI/Docker images to Apptainer SIF files with `apptainer pull docker://...`, so it is a good fit for HPC or university environments where Docker is unavailable.
+```bash
+uv run swebench-infer path/to/llm_config.json \
+    --dataset princeton-nlp/SWE-bench_Verified \
+    --split test \
+    --workspace apptainer
+```
 
-However, the benchmark repo does **not** currently expose `apptainer` as a supported `--workspace` value. Today, the benchmark CLIs and metadata models only accept `docker` and `remote`.
+Useful environment variables:
+- `APPTAINER_CACHE_DIR`: Override the SIF/cache directory
+- `APPTAINER_HOST_PORT`: Pin the local port used by the agent server
+- `APPTAINER_USE_FAKEROOT=0`: Disable fakeroot if your cluster does not support it
+- `APPTAINER_ENABLE_DOCKER_COMPAT=0`: Run containers without Apptainer's `--compat` (Docker-compatibility) flag
 
-If your machine cannot run Docker, the supported paths today are:
+### Remote Workspace
 
-1. Use `--workspace remote` and point the benchmark at a runtime API.
-2. Add a local integration that swaps benchmark `DockerWorkspace` usage for `ApptainerWorkspace` in the relevant `run_infer.py` implementation, using pre-built agent-server images.
+Uses a [remote runtime API](https://openhands.dev/blog/evaluation-of-llms-as-coding-agents-on-swe-bench-at-30x-speed) to provision containers in a cloud environment, enabling massive parallelization.
 
-In other words: Apptainer is supported by the underlying SDK, but it is not yet a turnkey benchmark workspace option in this repository.
+- **Pros**: Scalable to hundreds of parallel workers, no local resource constraints
+- **Cons**: Requires pre-built images and API access
+- **Use case**: Large-scale evaluations, benchmarking runs
 
 #### How Remote Runtime Works
 
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index b9a08102a..e68014693 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -23,7 +23,11 @@
     construct_eval_output_dir,
     get_default_on_result_writer,
 )
-from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.image_utils import (
+    create_apptainer_workspace,
+    create_docker_workspace,
+    remote_image_exists,
+)
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -186,18 +190,24 @@ def prepare_workspace(
         build_target = "source-minimal"
         logger.info(f"Using base docker image: {base_docker_image}")
 
+        custom_tag = extract_custom_tag(base_docker_image)
+        suffix = f"-{build_target}" if build_target != "binary" else ""
+        agent_server_image = (
+            f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+        )
+
         if self.metadata.workspace_type == "docker":
-            custom_tag = extract_custom_tag(base_docker_image)
-            suffix = f"-{build_target}" if build_target != "binary" else ""
-            agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
-            )
             workspace = create_docker_workspace(
                 agent_server_image=agent_server_image,
                 base_image=base_docker_image,
                 build_target=build_target,
                 forward_env=forward_env,
             )
+        elif self.metadata.workspace_type == "apptainer":
+            workspace = create_apptainer_workspace(
+                agent_server_image=agent_server_image,
+                forward_env=forward_env,
+            )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
             if not runtime_api_key:
diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index a100a2cfb..c4229ac9a 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -27,7 +27,11 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.image_utils import (
+    create_apptainer_workspace,
+    create_docker_workspace,
+    remote_image_exists,
+)
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
 from benchmarks.utils.version import IMAGE_TAG_PREFIX
@@ -155,16 +159,20 @@ def prepare_workspace(
         """
         logger.info(f"Preparing workspace for instance {instance.id}")
 
+        agent_server_image = f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
+
         if self.metadata.workspace_type == "docker":
-            agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
-            )
             workspace = create_docker_workspace(
                 agent_server_image=agent_server_image,
                 base_image="nikolaik/python-nodejs:python3.12-nodejs22",
                 build_target="binary",
                 forward_env=forward_env,
             )
+        elif self.metadata.workspace_type == "apptainer":
+            workspace = create_apptainer_workspace(
+                agent_server_image=agent_server_image,
+                forward_env=forward_env,
+            )
         elif self.metadata.workspace_type == "remote":
             # For workflow, use APIRemoteWorkspace with pre-built GAIA image
             # GAIA uses a universal agent server image (one image for all instances)
diff --git a/benchmarks/multiswebench/README.md b/benchmarks/multiswebench/README.md
index f259a64d1..20045a519 100644
--- a/benchmarks/multiswebench/README.md
+++ b/benchmarks/multiswebench/README.md
@@ -83,6 +83,20 @@ LANGUAGE=java uv run multi-swebench-infer path/to/llm_config.json \
     --workspace docker
 ```
 
+### Apptainer Workspace (Local Evaluation without Docker)
+
+If Docker is unavailable, you can run against pre-built agent-server images with Apptainer:
+
+```bash
+LANGUAGE=java uv run multi-swebench-infer path/to/llm_config.json \
+    --dataset bytedance-research/Multi-SWE-Bench \
+    --split java_verified \
+    --workspace apptainer
+```
+
+Apptainer mode requires the agent-server images to already exist in a registry; it does not build them locally from base images.
+
+
 ### Remote Workspace (Scalable Cloud Evaluation)
 
 Remote workspace enables running evaluations at scale by using a cloud-based runtime API to provision containers. This is ideal for large-scale benchmark runs with high parallelization.
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index 9d47d87e9..e74490c94 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -25,7 +25,10 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import (
+    create_apptainer_workspace,
+    remote_image_exists,
+)
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -207,10 +210,11 @@ def prepare_workspace(
         # For non-binary targets, append target suffix
         suffix = f"-{build_target}" if build_target != "binary" else ""
 
+        agent_server_image = (
+            f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+        )
+
         if self.metadata.workspace_type == "docker":
-            agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
-            )
             ensure_local_image(
                 agent_server_image=agent_server_image,
                 base_image=official_docker_image,
@@ -222,6 +226,11 @@ def prepare_workspace(
                 working_dir="/workspace",
                 forward_env=forward_env or [],
             )
+        elif self.metadata.workspace_type == "apptainer":
+            workspace = create_apptainer_workspace(
+                agent_server_image=agent_server_image,
+                forward_env=forward_env,
+            )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
             if not runtime_api_key:
diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index 3c55b5310..aa837c640 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -25,6 +25,7 @@
 from benchmarks.utils.evaluation import Evaluation
 from benchmarks.utils.evaluation_utils import construct_eval_output_dir
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
+from benchmarks.utils.image_utils import create_apptainer_workspace
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
 from openhands.sdk import Agent, Conversation, Tool, get_logger
@@ -212,7 +213,7 @@ def cleanup_docker_containers():
 def setup_host_mapping(workspace):
     """Add the-agent-company.com host mapping inside the container."""
     try:
-        gateway_ip = "172.17.0.1"
+        gateway_ip = os.getenv("THE_AGENT_COMPANY_HOST_IP", "172.17.0.1")
         logger.info(f"Adding host mapping: {gateway_ip} the-agent-company.com")
         workspace.execute_command(
             f"echo '{gateway_ip} the-agent-company.com' >> /etc/hosts"
@@ -388,32 +389,44 @@ def prepare_workspace(
         resource_factor: int = 1,
         forward_env: list[str] | None = None,
     ) -> RemoteWorkspace:
-        """Create a fresh Docker workspace for this instance.
+        """Create a fresh workspace for this instance.
 
         Args:
             instance: The evaluation instance to prepare workspace for.
             resource_factor: Resource factor for runtime allocation (default: 1).
             forward_env: Environment variables to forward into the workspace.
         """
-        # Try to build image on-the-fly, fall back to pre-built if build fails
-        try:
-            server_image = build_workspace_image()
-        except (subprocess.CalledProcessError, RuntimeError) as e:
-            logger.warning(f"On-the-fly build failed: {e}")
+        if self.metadata.workspace_type == "docker":
+            # Try to build image on-the-fly, fall back to pre-built if build fails
+            try:
+                server_image = build_workspace_image()
+            except (subprocess.CalledProcessError, RuntimeError) as e:
+                logger.warning(f"On-the-fly build failed: {e}")
+                server_image = get_image_name()
+
+                if not check_image_exists(server_image):
+                    raise RuntimeError(
+                        f"On-the-fly build failed and pre-built image {server_image} does not exist"
+                    )
+                logger.info(f"Using pre-built image {server_image}")
+
+            workspace = DockerWorkspace(
+                server_image=server_image,
+                platform="linux/amd64",
+                extra_ports=True,
+                forward_env=forward_env or [],
+            )
+        elif self.metadata.workspace_type == "apptainer":
             server_image = get_image_name()
-
-            if not check_image_exists(server_image):
-                raise RuntimeError(
-                    f"On-the-fly build failed and pre-built image {server_image} does not exist"
-                )
-            logger.info(f"Using pre-built image {server_image}")
-
-        workspace = DockerWorkspace(
-            server_image=server_image,
-            platform="linux/amd64",
-            extra_ports=True,
-            forward_env=forward_env or [],
-        )
+            workspace = create_apptainer_workspace(
+                agent_server_image=server_image,
+                forward_env=forward_env,
+                extra_ports=True,
+            )
+        else:
+            raise ValueError(
+                f"Unsupported workspace_type: {self.metadata.workspace_type}"
+            )
 
         # Setup host mapping for The Agent Company services
         setup_host_mapping(workspace)
diff --git a/benchmarks/swebench/README.md b/benchmarks/swebench/README.md
index a045aa3d6..9b3ef4355 100644
--- a/benchmarks/swebench/README.md
+++ b/benchmarks/swebench/README.md
@@ -58,6 +58,21 @@ uv run swebench-infer path/to/llm_config.json \
     --workspace docker
 ```
 
+### Apptainer Workspace (Local Evaluation without Docker)
+
+If Docker is unavailable, you can use the same pre-built agent-server images with Apptainer:
+
+```bash
+uv run swebench-infer path/to/llm_config.json \
+    --dataset princeton-nlp/SWE-bench_Verified \
+    --split test \
+    --max-iterations 100 \
+    --workspace apptainer
+```
+
+Unlike Docker mode, Apptainer mode cannot build images from base images on the fly. Build and push the agent-server images first, then run inference with `--workspace apptainer`.
+
+
 ### Remote Workspace (Scalable Cloud Evaluation)
 
 Remote workspace enables running evaluations at scale by using a cloud-based runtime API to provision containers. This is ideal for large-scale benchmark runs with high parallelization.
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index 065c3aecd..9cc94a5f2 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -25,7 +25,10 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import (
+    create_apptainer_workspace,
+    remote_image_exists,
+)
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -183,6 +186,11 @@ def prepare_workspace(
                 working_dir="/workspace",
                 forward_env=forward_env or [],
             )
+        elif self.metadata.workspace_type == "apptainer":
+            workspace = create_apptainer_workspace(
+                agent_server_image=agent_server_image,
+                forward_env=forward_env,
+            )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
             if not runtime_api_key:
diff --git a/benchmarks/swebenchmultimodal/README.md b/benchmarks/swebenchmultimodal/README.md
index 08acf1c67..7b9540c8c 100644
--- a/benchmarks/swebenchmultimodal/README.md
+++ b/benchmarks/swebenchmultimodal/README.md
@@ -65,7 +65,7 @@ The benchmark uses the same configuration options as regular SWE-Bench:
 - `--split`: Dataset split (e.g., `test`, `dev`)
 - `--llm-config`: Path to LLM configuration file
 - `--max-iterations`: Maximum number of agent iterations
-- `--workspace-type`: Either `docker` or `remote`
+- `--workspace`: One of `docker`, `apptainer`, or `remote`
 - `--num-workers`: Number of parallel workers
 
 ## Environment Variables
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 7eb3d348f..0220c928e 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -23,7 +23,10 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import (
+    create_apptainer_workspace,
+    remote_image_exists,
+)
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -160,10 +163,11 @@ def prepare_workspace(
         # For non-binary targets, append target suffix
         suffix = f"-{build_target}" if build_target != "binary" else ""
 
+        agent_server_image = (
+            f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+        )
+
         if self.metadata.workspace_type == "docker":
-            agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
-            )
             ensure_local_image(
                 agent_server_image=agent_server_image,
                 base_image=official_docker_image,
@@ -175,6 +179,11 @@ def prepare_workspace(
                 working_dir="/workspace",
                 forward_env=forward_env or [],
             )
+        elif self.metadata.workspace_type == "apptainer":
+            workspace = create_apptainer_workspace(
+                agent_server_image=agent_server_image,
+                forward_env=forward_env,
+            )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
             if not runtime_api_key:
diff --git a/benchmarks/swefficiency/README.md b/benchmarks/swefficiency/README.md
index 0a207ea5c..bf4551705 100644
--- a/benchmarks/swefficiency/README.md
+++ b/benchmarks/swefficiency/README.md
@@ -54,6 +54,20 @@ uv run swefficiency-infer path/to/llm_config.json \
     --workspace docker
 ```
 
+
+### Apptainer Workspace (Local Evaluation without Docker)
+
+If Docker is unavailable, you can run SWE-fficiency with Apptainer against pre-built agent-server images:
+
+```bash
+uv run swefficiency-infer path/to/llm_config.json \
+    --dataset swefficiency/swefficiency \
+    --split test \
+    --workspace apptainer
+```
+
+Apptainer mode does not apply the Docker-only CPU and memory limits above, and it requires the agent-server image to already be published in a registry.
+
 ### Remote Workspace (Scalable Cloud Evaluation)
 
 ```bash
@@ -75,7 +89,7 @@ After running inference, use the official SWE-fficiency benchmark evaluation too
 |--------|-------------|---------|
 | `--dataset` | HuggingFace dataset name | `swefficiency/swefficiency` |
 | `--split` | Dataset split | `test` |
-| `--workspace` | Workspace type (`docker` or `remote`) | `docker` |
+| `--workspace` | Workspace type (`docker`, `apptainer`, or `remote`) | `docker` |
 | `--num-workers` | Number of parallel workers | `4` |
 | `--max-iterations` | Maximum agent iterations | `500` |
 | `--num-cpus-per-worker` | CPUs per Docker container | `4` |
diff --git a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py
index bb3efd908..82bea0b3b 100644
--- a/benchmarks/swefficiency/run_infer.py
+++ b/benchmarks/swefficiency/run_infer.py
@@ -20,7 +20,10 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import remote_image_exists
+from benchmarks.utils.image_utils import (
+    create_apptainer_workspace,
+    remote_image_exists,
+)
 from benchmarks.utils.models import (
     EvalInstance,
     EvalMetadata,
@@ -235,6 +238,11 @@ def prepare_workspace(
             workspace._cpu_group = cpu_group
             workspace._cpu_groups_queue = self.cpu_groups_queue
 
+        elif self.metadata.workspace_type == "apptainer":
+            workspace = create_apptainer_workspace(
+                agent_server_image=agent_server_image,
+                forward_env=forward_env,
+            )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
             if not runtime_api_key:
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index bdfb7b13e..8ffdb0d38 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -17,7 +17,11 @@
     get_default_on_result_writer,
 )
 from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
-from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
+from benchmarks.utils.image_utils import (
+    create_apptainer_workspace,
+    create_docker_workspace,
+    remote_image_exists,
+)
 from benchmarks.utils.llm_config import load_llm_config
 from benchmarks.utils.models import (
     EvalInstance,
@@ -151,7 +155,7 @@ def prepare_workspace(
         forward_env: list[str] | None = None,
     ) -> RemoteWorkspace:
         """
-        Create workspace based on workspace_type (docker or remote).
+        Create workspace based on workspace_type (docker, apptainer, or remote).
 
         Args:
             instance: The evaluation instance to prepare workspace for.
@@ -169,16 +173,22 @@ def prepare_workspace(
         # For non-binary targets, append target suffix
         suffix = f"-{build_target}" if build_target != "binary" else ""
 
+        agent_server_image = (
+            f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
+        )
+
         if self.metadata.workspace_type == "docker":
-            agent_server_image = (
-                f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
-            )
             workspace = create_docker_workspace(
                 agent_server_image=agent_server_image,
                 base_image=official_docker_image,
                 build_target=build_target,
                 forward_env=forward_env,
             )
+        elif self.metadata.workspace_type == "apptainer":
+            workspace = create_apptainer_workspace(
+                agent_server_image=agent_server_image,
+                forward_env=forward_env,
+            )
         elif self.metadata.workspace_type == "remote":
             runtime_api_key = os.getenv("RUNTIME_API_KEY")
             if not runtime_api_key:
diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
index 698f2fd4f..8ade8f018 100644
--- a/benchmarks/utils/args_parser.py
+++ b/benchmarks/utils/args_parser.py
@@ -41,8 +41,8 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
         "--workspace",
         type=str,
         default="remote",
-        choices=["docker", "remote"],
-        help="Type of workspace to use (default: remote)",
+        choices=["docker", "apptainer", "remote"],
+        help="Type of workspace to use: docker, apptainer, or remote (default: remote)",
     )
     parser.add_argument(
         "--max-iterations",
diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py
index 467074cb9..231094679 100644
--- a/benchmarks/utils/image_utils.py
+++ b/benchmarks/utils/image_utils.py
@@ -10,7 +10,11 @@
 
 if TYPE_CHECKING:
     from openhands.sdk.workspace import TargetType
-    from openhands.workspace import DockerDevWorkspace, DockerWorkspace
+    from openhands.workspace import (
+        ApptainerWorkspace,
+        DockerDevWorkspace,
+        DockerWorkspace,
+    )
 
 import requests
 
@@ -84,6 +88,53 @@ def local_image_exists(image: str) -> bool:
         return False
 
 
+def _env_flag(name: str, default: bool) -> bool:
+    value = os.getenv(name)
+    if value is None:
+        return default
+    return value.lower() in ("1", "true", "yes", "on")
+
+
+def create_apptainer_workspace(
+    agent_server_image: str,
+    working_dir: str = "/workspace",
+    forward_env: list[str] | None = None,
+    extra_ports: bool = False,
+) -> ApptainerWorkspace:
+    """Create an Apptainer workspace from a pre-built agent-server image.
+
+    Unlike DockerDevWorkspace, ApptainerWorkspace cannot build images from a
+    base image on the fly. The image must already exist in a container registry
+    that `apptainer pull docker://...` can access.
+    """
+    from openhands.workspace import ApptainerWorkspace
+
+    if not remote_image_exists(agent_server_image):
+        raise RuntimeError(
+            f"Agent server image {agent_server_image} does not exist in container registry. "
+            "Apptainer workspace requires a pre-built image that can be pulled "
+            "with Apptainer."
+        )
+
+    logger.info(f"Using Apptainer workspace with image {agent_server_image}")
+
+    host_port = os.getenv("APPTAINER_HOST_PORT")
+    cache_dir = os.getenv("APPTAINER_CACHE_DIR")
+    mount_dir = os.getenv("APPTAINER_MOUNT_DIR")
+
+    return ApptainerWorkspace(
+        server_image=agent_server_image,
+        working_dir=working_dir,
+        forward_env=forward_env or [],
+        extra_ports=extra_ports,
+        host_port=int(host_port) if host_port else None,
+        cache_dir=cache_dir or None,
+        mount_dir=mount_dir or None,
+        use_fakeroot=_env_flag("APPTAINER_USE_FAKEROOT", True),
+        enable_docker_compat=_env_flag("APPTAINER_ENABLE_DOCKER_COMPAT", True),
+    )
+
+
 def create_docker_workspace(
     agent_server_image: str,
     base_image: str,
diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py
index b1845e049..165d5566b 100644
--- a/benchmarks/utils/models.py
+++ b/benchmarks/utils/models.py
@@ -52,9 +52,9 @@ class EvalMetadata(BaseModel):
         ge=0,
         description="Maximum number of retries for instances that throw exceptions",
     )
-    workspace_type: Literal["docker", "remote"] = Field(
+    workspace_type: Literal["docker", "apptainer", "remote"] = Field(
         default="docker",
-        description="Type of workspace to use, e.g., 'docker' or 'remote'",
+        description="Type of workspace to use, e.g., 'docker', 'apptainer', or 'remote'",
     )
     base_resource_factor: int = Field(
         default=1,
diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py
index c46830cb6..3daca9553 100644
--- a/tests/test_image_utils.py
+++ b/tests/test_image_utils.py
@@ -1,7 +1,8 @@
 """Tests for image_utils and build_utils helper functions.
 
-Tests cover local_image_exists(), create_docker_workspace(), and ensure_local_image()
-which centralize Docker image detection and build logic across all benchmarks.
+Tests cover local_image_exists(), create_docker_workspace(),
+create_apptainer_workspace(), and ensure_local_image(), which centralize
+container image detection and workspace creation across benchmarks.
 """
 
 import os
@@ -142,6 +143,72 @@ def test_custom_working_dir_and_forward_env(self, _mock_exists):
             )
 
 
+class TestCreateApptainerWorkspace:
+    """Tests for create_apptainer_workspace() using mocked registry lookups."""
+
+    @patch("benchmarks.utils.image_utils.remote_image_exists", return_value=True)
+    def test_returns_apptainer_workspace_when_image_exists(self, _mock_exists):
+        from benchmarks.utils.image_utils import create_apptainer_workspace
+        from openhands.workspace import ApptainerWorkspace
+
+        sentinel = MagicMock(spec=ApptainerWorkspace)  # returned by the patched class
+        with patch(
+            "openhands.workspace.ApptainerWorkspace", return_value=sentinel
+        ) as spy:
+            ws = create_apptainer_workspace(
+                agent_server_image="ghcr.io/example/agent-server:v1",
+                forward_env=["API_KEY"],
+                extra_ports=True,
+            )
+            spy.assert_called_once_with(  # assumes APPTAINER_* env vars are unset
+                server_image="ghcr.io/example/agent-server:v1",
+                working_dir="/workspace",
+                forward_env=["API_KEY"],
+                extra_ports=True,
+                host_port=None,
+                cache_dir=None,
+                mount_dir=None,
+                use_fakeroot=True,
+                enable_docker_compat=True,
+            )
+            assert ws is sentinel
+
+    @patch("benchmarks.utils.image_utils.remote_image_exists", return_value=False)
+    def test_raises_when_image_missing_from_registry(self, _mock_exists):
+        from benchmarks.utils.image_utils import create_apptainer_workspace
+
+        with pytest.raises(RuntimeError, match="pre-built image"):
+            create_apptainer_workspace("ghcr.io/example/agent-server:missing")
+
+    @patch.dict(
+        os.environ,
+        {
+            "APPTAINER_HOST_PORT": "8123",  # expected to arrive as int 8123
+            "APPTAINER_CACHE_DIR": "/tmp/apptainer-cache",
+            "APPTAINER_MOUNT_DIR": "/tmp/workspace-mount",
+            "APPTAINER_USE_FAKEROOT": "0",  # expected to parse as False
+            "APPTAINER_ENABLE_DOCKER_COMPAT": "false",  # expected to parse as False
+        },
+    )
+    @patch("benchmarks.utils.image_utils.remote_image_exists", return_value=True)
+    def test_forwards_apptainer_env_configuration(self, _mock_exists):
+        from benchmarks.utils.image_utils import create_apptainer_workspace
+
+        with patch("openhands.workspace.ApptainerWorkspace") as mock_workspace:
+            create_apptainer_workspace("ghcr.io/example/agent-server:v1")
+            mock_workspace.assert_called_once_with(
+                server_image="ghcr.io/example/agent-server:v1",
+                working_dir="/workspace",
+                forward_env=[],
+                extra_ports=False,
+                host_port=8123,
+                cache_dir="/tmp/apptainer-cache",
+                mount_dir="/tmp/workspace-mount",
+                use_fakeroot=False,
+                enable_docker_compat=False,
+            )
+
+
 class TestEnsureLocalImage:
     """Tests for ensure_local_image().
 
diff --git a/tests/test_workspace_types.py b/tests/test_workspace_types.py
new file mode 100644
index 000000000..bab8f27ce
--- /dev/null
+++ b/tests/test_workspace_types.py
@@ -0,0 +1,9 @@
+"""Tests for shared workspace type configuration."""
+
+from benchmarks.utils.args_parser import get_parser
+
+
+def test_parser_accepts_apptainer_workspace() -> None:
+    parser = get_parser(add_llm_config=False)  # presumably omits the LLM config arg
+    args = parser.parse_args(["--workspace", "apptainer"])
+    assert args.workspace == "apptainer"

From 2f2385388abc5f6fee5e735c8d94ce0de685403c Mon Sep 17 00:00:00 2001
From: openhands <openhands@all-hands.dev>
Date: Mon, 16 Mar 2026 13:57:15 +0000
Subject: [PATCH 3/3] Improve Apptainer image guidance and cache reuse

Co-authored-by: openhands <openhands@all-hands.dev>
---
 README.md                       |  9 ++++-
 benchmarks/swebench/README.md   | 23 ++++++++++--
 benchmarks/utils/image_utils.py | 64 +++++++++++++++++++++++----------
 tests/test_image_utils.py       | 34 ++++++++++++++++--
 4 files changed, 106 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index d37faa7a4..9cf03737f 100644
--- a/README.md
+++ b/README.md
@@ -188,9 +188,16 @@ Uses local Docker containers to run agent evaluations. Images are built locally
 Uses `openhands.workspace.ApptainerWorkspace` from the vendored SDK to run a pre-built agent-server image without a local Docker daemon. The workspace pulls OCI/Docker images with `apptainer pull docker://...`, so it is a good fit for HPC or university environments where Docker is unavailable.
 
 - **Pros**: No Docker daemon required, works on many shared/HPC systems
-- **Cons**: Requires a pre-built agent-server image in a registry; unlike Docker mode, it cannot build from a base image on the fly
+- **Cons**: Requires a pre-built agent-server image in a registry or a cached SIF file; unlike Docker mode, it cannot build from a base image on the fly
 - **Use case**: Local benchmark runs on Docker-restricted machines
 
+Typical flow:
+1. Run the benchmark's `build_images.py` script with `--push` from a Docker-capable machine or CI runner.
+2. Run the corresponding `*-infer` command with `--workspace apptainer` from the Docker-restricted machine.
+3. Reuse the cached SIF in `APPTAINER_CACHE_DIR` (default: `~/.apptainer_cache`) on subsequent runs.
+
+If you build without `--push`, the images only exist in the local container daemon and Apptainer will not be able to use them.
+
 Example:
 
 ```bash
diff --git a/benchmarks/swebench/README.md b/benchmarks/swebench/README.md
index 9b3ef4355..4eacd0137 100644
--- a/benchmarks/swebench/README.md
+++ b/benchmarks/swebench/README.md
@@ -60,7 +60,26 @@ uv run swebench-infer path/to/llm_config.json \
 
 ### Apptainer Workspace (Local Evaluation without Docker)
 
-If Docker is unavailable, you can use the same pre-built agent-server images with Apptainer:
+If Docker is unavailable, you can run SWE-Bench with Apptainer, but the required agent-server images must already be available to Apptainer.
+
+#### Step 1: Build and push the agent-server images
+
+Build the images from a Docker-capable machine or CI runner, and include `--push` so they end up in a registry Apptainer can pull from:
+
+```bash
+uv run python -m benchmarks.swebench.build_images \
+  --dataset princeton-nlp/SWE-bench_Verified \
+  --split test \
+  --image ghcr.io/openhands/eval-agent-server \
+  --target source-minimal \
+  --push
+```
+
+If you run `build_images.py` without `--push`, the resulting images only exist in the local container daemon and Apptainer cannot use them.
+
+If you build the images from a different checkout or SDK revision than the one used for inference, set `IMAGE_TAG_PREFIX` during inference so it matches the tag prefix used during the build.
+
+#### Step 2: Run inference with Apptainer
 
 ```bash
 uv run swebench-infer path/to/llm_config.json \
@@ -70,7 +89,7 @@ uv run swebench-infer path/to/llm_config.json \
     --workspace apptainer
 ```
 
-Unlike Docker mode, Apptainer mode cannot build images from base images on the fly. Build and push the agent-server images first, then run inference with `--workspace apptainer`.
+Apptainer can either pull the image from the registry on first use or reuse the cached SIF in `APPTAINER_CACHE_DIR` (default: `~/.apptainer_cache`) on subsequent runs.
 
 
 ### Remote Workspace (Scalable Cloud Evaluation)
diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py
index 231094679..396b024b8 100644
--- a/benchmarks/utils/image_utils.py
+++ b/benchmarks/utils/image_utils.py
@@ -5,6 +5,7 @@
 import os
 import subprocess
 import sys
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 
@@ -95,6 +96,20 @@ def _env_flag(name: str, default: bool) -> bool:
     return value.lower() in ("1", "true", "yes", "on")
 
 
+def get_apptainer_cache_dir() -> str:
+    """Return $APPTAINER_CACHE_DIR if set and non-empty, else ~/.apptainer_cache."""
+    return os.getenv("APPTAINER_CACHE_DIR") or str(Path.home() / ".apptainer_cache")
+
+
+def get_apptainer_sif_path(
+    agent_server_image: str, cache_dir: str | None = None
+) -> str:
+    """Return the cached SIF path for an image (naming presumed to match the SDK)."""
+    resolved_cache_dir = cache_dir or get_apptainer_cache_dir()  # None/"" -> default
+    sif_name = agent_server_image.replace(":", "_").replace("/", "_") + ".sif"
+    return str(Path(resolved_cache_dir) / sif_name)
+
+
 def create_apptainer_workspace(
     agent_server_image: str,
     working_dir: str = "/workspace",
@@ -105,34 +120,45 @@ def create_apptainer_workspace(
 
     Unlike DockerDevWorkspace, ApptainerWorkspace cannot build images from a
     base image on the fly. The image must already exist in a container registry
-    that `apptainer pull docker://...` can access.
+    that `apptainer pull docker://...` can access, or the corresponding SIF file
+    must already be present in the configured Apptainer cache.
     """
     from openhands.workspace import ApptainerWorkspace
 
+    host_port = os.getenv("APPTAINER_HOST_PORT")
+    cache_dir = get_apptainer_cache_dir()
+    mount_dir = os.getenv("APPTAINER_MOUNT_DIR")
+    sif_path = get_apptainer_sif_path(agent_server_image, cache_dir)
+
+    workspace_kwargs = {
+        "working_dir": working_dir,
+        "forward_env": forward_env or [],
+        "extra_ports": extra_ports,
+        "host_port": int(host_port) if host_port else None,
+        "cache_dir": cache_dir,
+        "mount_dir": mount_dir or None,
+        "use_fakeroot": _env_flag("APPTAINER_USE_FAKEROOT", True),
+        "enable_docker_compat": _env_flag("APPTAINER_ENABLE_DOCKER_COMPAT", True),
+    }
+
+    if Path(sif_path).exists():
+        logger.info(
+            "Using cached Apptainer SIF %s for image %s", sif_path, agent_server_image
+        )
+        return ApptainerWorkspace(sif_file=sif_path, **workspace_kwargs)
+
     if not remote_image_exists(agent_server_image):
         raise RuntimeError(
             f"Agent server image {agent_server_image} does not exist in container registry. "
-            "Apptainer workspace requires a pre-built image that can be pulled "
-            "with Apptainer."
+            "Apptainer can only use a registry-pullable image or an existing cached SIF file. "
+            "If you built images with a benchmark build_images.py script, re-run it with --push "
+            "from a Docker-capable machine or CI; local-only builds are not enough. "
+            "If the images were built from a different checkout, make sure IMAGE_TAG_PREFIX "
+            "matches the tag prefix used during the build."
         )
 
     logger.info(f"Using Apptainer workspace with image {agent_server_image}")
-
-    host_port = os.getenv("APPTAINER_HOST_PORT")
-    cache_dir = os.getenv("APPTAINER_CACHE_DIR")
-    mount_dir = os.getenv("APPTAINER_MOUNT_DIR")
-
-    return ApptainerWorkspace(
-        server_image=agent_server_image,
-        working_dir=working_dir,
-        forward_env=forward_env or [],
-        extra_ports=extra_ports,
-        host_port=int(host_port) if host_port else None,
-        cache_dir=cache_dir or None,
-        mount_dir=mount_dir or None,
-        use_fakeroot=_env_flag("APPTAINER_USE_FAKEROOT", True),
-        enable_docker_compat=_env_flag("APPTAINER_ENABLE_DOCKER_COMPAT", True),
-    )
+    return ApptainerWorkspace(server_image=agent_server_image, **workspace_kwargs)
 
 
 def create_docker_workspace(
diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py
index 3daca9553..684d1f6a2 100644
--- a/tests/test_image_utils.py
+++ b/tests/test_image_utils.py
@@ -7,6 +7,7 @@
 
 import os
 import subprocess
+from pathlib import Path
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -166,18 +167,47 @@ def test_returns_apptainer_workspace_when_image_exists(self, _mock_exists):
                 forward_env=["API_KEY"],
                 extra_ports=True,
                 host_port=None,
-                cache_dir=None,
+                cache_dir=str(Path.home() / ".apptainer_cache"),
                 mount_dir=None,
                 use_fakeroot=True,
                 enable_docker_compat=True,
             )
             assert ws is sentinel
 
+    def test_uses_cached_sif_without_registry_lookup(self, tmp_path, monkeypatch):
+        from benchmarks.utils.image_utils import (
+            create_apptainer_workspace,
+            get_apptainer_sif_path,
+        )
+
+        image = "ghcr.io/example/agent-server:v1"
+        monkeypatch.setenv("APPTAINER_CACHE_DIR", str(tmp_path))  # isolate cache dir
+        sif_path = get_apptainer_sif_path(image, str(tmp_path))
+        Path(sif_path).write_text("cached")  # only existence is checked, not contents
+
+        with (
+            patch("benchmarks.utils.image_utils.remote_image_exists") as mock_exists,
+            patch("openhands.workspace.ApptainerWorkspace") as mock_workspace,
+        ):
+            create_apptainer_workspace(image)  # assumes no other APPTAINER_* vars set
+            mock_exists.assert_not_called()  # cache hit must skip the registry check
+            mock_workspace.assert_called_once_with(
+                sif_file=sif_path,
+                working_dir="/workspace",
+                forward_env=[],
+                extra_ports=False,
+                host_port=None,
+                cache_dir=str(tmp_path),
+                mount_dir=None,
+                use_fakeroot=True,
+                enable_docker_compat=True,
+            )
+
     @patch("benchmarks.utils.image_utils.remote_image_exists", return_value=False)
     def test_raises_when_image_missing_from_registry(self, _mock_exists):
         from benchmarks.utils.image_utils import create_apptainer_workspace
 
-        with pytest.raises(RuntimeError, match="pre-built image"):
+        with pytest.raises(RuntimeError, match="local-only builds are not enough"):
             create_apptainer_workspace("ghcr.io/example/agent-server:missing")
 
     @patch.dict(