From d5dd8c8ae0870a2b23e383cca36f6f123afe879d Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Fri, 13 Mar 2026 17:01:55 -0300
Subject: [PATCH 1/3] build: expose image build parallelism knobs to workflows

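Add an explicit build_batch_size parameter to the shared build helper, thread it through the current image build entrypoints, and surface the corresponding workflow inputs for SWE-Bench and SWT-Bench. This lets workflow-dispatched max-workers and build-batch-size values reach the build logic without being overridden by environment defaults. When --build-batch-size is omitted, the helper still falls back to the BUILD_BATCH_SIZE environment variable (default 15). Also forward force_build from the swebenchmultilingual entrypoint to the shared build helper.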

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swebench-images.yml   | 13 +++-
 .github/workflows/build-swtbench-images.yml   | 14 +++-
 benchmarks/commit0/build_images.py            |  1 +
 benchmarks/gaia/build_images.py               |  1 +
 benchmarks/multiswebench/build_images.py      |  1 +
 benchmarks/swebench/build_images.py           |  1 +
 .../swebenchmultilingual/build_images.py      |  2 +
 benchmarks/swebenchmultimodal/build_images.py |  1 +
 benchmarks/swegym/build_images.py             |  1 +
 benchmarks/swesmith/build_images.py           |  1 +
 benchmarks/swtbench/build_images.py           |  1 +
 benchmarks/utils/build_utils.py               | 18 +++++-
 tests/test_image_utils.py                     | 64 +++++++++++++++++++
 13 files changed, 112 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml
index 9a307dd63..b3d7db27e 100644
--- a/.github/workflows/build-swebench-images.yml
+++ b/.github/workflows/build-swebench-images.yml
@@ -22,7 +22,12 @@ on:
       max-workers:
         description: 'Number of concurrent builds'
         required: false
-        default: '12'
+        default: '32'
+        type: string
+      build-batch-size:
+        description: 'Number of images to submit per batch'
+        required: false
+        default: '50'
         type: string
       max-retries:
         description: 'Retries per image build'
@@ -59,12 +64,12 @@ on:
 env:
   DATASET: princeton-nlp/SWE-bench_Verified
   SPLIT: test
-  MAX_WORKERS: '12'
+  MAX_WORKERS: '32'
   MAX_RETRIES: '5'
   N_LIMIT: '500'
   INSTANCE_IDS: ''
   SELECT_FILE: ''
-  BUILD_BATCH_SIZE: '15'
+  BUILD_BATCH_SIZE: '50'
   BUILDKIT_PRUNE_KEEP_GB: '60'
   BUILDKIT_PRUNE_THRESHOLD_PCT: '60'
 
@@ -119,6 +124,7 @@ jobs:
           if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi
           if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi
           if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi
+          if [ -n "${{ inputs.build-batch-size }}" ]; then echo "BUILD_BATCH_SIZE=${{ inputs.build-batch-size }}" >> "$GITHUB_ENV"; fi
           if [ -n "${{ inputs.max-retries }}" ]; then echo "MAX_RETRIES=${{ inputs.max-retries }}" >> "$GITHUB_ENV"; fi
           # Empty string means "no limit"
           if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi
@@ -238,6 +244,7 @@ jobs:
             --image ghcr.io/openhands/eval-agent-server \
             --push \
             --max-workers '${MAX_WORKERS}' \
+            --build-batch-size '${BUILD_BATCH_SIZE}' \
             --max-retries '${MAX_RETRIES}'"
 
           # Only include --n-limit if provided (non-empty)
diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml
index db4a66ce9..b1ffb93b7 100644
--- a/.github/workflows/build-swtbench-images.yml
+++ b/.github/workflows/build-swtbench-images.yml
@@ -22,7 +22,12 @@ on:
       max-workers:
         description: 'Maximum number of parallel workers'
         required: false
-        default: '4'
+        default: '16'
+        type: string
+      agent-build-batch-size:
+        description: 'Number of agent-server images to submit per batch'
+        required: false
+        default: '50'
         type: string
       n-limit:
         description: 'Limit number of images to build (0 for all)'
@@ -87,7 +92,8 @@ jobs:
     env:
       DATASET: eth-sri/SWT-bench_Verified_bm25_27k_zsp
       SPLIT: test
-      MAX_WORKERS: '4'
+      MAX_WORKERS: '16'
+      BUILD_BATCH_SIZE: '50'
       N_LIMIT: '0'
       INSTANCE_IDS: ''
       SELECT_FILE: ''
@@ -146,7 +152,8 @@ jobs:
           # Get inputs with defaults
           DATASET="${{ inputs.dataset || 'eth-sri/SWT-bench_Verified_bm25_27k_zsp' }}"
           SPLIT="${{ inputs.split || 'test' }}"
-          MAX_WORKERS="${{ inputs.max-workers || '4' }}"
+          MAX_WORKERS="${{ inputs.max-workers || '16' }}"
+          BUILD_BATCH_SIZE="${{ inputs.agent-build-batch-size || '50' }}"
           N_LIMIT="${{ inputs.n-limit || '0' }}"
           INSTANCE_IDS="${{ inputs.instance-ids }}"
           FORCE_BUILD="${{ inputs.force-build || 'false' }}"
@@ -172,6 +179,7 @@ jobs:
             --image ghcr.io/openhands/eval-agent-server \
             --target ${TARGET} \
             --max-workers ${MAX_WORKERS} \
+            --build-batch-size ${BUILD_BATCH_SIZE} \
             --push"
 
           # Add n-limit if specified
diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py
index 23bce135c..58ab16542 100644
--- a/benchmarks/commit0/build_images.py
+++ b/benchmarks/commit0/build_images.py
@@ -124,6 +124,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/gaia/build_images.py b/benchmarks/gaia/build_images.py
index c7eb4df65..6da18829b 100644
--- a/benchmarks/gaia/build_images.py
+++ b/benchmarks/gaia/build_images.py
@@ -100,6 +100,7 @@ def tag_fn(_base: str) -> str:
         image=args.image,
         push=args.push,
         max_workers=1,  # Only building one image
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py
index 649e27ab2..3db208e50 100644
--- a/benchmarks/multiswebench/build_images.py
+++ b/benchmarks/multiswebench/build_images.py
@@ -118,6 +118,7 @@ def main():
         ),
         base_image_to_custom_tag_fn=extract_custom_tag,
         max_workers=args.num_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=False,
         force_build=args.force_build,
     )
diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py
index 0146dae7d..90306433f 100644
--- a/benchmarks/swebench/build_images.py
+++ b/benchmarks/swebench/build_images.py
@@ -177,6 +177,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/swebenchmultilingual/build_images.py b/benchmarks/swebenchmultilingual/build_images.py
index 8c7e1e579..e078cea30 100644
--- a/benchmarks/swebenchmultilingual/build_images.py
+++ b/benchmarks/swebenchmultilingual/build_images.py
@@ -177,7 +177,9 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
+        force_build=args.force_build,
         max_retries=args.max_retries,
         base_image_to_custom_tag_fn=extract_custom_tag,
         post_build_fn=_wrap_if_needed,
diff --git a/benchmarks/swebenchmultimodal/build_images.py b/benchmarks/swebenchmultimodal/build_images.py
index 775e20fa7..3c436a31c 100644
--- a/benchmarks/swebenchmultimodal/build_images.py
+++ b/benchmarks/swebenchmultimodal/build_images.py
@@ -83,6 +83,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/swegym/build_images.py b/benchmarks/swegym/build_images.py
index 94e2be1f2..e5941bf81 100644
--- a/benchmarks/swegym/build_images.py
+++ b/benchmarks/swegym/build_images.py
@@ -87,6 +87,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/swesmith/build_images.py b/benchmarks/swesmith/build_images.py
index eb59b7796..2c63bb065 100644
--- a/benchmarks/swesmith/build_images.py
+++ b/benchmarks/swesmith/build_images.py
@@ -84,6 +84,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/swtbench/build_images.py b/benchmarks/swtbench/build_images.py
index 1c0cb19b7..f28ac5eb8 100644
--- a/benchmarks/swtbench/build_images.py
+++ b/benchmarks/swtbench/build_images.py
@@ -44,6 +44,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
index 2b6646752..37284ddc7 100644
--- a/benchmarks/utils/build_utils.py
+++ b/benchmarks/utils/build_utils.py
@@ -289,6 +289,15 @@ def get_build_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--max-workers", type=int, default=1, help="Concurrent builds (be cautious)"
     )
+    parser.add_argument(
+        "--build-batch-size",
+        type=int,
+        default=None,
+        help=(
+            "Number of images to submit per batch. Defaults to BUILD_BATCH_SIZE "
+            "when unset."
+        ),
+    )
     parser.add_argument(
         "--dry-run", action="store_true", help="List base images only, don’t build"
     )
@@ -609,6 +618,7 @@ def build_all_images(
     push: bool = False,
     base_image_to_custom_tag_fn: Callable[[str], str] | None = None,
     max_workers: int = 1,
+    build_batch_size: int | None = None,
     dry_run: bool = False,
     force_build: bool = False,
     max_retries: int = 3,
@@ -627,6 +637,8 @@ def build_all_images(
         base_image_to_custom_tag_fn: Function to extract a custom tag from a base image.
             Evaluated before scheduling builds so it can safely be a closure.
         max_workers: Number of concurrent builds.
+        build_batch_size: Number of images to submit per batch. If None, use the
+            BUILD_BATCH_SIZE environment variable.
         dry_run: If True, only list base images without building.
         force_build: If True, rebuild even when matching remote images already exist.
         max_retries: Number of times to retry each failed build (default: 3).
@@ -655,7 +667,11 @@ def build_all_images(
 
     # Batch/prune settings (tunable via env to control disk usage on sticky runners)
     # Default to smaller batches and more aggressive pruning on shared runners.
-    batch_size = int(os.getenv("BUILD_BATCH_SIZE", "15"))
+    batch_size = (
+        build_batch_size
+        if build_batch_size is not None
+        else int(os.getenv("BUILD_BATCH_SIZE", "15"))
+    )
     prune_keep_storage_gb = int(os.getenv("BUILDKIT_PRUNE_KEEP_GB", "60"))
     prune_threshold_pct = float(os.getenv("BUILDKIT_PRUNE_THRESHOLD_PCT", "60"))
     # Prune aggressively by default; filters like "unused-for=12h" prevented GC from
diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py
index 40535446e..6d9134404 100644
--- a/tests/test_image_utils.py
+++ b/tests/test_image_utils.py
@@ -390,6 +390,70 @@ def test_build_parser_accepts_force_build(self):
         assert args.force_build is True
 
 
+class TestBuildBatchSizeConfig:
+    def test_build_parser_accepts_build_batch_size(self):
+        from benchmarks.utils.build_utils import get_build_parser
+
+        args = get_build_parser().parse_args(["--build-batch-size", "50"])
+
+        assert args.build_batch_size == 50
+
+    @patch.dict(os.environ, {"BUILD_BATCH_SIZE": "99"})
+    def test_build_all_images_prefers_explicit_batch_size_over_env(
+        self,
+        tmp_path: Path,
+    ):
+        from benchmarks.utils import build_utils
+
+        seen_batches: list[list[str]] = []
+
+        class FakeFuture:
+            def __init__(self, result: BuildOutput):
+                self._result = result
+
+            def result(self) -> BuildOutput:
+                return self._result
+
+        class FakeExecutor:
+            def __init__(self, *args, **kwargs):
+                self._batch: list[str] = []
+
+            def __enter__(self):
+                seen_batches.append(self._batch)
+                return self
+
+            def __exit__(self, exc_type, exc, tb):
+                return False
+
+            def submit(self, fn, **kwargs):
+                self._batch.append(kwargs["base_image"])
+                return FakeFuture(
+                    BuildOutput(
+                        base_image=kwargs["base_image"],
+                        tags=[f"tag:{kwargs['base_image']}"],
+                        error=None,
+                    )
+                )
+
+        with (
+            patch.object(build_utils, "ProcessPoolExecutor", FakeExecutor),
+            patch.object(
+                build_utils, "as_completed", side_effect=lambda futures: futures
+            ),
+            patch.object(build_utils, "buildkit_disk_usage", return_value=(0, 0)),
+            patch.object(build_utils, "maybe_prune_buildkit_cache", return_value=False),
+        ):
+            exit_code = build_utils.build_all_images(
+                base_images=["base-1", "base-2", "base-3"],
+                target="source-minimal",
+                build_dir=tmp_path,
+                build_batch_size=2,
+            )
+
+        assert exit_code == 0
+        assert seen_batches == [["base-1", "base-2"], ["base-3"]]
+
+
 class TestBuildWithLoggingTelemetry:
     @patch("benchmarks.utils.build_utils.maybe_reset_buildkit")
     @patch("benchmarks.utils.build_utils.time.monotonic", side_effect=[100.0, 109.5])

From 89e3a3e2a84eb6bae856499a8e8e7dd9c451c0eb Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Fri, 13 Mar 2026 17:33:40 -0300
Subject: [PATCH 2/3] build: normalize SWT workflow batch-size input

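Rename the SWT image-build batch-size workflow input from agent-build-batch-size to build-batch-size to match the SWE-Bench workflow, and rename the eval-env-specific knob from build-batch-size to eval-env-build-batch-size for clarity. Note that build-batch-size now controls agent-server image batches, so callers that previously passed it to tune eval-env builds must switch to eval-env-build-batch-size.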

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swtbench-images.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml
index b1ffb93b7..317dbd718 100644
--- a/.github/workflows/build-swtbench-images.yml
+++ b/.github/workflows/build-swtbench-images.yml
@@ -24,7 +24,7 @@ on:
         required: false
         default: '16'
         type: string
-      agent-build-batch-size:
+      build-batch-size:
         description: 'Number of agent-server images to submit per batch'
         required: false
         default: '50'
@@ -58,7 +58,7 @@ on:
         required: false
         default: '2'
         type: string
-      build-batch-size:
+      eval-env-build-batch-size:
         description: 'Env images per batch for eval env builds'
         required: false
         default: '10'
@@ -153,7 +153,7 @@ jobs:
           DATASET="${{ inputs.dataset || 'eth-sri/SWT-bench_Verified_bm25_27k_zsp' }}"
           SPLIT="${{ inputs.split || 'test' }}"
           MAX_WORKERS="${{ inputs.max-workers || '16' }}"
-          BUILD_BATCH_SIZE="${{ inputs.agent-build-batch-size || '50' }}"
+          BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '50' }}"
           N_LIMIT="${{ inputs.n-limit || '0' }}"
           INSTANCE_IDS="${{ inputs.instance-ids }}"
           FORCE_BUILD="${{ inputs.force-build || 'false' }}"
@@ -218,7 +218,7 @@ jobs:
           MAX_WORKERS="${{ inputs.max-workers || '4' }}"
           BUILD_MODE="${{ inputs.build-mode || 'cli' }}"
           MAX_RETRIES="${{ inputs.max-retries || '2' }}"
-          BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '10' }}"
+          BUILD_BATCH_SIZE="${{ inputs.eval-env-build-batch-size || '10' }}"
           FORCE_BUILD="${{ inputs.force-build || 'false' }}"
 
           echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV"

From 03bd11d75a27101842497da4411051c712843d6b Mon Sep 17 00:00:00 2001
From: Simon Rosenberg <simonrosen10@gmail.com>
Date: Fri, 13 Mar 2026 17:50:41 -0300
Subject: [PATCH 3/3] build: restore workflow default parallelism values

Keep the newly exposed workflow knobs, but reset their default values to match the original main-branch behavior for SWE-Bench and SWT-Bench image builds.

Co-authored-by: openhands <openhands@all-hands.dev>
---
 .github/workflows/build-swebench-images.yml |  8 ++++----
 .github/workflows/build-swtbench-images.yml | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml
index b3d7db27e..323164724 100644
--- a/.github/workflows/build-swebench-images.yml
+++ b/.github/workflows/build-swebench-images.yml
@@ -22,12 +22,12 @@ on:
       max-workers:
         description: 'Number of concurrent builds'
         required: false
-        default: '32'
+        default: '12'
         type: string
       build-batch-size:
         description: 'Number of images to submit per batch'
         required: false
-        default: '50'
+        default: '15'
         type: string
       max-retries:
         description: 'Retries per image build'
@@ -64,12 +64,12 @@ on:
 env:
   DATASET: princeton-nlp/SWE-bench_Verified
   SPLIT: test
-  MAX_WORKERS: '32'
+  MAX_WORKERS: '12'
   MAX_RETRIES: '5'
   N_LIMIT: '500'
   INSTANCE_IDS: ''
   SELECT_FILE: ''
-  BUILD_BATCH_SIZE: '50'
+  BUILD_BATCH_SIZE: '15'
   BUILDKIT_PRUNE_KEEP_GB: '60'
   BUILDKIT_PRUNE_THRESHOLD_PCT: '60'
 
diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml
index 317dbd718..56d5fc2a2 100644
--- a/.github/workflows/build-swtbench-images.yml
+++ b/.github/workflows/build-swtbench-images.yml
@@ -22,12 +22,12 @@ on:
       max-workers:
         description: 'Maximum number of parallel workers'
         required: false
-        default: '16'
+        default: '4'
         type: string
       build-batch-size:
         description: 'Number of agent-server images to submit per batch'
         required: false
-        default: '50'
+        default: '15'
         type: string
       n-limit:
         description: 'Limit number of images to build (0 for all)'
@@ -92,8 +92,8 @@ jobs:
     env:
       DATASET: eth-sri/SWT-bench_Verified_bm25_27k_zsp
       SPLIT: test
-      MAX_WORKERS: '16'
-      BUILD_BATCH_SIZE: '50'
+      MAX_WORKERS: '4'
+      BUILD_BATCH_SIZE: '15'
       N_LIMIT: '0'
       INSTANCE_IDS: ''
       SELECT_FILE: ''
@@ -152,8 +152,8 @@ jobs:
           # Get inputs with defaults
           DATASET="${{ inputs.dataset || 'eth-sri/SWT-bench_Verified_bm25_27k_zsp' }}"
           SPLIT="${{ inputs.split || 'test' }}"
-          MAX_WORKERS="${{ inputs.max-workers || '16' }}"
-          BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '50' }}"
+          MAX_WORKERS="${{ inputs.max-workers || '4' }}"
+          BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '15' }}"
           N_LIMIT="${{ inputs.n-limit || '0' }}"
           INSTANCE_IDS="${{ inputs.instance-ids }}"
           FORCE_BUILD="${{ inputs.force-build || 'false' }}"