From d5dd8c8ae0870a2b23e383cca36f6f123afe879d Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 13 Mar 2026 17:01:55 -0300 Subject: [PATCH 1/3] build: expose image build parallelism knobs to workflows Add an explicit build_batch_size parameter to the shared build helper, thread it through the current image build entrypoints, and surface the corresponding workflow inputs for SWE-Bench and SWT-Bench. This lets workflow-dispatched max worker and batch-size values reach the build logic without being overridden by environment defaults. Co-authored-by: openhands --- .github/workflows/build-swebench-images.yml | 13 +++- .github/workflows/build-swtbench-images.yml | 14 +++- benchmarks/commit0/build_images.py | 1 + benchmarks/gaia/build_images.py | 1 + benchmarks/multiswebench/build_images.py | 1 + benchmarks/swebench/build_images.py | 1 + .../swebenchmultilingual/build_images.py | 2 + benchmarks/swebenchmultimodal/build_images.py | 1 + benchmarks/swegym/build_images.py | 1 + benchmarks/swesmith/build_images.py | 1 + benchmarks/swtbench/build_images.py | 1 + benchmarks/utils/build_utils.py | 18 +++++- tests/test_image_utils.py | 64 +++++++++++++++++++ 13 files changed, 112 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml index 9a307dd63..b3d7db27e 100644 --- a/.github/workflows/build-swebench-images.yml +++ b/.github/workflows/build-swebench-images.yml @@ -22,7 +22,12 @@ on: max-workers: description: 'Number of concurrent builds' required: false - default: '12' + default: '32' + type: string + build-batch-size: + description: 'Number of images to submit per batch' + required: false + default: '50' type: string max-retries: description: 'Retries per image build' @@ -59,12 +64,12 @@ on: env: DATASET: princeton-nlp/SWE-bench_Verified SPLIT: test - MAX_WORKERS: '12' + MAX_WORKERS: '32' MAX_RETRIES: '5' N_LIMIT: '500' INSTANCE_IDS: '' SELECT_FILE: '' - BUILD_BATCH_SIZE: '15' + BUILD_BATCH_SIZE: '50' BUILDKIT_PRUNE_KEEP_GB: '60' BUILDKIT_PRUNE_THRESHOLD_PCT: '60' @@ -119,6 +124,7 @@ jobs: if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi + if [ -n "${{ inputs.build-batch-size }}" ]; then echo "BUILD_BATCH_SIZE=${{ inputs.build-batch-size }}" >> "$GITHUB_ENV"; fi if [ -n "${{ inputs.max-retries }}" ]; then echo "MAX_RETRIES=${{ inputs.max-retries }}" >> "$GITHUB_ENV"; fi # Empty string means "no limit" if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi @@ -238,6 +244,7 @@ jobs: --image ghcr.io/openhands/eval-agent-server \ --push \ --max-workers '${MAX_WORKERS}' \ + --build-batch-size '${BUILD_BATCH_SIZE}' \ --max-retries '${MAX_RETRIES}'" # Only include --n-limit if provided (non-empty) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index db4a66ce9..b1ffb93b7 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -22,7 +22,12 @@ on: max-workers: description: 'Maximum number of parallel workers' required: false - default: '4' + default: '16' + type: string + agent-build-batch-size: + description: 'Number of agent-server images to submit per batch' + required: false + default: '50' type: string n-limit: description: 'Limit number of images to build (0 for all)' @@ -87,7 +92,8 @@ jobs: env: DATASET: eth-sri/SWT-bench_Verified_bm25_27k_zsp SPLIT: test - MAX_WORKERS: '4' + MAX_WORKERS: '16' + BUILD_BATCH_SIZE: '50' N_LIMIT: '0' INSTANCE_IDS: '' SELECT_FILE: '' @@ -146,7 +152,8 @@ jobs: # Get inputs with defaults DATASET="${{ inputs.dataset || 'eth-sri/SWT-bench_Verified_bm25_27k_zsp' }}" SPLIT="${{ inputs.split || 'test' }}" - MAX_WORKERS="${{ inputs.max-workers || '4' }}" + MAX_WORKERS="${{ inputs.max-workers || '16' }}" + BUILD_BATCH_SIZE="${{ inputs.agent-build-batch-size || '50' }}" N_LIMIT="${{ inputs.n-limit || '0' }}" INSTANCE_IDS="${{ inputs.instance-ids }}" FORCE_BUILD="${{ inputs.force-build || 'false' }}" @@ -172,6 +179,7 @@ jobs: --image ghcr.io/openhands/eval-agent-server \ --target ${TARGET} \ --max-workers ${MAX_WORKERS} \ + --build-batch-size ${BUILD_BATCH_SIZE} \ --push" # Add n-limit if specified diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py index 23bce135c..58ab16542 100644 --- a/benchmarks/commit0/build_images.py +++ b/benchmarks/commit0/build_images.py @@ -124,6 +124,7 @@ def main(argv: list[str]) -> int: image=args.image, push=args.push, max_workers=args.max_workers, + build_batch_size=args.build_batch_size, dry_run=args.dry_run, force_build=args.force_build, max_retries=args.max_retries, diff --git a/benchmarks/gaia/build_images.py b/benchmarks/gaia/build_images.py index c7eb4df65..6da18829b 100644 --- a/benchmarks/gaia/build_images.py +++ b/benchmarks/gaia/build_images.py @@ -100,6 +100,7 @@ def tag_fn(_base: str) -> str: image=args.image, push=args.push, max_workers=1, # Only building one image + build_batch_size=args.build_batch_size, dry_run=args.dry_run, force_build=args.force_build, max_retries=args.max_retries, diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py index 649e27ab2..3db208e50 100644 --- a/benchmarks/multiswebench/build_images.py +++ b/benchmarks/multiswebench/build_images.py @@ -118,6 +118,7 @@ def main(): ), base_image_to_custom_tag_fn=extract_custom_tag, max_workers=args.num_workers, + build_batch_size=args.build_batch_size, dry_run=False, force_build=args.force_build, ) diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py index 0146dae7d..90306433f 100644 --- a/benchmarks/swebench/build_images.py +++ b/benchmarks/swebench/build_images.py @@ -177,6 +177,7 @@ def main(argv: list[str]) -> int: image=args.image, push=args.push, max_workers=args.max_workers, + build_batch_size=args.build_batch_size, dry_run=args.dry_run, force_build=args.force_build, max_retries=args.max_retries, diff --git a/benchmarks/swebenchmultilingual/build_images.py b/benchmarks/swebenchmultilingual/build_images.py index 8c7e1e579..e078cea30 100644 --- a/benchmarks/swebenchmultilingual/build_images.py +++ b/benchmarks/swebenchmultilingual/build_images.py @@ -177,7 +177,9 @@ def main(argv: list[str]) -> int: image=args.image, push=args.push, max_workers=args.max_workers, + build_batch_size=args.build_batch_size, dry_run=args.dry_run, + force_build=args.force_build, max_retries=args.max_retries, base_image_to_custom_tag_fn=extract_custom_tag, post_build_fn=_wrap_if_needed, diff --git a/benchmarks/swebenchmultimodal/build_images.py b/benchmarks/swebenchmultimodal/build_images.py index 775e20fa7..3c436a31c 100644 --- a/benchmarks/swebenchmultimodal/build_images.py +++ b/benchmarks/swebenchmultimodal/build_images.py @@ -83,6 +83,7 @@ def main(argv: list[str]) -> int: image=args.image, push=args.push, max_workers=args.max_workers, + build_batch_size=args.build_batch_size, dry_run=args.dry_run, force_build=args.force_build, max_retries=args.max_retries, diff --git a/benchmarks/swegym/build_images.py b/benchmarks/swegym/build_images.py index 94e2be1f2..e5941bf81 100644 --- a/benchmarks/swegym/build_images.py +++ b/benchmarks/swegym/build_images.py @@ -87,6 +87,7 @@ def main(argv: list[str]) -> int: image=args.image, push=args.push, max_workers=args.max_workers, + build_batch_size=args.build_batch_size, dry_run=args.dry_run, force_build=args.force_build, max_retries=args.max_retries, diff --git a/benchmarks/swesmith/build_images.py b/benchmarks/swesmith/build_images.py index eb59b7796..2c63bb065 100644 --- a/benchmarks/swesmith/build_images.py +++ b/benchmarks/swesmith/build_images.py @@ -84,6 +84,7 @@ def main(argv: list[str]) -> int: image=args.image, push=args.push, max_workers=args.max_workers, + build_batch_size=args.build_batch_size, dry_run=args.dry_run, force_build=args.force_build, max_retries=args.max_retries, diff --git a/benchmarks/swtbench/build_images.py b/benchmarks/swtbench/build_images.py index 1c0cb19b7..f28ac5eb8 100644 --- a/benchmarks/swtbench/build_images.py +++ b/benchmarks/swtbench/build_images.py @@ -44,6 +44,7 @@ def main(argv: list[str]) -> int: image=args.image, push=args.push, max_workers=args.max_workers, + build_batch_size=args.build_batch_size, dry_run=args.dry_run, force_build=args.force_build, max_retries=args.max_retries, diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index 2b6646752..37284ddc7 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -289,6 +289,15 @@ def get_build_parser() -> argparse.ArgumentParser: parser.add_argument( "--max-workers", type=int, default=1, help="Concurrent builds (be cautious)" ) + parser.add_argument( + "--build-batch-size", + type=int, + default=None, + help=( + "Number of images to submit per batch. Defaults to BUILD_BATCH_SIZE " + "when unset." + ), + ) parser.add_argument( "--dry-run", action="store_true", help="List base images only, don’t build" ) @@ -609,6 +618,7 @@ def build_all_images( push: bool = False, base_image_to_custom_tag_fn: Callable[[str], str] | None = None, max_workers: int = 1, + build_batch_size: int | None = None, dry_run: bool = False, force_build: bool = False, max_retries: int = 3, @@ -627,6 +637,8 @@ def build_all_images( base_image_to_custom_tag_fn: Function to extract a custom tag from a base image. Evaluated before scheduling builds so it can safely be a closure. max_workers: Number of concurrent builds. + build_batch_size: Number of images to submit per batch. If None, use the + BUILD_BATCH_SIZE environment variable. dry_run: If True, only list base images without building. force_build: If True, rebuild even when matching remote images already exist. max_retries: Number of times to retry each failed build (default: 3). @@ -655,7 +667,11 @@ def build_all_images( # Batch/prune settings (tunable via env to control disk usage on sticky runners) # Default to smaller batches and more aggressive pruning on shared runners. - batch_size = int(os.getenv("BUILD_BATCH_SIZE", "15")) + batch_size = ( + build_batch_size + if build_batch_size is not None + else int(os.getenv("BUILD_BATCH_SIZE", "15")) + ) prune_keep_storage_gb = int(os.getenv("BUILDKIT_PRUNE_KEEP_GB", "60")) prune_threshold_pct = float(os.getenv("BUILDKIT_PRUNE_THRESHOLD_PCT", "60")) # Prune aggressively by default; filters like "unused-for=12h" prevented GC from diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py index 40535446e..6d9134404 100644 --- a/tests/test_image_utils.py +++ b/tests/test_image_utils.py @@ -390,6 +390,70 @@ def test_build_parser_accepts_force_build(self): assert args.force_build is True +class TestBuildBatchSizeConfig: + def test_build_parser_accepts_build_batch_size(self): + from benchmarks.utils.build_utils import get_build_parser + + args = get_build_parser().parse_args(["--build-batch-size", "50"]) + + assert args.build_batch_size == 50 + + @patch.dict(os.environ, {"BUILD_BATCH_SIZE": "99"}) + def test_build_all_images_prefers_explicit_batch_size_over_env( + self, + tmp_path: Path, + ): + from benchmarks.utils import build_utils + + seen_batches: list[list[str]] = [] + + class FakeFuture: + def __init__(self, result: BuildOutput): + self._result = result + + def result(self) -> BuildOutput: + return self._result + + class FakeExecutor: + def __init__(self, *args, **kwargs): + self._batch: list[str] = [] + + def __enter__(self): + seen_batches.append(self._batch) + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def submit(self, fn, **kwargs): + self._batch.append(kwargs["base_image"]) + return FakeFuture( + BuildOutput( + base_image=kwargs["base_image"], + tags=[f"tag:{kwargs['base_image']}"], + error=None, + ) + ) + + with ( + patch.object(build_utils, "ProcessPoolExecutor", FakeExecutor), + patch.object( + build_utils, "as_completed", side_effect=lambda futures: futures + ), + patch.object(build_utils, "buildkit_disk_usage", return_value=(0, 0)), + patch.object(build_utils, "maybe_prune_buildkit_cache", return_value=False), + ): + exit_code = build_utils.build_all_images( + base_images=["base-1", "base-2", "base-3"], + target="source-minimal", + build_dir=tmp_path, + build_batch_size=2, + ) + + assert exit_code == 0 + assert seen_batches == [["base-1", "base-2"], ["base-3"]] + + class TestBuildWithLoggingTelemetry: @patch("benchmarks.utils.build_utils.maybe_reset_buildkit") @patch("benchmarks.utils.build_utils.time.monotonic", side_effect=[100.0, 109.5]) From 89e3a3e2a84eb6bae856499a8e8e7dd9c451c0eb Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 13 Mar 2026 17:33:40 -0300 Subject: [PATCH 2/3] build: normalize SWT workflow batch-size input Rename the SWT image-build batch-size workflow input to match the SWE-Bench workflow and move the eval-env-specific knob to eval-env-build-batch-size for clarity. Co-authored-by: openhands --- .github/workflows/build-swtbench-images.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index b1ffb93b7..317dbd718 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -24,7 +24,7 @@ on: required: false default: '16' type: string - agent-build-batch-size: + build-batch-size: description: 'Number of agent-server images to submit per batch' required: false default: '50' @@ -58,7 +58,7 @@ on: required: false default: '2' type: string - build-batch-size: + eval-env-build-batch-size: description: 'Env images per batch for eval env builds' required: false default: '10' @@ -153,7 +153,7 @@ jobs: DATASET="${{ inputs.dataset || 'eth-sri/SWT-bench_Verified_bm25_27k_zsp' }}" SPLIT="${{ inputs.split || 'test' }}" MAX_WORKERS="${{ inputs.max-workers || '16' }}" - BUILD_BATCH_SIZE="${{ inputs.agent-build-batch-size || '50' }}" + BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '50' }}" N_LIMIT="${{ inputs.n-limit || '0' }}" INSTANCE_IDS="${{ inputs.instance-ids }}" FORCE_BUILD="${{ inputs.force-build || 'false' }}" @@ -218,7 +218,7 @@ jobs: MAX_WORKERS="${{ inputs.max-workers || '4' }}" BUILD_MODE="${{ inputs.build-mode || 'cli' }}" MAX_RETRIES="${{ inputs.max-retries || '2' }}" - BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '10' }}" + BUILD_BATCH_SIZE="${{ inputs.eval-env-build-batch-size || '10' }}" FORCE_BUILD="${{ inputs.force-build || 'false' }}" echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV" From 03bd11d75a27101842497da4411051c712843d6b Mon Sep 17 00:00:00 2001 From: Simon Rosenberg Date: Fri, 13 Mar 2026 17:50:41 -0300 Subject: [PATCH 3/3] build: restore workflow default parallelism values Keep the newly exposed workflow knobs, but reset their default values to match the original main-branch behavior for SWE-Bench and SWT-Bench image builds. Co-authored-by: openhands --- .github/workflows/build-swebench-images.yml | 8 ++++---- .github/workflows/build-swtbench-images.yml | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml index b3d7db27e..323164724 100644 --- a/.github/workflows/build-swebench-images.yml +++ b/.github/workflows/build-swebench-images.yml @@ -22,12 +22,12 @@ on: max-workers: description: 'Number of concurrent builds' required: false - default: '32' + default: '12' type: string build-batch-size: description: 'Number of images to submit per batch' required: false - default: '50' + default: '15' type: string max-retries: description: 'Retries per image build' @@ -64,12 +64,12 @@ on: env: DATASET: princeton-nlp/SWE-bench_Verified SPLIT: test - MAX_WORKERS: '32' + MAX_WORKERS: '12' MAX_RETRIES: '5' N_LIMIT: '500' INSTANCE_IDS: '' SELECT_FILE: '' - BUILD_BATCH_SIZE: '50' + BUILD_BATCH_SIZE: '15' BUILDKIT_PRUNE_KEEP_GB: '60' BUILDKIT_PRUNE_THRESHOLD_PCT: '60' diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml index 317dbd718..56d5fc2a2 100644 --- a/.github/workflows/build-swtbench-images.yml +++ b/.github/workflows/build-swtbench-images.yml @@ -22,12 +22,12 @@ on: max-workers: description: 'Maximum number of parallel workers' required: false - default: '16' + default: '4' type: string build-batch-size: description: 'Number of agent-server images to submit per batch' required: false - default: '50' + default: '15' type: string n-limit: description: 'Limit number of images to build (0 for all)' @@ -92,8 +92,8 @@ jobs: env: DATASET: eth-sri/SWT-bench_Verified_bm25_27k_zsp SPLIT: test - MAX_WORKERS: '16' - BUILD_BATCH_SIZE: '50' + MAX_WORKERS: '4' + BUILD_BATCH_SIZE: '15' N_LIMIT: '0' INSTANCE_IDS: '' SELECT_FILE: '' @@ -152,8 +152,8 @@ jobs: # Get inputs with defaults DATASET="${{ inputs.dataset || 'eth-sri/SWT-bench_Verified_bm25_27k_zsp' }}" SPLIT="${{ inputs.split || 'test' }}" - MAX_WORKERS="${{ inputs.max-workers || '16' }}" - BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '50' }}" + MAX_WORKERS="${{ inputs.max-workers || '4' }}" + BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '15' }}" N_LIMIT="${{ inputs.n-limit || '0' }}" INSTANCE_IDS="${{ inputs.instance-ids }}" FORCE_BUILD="${{ inputs.force-build || 'false' }}"