diff --git a/.github/workflows/build-swebench-images.yml b/.github/workflows/build-swebench-images.yml
index 9a307dd63..323164724 100644
--- a/.github/workflows/build-swebench-images.yml
+++ b/.github/workflows/build-swebench-images.yml
@@ -24,6 +24,11 @@ on:
         required: false
         default: '12'
         type: string
+      build-batch-size:
+        description: 'Number of images to submit per batch'
+        required: false
+        default: '15'
+        type: string
       max-retries:
         description: 'Retries per image build'
         required: false
@@ -119,6 +124,7 @@ jobs:
           if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi
           if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi
           if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi
+          if [ -n "${{ inputs.build-batch-size }}" ]; then echo "BUILD_BATCH_SIZE=${{ inputs.build-batch-size }}" >> "$GITHUB_ENV"; fi
           if [ -n "${{ inputs.max-retries }}" ]; then echo "MAX_RETRIES=${{ inputs.max-retries }}" >> "$GITHUB_ENV"; fi
           # Empty string means "no limit"
           if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi
@@ -238,6 +244,7 @@ jobs:
             --image ghcr.io/openhands/eval-agent-server \
             --push \
             --max-workers '${MAX_WORKERS}' \
+            --build-batch-size '${BUILD_BATCH_SIZE:-15}' \
             --max-retries '${MAX_RETRIES}'"
 
           # Only include --n-limit if provided (non-empty)
diff --git a/.github/workflows/build-swtbench-images.yml b/.github/workflows/build-swtbench-images.yml
index db4a66ce9..56d5fc2a2 100644
--- a/.github/workflows/build-swtbench-images.yml
+++ b/.github/workflows/build-swtbench-images.yml
@@ -24,6 +24,11 @@ on:
         required: false
         default: '4'
         type: string
+      build-batch-size:
+        description: 'Number of agent-server images to submit per batch'
+        required: false
+        default: '15'
+        type: string
       n-limit:
         description: 'Limit number of images to build (0 for all)'
         required: false
@@ -53,7 +58,7 @@ on:
         required: false
         default: '2'
         type: string
-      build-batch-size:
+      eval-env-build-batch-size:
         description: 'Env images per batch for eval env builds'
         required: false
         default: '10'
@@ -88,6 +93,7 @@ jobs:
       DATASET: eth-sri/SWT-bench_Verified_bm25_27k_zsp
       SPLIT: test
       MAX_WORKERS: '4'
+      BUILD_BATCH_SIZE: '15'
       N_LIMIT: '0'
       INSTANCE_IDS: ''
       SELECT_FILE: ''
@@ -147,6 +153,7 @@ jobs:
           DATASET="${{ inputs.dataset || 'eth-sri/SWT-bench_Verified_bm25_27k_zsp' }}"
           SPLIT="${{ inputs.split || 'test' }}"
           MAX_WORKERS="${{ inputs.max-workers || '4' }}"
+          BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '15' }}"
           N_LIMIT="${{ inputs.n-limit || '0' }}"
           INSTANCE_IDS="${{ inputs.instance-ids }}"
           FORCE_BUILD="${{ inputs.force-build || 'false' }}"
@@ -172,6 +179,7 @@ jobs:
             --image ghcr.io/openhands/eval-agent-server \
             --target ${TARGET} \
             --max-workers ${MAX_WORKERS} \
+            --build-batch-size ${BUILD_BATCH_SIZE} \
             --push"
 
           # Add n-limit if specified
@@ -210,7 +218,7 @@ jobs:
           MAX_WORKERS="${{ inputs.max-workers || '4' }}"
           BUILD_MODE="${{ inputs.build-mode || 'cli' }}"
           MAX_RETRIES="${{ inputs.max-retries || '2' }}"
-          BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '10' }}"
+          BUILD_BATCH_SIZE="${{ inputs.eval-env-build-batch-size || '10' }}"
           FORCE_BUILD="${{ inputs.force-build || 'false' }}"
 
           echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV"
diff --git a/benchmarks/commit0/build_images.py b/benchmarks/commit0/build_images.py
index 23bce135c..58ab16542 100644
--- a/benchmarks/commit0/build_images.py
+++ b/benchmarks/commit0/build_images.py
@@ -124,6 +124,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/gaia/build_images.py b/benchmarks/gaia/build_images.py
index c7eb4df65..6da18829b 100644
--- a/benchmarks/gaia/build_images.py
+++ b/benchmarks/gaia/build_images.py
@@ -100,6 +100,7 @@ def tag_fn(_base: str) -> str:
         image=args.image,
         push=args.push,
         max_workers=1,  # Only building one image
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/multiswebench/build_images.py b/benchmarks/multiswebench/build_images.py
index 649e27ab2..3db208e50 100644
--- a/benchmarks/multiswebench/build_images.py
+++ b/benchmarks/multiswebench/build_images.py
@@ -118,6 +118,7 @@ def main():
         ),
         base_image_to_custom_tag_fn=extract_custom_tag,
         max_workers=args.num_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=False,
         force_build=args.force_build,
     )
diff --git a/benchmarks/swebench/build_images.py b/benchmarks/swebench/build_images.py
index 0146dae7d..90306433f 100644
--- a/benchmarks/swebench/build_images.py
+++ b/benchmarks/swebench/build_images.py
@@ -177,6 +177,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/swebenchmultilingual/build_images.py b/benchmarks/swebenchmultilingual/build_images.py
index 8c7e1e579..e078cea30 100644
--- a/benchmarks/swebenchmultilingual/build_images.py
+++ b/benchmarks/swebenchmultilingual/build_images.py
@@ -177,7 +177,9 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
+        force_build=args.force_build,
         max_retries=args.max_retries,
         base_image_to_custom_tag_fn=extract_custom_tag,
         post_build_fn=_wrap_if_needed,
diff --git a/benchmarks/swebenchmultimodal/build_images.py b/benchmarks/swebenchmultimodal/build_images.py
index 775e20fa7..3c436a31c 100644
--- a/benchmarks/swebenchmultimodal/build_images.py
+++ b/benchmarks/swebenchmultimodal/build_images.py
@@ -83,6 +83,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/swegym/build_images.py b/benchmarks/swegym/build_images.py
index 94e2be1f2..e5941bf81 100644
--- a/benchmarks/swegym/build_images.py
+++ b/benchmarks/swegym/build_images.py
@@ -87,6 +87,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/swesmith/build_images.py b/benchmarks/swesmith/build_images.py
index eb59b7796..2c63bb065 100644
--- a/benchmarks/swesmith/build_images.py
+++ b/benchmarks/swesmith/build_images.py
@@ -84,6 +84,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/swtbench/build_images.py b/benchmarks/swtbench/build_images.py
index 1c0cb19b7..f28ac5eb8 100644
--- a/benchmarks/swtbench/build_images.py
+++ b/benchmarks/swtbench/build_images.py
@@ -44,6 +44,7 @@ def main(argv: list[str]) -> int:
         image=args.image,
         push=args.push,
         max_workers=args.max_workers,
+        build_batch_size=args.build_batch_size,
         dry_run=args.dry_run,
         force_build=args.force_build,
         max_retries=args.max_retries,
diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py
index d0a33d647..d73808ed4 100644
--- a/benchmarks/utils/build_utils.py
+++ b/benchmarks/utils/build_utils.py
@@ -342,6 +342,15 @@ def get_build_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--max-workers", type=int, default=1, help="Concurrent builds (be cautious)"
     )
+    parser.add_argument(
+        "--build-batch-size",
+        type=int,
+        default=None,
+        help=(
+            "Number of images to submit per batch. Defaults to BUILD_BATCH_SIZE "
+            "when unset."
+        ),
+    )
     parser.add_argument(
         "--dry-run", action="store_true", help="List base images only, don’t build"
     )
@@ -666,6 +675,7 @@ def build_all_images(
     push: bool = False,
     base_image_to_custom_tag_fn: Callable[[str], str] | None = None,
     max_workers: int = 1,
+    build_batch_size: int | None = None,
     dry_run: bool = False,
     force_build: bool = False,
     max_retries: int = 3,
@@ -684,6 +694,8 @@ def build_all_images(
         base_image_to_custom_tag_fn: Function to extract a custom tag from a base image.
             Evaluated before scheduling builds so it can safely be a closure.
         max_workers: Number of concurrent builds.
+        build_batch_size: Number of images to submit per batch. If None, use the
+            BUILD_BATCH_SIZE environment variable.
         dry_run: If True, only list base images without building.
         force_build: If True, rebuild even when matching remote images already exist.
         max_retries: Number of times to retry each failed build (default: 3).
@@ -712,7 +724,11 @@ def build_all_images(
 
     # Batch/prune settings (tunable via env to control disk usage on sticky runners)
     # Default to smaller batches and more aggressive pruning on shared runners.
-    batch_size = int(os.getenv("BUILD_BATCH_SIZE", "15"))
+    batch_size = (
+        build_batch_size
+        if build_batch_size is not None
+        else int(os.getenv("BUILD_BATCH_SIZE") or "15")  # empty counts as unset
+    )
     prune_keep_storage_gb = int(os.getenv("BUILDKIT_PRUNE_KEEP_GB", "60"))
     prune_threshold_pct = float(os.getenv("BUILDKIT_PRUNE_THRESHOLD_PCT", "60"))
     # Prune aggressively by default; filters like "unused-for=12h" prevented GC from
diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py
index 470b050f5..f0458940d 100644
--- a/tests/test_image_utils.py
+++ b/tests/test_image_utils.py
@@ -391,6 +391,93 @@ def test_build_parser_accepts_force_build(self):
         assert args.force_build is True
 
 
+class TestBuildBatchSizeConfig:
+    def test_build_parser_accepts_build_batch_size(self):
+        from benchmarks.utils.build_utils import get_build_parser
+
+        args = get_build_parser().parse_args(["--build-batch-size", "50"])
+
+        assert args.build_batch_size == 50
+
+    @staticmethod
+    def _run_fake_build(
+        tmp_path: Path, **build_kwargs
+    ) -> tuple[int, list[list[str]]]:
+        """Run build_all_images on three images with a fake executor.
+
+        Returns the exit code and the images submitted to each executor.
+        """
+        from benchmarks.utils import build_utils
+
+        seen_batches: list[list[str]] = []
+
+        class FakeFuture:
+            def __init__(self, result: BuildOutput):
+                self._result = result
+
+            def result(self) -> BuildOutput:
+                return self._result
+
+        class FakeExecutor:
+            def __init__(self, *args, **kwargs):
+                self._batch: list[str] = []
+
+            def __enter__(self):
+                seen_batches.append(self._batch)
+                return self
+
+            def __exit__(self, exc_type, exc, tb):
+                return False
+
+            def submit(self, fn, **kwargs):
+                self._batch.append(kwargs["base_image"])
+                return FakeFuture(
+                    BuildOutput(
+                        base_image=kwargs["base_image"],
+                        tags=[f"tag:{kwargs['base_image']}"],
+                        error=None,
+                    )
+                )
+
+        with (
+            patch.object(build_utils, "ProcessPoolExecutor", FakeExecutor),
+            patch.object(
+                build_utils, "as_completed", side_effect=lambda futures: futures
+            ),
+            patch.object(build_utils, "buildkit_disk_usage", return_value=(0, 0)),
+            patch.object(build_utils, "maybe_prune_buildkit_cache", return_value=False),
+        ):
+            exit_code = build_utils.build_all_images(
+                base_images=["base-1", "base-2", "base-3"],
+                target="source-minimal",
+                build_dir=tmp_path,
+                **build_kwargs,
+            )
+
+        return exit_code, seen_batches
+
+    @patch.dict(os.environ, {"BUILD_BATCH_SIZE": "99"})
+    def test_build_all_images_prefers_explicit_batch_size_over_env(
+        self,
+        tmp_path: Path,
+    ):
+        exit_code, seen_batches = self._run_fake_build(tmp_path, build_batch_size=2)
+
+        assert exit_code == 0
+        assert seen_batches == [["base-1", "base-2"], ["base-3"]]
+
+    @patch.dict(os.environ, {"BUILD_BATCH_SIZE": ""})
+    def test_build_all_images_treats_empty_env_batch_size_as_unset(
+        self,
+        tmp_path: Path,
+    ):
+        # An empty variable must fall back to the default of 15, not crash int("").
+        exit_code, seen_batches = self._run_fake_build(tmp_path)
+
+        assert exit_code == 0
+        assert seen_batches == [["base-1", "base-2", "base-3"]]
+
+
 class TestCachedSdistReuse:
     def test_build_image_passes_cached_sdist_to_sdk_build_module(
         self,