Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/build-swebench-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ on:
required: false
default: '12'
type: string
build-batch-size:
description: 'Number of images to submit per batch'
required: false
default: '15'
type: string
max-retries:
description: 'Retries per image build'
required: false
Expand Down Expand Up @@ -119,6 +124,7 @@ jobs:
if [ -n "${{ inputs.dataset }}" ]; then echo "DATASET=${{ inputs.dataset }}" >> "$GITHUB_ENV"; fi
if [ -n "${{ inputs.split }}" ]; then echo "SPLIT=${{ inputs.split }}" >> "$GITHUB_ENV"; fi
if [ -n "${{ inputs.max-workers }}" ]; then echo "MAX_WORKERS=${{ inputs.max-workers }}" >> "$GITHUB_ENV"; fi
if [ -n "${{ inputs.build-batch-size }}" ]; then echo "BUILD_BATCH_SIZE=${{ inputs.build-batch-size }}" >> "$GITHUB_ENV"; fi
if [ -n "${{ inputs.max-retries }}" ]; then echo "MAX_RETRIES=${{ inputs.max-retries }}" >> "$GITHUB_ENV"; fi
# Empty string means "no limit"
if [ -n "${{ inputs.n-limit }}" ]; then echo "N_LIMIT=${{ inputs.n-limit }}" >> "$GITHUB_ENV"; else echo "N_LIMIT=" >> "$GITHUB_ENV"; fi
Expand Down Expand Up @@ -238,6 +244,7 @@ jobs:
--image ghcr.io/openhands/eval-agent-server \
--push \
--max-workers '${MAX_WORKERS}' \
--build-batch-size '${BUILD_BATCH_SIZE}' \
--max-retries '${MAX_RETRIES}'"

# Only include --n-limit if provided (non-empty)
Expand Down
12 changes: 10 additions & 2 deletions .github/workflows/build-swtbench-images.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ on:
required: false
default: '4'
type: string
build-batch-size:
description: 'Number of agent-server images to submit per batch'
required: false
default: '15'
type: string
n-limit:
description: 'Limit number of images to build (0 for all)'
required: false
Expand Down Expand Up @@ -53,7 +58,7 @@ on:
required: false
default: '2'
type: string
build-batch-size:
eval-env-build-batch-size:
description: 'Env images per batch for eval env builds'
required: false
default: '10'
Expand Down Expand Up @@ -88,6 +93,7 @@ jobs:
DATASET: eth-sri/SWT-bench_Verified_bm25_27k_zsp
SPLIT: test
MAX_WORKERS: '4'
BUILD_BATCH_SIZE: '15'
N_LIMIT: '0'
INSTANCE_IDS: ''
SELECT_FILE: ''
Expand Down Expand Up @@ -147,6 +153,7 @@ jobs:
DATASET="${{ inputs.dataset || 'eth-sri/SWT-bench_Verified_bm25_27k_zsp' }}"
SPLIT="${{ inputs.split || 'test' }}"
MAX_WORKERS="${{ inputs.max-workers || '4' }}"
BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '15' }}"
N_LIMIT="${{ inputs.n-limit || '0' }}"
INSTANCE_IDS="${{ inputs.instance-ids }}"
FORCE_BUILD="${{ inputs.force-build || 'false' }}"
Expand All @@ -172,6 +179,7 @@ jobs:
--image ghcr.io/openhands/eval-agent-server \
--target ${TARGET} \
--max-workers ${MAX_WORKERS} \
--build-batch-size ${BUILD_BATCH_SIZE} \
--push"

# Add n-limit if specified
Expand Down Expand Up @@ -210,7 +218,7 @@ jobs:
MAX_WORKERS="${{ inputs.max-workers || '4' }}"
BUILD_MODE="${{ inputs.build-mode || 'cli' }}"
MAX_RETRIES="${{ inputs.max-retries || '2' }}"
BUILD_BATCH_SIZE="${{ inputs.build-batch-size || '10' }}"
BUILD_BATCH_SIZE="${{ inputs.eval-env-build-batch-size || '10' }}"
FORCE_BUILD="${{ inputs.force-build || 'false' }}"

echo "N_LIMIT=${N_LIMIT}" >> "$GITHUB_ENV"
Expand Down
1 change: 1 addition & 0 deletions benchmarks/commit0/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def main(argv: list[str]) -> int:
image=args.image,
push=args.push,
max_workers=args.max_workers,
build_batch_size=args.build_batch_size,
dry_run=args.dry_run,
force_build=args.force_build,
max_retries=args.max_retries,
Expand Down
1 change: 1 addition & 0 deletions benchmarks/gaia/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ def tag_fn(_base: str) -> str:
image=args.image,
push=args.push,
max_workers=1, # Only building one image
build_batch_size=args.build_batch_size,
dry_run=args.dry_run,
force_build=args.force_build,
max_retries=args.max_retries,
Expand Down
1 change: 1 addition & 0 deletions benchmarks/multiswebench/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def main():
),
base_image_to_custom_tag_fn=extract_custom_tag,
max_workers=args.num_workers,
build_batch_size=args.build_batch_size,
dry_run=False,
force_build=args.force_build,
)
Expand Down
1 change: 1 addition & 0 deletions benchmarks/swebench/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,7 @@ def main(argv: list[str]) -> int:
image=args.image,
push=args.push,
max_workers=args.max_workers,
build_batch_size=args.build_batch_size,
dry_run=args.dry_run,
force_build=args.force_build,
max_retries=args.max_retries,
Expand Down
2 changes: 2 additions & 0 deletions benchmarks/swebenchmultilingual/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,9 @@ def main(argv: list[str]) -> int:
image=args.image,
push=args.push,
max_workers=args.max_workers,
build_batch_size=args.build_batch_size,
dry_run=args.dry_run,
force_build=args.force_build,
max_retries=args.max_retries,
base_image_to_custom_tag_fn=extract_custom_tag,
post_build_fn=_wrap_if_needed,
Expand Down
1 change: 1 addition & 0 deletions benchmarks/swebenchmultimodal/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ def main(argv: list[str]) -> int:
image=args.image,
push=args.push,
max_workers=args.max_workers,
build_batch_size=args.build_batch_size,
dry_run=args.dry_run,
force_build=args.force_build,
max_retries=args.max_retries,
Expand Down
1 change: 1 addition & 0 deletions benchmarks/swegym/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def main(argv: list[str]) -> int:
image=args.image,
push=args.push,
max_workers=args.max_workers,
build_batch_size=args.build_batch_size,
dry_run=args.dry_run,
force_build=args.force_build,
max_retries=args.max_retries,
Expand Down
1 change: 1 addition & 0 deletions benchmarks/swesmith/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def main(argv: list[str]) -> int:
image=args.image,
push=args.push,
max_workers=args.max_workers,
build_batch_size=args.build_batch_size,
dry_run=args.dry_run,
force_build=args.force_build,
max_retries=args.max_retries,
Expand Down
1 change: 1 addition & 0 deletions benchmarks/swtbench/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def main(argv: list[str]) -> int:
image=args.image,
push=args.push,
max_workers=args.max_workers,
build_batch_size=args.build_batch_size,
dry_run=args.dry_run,
force_build=args.force_build,
max_retries=args.max_retries,
Expand Down
18 changes: 17 additions & 1 deletion benchmarks/utils/build_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,15 @@ def get_build_parser() -> argparse.ArgumentParser:
parser.add_argument(
"--max-workers", type=int, default=1, help="Concurrent builds (be cautious)"
)
parser.add_argument(
"--build-batch-size",
type=int,
default=None,
help=(
"Number of images to submit per batch. Defaults to BUILD_BATCH_SIZE "
"when unset."
),
)
parser.add_argument(
"--dry-run", action="store_true", help="List base images only, don’t build"
)
Expand Down Expand Up @@ -666,6 +675,7 @@ def build_all_images(
push: bool = False,
base_image_to_custom_tag_fn: Callable[[str], str] | None = None,
max_workers: int = 1,
build_batch_size: int | None = None,
dry_run: bool = False,
force_build: bool = False,
max_retries: int = 3,
Expand All @@ -684,6 +694,8 @@ def build_all_images(
base_image_to_custom_tag_fn: Function to extract a custom tag from a base image.
Evaluated before scheduling builds so it can safely be a closure.
max_workers: Number of concurrent builds.
build_batch_size: Number of images to submit per batch. If None, use the
BUILD_BATCH_SIZE environment variable.
dry_run: If True, only list base images without building.
force_build: If True, rebuild even when matching remote images already exist.
max_retries: Number of times to retry each failed build (default: 3).
Expand Down Expand Up @@ -712,7 +724,11 @@ def build_all_images(

# Batch/prune settings (tunable via env to control disk usage on sticky runners)
# Default to smaller batches and more aggressive pruning on shared runners.
batch_size = int(os.getenv("BUILD_BATCH_SIZE", "15"))
batch_size = (
build_batch_size
if build_batch_size is not None
else int(os.getenv("BUILD_BATCH_SIZE", "15"))
)
prune_keep_storage_gb = int(os.getenv("BUILDKIT_PRUNE_KEEP_GB", "60"))
prune_threshold_pct = float(os.getenv("BUILDKIT_PRUNE_THRESHOLD_PCT", "60"))
# Prune aggressively by default; filters like "unused-for=12h" prevented GC from
Expand Down
64 changes: 64 additions & 0 deletions tests/test_image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,70 @@ def test_build_parser_accepts_force_build(self):
assert args.force_build is True


class TestBuildBatchSizeConfig:
    """Tests for the ``--build-batch-size`` CLI flag and its plumbing into
    ``build_all_images`` (explicit argument vs. ``BUILD_BATCH_SIZE`` env var)."""

    def test_build_parser_accepts_build_batch_size(self):
        """The shared build parser parses ``--build-batch-size`` as an int."""
        from benchmarks.utils.build_utils import get_build_parser

        args = get_build_parser().parse_args(["--build-batch-size", "50"])

        assert args.build_batch_size == 50

    @patch.dict(os.environ, {"BUILD_BATCH_SIZE": "99"})
    def test_build_all_images_prefers_explicit_batch_size_over_env(
        self,
        tmp_path: Path,
    ):
        """An explicit ``build_batch_size`` argument wins over the
        ``BUILD_BATCH_SIZE`` env var (patched to 99 above): with batch
        size 2 and three base images, batches of [2, 1] are submitted.
        """
        from benchmarks.utils import build_utils

        # Records the base images submitted within each executor batch.
        seen_batches: list[list[str]] = []

        class FakeFuture:
            # Minimal stand-in for concurrent.futures.Future: resolves
            # immediately with a pre-built result.
            def __init__(self, result: BuildOutput):
                self._result = result

            def result(self) -> BuildOutput:
                return self._result

        class FakeExecutor:
            # Replaces ProcessPoolExecutor. One instance is created per
            # batch, so each __enter__ starts a fresh batch record.
            def __init__(self, *args, **kwargs):
                self._batch: list[str] = []

            def __enter__(self):
                seen_batches.append(self._batch)
                return self

            def __exit__(self, exc_type, exc, tb):
                # Do not swallow exceptions raised inside the with-block.
                return False

            def submit(self, fn, **kwargs):
                # Record the image and hand back an already-successful build.
                self._batch.append(kwargs["base_image"])
                return FakeFuture(
                    BuildOutput(
                        base_image=kwargs["base_image"],
                        tags=[f"tag:{kwargs['base_image']}"],
                        error=None,
                    )
                )

        # Stub out process pools, future iteration, and buildkit disk
        # housekeeping so build_all_images runs synchronously in-process.
        with (
            patch.object(build_utils, "ProcessPoolExecutor", FakeExecutor),
            patch.object(
                build_utils, "as_completed", side_effect=lambda futures: futures
            ),
            patch.object(build_utils, "buildkit_disk_usage", return_value=(0, 0)),
            patch.object(build_utils, "maybe_prune_buildkit_cache", return_value=False),
        ):
            exit_code = build_utils.build_all_images(
                base_images=["base-1", "base-2", "base-3"],
                target="source-minimal",
                build_dir=tmp_path,
                build_batch_size=2,
            )

        assert exit_code == 0
        # The explicit batch size (2) was honored, not the env value (99).
        assert seen_batches == [["base-1", "base-2"], ["base-3"]]


class TestCachedSdistReuse:
def test_build_image_passes_cached_sdist_to_sdk_build_module(
self,
Expand Down
Loading