Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 31 additions & 17 deletions benchmarks/commit0/run_infer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import os
from collections import Counter
from pathlib import Path
from typing import Any, List

from commit0.harness.constants import SPLIT
Expand All @@ -12,7 +13,7 @@
get_base_docker_image,
)
from benchmarks.commit0.config import INFER_DEFAULTS
from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.console_logging import summarize_instance
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
Expand All @@ -23,19 +24,19 @@
construct_eval_output_dir,
get_default_on_result_writer,
)
from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import Agent, Conversation, Tool, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace


logger = get_logger(__name__)
Expand Down Expand Up @@ -187,16 +188,15 @@ def prepare_workspace(
logger.info(f"Using base docker image: {base_docker_image}")

if self.metadata.workspace_type == "docker":
custom_tag = extract_custom_tag(base_docker_image)
suffix = f"-{build_target}" if build_target != "binary" else ""
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
)
workspace = create_docker_workspace(
agent_server_image=agent_server_image,
# Build agent-server image from base commit0 image
workspace = DockerDevWorkspace(
base_image=base_docker_image,
build_target=build_target,
forward_env=forward_env,
working_dir="/workspace",
target=build_target,
forward_env=forward_env or [],
)
logger.info(
f"Building workspace from {base_docker_image}. This may take a while..."
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
Expand All @@ -205,21 +205,22 @@ def prepare_workspace(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
custom_tag = extract_custom_tag(base_docker_image)
suffix = f"-{build_target}" if build_target != "binary" else ""
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
)

if not remote_image_exists(agent_server_image):
if not image_exists(agent_server_image):
raise RuntimeError(
f"Agent server image {agent_server_image} does not exist in container registry. "
"Run 'benchmarks/commit0/build_images.py --push' to build and push it first."
)

logger.info(
f"Using remote workspace with image {agent_server_image} "
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down Expand Up @@ -591,8 +592,21 @@ def evaluate_instance(


def main() -> None:
prompt_dir = (Path(__file__).parent / "prompts").resolve()
choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
default_prompt_path = prompt_dir / "default.j2"
assert default_prompt_path.exists(), (
f"Default prompt {default_prompt_path} not found"
)

parser = get_parser()
add_prompt_path_argument(parser, __file__)
parser.add_argument(
"--prompt-path",
type=str,
default=str(default_prompt_path),
choices=choices,
help="Path to prompt template file",
)
parser.add_argument(
"--repo-split",
type=str,
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/gaia/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
get_build_parser,
run_docker_build_layer,
)
from benchmarks.utils.image_utils import remote_image_exists
from benchmarks.utils.image_utils import image_exists
from openhands.sdk import get_logger


Expand Down Expand Up @@ -81,7 +81,7 @@ def tag_fn(_base: str) -> str:
# inflating the image and causing runtime OOM crashes.
_, git_sha, _ = _get_sdk_submodule_info()
base_gaia_image = f"{args.image}:{git_sha[:7]}-gaia-{args.target}"
if not args.dry_run and remote_image_exists(base_gaia_image):
if not args.dry_run and image_exists(base_gaia_image):
logger.info("Image %s already exists. Skipping build.", base_gaia_image)
return 0

Expand Down
25 changes: 11 additions & 14 deletions benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
get_default_on_result_writer,
)
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import (
Agent,
Conversation,
Expand All @@ -47,7 +47,7 @@
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace


logger = get_logger(__name__)
Expand Down Expand Up @@ -156,14 +156,11 @@ def prepare_workspace(
logger.info(f"Preparing workspace for instance {instance.id}")

if self.metadata.workspace_type == "docker":
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
)
workspace = create_docker_workspace(
agent_server_image=agent_server_image,
# Use DockerDevWorkspace with base image (same as main branch)
workspace = DockerDevWorkspace(
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
build_target="binary",
forward_env=forward_env,
working_dir="/workspace",
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
# For workflow, use APIRemoteWorkspace with pre-built GAIA image
Expand All @@ -177,19 +174,20 @@ def prepare_workspace(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary"
)

if not remote_image_exists(agent_server_image):
if not image_exists(agent_server_image):
raise RuntimeError(
f"Agent server image {agent_server_image} does not exist in container registry. "
f"Run 'benchmarks/gaia/build_images.py --push' to build and push it first."
)

logger.info(
f"Using remote workspace with GAIA image {agent_server_image} "
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down Expand Up @@ -592,7 +590,6 @@ def main() -> None:
max_attempts=args.max_attempts,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
workspace_type=args.workspace,
enable_delegation=args.enable_delegation,
)
Expand Down
67 changes: 53 additions & 14 deletions benchmarks/multiswebench/run_infer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
from pathlib import Path
from typing import List, cast

import pandas as pd
Expand All @@ -12,8 +13,8 @@
)
from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference
from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
from benchmarks.utils.build_utils import ensure_local_image
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.console_logging import summarize_instance
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
Expand All @@ -25,14 +26,14 @@
get_default_on_result_writer,
)
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
from benchmarks.utils.image_utils import remote_image_exists
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import (
EvalInstance,
EvalMetadata,
EvalOutput,
)
from benchmarks.utils.version import IMAGE_TAG_PREFIX
from benchmarks.utils.version import SDK_SHORT_SHA
from openhands.sdk import Agent, Conversation, Tool, get_logger
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
Expand Down Expand Up @@ -209,37 +210,62 @@ def prepare_workspace(

if self.metadata.workspace_type == "docker":
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
)
ensure_local_image(
agent_server_image=agent_server_image,
base_image=official_docker_image,
custom_tag=custom_tag,
target=build_target,
SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in (
"1",
"true",
"yes",
)
logger.info(f"MULTI_SWE_BENCH_SKIP_BUILD={SKIP_BUILD}")
if not SKIP_BUILD:
logger.info(
f"Building workspace from {official_docker_image} "
f"for instance {instance.id}. "
"This may take a while...\n"
"You can run benchmarks/multiswebench/build_images.py and set "
"MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
"agent-server image."
)
output = build_image(
base_image=official_docker_image,
target_image=EVAL_AGENT_SERVER_IMAGE,
custom_tag=custom_tag,
target=build_target,
push=False,
)
logger.info(f"Image build output: {output}")
assert output.error is None, f"Image build failed: {output.error}"
if agent_server_image not in output.tags:
raise RuntimeError(
f"Built image tags {output.tags} do not include expected tag "
f"{agent_server_image}"
)

workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
forward_env=forward_env or [],
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
if not runtime_api_key:
raise ValueError(
"RUNTIME_API_KEY environment variable is not set for remote workspace"
)

agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
)
if not remote_image_exists(agent_server_image):
if not image_exists(agent_server_image):
raise RuntimeError(
f"Agent server image {agent_server_image} does not exist in container registry, "
"make sure to build, push it, and make it public accessible before using remote workspace."
)
logger.info(
f"Using remote workspace with image {agent_server_image} "
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
)
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
workspace = APIRemoteWorkspace(
Expand Down Expand Up @@ -402,8 +428,21 @@ def evaluate_instance(


def main() -> None:
prompt_dir = (Path(__file__).parent / "prompts").resolve()
choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
default_prompt_path = prompt_dir / "default.j2"
assert default_prompt_path.exists(), (
f"Default prompt {default_prompt_path} not found"
)

parser = get_parser()
add_prompt_path_argument(parser, __file__)
parser.add_argument(
"--prompt-path",
type=str,
default=str(default_prompt_path),
choices=choices,
help="Path to prompt template file",
)
parser.add_argument(
"--lang",
type=str,
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/swebench/build_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
run_docker_build_layer,
)
from benchmarks.utils.dataset import get_dataset
from benchmarks.utils.image_utils import remote_image_exists
from benchmarks.utils.image_utils import image_exists
from openhands.sdk import get_logger


Expand Down Expand Up @@ -94,7 +94,7 @@ def wrap_image(agent_image: str, push: bool = False) -> BuildOutput:
For pushes, verify the base tag exists in the registry. For local builds,
assume the tag is available locally or resolvable by Docker during buildx.
"""
if push and not remote_image_exists(agent_image):
if push and not image_exists(agent_image):
return BuildOutput(
base_image=agent_image,
tags=[],
Expand Down
Loading
Loading