Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/run-eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ on:
- swtbench
- commit0
- swebenchmultimodal
- terminalbench
sdk_ref:
description: SDK commit/ref to evaluate
required: true
Expand Down
6 changes: 6 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,10 @@ When converting between OpenHands format and benchmark-specific formats:
- Handle missing/optional fields gracefully
- Log conversion warnings for debugging
- Validate output format before evaluation

# Terminal-Bench Notes
- Harbor's installable package is `harbor` (not `harbor-bench`).
- The Harbor dataset name used in CI is `terminal-bench@2.0`.
- For CI smoke tests, pass `--n-limit <count>` to `terminalbench-infer` so Harbor only runs the requested subset.

</BENCHMARK_SPECIFIC>
7 changes: 5 additions & 2 deletions benchmarks/terminalbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ Terminal-Bench evaluates how well AI agents can handle real-world, end-to-end ta
1. **Install Harbor**: Harbor is the official harness for running Terminal-Bench 2.0.

```bash
pip install harbor-bench
pip install harbor
# or
uv pip install harbor-bench
uv pip install harbor
```

2. **Docker**: Harbor requires Docker to be installed and running.
Expand All @@ -43,6 +43,9 @@ uv run terminalbench-infer .llm_config/claude.json --select tasks.txt
# Run with specific dataset version
uv run terminalbench-infer .llm_config/claude.json --dataset terminal-bench@2.0

# Limit the run to 5 tasks (useful for CI smoke tests)
uv run terminalbench-infer .llm_config/claude.json --n-limit 5

# Run with multiple workers
uv run terminalbench-infer .llm_config/claude.json --num-workers 4
```
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/terminalbench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Default inference settings (only include values actually used by argparse)
INFER_DEFAULTS = {
"dataset": "terminal-bench-2",
"dataset": "terminal-bench@2.0",
"output_dir": "./evaluation_outputs",
"num_workers": 1,
}
Expand Down
23 changes: 17 additions & 6 deletions benchmarks/terminalbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
with the standard evaluation pipeline.

Usage:
uv run terminalbench-infer <llm_config_path> --dataset terminal-bench@head
uv run terminalbench-infer <llm_config_path> --dataset terminal-bench@2.0
"""

import argparse
Expand Down Expand Up @@ -51,15 +51,17 @@ def run_harbor_evaluation(
output_dir: str,
num_workers: int = 1,
task_ids: list[str] | None = None,
n_limit: int | None = None,
) -> Path:
"""Run harbor evaluation with openhands-sdk agent.

Args:
llm: LLM configuration for the agent.
dataset: Harbor dataset name (e.g., terminal-bench@head).
dataset: Harbor dataset name (e.g., terminal-bench@2.0).
output_dir: Directory to store output files.
num_workers: Number of parallel workers.
task_ids: Optional list of specific task IDs to run.
n_limit: Optional maximum number of dataset tasks to run.

Returns:
Path to the harbor output directory.
Expand Down Expand Up @@ -101,6 +103,9 @@ def run_harbor_evaluation(
for task_id in task_ids:
cmd.extend(["--task-name", task_id])

if n_limit is not None:
cmd.extend(["--n-tasks", str(n_limit)])

logger.info(f"Running harbor command: {' '.join(cmd)}")
logger.info(f"Output directory: {harbor_output_dir}")

Expand All @@ -122,7 +127,7 @@ def run_harbor_evaluation(

except FileNotFoundError:
raise RuntimeError(
"Harbor CLI not found. Please install harbor: pip install harbor-bench"
"Harbor CLI not found. Please install harbor: pip install harbor"
)

return harbor_output_dir
Expand Down Expand Up @@ -300,7 +305,7 @@ def main() -> None:
"--dataset",
type=str,
default=INFER_DEFAULTS["dataset"],
help="Harbor dataset name (e.g., terminal-bench@head, terminal-bench@2.0)",
help="Harbor dataset name (e.g., terminal-bench@2.0)",
)
parser.add_argument(
"--output-dir",
Expand All @@ -314,6 +319,11 @@ def main() -> None:
default=INFER_DEFAULTS["num_workers"],
help="Number of parallel workers",
)
parser.add_argument(
"--n-limit",
type=int,
help="Maximum number of dataset tasks to run after Harbor filtering",
)
parser.add_argument(
"--select",
type=str,
Expand Down Expand Up @@ -352,9 +362,9 @@ def main() -> None:
if not args.skip_harbor and not check_harbor_installed():
logger.error(
"Harbor CLI is not installed. Please install it:\n"
" pip install harbor-bench\n"
" pip install harbor\n"
" # or\n"
" uv pip install harbor-bench"
" uv pip install harbor"
)
sys.exit(1)

Expand Down Expand Up @@ -404,6 +414,7 @@ def main() -> None:
output_dir=structured_output_dir,
num_workers=args.num_workers,
task_ids=task_ids,
n_limit=args.n_limit,
)

# Convert harbor output to standard format
Expand Down
70 changes: 69 additions & 1 deletion tests/test_terminalbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@

import pytest

from benchmarks.terminalbench.config import INFER_DEFAULTS
from benchmarks.terminalbench.eval_infer import process_terminalbench_results
from benchmarks.terminalbench.run_infer import convert_harbor_to_eval_output
from benchmarks.terminalbench.run_infer import (
convert_harbor_to_eval_output,
run_harbor_evaluation,
)
from openhands.sdk import LLM


class TestProcessTerminalbenchResults:
Expand Down Expand Up @@ -206,6 +211,69 @@ def test_report_file_written(self, tmp_path: Path) -> None:
assert "resolved_ids" in report


class TestRunHarborEvaluation:
    """Tests for building Harbor invocation arguments."""

    def test_default_dataset_matches_harbor_registry(self) -> None:
        """Test that the default dataset name matches Harbor's published registry."""
        # Guards against regressions back to the old "terminal-bench-2" name,
        # which Harbor's registry does not recognize.
        assert INFER_DEFAULTS["dataset"] == "terminal-bench@2.0"

    def test_run_harbor_evaluation_passes_filters_and_limits(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Test Harbor command includes task filters and n-limit for CI runs."""
        # Holds the argv list that run_harbor_evaluation hands to subprocess.run,
        # so the assertions below can inspect it after the call.
        captured: dict[str, list[str]] = {}

        def fake_run(cmd: list[str], capture_output: bool, text: bool):
            # Record the command and return a minimal object that looks like a
            # successful subprocess.CompletedProcess (returncode/stdout/stderr),
            # built ad hoc with type() so no real process is spawned.
            captured["cmd"] = cmd
            return type(
                "Completed",
                (),
                {"returncode": 0, "stdout": "ok", "stderr": ""},
            )()

        # Patch subprocess.run as seen from the module under test, so Harbor
        # is never actually invoked.
        monkeypatch.setattr(
            "benchmarks.terminalbench.run_infer.subprocess.run", fake_run
        )

        harbor_output_dir = run_harbor_evaluation(
            llm=LLM(
                model="litellm_proxy/test-model",
                api_key="test-key",
                base_url="https://proxy.example.com",
            ),
            dataset=INFER_DEFAULTS["dataset"],
            output_dir=str(tmp_path),
            num_workers=3,
            task_ids=["task-a", "task-b"],
            n_limit=5,
        )

        # run_harbor_evaluation is expected to nest its artifacts under
        # <output_dir>/harbor_output and return that path.
        expected_output_dir = tmp_path / "harbor_output"
        assert harbor_output_dir == expected_output_dir

        cmd = captured["cmd"]
        # The fixed prefix of the Harbor CLI invocation: dataset, agent, model.
        assert cmd[:8] == [
            "harbor",
            "run",
            "-d",
            "terminal-bench@2.0",
            "-a",
            "openhands-sdk",
            "-m",
            "litellm_proxy/test-model",
        ]
        assert "--jobs-dir" in cmd
        assert str(expected_output_dir.resolve()) in cmd
        # Each requested task id becomes its own --task-name flag.
        assert cmd.count("--task-name") == 2
        assert "task-a" in cmd
        assert "task-b" in cmd
        # num_workers maps to --n-concurrent; n_limit maps to --n-tasks.
        assert cmd[cmd.index("--n-concurrent") + 1] == "3"
        assert cmd[cmd.index("--n-tasks") + 1] == "5"
        # LLM credentials are forwarded as KEY=VALUE environment arguments.
        assert "LLM_API_KEY=test-key" in cmd
        assert "LLM_BASE_URL=https://proxy.example.com" in cmd

class TestConvertHarborToEvalOutput:
"""Tests for convert_harbor_to_eval_output function."""

Expand Down
Loading