Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/run-eval.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ on:
- swtbench
- commit0
- swebenchmultimodal
- terminalbench
sdk_ref:
description: SDK commit/ref to evaluate
required: true
Expand Down
6 changes: 6 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,4 +100,10 @@ When converting between OpenHands format and benchmark-specific formats:
- Handle missing/optional fields gracefully
- Log conversion warnings for debugging
- Validate output format before evaluation

# Terminal-Bench Notes
- Harbor's installable package is `harbor` (not `harbor-bench`).
- The Harbor dataset name used in CI is `terminal-bench@2.0`.
- For CI smoke tests, pass `--n-limit <count>` to `terminalbench-infer` so Harbor only runs the requested subset.

</BENCHMARK_SPECIFIC>
7 changes: 5 additions & 2 deletions benchmarks/terminalbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ Terminal-Bench evaluates how well AI agents can handle real-world, end-to-end ta
1. **Install Harbor**: Harbor is the official harness for running Terminal-Bench 2.0.

```bash
pip install harbor-bench
pip install harbor
# or
uv pip install harbor-bench
uv pip install harbor
```

2. **Docker**: Harbor requires Docker to be installed and running.
Expand All @@ -43,6 +43,9 @@ uv run terminalbench-infer .llm_config/claude.json --select tasks.txt
# Run with specific dataset version
uv run terminalbench-infer .llm_config/claude.json --dataset terminal-bench@2.0

# Limit the run to 5 tasks (useful for CI smoke tests)
uv run terminalbench-infer .llm_config/claude.json --n-limit 5

# Run with multiple workers
uv run terminalbench-infer .llm_config/claude.json --num-workers 4
```
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/terminalbench/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Default inference settings (only include values actually used by argparse)
INFER_DEFAULTS = {
"dataset": "terminal-bench-2",
"dataset": "terminal-bench@2.0",
"output_dir": "./evaluation_outputs",
"num_workers": 1,
}
Expand Down
23 changes: 17 additions & 6 deletions benchmarks/terminalbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
with the standard evaluation pipeline.

Usage:
uv run terminalbench-infer <llm_config_path> --dataset terminal-bench@head
uv run terminalbench-infer <llm_config_path> --dataset terminal-bench@2.0
"""

import argparse
Expand Down Expand Up @@ -51,15 +51,17 @@ def run_harbor_evaluation(
output_dir: str,
num_workers: int = 1,
task_ids: list[str] | None = None,
n_limit: int | None = None,
) -> Path:
"""Run harbor evaluation with openhands-sdk agent.

Args:
llm: LLM configuration for the agent.
dataset: Harbor dataset name (e.g., terminal-bench@head).
dataset: Harbor dataset name (e.g., terminal-bench@2.0).
output_dir: Directory to store output files.
num_workers: Number of parallel workers.
task_ids: Optional list of specific task IDs to run.
n_limit: Optional maximum number of dataset tasks to run.

Returns:
Path to the harbor output directory.
Expand Down Expand Up @@ -101,6 +103,9 @@ def run_harbor_evaluation(
for task_id in task_ids:
cmd.extend(["--task-name", task_id])

if n_limit is not None:
cmd.extend(["--n-tasks", str(n_limit)])

logger.info(f"Running harbor command: {' '.join(cmd)}")
logger.info(f"Output directory: {harbor_output_dir}")

Expand All @@ -122,7 +127,7 @@ def run_harbor_evaluation(

except FileNotFoundError:
raise RuntimeError(
"Harbor CLI not found. Please install harbor: pip install harbor-bench"
"Harbor CLI not found. Please install harbor: pip install harbor"
)

return harbor_output_dir
Expand Down Expand Up @@ -300,7 +305,7 @@ def main() -> None:
"--dataset",
type=str,
default=INFER_DEFAULTS["dataset"],
help="Harbor dataset name (e.g., terminal-bench@head, terminal-bench@2.0)",
help="Harbor dataset name (e.g., terminal-bench@2.0)",
)
parser.add_argument(
"--output-dir",
Expand All @@ -314,6 +319,11 @@ def main() -> None:
default=INFER_DEFAULTS["num_workers"],
help="Number of parallel workers",
)
parser.add_argument(
"--n-limit",
type=int,
help="Maximum number of dataset tasks to run after Harbor filtering",
)
parser.add_argument(
"--select",
type=str,
Expand Down Expand Up @@ -352,9 +362,9 @@ def main() -> None:
if not args.skip_harbor and not check_harbor_installed():
logger.error(
"Harbor CLI is not installed. Please install it:\n"
" pip install harbor-bench\n"
" pip install harbor\n"
" # or\n"
" uv pip install harbor-bench"
" uv pip install harbor"
)
sys.exit(1)

Expand Down Expand Up @@ -404,6 +414,7 @@ def main() -> None:
output_dir=structured_output_dir,
num_workers=args.num_workers,
task_ids=task_ids,
n_limit=args.n_limit,
)

# Convert harbor output to standard format
Expand Down
70 changes: 69 additions & 1 deletion tests/test_terminalbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@

import pytest

from benchmarks.terminalbench.config import INFER_DEFAULTS
from benchmarks.terminalbench.eval_infer import process_terminalbench_results
from benchmarks.terminalbench.run_infer import convert_harbor_to_eval_output
from benchmarks.terminalbench.run_infer import (
convert_harbor_to_eval_output,
run_harbor_evaluation,
)
from openhands.sdk import LLM


class TestProcessTerminalbenchResults:
Expand Down Expand Up @@ -206,6 +211,69 @@ def test_report_file_written(self, tmp_path: Path) -> None:
assert "resolved_ids" in report


class TestRunHarborEvaluation:
    """Tests for building Harbor invocation arguments."""

    def test_default_dataset_matches_harbor_registry(self) -> None:
        """Test that the default dataset name matches Harbor's published registry."""
        # Guards against regressions back to the old "terminal-bench-2" name,
        # which Harbor's registry does not recognize.
        assert INFER_DEFAULTS["dataset"] == "terminal-bench@2.0"

    def test_run_harbor_evaluation_passes_filters_and_limits(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Test Harbor command includes task filters and n-limit for CI runs."""
        # Holds the argv list that run_harbor_evaluation hands to subprocess.run,
        # so the assertions below can inspect it after the call.
        captured: dict[str, list[str]] = {}

        def fake_run(cmd: list[str], capture_output: bool, text: bool):
            # Record the command and return a minimal object that looks like a
            # successful subprocess.CompletedProcess (returncode/stdout/stderr),
            # built ad hoc with type() so no real process is spawned.
            captured["cmd"] = cmd
            return type(
                "Completed",
                (),
                {"returncode": 0, "stdout": "ok", "stderr": ""},
            )()

        # Patch subprocess.run as seen from the module under test, so Harbor
        # is never actually invoked.
        monkeypatch.setattr(
            "benchmarks.terminalbench.run_infer.subprocess.run", fake_run
        )

        harbor_output_dir = run_harbor_evaluation(
            llm=LLM(
                model="litellm_proxy/test-model",
                api_key="test-key",
                base_url="https://proxy.example.com",
            ),
            dataset=INFER_DEFAULTS["dataset"],
            output_dir=str(tmp_path),
            num_workers=3,
            task_ids=["task-a", "task-b"],
            n_limit=5,
        )

        # run_harbor_evaluation is expected to nest its artifacts under
        # <output_dir>/harbor_output and return that path.
        expected_output_dir = tmp_path / "harbor_output"
        assert harbor_output_dir == expected_output_dir

        cmd = captured["cmd"]
        # The fixed prefix of the Harbor CLI invocation: dataset, agent, model.
        assert cmd[:8] == [
            "harbor",
            "run",
            "-d",
            "terminal-bench@2.0",
            "-a",
            "openhands-sdk",
            "-m",
            "litellm_proxy/test-model",
        ]
        assert "--jobs-dir" in cmd
        assert str(expected_output_dir.resolve()) in cmd
        # Each requested task id becomes its own --task-name flag.
        assert cmd.count("--task-name") == 2
        assert "task-a" in cmd
        assert "task-b" in cmd
        # num_workers maps to --n-concurrent; n_limit maps to --n-tasks.
        assert cmd[cmd.index("--n-concurrent") + 1] == "3"
        assert cmd[cmd.index("--n-tasks") + 1] == "5"
        # LLM credentials are forwarded as KEY=VALUE environment arguments.
        assert "LLM_API_KEY=test-key" in cmd
        assert "LLM_BASE_URL=https://proxy.example.com" in cmd

class TestConvertHarborToEvalOutput:
"""Tests for convert_harbor_to_eval_output function."""

Expand Down
Loading