diff --git a/benchmarks/commit0/config.py b/benchmarks/commit0/config.py
index a41a044f4..1d9fb0420 100644
--- a/benchmarks/commit0/config.py
+++ b/benchmarks/commit0/config.py
@@ -14,13 +14,13 @@
 }
 
 # Inference defaults (used by run_infer.py)
-# Note: commit0 uses max_attempts=1 and max_retries=1 (different from default of 3)
+# Note: commit0 uses n_critic_runs=1 (default is 3) and max_retries=3
 INFER_DEFAULTS = {
     "dataset": "wentingzhao/commit0_combined",
     "split": "test",
     "repo_split": "lite",
     "num_workers": 16,
-    "max_attempts": 1,
+    "n_critic_runs": 1,
     "max_retries": 3,
     **CONDENSER_DEFAULTS,
 }
diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py
index 0ab228b1d..d3865e1e4 100644
--- a/benchmarks/commit0/run_infer.py
+++ b/benchmarks/commit0/run_infer.py
@@ -589,9 +589,9 @@ def main() -> None:
     parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm = load_llm_config(args.llm_config_path)
     logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
@@ -618,7 +618,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=None,
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=create_critic(args),
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/gaia/README.md b/benchmarks/gaia/README.md
index 4b1da6a02..ba46e9347 100644
--- a/benchmarks/gaia/README.md
+++ b/benchmarks/gaia/README.md
@@ -62,7 +62,7 @@ uv run python -m benchmarks.gaia.get_score --file outputs/gaia/output.jsonl
 - `--output-dir`: Base directory for outputs (default: `outputs`)
 - `--n-limit`: Limit number of instances to evaluate (default: 0 = all)
 - `--num-workers`: Number of parallel workers (default: 1)
-- `--max-attempts`: Maximum attempts for iterative mode (default: 1)
+- `--n-critic-runs`: Number of critic evaluation runs for iterative mode (default: 1)
 - `--note`: Optional note to add to output directory name
diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py
index 942f65af9..7bc7523e6 100644
--- a/benchmarks/gaia/run_infer.py
+++ b/benchmarks/gaia/run_infer.py
@@ -574,8 +574,8 @@ def main() -> None:
     logger.info(f"Using critic: {type(critic).__name__}")
 
     # Validate arguments
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm = load_llm_config(args.llm_config_path)
     logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
@@ -601,7 +601,7 @@ def main() -> None:
         eval_output_dir=structured_output_dir,
         details={"level": args.level},
         eval_limit=args.n_limit,
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/multiswebench/run_infer.py b/benchmarks/multiswebench/run_infer.py
index c9b9361a9..481dfa000 100644
--- a/benchmarks/multiswebench/run_infer.py
+++ b/benchmarks/multiswebench/run_infer.py
@@ -420,9 +420,9 @@ def main() -> None:
     )
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm = load_llm_config(args.llm_config_path)
     logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
@@ -460,7 +460,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/openagentsafety/run_infer.py b/benchmarks/openagentsafety/run_infer.py
index 3c55b5310..30da0db89 100644
--- a/benchmarks/openagentsafety/run_infer.py
+++ b/benchmarks/openagentsafety/run_infer.py
@@ -646,8 +646,8 @@ def main() -> None:
     args = parser.parse_args()
 
     # Validate args
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     # Load LLM config
     llm = load_llm_config(args.llm_config_path)
@@ -681,7 +681,7 @@ def main() -> None:
             "platform": "linux/amd64",
         },
         eval_limit=args.n_limit,
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py
index de77d69ca..746976d87 100644
--- a/benchmarks/swebench/run_infer.py
+++ b/benchmarks/swebench/run_infer.py
@@ -360,9 +360,9 @@ def main() -> None:
     parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm = load_llm_config(args.llm_config_path)
     logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
@@ -400,7 +400,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/swebenchmultilingual/run_infer.py b/benchmarks/swebenchmultilingual/run_infer.py
index 227db99ce..b037bd08a 100644
--- a/benchmarks/swebenchmultilingual/run_infer.py
+++ b/benchmarks/swebenchmultilingual/run_infer.py
@@ -352,9 +352,9 @@ def main() -> None:
     parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm = load_llm_config(args.llm_config_path)
     logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
@@ -386,7 +386,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py
index 1bb4653e5..5a6a5bfad 100644
--- a/benchmarks/swebenchmultimodal/run_infer.py
+++ b/benchmarks/swebenchmultimodal/run_infer.py
@@ -414,9 +414,9 @@ def main() -> None:
     parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm = load_llm_config(args.llm_config_path)
     logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
@@ -453,7 +453,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py
index bb3efd908..a5ab3a001 100644
--- a/benchmarks/swefficiency/run_infer.py
+++ b/benchmarks/swefficiency/run_infer.py
@@ -412,9 +412,9 @@ def main() -> None:
     parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm_config_path = args.llm_config_path
     if not os.path.isfile(llm_config_path):
@@ -450,7 +450,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/swtbench/README.md b/benchmarks/swtbench/README.md
index 2637340ea..6a84ee452 100644
--- a/benchmarks/swtbench/README.md
+++ b/benchmarks/swtbench/README.md
@@ -8,7 +8,7 @@ Before running any benchmarks, you need to set up the environment see main READM
 ### 1. Run SWT-Bench Evaluation
 ```bash
 # Run evaluation with your configured LLM
-uv run swtbench-infer .llm_config/sonnet-4.json --max-attempts 3 --n-limit 500 --max-iterations 500 --critic finish_with_patch
+uv run swtbench-infer .llm_config/sonnet-4.json --n-critic-runs 3 --n-limit 500 --max-iterations 500 --critic finish_with_patch
 ```
 
 ### 2. Selecting Specific Instances
@@ -26,7 +26,7 @@ requests__requests-5555
 
 2. Run evaluation with the selection file:
 ```bash
-uv run swtbench-infer .llm_config/sonnet-4.json --max-attempts 3 --select instances.txt --n-limit 500 --max-iterations 500 --critic finish_with_patch
+uv run swtbench-infer .llm_config/sonnet-4.json --n-critic-runs 3 --select instances.txt --n-limit 500 --max-iterations 500 --critic finish_with_patch
 ```
 
 This will only evaluate the instances listed in the file.
diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py
index c651056d3..9f4538d4c 100644
--- a/benchmarks/swtbench/run_infer.py
+++ b/benchmarks/swtbench/run_infer.py
@@ -352,9 +352,9 @@ def main() -> None:
     parser.set_defaults(**INFER_DEFAULTS)
     args = parser.parse_args()
 
-    # Validate max_attempts
-    if args.max_attempts < 1:
-        raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
+    # Validate n_critic_runs
+    if args.n_critic_runs < 1:
+        raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")
 
     llm = load_llm_config(args.llm_config_path)
     logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
@@ -389,7 +389,7 @@ def main() -> None:
         prompt_path=args.prompt_path,
         eval_limit=args.n_limit,
         env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
-        max_attempts=args.max_attempts,
+        n_critic_runs=args.n_critic_runs,
         critic=critic,
         selected_instances_file=args.select,
         max_retries=args.max_retries,
diff --git a/benchmarks/utils/args_parser.py b/benchmarks/utils/args_parser.py
index 8c1eefad9..3399980a7 100644
--- a/benchmarks/utils/args_parser.py
+++ b/benchmarks/utils/args_parser.py
@@ -65,10 +65,10 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
         help="Limit number of instances to evaluate (0 = no limit)",
     )
     parser.add_argument(
-        "--max-attempts",
+        "--n-critic-runs",
         type=int,
         default=3,
-        help="Maximum number of attempts for iterative mode (default: 3, min: 1)",
+        help="Number of critic evaluation runs for iterative mode (default: 3, min: 1)",
     )
 
     # Add critic arguments (no default)
diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py
index 4e27ba4ee..21ad5929a 100644
--- a/benchmarks/utils/evaluation.py
+++ b/benchmarks/utils/evaluation.py
@@ -209,13 +209,13 @@ def run(
         """
         Run evaluation with iterative mode support.
 
-        If max_attempts > 1, will retry failed instances multiple times.
-        If max_attempts == 1, will run once without retries.
+        If n_critic_runs > 1, will retry failed instances multiple times.
+        If n_critic_runs == 1, will run once without retries.
""" logger.info("Starting evaluation (process pool)") logger.info("metadata=%s", self.metadata) logger.info("workers=%d", self.num_workers) - logger.info("max_attempts=%d", self.metadata.max_attempts) + logger.info("n_critic_runs=%d", self.metadata.n_critic_runs) # Use iterative mode for all cases return self._run_iterative_mode(on_result=on_result) @@ -332,9 +332,9 @@ def _run_iterative_mode( critic = self.metadata.critic all_outputs: List[EvalOutput] = [] - for attempt in range(1, self.metadata.max_attempts + 1): + for attempt in range(1, self.metadata.n_critic_runs + 1): self.current_attempt = attempt - logger.info(f"Starting attempt {attempt}/{self.metadata.max_attempts}") + logger.info(f"Starting attempt {attempt}/{self.metadata.n_critic_runs}") instances_to_process = self._get_instances_for_attempt( attempt, all_instances, critic @@ -520,14 +520,14 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: logger.info("Aggregating results from all attempts") aggregate_results( output_dir=self.metadata.eval_output_dir, - max_attempts=self.metadata.max_attempts, + n_critic_runs=self.metadata.n_critic_runs, critic=self.metadata.critic, final_output_file="output.jsonl", ) logger.info( f"Evaluation complete: {total_instances} total instances, " - f"{self.metadata.max_attempts} max attempts" + f"{self.metadata.n_critic_runs} critic runs" ) return all_outputs diff --git a/benchmarks/utils/iterative.py b/benchmarks/utils/iterative.py index 02560e76e..c72810ff6 100644 --- a/benchmarks/utils/iterative.py +++ b/benchmarks/utils/iterative.py @@ -81,7 +81,7 @@ def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstan def aggregate_results( output_dir: str, - max_attempts: int, + n_critic_runs: int, critic: "CriticBase", final_output_file: str = "output.jsonl", ) -> None: @@ -93,17 +93,17 @@ def aggregate_results( Args: output_dir: Directory containing attempt files - max_attempts: Maximum number of attempts + n_critic_runs: Maximum number of attempts critic: Critic instance to use for evaluation final_output_file: Name of the final output file """ - logger.info(f"Aggregating results from {max_attempts} attempts") + logger.info(f"Aggregating results from {n_critic_runs} attempts") # Dictionary to store the best result for each instance best_results: dict[EvalInstanceID, EvalOutput] = {} # Work backwards from the last attempt to the first - for attempt in range(max_attempts, 0, -1): + for attempt in range(n_critic_runs, 0, -1): attempt_file = os.path.join( output_dir, f"output.critic_attempt_{attempt}.jsonl" ) diff --git a/benchmarks/utils/models.py b/benchmarks/utils/models.py index fa85f8bbc..831ad4861 100644 --- a/benchmarks/utils/models.py +++ b/benchmarks/utils/models.py @@ -31,8 +31,10 @@ class EvalMetadata(BaseModel): eval_limit: int = Field( default=0, description="Number of instances to evaluate, 0 means all" ) - max_attempts: int = Field( - default=1, ge=1, description="Maximum number of attempts for iterative mode" + n_critic_runs: int = Field( + default=1, + ge=1, + description="Number of critic evaluation runs for iterative mode", ) critic: CriticBase = Field( description=( diff --git a/tests/test_aggregate_results.py b/tests/test_aggregate_results.py index 4ace34627..8a5d096b3 100644 --- a/tests/test_aggregate_results.py +++ b/tests/test_aggregate_results.py @@ -98,7 +98,7 @@ def test_prefers_non_error_over_error_when_last_attempt_errors( f.write(output_3.model_dump_json() + "\n") # Run aggregation - aggregate_results(temp_output_dir, 
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify output.jsonl contains the instance (not dropped)
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -135,7 +135,7 @@ def test_prefers_critic_success_over_non_error_critic_fail(self, temp_output_dir
             f.write(output.model_dump_json() + "\n")
 
         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify output.jsonl contains the instance
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -185,7 +185,7 @@ def test_multiple_instances_with_mixed_results(self, temp_output_dir):
             f.write(create_output("instance_3", error=None).model_dump_json() + "\n")
 
         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify all instances appear in output.jsonl
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -214,7 +214,7 @@ def test_all_attempts_error_instance_dropped(self, temp_output_dir):
             f.write(output.model_dump_json() + "\n")
 
         # Run aggregation
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify output.jsonl is empty (instance dropped because all attempts errored)
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
@@ -228,7 +228,7 @@ def test_empty_attempts(self, temp_output_dir):
         critic = PassCritic()
 
         # Run aggregation with no attempt files
-        aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
+        aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)
 
         # Verify output.jsonl is created but empty
         final_output_file = os.path.join(temp_output_dir, "output.jsonl")
diff --git a/tests/test_iterative_resume.py b/tests/test_iterative_resume.py
index 157638f76..108f2dcb4 100644
--- a/tests/test_iterative_resume.py
+++ b/tests/test_iterative_resume.py
@@ -58,8 +58,8 @@ def test_iterative_resume_with_expanded_n_limit():
     Test that iterative evaluation correctly handles resume when n-limit is expanded.
 
     Scenario:
-    1. First run: Process 50 instances with max_attempts=3
-    2. Second run: Expand to 200 instances with max_attempts=3
+    1. First run: Process 50 instances with n_critic_runs=3
+    2. Second run: Expand to 200 instances with n_critic_runs=3
 
     Expected behavior:
     - The 150 new instances (51-200) should be processed starting from attempt 1
@@ -109,7 +109,7 @@ def test_iterative_resume_with_expanded_n_limit():
             eval_output_dir=tmpdir,
             details={},
             eval_limit=200,
-            max_attempts=3,
+            n_critic_runs=3,
             max_retries=0,
             critic=PassCritic(),
         )
@@ -191,7 +191,7 @@ def test_iterative_resume_with_same_n_limit():
             eval_output_dir=tmpdir,
             details={},
             eval_limit=50,
-            max_attempts=3,
+            n_critic_runs=3,
             max_retries=0,
             critic=PassCritic(),
         )
@@ -280,7 +280,7 @@ def evaluate(self, events, git_patch=None):
             eval_output_dir=tmpdir,
             details={},
             eval_limit=4,
-            max_attempts=3,
+            n_critic_runs=3,
             max_retries=0,
             critic=critic,
         )
@@ -392,7 +392,7 @@ def evaluate(self, events, git_patch=None):
             eval_output_dir=tmpdir,
             details={},
             eval_limit=4,
-            max_attempts=3,
+            n_critic_runs=3,
             max_retries=0,
             critic=critic,
         )
diff --git a/tests/test_keyboard_interrupt.py b/tests/test_keyboard_interrupt.py
index 3c669e548..e8940b0cc 100644
--- a/tests/test_keyboard_interrupt.py
+++ b/tests/test_keyboard_interrupt.py
@@ -74,7 +74,7 @@ def evaluate_instance(
         eval_output_dir="{tmpdir}",
         details={{}},
         eval_limit=0,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
diff --git a/tests/test_workspace_cleanup.py b/tests/test_workspace_cleanup.py
index c5d1f32f2..a35cfa304 100644
--- a/tests/test_workspace_cleanup.py
+++ b/tests/test_workspace_cleanup.py
@@ -38,7 +38,7 @@ def test_workspace_cleanup_called_on_success():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -89,7 +89,7 @@ def test_workspace_cleanup_called_on_failure():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -149,7 +149,7 @@ def test_workspace_cleanup_handles_cleanup_exception():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -205,7 +205,7 @@ def create_mock_workspace():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=2,
         critic=PassCritic(),
     )
@@ -284,7 +284,7 @@ def test_datapoint_trace_id_linked_in_worker():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -359,7 +359,7 @@ def test_datapoint_trace_id_not_linked_without_datapoint():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )
@@ -426,7 +426,7 @@ def test_update_datapoint_trace_id_failure_does_not_break_eval():
         eval_output_dir="/tmp/test",
         details={},
         eval_limit=1,
-        max_attempts=1,
+        n_critic_runs=1,
         max_retries=0,
         critic=PassCritic(),
     )