Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/commit0/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@
}

# Inference defaults (used by run_infer.py)
# Note: commit0 uses max_attempts=1 and max_retries=1 (different from default of 3)
# Note: commit0 uses n_critic_runs=1 (different from default of 3)
INFER_DEFAULTS = {
"dataset": "wentingzhao/commit0_combined",
"split": "test",
"repo_split": "lite",
"num_workers": 16,
"max_attempts": 1,
"n_critic_runs": 1,
"max_retries": 3,
**CONDENSER_DEFAULTS,
}
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/commit0/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,9 +589,9 @@ def main() -> None:
parser.set_defaults(**INFER_DEFAULTS)
args = parser.parse_args()

# Validate max_attempts
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
# Validate n_critic_runs
if args.n_critic_runs < 1:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
Expand All @@ -618,7 +618,7 @@ def main() -> None:
prompt_path=args.prompt_path,
eval_limit=args.n_limit,
env_setup_commands=None,
max_attempts=args.max_attempts,
n_critic_runs=args.n_critic_runs,
critic=create_critic(args),
selected_instances_file=args.select,
max_retries=args.max_retries,
Expand Down
2 changes: 1 addition & 1 deletion benchmarks/gaia/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ uv run python -m benchmarks.gaia.get_score --file outputs/gaia/output.jsonl
- `--output-dir`: Base directory for outputs (default: `outputs`)
- `--n-limit`: Limit number of instances to evaluate (default: 0 = all)
- `--num-workers`: Number of parallel workers (default: 1)
- `--max-attempts`: Maximum attempts for iterative mode (default: 1)
- `--n-critic-runs`: Number of critic evaluation runs for iterative mode (default: 1)
- `--note`: Optional note to add to output directory name


Expand Down
6 changes: 3 additions & 3 deletions benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,8 +574,8 @@ def main() -> None:
logger.info(f"Using critic: {type(critic).__name__}")

# Validate arguments
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
if args.n_critic_runs < 1:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
Expand All @@ -601,7 +601,7 @@ def main() -> None:
eval_output_dir=structured_output_dir,
details={"level": args.level},
eval_limit=args.n_limit,
max_attempts=args.max_attempts,
n_critic_runs=args.n_critic_runs,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/multiswebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,9 +420,9 @@ def main() -> None:
)
args = parser.parse_args()

# Validate max_attempts
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
# Validate n_critic_runs
if args.n_critic_runs < 1:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
Expand Down Expand Up @@ -460,7 +460,7 @@ def main() -> None:
prompt_path=args.prompt_path,
eval_limit=args.n_limit,
env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
max_attempts=args.max_attempts,
n_critic_runs=args.n_critic_runs,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/openagentsafety/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -646,8 +646,8 @@ def main() -> None:
args = parser.parse_args()

# Validate args
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
if args.n_critic_runs < 1:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

# Load LLM config
llm = load_llm_config(args.llm_config_path)
Expand Down Expand Up @@ -681,7 +681,7 @@ def main() -> None:
"platform": "linux/amd64",
},
eval_limit=args.n_limit,
max_attempts=args.max_attempts,
n_critic_runs=args.n_critic_runs,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/swebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,9 +360,9 @@ def main() -> None:
parser.set_defaults(**INFER_DEFAULTS)
args = parser.parse_args()

# Validate max_attempts
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
# Validate n_critic_runs
if args.n_critic_runs < 1:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
Expand Down Expand Up @@ -400,7 +400,7 @@ def main() -> None:
prompt_path=args.prompt_path,
eval_limit=args.n_limit,
env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
max_attempts=args.max_attempts,
n_critic_runs=args.n_critic_runs,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/swebenchmultilingual/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,9 +352,9 @@ def main() -> None:
parser.set_defaults(**INFER_DEFAULTS)
args = parser.parse_args()

# Validate max_attempts
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
# Validate n_critic_runs
if args.n_critic_runs < 1:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
Expand Down Expand Up @@ -386,7 +386,7 @@ def main() -> None:
prompt_path=args.prompt_path,
eval_limit=args.n_limit,
env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
max_attempts=args.max_attempts,
n_critic_runs=args.n_critic_runs,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/swebenchmultimodal/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,9 +414,9 @@ def main() -> None:
parser.set_defaults(**INFER_DEFAULTS)
args = parser.parse_args()

# Validate max_attempts
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
# Validate n_critic_runs
if args.n_critic_runs < 1:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
Expand Down Expand Up @@ -453,7 +453,7 @@ def main() -> None:
prompt_path=args.prompt_path,
eval_limit=args.n_limit,
env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
max_attempts=args.max_attempts,
n_critic_runs=args.n_critic_runs,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/swefficiency/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,9 +412,9 @@ def main() -> None:
parser.set_defaults(**INFER_DEFAULTS)
args = parser.parse_args()

# Validate max_attempts
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
# Validate n_critic_runs
if args.n_critic_runs < 1:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm_config_path = args.llm_config_path
if not os.path.isfile(llm_config_path):
Expand Down Expand Up @@ -450,7 +450,7 @@ def main() -> None:
prompt_path=args.prompt_path,
eval_limit=args.n_limit,
env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
max_attempts=args.max_attempts,
n_critic_runs=args.n_critic_runs,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/swtbench/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ Before running any benchmarks, you need to set up the environment see main READM
### 1. Run SWT-Bench Evaluation
```bash
# Run evaluation with your configured LLM
uv run swtbench-infer .llm_config/sonnet-4.json --max-attempts 3 --n-limit 500 --max-iterations 500 --critic finish_with_patch
uv run swtbench-infer .llm_config/sonnet-4.json --n-critic-runs 3 --n-limit 500 --max-iterations 500 --critic finish_with_patch
```

### 2. Selecting Specific Instances
Expand All @@ -26,7 +26,7 @@ requests__requests-5555

2. Run evaluation with the selection file:
```bash
uv run swtbench-infer .llm_config/sonnet-4.json --max-attempts 3 --select instances.txt --n-limit 500 --max-iterations 500 --critic finish_with_patch
uv run swtbench-infer .llm_config/sonnet-4.json --n-critic-runs 3 --select instances.txt --n-limit 500 --max-iterations 500 --critic finish_with_patch
```

This will only evaluate the instances listed in the file.
Expand Down
8 changes: 4 additions & 4 deletions benchmarks/swtbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,9 +352,9 @@ def main() -> None:
parser.set_defaults(**INFER_DEFAULTS)
args = parser.parse_args()

# Validate max_attempts
if args.max_attempts < 1:
raise ValueError(f"max_attempts must be >= 1, got {args.max_attempts}")
# Validate n_critic_runs
if args.n_critic_runs < 1:
raise ValueError(f"n_critic_runs must be >= 1, got {args.n_critic_runs}")

llm = load_llm_config(args.llm_config_path)
logger.info("Using LLM config: %s", llm.model_dump_json(indent=2))
Expand Down Expand Up @@ -389,7 +389,7 @@ def main() -> None:
prompt_path=args.prompt_path,
eval_limit=args.n_limit,
env_setup_commands=["export PIP_CACHE_DIR=~/.cache/pip"],
max_attempts=args.max_attempts,
n_critic_runs=args.n_critic_runs,
critic=critic,
selected_instances_file=args.select,
max_retries=args.max_retries,
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/utils/args_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,10 @@ def get_parser(add_llm_config: bool = True) -> argparse.ArgumentParser:
help="Limit number of instances to evaluate (0 = no limit)",
)
parser.add_argument(
"--max-attempts",
"--n-critic-runs",
type=int,
default=3,
help="Maximum number of attempts for iterative mode (default: 3, min: 1)",
help="Number of critic evaluation runs for iterative mode (default: 3, min: 1)",
)

# Add critic arguments (no default)
Expand Down
14 changes: 7 additions & 7 deletions benchmarks/utils/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,13 +209,13 @@ def run(
"""
Run evaluation with iterative mode support.

If max_attempts > 1, will retry failed instances multiple times.
If max_attempts == 1, will run once without retries.
If n_critic_runs > 1, will retry failed instances multiple times.
If n_critic_runs == 1, will run once without retries.
"""
logger.info("Starting evaluation (process pool)")
logger.info("metadata=%s", self.metadata)
logger.info("workers=%d", self.num_workers)
logger.info("max_attempts=%d", self.metadata.max_attempts)
logger.info("n_critic_runs=%d", self.metadata.n_critic_runs)

# Use iterative mode for all cases
return self._run_iterative_mode(on_result=on_result)
Expand Down Expand Up @@ -332,9 +332,9 @@ def _run_iterative_mode(
critic = self.metadata.critic
all_outputs: List[EvalOutput] = []

for attempt in range(1, self.metadata.max_attempts + 1):
for attempt in range(1, self.metadata.n_critic_runs + 1):
self.current_attempt = attempt
logger.info(f"Starting attempt {attempt}/{self.metadata.max_attempts}")
logger.info(f"Starting attempt {attempt}/{self.metadata.n_critic_runs}")

instances_to_process = self._get_instances_for_attempt(
attempt, all_instances, critic
Expand Down Expand Up @@ -520,14 +520,14 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None:
logger.info("Aggregating results from all attempts")
aggregate_results(
output_dir=self.metadata.eval_output_dir,
max_attempts=self.metadata.max_attempts,
n_critic_runs=self.metadata.n_critic_runs,
critic=self.metadata.critic,
final_output_file="output.jsonl",
)

logger.info(
f"Evaluation complete: {total_instances} total instances, "
f"{self.metadata.max_attempts} max attempts"
f"{self.metadata.n_critic_runs} critic runs"
)
return all_outputs

Expand Down
8 changes: 4 additions & 4 deletions benchmarks/utils/iterative.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def get_failed_instances(output_file: str, critic: CriticBase) -> Set[EvalInstan

def aggregate_results(
output_dir: str,
max_attempts: int,
n_critic_runs: int,
critic: "CriticBase",
final_output_file: str = "output.jsonl",
) -> None:
Expand All @@ -93,17 +93,17 @@ def aggregate_results(

Args:
output_dir: Directory containing attempt files
max_attempts: Maximum number of attempts
        n_critic_runs: Number of critic evaluation runs (attempt files to aggregate)
critic: Critic instance to use for evaluation
final_output_file: Name of the final output file
"""
logger.info(f"Aggregating results from {max_attempts} attempts")
logger.info(f"Aggregating results from {n_critic_runs} attempts")

# Dictionary to store the best result for each instance
best_results: dict[EvalInstanceID, EvalOutput] = {}

# Work backwards from the last attempt to the first
for attempt in range(max_attempts, 0, -1):
for attempt in range(n_critic_runs, 0, -1):
attempt_file = os.path.join(
output_dir, f"output.critic_attempt_{attempt}.jsonl"
)
Expand Down
6 changes: 4 additions & 2 deletions benchmarks/utils/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,10 @@ class EvalMetadata(BaseModel):
eval_limit: int = Field(
default=0, description="Number of instances to evaluate, 0 means all"
)
max_attempts: int = Field(
default=1, ge=1, description="Maximum number of attempts for iterative mode"
n_critic_runs: int = Field(
default=1,
ge=1,
description="Number of critic evaluation runs for iterative mode",
)
critic: CriticBase = Field(
description=(
Expand Down
10 changes: 5 additions & 5 deletions tests/test_aggregate_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def test_prefers_non_error_over_error_when_last_attempt_errors(
f.write(output_3.model_dump_json() + "\n")

# Run aggregation
aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

# Verify output.jsonl contains the instance (not dropped)
final_output_file = os.path.join(temp_output_dir, "output.jsonl")
Expand Down Expand Up @@ -135,7 +135,7 @@ def test_prefers_critic_success_over_non_error_critic_fail(self, temp_output_dir
f.write(output.model_dump_json() + "\n")

# Run aggregation
aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

# Verify output.jsonl contains the instance
final_output_file = os.path.join(temp_output_dir, "output.jsonl")
Expand Down Expand Up @@ -185,7 +185,7 @@ def test_multiple_instances_with_mixed_results(self, temp_output_dir):
f.write(create_output("instance_3", error=None).model_dump_json() + "\n")

# Run aggregation
aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

# Verify all instances appear in output.jsonl
final_output_file = os.path.join(temp_output_dir, "output.jsonl")
Expand Down Expand Up @@ -214,7 +214,7 @@ def test_all_attempts_error_instance_dropped(self, temp_output_dir):
f.write(output.model_dump_json() + "\n")

# Run aggregation
aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

# Verify output.jsonl is empty (instance dropped because all attempts errored)
final_output_file = os.path.join(temp_output_dir, "output.jsonl")
Expand All @@ -228,7 +228,7 @@ def test_empty_attempts(self, temp_output_dir):
critic = PassCritic()

# Run aggregation with no attempt files
aggregate_results(temp_output_dir, max_attempts=3, critic=critic)
aggregate_results(temp_output_dir, n_critic_runs=3, critic=critic)

# Verify output.jsonl is created but empty
final_output_file = os.path.join(temp_output_dir, "output.jsonl")
Expand Down
Loading
Loading