diff --git a/benchmarks/utils/evaluation.py b/benchmarks/utils/evaluation.py index bb1a9b68a..e3a6f464e 100644 --- a/benchmarks/utils/evaluation.py +++ b/benchmarks/utils/evaluation.py @@ -14,7 +14,7 @@ from dataclasses import dataclass from datetime import datetime, timezone from pathlib import Path -from typing import Callable, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple from uuid import UUID from lmnr import Laminar @@ -384,19 +384,20 @@ def attempt_on_result(instance: EvalInstance, out: EvalOutput) -> None: pending_instances: dict[Future, PendingInstance] = {} try: for index, inst in enumerate(instances_to_process): - datapoint_id, lmnr_span_ctx = ( - LaminarService.get().create_evaluation_datapoint( - self.metadata.lmnr.eval_id, - inst.id, - self.metadata.model_dump(mode="json"), - index, - session_id=self._laminar_session_id, - trace_metadata=self._laminar_trace_meta, - ) + datapoint_id = LaminarService.get().create_evaluation_datapoint( + self.metadata.lmnr.eval_id, + inst.id, + self.metadata.model_dump(mode="json"), + index, ) fut = pool.submit( - self._process_one_mp, inst, lmnr_span_ctx, attempt + self._process_one_mp, + inst, + attempt, + lmnr_session_id=self._laminar_session_id, + lmnr_trace_metadata=self._laminar_trace_meta, + lmnr_datapoint_id=datapoint_id, ) futures.append(fut) pending_instances[fut] = PendingInstance( @@ -570,9 +571,35 @@ def _calculate_resource_factor(self, runtime_failure_count: int) -> int: factor = self.metadata.base_resource_factor * (2**runtime_failure_count) return min(factor, self.metadata.max_resource_factor) + def _cleanup_workspace( + self, workspace: RemoteWorkspace, instance: EvalInstance + ) -> None: + """Clean up workspace resources and capture conversation archive.""" + try: + self._capture_conversation_archive(workspace, instance) + except Exception as archive_error: + logger.warning( + "[child] Failed to capture conversation archive for %s: %s", + instance.id, + archive_error, + ) + try: + workspace.__exit__(None, None, None) + logger.debug("[child] cleaned up workspace for id=%s", instance.id) + except Exception as cleanup_error: + logger.warning( + f"[child] Failed to cleanup workspace for {instance.id}: " + f"{str(cleanup_error)[:50]}" + ) + # --- Worker-side method (executed in child processes) --------------------------- def _process_one_mp( - self, instance: EvalInstance, eval_span_ctx: str | None, critic_attempt: int + self, + instance: EvalInstance, + critic_attempt: int, + lmnr_session_id: str | None = None, + lmnr_trace_metadata: dict[str, Any] | None = None, + lmnr_datapoint_id: UUID | None = None, ) -> Tuple[EvalInstance, EvalOutput]: """Execute one instance in a child process with retry logic. @@ -593,155 +620,211 @@ def _process_one_mp( with redirect_stdout_stderr(log_file): logger.info("[child] start id=%s", instance.id) - retry_count = 0 - runtime_failure_count = 0 - last_error = None - max_retries = self.metadata.max_retries - runtime_runs: list[RemoteRuntimeAllocation] = [] - - while retry_count <= max_retries: - workspace = None - - # Start Laminar execution span and inject context into os.environ so workspace can pick it up - # Escape the serialized context to safely pass as a cli argument - lmnr_span = Laminar.start_active_span( - "Execution", - span_type="EXECUTOR", # type: ignore - parent_span_context=Laminar.deserialize_span_context(eval_span_ctx) - if eval_span_ctx - else None, + # Two-phase datapoint linking: + # 1. Parent creates datapoint immediately (for UI progress tracking) + # 2. Child starts eval_span when work begins (accurate timeline) + # 3. Link them via update_datapoint_trace_id (requires lmnr>=0.7.41) + # + # We don't create the datapoint here with the trace_id directly + # because the parent process queues tasks before workers pick them + # up, which would include idle wait time in the span duration. + eval_span = None + try: + eval_span = Laminar.start_active_span( + "Evaluation", + span_type="EVALUATION", # type: ignore + session_id=lmnr_session_id, + metadata=lmnr_trace_metadata, ) - exec_span_ctx = json.dumps(Laminar.serialize_span_context(lmnr_span)) - os.environ["LMNR_SPAN_CONTEXT"] = exec_span_ctx or "" - - try: - # Calculate resource factor based on runtime failures - resource_factor = self._calculate_resource_factor( - runtime_failure_count - ) - if runtime_failure_count > 0: - logger.warning( - f"[child] Instance {instance.id}: " - f"attempt {retry_count + 1}/{max_retries + 1}, " - f"runtime_failure_count={runtime_failure_count}, " - f"resource_factor={resource_factor}" - ) - - workspace = self.prepare_workspace( - instance, - resource_factor=resource_factor, - forward_env=LMNR_ENV_VARS, + eval_span_ctx = Laminar.get_laminar_span_context(eval_span) + + if lmnr_datapoint_id is not None and self.metadata.lmnr is not None: + # OpenTelemetry trace_id is a 128-bit integer in span context + trace_id = UUID(int=eval_span.get_span_context().trace_id) + logger.info( + "[child] Linking datapoint %s to trace %s for instance %s", + lmnr_datapoint_id, + trace_id, + instance.id, ) - - # Record runtime/pod mapping only for remote runtimes - if isinstance(workspace, APIRemoteWorkspace): - retry_number = retry_count + 1 # 1-indexed for readability - runtime_run = RemoteRuntimeAllocation( - runtime_id=getattr(workspace, "_runtime_id", None), - session_id=getattr(workspace, "session_id", None), - runtime_url=getattr(workspace, "_runtime_url", None), - resource_factor=resource_factor, - critic_attempt=critic_attempt, - retry=retry_number, - started_at=datetime.now(timezone.utc), + try: + # Re-initialize in child process — multiprocessing + # isolation means the parent's SDK state is not shared. + LaminarService.get().initialize() + LaminarService.get().update_datapoint_trace_id( + eval_id=self.metadata.lmnr.eval_id, + datapoint_id=lmnr_datapoint_id, + trace_id=trace_id, ) - runtime_runs.append(runtime_run) - logger.info( - "[child] runtime allocated instance=%s attempt=%d retry=%d workspace=%s runtime_id=%s session_id=%s resource_factor=%s", + except Exception as exc: + logger.error( + "[child] Failed to link datapoint %s to trace for instance %s: %s", + lmnr_datapoint_id, instance.id, - critic_attempt, - retry_number, - workspace.__class__.__name__, - runtime_run.runtime_id, - runtime_run.session_id, - runtime_run.resource_factor, + exc, + exc_info=True, ) - out = self.evaluate_instance(instance, workspace) - if runtime_runs: - out.runtime_runs = runtime_runs - logger.info("[child] done id=%s", instance.id) - return instance, out - except Exception as e: - last_error = e - retry_count += 1 - lmnr_span.record_exception(e) - # Log structured runtime allocation/init failures so we can trace instance -> runtime/pod - runtime_id = ( - getattr(workspace, "_runtime_id", None) if workspace else None + retry_count = 0 + runtime_failure_count = 0 + max_retries = self.metadata.max_retries + runtime_runs: list[RemoteRuntimeAllocation] = [] + + # max_retries is the number of *additional* attempts after the + # first, so total attempts = max_retries + 1 (retry_count 0..N). + while retry_count <= max_retries: + out = self._execute_single_attempt( + instance=instance, + eval_span_ctx=eval_span_ctx, + critic_attempt=critic_attempt, + resource_factor=self._calculate_resource_factor( + runtime_failure_count + ), + retry_count=retry_count, + max_retries=max_retries, + runtime_failure_count=runtime_failure_count, + runtime_runs=runtime_runs, ) - session_id = ( - getattr(workspace, "session_id", None) if workspace else None - ) - if isinstance(workspace, APIRemoteWorkspace) or ( - "Runtime not yet ready" in str(e) - ): - logger.warning( - "[child] runtime init failure instance=%s attempt=%d retry=%d runtime_id=%s session_id=%s error=%s", - instance.id, - critic_attempt, - retry_count, - runtime_id, - session_id, - str(e), - ) + if out is not None: + return instance, out - # TODO(#277): add an exception classifier to decide when to bump resources + # _execute_single_attempt returns None on non-final failure + retry_count += 1 runtime_failure_count += 1 - logger.warning( - f"[child] Instance {instance.id}: runtime_failure_count=" - f"{runtime_failure_count}" - ) - if retry_count <= max_retries: - logger.warning( - f"[child] Instance {instance.id} failed " - f"(attempt {retry_count}/{max_retries}): " - f"{str(e)}" - ) - else: - logger.error( - f"[child] Instance {instance.id} failed after " - f"{max_retries} retries. Last error: {str(e)}", - exc_info=True, - ) - # Create error output for final failure - error_output = self._create_error_output( - instance, last_error, max_retries - ) - if runtime_runs: - error_output.runtime_runs = runtime_runs - return instance, error_output - finally: - # Ensure workspace cleanup happens regardless of success or failure - if workspace is not None: - try: - self._capture_conversation_archive(workspace, instance) - except Exception as archive_error: - logger.warning( - "[child] Failed to capture conversation archive for %s: %s", - instance.id, - archive_error, - ) - try: - workspace.__exit__(None, None, None) - logger.debug( - "[child] cleaned up workspace for id=%s", instance.id - ) - except Exception as cleanup_error: - logger.warning( - f"[child] Failed to cleanup workspace for {instance.id}: " - f"{str(cleanup_error)[:50]}" - ) - lmnr_span.end() + # Unreachable: _execute_single_attempt always returns EvalOutput + # on the final retry, but pyright can't prove the loop exits early. + raise AssertionError("unreachable") # pragma: no cover + finally: + if eval_span is not None: + eval_span.end() + + def _execute_single_attempt( + self, + instance: EvalInstance, + eval_span_ctx: Any, + critic_attempt: int, + resource_factor: int, + retry_count: int, + max_retries: int, + runtime_failure_count: int, + runtime_runs: list[RemoteRuntimeAllocation], + ) -> EvalOutput | None: + """Execute one attempt with proper span and workspace lifecycle. + + Returns: + EvalOutput: on success, or on the *final* retry failure + (retry_count == max_retries) so the caller can report it. + None: on a non-final failure, signalling the caller should retry:: + + out = self._execute_single_attempt(...) + if out is not None: + return instance, out # done (success or final failure) + # else: bump counters and loop + """ + workspace = None + exec_span = None + try: + # Serialize span context and inject via environment variable so workspace can pick it up + exec_span = Laminar.start_active_span( + "Execution", + span_type="EXECUTOR", # type: ignore + parent_span_context=eval_span_ctx, + ) + exec_span_ctx = json.dumps(Laminar.serialize_span_context(exec_span)) + os.environ["LMNR_SPAN_CONTEXT"] = exec_span_ctx or "" + + if runtime_failure_count > 0: + logger.warning( + f"[child] Instance {instance.id}: " + f"attempt {retry_count + 1}/{max_retries + 1}, " + f"runtime_failure_count={runtime_failure_count}, " + f"resource_factor={resource_factor}" + ) - # This should never be reached, but added for type safety - error_output = self._create_error_output( - instance, Exception("Unexpected error: no attempts made"), max_retries + workspace = self.prepare_workspace( + instance, + resource_factor=resource_factor, + forward_env=LMNR_ENV_VARS, ) + + # Record runtime/pod mapping only for remote runtimes + if isinstance(workspace, APIRemoteWorkspace): + retry_number = retry_count + 1 # 1-indexed for readability + runtime_run = RemoteRuntimeAllocation( + runtime_id=getattr(workspace, "_runtime_id", None), + session_id=getattr(workspace, "session_id", None), + runtime_url=getattr(workspace, "_runtime_url", None), + resource_factor=resource_factor, + critic_attempt=critic_attempt, + retry=retry_number, + started_at=datetime.now(timezone.utc), + ) + runtime_runs.append(runtime_run) + logger.info( + "[child] runtime allocated instance=%s attempt=%d retry=%d workspace=%s runtime_id=%s session_id=%s resource_factor=%s", + instance.id, + critic_attempt, + retry_number, + workspace.__class__.__name__, + runtime_run.runtime_id, + runtime_run.session_id, + runtime_run.resource_factor, + ) + out = self.evaluate_instance(instance, workspace) if runtime_runs: - error_output.runtime_runs = runtime_runs - return instance, error_output + out.runtime_runs = runtime_runs + logger.info("[child] done id=%s", instance.id) + return out + except Exception as e: + if exec_span is not None: + exec_span.record_exception(e) + + # Log structured runtime allocation/init failures so we can trace instance -> runtime/pod + runtime_id = getattr(workspace, "_runtime_id", None) if workspace else None + session_id = getattr(workspace, "session_id", None) if workspace else None + if isinstance(workspace, APIRemoteWorkspace) or ( + "Runtime not yet ready" in str(e) + ): + logger.warning( + "[child] runtime init failure instance=%s attempt=%d retry=%d runtime_id=%s session_id=%s error=%s", + instance.id, + critic_attempt, + retry_count + 1, + runtime_id, + session_id, + str(e), + ) + + # TODO(#277): add an exception classifier to decide when to bump resources + logger.warning( + f"[child] Instance {instance.id}: runtime_failure_count=" + f"{runtime_failure_count + 1}" + ) + + if retry_count < max_retries: + logger.warning( + f"[child] Instance {instance.id} failed " + f"(attempt {retry_count + 1}/{max_retries}): " + f"{str(e)}" + ) + else: + logger.error( + f"[child] Instance {instance.id} failed after " + f"{max_retries} retries. Last error: {str(e)}", + exc_info=True, + ) + # Create error output for final failure + error_output = self._create_error_output(instance, e, max_retries) + if runtime_runs: + error_output.runtime_runs = runtime_runs + return error_output + return None + finally: + if workspace is not None: + self._cleanup_workspace(workspace, instance) + if exec_span is not None: + exec_span.end() # ---------- Multiprocessing logging helper --------------------------------------- diff --git a/benchmarks/utils/laminar.py b/benchmarks/utils/laminar.py index 921418191..0fe9773a0 100644 --- a/benchmarks/utils/laminar.py +++ b/benchmarks/utils/laminar.py @@ -103,51 +103,71 @@ def create_evaluation_datapoint( data: Any, metadata: dict[str, Any], index: int, - session_id: str | None = None, - trace_metadata: dict[str, Any] | None = None, - ) -> tuple[UUID | None, str | None]: + ) -> UUID | None: """ - Create a Laminar datapoint. - Creates a new span for the evaluation and returns the span context. - Session ID and trace metadata are set on the span if provided. + Create a Laminar datapoint without trace linkage. + + The datapoint is created immediately for UI visibility, but the trace_id + is set later (via update_datapoint_trace_id) when the child process + actually starts the evaluation span. This ensures accurate timeline + measurement that excludes queue wait time. + + Note: session_id and trace_metadata are intentionally not set here; + they are applied when the child process creates the root eval span. + + Returns the datapoint_id. """ if eval_id is None: - return None, None + return None client = self._get_client() if client is None: - return None, None + return None try: - eval_span = Laminar.start_active_span( - "Evaluation", - span_type="EVALUATION", # type: ignore - ) - # Set session ID and metadata on the active span - if session_id: - Laminar.set_trace_session_id(session_id) - if trace_metadata: - Laminar.set_trace_metadata(trace_metadata) - - lmnr_span_ctx = Laminar.serialize_span_context(eval_span) - eval_span.end() - return client.evals.create_datapoint( eval_id=eval_id, data=data, target=1, metadata=metadata, index=index, - trace_id=UUID(int=eval_span.get_span_context().trace_id), - ), lmnr_span_ctx + ) except Exception as exc: logger.debug( "Failed to create Laminar datapoint for eval %s: %s", eval_id, exc, ) - return None, None + return None + + def update_datapoint_trace_id( + self, + eval_id: UUID | None, + datapoint_id: UUID | None, + trace_id: UUID, + ) -> None: + """Link a datapoint to a trace after the span has been created.""" + + client = self._get_client() + if client is None or not eval_id or not datapoint_id: + return + + try: + client.evals.update_datapoint( + eval_id=eval_id, + datapoint_id=datapoint_id, + scores={}, + trace_id=trace_id, + ) + except Exception as exc: # pragma: no cover - defensive logging + logger.warning( + "Failed to link datapoint %s to trace %s for eval %s: %s", + datapoint_id, + trace_id, + eval_id, + exc, + ) def _update_evaluation_datapoint( self, diff --git a/pyproject.toml b/pyproject.toml index 2cb142648..330d901b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "python-json-logger>=3.3.0", "tenacity>=9.1.2", "websockets>=12", - "lmnr>=0.7.24", + "lmnr>=0.7.41", # TODO Remove the macOS exclusion once https://github.com/multi-swe-bench/multi-swe-bench/pull/93 is merged and released "multi-swe-bench>=1.1.1; sys_platform != 'darwin'", "swt-bench @ git+https://github.com/logic-star-ai/swt-bench.git@5fdcd446ff05e248ecfffc19d560a210699f71f8", diff --git a/tests/test_workspace_cleanup.py b/tests/test_workspace_cleanup.py index 1af4e764a..c5d1f32f2 100644 --- a/tests/test_workspace_cleanup.py +++ b/tests/test_workspace_cleanup.py @@ -1,10 +1,12 @@ """Tests for workspace cleanup functionality in the evaluation module.""" from typing import List -from unittest.mock import Mock +from unittest.mock import Mock, patch +from uuid import UUID import pytest +from benchmarks.utils.laminar import LaminarEvalMetadata from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from openhands.sdk import LLM from openhands.sdk.critic import PassCritic @@ -60,7 +62,7 @@ def evaluate_instance(self, instance, workspace): evaluator = TestEvaluation(metadata=metadata, num_workers=1) result_instance, result_output = evaluator._process_one_mp( - test_instance, None, critic_attempt=1 + test_instance, critic_attempt=1 ) mock_workspace.__exit__.assert_called_once_with(None, None, None) @@ -111,7 +113,7 @@ def evaluate_instance(self, instance, workspace): evaluator = TestEvaluation(metadata=metadata, num_workers=1) result_instance, result_output = evaluator._process_one_mp( - test_instance, None, critic_attempt=1 + test_instance, critic_attempt=1 ) mock_workspace.__exit__.assert_called_once_with(None, None, None) @@ -171,7 +173,7 @@ def evaluate_instance(self, instance, workspace): evaluator = TestEvaluation(metadata=metadata, num_workers=1) result_instance, result_output = evaluator._process_one_mp( - test_instance, None, critic_attempt=1 + test_instance, critic_attempt=1 ) mock_workspace.__exit__.assert_called_once_with(None, None, None) @@ -241,7 +243,7 @@ def evaluate_instance(self, instance, workspace): evaluator = TestEvaluation(metadata=metadata, num_workers=1) result_instance, result_output = evaluator._process_one_mp( - test_instance, None, critic_attempt=1 + test_instance, critic_attempt=1 ) assert len(workspaces_created) == 3 @@ -253,5 +255,220 @@ def evaluate_instance(self, instance, workspace): assert result_output.error is None +def test_datapoint_trace_id_linked_in_worker(): + """Test that update_datapoint_trace_id is called when a datapoint_id is provided.""" + from benchmarks.utils.evaluation import Evaluation + + mock_workspace = Mock() + mock_workspace.__exit__ = Mock() + + test_instance = EvalInstance(id="test_instance", data={"test": "data"}) + test_output = EvalOutput( + instance_id="test_instance", + test_result={"success": True}, + instruction="test instruction", + error=None, + history=[], + instance={"test": "data"}, + ) + + eval_id = UUID("12345678-1234-1234-1234-123456789abc") + datapoint_id = UUID("abcdef01-abcd-abcd-abcd-abcdef012345") + + llm = LLM(model="test-model") + metadata = EvalMetadata( + llm=llm, + dataset="test", + dataset_split="test", + max_iterations=10, + eval_output_dir="/tmp/test", + details={}, + eval_limit=1, + max_attempts=1, + max_retries=0, + critic=PassCritic(), + ) + metadata.lmnr = LaminarEvalMetadata(eval_id=eval_id) + + class TestEvaluation(Evaluation): + def prepare_instances(self) -> List[EvalInstance]: + return [test_instance] + + def prepare_workspace( + self, + instance: EvalInstance, + resource_factor: int = 1, + forward_env: list[str] | None = None, + ): + mock_workspace.forward_env = forward_env or [] + mock_workspace.resource_factor = resource_factor + return mock_workspace + + def evaluate_instance(self, instance, workspace): + return test_output + + evaluator = TestEvaluation(metadata=metadata, num_workers=1) + + with patch("benchmarks.utils.evaluation.LaminarService") as mock_lmnr_svc: + mock_service = Mock() + mock_lmnr_svc.get.return_value = mock_service + + result_instance, result_output = evaluator._process_one_mp( + test_instance, + critic_attempt=1, + lmnr_datapoint_id=datapoint_id, + ) + + # Verify update_datapoint_trace_id was called with the correct eval_id and datapoint_id + mock_service.update_datapoint_trace_id.assert_called_once() + call_kwargs = mock_service.update_datapoint_trace_id.call_args + assert call_kwargs.kwargs["eval_id"] == eval_id + assert call_kwargs.kwargs["datapoint_id"] == datapoint_id + assert isinstance(call_kwargs.kwargs["trace_id"], UUID) + assert ( + call_kwargs.kwargs["trace_id"].int != 0 + ) # Verify it's not a zero/default UUID + + assert result_instance.id == "test_instance" + assert result_output.error is None + + +def test_datapoint_trace_id_not_linked_without_datapoint(): + """Test that update_datapoint_trace_id is NOT called when no datapoint_id is provided.""" + from benchmarks.utils.evaluation import Evaluation + + mock_workspace = Mock() + mock_workspace.__exit__ = Mock() + + test_instance = EvalInstance(id="test_instance", data={"test": "data"}) + test_output = EvalOutput( + instance_id="test_instance", + test_result={"success": True}, + instruction="test instruction", + error=None, + history=[], + instance={"test": "data"}, + ) + + llm = LLM(model="test-model") + metadata = EvalMetadata( + llm=llm, + dataset="test", + dataset_split="test", + max_iterations=10, + eval_output_dir="/tmp/test", + details={}, + eval_limit=1, + max_attempts=1, + max_retries=0, + critic=PassCritic(), + ) + + class TestEvaluation(Evaluation): + def prepare_instances(self) -> List[EvalInstance]: + return [test_instance] + + def prepare_workspace( + self, + instance: EvalInstance, + resource_factor: int = 1, + forward_env: list[str] | None = None, + ): + mock_workspace.forward_env = forward_env or [] + mock_workspace.resource_factor = resource_factor + return mock_workspace + + def evaluate_instance(self, instance, workspace): + return test_output + + evaluator = TestEvaluation(metadata=metadata, num_workers=1) + + with patch("benchmarks.utils.evaluation.LaminarService") as mock_lmnr_svc: + mock_service = Mock() + mock_lmnr_svc.get.return_value = mock_service + + result_instance, result_output = evaluator._process_one_mp( + test_instance, + critic_attempt=1, + # No lmnr_datapoint_id passed + ) + + mock_service.update_datapoint_trace_id.assert_not_called() + assert result_output.error is None + + +def test_update_datapoint_trace_id_failure_does_not_break_eval(): + """Test that a failure in update_datapoint_trace_id does not prevent evaluation.""" + from benchmarks.utils.evaluation import Evaluation + + mock_workspace = Mock() + mock_workspace.__exit__ = Mock() + + test_instance = EvalInstance(id="test_instance", data={"test": "data"}) + test_output = EvalOutput( + instance_id="test_instance", + test_result={"success": True}, + instruction="test instruction", + error=None, + history=[], + instance={"test": "data"}, + ) + + eval_id = UUID("12345678-1234-1234-1234-123456789abc") + datapoint_id = UUID("abcdef01-abcd-abcd-abcd-abcdef012345") + + llm = LLM(model="test-model") + metadata = EvalMetadata( + llm=llm, + dataset="test", + dataset_split="test", + max_iterations=10, + eval_output_dir="/tmp/test", + details={}, + eval_limit=1, + max_attempts=1, + max_retries=0, + critic=PassCritic(), + ) + metadata.lmnr = LaminarEvalMetadata(eval_id=eval_id) + + class TestEvaluation(Evaluation): + def prepare_instances(self) -> List[EvalInstance]: + return [test_instance] + + def prepare_workspace( + self, + instance: EvalInstance, + resource_factor: int = 1, + forward_env: list[str] | None = None, + ): + mock_workspace.forward_env = forward_env or [] + mock_workspace.resource_factor = resource_factor + return mock_workspace + + def evaluate_instance(self, instance, workspace): + return test_output + + evaluator = TestEvaluation(metadata=metadata, num_workers=1) + + with patch("benchmarks.utils.evaluation.LaminarService") as mock_lmnr_svc: + mock_service = Mock() + mock_service.update_datapoint_trace_id.side_effect = RuntimeError( + "Network error" + ) + mock_lmnr_svc.get.return_value = mock_service + + # Should not raise despite update_datapoint_trace_id failure + result_instance, result_output = evaluator._process_one_mp( + test_instance, + critic_attempt=1, + lmnr_datapoint_id=datapoint_id, + ) + + # The evaluation should still succeed + assert result_instance.id == "test_instance" + assert result_output.error is None + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/uv.lock b/uv.lock index db834cb47..301f593f5 100644 --- a/uv.lock +++ b/uv.lock @@ -1741,7 +1741,7 @@ wheels = [ [[package]] name = "lmnr" -version = "0.7.25" +version = "0.7.44" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "grpcio", version = "1.67.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14'" }, @@ -1762,9 +1762,9 @@ dependencies = [ { name = "tenacity" }, { name = "tqdm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/10/bd/a65219ca6f09199e35a14a55acb503e3ac896db15018d342076bd24401e1/lmnr-0.7.25.tar.gz", hash = "sha256:a3a0ba9a305243bbe97f2fcb8afc7d39d201dc11107b4633c257b64b838b2979", size = 203876, upload-time = "2025-12-18T17:31:24.507Z" } +sdist = { url = "https://files.pythonhosted.org/packages/45/8d/6077e76cc1801799c496cd31686c68a40ac87afc508de5725767c1bea51d/lmnr-0.7.44.tar.gz", hash = "sha256:001cdb87554afcc1afff72333fce820591a595b30962486437186590ceb1c20b", size = 239647, upload-time = "2026-02-27T18:41:02.638Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/45/35/1f22e3fea98416d58dddbdbc63e18ddcfb2b8f850b8ec065652a90d99666/lmnr-0.7.25-py3-none-any.whl", hash = "sha256:c0539d5f8c8e59a2d5d0ab04e498a82351d51fde8cf04ef8b312424e0be537ac", size = 266040, upload-time = "2025-12-18T17:31:22.986Z" }, + { url = "https://files.pythonhosted.org/packages/be/0d/7b4cd20a70085fd0ff844197245b214aaaf8a0f13098f356be9e7d22fc4f/lmnr-0.7.44-py3-none-any.whl", hash = "sha256:3c6f7b444586d9aa3db0b0ff9f9249c4d3647e146c899523bfefad2ffd332760", size = 314771, upload-time = "2026-02-27T18:41:00.869Z" }, ] [[package]] @@ -2399,7 +2399,7 @@ requires-dist = [ { name = "huggingface-hub" }, { name = "jinja2" }, { name = "litellm", specifier = ">=1.77.7.dev9" }, - { name = "lmnr", specifier = ">=0.7.24" }, + { name = "lmnr", specifier = ">=0.7.41" }, { name = "modal", specifier = ">=1.1.4" }, { name = "multi-swe-bench", marker = "sys_platform != 'darwin'", specifier = ">=1.1.1" }, { name = "openhands-agent-server", editable = "vendor/software-agent-sdk/openhands-agent-server" }, @@ -2526,32 +2526,32 @@ requires-dist = [ [[package]] name = "opentelemetry-api" -version = "1.38.0" +version = "1.39.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "importlib-metadata" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/08/d8/0f354c375628e048bd0570645b310797299754730079853095bf000fba69/opentelemetry_api-1.38.0.tar.gz", hash = "sha256:f4c193b5e8acb0912b06ac5b16321908dd0843d75049c091487322284a3eea12", size = 65242, upload-time = "2025-10-16T08:35:50.25Z" } +sdist = { url = "https://files.pythonhosted.org/packages/97/b9/3161be15bb8e3ad01be8be5a968a9237c3027c5be504362ff800fca3e442/opentelemetry_api-1.39.1.tar.gz", hash = "sha256:fbde8c80e1b937a2c61f20347e91c0c18a1940cecf012d62e65a7caf08967c9c", size = 65767, upload-time = "2025-12-11T13:32:39.182Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/ae/a2/d86e01c28300bd41bab8f18afd613676e2bd63515417b77636fc1add426f/opentelemetry_api-1.38.0-py3-none-any.whl", hash = "sha256:2891b0197f47124454ab9f0cf58f3be33faca394457ac3e09daba13ff50aa582", size = 65947, upload-time = "2025-10-16T08:35:30.23Z" }, + { url = "https://files.pythonhosted.org/packages/cf/df/d3f1ddf4bb4cb50ed9b1139cc7b1c54c34a1e7ce8fd1b9a37c0d1551a6bd/opentelemetry_api-1.39.1-py3-none-any.whl", hash = "sha256:2edd8463432a7f8443edce90972169b195e7d6a05500cd29e6d13898187c9950", size = 66356, upload-time = "2025-12-11T13:32:17.304Z" }, ] [[package]] name = "opentelemetry-exporter-otlp-proto-common" -version = "1.38.0" +version = "1.39.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-proto" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/19/83/dd4660f2956ff88ed071e9e0e36e830df14b8c5dc06722dbde1841accbe8/opentelemetry_exporter_otlp_proto_common-1.38.0.tar.gz", hash = "sha256:e333278afab4695aa8114eeb7bf4e44e65c6607d54968271a249c180b2cb605c", size = 20431, upload-time = "2025-10-16T08:35:53.285Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e9/9d/22d241b66f7bbde88a3bfa6847a351d2c46b84de23e71222c6aae25c7050/opentelemetry_exporter_otlp_proto_common-1.39.1.tar.gz", hash = "sha256:763370d4737a59741c89a67b50f9e39271639ee4afc999dadfe768541c027464", size = 20409, upload-time = "2025-12-11T13:32:40.885Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/9e/55a41c9601191e8cd8eb626b54ee6827b9c9d4a46d736f32abc80d8039fc/opentelemetry_exporter_otlp_proto_common-1.38.0-py3-none-any.whl", hash = "sha256:03cb76ab213300fe4f4c62b7d8f17d97fcfd21b89f0b5ce38ea156327ddda74a", size = 18359, upload-time = "2025-10-16T08:35:34.099Z" }, + { url = "https://files.pythonhosted.org/packages/8c/02/ffc3e143d89a27ac21fd557365b98bd0653b98de8a101151d5805b5d4c33/opentelemetry_exporter_otlp_proto_common-1.39.1-py3-none-any.whl", hash = "sha256:08f8a5862d64cc3435105686d0216c1365dc5701f86844a8cd56597d0c764fde", size = 18366, upload-time = "2025-12-11T13:32:20.2Z" }, ] [[package]] name = "opentelemetry-exporter-otlp-proto-grpc" -version = "1.38.0" +version = "1.39.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "googleapis-common-protos" }, @@ -2563,14 +2563,14 @@ dependencies = [ { name = "opentelemetry-sdk" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a2/c0/43222f5b97dc10812bc4f0abc5dc7cd0a2525a91b5151d26c9e2e958f52e/opentelemetry_exporter_otlp_proto_grpc-1.38.0.tar.gz", hash = "sha256:2473935e9eac71f401de6101d37d6f3f0f1831db92b953c7dcc912536158ebd6", size = 24676, upload-time = "2025-10-16T08:35:53.83Z" } +sdist = { url = "https://files.pythonhosted.org/packages/53/48/b329fed2c610c2c32c9366d9dc597202c9d1e58e631c137ba15248d8850f/opentelemetry_exporter_otlp_proto_grpc-1.39.1.tar.gz", hash = "sha256:772eb1c9287485d625e4dbe9c879898e5253fea111d9181140f51291b5fec3ad", size = 24650, upload-time = "2025-12-11T13:32:41.429Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/28/f0/bd831afbdba74ca2ce3982142a2fad707f8c487e8a3b6fef01f1d5945d1b/opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl", hash = "sha256:7c49fd9b4bd0dbe9ba13d91f764c2d20b0025649a6e4ac35792fb8d84d764bc7", size = 19695, upload-time = "2025-10-16T08:35:35.053Z" }, + { url = "https://files.pythonhosted.org/packages/81/a3/cc9b66575bd6597b98b886a2067eea2693408d2d5f39dad9ab7fc264f5f3/opentelemetry_exporter_otlp_proto_grpc-1.39.1-py3-none-any.whl", hash = "sha256:fa1c136a05c7e9b4c09f739469cbdb927ea20b34088ab1d959a849b5cc589c18", size = 19766, upload-time = "2025-12-11T13:32:21.027Z" }, ] [[package]] name = "opentelemetry-exporter-otlp-proto-http" -version = "1.38.0" +version = "1.39.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "googleapis-common-protos" }, @@ -2581,14 +2581,14 @@ dependencies = [ { name = "requests" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/81/0a/debcdfb029fbd1ccd1563f7c287b89a6f7bef3b2902ade56797bfd020854/opentelemetry_exporter_otlp_proto_http-1.38.0.tar.gz", hash = "sha256:f16bd44baf15cbe07633c5112ffc68229d0edbeac7b37610be0b2def4e21e90b", size = 17282, upload-time = "2025-10-16T08:35:54.422Z" } +sdist = { url = "https://files.pythonhosted.org/packages/80/04/2a08fa9c0214ae38880df01e8bfae12b067ec0793446578575e5080d6545/opentelemetry_exporter_otlp_proto_http-1.39.1.tar.gz", hash = "sha256:31bdab9745c709ce90a49a0624c2bd445d31a28ba34275951a6a362d16a0b9cb", size = 17288, upload-time = "2025-12-11T13:32:42.029Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/77/154004c99fb9f291f74aa0822a2f5bbf565a72d8126b3a1b63ed8e5f83c7/opentelemetry_exporter_otlp_proto_http-1.38.0-py3-none-any.whl", hash = "sha256:84b937305edfc563f08ec69b9cb2298be8188371217e867c1854d77198d0825b", size = 19579, upload-time = "2025-10-16T08:35:36.269Z" }, + { url = "https://files.pythonhosted.org/packages/95/f1/b27d3e2e003cd9a3592c43d099d2ed8d0a947c15281bf8463a256db0b46c/opentelemetry_exporter_otlp_proto_http-1.39.1-py3-none-any.whl", hash = "sha256:d9f5207183dd752a412c4cd564ca8875ececba13be6e9c6c370ffb752fd59985", size = 19641, upload-time = "2025-12-11T13:32:22.248Z" }, ] [[package]] name = "opentelemetry-instrumentation" -version = "0.59b0" +version = "0.60b1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, @@ -2596,62 +2596,62 @@ dependencies = [ { name = "packaging" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/04/ed/9c65cd209407fd807fa05be03ee30f159bdac8d59e7ea16a8fe5a1601222/opentelemetry_instrumentation-0.59b0.tar.gz", hash = "sha256:6010f0faaacdaf7c4dff8aac84e226d23437b331dcda7e70367f6d73a7db1adc", size = 31544, upload-time = "2025-10-16T08:39:31.959Z" } +sdist = { url = "https://files.pythonhosted.org/packages/41/0f/7e6b713ac117c1f5e4e3300748af699b9902a2e5e34c9cf443dde25a01fa/opentelemetry_instrumentation-0.60b1.tar.gz", hash = "sha256:57ddc7974c6eb35865af0426d1a17132b88b2ed8586897fee187fd5b8944bd6a", size = 31706, upload-time = "2025-12-11T13:36:42.515Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/10/f5/7a40ff3f62bfe715dad2f633d7f1174ba1a7dd74254c15b2558b3401262a/opentelemetry_instrumentation-0.59b0-py3-none-any.whl", hash = "sha256:44082cc8fe56b0186e87ee8f7c17c327c4c2ce93bdbe86496e600985d74368ee", size = 33020, upload-time = "2025-10-16T08:38:31.463Z" }, + { url = "https://files.pythonhosted.org/packages/77/d2/6788e83c5c86a2690101681aeef27eeb2a6bf22df52d3f263a22cee20915/opentelemetry_instrumentation-0.60b1-py3-none-any.whl", hash = "sha256:04480db952b48fb1ed0073f822f0ee26012b7be7c3eac1a3793122737c78632d", size = 33096, upload-time = "2025-12-11T13:35:33.067Z" }, ] [[package]] name = "opentelemetry-instrumentation-threading" -version = "0.59b0" +version = "0.60b1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, { name = "opentelemetry-instrumentation" }, { name = "wrapt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/82/7a/84e97d8992808197006e607ae410c2219bdbbc23d1289ba0c244d3220741/opentelemetry_instrumentation_threading-0.59b0.tar.gz", hash = "sha256:ce5658730b697dcbc0e0d6d13643a69fd8aeb1b32fa8db3bade8ce114c7975f3", size = 8770, upload-time = "2025-10-16T08:40:03.587Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9b/0a/e36123ec4c0910a3936b92982545a53e9bca5b26a28df06883751a783f84/opentelemetry_instrumentation_threading-0.60b1.tar.gz", hash = "sha256:20b18a68abe5801fa9474336b7c27487d4af3e00b66f6a8734e4fdd75c8b0b43", size = 8768, upload-time = "2025-12-11T13:37:16.29Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b8/50/32d29076aaa1c91983cdd3ca8c6bb4d344830cd7d87a7c0fdc2d98c58509/opentelemetry_instrumentation_threading-0.59b0-py3-none-any.whl", hash = "sha256:76da2fc01fe1dccebff6581080cff9e42ac7b27cc61eb563f3c4435c727e8eca", size = 9313, upload-time = "2025-10-16T08:39:15.876Z" }, + { url = "https://files.pythonhosted.org/packages/c7/a3/448738b927bcc1843ace7d4ed55dd54441a71363075eeeee89c5944dd740/opentelemetry_instrumentation_threading-0.60b1-py3-none-any.whl", hash = "sha256:92a52a60fee5e32bc6aa8f5acd749b15691ad0bc4457a310f5736b76a6d9d1de", size = 9312, upload-time = "2025-12-11T13:36:28.434Z" }, ] [[package]] name = "opentelemetry-proto" -version = "1.38.0" +version = "1.39.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "protobuf" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/51/14/f0c4f0f6371b9cb7f9fa9ee8918bfd59ac7040c7791f1e6da32a1839780d/opentelemetry_proto-1.38.0.tar.gz", hash = "sha256:88b161e89d9d372ce723da289b7da74c3a8354a8e5359992be813942969ed468", size = 46152, upload-time = "2025-10-16T08:36:01.612Z" } +sdist = { url = "https://files.pythonhosted.org/packages/49/1d/f25d76d8260c156c40c97c9ed4511ec0f9ce353f8108ca6e7561f82a06b2/opentelemetry_proto-1.39.1.tar.gz", hash = "sha256:6c8e05144fc0d3ed4d22c2289c6b126e03bcd0e6a7da0f16cedd2e1c2772e2c8", size = 46152, upload-time = "2025-12-11T13:32:48.681Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b6/6a/82b68b14efca5150b2632f3692d627afa76b77378c4999f2648979409528/opentelemetry_proto-1.38.0-py3-none-any.whl", hash = "sha256:b6ebe54d3217c42e45462e2a1ae28c3e2bf2ec5a5645236a490f55f45f1a0a18", size = 72535, upload-time = "2025-10-16T08:35:45.749Z" }, + { url = "https://files.pythonhosted.org/packages/51/95/b40c96a7b5203005a0b03d8ce8cd212ff23f1793d5ba289c87a097571b18/opentelemetry_proto-1.39.1-py3-none-any.whl", hash = "sha256:22cdc78efd3b3765d09e68bfbd010d4fc254c9818afd0b6b423387d9dee46007", size = 72535, upload-time = "2025-12-11T13:32:33.866Z" }, ] [[package]] name = "opentelemetry-sdk" -version = "1.38.0" +version = "1.39.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, { name = "opentelemetry-semantic-conventions" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/85/cb/f0eee1445161faf4c9af3ba7b848cc22a50a3d3e2515051ad8628c35ff80/opentelemetry_sdk-1.38.0.tar.gz", hash = "sha256:93df5d4d871ed09cb4272305be4d996236eedb232253e3ab864c8620f051cebe", size = 171942, upload-time = "2025-10-16T08:36:02.257Z" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/fb/c76080c9ba07e1e8235d24cdcc4d125ef7aa3edf23eb4e497c2e50889adc/opentelemetry_sdk-1.39.1.tar.gz", hash = "sha256:cf4d4563caf7bff906c9f7967e2be22d0d6b349b908be0d90fb21c8e9c995cc6", size = 171460, upload-time = "2025-12-11T13:32:49.369Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/2f/2e/e93777a95d7d9c40d270a371392b6d6f1ff170c2a3cb32d6176741b5b723/opentelemetry_sdk-1.38.0-py3-none-any.whl", hash = "sha256:1c66af6564ecc1553d72d811a01df063ff097cdc82ce188da9951f93b8d10f6b", size = 132349, upload-time = "2025-10-16T08:35:46.995Z" }, + { url = "https://files.pythonhosted.org/packages/7c/98/e91cf858f203d86f4eccdf763dcf01cf03f1dae80c3750f7e635bfa206b6/opentelemetry_sdk-1.39.1-py3-none-any.whl", hash = "sha256:4d5482c478513ecb0a5d938dcc61394e647066e0cc2676bee9f3af3f3f45f01c", size = 132565, upload-time = "2025-12-11T13:32:35.069Z" }, ] [[package]] name = "opentelemetry-semantic-conventions" -version = "0.59b0" +version = "0.60b1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "opentelemetry-api" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/40/bc/8b9ad3802cd8ac6583a4eb7de7e5d7db004e89cb7efe7008f9c8a537ee75/opentelemetry_semantic_conventions-0.59b0.tar.gz", hash = "sha256:7a6db3f30d70202d5bf9fa4b69bc866ca6a30437287de6c510fb594878aed6b0", size = 129861, upload-time = "2025-10-16T08:36:03.346Z" } +sdist = { url = "https://files.pythonhosted.org/packages/91/df/553f93ed38bf22f4b999d9be9c185adb558982214f33eae539d3b5cd0858/opentelemetry_semantic_conventions-0.60b1.tar.gz", hash = "sha256:87c228b5a0669b748c76d76df6c364c369c28f1c465e50f661e39737e84bc953", size = 137935, upload-time = "2025-12-11T13:32:50.487Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/24/7d/c88d7b15ba8fe5c6b8f93be50fc11795e9fc05386c44afaf6b76fe191f9b/opentelemetry_semantic_conventions-0.59b0-py3-none-any.whl", hash = "sha256:35d3b8833ef97d614136e253c1da9342b4c3c083bbaf29ce31d572a1c3825eed", size = 207954, upload-time = "2025-10-16T08:35:48.054Z" }, + { url = "https://files.pythonhosted.org/packages/7a/5e/5958555e09635d09b75de3c4f8b9cae7335ca545d77392ffe7331534c402/opentelemetry_semantic_conventions-0.60b1-py3-none-any.whl", hash = "sha256:9fa8c8b0c110da289809292b0591220d3a7b53c1526a23021e977d68597893fb", size = 219982, upload-time = "2025-12-11T13:32:36.955Z" }, ] [[package]]