runloopai · sid-rl · Dec 17, 2025 · Dec 16, 2025 · Dec 16, 2025 · Dec 17, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -15,7 +15,7 @@ dependencies = [
   "anyio>=3.5.0, <5",
   "distro>=1.7.0, <2",
   "sniffio",
-    "uuid-utils>=0.11.0",
+  "uuid-utils>=0.11.0",
 ]
 
 requires-python = ">= 3.9"

diff --git a/requirements-dev.lock b/requirements-dev.lock
@@ -94,7 +94,7 @@ python-dateutil==2.9.0.post0 ; python_full_version < '3.10'
     # via time-machine
 respx==0.22.0
 rich==14.2.0
-ruff==0.14.8
+ruff==0.14.9
 six==1.17.0 ; python_full_version < '3.10'
     # via python-dateutil
 sniffio==1.3.1

diff --git a/src/runloop_api_client/sdk/_types.py b/src/runloop_api_client/sdk/_types.py
@@ -5,6 +5,7 @@
 from ..lib.polling import PollingConfig
 from ..types.devboxes import DiskSnapshotListParams, DiskSnapshotUpdateParams
 from ..types.scenarios import ScorerListParams, ScorerCreateParams, ScorerUpdateParams, ScorerValidateParams
+from ..types.benchmarks import RunListScenarioRunsParams
 from ..types.input_context import InputContext
 from ..types.scenario_view import ScenarioView
 from ..types.agent_list_params import AgentListParams
@@ -203,3 +204,8 @@ class ScenarioPreview(ScenarioView):
 
     input_context: InputContextPreview  # type: ignore[assignment]
     """The input context for the Scenario."""
+
+
+# Benchmark Run params: keyword arguments accepted by (Async)BenchmarkRun.list_scenario_runs via Unpack
+class SDKBenchmarkRunListScenarioRunsParams(RunListScenarioRunsParams, BaseRequestOptions):
+    pass  # declares no keys of its own; only combines the two base param types
diff --git a/src/runloop_api_client/sdk/async_benchmark_run.py b/src/runloop_api_client/sdk/async_benchmark_run.py
@@ -0,0 +1,127 @@
+"""AsyncBenchmarkRun resource class for asynchronous operations."""
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Unpack, override
+
+from ..types import BenchmarkRunView
+from ._types import BaseRequestOptions, LongRequestOptions, SDKBenchmarkRunListScenarioRunsParams
+from .._client import AsyncRunloop
+from .async_scenario_run import AsyncScenarioRun
+
+
+class AsyncBenchmarkRun:
+    """A benchmark run for evaluating agent performance across scenarios (async).
+
+    Lightweight handle around a run ID: every coroutine method forwards to
+    ``client.benchmarks.runs`` with that ID, and no run state is cached. Obtain
+    instances via ``benchmark.run()`` or ``benchmark.list_runs()``.
+
+    Example:
+        >>> benchmark = runloop.benchmark.from_id("bench-xxx")
+        >>> run = await benchmark.run(run_name="evaluation-v1")
+        >>> info = await run.get_info()
+        >>> scenario_runs = await run.list_scenario_runs()
+    """
+
+    def __init__(self, client: AsyncRunloop, run_id: str, benchmark_id: str) -> None:
+        """Create an AsyncBenchmarkRun instance.
+
+        :param client: AsyncRunloop client instance
+        :type client: AsyncRunloop
+        :param run_id: Benchmark run ID
+        :type run_id: str
+        :param benchmark_id: Parent benchmark ID
+        :type benchmark_id: str
+        """
+        self._client = client  # every API call below goes through client.benchmarks.runs
+        self._id = run_id
+        self._benchmark_id = benchmark_id  # exposed via the property only; not sent in any request here
+
+    @override
+    def __repr__(self) -> str:
+        return f"<AsyncBenchmarkRun id={self._id!r}>"
+
+    @property
+    def id(self) -> str:
+        """Return the benchmark run ID.
+
+        :return: Unique benchmark run ID
+        :rtype: str
+        """
+        return self._id
+
+    @property
+    def benchmark_id(self) -> str:
+        """Return the parent benchmark ID.
+
+        :return: Parent benchmark ID
+        :rtype: str
+        """
+        return self._benchmark_id
+
+    async def get_info(
+        self,
+        **options: Unpack[BaseRequestOptions],
+    ) -> BenchmarkRunView:
+        """Retrieve current benchmark run status and metadata.
+
+        :param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options
+        :return: Current benchmark run state info
+        :rtype: BenchmarkRunView
+        """
+        return await self._client.benchmarks.runs.retrieve(
+            self._id,
+            **options,
+        )
+
+    async def cancel(
+        self,
+        **options: Unpack[LongRequestOptions],
+    ) -> BenchmarkRunView:
+        """Cancel the benchmark run.
+
+        Stops all running scenarios and marks the run as canceled.
+
+        :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
+        :return: Updated benchmark run state
+        :rtype: BenchmarkRunView
+        """
+        return await self._client.benchmarks.runs.cancel(
+            self._id,
+            **options,
+        )
+
+    async def complete(
+        self,
+        **options: Unpack[LongRequestOptions],
+    ) -> BenchmarkRunView:
+        """Complete the benchmark run.
+
+        Marks the run as completed. Call this after all scenarios have finished.
+
+        :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
+        :return: Completed benchmark run state
+        :rtype: BenchmarkRunView
+        """
+        return await self._client.benchmarks.runs.complete(
+            self._id,
+            **options,
+        )
+
+    async def list_scenario_runs(
+        self,
+        **params: Unpack[SDKBenchmarkRunListScenarioRunsParams],
+    ) -> List[AsyncScenarioRun]:
+        """List scenario runs for this benchmark run (single list request, no page following).
+
+        :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkRunListScenarioRunsParams` for available parameters
+        :return: One AsyncScenarioRun (built from ``id`` and ``devbox_id``) per entry in the returned page's ``runs``
+        :rtype: List[AsyncScenarioRun]
+        """
+        page = await self._client.benchmarks.runs.list_scenario_runs(
+            self._id,
+            **params,
+        )
+        return [AsyncScenarioRun(self._client, run.id, run.devbox_id) for run in page.runs]
diff --git a/src/runloop_api_client/sdk/benchmark_run.py b/src/runloop_api_client/sdk/benchmark_run.py
@@ -0,0 +1,127 @@
+"""BenchmarkRun resource class for synchronous operations."""
+
+from __future__ import annotations
+
+from typing import List
+from typing_extensions import Unpack, override
+
+from ..types import BenchmarkRunView
+from ._types import BaseRequestOptions, LongRequestOptions, SDKBenchmarkRunListScenarioRunsParams
+from .._client import Runloop
+from .scenario_run import ScenarioRun
+
+
+class BenchmarkRun:
+    """A benchmark run for evaluating agent performance across scenarios.
+
+    Lightweight handle around a run ID: every API method forwards to
+    ``client.benchmarks.runs`` with that ID, and no run state is cached. Obtain
+    instances via ``benchmark.run()`` or ``benchmark.list_runs()``.
+
+    Example:
+        >>> benchmark = runloop.benchmark.from_id("bench-xxx")
+        >>> run = benchmark.run(run_name="evaluation-v1")
+        >>> info = run.get_info()
+        >>> scenario_runs = run.list_scenario_runs()
+    """
+
+    def __init__(self, client: Runloop, run_id: str, benchmark_id: str) -> None:
+        """Create a BenchmarkRun instance.
+
+        :param client: Runloop client instance
+        :type client: Runloop
+        :param run_id: Benchmark run ID
+        :type run_id: str
+        :param benchmark_id: Parent benchmark ID
+        :type benchmark_id: str
+        """
+        self._client = client  # every API call below goes through client.benchmarks.runs
+        self._id = run_id
+        self._benchmark_id = benchmark_id  # exposed via the property only; not sent in any request here
+
+    @override
+    def __repr__(self) -> str:
+        return f"<BenchmarkRun id={self._id!r}>"
+
+    @property
+    def id(self) -> str:
+        """Return the benchmark run ID.
+
+        :return: Unique benchmark run ID
+        :rtype: str
+        """
+        return self._id
+
+    @property
+    def benchmark_id(self) -> str:
+        """Return the parent benchmark ID.
+
+        :return: Parent benchmark ID
+        :rtype: str
+        """
+        return self._benchmark_id
+
+    def get_info(
+        self,
+        **options: Unpack[BaseRequestOptions],
+    ) -> BenchmarkRunView:
+        """Retrieve current benchmark run status and metadata.
+
+        :param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options
+        :return: Current benchmark run state info
+        :rtype: BenchmarkRunView
+        """
+        return self._client.benchmarks.runs.retrieve(
+            self._id,
+            **options,
+        )
+
+    def cancel(
+        self,
+        **options: Unpack[LongRequestOptions],
+    ) -> BenchmarkRunView:
+        """Cancel the benchmark run.
+
+        Stops all running scenarios and marks the run as canceled.
+
+        :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
+        :return: Updated benchmark run state
+        :rtype: BenchmarkRunView
+        """
+        return self._client.benchmarks.runs.cancel(
+            self._id,
+            **options,
+        )
+
+    def complete(
+        self,
+        **options: Unpack[LongRequestOptions],
+    ) -> BenchmarkRunView:
+        """Complete the benchmark run.
+
+        Marks the run as completed. Call this after all scenarios have finished.
+
+        :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
+        :return: Completed benchmark run state
+        :rtype: BenchmarkRunView
+        """
+        return self._client.benchmarks.runs.complete(
+            self._id,
+            **options,
+        )
+
+    def list_scenario_runs(
+        self,
+        **params: Unpack[SDKBenchmarkRunListScenarioRunsParams],
+    ) -> List[ScenarioRun]:
+        """List scenario runs for this benchmark run (single list request, no page following).
+
+        :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkRunListScenarioRunsParams` for available parameters
+        :return: One ScenarioRun (built from ``id`` and ``devbox_id``) per entry in the returned page's ``runs``
+        :rtype: List[ScenarioRun]
+        """
+        page = self._client.benchmarks.runs.list_scenario_runs(
+            self._id,
+            **params,
+        )
+        return [ScenarioRun(self._client, run.id, run.devbox_id) for run in page.runs]
diff --git a/tests/sdk/conftest.py b/tests/sdk/conftest.py
@@ -129,6 +129,30 @@ class MockScenarioRunView:
     scoring_contract_result: object = None
 
 
+@dataclass
+class MockBenchmarkRunView:
+    """Dataclass stand-in for BenchmarkRunView in SDK tests; every field has a default."""
+
+    id: str = "bench_run_123"
+    benchmark_id: str = "bench_123"
+    state: str = "running"
+    metadata: Dict[str, str] = field(default_factory=dict)  # fresh empty dict per instance
+    start_time_ms: int = 1234567890000  # arbitrary fixed value; presumably epoch milliseconds
+    duration_ms: int | None = None  # unset unless a test provides it
+    score: float | None = None  # unset unless a test provides it
+
+
+class AsyncIterableMock:
+    """Async iterable over a fixed list of items, for tests that consume paginated responses via ``async for``."""
+
+    def __init__(self, items: list[Any]) -> None:
+        self._items = items  # held by reference, not copied
+
+    async def __aiter__(self):  # async generator: each ``async for`` starts a fresh pass over the items
+        for item in self._items:
+            yield item
+
+
 def create_mock_httpx_client(methods: dict[str, Any] | None = None) -> AsyncMock:
     """
     Create a mock httpx.AsyncClient with proper context manager setup.
@@ -237,6 +261,12 @@ def scenario_run_view() -> MockScenarioRunView:
     return MockScenarioRunView()
 
 
+@pytest.fixture
+def benchmark_run_view() -> MockBenchmarkRunView:
+    """Provide a MockBenchmarkRunView built with all default field values."""
+    return MockBenchmarkRunView()
+
+
 @pytest.fixture
 def mock_httpx_response() -> Mock:
     """Create a mock httpx.Response."""