Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ dependencies = [
"anyio>=3.5.0, <5",
"distro>=1.7.0, <2",
"sniffio",
"uuid-utils>=0.11.0",
"uuid-utils>=0.11.0",
]

requires-python = ">= 3.9"
Expand Down
2 changes: 1 addition & 1 deletion requirements-dev.lock
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ python-dateutil==2.9.0.post0 ; python_full_version < '3.10'
# via time-machine
respx==0.22.0
rich==14.2.0
ruff==0.14.8
ruff==0.14.9
six==1.17.0 ; python_full_version < '3.10'
# via python-dateutil
sniffio==1.3.1
Expand Down
6 changes: 6 additions & 0 deletions src/runloop_api_client/sdk/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..lib.polling import PollingConfig
from ..types.devboxes import DiskSnapshotListParams, DiskSnapshotUpdateParams
from ..types.scenarios import ScorerListParams, ScorerCreateParams, ScorerUpdateParams, ScorerValidateParams
from ..types.benchmarks import RunListScenarioRunsParams
from ..types.input_context import InputContext
from ..types.scenario_view import ScenarioView
from ..types.agent_list_params import AgentListParams
Expand Down Expand Up @@ -203,3 +204,8 @@ class ScenarioPreview(ScenarioView):

input_context: InputContextPreview # type: ignore[assignment]
"""The input context for the Scenario."""


# Benchmark Run params
class SDKBenchmarkRunListScenarioRunsParams(RunListScenarioRunsParams, BaseRequestOptions):
    """Parameters for listing the scenario runs of a benchmark run.

    Combines everything declared by ``RunListScenarioRunsParams`` with
    everything declared by ``BaseRequestOptions``; it declares nothing of
    its own.
    """
127 changes: 127 additions & 0 deletions src/runloop_api_client/sdk/async_benchmark_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""AsyncBenchmarkRun resource class for asynchronous operations."""

from __future__ import annotations

from typing import List
from typing_extensions import Unpack, override

from ..types import BenchmarkRunView
from ._types import BaseRequestOptions, LongRequestOptions, SDKBenchmarkRunListScenarioRunsParams
from .._client import AsyncRunloop
from .async_scenario_run import AsyncScenarioRun


class AsyncBenchmarkRun:
"""A benchmark run for evaluating agent performance across scenarios (async).

Provides async methods for monitoring run status, managing the run lifecycle,
and accessing scenario run results. Obtain instances via
``benchmark.run()`` or ``benchmark.list_runs()``.

Example:
>>> benchmark = runloop.benchmark.from_id("bench-xxx")
>>> run = await benchmark.run(run_name="evaluation-v1")
>>> info = await run.get_info()
>>> scenario_runs = await run.list_scenario_runs()
"""

def __init__(self, client: AsyncRunloop, run_id: str, benchmark_id: str) -> None:
"""Create an AsyncBenchmarkRun instance.

:param client: AsyncRunloop client instance
:type client: AsyncRunloop
:param run_id: Benchmark run ID
:type run_id: str
:param benchmark_id: Parent benchmark ID
:type benchmark_id: str
"""
self._client = client
self._id = run_id
self._benchmark_id = benchmark_id

@override
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you want @override without a base class

def __repr__(self) -> str:
return f"<AsyncBenchmarkRun id={self._id!r}>"

@property
def id(self) -> str:
"""Return the benchmark run ID.

:return: Unique benchmark run ID
:rtype: str
"""
return self._id

@property
def benchmark_id(self) -> str:
"""Return the parent benchmark ID.

:return: Parent benchmark ID
:rtype: str
"""
return self._benchmark_id

async def get_info(
self,
**options: Unpack[BaseRequestOptions],
) -> BenchmarkRunView:
Comment on lines +64 to +67
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we call this something else instead? get_state or refresh?

"""Retrieve current benchmark run status and metadata.

:param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options
:return: Current benchmark run state info
:rtype: BenchmarkRunView
"""
return await self._client.benchmarks.runs.retrieve(
self._id,
**options,
)

async def cancel(
self,
**options: Unpack[LongRequestOptions],
) -> BenchmarkRunView:
"""Cancel the benchmark run.

Stops all running scenarios and marks the run as canceled.

:param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
:return: Updated benchmark run state
:rtype: BenchmarkRunView
"""
return await self._client.benchmarks.runs.cancel(
self._id,
**options,
)

async def complete(
self,
**options: Unpack[LongRequestOptions],
) -> BenchmarkRunView:
"""Complete the benchmark run.

Marks the run as completed. Call this after all scenarios have finished.

:param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
:return: Completed benchmark run state
:rtype: BenchmarkRunView
"""
return await self._client.benchmarks.runs.complete(
self._id,
**options,
)

async def list_scenario_runs(
self,
**params: Unpack[SDKBenchmarkRunListScenarioRunsParams],
) -> List[AsyncScenarioRun]:
"""List all scenario runs for this benchmark run.

:param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkRunListScenarioRunsParams` for available parameters
:return: List of async scenario run objects
:rtype: List[AsyncScenarioRun]
"""
page = await self._client.benchmarks.runs.list_scenario_runs(
self._id,
**params,
)
return [AsyncScenarioRun(self._client, run.id, run.devbox_id) for run in page.runs]
127 changes: 127 additions & 0 deletions src/runloop_api_client/sdk/benchmark_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""BenchmarkRun resource class for synchronous operations."""

from __future__ import annotations

from typing import List
from typing_extensions import Unpack, override

from ..types import BenchmarkRunView
from ._types import BaseRequestOptions, LongRequestOptions, SDKBenchmarkRunListScenarioRunsParams
from .._client import Runloop
from .scenario_run import ScenarioRun


class BenchmarkRun:
"""A benchmark run for evaluating agent performance across scenarios.

Provides methods for monitoring run status, managing the run lifecycle,
and accessing scenario run results. Obtain instances via
``benchmark.run()`` or ``benchmark.list_runs()``.

Example:
>>> benchmark = runloop.benchmark.from_id("bench-xxx")
>>> run = benchmark.run(run_name="evaluation-v1")
>>> info = run.get_info()
>>> scenario_runs = run.list_scenario_runs()
"""

def __init__(self, client: Runloop, run_id: str, benchmark_id: str) -> None:
"""Create a BenchmarkRun instance.

:param client: Runloop client instance
:type client: Runloop
:param run_id: Benchmark run ID
:type run_id: str
:param benchmark_id: Parent benchmark ID
:type benchmark_id: str
"""
self._client = client
self._id = run_id
self._benchmark_id = benchmark_id

@override
def __repr__(self) -> str:
return f"<BenchmarkRun id={self._id!r}>"
Comment on lines +42 to +44
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same thing here


@property
def id(self) -> str:
"""Return the benchmark run ID.

:return: Unique benchmark run ID
:rtype: str
"""
return self._id

@property
def benchmark_id(self) -> str:
"""Return the parent benchmark ID.

:return: Parent benchmark ID
:rtype: str
"""
return self._benchmark_id

def get_info(
self,
**options: Unpack[BaseRequestOptions],
) -> BenchmarkRunView:
"""Retrieve current benchmark run status and metadata.

:param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options
:return: Current benchmark run state info
:rtype: BenchmarkRunView
"""
return self._client.benchmarks.runs.retrieve(
self._id,
**options,
)

def cancel(
self,
**options: Unpack[LongRequestOptions],
) -> BenchmarkRunView:
"""Cancel the benchmark run.

Stops all running scenarios and marks the run as canceled.

:param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
:return: Updated benchmark run state
:rtype: BenchmarkRunView
"""
return self._client.benchmarks.runs.cancel(
self._id,
**options,
)

def complete(
self,
**options: Unpack[LongRequestOptions],
) -> BenchmarkRunView:
"""Complete the benchmark run.

Marks the run as completed. Call this after all scenarios have finished.

:param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
:return: Completed benchmark run state
:rtype: BenchmarkRunView
"""
return self._client.benchmarks.runs.complete(
self._id,
**options,
)

def list_scenario_runs(
self,
**params: Unpack[SDKBenchmarkRunListScenarioRunsParams],
) -> List[ScenarioRun]:
"""List all scenario runs for this benchmark run.

:param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkRunListScenarioRunsParams` for available parameters
:return: List of scenario run objects
:rtype: List[ScenarioRun]
"""
page = self._client.benchmarks.runs.list_scenario_runs(
self._id,
**params,
)
return [ScenarioRun(self._client, run.id, run.devbox_id) for run in page.runs]
30 changes: 30 additions & 0 deletions tests/sdk/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,30 @@ class MockScenarioRunView:
scoring_contract_result: object = None


@dataclass
class MockBenchmarkRunView:
    """Test stand-in for ``BenchmarkRunView``.

    Every field has a default, so an instance can be built with no
    arguments and individual fields overridden by keyword.
    """

    # Identifiers of the run and of the benchmark it belongs to.
    id: str = "bench_run_123"
    benchmark_id: str = "bench_123"
    state: str = "running"
    # default_factory gives each instance its own dict rather than a shared one.
    metadata: Dict[str, str] = field(default_factory=dict)
    start_time_ms: int = 1234567890000
    # Both default to None.
    duration_ms: int | None = None
    score: float | None = None


class AsyncIterableMock:
    """Minimal async iterable for tests that consume paginated responses.

    Wraps a list and yields its elements, in order, under ``async for``.
    """

    def __init__(self, items: list[Any]) -> None:
        """Store the list to iterate; it is kept by reference, not copied."""
        self._items = items

    async def __aiter__(self):
        """Yield each stored element in list order."""
        # An ``async def`` containing ``yield`` is an async generator, so
        # calling it hands ``async for`` the async iterator it expects.
        for entry in self._items:
            yield entry


def create_mock_httpx_client(methods: dict[str, Any] | None = None) -> AsyncMock:
"""
Create a mock httpx.AsyncClient with proper context manager setup.
Expand Down Expand Up @@ -237,6 +261,12 @@ def scenario_run_view() -> MockScenarioRunView:
return MockScenarioRunView()


@pytest.fixture
def benchmark_run_view() -> MockBenchmarkRunView:
    """Provide a ``MockBenchmarkRunView`` built entirely from its defaults."""
    default_view = MockBenchmarkRunView()
    return default_view


@pytest.fixture
def mock_httpx_response() -> Mock:
"""Create a mock httpx.Response."""
Expand Down
Loading