Skip to content
8 changes: 8 additions & 0 deletions src/runloop_api_client/sdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,24 @@
from .scorer import Scorer
from .scenario import Scenario
from .snapshot import Snapshot
from .benchmark import Benchmark
from .blueprint import Blueprint
from .execution import Execution
from .async_agent import AsyncAgent
from .async_devbox import AsyncDevbox, AsyncNamedShell
from .async_scorer import AsyncScorer
from .scenario_run import ScenarioRun
from .benchmark_run import BenchmarkRun
from .async_scenario import AsyncScenario
from .async_snapshot import AsyncSnapshot
from .storage_object import StorageObject
from .async_benchmark import AsyncBenchmark
from .async_blueprint import AsyncBlueprint
from .async_execution import AsyncExecution
from .execution_result import ExecutionResult
from .scenario_builder import ScenarioBuilder
from .async_scenario_run import AsyncScenarioRun
from .async_benchmark_run import AsyncBenchmarkRun
from .async_storage_object import AsyncStorageObject
from .async_execution_result import AsyncExecutionResult
from .async_scenario_builder import AsyncScenarioBuilder
Expand Down Expand Up @@ -85,4 +89,8 @@
"AsyncStorageObject",
"NamedShell",
"AsyncNamedShell",
"BenchmarkRun",
"AsyncBenchmarkRun",
"Benchmark",
"AsyncBenchmark",
]
59 changes: 38 additions & 21 deletions src/runloop_api_client/sdk/_types.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,39 @@
from typing import Union, Callable, Optional
from typing_extensions import TypedDict

from ..types import (
InputContext,
ScenarioView,
AgentListParams,
DevboxListParams,
ObjectListParams,
AgentCreateParams,
DevboxCreateParams,
ObjectCreateParams,
ScenarioListParams,
BlueprintListParams,
ObjectDownloadParams,
ScenarioUpdateParams,
BenchmarkUpdateParams,
BlueprintCreateParams,
DevboxUploadFileParams,
DevboxCreateTunnelParams,
DevboxDownloadFileParams,
DevboxRemoveTunnelParams,
DevboxSnapshotDiskParams,
DevboxReadFileContentsParams,
DevboxWriteFileContentsParams,
)
from .._types import Body, Query, Headers, Timeout, NotGiven
from ..lib.polling import PollingConfig
from ..types.devboxes import DiskSnapshotListParams, DiskSnapshotUpdateParams
from ..types.scenarios import ScorerListParams, ScorerCreateParams, ScorerUpdateParams, ScorerValidateParams
from ..types.benchmarks import RunListScenarioRunsParams
from ..types.input_context import InputContext
from ..types.scenario_view import ScenarioView
from ..types.agent_list_params import AgentListParams
from ..types.devbox_list_params import DevboxListParams
from ..types.object_list_params import ObjectListParams
from ..types.agent_create_params import AgentCreateParams
from ..types.devbox_create_params import DevboxCreateParams, DevboxBaseCreateParams
from ..types.object_create_params import ObjectCreateParams
from ..types.scenario_list_params import ScenarioListParams
from ..types.blueprint_list_params import BlueprintListParams
from ..types.object_download_params import ObjectDownloadParams
from ..types.scenario_update_params import ScenarioUpdateParams
from ..types.blueprint_create_params import BlueprintCreateParams
from ..types.devbox_upload_file_params import DevboxUploadFileParams
from ..types.devbox_create_params import DevboxBaseCreateParams
from ..types.scenario_start_run_params import ScenarioStartRunBaseParams
from ..types.devbox_create_tunnel_params import DevboxCreateTunnelParams
from ..types.devbox_download_file_params import DevboxDownloadFileParams
from ..types.benchmark_start_run_params import BenchmarkSelfStartRunParams
from ..types.benchmarks.run_list_params import RunSelfListParams
from ..types.devbox_execute_async_params import DevboxNiceExecuteAsyncParams
from ..types.devbox_remove_tunnel_params import DevboxRemoveTunnelParams
from ..types.devbox_snapshot_disk_params import DevboxSnapshotDiskParams
from ..types.devbox_read_file_contents_params import DevboxReadFileContentsParams
from ..types.devbox_write_file_contents_params import DevboxWriteFileContentsParams

LogCallback = Callable[[str], None]

Expand Down Expand Up @@ -206,6 +212,17 @@ class ScenarioPreview(ScenarioView):
"""The input context for the Scenario."""


# Benchmark and Benchmark Run params
class SDKBenchmarkUpdateParams(BenchmarkUpdateParams, LongRequestOptions):
    """Keyword arguments for updating a benchmark.

    Merges the fields of ``BenchmarkUpdateParams`` with the per-request
    options declared by ``LongRequestOptions``; adds no fields of its own.
    """


class SDKBenchmarkStartRunParams(BenchmarkSelfStartRunParams, LongRequestOptions):
    """Keyword arguments for starting a benchmark run.

    Merges the fields of ``BenchmarkSelfStartRunParams`` with the per-request
    options declared by ``LongRequestOptions``; adds no fields of its own.
    """


class SDKBenchmarkListRunsParams(RunSelfListParams, BaseRequestOptions):
    """Keyword arguments for listing the runs of a benchmark.

    Merges the fields of ``RunSelfListParams`` with the per-request options
    declared by ``BaseRequestOptions``; adds no fields of its own.
    """


class SDKBenchmarkRunListScenarioRunsParams(RunListScenarioRunsParams, BaseRequestOptions):
    """Keyword arguments for listing the scenario runs of a benchmark run.

    Merges the fields of ``RunListScenarioRunsParams`` with the per-request
    options declared by ``BaseRequestOptions``; adds no fields of its own.
    """
164 changes: 164 additions & 0 deletions src/runloop_api_client/sdk/async_benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
"""AsyncBenchmark resource class for asynchronous operations."""

from __future__ import annotations

from typing import List
from typing_extensions import Unpack, override

from ..types import BenchmarkView
from ._types import (
BaseRequestOptions,
LongRequestOptions,
SDKBenchmarkUpdateParams,
SDKBenchmarkListRunsParams,
SDKBenchmarkStartRunParams,
)
from .._types import SequenceNotStr
from .._client import AsyncRunloop
from .async_benchmark_run import AsyncBenchmarkRun


class AsyncBenchmark:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's highlight that this is a handle to benchmark management operations, but that to understand what is in the benchmark, you need a BenchmarkView. This is somewhat stated here, but I think it would be helpful to be more explicit. What do you think of this?


A handle for managing a Runloop Benchmark.

This provides async methods for retrieving benchmark details....

... The [BenchmarkView](some link) object contains details about the contents of the benchmark. The info() call and various update methods all return the most recent benchmark state.

Or something like that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is true of all the classes we have so far: to understand what is actually in an object X, we have to call get_info() and look at the XView. Since BenchmarkView is listed as the return type of get_info() and update(), and is documented in the type reference, I think it's fine to leave as is.

"""A benchmark for evaluating agent performance across scenarios (async).

Provides async methods for retrieving benchmark details, updating the benchmark,
managing scenarios, and starting benchmark runs. Obtain instances via
``runloop.benchmark.from_id()`` or ``runloop.benchmark.list()``.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a way to create a link to the BenchmarkOps definitions here? That would make the resulting docs really easy to navigate. E.g., maybe something like this?

You obtain a benchmark with the [runloop.benchmark](some useful link) operations, such as runloop.benchmark.create() and runloop.benchmark.list().

Even better if we can link to the specific methods, but that is less critical IMO. (Just as long as we can get people close...)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do once I add the BenchmarkOps classes! The plan is to add them in a separate PR once this one is merged.


Example:
>>> benchmark = runloop.benchmark.from_id("bmd_xxx")
>>> info = await benchmark.get_info()
>>> run = await benchmark.start_run(run_name="evaluation-v1")
>>> for scenario_id in info.scenario_ids:
... scenario = runloop.scenario.from_id(scenario_id)
... scenario_run = await scenario.run(benchmark_run_id=run.id, run_name="evaluation-v1")
"""

def __init__(self, client: AsyncRunloop, benchmark_id: str) -> None:
"""Create an AsyncBenchmark instance.

:param client: AsyncRunloop client instance
:type client: AsyncRunloop
:param benchmark_id: Benchmark ID
:type benchmark_id: str
"""
self._client = client
self._id = benchmark_id

@override
def __repr__(self) -> str:
return f"<AsyncBenchmark id={self._id!r}>"

@property
def id(self) -> str:
"""Return the benchmark ID.

:return: Unique benchmark ID
:rtype: str
"""
return self._id

async def get_info(
    self,
    **options: Unpack[BaseRequestOptions],
) -> BenchmarkView:
    """Fetch the benchmark's current details from the API.

    :param options: See :typeddict:`~runloop_api_client.sdk._types.BaseRequestOptions` for available options
    :return: Current benchmark info
    :rtype: BenchmarkView
    """
    benchmarks = self._client.benchmarks
    view = await benchmarks.retrieve(self._id, **options)
    return view

async def update(
    self,
    **params: Unpack[SDKBenchmarkUpdateParams],
) -> BenchmarkView:
    """Apply changes to the benchmark.

    Fields that are not supplied are left unchanged.

    :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkUpdateParams` for available parameters
    :return: Updated benchmark info
    :rtype: BenchmarkView
    """
    benchmarks = self._client.benchmarks
    updated_view = await benchmarks.update(self._id, **params)
    return updated_view

async def start_run(
    self,
    **params: Unpack[SDKBenchmarkStartRunParams],
) -> AsyncBenchmarkRun:
    """Start a new run of this benchmark.

    Asks the API to create the run, then wraps the returned run's ID and
    benchmark ID in an AsyncBenchmarkRun bound to the same client.

    :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkStartRunParams` for available parameters
    :return: AsyncBenchmarkRun instance for managing the run
    :rtype: AsyncBenchmarkRun
    """
    client = self._client
    created = await client.benchmarks.start_run(benchmark_id=self._id, **params)
    return AsyncBenchmarkRun(client, created.id, created.benchmark_id)

async def add_scenarios(
    self,
    scenario_ids: SequenceNotStr[str],
    **options: Unpack[LongRequestOptions],
) -> BenchmarkView:
    """Attach scenarios to the benchmark.

    The IDs are forwarded as ``scenarios_to_add`` to the benchmark's
    ``update_scenarios`` endpoint.

    :param scenario_ids: List of scenario IDs to add
    :type scenario_ids: SequenceNotStr[str]
    :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
    :return: Updated benchmark info
    :rtype: BenchmarkView
    """
    benchmarks = self._client.benchmarks
    updated_view = await benchmarks.update_scenarios(
        self._id,
        scenarios_to_add=scenario_ids,
        **options,
    )
    return updated_view

async def remove_scenarios(
    self,
    scenario_ids: SequenceNotStr[str],
    **options: Unpack[LongRequestOptions],
) -> BenchmarkView:
    """Detach scenarios from the benchmark.

    The IDs are forwarded as ``scenarios_to_remove`` to the benchmark's
    ``update_scenarios`` endpoint.

    :param scenario_ids: List of scenario IDs to remove
    :type scenario_ids: SequenceNotStr[str]
    :param options: See :typeddict:`~runloop_api_client.sdk._types.LongRequestOptions` for available options
    :return: Updated benchmark info
    :rtype: BenchmarkView
    """
    benchmarks = self._client.benchmarks
    updated_view = await benchmarks.update_scenarios(
        self._id,
        scenarios_to_remove=scenario_ids,
        **options,
    )
    return updated_view

async def list_runs(
    self,
    **params: Unpack[SDKBenchmarkListRunsParams],
) -> List[AsyncBenchmarkRun]:
    """List runs of this benchmark.

    Issues a single list request filtered by this benchmark's ID and wraps
    each run on the returned page in an AsyncBenchmarkRun. Only the runs
    present on that page are returned.

    :param params: See :typeddict:`~runloop_api_client.sdk._types.SDKBenchmarkListRunsParams` for available parameters
    :return: List of async benchmark runs
    :rtype: List[AsyncBenchmarkRun]
    """
    client = self._client
    page = await client.benchmarks.runs.list(benchmark_id=self._id, **params)
    handles: List[AsyncBenchmarkRun] = []
    for run in page.runs:
        handles.append(AsyncBenchmarkRun(client, run.id, run.benchmark_id))
    return handles
4 changes: 2 additions & 2 deletions src/runloop_api_client/sdk/async_benchmark_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@ class AsyncBenchmarkRun:

Provides async methods for monitoring run status, managing the run lifecycle,
and accessing scenario run results. Obtain instances via
``benchmark.run()`` or ``benchmark.list_runs()``.
``benchmark.start_run()`` or ``benchmark.list_runs()``.

Example:
>>> benchmark = runloop.benchmark.from_id("bench-xxx")
>>> run = await benchmark.run(run_name="evaluation-v1")
>>> run = await benchmark.start_run(run_name="evaluation-v1")
>>> info = await run.get_info()
>>> scenario_runs = await run.list_scenario_runs()
"""
Expand Down
Loading
Loading