# types.py — 97 lines (78 loc) · 4.32 KB
from typing import Callable, Literal, Optional, Sequence, TypedDict, Union
from pydantic import BaseModel
from typing_extensions import NotRequired
from humanloop.requests import CodeEvaluatorRequestParams as CodeEvaluatorDict
from humanloop.requests import CreateDatapointRequestParams as DatapointDict
from humanloop.requests import ExternalEvaluatorRequestParams as ExternalEvaluator
# We use TypedDicts for requests, which is consistent with the rest of the SDK
from humanloop.requests import FlowKernelRequestParams as FlowDict
from humanloop.requests import HumanEvaluatorRequestParams as HumanEvaluatorDict
from humanloop.requests import LlmEvaluatorRequestParams as LLMEvaluatorDict
from humanloop.requests import PromptKernelRequestParams as PromptDict
from humanloop.requests import ToolKernelRequestParams as ToolDict
from humanloop.types import (
EvaluatorArgumentsType,
EvaluatorReturnTypeEnum,
)
# Responses are Pydantic models and we leverage them for improved request validation
from humanloop.types import (
UpdateDatesetAction as UpdateDatasetAction,
) # TODO: fix original type typo
# Type aliases over the SDK's request TypedDicts.
# EvaluatorDict: any of the four Evaluator request flavours (code, LLM, human, external).
EvaluatorDict = Union[CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator]
# Version: the kernel/request params that uniquely define a version of a File.
Version = Union[FlowDict, PromptDict, ToolDict, EvaluatorDict]
# FileType: the kinds of File recognised by the evaluation utilities.
FileType = Literal["flow", "prompt", "tool", "evaluator"]
class Identifiers(TypedDict):
    """Common identifiers for the objects required to run an Evaluation.

    Every key is optional (``NotRequired``); presumably at least one of
    ``id`` or ``path`` is needed to resolve the File on Humanloop —
    TODO confirm against the callers of this type.
    """

    id: NotRequired[str]
    """The ID of the File on Humanloop."""
    path: NotRequired[str]
    """The path of the File on Humanloop."""
    version_id: NotRequired[str]
    """The ID of the version of the File on Humanloop."""
    environment: NotRequired[str]
    """The environment of the File on Humanloop."""
class FileEvalConfig(Identifiers):
    """A File on Humanloop (Flow, Prompt, Tool, Evaluator).

    Extends ``Identifiers``, so the File may also be addressed by ``id``,
    ``path``, ``version_id`` or ``environment``.
    """

    # NOTE(review): the original Literal was ("flow", "prompt", "agent"),
    # which contradicted both the `FileType` alias and the `Version` union
    # (both include "tool" and "evaluator") as well as this class's own
    # docstring. Broadened to the union of both sets — backward-compatible,
    # since every previously accepted value is still accepted.
    type: Literal["flow", "prompt", "tool", "evaluator", "agent"]
    """The type of File this callable relates to on Humanloop."""
    version: NotRequired[Version]
    """The contents uniquely define the version of the File on Humanloop."""
    callable: NotRequired[Callable]
    """The function being evaluated.

    It will be called using your Dataset `inputs` as follows: `output = callable(**datapoint.inputs)`.
    If `messages` are defined in your Dataset, then `output = callable(**datapoint.inputs, messages=datapoint.messages)`.
    It should return a string or json serializable output.
    """
class DatasetEvalConfig(Identifiers):
    """The Dataset to map the evaluated function over.

    Extends ``Identifiers``, so the Dataset may be referenced by ``id``/``path``
    and/or populated inline via ``datapoints``.
    """

    datapoints: NotRequired[Sequence[DatapointDict]]
    """The datapoints to map your function over to produce the outputs required by the evaluation."""
    action: NotRequired[UpdateDatasetAction]
    """How to update the Dataset given the provided Datapoints;
    `set` replaces the existing Datapoints and `add` appends to the existing Datapoints."""
class EvaluatorEvalConfig(Identifiers):
    """The Evaluator to provide judgments for this Evaluation.

    Extends ``Identifiers``. The ``args_type``/``return_type``/``callable``
    keys are only required for local (client-side) Evaluators; a purely
    Humanloop-hosted Evaluator is addressed via the inherited identifiers.
    """

    args_type: NotRequired[EvaluatorArgumentsType]
    """The type of arguments the Evaluator expects - only required for local Evaluators."""
    return_type: NotRequired[EvaluatorReturnTypeEnum]
    """The type of return value the Evaluator produces - only required for local Evaluators."""
    callable: NotRequired[Callable]
    """The function to run on the logs to produce the judgment - only required for local Evaluators."""
    threshold: NotRequired[float]
    """The threshold to check the Evaluator against. If the aggregate value of the Evaluator is below this threshold, the check will fail."""
class EvaluatorCheck(BaseModel):
    """Summary data for an Evaluator check.

    Pydantic model (validated response-side data, per the import comments at
    the top of the file) reporting how the latest version of the evaluated
    function scored against one Evaluator.
    """

    path: str
    """The path of the Evaluator used in the check."""
    # TODO: Add number valence and improvement check
    # improvement_check: bool
    # """Whether the latest version of your function has improved across the Dataset for a specific Evaluator."""
    score: float
    """The score of the latest version of your function for a specific Evaluator."""
    delta: float
    """The change in score since the previous version of your function for a specific Evaluator."""
    threshold: Optional[float]
    """The threshold to check the Evaluator against."""
    threshold_check: Optional[bool]
    """Whether the latest version has an average Evaluator result above a threshold."""
    evaluation_id: str
    """The ID of the corresponding Evaluation."""