-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy patheval_utils.py
More file actions
594 lines (520 loc) · 23.6 KB
/
eval_utils.py
File metadata and controls
594 lines (520 loc) · 23.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
"""
Evaluation utils for the Humanloop SDK.
This module provides a set of utilities to aid running Eval workflows on Humanloop
where you are managing the runtime of your application in your code.
Functions in this module should be accessed via the Humanloop client. They should
not be called directly.
"""
import logging
from datetime import datetime
from functools import partial
import inspect
from logging import INFO
from pydantic import BaseModel, ValidationError
from typing import Callable, Sequence, Literal, Union, Optional, List, Dict, Tuple
from typing_extensions import NotRequired, TypedDict
import time
import sys
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from .client import BaseHumanloop
from .core.api_error import ApiError
# We use TypedDicts for requests, which is consistent with the rest of the SDK
from .requests import FlowKernelRequestParams as FlowDict
from .requests import PromptKernelRequestParams as PromptDict
from .requests import ToolKernelRequestParams as ToolDict
from .requests import CreateDatapointRequestParams as DatapointDict
from .requests import ExternalEvaluatorRequestParams as ExternalEvaluator
from .requests import CodeEvaluatorRequestParams as CodeEvaluatorDict
from .requests import LlmEvaluatorRequestParams as LLMEvaluatorDict
from .requests import HumanEvaluatorRequestParams as HumanEvaluatorDict
# Responses are Pydantic models and we leverage them for improved request validation
from .types import FlowKernelRequest as Flow
from .types import PromptKernelRequest as Prompt
from .types import ToolKernelRequest as Tool
from .types import BooleanEvaluatorStatsResponse as BooleanStats
from .types import NumericEvaluatorStatsResponse as NumericStats
from .types import UpdateDatesetAction as UpdateDatasetAction # TODO: fix original type typo
from .types import DatapointResponse as Datapoint
from .types import (
EvaluationStats,
VersionStatsResponse,
EvaluatorArgumentsType,
EvaluatorReturnTypeEnum,
EvaluationResponse,
)
# Setup logging: emit bare messages (no level/name prefix) to the console at INFO level.
logger = logging.getLogger(__name__)
logger.setLevel(INFO)  # fixed: the level was previously set twice (redundant duplicate call)
console_handler = logging.StreamHandler()
formatter = logging.Formatter("%(message)s")
console_handler.setFormatter(formatter)
# Only attach our handler if no handler is already reachable, to avoid duplicate log lines
# when the host application has configured logging itself.
if not logger.hasHandlers():
    logger.addHandler(console_handler)
# Request params for any kind of Evaluator (code, LLM, human, or external/local).
EvaluatorDict = Union[CodeEvaluatorDict, LLMEvaluatorDict, HumanEvaluatorDict, ExternalEvaluator]
# Request params that uniquely define a version of any supported File kind.
Version = Union[FlowDict, PromptDict, ToolDict, EvaluatorDict]
# The kinds of File that can be evaluated via this module.
FileType = Literal["flow", "prompt", "tool", "evaluator"]

# ANSI escape codes for logging colors
YELLOW = "\033[93m"
CYAN = "\033[96m"
GREEN = "\033[92m"
RED = "\033[91m"
RESET = "\033[0m"
class Identifiers(TypedDict):
    """Common identifiers for the objects required to run an Evaluation.

    At least one of `id` or `path` should be provided so the File can be
    located on Humanloop (see the check in `_run_eval`).
    """

    id: NotRequired[str]
    """The ID of the File on Humanloop."""
    path: NotRequired[str]
    """The path of the File on Humanloop."""
class File(Identifiers):
    """A File on Humanloop (Flow, Prompt, Tool, Evaluator)."""

    type: NotRequired[FileType]
    """The type of File this function relates to on Humanloop."""
    version: NotRequired[Version]
    """The contents uniquely define the version of the File on Humanloop."""
    callable: Callable
    """The function being evaluated.
    It will be called using your Dataset `inputs` as follows: `output = callable(**datapoint.inputs)`.
    If `messages` are defined in your Dataset, then `output = callable(**datapoint.inputs, messages=datapoint.messages)`.
    It should return a single string output. If not, you must provide a `custom_logger`.
    """
    custom_logger: NotRequired[Callable]
    """Optional function that logs the output of your function to Humanloop, replacing the default logging.
    If provided, it will be called as follows:
    ```
    output = callable(**datapoint.inputs)
    log = custom_logger(client, output)
    ```
    Inside the custom_logger, you can use the Humanloop `client` to log the output of your function.
    If not provided your pipeline must return a single string.
    """
class Dataset(Identifiers):
    """The Dataset to map your function over to produce Logs for the Evaluation."""

    datapoints: Sequence[DatapointDict]
    """The datapoints to map your function over to produce the outputs required by the evaluation."""
    action: NotRequired[UpdateDatasetAction]
    """How to update the Dataset given the provided Datapoints;
    `set` replaces the existing Datapoints and `add` appends to the existing Datapoints."""
class Evaluator(Identifiers):
    """The Evaluator to provide judgments for this Evaluation."""

    args_type: NotRequired[EvaluatorArgumentsType]
    """The type of arguments the Evaluator expects - only required for local Evaluators."""
    return_type: NotRequired[EvaluatorReturnTypeEnum]
    """The type of return value the Evaluator produces - only required for local Evaluators."""
    callable: NotRequired[Callable]
    """The function to run on the logs to produce the judgment - only required for local Evaluators."""
    custom_logger: NotRequired[Callable]
    """Optional function that logs the output judgment from your Evaluator to Humanloop. If provided, it will be called as follows:
    ```
    judgment = callable(log_dict)
    log = custom_logger(client, judgment)
    ```
    Inside the custom_logger, you can use the Humanloop `client` to log the judgment to Humanloop.
    If not provided your function must return a single string and by default the code will be used to inform the version of the external Evaluator on Humanloop.
    """
    threshold: NotRequired[float]
    """The threshold to check the Evaluator against. If the aggregate value of the Evaluator is below this threshold, the check will fail."""
class EvaluatorCheck(BaseModel):
    """Summary data for a single Evaluator's check, returned by `_run_eval`."""

    path: str
    """The path of the Evaluator used in the check."""
    improvement_check: bool
    """Whether the latest version of your function has improved across the Dataset for a specific Evaluator."""
    score: float
    """The score of the latest version of your function for a specific Evaluator."""
    delta: float
    """The change in score since the previous version of your function for a specific Evaluator."""
    threshold: Optional[float]
    """The threshold to check the Evaluator against."""
    threshold_check: Optional[bool]
    """Whether the latest version has an average Evaluator result above a threshold."""
def _run_eval(
    client: BaseHumanloop,
    file: File,
    name: Optional[str],
    dataset: Dataset,
    evaluators: Optional[Sequence[Evaluator]] = None,
    # logs: typing.Sequence[dict] | None = None,
    workers: int = 4,
) -> List[EvaluatorCheck]:
    """
    Evaluate your function for a given `Dataset` and set of `Evaluators`.

    :param client: the Humanloop API client.
    :param file: the Humanloop file being evaluated, including a function to run over the dataset.
    :param name: the name of the Evaluation to run. If it does not exist, a new Evaluation will be created under your File.
    :param dataset: the dataset to map your function over to produce the outputs required by the Evaluation.
    :param evaluators: define how judgments are provided for this Evaluation.
    :param workers: the number of threads to process datapoints using your function concurrently.
    :return: per Evaluator checks.
    :raises ValueError: if neither `path` nor `id` is set on `file`, if a Flow has no
        `callable`, if required Datapoint targets are missing, or if the Evaluation
        cannot be found/created.
    :raises NotImplementedError: if `file["type"]` is not a supported File type.
    """
    # Fixed: treat a missing Evaluator collection as empty so iteration below is always
    # safe (previously `evaluators=None` crashed when building the create request).
    evaluators = list(evaluators or [])

    # Get or create the file on Humanloop
    version = file.pop("version", {})

    # Raise error if one of path or id not provided
    if not file.get("path") and not file.get("id"):
        raise ValueError("You must provide a path or id in your `file`.")

    # Determine the `type` of the `file` to Evaluate - if no `type` provided, default to `flow`
    try:
        type_ = file.pop("type")
        logger.info(
            f"{CYAN}Evaluating your {type_} function corresponding to `{file['path']}` on Humanloop{RESET} \n\n"
        )
    except KeyError:
        type_ = "flow"
        logger.warning("No `file` type specified, defaulting to flow.")

    # If a `callable` is provided, Logs will be generated locally, otherwise Logs will be generated on Humanloop.
    function_ = None
    try:
        function_ = file.pop("callable")
    except KeyError:
        if type_ == "flow":
            raise ValueError("You must provide a `callable` for your Flow `file` to run a local eval.")
        else:
            logger.info(f"No `callable` provided for your {type_} file - will attempt to generate logs on Humanloop.")

    custom_logger = file.pop("custom_logger", None)
    file_dict = {**file, **version}

    # Upsert the File to pin the version being evaluated.
    if type_ == "flow":
        # Be more lenient with Flow versions as they are arbitrary json
        try:
            Flow.parse_obj(version)
        except ValidationError:
            flow_version = {"attributes": version}
            file_dict = {**file, **flow_version}
        hl_file = client.flows.upsert(**file_dict)
    elif type_ == "prompt":
        try:
            _ = Prompt.parse_obj(version)
        except ValidationError as error_:
            # Fixed: include the validation details (the original message dropped them).
            logger.error(msg=f"Invalid Prompt `version` in your `file` request. \n\nValidation error: \n{error_}")
            raise error_
        hl_file = client.prompts.upsert(**file_dict)
    elif type_ == "tool":
        try:
            _ = Tool.parse_obj(version)
        except ValidationError as error_:
            # Fixed: include the validation details (the original message dropped them).
            logger.error(msg=f"Invalid Tool `version` in your `file` request. \n\nValidation error: \n{error_}")
            raise error_
        hl_file = client.tools.upsert(**file_dict)
    elif type_ == "evaluator":
        hl_file = client.evaluators.upsert(**file_dict)
    else:
        raise NotImplementedError(f"Unsupported File type: {type_}")

    # Upsert the Dataset, then fetch it back with its Datapoints populated.
    hl_dataset = client.datasets.upsert(**dataset)
    hl_dataset = client.datasets.get(id=hl_dataset.id, include_datapoints=True)

    # Upsert the local Evaluators; other Evaluators are just referenced by `path` or `id`
    local_evaluators: List[Evaluator] = []
    for evaluator in evaluators:
        # If a callable is provided for an Evaluator, we treat it as External
        eval_function = evaluator.get("callable")
        if eval_function is not None:
            # TODO: support the case where `file` logs generated on Humanloop but Evaluator logs generated locally
            if function_ is None:
                raise ValueError(
                    f"Local Evaluators are only supported when generating Logs locally using your {type_}'s `callable`. Please provide a `callable` for your file in order to run Evaluators locally."
                )
            local_evaluators.append(evaluator)
            # The Evaluator's source code becomes the version of the external Evaluator.
            spec = ExternalEvaluator(
                arguments_type=evaluator["args_type"],
                return_type=evaluator["return_type"],
                attributes={"code": inspect.getsource(eval_function)},
                evaluator_type="external",
            )
            _ = client.evaluators.upsert(id=evaluator.get("id"), path=evaluator.get("path"), spec=spec)

    # Validate upfront that the local Evaluators and Dataset fit
    target_required_evaluator = next(
        (e for e in local_evaluators if e["args_type"] == "target_required"), None
    )
    if target_required_evaluator is not None:
        missing_target = sum(1 for datapoint in hl_dataset.datapoints if not datapoint.target)
        if missing_target > 0:
            raise ValueError(
                f"{missing_target} Datapoints have no target. A target is required for the Evaluator: {target_required_evaluator['path']}"
            )

    # Get or create the Evaluation based on the name
    evaluation = None
    try:
        evaluation = client.evaluations.create(
            name=name,
            dataset={"file_id": hl_dataset.id},
            evaluators=[{"path": e["path"]} for e in evaluators],
            file={"id": hl_file.id},
        )
    except ApiError as error_:
        # If the name exists, go and get it # TODO: Update API GET to allow querying by name and file.
        if error_.status_code == 409:
            evals = client.evaluations.list(file_id=hl_file.id, size=50)
            for page in evals.iter_pages():
                evaluation = next((e for e in page.items if e.name == name), None)
        else:
            raise error_
    if not evaluation:
        raise ValueError(f"Evaluation with name {name} not found.")

    # Every run will generate a new batch of logs
    batch_id = uuid.uuid4().hex[:10]  # ignore risk of collision
    log_func = _get_log_func(
        client=client,
        type_=type_,
        file_id=hl_file.id,
        version_id=hl_file.version_id,
        evaluation_id=evaluation.id,
        batch_id=batch_id,
    )

    # Define the function to execute your function in parallel and Log to Humanloop
    def process_datapoint(datapoint: Datapoint):
        """Run the user's callable on one Datapoint, log the output (or error), then apply local Evaluators."""
        start_time = datetime.now()
        datapoint_dict = datapoint.dict()
        try:
            if "messages" in datapoint_dict:
                output = function_(**datapoint_dict["inputs"], messages=datapoint_dict["messages"])
            else:
                output = function_(**datapoint_dict["inputs"])
            if custom_logger:
                # Fixed: previously the user's `callable` was invoked here again
                # instead of the provided `custom_logger`.
                log = custom_logger(client=client, output=output)
            else:
                if not isinstance(output, str):
                    raise ValueError(
                        f"Your {type_}'s `callable` must return a string if you do not provide a custom logger."
                    )
                log = log_func(
                    inputs=datapoint.inputs,
                    output=output,
                    source_datapoint_id=datapoint.id,
                    start_time=start_time,
                    end_time=datetime.now(),
                )
        except Exception as e:
            # Record the failure as an errored Log so it is visible on Humanloop.
            log = log_func(
                inputs=datapoint.inputs,
                error=str(e),
                source_datapoint_id=datapoint.id,
                start_time=start_time,
                end_time=datetime.now(),
            )
            logger.warning(msg=f"\nYour {type_}'s `callable` failed for Datapoint: {datapoint.id}. \n Error: {str(e)}")

        # Apply local Evaluators
        for local_evaluator in local_evaluators:
            try:
                start_time = datetime.now()
                eval_function = local_evaluator["callable"]
                if local_evaluator["args_type"] == "target_required":
                    judgment = eval_function(log.dict(), datapoint_dict["target"])
                else:
                    judgment = eval_function(log.dict())
                if local_evaluator.get("custom_logger", None):
                    local_evaluator["custom_logger"](client=client, judgment=judgment)
                else:
                    # The API call will validate the judgment
                    _ = client.evaluators.log(
                        parent_id=log.id,
                        id=local_evaluator.get("id"),
                        path=local_evaluator.get("path"),
                        judgment=judgment,
                        start_time=start_time,
                        end_time=datetime.now(),
                    )
            except Exception as e:
                # Record the Evaluator failure as an errored Evaluator Log.
                _ = client.evaluators.log(
                    parent_id=log.id,
                    path=local_evaluator.get("path"),
                    id=local_evaluator.get("id"),
                    error=str(e),
                    start_time=start_time,
                    end_time=datetime.now(),
                )
                logger.warning(f"\nEvaluator {local_evaluator['path']} failed with error {str(e)}")

    # Execute the function and send the logs to Humanloop in parallel
    total_datapoints = len(hl_dataset.datapoints)
    logger.info(f"\n{CYAN}Navigate to your evals:{RESET}\n{evaluation.url}\n")
    logger.info(f"{CYAN}{type_.capitalize()} Version ID: {hl_file.version_id}{RESET}")
    logger.info(f"{CYAN}Run ID: {batch_id}{RESET}")

    # Generate locally if a file `callable` is provided
    if function_:
        logger.info(
            f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name} using {workers} workers{RESET} "
        )
        completed_tasks = 0
        with ThreadPoolExecutor(max_workers=workers) as executor:
            futures = [executor.submit(process_datapoint, datapoint) for datapoint in hl_dataset.datapoints]
            for _ in as_completed(futures):
                completed_tasks += 1
                _progress_bar(total_datapoints, completed_tasks)
    else:
        # TODO: trigger run when updated API is available
        logger.info(f"{CYAN}\nRunning {hl_file.name} over the Dataset {hl_dataset.name}{RESET}")

    # Wait for the Evaluation to complete then print the results
    complete = False
    stats = None
    while not complete:
        stats = client.evaluations.get_stats(id=evaluation.id)
        logger.info(f"\r{stats.progress}")
        complete = stats.status == "completed"
        if not complete:
            time.sleep(5)

    # Print Evaluation results
    logger.info(stats.report)

    checks: List[EvaluatorCheck] = []
    for evaluator in evaluators:
        improvement_check, score, delta = check_evaluation_improvement(
            evaluation=evaluation,
            stats=stats,
            evaluator_path=evaluator["path"],
            batch_id=batch_id,
        )
        threshold_check = None
        threshold = evaluator.get("threshold")
        if threshold is not None:
            threshold_check = check_evaluation_threshold(
                evaluation=evaluation,
                stats=stats,
                evaluator_path=evaluator["path"],
                threshold=threshold,
                batch_id=batch_id,
            )
        checks.append(
            EvaluatorCheck(
                path=evaluator["path"],
                improvement_check=improvement_check,
                score=score,
                delta=delta,
                threshold=threshold,
                threshold_check=threshold_check,
            )
        )
    return checks
def _get_log_func(
    client: BaseHumanloop,
    type_: FileType,
    file_id: str,
    version_id: str,
    evaluation_id: str,
    batch_id: str,
) -> Callable:
    """Returns the appropriate log function pre-filled with common parameters.

    :param client: the Humanloop API client.
    :param type_: the type of File the Logs belong to.
    :param file_id: the ID of the File on Humanloop.
    :param version_id: the ID of the File version being evaluated.
    :param evaluation_id: the ID of the Evaluation the Logs belong to.
    :param batch_id: groups the Logs produced by a single Evaluation run.
    :raises NotImplementedError: if `type_` is not a supported File type.
    """
    log_request = {
        # TODO: why does the Log `id` field refer to the file ID in the API?
        # Why are both `id` and `version_id` needed in the API?
        "id": file_id,
        "version_id": version_id,
        "evaluation_id": evaluation_id,
        "batch_id": batch_id,
    }
    if type_ == "flow":
        # Flow Logs are marked complete as no further trace events will follow.
        return partial(client.flows.log, **log_request, trace_status="complete")
    elif type_ == "prompt":
        return partial(client.prompts.log, **log_request)
    elif type_ == "evaluator":
        return partial(client.evaluators.log, **log_request)
    elif type_ == "tool":
        return partial(client.tools.log, **log_request)
    else:
        # Fixed: message previously said "Unsupported File version", which was
        # misleading since `type_` is the File type (matches the check in `_run_eval`).
        raise NotImplementedError(f"Unsupported File type: {type_}")
def get_score_from_evaluator_stat(stat: Union[NumericStats, BooleanStats]) -> Union[float, None]:
    """Extract a single aggregate score from an Evaluator's stats, if one is available.

    :param stat: the per-version stats of a single Evaluator.
    :return: the score rounded to 2 decimal places, or None when no score can be derived.
    """
    if isinstance(stat, BooleanStats):
        # Boolean Evaluators: fraction of Logs judged true (guard against zero Logs).
        return round(stat.num_true / stat.total_logs, 2) if stat.total_logs else None
    if isinstance(stat, NumericStats):
        # Numeric Evaluators: mean judgment.
        return round(stat.mean, 2)
    # Unknown stats type: no score available.
    return None
def _progress_bar(total: int, progress: int):
"""Simple progress bar for CLI with ETA."""
if total <= 0:
total = 1
if not hasattr(_progress_bar, "start_time"):
_progress_bar.start_time = time.time()
bar_length = 40
block = int(round(bar_length * progress / total))
bar = "#" * block + "-" * (bar_length - block)
percentage = (progress / total) * 100
elapsed_time = time.time() - _progress_bar.start_time
time_per_item = elapsed_time / progress if progress > 0 else 0
eta = (total - progress) * time_per_item
progress_display = f"\r[{bar}] {progress}/{total}"
progress_display += f" ({percentage:.2f}%)"
if progress < total:
progress_display += f" | ETA: {int(eta)}s"
else:
progress_display += " | DONE"
_progress_bar.start_time = None
sys.stderr.write(progress_display)
if progress >= total:
sys.stderr.write("\n")
def get_evaluator_stats_by_path(
    stat: VersionStatsResponse, evaluation: EvaluationResponse
) -> Dict[str, Union[NumericStats, BooleanStats]]:
    """Re-key a version's Evaluator stats by Evaluator path instead of version ID.

    :param stat: the stats for one version of the evaluated File.
    :param evaluation: the Evaluation, used to map Evaluator version IDs to paths.
    :return: mapping of Evaluator path to its stats for this version.
    """
    # TODO: Update the API so this is not necessary
    path_by_version_id = {}
    for hl_evaluator in evaluation.evaluators:
        path_by_version_id[hl_evaluator.version.version_id] = hl_evaluator.version.path
    stats_by_path = {}
    for evaluator_stat in stat.evaluator_version_stats:
        stats_by_path[path_by_version_id[evaluator_stat.evaluator_version_id]] = evaluator_stat
    return stats_by_path
def check_evaluation_threshold(
    evaluation: EvaluationResponse,
    stats: EvaluationStats,
    evaluator_path: str,
    threshold: float,
    batch_id: str,
) -> bool:
    """Checks if the latest version has an average Evaluator result above a threshold.

    :param evaluation: the Evaluation to look up Evaluator metadata on.
    :param stats: the stats for all versions in the Evaluation.
    :param evaluator_path: the path of the Evaluator to check.
    :param threshold: the minimum acceptable score.
    :param batch_id: identifies the latest run's version stats.
    :raises ValueError: if no stats exist for the batch, the Evaluator is missing
        from the stats, or the Evaluator has no score.
    """
    # TODO: Update the API so this is not necessary
    batch_stats = next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None)
    if batch_stats is None:
        # Fixed: previously None fell through to the helper and raised an opaque AttributeError.
        raise ValueError(f"No stats found for batch_id {batch_id}.")
    evaluator_stats_by_path = get_evaluator_stats_by_path(stat=batch_stats, evaluation=evaluation)
    if evaluator_path in evaluator_stats_by_path:
        evaluator_stat = evaluator_stats_by_path[evaluator_path]
        score = get_score_from_evaluator_stat(stat=evaluator_stat)
        if score is None:
            # Fixed: avoid a TypeError comparing None to the threshold when no Logs were judged.
            raise ValueError(f"Evaluator {evaluator_path} has no score to check against the threshold.")
        if score >= threshold:
            logger.info(
                f"{GREEN}✅ Latest eval [{score}] above threshold [{threshold}] for evaluator {evaluator_path}.{RESET}"
            )
            return True
        else:
            logger.info(
                f"{RED}❌ Latest score [{score}] below the threshold [{threshold}] for evaluator {evaluator_path}.{RESET}"
            )
            return False
    else:
        raise ValueError(f"Evaluator {evaluator_path} not found in the stats.")
def check_evaluation_improvement(
    evaluation: EvaluationResponse,
    evaluator_path: str,
    stats: EvaluationStats,
    batch_id: str,
) -> Tuple[bool, float, float]:
    """
    Check the latest version has improved for a specific Evaluator.

    :param evaluation: the Evaluation to look up Evaluator metadata on.
    :param evaluator_path: the path of the Evaluator to compare.
    :param stats: the stats for all versions in the Evaluation.
    :param batch_id: identifies the latest run's version stats.
    :returns: A tuple of (improvement, latest_score, delta since previous score)
    :raises ValueError: if stats for the batch or Evaluator cannot be found, or
        either version being compared has no score.
    """
    # TODO: Update the API so this is not necessary
    latest_stat = next((stat for stat in stats.version_stats if stat.batch_id == batch_id), None)
    if latest_stat is None:
        # Fixed: previously None fell through to the helper and raised an opaque AttributeError.
        raise ValueError(f"No stats found for batch_id {batch_id}.")
    latest_evaluator_stats_by_path = get_evaluator_stats_by_path(stat=latest_stat, evaluation=evaluation)
    if len(stats.version_stats) == 1:
        logger.info(f"{YELLOW}⚠️ No previous versions to compare with.{RESET}")
        return True, 0, 0
    previous_evaluator_stats_by_path = get_evaluator_stats_by_path(stat=stats.version_stats[-2], evaluation=evaluation)
    if evaluator_path in latest_evaluator_stats_by_path and evaluator_path in previous_evaluator_stats_by_path:
        latest_evaluator_stat = latest_evaluator_stats_by_path[evaluator_path]
        previous_evaluator_stat = previous_evaluator_stats_by_path[evaluator_path]
        latest_score = get_score_from_evaluator_stat(stat=latest_evaluator_stat)
        previous_score = get_score_from_evaluator_stat(stat=previous_evaluator_stat)
        if latest_score is None or previous_score is None:
            # Fixed: avoid a TypeError subtracting None when a version has no score yet.
            raise ValueError(f"Evaluator {evaluator_path} has no score for one of the versions being compared.")
        diff = round(latest_score - previous_score, 2)
        if diff >= 0:
            logger.info(f"{GREEN}✅ Improvement of [{diff}] for evaluator {evaluator_path}{RESET}")
            return True, latest_score, diff
        else:
            logger.info(f"{RED}❌ Regression of [{diff}] for evaluator {evaluator_path}{RESET}")
            return False, latest_score, diff
    else:
        raise ValueError(f"Evaluator {evaluator_path} not found in the stats.")