[ENH] forecasting benchmarking task experiment #176
fkiraly wants to merge 5 commits into hyperactive-project:main from
Conversation
I made some corrections to your file here -
```python
# copyright: hyperactive developers, MIT License (see LICENSE file)
import numpy as np

from hyperactive.base import BaseExperiment


class SktimeForecastingTask(BaseExperiment):
    """Experiment adapter for forecast backtesting benchmark run.

    This class is used to perform backtesting experiments using a given
    sktime forecaster. It allows for hyperparameter tuning and evaluation of
    the model's performance.

    The score returned is the summary backtesting score,
    of applying ``sktime`` ``evaluate`` to an estimator passed as ``forecaster``
    in the ``score`` ``params``.

    The backtesting performed is specified by the ``cv`` parameter,
    and the scoring metric is specified by the ``scoring`` parameter.
    The ``X`` and ``y`` parameters are the input data and target values,
    which are used in fit/predict cross-validation.

    Differs from ``SktimeForecastingExperiment`` in that ``forecaster``
    is passed as a parameter directly to ``score`` and not to ``__init__``.
    """

    _tags = {
        "authors": "fkiraly",
        "maintainers": "fkiraly",
        "python_dependencies": "sktime",  # python dependencies
    }

    def __init__(
        self,
        cv,
        y,
        X=None,
        strategy="refit",
        scoring=None,
        error_score=np.nan,
        cv_X=None,
        backend=None,
        backend_params=None,
    ):
        self.X = X
        self.y = y
        self.strategy = strategy
        self.scoring = scoring
        self.cv = cv
        self.error_score = error_score
        self.cv_X = cv_X
        self.backend = backend
        self.backend_params = backend_params

        super().__init__()

        if scoring is None:
            from sktime.performance_metrics.forecasting import (
                MeanAbsolutePercentageError,
            )

            self._scoring = MeanAbsolutePercentageError(symmetric=True)
        else:
            self._scoring = scoring

        # Set a boolean tag indicating whether higher is better.
        # If the metric indicates lower_is_better, set False; otherwise True.
        try:
            lower_is_better = (
                True
                if scoring is None
                else bool(self._scoring.get_tag("lower_is_better", False))
            )
        except Exception:
            # If the metric doesn't expose get_tag, assume lower_is_better=False
            # (higher is better), unless scoring is None (default sMAPE metric).
            lower_is_better = True if scoring is None else False
        higher_is_better = not lower_is_better

        # Use a conventional boolean tag for the rest of the codebase
        try:
            self.set_tags(**{"higher_is_better": higher_is_better})
        except Exception:
            # If set_tags is not available or fails, ignore tagging but continue.
            pass

    def _paramnames(self):
        """Return the parameter names of the search."""
        return ["forecaster"]

    def _evaluate(self, params):
        """Evaluate the parameters.

        Parameters
        ----------
        params : dict with string keys
            Parameters to evaluate.

        Returns
        -------
        float
            The value of the parameters as per evaluation.
        dict
            Additional metadata about the search.
        """
        from sktime.forecasting.model_evaluation import evaluate

        forecaster = params.get("forecaster", None)
        if forecaster is None:
            raise ValueError(
                "SktimeForecastingTask._evaluate requires params "
                "to include a 'forecaster' entry"
            )

        try:
            results = evaluate(
                forecaster,
                cv=self.cv,
                y=self.y,
                X=self.X,
                strategy=self.strategy,
                scoring=self._scoring,
                error_score=self.error_score,
                cv_X=self.cv_X,
                backend=self.backend,
                backend_params=self.backend_params,
            )
        except Exception as e:
            # If user explicitly wants exceptions to propagate:
            if self.error_score == "raise":
                raise
            # Otherwise return error_score and capture the exception message
            return self.error_score, {"error": str(e)}

        # Determine scoring column name robustly
        scoring_name = (
            getattr(self._scoring, "name", None) or self._scoring.__class__.__name__
        )
        result_name = f"test_{scoring_name}"

        add_info = {"results": results}

        # Results handling robust to DataFrame-like or dict-like outputs
        try:
            # If results is a pandas DataFrame-like object:
            if hasattr(results, "columns"):
                if result_name in results.columns:
                    res_values = results[result_name]
                else:
                    # find a test_* column as fallback
                    test_cols = [
                        c for c in results.columns if str(c).startswith("test_")
                    ]
                    if test_cols:
                        res_values = results[test_cols[0]]
                        add_info["warning"] = (
                            f"expected column '{result_name}' not found; "
                            f"using '{test_cols[0]}' instead"
                        )
                    else:
                        raise ValueError(
                            f"No 'test_*' column found in evaluate results; "
                            f"expected '{result_name}'"
                        )
            else:
                # dict-like fallback
                if result_name in results:
                    res_values = results[result_name]
                else:
                    test_keys = [
                        k for k in results.keys() if str(k).startswith("test_")
                    ]
                    if test_keys:
                        res_values = results[test_keys[0]]
                        add_info["warning"] = (
                            f"expected key '{result_name}' not found; "
                            f"using '{test_keys[0]}' instead"
                        )
                    else:
                        raise ValueError(
                            f"No 'test_*' key found in evaluate results; "
                            f"expected '{result_name}'"
                        )
        except Exception as e:
            # Preserve original exception info
            if self.error_score == "raise":
                raise
            return self.error_score, {"error": str(e), **add_info}

        # Compute scalar summary result
        try:
            res_float = float(np.nanmean(res_values))
        except Exception:
            # Last-resort attempt: convert to numpy array and take mean
            try:
                res_float = float(np.nanmean(np.asarray(res_values)))
            except Exception as e:
                if self.error_score == "raise":
                    raise
                return self.error_score, {
                    "error": f"Could not compute mean of results: {e}",
                    **add_info,
                }

        return res_float, add_info

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the skbase object."""
        from sktime.datasets import load_airline, load_longley
        from sktime.split import ExpandingWindowSplitter

        y = load_airline()
        params0 = {
            "cv": ExpandingWindowSplitter(initial_window=36, step_length=12, fh=12),
            "y": y,
        }

        from sktime.performance_metrics.forecasting import MeanAbsolutePercentageError

        y, X = load_longley()
        params1 = {
            "cv": ExpandingWindowSplitter(initial_window=3, step_length=3, fh=1),
            "y": y,
            "X": X,
            "scoring": MeanAbsolutePercentageError(symmetric=False),
        }

        return [params0, params1]

    @classmethod
    def _get_score_params(cls):
        """Return settings for testing score/evaluate functions. Used in tests only."""
        from sktime.forecasting.naive import NaiveForecaster

        val0 = {"forecaster": NaiveForecaster(strategy="last")}
        val1 = {"forecaster": NaiveForecaster(strategy="last")}
        return [val0, val1]
```
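For readers skimming the thread, here is a minimal usage sketch of the adapter above (not part of the PR). It assumes that the `score(params)` method inherited from `BaseExperiment` mirrors `_evaluate`, i.e. accepts a params dict containing the forecaster and returns the metric value together with a metadata dict; the dataset and forecaster are placeholders.

```python
# Minimal usage sketch -- assumptions: BaseExperiment exposes score(params)
# returning (value, metadata) like _evaluate; NaiveForecaster is a placeholder.
from sktime.datasets import load_airline
from sktime.forecasting.naive import NaiveForecaster
from sktime.split import ExpandingWindowSplitter

y = load_airline()

# the task fixes the data and backtesting scheme, but not the forecaster
task = SktimeForecastingTask(
    cv=ExpandingWindowSplitter(initial_window=36, step_length=12, fh=12),
    y=y,
)

# the forecaster is supplied only at scoring time, via the params dict
value, info = task.score({"forecaster": NaiveForecaster(strategy="last")})
print(value)            # mean backtest sMAPE across folds (default scoring)
print(info["results"])  # full sktime evaluate output
```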
@arnavk23, can you kindly explain what you corrected and why?
@arnavk23, is this AI generated?
Yes, the remark is AI-generated.
This PR adds a `SktimeForecastingTask`, which defines a full benchmarking run for a `forecaster` that is passed later in `_evaluate`. This object could be used as a "task" in the `sktime` `ForecastingBenchmark`.

Draft for discussion and reviewing the design:

- this is close to `SktimeForecastingExperiment`, which is used in tuning. How should we deal with the similarity and intersection?
- the key difference is whether `forecaster` gets passed or not. Not sure where that leads though.
- `sktime`?
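To make the design questions above concrete, a hedged sketch of how the "forecaster passed at score time" shape could support benchmarking several forecasters against one fixed dataset and backtesting setup. The loop and candidate list are illustrative only, not an existing `ForecastingBenchmark` integration, and the `(value, metadata)` return of `score` is an assumption carried over from `_evaluate`.

```python
# Illustrative benchmarking loop -- not an existing ForecastingBenchmark API;
# assumes score(params) returns (value, metadata) like _evaluate.
from sktime.datasets import load_airline
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.split import ExpandingWindowSplitter

# one task = one dataset + one backtesting scheme, shared by all candidates
task = SktimeForecastingTask(
    cv=ExpandingWindowSplitter(initial_window=36, step_length=12, fh=12),
    y=load_airline(),
)

candidates = {
    "naive-last": NaiveForecaster(strategy="last"),
    "naive-mean": NaiveForecaster(strategy="mean"),
    "linear-trend": PolynomialTrendForecaster(degree=1),
}

# only the forecaster passed at score time varies -- this is the key
# difference from SktimeForecastingExperiment, which binds it in __init__
for name, forecaster in candidates.items():
    value, _ = task.score({"forecaster": forecaster})
    print(f"{name}: {value:.4f}")
```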