Skip to content

Commit 6b1da44

Browse files
authored
feature: forecaster hyperparams and eval metrics (OpenSTEF#746)
* feature(OpenSTEF#729) Removed to_state and from_state methods in favor of builtin python state saving functions. Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com> * feature(OpenSTEF#729): Fixed issue where generic transform pipeline could not be serialized. Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com> * feature(OpenSTEF#729): Added more state saving tests Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com> * feature(OpenSTEF#729): Added more state saving tests Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com> * feature(OpenSTEF#729): Added more state saving tests Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com> * feature: standardized objective function. Added custom evaluation functions for forecasters. * fix: Formatting. Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com> --------- Signed-off-by: Egor Dmitriev <egor.dmitriev@alliander.com>
1 parent dea41c8 commit 6b1da44

7 files changed

Lines changed: 198 additions & 28 deletions

File tree

packages/openstef-beam/src/openstef_beam/metrics/metrics_probabilistic.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import numpy.typing as npt
2020

2121
from openstef_core.exceptions import MissingExtraError
22+
from openstef_core.types import Quantile
2223

2324

2425
def crps(
@@ -214,3 +215,58 @@ def mean_absolute_calibration_error(
214215
"""
215216
observed_probs = np.array([observed_probability(y_true, y_pred[:, i]) for i in range(len(quantiles))])
216217
return float(np.mean(np.abs(observed_probs - quantiles)))
218+
219+
220+
def mean_pinball_loss(
221+
y_true: npt.NDArray[np.floating],
222+
y_pred: npt.NDArray[np.floating],
223+
quantiles: list[Quantile],
224+
sample_weight: npt.NDArray[np.floating] | None = None,
225+
) -> float:
226+
"""Calculate the Mean Pinball Loss for quantile forecasts.
227+
228+
The Pinball Loss is a proper scoring rule for evaluating quantile forecasts.
229+
It penalizes under- and over-predictions differently based on the quantile level.
230+
231+
Args:
232+
y_true: Observed values with shape (num_samples,) or (num_samples, num_quantiles).
233+
y_pred: Predicted quantiles with shape (num_samples, num_quantiles).
234+
Each column corresponds to predictions for a specific quantile level.
235+
quantiles: Quantile levels with shape (num_quantiles,).
236+
Must be sorted in ascending order and contain values in [0, 1].
237+
sample_weight: Optional weights for each sample with shape (num_samples,).
238+
239+
Returns:
240+
The weighted average Pinball Loss across all samples and quantiles. Lower values indicate better
241+
forecast quality.
242+
"""
243+
# Resize the predictions and targets.
244+
y_pred = np.reshape(y_pred, [-1, len(quantiles)])
245+
n_rows = y_pred.shape[0]
246+
y_true = np.reshape(y_true, [n_rows, -1])
247+
sample_weight = np.reshape(sample_weight, [n_rows, 1]) if sample_weight is not None else None
248+
249+
# Extract quantile values into array for vectorized operations
250+
quantile_values = np.array(quantiles) # shape: (n_quantiles,)
251+
252+
# Compute errors for all quantiles at once
253+
errors = y_true - y_pred # shape: (num_samples, num_quantiles)
254+
255+
# Compute masks for all quantiles simultaneously
256+
underpredict_mask = errors >= 0 # y_true >= y_pred, shape: (num_samples, num_quantiles)
257+
overpredict_mask = errors < 0 # y_true < y_pred, shape: (num_samples, num_quantiles)
258+
259+
# Vectorized pinball loss computation using broadcasting
260+
# quantiles broadcasts from (num_quantiles,) to (num_samples, num_quantiles)
261+
loss = quantiles * underpredict_mask * errors - (1 - quantile_values) * overpredict_mask * errors
262+
263+
# Apply sample weights if provided
264+
if sample_weight is not None:
265+
sample_weight = np.asarray(sample_weight).reshape(-1, 1) # shape: (num_samples, 1)
266+
loss *= sample_weight
267+
total_weight = sample_weight.sum() * len(quantiles)
268+
else:
269+
total_weight = loss.size
270+
271+
# Return mean loss across all samples and quantiles
272+
return float(loss.sum() / total_weight)

packages/openstef-beam/tests/unit/metrics/test_metrics_probabilistic.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@
66

77
import numpy as np
88
import pytest
9+
from sklearn.metrics import mean_pinball_loss as sk_mean_pinball_loss
910

1011
from openstef_beam.metrics import crps, mean_absolute_calibration_error, rcrps
12+
from openstef_beam.metrics.metrics_probabilistic import mean_pinball_loss
13+
from openstef_core.types import Q
1114

1215

1316
# CRPS Test Cases
@@ -151,3 +154,33 @@ def test_mean_absolute_calibration_error() -> None:
151154

152155
assert isinstance(result, float)
153156
assert result == (0.4 + 0.4) / 3 # observed probabilities are 0.5, 0.5, 0.5 vs 0.1, 0.5, 0.9 quantiles
157+
158+
159+
def test_mean_pinball_loss_matches_sklearn_average_when_multi_quantile():
    # Arrange: noisy targets and predictions with a distinct bias per quantile, shape (n, q).
    rng = np.random.default_rng(seed=42)
    n_samples = 40
    y_true = rng.normal(loc=1.0, scale=2.0, size=n_samples)
    quantiles = [Q(0.1), Q(0.5), Q(0.9)]
    noise_scales = (0.7, 0.5, 0.7)  # per-quantile noise spread
    offsets = (-0.4, 0.0, 0.4)  # per-quantile bias: low quantile below, high quantile above
    y_pred = np.stack(
        [
            y_true + rng.normal(0, scale, size=n_samples) + offset
            for scale, offset in zip(noise_scales, offsets)
        ],
        axis=1,
    )

    # Act
    actual = mean_pinball_loss(y_true=y_true, y_pred=y_pred, quantiles=quantiles)
    per_quantile_losses = [
        sk_mean_pinball_loss(y_true, y_pred[:, i], alpha=float(quantile))
        for i, quantile in enumerate(quantiles)
    ]
    expected = np.mean(np.array(per_quantile_losses, dtype=float))

    # Assert: the multi-quantile mean must equal the average of sklearn's per-quantile losses.
    assert np.allclose(actual, expected, rtol=1e-12, atol=1e-12)

packages/openstef-core/src/openstef_core/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,10 @@ class PredictError(Exception):
104104
"""Exception raised for errors during forecasting operations."""
105105

106106

107+
class InputValidationError(ValueError):
    """Exception raised for input validation errors.

    Subclasses ValueError so existing callers catching ValueError still work.
    Raised by forecasters when model input data fails pre-fit/pre-predict checks
    (e.g. NaN values present in the input data).
    """
107111
class ModelLoadingError(Exception):
108112
"""Exception raised when a model fails to load properly."""
109113

packages/openstef-models/src/openstef_models/models/forecasting/gblinear_forecaster.py

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
to predict values outside the range of the training data.
1212
"""
1313

14-
from functools import partial
1514
from typing import Literal, override
1615

1716
import numpy as np
@@ -22,11 +21,16 @@
2221

2322
from openstef_core.datasets.mixins import LeadTime
2423
from openstef_core.datasets.validated_datasets import ForecastDataset, ForecastInputDataset
25-
from openstef_core.exceptions import MissingExtraError, NotFittedError
24+
from openstef_core.exceptions import InputValidationError, MissingExtraError, NotFittedError
2625
from openstef_core.mixins.predictor import HyperParams
2726
from openstef_models.explainability.mixins import ExplainableForecaster
2827
from openstef_models.models.forecasting.forecaster import Forecaster, ForecasterConfig
29-
from openstef_models.utils.loss_functions import OBJECTIVE_MAP, ObjectiveFunctionType, xgb_prepare_target_for_objective
28+
from openstef_models.utils.evaluation_functions import EvaluationFunctionType, get_evaluation_function
29+
from openstef_models.utils.loss_functions import (
30+
ObjectiveFunctionType,
31+
get_objective_function,
32+
xgb_prepare_target_for_objective,
33+
)
3034

3135
try:
3236
import xgboost as xgb
@@ -52,16 +56,22 @@ class GBLinearHyperParams(HyperParams):
5256
"rounds.",
5357
)
5458
objective: ObjectiveFunctionType | Literal["reg:quantileerror"] = Field(
55-
default="pinball_loss",
56-
description="Objective function for training. 'pinball_loss' is recommended for probabilistic forecasting.",
59+
default="reg:quantileerror",
60+
description="Objective function for training. 'reg:quantileerror' is recommended "
61+
"for probabilistic forecasting.",
62+
)
63+
evaluation_metric: EvaluationFunctionType = Field(
64+
default="mean_pinball_loss",
65+
description="Metric used for evaluation during training. Defaults to 'mean_pinball_loss' "
66+
"for quantile regression.",
5767
)
5868

5969
# Regularization
6070
reg_alpha: float = Field(
6171
default=0.0001, description="L1 regularization on weights. Higher values increase regularization. Range: [0,∞]"
6272
)
6373
reg_lambda: float = Field(
64-
default=0.0, description="L2 regularization on weights. Higher values increase regularization. Range: [0,∞]"
74+
default=0.1, description="L2 regularization on weights. Higher values increase regularization. Range: [0,∞]"
6575
)
6676

6777
# Feature selection
@@ -176,15 +186,9 @@ def __init__(self, config: GBLinearForecasterConfig) -> None:
176186
"""
177187
self._config = config or GBLinearForecasterConfig()
178188

179-
if self.config.hyperparams.objective == "reg:quantileerror":
180-
objective = "reg:quantileerror"
181-
else:
182-
objective = partial(OBJECTIVE_MAP[self._config.hyperparams.objective], quantiles=self._config.quantiles)
183-
184189
self._gblinear_model = xgb.XGBRegressor(
185190
booster="gblinear",
186191
# Core parameters for forecasting
187-
objective=objective,
188192
n_estimators=self._config.hyperparams.n_steps,
189193
learning_rate=self._config.hyperparams.learning_rate,
190194
early_stopping_rounds=self._config.hyperparams.early_stopping_rounds,
@@ -196,6 +200,16 @@ def __init__(self, config: GBLinearForecasterConfig) -> None:
196200
updater=self._config.hyperparams.updater,
197201
quantile_alpha=[float(q) for q in self._config.quantiles],
198202
top_k=self._config.hyperparams.top_k if self._config.hyperparams.feature_selector == "thrifty" else None,
203+
# Objective
204+
objective=get_objective_function(
205+
function_type=self._config.hyperparams.objective, quantiles=self._config.quantiles
206+
)
207+
if self._config.hyperparams.objective != "reg:quantileerror"
208+
else "reg:quantileerror",
209+
eval_metric=get_evaluation_function(
210+
function_type=self._config.hyperparams.evaluation_metric, quantiles=self._config.quantiles
211+
),
212+
disable_default_eval_metric=True,
199213
)
200214
self._target_scaler = StandardScaler()
201215

@@ -216,7 +230,6 @@ def is_fitted(self) -> bool:
216230

217231
def _prepare_fit_input(self, data: ForecastInputDataset) -> tuple[pd.DataFrame, np.ndarray, pd.Series]:
218232
input_data: pd.DataFrame = data.input_data()
219-
220233
# Scale the target variable
221234
target: np.ndarray = np.asarray(data.target_series.values)
222235
target = self._target_scaler.transform(target.reshape(-1, 1)).flatten()
@@ -234,9 +247,12 @@ def _prepare_fit_input(self, data: ForecastInputDataset) -> tuple[pd.DataFrame,
234247

235248
@override
236249
def fit(self, data: ForecastInputDataset, data_val: ForecastInputDataset | None = None) -> None:
237-
# Fit the target scaler
238-
target: np.ndarray = np.asarray(data.target_series.values)
239-
self._target_scaler.fit(target.reshape(-1, 1))
250+
# Data checks
251+
if data.data.isna().any().any():
252+
raise InputValidationError("There are nan values in the input data. Use imputation transform to fix them.")
253+
254+
# Fit the scalers
255+
self._target_scaler.fit(data.target_series.to_frame())
240256

241257
# Prepare training data
242258
input_data, target, sample_weight = self._prepare_fit_input(data)
@@ -264,11 +280,15 @@ def predict(self, data: ForecastInputDataset) -> ForecastDataset:
264280
if not self.is_fitted:
265281
raise NotFittedError(self.__class__.__name__)
266282

283+
# Data checks
284+
if data.input_data().isna().any().any():
285+
raise InputValidationError("There are nan values in the input data. Use imputation transform to fix them.")
286+
267287
# Get input features for prediction
268288
input_data: pd.DataFrame = data.input_data(start=data.forecast_start)
269289

270290
# Generate predictions
271-
predictions_array: np.ndarray = self._gblinear_model.predict(input_data)
291+
predictions_array: np.ndarray = self._gblinear_model.predict(input_data).reshape(-1, len(self.config.quantiles))
272292

273293
# Inverse transform the scaled predictions
274294
predictions_array = self._target_scaler.inverse_transform(predictions_array)

packages/openstef-models/src/openstef_models/models/forecasting/xgboost_forecaster.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
comprehensive hyperparameter control for production forecasting workflows.
1010
"""
1111

12-
from functools import partial
1312
from typing import Literal, override
1413

1514
import numpy as np
@@ -22,7 +21,12 @@
2221
from openstef_core.mixins import HyperParams
2322
from openstef_models.explainability.mixins import ExplainableForecaster
2423
from openstef_models.models.forecasting.forecaster import Forecaster, ForecasterConfig
25-
from openstef_models.utils.loss_functions import OBJECTIVE_MAP, ObjectiveFunctionType, xgb_prepare_target_for_objective
24+
from openstef_models.utils.evaluation_functions import EvaluationFunctionType, get_evaluation_function
25+
from openstef_models.utils.loss_functions import (
26+
ObjectiveFunctionType,
27+
get_objective_function,
28+
xgb_prepare_target_for_objective,
29+
)
2630

2731
try:
2832
import xgboost as xgb
@@ -61,7 +65,7 @@ class XGBoostHyperParams(HyperParams):
6165

6266
# Core Tree Boosting Parameters
6367
n_estimators: int = Field(
64-
default=500,
68+
default=100,
6569
description="Number of boosting rounds/trees to fit. Higher values may improve performance but "
6670
"increase training time and risk overfitting.",
6771
)
@@ -91,6 +95,11 @@ class XGBoostHyperParams(HyperParams):
9195
default="pinball_loss",
9296
description="Objective function for training. 'pinball_loss' is recommended for probabilistic forecasting.",
9397
)
98+
evaluation_metric: EvaluationFunctionType = Field(
99+
default="mean_pinball_loss",
100+
description="Metric used for evaluation during training. Defaults to 'mean_pinball_loss' "
101+
"for quantile regression.",
102+
)
94103

95104
# Regularization
96105
reg_alpha: float = Field(
@@ -149,10 +158,10 @@ class XGBoostHyperParams(HyperParams):
149158

150159
# General Parameters
151160
random_state: int | None = Field(
152-
default=None, alias="seed", description="Random seed for reproducibility. Controls tree structure randomness."
161+
default=42, description="Random seed for reproducibility. Controls tree structure randomness."
153162
)
154163
early_stopping_rounds: int | None = Field(
155-
default=10,
164+
default=None,
156165
description="Training will stop if performance doesn't improve for this many rounds. Requires validation data.",
157166
)
158167
use_target_scaling: bool = Field(
@@ -192,7 +201,7 @@ class XGBoostForecasterConfig(ForecasterConfig):
192201
n_jobs: int = Field(
193202
default=1, description="Number of parallel threads for tree construction. -1 uses all available cores."
194203
)
195-
verbosity: Literal[0, 1, 2, 3] = Field(
204+
verbosity: Literal[0, 1, 2, 3, True] = Field(
196205
default=1, description="Verbosity level. 0=silent, 1=warning, 2=info, 3=debug"
197206
)
198207

@@ -262,8 +271,6 @@ def __init__(self, config: XGBoostForecasterConfig) -> None:
262271
"""
263272
self._config = config
264273

265-
objective = partial(OBJECTIVE_MAP[self._config.hyperparams.objective], quantiles=self._config.quantiles)
266-
267274
self._xgboost_model = xgb.XGBRegressor(
268275
# Multi-output configuration
269276
multi_strategy="one_output_per_tree",
@@ -297,7 +304,13 @@ def __init__(self, config: XGBoostForecasterConfig) -> None:
297304
# Early stopping handled in fit method
298305
early_stopping_rounds=self._config.hyperparams.early_stopping_rounds,
299306
# Objective
300-
objective=objective,
307+
objective=get_objective_function(
308+
function_type=self._config.hyperparams.objective, quantiles=self._config.quantiles
309+
),
310+
eval_metric=get_evaluation_function(
311+
function_type=self._config.hyperparams.evaluation_metric, quantiles=self._config.quantiles
312+
),
313+
disable_default_eval_metric=True,
301314
)
302315
self._target_scaler = StandardScaler() if self._config.hyperparams.use_target_scaling else None
303316

@@ -372,7 +385,7 @@ def predict(self, data: ForecastInputDataset) -> ForecastDataset:
372385
input_data: pd.DataFrame = data.input_data(start=data.forecast_start)
373386

374387
# Generate predictions
375-
predictions_array: np.ndarray = self._xgboost_model.predict(input_data)
388+
predictions_array: np.ndarray = self._xgboost_model.predict(input_data).reshape(-1, len(self.config.quantiles))
376389

377390
# Inverse transform the scaled predictions
378391
if self._target_scaler is not None:
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# SPDX-FileCopyrightText: 2025 Contributors to the OpenSTEF project <short.term.energy.forecasts@alliander.com>
2+
#
3+
# SPDX-License-Identifier: MPL-2.0
4+
"""Utility functions for evaluation metrics in forecasting models."""
5+
6+
from collections.abc import Callable
7+
from functools import partial
8+
from typing import Any, Literal
9+
10+
import numpy as np
11+
12+
from openstef_beam.metrics.metrics_probabilistic import mean_pinball_loss
13+
from openstef_core.types import Quantile
14+
15+
type EvaluationFunctionType = Literal["mean_pinball_loss"]
16+
17+
EVALUATION_MAP = {
18+
"mean_pinball_loss": mean_pinball_loss,
19+
}
20+
21+
22+
def get_evaluation_function(
23+
function_type: EvaluationFunctionType, quantiles: list[Quantile] | None = None, **kwargs: Any
24+
) -> Callable[[np.ndarray, np.ndarray], float]:
25+
eval_metric = partial(EVALUATION_MAP[function_type], quantiles=quantiles, **kwargs)
26+
eval_metric.__name__ = function_type # pyright: ignore[reportAttributeAccessIssue]
27+
return eval_metric
28+
29+
30+
__all__ = ["EVALUATION_MAP", "EvaluationFunctionType"]

0 commit comments

Comments
 (0)