# Source code for esmvaltool.diag_scripts.mlr.models.gbr_base

"""Base class for Gradient Boosting Regression model."""

import logging
import os

import matplotlib.pyplot as plt
import numpy as np

from esmvaltool.diag_scripts import mlr
from esmvaltool.diag_scripts.mlr.models import MLRModel

# Module-level logger, named after this file's basename (not ``__name__``).
logger = logging.getLogger(os.path.basename(__file__))


class GBRModel(MLRModel):
    """Base class for Gradient Boosting Regression models."""

    # NOTE(review): presumably overridden by concrete subclasses with the
    # actual regressor class -- confirm against ``MLRModel``.
    _CLF_TYPE = None

    def plot_feature_importance(self, filename=None, color_coded=True):
        """Plot feature importance.

        This function uses properties of the GBR model based on the number of
        appearances of that feature in the regression trees and the
        improvements made by the individual splits (see Friedman, 2001).

        Nothing is plotted if the model is not ready for plotting.

        Note
        ----
        The features plotted here are not necessarily the real input features,
        but the ones after preprocessing.

        Parameters
        ----------
        filename : str, optional (default: 'feature_importance')
            Name of the plot file (without extension; the extension is taken
            from the ``output_file_type`` configuration entry).
        color_coded : bool, optional (default: True)
            If ``True``, mark positive (linear) correlations with red bars and
            negative (linear) correlations with blue bars. If ``False``, all
            bars are blue.

        """
        if not self._is_ready_for_plotting():
            return

        # Get plot path
        if filename is None:
            filename = 'feature_importance'
        new_filename = filename + '.' + self._cfg['output_file_type']
        plot_path = os.path.join(self._cfg['mlr_plot_dir'], new_filename)

        # Get feature importance dictionary and colors for bars
        feature_importance_dict = dict(zip(self.features_after_preprocessing,
                                           self._clf.feature_importances_))
        colors = self._get_colors_for_features(color_coded=color_coded)

        # Plot
        self._plot_feature_importance(feature_importance_dict, colors,
                                      plot_path)

    def _plot_training_progress(self,
                                train_score,
                                test_score=None,
                                filename=None):
        """Plot training progress during fitting.

        Draws the score per boosting iteration as a line plot, saves it to
        the plot directory and writes provenance for the plotted data.
        Nothing is plotted if the model is not ready for plotting.

        Parameters
        ----------
        train_score : sequence of float
            Score on the training data, one value per boosting iteration.
        test_score : sequence of float, optional
            Score on the test data. It is plotted against the same iteration
            axis as ``train_score``, so it needs the same length.
        filename : str, optional (default: 'training_progress')
            Name of the plot file (without extension; the extension is taken
            from the ``output_file_type`` configuration entry).

        """
        if not self._is_ready_for_plotting():
            return
        logger.info("Plotting training progress for GBR model")
        if filename is None:
            filename = 'training_progress'

        # Resolve everything that depends on the configuration before a
        # figure exists, so that a missing key cannot leave a figure open.
        title = f"Training progress ({self._cfg['mlr_model_name']})"
        new_filename = filename + '.' + self._cfg['output_file_type']
        plot_path = os.path.join(self._cfg['mlr_plot_dir'], new_filename)

        # Iterations are counted from 1
        x_values = np.arange(len(train_score), dtype=np.float64) + 1.0
        x_values_all = []
        scores_all = []
        data_types = []

        (fig, axes) = plt.subplots()
        try:
            # Plot train score
            axes.plot(x_values,
                      train_score,
                      color='b',
                      linestyle='-',
                      label='train data')
            x_values_all.append(x_values)
            scores_all.append(train_score)
            data_types.append(np.full(x_values.shape, 'train'))

            # Plot test score if possible
            if test_score is not None:
                axes.plot(x_values,
                          test_score,
                          color='g',
                          linestyle='-',
                          label='test data')
                x_values_all.append(x_values)
                scores_all.append(test_score)
                data_types.append(np.full(x_values.shape, 'test'))

            # Appearance (y-axis always starts at zero)
            ylim = axes.get_ylim()
            axes.set_ylim(0.0, ylim[1])
            axes.set_title(title)
            axes.set_xlabel('Boosting iterations')
            axes.set_ylabel('Normalized RMSE')
            axes.legend(loc='upper right')
            fig.savefig(plot_path, **self._cfg['savefig_kwargs'])
            logger.info("Wrote %s", plot_path)
        finally:
            # Close this specific figure even if plotting or saving failed
            plt.close(fig)

        # Save provenance
        cube = mlr.get_1d_cube(
            np.concatenate(x_values_all),
            np.concatenate(scores_all),
            x_kwargs={'var_name': 'iteration',
                      'long_name': 'Boosting Iteration',
                      'units': 'no unit'},
            y_kwargs={'var_name': 'rmse',
                      'long_name': 'Normalized RMSE',
                      'units': '1',
                      'attributes': {'project': '', 'dataset': ''}},
        )
        cube.add_aux_coord(
            self._get_data_type_coord(np.concatenate(data_types)), 0)
        self._write_plot_provenance(
            cube, plot_path, ancestors=self.get_ancestors(prediction_names=[]),
            caption=title + '.', plot_types=['line'])