Source code for esmvaltool.diag_scripts.mlr.models.gbr_base

"""Base class for Gradient Boosting Regression model."""

import logging
import os

import matplotlib.pyplot as plt
import numpy as np

from esmvaltool.diag_scripts import mlr
from esmvaltool.diag_scripts.mlr.models import MLRModel

logger = logging.getLogger(os.path.basename(__file__))


class GBRModel(MLRModel):
    """Base class for Gradient Boosting Regression models."""

    _CLF_TYPE = None
    def plot_feature_importance(self, filename=None, color_coded=True):
        """Plot feature importance.

        This function uses properties of the GBR model based on the number
        of appearances of that feature in the regression trees and the
        improvements made by the individual splits (see Friedman, 2001).

        Note
        ----
        The features plotted here are not necessarily the real input
        features, but the ones after preprocessing.

        Parameters
        ----------
        filename : str, optional (default: 'feature_importance')
            Name of the plot file.
        color_coded : bool, optional (default: True)
            If ``True``, mark positive (linear) correlations with red bars
            and negative (linear) correlations with blue bars. If ``False``,
            all bars are blue.

        """
        if not self._is_ready_for_plotting():
            return

        # Get plot path
        if filename is None:
            filename = 'feature_importance'
        new_filename = filename + '.' + self._cfg['output_file_type']
        plot_path = os.path.join(self._cfg['mlr_plot_dir'], new_filename)

        # Get feature importance dictionary and colors for bars
        feature_importance_dict = dict(zip(
            self.features_after_preprocessing,
            self._clf.feature_importances_,
        ))
        colors = self._get_colors_for_features(color_coded=color_coded)

        # Plot
        self._plot_feature_importance(feature_importance_dict, colors,
                                      plot_path)
    def _plot_training_progress(self, train_score, test_score=None,
                                filename=None):
        """Plot training progress during fitting."""
        if not self._is_ready_for_plotting():
            return
        logger.info("Plotting training progress for GBR model")
        if filename is None:
            filename = 'training_progress'
        (_, axes) = plt.subplots()
        x_values = np.arange(len(train_score), dtype=np.float64) + 1.0
        x_values_all = []
        scores_all = []
        data_types = []

        # Plot train score
        axes.plot(x_values, train_score, color='b', linestyle='-',
                  label='train data')
        x_values_all.append(x_values)
        scores_all.append(train_score)
        data_types.append(np.full(x_values.shape, 'train'))

        # Plot test score if possible
        if test_score is not None:
            axes.plot(x_values, test_score, color='g', linestyle='-',
                      label='test data')
            x_values_all.append(x_values)
            scores_all.append(test_score)
            data_types.append(np.full(x_values.shape, 'test'))

        # Appearance
        ylim = axes.get_ylim()
        axes.set_ylim(0.0, ylim[1])
        title = f"Training progress ({self._cfg['mlr_model_name']})"
        axes.set_title(title)
        axes.set_xlabel('Boosting iterations')
        axes.set_ylabel('Loss')
        axes.legend(loc='upper right')
        new_filename = filename + '.' + self._cfg['output_file_type']
        plot_path = os.path.join(self._cfg['mlr_plot_dir'], new_filename)
        plt.savefig(plot_path, **self._cfg['savefig_kwargs'])
        logger.info("Wrote %s", plot_path)
        plt.close()

        # Save provenance
        cube = mlr.get_1d_cube(
            np.concatenate(x_values_all),
            np.concatenate(scores_all),
            x_kwargs={'var_name': 'iteration',
                      'long_name': 'Boosting Iteration',
                      'units': 'no unit'},
            y_kwargs={'var_name': 'rmse',
                      'long_name': 'Normalized RMSE',
                      'units': '1',
                      'attributes': {'project': '', 'dataset': ''}},
        )
        cube.add_aux_coord(
            self._get_data_type_coord(np.concatenate(data_types)), 0)
        self._write_plot_provenance(
            cube, plot_path,
            ancestors=self.get_ancestors(prediction_names=[]),
            caption=title + '.', plot_types=['line'])
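
Concrete GBR models derive from this base class by setting ``_CLF_TYPE`` to an
actual regressor class. The following is a minimal sketch of such a subclass,
assuming scikit-learn's ``GradientBoostingRegressor`` as the backend and the
``MLRModel.register_mlr_model`` decorator for registration; the class name and
registration key used here are hypothetical.

    # Minimal sketch of a concrete subclass (hypothetical name and
    # registration key); the real implementations live in separate modules.
    from sklearn.ensemble import GradientBoostingRegressor

    from esmvaltool.diag_scripts.mlr.models import MLRModel
    from esmvaltool.diag_scripts.mlr.models.gbr_base import GBRModel


    @MLRModel.register_mlr_model('my_gbr')
    class MyGBRModel(GBRModel):
        """Hypothetical GBR model backed by scikit-learn."""

        _CLF_TYPE = GradientBoostingRegressor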
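
For ``_plot_training_progress``, the per-iteration score arrays are expected to
come from the fitted boosting estimator. The snippet below sketches how such
arrays could be obtained from a scikit-learn ``GradientBoostingRegressor``; it
is a standalone illustration with synthetic data, not the actual call site used
by the concrete model classes.

    # Standalone illustration of per-iteration train/test scores with
    # scikit-learn; data and variable names are purely synthetic.
    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.metrics import mean_squared_error
    from sklearn.model_selection import train_test_split

    (x_all, y_all) = make_regression(n_samples=200, n_features=5, noise=0.1,
                                     random_state=42)
    (x_train, x_test, y_train, y_test) = train_test_split(x_all, y_all,
                                                          random_state=42)

    est = GradientBoostingRegressor(n_estimators=100, random_state=42)
    est.fit(x_train, y_train)

    # Training loss at every boosting iteration (estimator attribute)
    train_score = est.train_score_

    # Test loss at every boosting iteration via staged predictions
    test_score = np.array([
        mean_squared_error(y_test, y_pred)
        for y_pred in est.staged_predict(x_test)
    ])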