Source code for esmvaltool.diag_scripts.emergent_constraints

"""Convenience functions for emergent constraints diagnostics."""
import logging
import os
from copy import deepcopy
from pprint import pformat

import iris
import iris.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from scipy import integrate
from scipy.stats import linregress

from esmvaltool.diag_scripts.shared import (
    ProvenanceLogger,
    get_diagnostic_filename,
    get_plot_filename,
    io,
)

logger = logging.getLogger(__name__)

ALLOWED_VAR_TYPES = [
    'feature',
    'label',
    'prediction_input',
    'prediction_input_error',
]
COLOR_COMBINED_GROUPS = 'gray'
LEGEND_KWARGS = {
    'loc': 'center left',
    'bbox_to_anchor': [1.05, 0.5],
    'borderaxespad': 0.0,
}
PANDAS_PRINT_OPTIONS = [
    'display.max_rows', None,
    'display.max_colwidth', None,
]


def _check_x_y_arrays(x_array, y_array):
    """Ensure that the X and Y arrays have correct shapes."""
    x_array = np.ma.array(x_array)
    y_array = np.ma.array(y_array)

    # Check shapes
    if x_array.ndim != 1:
        raise ValueError(
            f"Expected 1D array for X training data, got {x_array.ndim:d}D "
            f"array")
    if y_array.ndim != 1:
        raise ValueError(
            f"Expected 1D array for Y training data, got {y_array.ndim:d}D "
            f"array")
    if x_array.shape != y_array.shape:
        raise ValueError(
            f"Expected identical shapes for X and Y training data, got "
            f"{x_array.shape} and {y_array.shape}, respectively")

    # Remove masked values
    mask = np.ma.getmaskarray(x_array)
    mask |= np.ma.getmaskarray(y_array)
    x_array = np.array(x_array[~mask])
    y_array = np.array(y_array[~mask])

    return (x_array, y_array)


def _add_column(data_frame, series, column_name):
    """Add column to :class:`pandas.DataFrame` (expands index if necessary)."""
    rows_to_add = [
        pd.Series(name=row, dtype=np.float64).to_frame().T for row in
        series.index.difference(data_frame.index)
    ]
    data_frame = pd.concat([data_frame] + rows_to_add)
    if column_name in data_frame.columns:
        for row in series.index:
            if np.isnan(data_frame.loc[row, column_name]):
                data_frame.loc[row, column_name] = series.loc[row]
            else:
                if not np.isclose(data_frame.loc[row, column_name],
                                  series.loc[row]):
                    raise ValueError(
                        f"Got duplicate data for tag '{column_name}' of "
                        f"'{row}': {series.loc[row]:e} and "
                        f"{data_frame.loc[row, column_name]:e}")
    else:
        data_frame[column_name] = series
    return data_frame


def _crop_data_frame(data_frame, ref_data_frame, data_name, ref_data_name):
    """Crop columns of a data_frame so that it matches a given reference."""
    diff_not_in_data_frame = list(
        ref_data_frame.columns.difference(data_frame.columns))
    if diff_not_in_data_frame:
        raise ValueError(
            f"No '{data_name}' given for tags {diff_not_in_data_frame}")
    diff_not_in_ref = list(
        data_frame.columns.difference(ref_data_frame.columns))
    if diff_not_in_ref:
        logger.warning(
            "Ignoring '%s' of tags %s: no corresponding '%s' data available",
            data_name, diff_not_in_ref, ref_data_name)
        data_frame = data_frame[ref_data_frame.columns]
    return data_frame


def _check_data_frames(features, label, pred_input, pred_input_err):
    """Check indexes and columns of the input data."""
    if not list(features.columns):
        raise ValueError("Expected at least one feature")
    if len(label.columns) != 1:
        raise ValueError(
            f"Expected exactly 1 'label' variable, got {len(label.columns):d}")

    # Compare features and label
    if list(features.index) != list(label.index):
        raise ValueError(
            f"Expected identical datasets (climate models; independent "
            f"observations) for 'feature' and 'label', got "
            f"{features.index.values} and {label.index.values}")
    if len(features.index) < 2:
        raise ValueError("Expected at least two training points for features")

    # Compare features and prediction input data
    pred_input = _crop_data_frame(pred_input, features, 'prediction_input',
                                  'feature')
    pred_input_err = _crop_data_frame(pred_input_err, features,
                                      'prediction_input_error', 'feature')

    # Compare prediction_input and prediction_input_error
    if not list(pred_input.index):
        raise ValueError("Expected at least one prediction input point")
    if list(pred_input.index) != list(pred_input_err.index):
        raise ValueError(
            f"Expected identical training points for 'prediction_input' and "
            f"'prediction_input_error', got {pred_input.index.values} "
            f"and {pred_input_err.index.values}")

    return (features, label, pred_input, pred_input_err)


def _combine_dicts(old_dict, new_dict):
    """Combine two :obj:`dict` by adding values for identical keys to lists."""
    old_dict = deepcopy(old_dict)
    new_dict = deepcopy(new_dict)
    for (key, val) in new_dict.items():
        if key not in old_dict:
            old_dict[key] = val
            continue
        if isinstance(old_dict[key], list):
            if not isinstance(val, list):
                val = [val]
            old_dict[key] = list(set([*old_dict[key], *val]))
        else:
            if not isinstance(val, list):
                if not np.array_equal(val, old_dict[key]):
                    old_dict[key] = [old_dict[key], val]
            else:
                old_dict[key] = list(set([old_dict[key], *val]))
    return old_dict


def _get_additional_data(additional_data, recipe):
    """Get :class:`iris.cube.CubeList` from additional data."""
    if additional_data is None:
        return iris.cube.CubeList()
    cubes = _metadata_list_to_cube_list(additional_data, 'additional_data')
    for cube in cubes:
        cube.attributes['filename'] = recipe
    return cubes


def _get_attributes(cubes):
    """Extract attributes for features and labels."""
    attributes = {}

    # Extract attributes
    for cube in cubes:
        cube_attrs = cube.attributes
        tag = cube_attrs['tag']
        attributes.setdefault(tag, {})
        if cube_attrs['var_type'] in ('feature', 'label'):
            attributes[tag] = _combine_dicts(attributes[tag],
                                             _metadata_to_dict(cube.metadata))
        elif cube_attrs['var_type'] in ('prediction_input',
                                        'prediction_input_error'):
            attributes[tag] = _combine_dicts(
                attributes[tag], {'filename': cube_attrs['filename']})
        else:
            raise ValueError(
                f"File '{cube_attrs['filename']}' has invalid var_type "
                f"'{cube_attrs['var_type']}'")

    # Set default attributes and remove lengthy 'provenance' entry
    for tag in attributes:
        attributes[tag].pop('provenance', None)
        attributes[tag].setdefault('plot_title', f"Emergent constraint {tag}")
        if 'units' in attributes[tag]:
            axis_label = f"{tag} [{attributes[tag]['units']}]"
            attributes[tag].setdefault('plot_xlabel', axis_label)
            attributes[tag].setdefault('plot_ylabel', axis_label)
        else:
            attributes[tag].setdefault('plot_xlabel', tag)
            attributes[tag].setdefault('plot_ylabel', tag)
        attributes[tag].setdefault('plot_xlim', None)
        attributes[tag].setdefault('plot_ylim', None)

    return attributes


def _get_cube_list(input_files,
                   recipe,
                   additional_data=None,
                   external_file=None,
                   merge_identical_pred_input=True):
    """Get :class:`iris.cube.CubeList` of input files."""
    cubes = iris.cube.CubeList()

    # Input files
    for filename in input_files:
        logger.info("Loading '%s'", filename)
        cube = _load_cube_with_dataset_coord(filename)
        cube.attributes['filename'] = filename
        (feature_cube,
         prediction_cube) = _split_cube(cube, merge_identical_pred_input)
        if feature_cube is not None:
            cubes.append(feature_cube)
        if prediction_cube is not None:
            cubes.append(prediction_cube)

    # Additional data
    cubes.extend(_get_additional_data(additional_data, recipe))

    # External file
    cubes.extend(_get_external_cube_list(external_file))

    # Check metadata of cubes
    for cube in cubes:
        check_metadata(cube.attributes)

    return cubes


def _get_external_cube_list(external_file):
    """Get external :class:`iris.cube.CubeList`."""
    if external_file is None:
        return iris.cube.CubeList()
    with open(external_file, 'r') as infile:
        metadata_list = yaml.safe_load(infile)
    cubes = _metadata_list_to_cube_list(metadata_list, external_file)
    for cube in cubes:
        cube.attributes['filename'] = external_file
    return cubes


def _get_external_file(filepath, auxiliary_data_dir):
    """Get full path to external file (if available)."""
    if not filepath:
        return None
    filepath = os.path.expanduser(os.path.expandvars(filepath))
    if not os.path.isabs(filepath):
        filepath = os.path.join(auxiliary_data_dir, filepath)
    if not os.path.isfile(filepath):
        raise FileNotFoundError(
            f"Desired external file '{filepath}' does not exist")
    logger.info("Found external file '%s'", filepath)
    return filepath


def _get_data_frame(var_type, cubes, label_all_data, group_by=None):
    """Extract :class:`pandas.DataFrame` for a given ``var_type``."""
    data_frame = pd.DataFrame()
    for cube in cubes:
        cube_attrs = cube.attributes
        if var_type != cube_attrs['var_type']:
            continue
        if var_type in ('feature', 'label'):
            if group_by is not None and group_by not in cube_attrs:
                raise AttributeError(
                    f"Group attribute '{group_by}' not available in input "
                    f"file '{cube_attrs['filename']}'")
            group = cube_attrs.get(group_by, label_all_data)
            index = pd.MultiIndex.from_product(
                [[group], cube.coord('dataset').points],
                names=[group_by, 'dataset'])
        else:
            index = cube.coord('dataset').points
        series = pd.Series(data=cube.data, index=index)
        data_frame = _add_column(data_frame, series, cube_attrs['tag'])
    return data_frame


def _metadata_to_dict(metadata):
    """Convert :class:`iris.cube.CubeMetadata` to :obj:`dict`."""
    new_dict = {}
    for (key, val) in metadata._asdict().items():
        if isinstance(val, dict):
            new_dict.update(val)
        else:
            new_dict[key] = val
    return new_dict


def _split_cube(cube, merge_identical_pred_input=True):
    """Split cube in features and prediction_input."""
    if not cube.attributes.get('reference_dataset'):
        return (cube, None)

    # Get feature and prediction_input datasets
    features_datasets = []
    predicton_datasets = []
    references = cube.attributes['reference_dataset']
    for dataset in cube.coord('dataset').points:
        if dataset in references:
            predicton_datasets.append(dataset)
            logger.info(
                "Using dataset '%s' as prediction_input for variable '%s' "
                "with index %d", dataset, cube.var_name,
                len(predicton_datasets) - 1)
        else:
            features_datasets.append(dataset)

    # Extract cubes
    feature_cube = cube.extract(iris.Constraint(dataset=features_datasets))
    prediction_cube = cube.extract(iris.Constraint(dataset=predicton_datasets))
    feature_cube.attributes['var_type'] = 'feature'
    prediction_cube.attributes['var_type'] = 'prediction_input'

    # Merge identical prediction_input if desired
    if merge_identical_pred_input:
        (_, unique_idx) = np.unique(prediction_cube.data, return_index=True)
        diff = len(prediction_cube.coord('dataset').points) - len(unique_idx)
        if diff > 0:
            prediction_cube = prediction_cube[unique_idx]
            logger.info(
                "Removed %d identical prediction_input points for variable "
                "'%s'", diff, prediction_cube.var_name)

    # Set new index for prediction input
    prediction_cube.coord('dataset').points = np.arange(
        len(prediction_cube.coord('dataset').points))
    return (feature_cube, prediction_cube)


def _cube_to_dataset_coord(cube):
    """Convert :class:`iris.cube.Cube` to :class:`iris.coords.AuxCoord`."""
    if cube.ndim == 1:
        datasets = cube.data
    elif cube.ndim == 2:
        cube.data = cube.data.astype(str, casting='same_kind')
        datasets = [''.join(d.compressed()) for d in cube.data]
    else:
        raise ValueError(
            f"Only 1D and 2D cubes supported, got {cube.ndim:d}D cube")
    return iris.coords.AuxCoord(datasets,
                                var_name='dataset',
                                long_name='dataset')


def _get_first_cube_with_coord(cubes, accepted_coord_names):
    """Load single cube of :class:`iris.cube.CubeList` with specific coords."""
    returned_cube = None
    returned_coord = None
    for cube in cubes:
        for coord_name in accepted_coord_names:
            try:
                coord = cube.coord(coord_name)
                returned_cube = cube
                returned_coord = coord
                break
            except iris.exceptions.CoordinateNotFoundError:
                pass
        if returned_cube is not None:
            break
    else:
        raise ValueError(
            f"No cube of {cubes} contains 'dataset' coordinate (i.e. one of "
            f"{accepted_coord_names})")
    return (returned_cube, returned_coord)


def _load_cube_with_dataset_coord(filename):
    """Load cube with single ``dataset``-like coordinate.

    Files created by NCL cannot be read using a simple
    :func:`iris.load_cube`.

    """
    cubes = iris.load(filename)
    accepted_coord_names = ('dataset', 'model')

    # Handle single cube
    if len(cubes) == 1:
        (cube, coord) = _get_first_cube_with_coord(cubes, accepted_coord_names)
        if cube.ndim != 1:
            raise ValueError(
                f"Only 1D cubes supported, got {cube.ndim:d}D cube in file "
                f"'{filename}'")
        coord.var_name = 'dataset'
        coord.standard_name = None
        coord.long_name = 'dataset'
        return cube

    # At most two cubes are supported
    if len(cubes) > 2:
        raise ValueError(
            f"Loading NCL file '{filename}' failed, at most 2 cubes are "
            f"supported, got {len(cubes):d}")

    # Get 'model' or 'dataset' cube
    dataset_cube = None
    for cube in cubes:
        if cube.var_name in accepted_coord_names:
            logger.debug("Found coordinate cube '%s'", cube.var_name)
            dataset_cube = cube
        else:
            data_cube = cube
    if dataset_cube is None:
        raise ValueError(
            f"No 'dataset' coordinate (one of {accepted_coord_names}) in "
            f"file '{filename}' available")

    # Create new coordinate
    if data_cube.ndim != 1:
        raise ValueError(
            f"Only 1D cubes supported, got {data_cube.ndim:d}D cube in file "
            f"'{filename}'")
    if data_cube.shape[0] != dataset_cube.shape[0]:
        raise ValueError(
            f"Got differing sizes for first dimension of data cube "
            f"({data_cube.shape[0]:d}) and dataset cube "
            f"({dataset_cube.shape[0]:d}) in file '{filename}'")
    aux_coord = _cube_to_dataset_coord(dataset_cube)
    data_cube.add_aux_coord(aux_coord, 0)
    return data_cube


def _create_scatterplot(x_data,
                        y_data,
                        numbers_as_markers=True,
                        plot_regression_line_mean=False,
                        axes=None,
                        **kwargs):
    """Create single scatterplot including regression line."""
    if axes is None:
        (_, axes) = plt.subplots()

    # Scatterplots
    scatter_kwargs = dict(kwargs)
    scatter_kwargs.pop('label', None)
    for (idx, _) in enumerate(x_data):
        if numbers_as_markers:
            axes.text(x_data[idx],
                      y_data[idx],
                      x_data.index.get_level_values(-1)[idx],
                      size=7,
                      **scatter_kwargs)
        else:
            axes.scatter(x_data[idx], y_data[idx], **scatter_kwargs)

    # Regression line
    line_kwargs = {**kwargs, 'linestyle': '-'}
    fill_between_kwargs = {**kwargs, 'alpha': 0.3}
    fill_between_kwargs.pop('label', None)
    if plot_regression_line_mean:
        mean_kwargs = {**kwargs, 'marker': 'o'}
        mean_kwargs.pop('label', None)
        mean_kwargs.pop('linestyle', None)
    else:
        mean_kwargs = None
    axes = _create_regplot(x_data,
                           y_data,
                           axes=axes,
                           line_kwargs=line_kwargs,
                           fill_between_kwargs=fill_between_kwargs,
                           mean_kwargs=mean_kwargs)
    return axes


def _create_pred_input_plot(x_pred,
                            x_pred_error,
                            axes,
                            vline_kwargs=None,
                            vspan_kwargs=None):
    """Create plot for prediction input data (vertical lines)."""
    if vline_kwargs is None:
        vline_kwargs = {'color': 'k', 'linestyle': ':', 'label': 'Observation'}
    if vspan_kwargs is None:
        vspan_kwargs = {'color': 'k', 'alpha': 0.1}
    x_pred = x_pred[0]
    x_pred_error = x_pred_error[0]
    axes.axvline(x_pred, **vline_kwargs)
    axes.axvspan(x_pred - x_pred_error, x_pred + x_pred_error, **vspan_kwargs)
    return axes


def _create_pred_output_plot(x_data,
                             y_data,
                             x_pred,
                             x_pred_error,
                             axes,
                             hline_kwargs=None):
    """Create plot for prediction input data (vertical lines)."""
    if hline_kwargs is None:
        hline_kwargs = {'color': 'k', 'linestyle': ':'}
    (_, y_mean, _) = get_constraint(x_data, y_data, x_pred, x_pred_error)
    axes.axhline(y_mean, **hline_kwargs)
    return axes


def _create_regplot(x_data,
                    y_data,
                    axes=None,
                    line_kwargs=None,
                    fill_between_kwargs=None,
                    mean_kwargs=None):
    """Create single regression line plot."""
    if axes is None:
        (_, axes) = plt.subplots()
    if line_kwargs is None:
        line_kwargs = {'linestyle': '-', 'label': 'Linear regression'}
    if fill_between_kwargs is None:
        fill_between_kwargs = {'alpha': 0.3}

    # Create regression line
    reg = regression_line(x_data, y_data)

    # Add R2 and p-value to label if possible
    text = rf"$R^2={reg['rvalue']**2:.2f}, p={reg['pvalue']:.4f}$"
    if 'label' in line_kwargs:
        line_kwargs['label'] += rf' ({text})'
    else:
        if reg['rvalue'] < 0.0:
            axes.text(0.62, 0.93, text, transform=axes.transAxes)
        else:
            axes.text(0.02, 0.93, text, transform=axes.transAxes)

    # Plots regression
    axes.plot(reg['x'], reg['y'], **line_kwargs)
    axes.fill_between(reg['x'], reg['y_minus_err'], reg['y_plus_err'],
                      **fill_between_kwargs)

    # Plot means if desired
    if mean_kwargs is not None:
        x_mean = np.mean(reg['x'])
        y_mean = np.mean(reg['y'])
        axes.scatter(x_mean, y_mean, **mean_kwargs)
    return axes


def _get_pandas_cube(pandas_object):
    """Convert :mod:`pandas` object to cube and fix coordinates."""
    cube = iris.pandas.as_cube(pandas_object)
    for coord_name in ('index', 'columns'):
        try:
            names = getattr(pandas_object, coord_name).names
        except AttributeError:
            continue
        coord = cube.coord(coord_name)
        if not np.issubdtype(coord.dtype, np.number):
            coord.points = coord.points.astype(str)
            if coord.bounds is not None:
                coord.bounds = coord.bounds.astype(str)
        names = [n for n in names if n is not None]
        if not names:
            continue
        new_coord_name = '-'.join(names)
        coord.var_name = new_coord_name
        coord.long_name = new_coord_name
    return cube


def _metadata_list_to_cube_list(metadata_list, source):
    """Convert :obj:`list` of :obj:`dict` to :class:`iris.cube.CubeList`."""
    cubes = iris.cube.CubeList()
    for metadata in metadata_list:
        for attr in ('data', 'dataset'):
            if attr not in metadata:
                raise AttributeError(
                    f"Entry {metadata} from source '{source}' does not "
                    f"contain necessary attribute '{attr}'")
        aux_coord = iris.coords.AuxCoord(metadata.pop('dataset'),
                                         var_name='dataset',
                                         long_name='dataset')
        data_of_cube = metadata.pop('data')
        if data_of_cube is None:
            data_of_cube = np.nan
        cube = iris.cube.Cube(data_of_cube,
                              aux_coords_and_dims=[(aux_coord, ())])
        for key in ('var_name', 'standard_name', 'long_name', 'units'):
            if key in metadata:
                setattr(cube, key, metadata.pop(key))
        cube.attributes = metadata
        cubes.append(cube)
    return cubes


def _gaussian_pdf(x_val, x_mean, x_std):
    """Return Gaussian probability density."""
    norm = np.sqrt(2.0 * np.pi * x_std**2)
    return np.exp(-(x_val - x_mean)**2 / 2.0 / x_std**2) / norm


def _get_target_pdf(x_data,
                    y_data,
                    obs_mean,
                    obs_std,
                    n_points=1000,
                    necessary_p_value=None):
    """Get PDF of target variable including linear regression information."""
    (x_data, y_data) = _check_x_y_arrays(x_data, y_data)
    spe = standard_prediction_error(x_data, y_data)
    reg = linregress(x_data, y_data)

    # Get evenly spaced range of y
    y_range = 1.5 * (np.max(y_data) - np.min(y_data))
    y_lin = np.linspace(
        np.min(y_data) - y_range,
        np.max(y_data) + y_range, n_points)

    # Use unconstrained value of desired and necessary
    if necessary_p_value is not None:
        if reg.pvalue > necessary_p_value:
            y_pdf = _gaussian_pdf(y_lin, np.mean(y_data), np.std(y_data))
            return (y_lin, y_pdf, reg)

    # Helper functions for calculation of constrained target variable
    def obs_pdf(x_new):
        """Return PDF of observations P(x)."""
        return _gaussian_pdf(x_new, obs_mean, obs_std)

    def cond_pdf(x_new, y_new):
        """Return conditional PDF P(y|x)."""
        y_pred = reg.slope * x_new + reg.intercept
        y_std = spe(x_new)
        return _gaussian_pdf(y_new, y_pred, y_std)

    def comb_pdf(x_new, y_new):
        """Return combined PDF P(y,x)."""
        return obs_pdf(x_new) * cond_pdf(x_new, y_new)

    # PDF of target variable P(y)
    x_range = 3 * obs_std
    y_pdf = [
        integrate.quad(comb_pdf,
                       obs_mean - x_range,
                       obs_mean + x_range,
                       args=(y, ))[0] for y in y_lin
    ]
    return (y_lin, np.array(y_pdf), reg)


[docs]def check_metadata(metadata, allowed_var_types=None):
    """Check metadata.

    Parameters
    ----------
    metadata : dict
        Metadata to check.
    allowed_var_types : list of str, optional
        Allowed var_types, defaults to ``ALLOWED_VAR_TYPES``.

    Raises
    ------
    KeyError
        Metadata does not contain necessary keys ``'var_type'`` and ``'tag'``.
    ValueError
        Got invalid value for key ``'var_type'``.

    """
    if allowed_var_types is None:
        allowed_var_types = ALLOWED_VAR_TYPES
    filename = metadata.get('filename', metadata)
    for key in ('var_type', 'tag'):
        if key not in metadata:
            raise KeyError(
                f"Necessary key '{key}' not given in metadata of file "
                f"'{filename}'")
    if metadata['var_type'] not in allowed_var_types:
        raise ValueError(
            f"Expected one of {allowed_var_types} for 'var_type' of file "
            f"'{filename}', got '{metadata['var_type']}'")


[docs]def get_input_files(cfg, patterns=None, ignore_patterns=None):
    """Get input files.

    Parameters
    ----------
    cfg : dict
        Recipe configuration.
    patterns : list of str, optional
        Use only ancestor files that match these patterns as input files.
    ignore_patterns : list of str, optional
        Ignore input files that match these patterns.

    Returns
    -------
    list of str
        Input files.

    """
    input_files = []

    # Include only files that match patterns
    if patterns is None:
        patterns = []
    if not patterns:
        patterns.append('*.nc')
    for pattern in patterns:
        logger.debug("Looking for files matching the pattern '%s'", pattern)
        input_files.extend(io.get_all_ancestor_files(cfg, pattern=pattern))

    # Ignore files
    if not ignore_patterns:
        return input_files
    ignore_files = []
    for pattern in ignore_patterns:
        logger.debug("Ignoring for files matching the pattern '%s'", pattern)
        ignore_files.extend(io.get_all_ancestor_files(cfg, pattern=pattern))
    valid_files = []
    for filename in input_files:
        if filename not in ignore_files:
            valid_files.append(filename)
    return valid_files


[docs]def get_xy_data_without_nans(data_frame, feature, label):
    """Get (X, Y) data for ``(feature, label)`` combination without nans.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Training data.
    feature : str
        Name of the feature data.
    label : str
        Name of the label data.

    Returns
    -------
    tuple
        Tuple containing a :class:`pandas.DataFrame` for the X axis (feature)
        and a :class:`pandas.DataFrame` for the Y axis (label) without
        missing values.

    """
    idx_slice = pd.IndexSlice[:, [feature, label]]
    data_frame_xy = data_frame.loc[:, idx_slice]
    data_frame_xy.columns = data_frame_xy.columns.droplevel()
    data_frame_xy = data_frame_xy.dropna()
    x_data = data_frame_xy[feature]
    y_data = data_frame_xy[label]
    return (x_data, y_data)


[docs]def get_input_data(cfg):
    """Extract input data.

    Return training data, prediction input data and corresponding attributes.

    Parameters
    ----------
    cfg : dict
        Recipe configuration.

    Returns
    -------
    tuple
        A tuple containing the training data (:class:`pandas.DataFrame`), the
        prediction input data (:class:`pandas.DataFrame`) and the
        corresponding attributes (:obj:`dict`).

    """
    input_files = get_input_files(cfg,
                                  patterns=cfg.get('patterns'),
                                  ignore_patterns=cfg.get('ignore_patterns'))
    logger.debug("Found files:\n%s", pformat(input_files))

    # Get cubes
    external_file = _get_external_file(cfg.get('read_external_file'),
                                       cfg['auxiliary_data_dir'])
    cubes = _get_cube_list(
        input_files,
        recipe=cfg['recipe'],
        additional_data=cfg.get('additional_data'),
        external_file=external_file,
        merge_identical_pred_input=cfg.get('merge_identical_pred_input', True),
    )

    # Extract attributes for features and labels
    attributes = _get_attributes(cubes)

    # Extract DataFrames
    label_all_data = cfg.get('all_data_label', 'all')
    group_by = cfg.get('group_by')
    if group_by:
        logger.info("Grouping features and labels by '%s'", group_by)
    else:
        logger.info("Using label '%s' to label data in plots", label_all_data)
    features = _get_data_frame('feature', cubes, label_all_data, group_by)
    label = _get_data_frame('label', cubes, label_all_data, group_by)
    pred_input = _get_data_frame('prediction_input', cubes, label_all_data,
                                 group_by)
    pred_input_err = _get_data_frame('prediction_input_error', cubes,
                                     label_all_data, group_by)

    # Unify indices of features and label
    rows_to_add_to_label = [
        pd.Series(name=row, dtype=np.float64).to_frame().T for row in
        features.index.difference(label.index)
    ]
    label = pd.concat([label] + rows_to_add_to_label)
    rows_to_add_to_features = [
        pd.Series(name=row, dtype=np.float64).to_frame().T for row in
        label.index.difference(features.index)
    ]
    features = pd.concat([features] + rows_to_add_to_features)

    # Sort data frames
    for data_frame in (features, label, pred_input, pred_input_err):
        data_frame.sort_index(axis=0, inplace=True)
        data_frame.sort_index(axis=1, inplace=True)

    # Check data
    (features, label, pred_input,
     pred_input_err) = _check_data_frames(features, label, pred_input,
                                          pred_input_err)
    training_data = pd.concat([features, label], axis=1, keys=['x', 'y'])
    training_data['idx'] = np.arange(len(training_data.index)) + 1
    training_data.set_index('idx', append=True, inplace=True)
    training_data.index.names = [group_by, 'dataset', 'idx']
    prediction_data = pd.concat([pred_input, pred_input_err],
                                axis=1,
                                keys=['mean', 'error'])
    if training_data.dropna().shape[0] < 2:
        logger.error("Invalid training data:\n%s", training_data)
        raise ValueError(
            f"Expected at least 2 independent observations (=climate models) "
            f"where all training data (features and target label) is "
            f"available, got {training_data.dropna().shape[0]:d}")

    # Logger output
    with pd.option_context(*PANDAS_PRINT_OPTIONS):
        logger.info("Found training data:\n%s", training_data)
        logger.info("Found prediction data:\n%s", prediction_data)
    return (training_data, prediction_data, attributes)


[docs]def combine_groups(groups):
    """Combine :obj:`list` of groups to a single :obj:`str`.

    Parameters
    ----------
    groups : list of str
        List of group names.

    Returns
    -------
    str
        Combined :obj:`str`.

    """
    new_str = ', '.join(groups)
    return new_str


[docs]def pandas_object_to_cube(pandas_object,
                          index_droplevel=None,
                          columns_droplevel=None,
                          **kwargs):
    """Convert pandas object to :class:`iris.cube.Cube`.

    Parameters
    ----------
    pandas_object : pandas.DataFrame or pandas.Series
        Data to convert.
    index_droplevel : int or list of int, optional
        Drop levels of index if not ``None``.
    columns_droplevel : int or list of int, optional
        Drop levels of columns if not ``None``. Can only be used if
        ``pandas_object`` is a :class:`pandas.DataFrame`.
    **kwargs : Keyword arguments
        Keyword arguments used for the cube metadata, e.g. ``standard_name``,
        ``var_name``, etc.

    Returns
    -------
    iris.cube.Cube
        Data cube.

    Raises
    ------
    TypeError
        ``columns_droplevel`` is used when ``pandas_object`` is not a
        :class:`pandas.DataFrame`.

    """
    pandas_object = pandas_object.copy()
    if index_droplevel is not None:
        pandas_object.index = pandas_object.index.droplevel(index_droplevel)
    if columns_droplevel is not None:
        try:
            pandas_object.columns = pandas_object.columns.droplevel(
                columns_droplevel)
        except AttributeError:
            raise TypeError(
                f"'columns_droplevel' only supported for pandas.DataFrame "
                f"object, got {type(pandas_object)}")
    cube = _get_pandas_cube(pandas_object)
    for (key, val) in kwargs.items():
        setattr(cube, key, val)
    return cube


[docs]def set_plot_appearance(axes, attributes, **kwargs):
    """Set appearance of a plot.

    Parameters
    ----------
    axes : matplotlib.axes.Axes
        Matplotlib Axes object which contains the plot.
    attributes : dict
        Plot attributes.
    **kwargs : Keyword arguments
        Keyword arguments of the form ``plot_option=tag`` where ``plot_option``
        is something like ``plot_title``, ``plot_xlabel``, ``plot_xlim``, etc.
        and ``tag`` a key for the plot attributes :obj:`dict` that describes
        which attributes should be considered for that ``plot_option``.

    """
    for (plot_option, tag) in kwargs.items():
        plot_func = plot_option.replace('plot_', 'set_')
        value = attributes[tag][plot_option]
        getattr(axes, plot_func)(value)


[docs]def get_caption(attributes, feature, label, group=None):
    """Construct caption from plotting attributes for (feature, label) pair.

    Parameters
    ----------
    attributes : dict
        Plot attributes.
    feature : str
        Feature.
    label : str
        Label.
    group : str, optional
        Group.

    Returns
    -------
    str
        Caption.

    Raises
    ------
    KeyError
        ``attributes`` does not include necessary keys.

    """
    group_str = '' if group is None else f' ({group})'
    if feature not in attributes:
        raise KeyError(
            f"Attributes do not include necessary key for feature '{feature}'")
    if label not in attributes:
        raise KeyError(
            f"Attributes do not include necessary key for label '{label}'")
    feature_attrs = attributes[feature]
    label_attrs = attributes[label]
    if 'plot_title' not in feature_attrs:
        raise KeyError(
            f"Attributes for feature '{feature}' does not include necessary "
            f"key 'plot_title'")
    if 'plot_xlabel' not in feature_attrs:
        raise KeyError(
            f"Attributes for feature '{feature}' does not include necessary "
            f"key 'plot_xlabel'")
    if 'plot_ylabel' not in label_attrs:
        raise KeyError(
            f"Attributes for label '{label}' does not include necessary "
            f"key 'plot_ylabel'")
    caption = (f"{attributes[feature]['plot_title']}: "
               f"{attributes[label]['plot_ylabel']} vs. "
               f"{attributes[feature]['plot_xlabel']}{group_str}.")
    return caption


[docs]def get_provenance_record(attributes, tags, **kwargs):
    """Get provenance record.

    Parameters
    ----------
    attributes : dict
        Plot attributes. All provenance keys need to start with
        ``'provenance_'``.
    tags : list of str
        Tags used to retrieve data from the ``attributes`` :obj:`dict`, i.e.
        features and/or label.
    **kwargs : Keyword arguments
        Additional ``key:value`` pairs directly passed to the provenance record
        :obj:`dict`. All values may include the format strings ``{feature}``
        and ``{label}``.

    Returns
    -------
    dict
        Provenance record.

    """
    record = {}
    for tag in tags:
        for (key, value) in attributes[tag].items():
            if key.startswith('provenance_'):
                key = key.replace('provenance_', '')
                record.setdefault(key, [])
                if isinstance(value, str):
                    record[key].append(value)
                else:
                    record[key].extend(value)
            record.setdefault('ancestors', [])
            if key == 'filename':
                if isinstance(value, str):
                    record['ancestors'].append(value)
                else:
                    record['ancestors'].extend(value)
    for (key, value) in record.items():
        if isinstance(value, list):
            record[key] = list(set(value))
    record.update(kwargs)
    return record


[docs]def get_colors(cfg, groups=None):
    """Get color palette.

    Parameters
    ----------
    cfg : dict
        Recipe configuration.
    groups : list, optional
        Use to check whether color for combining groups has to be added.

    Returns
    -------
    list
        List of colors that can be used for :mod:`matplotlib`.

    """
    palette = cfg.get('seaborn_settings', {}).get('palette')
    colors = sns.color_palette(palette=palette)
    if groups is None:
        return colors
    if len(groups) > 1 and cfg.get('combine_groups', False):
        return [COLOR_COMBINED_GROUPS] + colors
    return colors


[docs]def get_groups(training_data, add_combined_group=False):
    """Extract groups from training data.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training data (features, label).
    add_combined_group : bool, optional (default: False)
        Add combined group of all other groups at the beginning of the
        returned :obj:`list`.

    Returns
    -------
    list of str
        Groups.

    """
    groups = list(set(training_data.index.get_level_values(0)))
    groups.sort()
    if add_combined_group and len(groups) > 1:
        groups.insert(0, combine_groups(groups))
    return groups


[docs]def plot_individual_scatterplots(training_data, pred_input_data, attributes,
                                 basename, cfg):
    """Plot individual scatterplots for the different groups.

    Plot scatterplots for all pairs of ``(feature, label)`` data (Separate plot
    for each group).

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training data (features, label).
    pred_input_data : pandas.DataFrame
        Prediction input data (mean and error).
    attributes : dict
        Plot attributes for the different features and the label data.
    basename : str
        Basename for the name of the file.
    cfg : dict
        Recipe configuration.

    """
    logger.info("Plotting individual scatterplots")
    label = training_data.y.columns[0]
    groups = get_groups(training_data,
                        add_combined_group=cfg.get('combine_groups', False))

    # Iterate over features
    for feature in training_data.x.columns:
        (x_data, y_data) = get_xy_data_without_nans(training_data, feature,
                                                    label)

        # Individual plots
        colors = get_colors(cfg, groups=groups)
        for (idx, group) in enumerate(groups):
            try:
                x_sub_data = x_data.loc[group]
                y_sub_data = y_data.loc[group]
                index_droplevel = 1
            except KeyError:
                x_sub_data = x_data
                y_sub_data = y_data
                index_droplevel = [0, 2]
            axes = _create_scatterplot(
                x_sub_data,
                y_sub_data,
                numbers_as_markers=cfg.get('numbers_as_markers', False),
                plot_regression_line_mean=cfg.get('plot_regression_line_mean',
                                                  False),
                color=colors[idx],
                label=group)
            axes = _create_pred_input_plot(
                pred_input_data['mean'][feature].values,
                pred_input_data['error'][feature].values, axes)
            axes = _create_pred_output_plot(
                x_sub_data,
                y_sub_data,
                pred_input_data['mean'][feature].values,
                pred_input_data['error'][feature].values,
                axes,
                hline_kwargs={
                    'color': colors[idx],
                    'linestyle': ':'
                },
            )
            set_plot_appearance(axes,
                                attributes,
                                plot_title=feature,
                                plot_xlabel=feature,
                                plot_ylabel=label,
                                plot_xlim=feature,
                                plot_ylim=label)
            plt.legend(**LEGEND_KWARGS)
            filename = (f"scatterplot_{basename}_{feature}_"
                        f"{group.replace(', ', '-')}")
            plot_path = get_plot_filename(filename, cfg)
            plt.savefig(plot_path,
                        **cfg.get('savefig_kwargs', {}))
            logger.info("Wrote %s", plot_path)
            plt.close()

            # Provenance
            provenance_record = get_provenance_record(
                attributes, [feature, label],
                caption=get_caption(attributes, feature, label, group=group),
                plot_type='scatter')
            with ProvenanceLogger(cfg) as provenance_logger:
                provenance_logger.log(plot_path, provenance_record)

            # Write netCDF file
            cubes = iris.cube.CubeList([
                pandas_object_to_cube(
                    x_sub_data,
                    index_droplevel=index_droplevel,
                    var_name=feature,
                    long_name=attributes[feature]['plot_xlabel'],
                    units=attributes[feature]['units']),
                pandas_object_to_cube(
                    y_sub_data,
                    index_droplevel=index_droplevel,
                    var_name=label,
                    long_name=attributes[label]['plot_ylabel'],
                    units=attributes[label]['units']),
            ])
            netcdf_path = get_diagnostic_filename(filename, cfg)
            io.iris_save(cubes, netcdf_path)
            with ProvenanceLogger(cfg) as provenance_logger:
                provenance_logger.log(netcdf_path, provenance_record)


[docs]def plot_merged_scatterplots(training_data, pred_input_data, attributes,
                             basename, cfg):
    """Plot merged scatterplots (all groups in one plot).

    Plot scatterplots for all pairs of ``(feature, label)`` data (all groups in
    one plot).

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training data (features, label).
    pred_input_data : pandas.DataFrame
        Prediction input data (mean and error).
    attributes : dict
        Plot attributes for the different features and the label data.
    basename : str
        Basename for the name of the file.
    cfg : dict
        Recipe configuration.

    """
    logger.info("Plotting merged scatterplots")
    label = training_data.y.columns[0]
    groups = get_groups(training_data,
                        add_combined_group=cfg.get('combine_groups', False))
    numbers_as_markers = cfg.get('numbers_as_markers', False)
    plot_regression_line_mean = cfg.get('plot_regression_line_mean', False)
    colors = get_colors(cfg)

    # Iterate over features
    for feature in training_data.x.columns:
        (x_data, y_data) = get_xy_data_without_nans(training_data, feature,
                                                    label)
        (_, axes) = plt.subplots()
        if len(groups) > 1 and cfg.get('combine_groups', False):
            axes = _create_regplot(
                x_data,
                y_data,
                axes=axes,
                line_kwargs={
                    'color': COLOR_COMBINED_GROUPS,
                    'label': groups[0],
                    'linestyle': '-'
                },
                fill_between_kwargs={
                    'color': COLOR_COMBINED_GROUPS,
                    'alpha': 0.3
                },
                mean_kwargs=(None
                             if not cfg.get('plot_regression_line_mean') else {
                                 'color': COLOR_COMBINED_GROUPS,
                                 'marker': 'o'
                             }),
            )
            axes = _create_pred_output_plot(
                x_data,
                y_data,
                pred_input_data['mean'][feature].values,
                pred_input_data['error'][feature].values,
                axes,
                hline_kwargs={
                    'color': COLOR_COMBINED_GROUPS,
                    'linestyle': ':'
                },
            )
            for (idx, group) in enumerate(groups[1:]):
                axes = _create_scatterplot(
                    x_data.loc[group],
                    y_data.loc[group],
                    numbers_as_markers=numbers_as_markers,
                    plot_regression_line_mean=plot_regression_line_mean,
                    axes=axes,
                    color=colors[idx],
                    label=group,
                )
                axes = _create_pred_output_plot(
                    x_data.loc[group],
                    y_data.loc[group],
                    pred_input_data['mean'][feature].values,
                    pred_input_data['error'][feature].values,
                    axes,
                    hline_kwargs={
                        'color': colors[idx],
                        'linestyle': ':'
                    },
                )
        else:
            for (idx, group) in enumerate(groups):
                axes = _create_scatterplot(
                    x_data.loc[group],
                    y_data.loc[group],
                    numbers_as_markers=numbers_as_markers,
                    plot_regression_line_mean=plot_regression_line_mean,
                    axes=axes,
                    color=colors[idx],
                    label=group,
                )
                axes = _create_pred_output_plot(
                    x_data.loc[group],
                    y_data.loc[group],
                    pred_input_data['mean'][feature].values,
                    pred_input_data['error'][feature].values,
                    axes,
                    hline_kwargs={
                        'color': colors[idx],
                        'linestyle': ':'
                    },
                )
        axes = _create_pred_input_plot(
            pred_input_data['mean'][feature].values,
            pred_input_data['error'][feature].values, axes)
        set_plot_appearance(axes,
                            attributes,
                            plot_title=feature,
                            plot_xlabel=feature,
                            plot_ylabel=label,
                            plot_xlim=feature,
                            plot_ylim=label)
        plt.legend(**LEGEND_KWARGS)
        filename = f'scatterplot_merged_{basename}_{feature}'
        plot_path = get_plot_filename(filename, cfg)
        plt.savefig(plot_path,
                    **cfg.get('savefig_kwargs', {}))
        logger.info("Wrote %s", plot_path)
        plt.close()

        # Provenance
        provenance_record = get_provenance_record(attributes, [feature, label],
                                                  caption=get_caption(
                                                      attributes, feature,
                                                      label),
                                                  plot_type='scatter')
        with ProvenanceLogger(cfg) as provenance_logger:
            provenance_logger.log(plot_path, provenance_record)

        # Write netCDF file
        cubes = iris.cube.CubeList([
            pandas_object_to_cube(x_data,
                                  index_droplevel=[0, 2],
                                  var_name=feature,
                                  long_name=attributes[feature]['plot_xlabel'],
                                  units=attributes[feature]['units']),
            pandas_object_to_cube(y_data,
                                  index_droplevel=[0, 2],
                                  var_name=label,
                                  long_name=attributes[label]['plot_ylabel'],
                                  units=attributes[label]['units']),
        ])
        netcdf_path = get_diagnostic_filename(filename, cfg)
        io.iris_save(cubes, netcdf_path)
        with ProvenanceLogger(cfg) as provenance_logger:
            provenance_logger.log(netcdf_path, provenance_record)


[docs]def create_simple_scatterplot(x_data, y_data, obs_mean, obs_std):
    """Create simple scatterplot of an emergent relationship (without saving).

    Parameters
    ----------
    x_data : numpy.ndarray
        X data of the emergent constraint.
    y_data : numpy.ndarray
        Y data of the emergent constraint.
    obs_mean : float
        Mean of observational data.
    obs_std : float
        Standard deviation of observational data.

    """
    logger.debug("Plotting simple scatterplot")
    (fig, axes) = plt.subplots()
    axes.scatter(x_data, y_data, color='k', marker='o')
    line_kwargs = {'color': 'C1', 'linestyle': '-'}
    fill_between_kwargs = {**line_kwargs, 'alpha': 0.3}
    axes = _create_regplot(x_data, y_data, axes=axes, line_kwargs=line_kwargs,
                           fill_between_kwargs=fill_between_kwargs)
    axes = _create_pred_input_plot(obs_mean, obs_std, axes)
    return (fig, axes)


[docs]def plot_target_distributions(training_data, pred_input_data, attributes,
                              basename, cfg):
    """Plot distributions of target variable for every feature.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training data (features, label).
    pred_input_data : pandas.DataFrame
        Prediction input data (mean and error).
    attributes : dict
        Plot attributes for the different features and the label data.
    basename : str
        Basename for the name of the file.
    cfg : dict
        Recipe configuration.

    """
    logger.info("Plotting distributions of target variable")
    label = training_data.y.columns[0]
    groups = get_groups(training_data,
                        add_combined_group=cfg['combine_groups'])
    summary_columns = pd.MultiIndex.from_product(
        [groups, ['best estimate', 'range', 'min', 'max']])
    summaries = []

    # Iterate over features
    for feature in training_data.x.columns:
        (x_data, y_data) = get_xy_data_without_nans(training_data, feature,
                                                    label)
        colors = get_colors(cfg, groups=groups)
        summary_for_feature = pd.Series(
            index=summary_columns, name=feature, dtype=np.float64
        )

        # Iterate over groups
        for (idx, group) in enumerate(groups):
            try:
                x_sub_data = x_data.loc[group]
                y_sub_data = y_data.loc[group]
            except KeyError:
                x_sub_data = x_data
                y_sub_data = y_data
            (y_lin, y_pdf) = target_pdf(
                x_sub_data,
                y_sub_data,
                pred_input_data['mean'][feature].values,
                pred_input_data['error'][feature].values,
            )

            # Plots
            axes = sns.histplot(y_sub_data,
                                bins=7,
                                stat='density',
                                color=colors[idx],
                                alpha=0.4)
            axes.plot(y_lin,
                      y_pdf,
                      color=colors[idx],
                      linestyle='-',
                      label=group)

            # Print results
            (y_min, y_mean, y_max) = get_constraint(
                x_sub_data,
                y_sub_data,
                pred_input_data['mean'][feature].values,
                pred_input_data['error'][feature].values,
                confidence_level=cfg['confidence_level'],
            )
            y_error = np.max([y_max - y_mean, y_mean - y_min])
            reg = linregress(x_sub_data.values, y_sub_data.values)
            logger.info(
                "Constrained %s for feature '%s' and group '%s': %.2f ± %.2f "
                "(%i%% confidence level), R2 = %f, p = %f", label,
                feature, group, y_mean, y_error,
                int(100.0 * cfg['confidence_level']), reg.rvalue**2,
                reg.pvalue)

            # Save results of group
            summary_for_feature[(group, 'best estimate')] = y_mean
            summary_for_feature[(group, 'range')] = y_max - y_min
            summary_for_feature[(group, 'min')] = y_min
            summary_for_feature[(group, 'max')] = y_max

        # Save results for feature
        summaries.append(summary_for_feature.to_frame().T)

        # Plot appearance
        set_plot_appearance(axes, attributes, plot_title=feature)
        axes.set_xlabel(attributes[label]['plot_ylabel'])
        axes.set_ylabel('Probability density')
        if attributes[label]['plot_ylim'] is not None:
            axes.set_xlim(attributes[label]['plot_ylim'])
        axes.set_ylim([0.0, 1.0])
        plt.legend(loc='best')

        # Save plot
        plot_path = get_plot_filename(
            f'target_distribution_{basename}_{feature}', cfg)
        plt.savefig(plot_path,
                    **cfg['savefig_kwargs'])
        logger.info("Wrote %s", plot_path)
        plt.close()

        # Provenance
        provenance_record = get_provenance_record(
            attributes, [feature, label],
            caption=(f"{attributes[feature]['plot_title']}: Probability "
                     f"densitiy of {label}."),
            plot_type='probability')
        with ProvenanceLogger(cfg) as provenance_logger:
            provenance_logger.log(plot_path, provenance_record)

    # Print mean results
    summary = pd.concat(summaries)
    with pd.option_context(*PANDAS_PRINT_OPTIONS):
        logger.info("Constrained ranges:\n%s", summary)
        summary = summary.mean(axis=0)
        logger.info("Mean of constrained ranges:\n%s", summary)


[docs]def export_csv(data_frame, attributes, basename, cfg, tags=None):
    """Export CSV file.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Data to export.
    attributes : dict
        Plot attributes for the different features and the label data. Used to
        retrieve provenance information.
    basename : str
        Basename for the name of the file.
    cfg : dict
        Recipe configuration.
    tags : iterable of str, optional
        Tags for which provenance information should be retrieved (using
        ``attributes``). If not specified, use (last level of) columns of the
        given ``data_frame``.

    Returns
    -------
    str
        Path to the new CSV file.

    """
    logger.info("Exporting CSV file for '%s'", basename)
    csv_path = get_diagnostic_filename(basename, cfg).replace('.nc', '.csv')
    data_frame.to_csv(csv_path)
    logger.info("Wrote %s", csv_path)
    if tags is None:
        tags = data_frame.columns.get_level_values(-1)
    provenance_record = get_provenance_record(attributes,
                                              tags,
                                              caption=basename)
    with ProvenanceLogger(cfg) as provenance_logger:
        provenance_logger.log(csv_path, provenance_record)
    return csv_path


[docs]def standard_prediction_error(x_data, y_data):
    """Return a function to calculate standard prediction error.

    The standard prediction error of a linear regression is the error when
    predicting a data point which was not used to fit the regression line in
    the first place.

    Parameters
    ----------
    x_data : numpy.ndarray
        X data used to fit the linear regression.
    y_data : numpy.ndarray
        Y data used to fit the linear regression.

    Returns
    -------
    callable
        Function that takes a :obj:`float` as single argument (representing the
        X value of a new data point) and returns the standard prediction error
        for that.

    """
    (x_data, y_data) = _check_x_y_arrays(x_data, y_data)
    reg = linregress(x_data, y_data)
    y_pred = reg.slope * x_data + reg.intercept
    n_data = x_data.shape[0]
    see = np.sqrt(np.sum(np.square(y_data - y_pred)) / (n_data - 2))
    x_mean = np.mean(x_data)
    ssx = np.sum(np.square(x_data - x_mean))

    def spe(x_new):
        """Return standard prediction error."""
        return see * np.sqrt(1.0 + 1.0 / n_data + (x_new - x_mean)**2 / ssx)

    return spe


[docs]def regression_line(x_data, y_data, n_points=1000):
    """Return x and y coordinates of the regression line (mean and error).

    Parameters
    ----------
    x_data : numpy.ndarray
        X data used to fit the linear regression.
    y_data : numpy.ndarray
        Y data used to fit the linear regression.
    n_points : int, optional (default: 1000)
        Number of points for the regression lines.

    Returns
    -------
    dict
        :class:`numpy.ndarray` s for the keys ``'x'``, ``'y'``,
        ``'y_minus_err'``, ``'y_plus_err'``, ``'slope'``, ``'intercept'``,
        ``'pvalue'`` and ``'rvalue'``.

    """
    (x_data, y_data) = _check_x_y_arrays(x_data, y_data)
    spe = np.vectorize(standard_prediction_error(x_data, y_data))
    out = {}
    reg = linregress(x_data, y_data)
    x_range = np.max(x_data) - np.min(x_data)
    x_lin = np.linspace(
        np.min(x_data) - x_range,
        np.max(x_data) + x_range, n_points)
    out['x'] = x_lin
    out['y'] = reg.slope * x_lin + reg.intercept
    out['y_minus_err'] = out['y'] - spe(x_lin)
    out['y_plus_err'] = out['y'] + spe(x_lin)
    out['slope'] = reg.slope
    out['intercept'] = reg.intercept
    out['pvalue'] = reg.pvalue
    out['rvalue'] = reg.rvalue
    return out


[docs]def target_pdf(x_data,
               y_data,
               obs_mean,
               obs_std,
               n_points=1000,
               necessary_p_value=None):
    """Calculate probability density function (PDF) for target variable.

    Parameters
    ----------
    x_data : numpy.ndarray
        X data of the emergent constraint.
    y_data : numpy.ndarray
        Y data of the emergent constraint.
    obs_mean : float
        Mean of observational data.
    obs_std : float
        Standard deviation of observational data.
    n_points : int, optional (default: 1000)
        Number of sampled points for PDF of target variable.
    necessary_p_value : float, optional
        If given, return unconstrained PDF (using Gaussian distribution with
        unconstrained mean and standard deviation) when `p`-value of emergent
        relationship is greater than the given necessary `p`-value.

    Returns
    -------
    tuple of numpy.ndarray
        x and y values for the PDF.

    """
    (y_lin, y_pdf, _) = _get_target_pdf(x_data,
                                        y_data,
                                        obs_mean,
                                        obs_std,
                                        n_points=n_points,
                                        necessary_p_value=necessary_p_value)
    return (y_lin, y_pdf)


[docs]def cdf(data, pdf):
    """Calculate cumulative distribution function for a 1-dimensional PDF.

    Parameters
    ----------
    data : numpy.ndarray
        Data points (1D array).
    pdf : numpy.ndarray
        Corresponding probability density function (PDF).

    Returns
    -------
    numpy.ndarray
        Corresponding cumulative distribution function (CDF).

    """
    idx_range = range(1, len(data) + 1)
    cum_dens = [integrate.simps(pdf[:idx], data[:idx]) for idx in idx_range]
    return np.array(cum_dens)


[docs]def constraint_info_array(x_data,
                          y_data,
                          obs_mean,
                          obs_std,
                          n_points=1000,
                          necessary_p_value=None):
    """Get array with all relevant parameters of emergent constraint.

    Parameters
    ----------
    x_data : numpy.ndarray
        X data of the emergent constraint.
    y_data : numpy.ndarray
        Y data of the emergent constraint.
    obs_mean : float
        Mean of observational data.
    obs_std : float
        Standard deviation of observational data.
    n_points : int, optional (default: 1000)
        Number of sampled points for PDF of target variable.
    necessary_p_value : float, optional
        If given, replace constrained mean and standard deviation with
        unconstrained values when `p`-value of emergent relationship is greater
        than the given necessary `p`-value.

    Returns
    -------
    numpy.ndarray
        Array of shape (8,) with the elements:
            0. Constrained mean of target variable.
            1. Constrained standard deviation of target variable.
            2. Unconstrained mean of target variable.
            3. Unconstrained standard deviation of target variable.
            4. Slope of emergent relationship.
            5. Intercept of emergent relationship.
            6. Correlation coefficient `r` of emergent relationship.
            7. `p`-value of emergent relationship.

    """
    (y_lin, y_pdf, reg) = _get_target_pdf(x_data,
                                          y_data,
                                          obs_mean,
                                          obs_std,
                                          n_points=n_points,
                                          necessary_p_value=necessary_p_value)
    norm = np.sum(y_pdf)
    y_mean = np.sum(y_lin * y_pdf) / norm
    y_std = np.sqrt(np.sum((y_lin - y_mean)**2 * y_pdf) / norm)
    info = [
        y_mean, y_std,
        np.ma.mean(y_data),
        np.ma.std(y_data), reg.slope, reg.intercept, reg.rvalue, reg.pvalue
    ]
    return np.array(info)


[docs]def get_constraint(x_data, y_data, obs_mean, obs_std, confidence_level=0.66):
    """Get constraint on target variable.

    Parameters
    ----------
    x_data : numpy.ndarray
        X data of the emergent constraint.
    y_data : numpy.ndarray
        Y data of the emergent constraint.
    obs_mean : float
        Mean of observational data.
    obs_std : float
        Standard deviation of observational data.
    confidence_level : float, optional (default: 0.66)
        Confindence level to estimate the range of the target variable.

    Returns
    -------
    tuple of float
        Lower confidence limit, best estimate and upper confidence limit of
        target variable.

    """
    (x_data, y_data) = _check_x_y_arrays(x_data, y_data)
    (y_lin, y_pdf) = target_pdf(x_data, y_data, obs_mean, obs_std)
    y_mean = np.sum(y_lin * y_pdf) / np.sum(y_pdf)
    y_cdf = cdf(y_lin, y_pdf)
    y_index_range = np.nonzero((y_cdf >= (1.0 - confidence_level) / 2.0)
                               & (y_cdf <= (1.0 + confidence_level) / 2.0))
    y_range = y_lin[y_index_range]
    return (np.min(y_range), y_mean, np.max(y_range))


[docs]def get_constraint_from_df(training_data,
                           pred_input_data,
                           confidence_level=0.66):
    """Get constraint on target variable from :class:`pandas.DataFrame`.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training data (features, label).
    pred_input_data : pandas.DataFrame
        Prediction input data (mean and error).
    confidence_level : float, optional (default: 0.66)
        Confindence level to estimate the range of the target variable.

    Returns
    -------
    tuple of float
        Lower confidence limit, best estimate and upper confidence limit of
        target variable.

    """
    if len(training_data.columns) != 2:
        raise ValueError(
            f"Expected exactly two columns for training data (feature and "
            f"label), got {len(training_data.columns):d}")
    if len(pred_input_data.columns) != 2:
        raise ValueError(
            f"Expected exactly two columns for prediction input data (mean "
            f"and error, got {len(pred_input_data.columns):d}")

    # Extract data
    label = training_data.y.columns[0]
    feature = training_data.x.columns[0]
    (x_data, y_data) = get_xy_data_without_nans(training_data, feature, label)
    x_pred = pred_input_data['mean'][feature].values[0]
    x_pred_error = pred_input_data['error'][feature].values[0]

    # Calculate constraint
    constraint = get_constraint(x_data,
                                y_data,
                                x_pred,
                                x_pred_error,
                                confidence_level=confidence_level)
    return constraint