Source code for esmvaltool.diag_scripts.shared._base

"""Convenience functions for running a diagnostic script."""
import argparse
import contextlib
import glob
import logging
import os
import shutil
import sys
import time
from collections import OrderedDict

import yaml

logger = logging.getLogger(__name__)


def get_plot_filename(basename, cfg):
    """Build the full path under which a diagnostic plot is stored.

    Parameters
    ----------
    basename: str
        Name of the file, without directory and without extension.
    cfg: dict
        Dictionary with diagnostic configuration. The entries
        ``plot_dir`` and ``output_file_type`` are read from it.

    Returns
    -------
    str:
        The basename with the configured file type appended as extension,
        placed inside the configured plot directory.
    """
    directory = cfg['plot_dir']
    file_name = f"{basename}.{cfg['output_file_type']}"
    return os.path.join(directory, file_name)
def get_diagnostic_filename(basename, cfg, extension='nc'):
    """Build the full path under which a diagnostic data file is stored.

    Parameters
    ----------
    basename: str
        Name of the file, without directory and without extension.
    cfg: dict
        Dictionary with diagnostic configuration. The entry ``work_dir``
        is read from it.
    extension: str
        File name extension, without the leading dot.

    Returns
    -------
    str:
        The basename with the extension appended, placed inside the
        configured work directory.
    """
    directory = cfg['work_dir']
    file_name = f"{basename}.{extension}"
    return os.path.join(directory, file_name)
class ProvenanceLogger:
    """Open the provenance logger.

    Provenance records are collected in the dictionary ``table`` (keyed by
    output file name) and written to ``diagnostic_provenance.yml`` in the
    run directory when the context is left.

    Parameters
    ----------
    cfg: dict
        Dictionary with diagnostic configuration. The entry ``run_dir``
        is read from it.

    Example
    -------
    Use as a context manager::

        record = {
            'caption': "This is a nice plot.",
            'statistics': ['mean'],
            'domain': 'global',
            'plot_type': 'zonal',
            'plot_file': '/path/to/result.png',
            'authors': [
                'first_author',
                'second_author',
            ],
            'references': [
                'acknow_project',
            ],
            'ancestors': [
                '/path/to/input_file_1.nc',
                '/path/to/input_file_2.nc',
            ],
        }
        output_file = '/path/to/result.nc'

        with ProvenanceLogger(cfg) as provenance_logger:
            provenance_logger.log(output_file, record)

    """

    def __init__(self, cfg):
        """Create a provenance logger.

        An existing provenance file in the run directory is loaded so that
        new records are added to the ones already stored there.
        """
        self._log_file = os.path.join(cfg['run_dir'],
                                      'diagnostic_provenance.yml')

        self.table = {}
        if os.path.exists(self._log_file):
            with open(self._log_file, 'r') as file:
                # An empty YAML document loads as None; keep a dict so that
                # `log` can still do membership tests and assignments.
                self.table = yaml.safe_load(file) or {}

    def log(self, filename, record):
        """Record provenance.

        Parameters
        ----------
        filename: str
            Name of the file containing the diagnostic data.
        record: dict
            Dictionary with the provenance information to be logged.

            Typical keys are:
                - plot_type
                - plot_file
                - caption
                - ancestors
                - authors
                - references

        Raises
        ------
        KeyError
            If a record for `filename` has already been logged.

        Note
        ----
            See also esmvaltool/config-references.yml

        """
        if filename in self.table:
            raise KeyError(
                f"Provenance record for {filename} already exists.")
        self.table[filename] = record

    def _save(self):
        """Save the provenance log to file."""
        dirname = os.path.dirname(self._log_file)
        # dirname is empty when the log file lives in the current directory;
        # os.makedirs('') would raise, so only create real directories.
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        with open(self._log_file, 'w') as file:
            yaml.safe_dump(self.table, file)

    def __enter__(self):
        """Enter context."""
        return self

    def __exit__(self, *_):
        """Save the provenance log before exiting context."""
        # Saving is unconditional: records logged before an exception was
        # raised inside the context are still written.
        self._save()
def select_metadata(metadata, **attributes):
    """Filter metadata describing preprocessed data by attribute values.

    Parameters
    ----------
    metadata : :obj:`list` of :obj:`dict`
        A list of metadata describing preprocessed data.
    **attributes :
        Required attribute names and their values. An entry is kept only
        if it has every requested attribute with an equal value; the value
        '*' accepts any value as long as the attribute is present.

    Returns
    -------
    :obj:`list` of :obj:`dict`
        The matching entries, in their original order.

    """
    def matches(candidate):
        """Tell whether a single metadata entry satisfies all requirements."""
        for name, wanted in attributes.items():
            if name not in candidate:
                return False
            if not (candidate[name] == wanted or wanted == '*'):
                return False
        return True

    return [candidate for candidate in metadata if matches(candidate)]
def group_metadata(metadata, attribute, sort=None):
    """Collect metadata entries into groups sharing one attribute value.

    Parameters
    ----------
    metadata : :obj:`list` of :obj:`dict`
        A list of metadata describing preprocessed data.
    attribute : str
        The attribute name that the metadata should be grouped by.
        Entries lacking the attribute end up in the group with key `None`.
    sort :
        See `sorted_group_metadata`.

    Returns
    -------
    :obj:`dict` of :obj:`list` of :obj:`dict`
        A dictionary containing the requested groups. If sorting is
        requested, an `OrderedDict` will be returned.

    """
    grouped = {}
    for entry in metadata:
        grouped.setdefault(entry.get(attribute), []).append(entry)

    if not sort:
        return grouped
    return sorted_group_metadata(grouped, sort)
def sorted_metadata(metadata, sort):
    """Return metadata entries ordered by one or more attributes.

    Attribute values are compared as lower-cased strings, so sorting is
    not case sensitive. A missing attribute counts as the empty string.

    Parameters
    ----------
    metadata : :obj:`list` of :obj:`dict`
        A list of metadata describing preprocessed data.
    sort : :obj:`str` or :obj:`list` of :obj:`str`
        One or more attributes to sort by.

    Returns
    -------
    :obj:`list` of :obj:`dict`
        The sorted list of variable metadata.

    """
    sort_attributes = [sort] if isinstance(sort, str) else sort

    def sort_key(entry):
        """Build the comparison tuple for a single metadata entry."""
        return tuple(
            str(entry.get(name, '')).lower() for name in sort_attributes)

    return sorted(metadata, key=sort_key)
def sorted_group_metadata(metadata_groups, sort):
    """Order grouped metadata by group key and sort each group's entries.

    Group keys are compared as lower-cased strings, so sorting is not case
    sensitive. The key `None` is treated as the empty string.

    Parameters
    ----------
    metadata_groups : :obj:`dict` of :obj:`list` of :obj:`dict`
        Dictionary containing the groups of metadata.
    sort : :obj:`bool` or :obj:`str` or :obj:`list` of :obj:`str`
        One or more attributes to sort by or True to just sort the groups
        but not the lists.

    Returns
    -------
    :obj:`OrderedDict` of :obj:`list` of :obj:`dict`
        A dictionary containing the requested groups.

    """
    # True means "order the groups only": an empty attribute list leaves the
    # entries inside each group in their existing order.
    attributes = [] if sort is True else sort

    def group_sort_key(name):
        """Build the comparison string for a group key."""
        if name is None:
            return ''
        return str(name).lower()

    ordered_names = sorted(metadata_groups, key=group_sort_key)
    return OrderedDict(
        (name, sorted_metadata(metadata_groups[name], attributes))
        for name in ordered_names)
def extract_variables(cfg, as_iris=False):
    """Collect basic variable information from the configuration.

    For every variable found in the input data, the `short_name`,
    `standard_name`, `long_name` and `units` entries are copied from the
    first dataset of that variable.

    Parameters
    ----------
    cfg : dict
        Diagnostic script configuration.
    as_iris : bool, optional
        Replace `short_name` by `var_name` and turn an empty
        `standard_name` into `None`, so that the result can be used
        directly in :mod:`iris` classes.

    Returns
    -------
    dict
        Variable information in :obj:`dict`s (values) for each `short_name`
        (key).

    """
    wanted_keys = ('short_name', 'standard_name', 'long_name', 'units')

    datasets_by_variable = group_metadata(cfg['input_data'].values(),
                                          'short_name')

    variables = {}
    for name, datasets in datasets_by_variable.items():
        first_dataset = datasets[0]
        info = {key: first_dataset[key] for key in wanted_keys}

        if as_iris:
            info['var_name'] = info.pop('short_name')
            if info['standard_name'] == '':
                info['standard_name'] = None

        variables[name] = info

    return variables
def variables_available(cfg, short_names):
    """Tell whether input data exists for every requested variable.

    Parameters
    ----------
    cfg : dict
        Diagnostic script configuration.
    short_names : list of str
        Variable `short_names` which should be checked.

    Returns
    -------
    bool
        `True` if all variables available, `False` if not.

    """
    available = list(
        group_metadata(cfg['input_data'].values(), 'short_name'))
    return all(name in available for name in short_names)
def get_cfg(filename=None):
    """Load the diagnostic script configuration from a YAML settings file.

    Parameters
    ----------
    filename : str, optional
        Path to the settings file. When omitted, the first command line
        argument (``sys.argv[1]``) is used.

    Returns
    -------
    The content of the settings file as parsed by ``yaml.safe_load``.

    """
    settings_path = sys.argv[1] if filename is None else filename
    with open(settings_path) as settings_file:
        return yaml.safe_load(settings_file)
def _get_input_data_files(cfg):
    """Merge the content of all input metadata files into one dictionary.

    Every entry of ``cfg['input_files']`` is inspected: a directory
    contributes all files in it matching ``*metadata.yml``, a file is used
    only if it is named exactly ``metadata.yml``; anything else is ignored.
    """
    metadata_paths = []
    for path in cfg['input_files']:
        if os.path.isdir(path):
            pattern = os.path.join(path, '*metadata.yml')
            metadata_paths += glob.glob(pattern)
        elif os.path.basename(path) == 'metadata.yml':
            metadata_paths.append(path)

    merged = {}
    for path in metadata_paths:
        with open(path) as handle:
            content = yaml.safe_load(handle)
        # Entries from files read later replace entries with the same key.
        merged.update(content)

    return merged
@contextlib.contextmanager
def run_diagnostic():
    """Run a Python diagnostic.

    This context manager is the main entry point for most Python diagnostics.

    It parses the command line, reads the settings file, configures logging,
    loads the input metadata into ``cfg['input_data']``, prepares the output
    directories and removes a provenance file left over from a previous run
    before handing the configuration to the diagnostic.

    Raises
    ------
    FileExistsError
        If an output directory already exists and neither ``--force`` nor
        ``--ignore-existing`` was given. Nothing is created or removed in
        that case.

    Example
    -------
    See esmvaltool/diag_scripts/examples/diagnostic.py for an extensive
    example of how to start your diagnostic.

    Basic usage is as follows, add these lines at the bottom of your script::

        def main(cfg):
            # Your diagnostic code goes here.
            print(cfg)

        if __name__ == '__main__':
            with run_diagnostic() as cfg:
                main(cfg)

    The `cfg` dict passed to `main` contains the script configuration that
    can be used with the other functions in this module.

    """
    # Implemented as context manager so we can support clean up actions later
    parser = argparse.ArgumentParser(description="Diagnostic script")
    parser.add_argument('filename', help="Path to settings.yml")
    parser.add_argument(
        '-f',
        '--force',
        help=("Force emptying the output directories "
              "(useful when re-running the script)"),
        action='store_true',
    )
    parser.add_argument(
        '-i',
        '--ignore-existing',
        help=("Force running the script, even if output files exist "
              "(useful when re-running the script, use at your own risk)"),
        action='store_true',
    )
    parser.add_argument(
        '-l',
        '--log-level',
        help=("Set the log-level"),
        choices=['debug', 'info', 'warning', 'error'],
    )
    args = parser.parse_args()

    cfg = get_cfg(args.filename)

    # Set up logging; a level given on the command line wins over the
    # one from the settings file.
    if args.log_level:
        cfg['log_level'] = args.log_level

    logging.basicConfig(format="%(asctime)s [%(process)d] %(levelname)-8s "
                        "%(name)s,%(lineno)s\t%(message)s")
    logging.Formatter.converter = time.gmtime
    logging.captureWarnings(True)
    logging.getLogger().setLevel(cfg['log_level'].upper())

    # Read input metadata
    cfg['input_data'] = _get_input_data_files(cfg)

    logger.info("Starting diagnostic script %s with configuration:\n%s",
                cfg['script'], yaml.safe_dump(cfg))

    # Create output directories
    output_directories = []
    if cfg['write_netcdf']:
        output_directories.append(cfg['work_dir'])
    if cfg['write_plots']:
        output_directories.append(cfg['plot_dir'])

    existing = [p for p in output_directories if os.path.exists(p)]

    if existing:
        if args.force:
            for output_directory in existing:
                logger.info("Removing %s", output_directory)
                shutil.rmtree(output_directory)
        elif not args.ignore_existing:
            message = (
                "Script will abort to prevent accidentally overwriting your "
                "data in these directories:\n{}\n"
                "Use -f or --force to force emptying the output directories "
                "or use -i or --ignore-existing to ignore existing output "
                "directories.".format('\n'.join(existing)))
            # Passed as an argument so '%' characters in paths are not
            # interpreted as format specifiers by the logging module.
            logger.error("%s", message)
            # Abort explicitly, before any missing directory is created,
            # instead of relying on os.makedirs failing later on.
            raise FileExistsError(message)

    for output_directory in output_directories:
        if args.ignore_existing and os.path.exists(output_directory):
            continue
        logger.info("Creating %s", output_directory)
        os.makedirs(output_directory)

    # Start with a clean provenance record for this run.
    provenance_file = os.path.join(cfg['run_dir'],
                                   'diagnostic_provenance.yml')
    if os.path.exists(provenance_file):
        os.remove(provenance_file)

    yield cfg

    logger.info("End of diagnostic script run.")