# Source code for esmvalcore.esgf.facets

"""Module containing mappings from our names to ESGF names."""

import pyesgf.search

from ..config._esgf_pyclient import get_esgf_config

# For every supported project: recipe facet name -> name of the matching
# facet on ESGF. The order of the projects is the order in which
# `create_dataset_map` processes them.
FACETS = {
    "CMIP3": {
        "dataset": "model",
        "ensemble": "ensemble",
        "exp": "experiment",
        "frequency": "time_frequency",
        "short_name": "variable",
    },
    "CMIP5": {
        "dataset": "model",
        "ensemble": "ensemble",
        "exp": "experiment",
        "frequency": "time_frequency",
        "institute": "institute",
        "mip": "cmor_table",
        "product": "product",
        "short_name": "variable",
    },
    "CMIP6": {
        "activity": "activity_drs",
        "dataset": "source_id",
        "ensemble": "member_id",
        "exp": "experiment_id",
        "institute": "institution_id",
        "grid": "grid_label",
        "mip": "table_id",
        "short_name": "variable",
    },
    "CORDEX": {
        "dataset": "rcm_name",
        "driver": "driving_model",
        "domain": "domain",
        "ensemble": "ensemble",
        "exp": "experiment",
        "frequency": "time_frequency",
        "institute": "institute",
        "product": "product",
        "short_name": "variable",
    },
    "obs4MIPs": {
        "dataset": "source_id",
        "frequency": "time_frequency",
        "institute": "institute",
        "short_name": "variable",
    },
}
"""Mapping between the recipe and ESGF facet names."""

# For every project: dataset name as used in recipes and on the filesystem
# -> dataset name as used by the ESGF `dataset` facet. Only names that
# differ between the two are listed.
DATASET_MAP = {
    "CMIP3": {},
    "CMIP5": {
        "ACCESS1-0": "ACCESS1.0",
        "ACCESS1-3": "ACCESS1.3",
        "bcc-csm1-1": "BCC-CSM1.1",
        "bcc-csm1-1-m": "BCC-CSM1.1(m)",
        "CESM1-BGC": "CESM1(BGC)",
        "CESM1-CAM5": "CESM1(CAM5)",
        "CESM1-CAM5-1-FV2": "CESM1(CAM5.1,FV2)",
        "CESM1-FASTCHEM": "CESM1(FASTCHEM)",
        "CESM1-WACCM": "CESM1(WACCM)",
        "CSIRO-Mk3-6-0": "CSIRO-Mk3.6.0",
        "fio-esm": "FIO-ESM",
        "GFDL-CM2p1": "GFDL-CM2.1",
        "inmcm4": "INM-CM4",
        "MRI-AGCM3-2H": "MRI-AGCM3.2H",
        "MRI-AGCM3-2S": "MRI-AGCM3.2S",
    },
    "CMIP6": {},
    "CORDEX": {},
    "obs4MIPs": {},
}
"""Cache for the mapping between recipe/filesystem and ESGF dataset names."""



# [docs]
def create_dataset_map():
    """Build a fresh DATASET_MAP by querying the ESGF search service.

    Run `python -m esmvalcore.esgf.facets` to print an up to date map.

    For every project in ``FACETS`` the values of the project's dataset
    facet are listed, and for each value the first search result is
    fetched. When the facet value does not occur literally in the id of
    that result, the dataset name is taken from the id instead and
    recorded as an alias of the facet value. Progress is reported with
    ``print``.

    Returns
    -------
    dict
        One entry per project; each entry maps the dataset name found in
        the dataset id to the corresponding ESGF facet value.
    """
    # Position of the dataset name among the '.'-separated parts of a
    # dataset id, per project.
    name_position = {
        'CMIP3': 2,
        'CMIP5': 3,
        'CMIP6': 3,
        'CORDEX': 7,
        'obs4MIPs': 2,
    }

    # Copy the configured connection settings so that removing "urls"
    # does not modify the configuration object itself.
    connection_kwargs = dict(get_esgf_config()["search_connection"])
    urls = connection_kwargs.pop("urls")
    connection = pyesgf.search.SearchConnection(
        url=urls[0],
        **connection_kwargs,
    )

    result = {}
    for project, facet_names in FACETS.items():
        aliases = result[project] = {}
        facet = facet_names['dataset']
        context = connection.new_context(
            project=project,
            facets=[facet],
            fields=['id'],
            latest=True,
        )

        facet_values = sorted(context.facet_counts[facet])
        print(f"The following datasets are available for project {project}:")
        for facet_value in facet_values:
            print(facet_value)

        # Figure out the ESGF name of the requested dataset
        total = len(facet_values)
        for position, facet_value in enumerate(facet_values, start=1):
            print(f"Looking for dataset name of facet name"
                  f" {facet_value} ({position} of {total})")
            matches = context.search(batch_size=1, **{facet: facet_value})
            first_match = next(iter(matches))
            print(f"Dataset id: {first_match.dataset_id}")
            dataset_id = first_match.dataset_id

            if facet_value in dataset_id:
                # The facet value is used verbatim in the id: no alias.
                continue

            alias = dataset_id.split('.')[name_position[project]]
            print(f"Found dataset name '{alias}'"
                  f" for facet '{facet_value}',")
            aliases[alias] = facet_value

    return result



if __name__ == '__main__':
    # Executing this module as a script prints a freshly generated
    # DATASET_MAP.
    up_to_date_map = create_dataset_map()
    print(up_to_date_map)