Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADD: Add cmip6/cmip5 directory based parsers #134

Merged
merged 8 commits into from
Oct 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 96 additions & 1 deletion ecgtools/parsers/cmip.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import pathlib
import traceback

import cf_xarray # noqa
import numpy as np
import xarray as xr

from ..builder import INVALID_ASSET, TRACEBACK
from .utilities import extract_attr_with_regex
from .utilities import extract_attr_with_regex, reverse_filename_format


def parse_cmip6(file):
Expand Down Expand Up @@ -86,3 +88,96 @@ def parse_cmip6(file):

except Exception:
return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}


def parse_cmip6_using_directories(file):
    """
    Extract attributes of a file using information from the CMIP6 DRS.

    The file name is matched against the CMIP6 file-name templates to recover
    ``variable_id``, ``table_id``, ``source_id``, ``experiment_id``,
    ``member_id``, ``grid_label`` and (when present) ``time_range``. The
    directory tree is then used to fill in ``activity_id``,
    ``institution_id``, ``grid_label`` (overriding the file-name value) and
    ``version``.

    References
    ----------
    CMIP6 DRS: http://goo.gl/v1drZl
    Controlled Vocabularies (CVs) for use in CMIP6: https://github.com/WCRP-CMIP/CMIP6_CVs

    Directory structure =
        <mip_era>/
            <activity_id>/
                <institution_id>/
                    <source_id>/
                        <experiment_id>/
                            <member_id>/
                                <table_id>/
                                    <variable_id>/
                                        <grid_label>/
                                            <version>

    file name = <variable_id>_<table_id>_<source_id>_<experiment_id>_<member_id>_<grid_label>[_<time_range>].nc
    For time-invariant fields, the last segment (time_range) above is omitted.
    Example when there is no sub-experiment: tas_Amon_GFDL-CM4_historical_r1i1p1f1_gn_196001-199912.nc
    Example with a sub-experiment: pr_day_CNRM-CM6-1_dcppA-hindcast_s1960-r2i1p1f1_gn_198001-198412.nc

    Parameters
    ----------
    file : str or pathlib.Path
        Path of the asset to parse.

    Returns
    -------
    dict
        The parsed attributes, including ``path`` (the ``file`` argument as
        given) and ``dcpp_init_year`` (the start year as a float when
        ``member_id`` begins with ``s``, otherwise NaN). If anything goes
        wrong, a dict holding ``INVALID_ASSET`` and ``TRACEBACK`` is
        returned instead.
    """
    filename_template = '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}_{time_range}.nc'

    gridspec_template = (
        '{variable_id}_{table_id}_{source_id}_{experiment_id}_{member_id}_{grid_label}.nc'
    )
    templates = [filename_template, gridspec_template]
    try:
        # Everything lives inside the ``try`` so that any failure, including an
        # unusable ``file`` argument, is reported as an invalid asset.
        filepath = pathlib.Path(file)
        fileparts = reverse_filename_format(filepath.name, templates=templates)
        # ``as_posix`` keeps "/" as the separator on every platform; the
        # "/"-based splits below would never match a Windows-style path.
        parent = filepath.parent.as_posix()
        parent_split = parent.split(f"/{fileparts['source_id']}/")
        # Directories above <source_id>: (..., <activity_id>, <institution_id>).
        part_1 = parent_split[0].strip('/').split('/')
        # The first directory below <variable_id> is the grid label.
        grid_label = parent.split(f"/{fileparts['variable_id']}/")[1].strip('/').split('/')[0]
        fileparts['grid_label'] = grid_label
        fileparts['activity_id'] = part_1[-2]
        fileparts['institution_id'] = part_1[-1]
        version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'
        version = extract_attr_with_regex(parent, regex=version_regex) or 'v0'
        fileparts['version'] = version
        fileparts['path'] = file
        if fileparts['member_id'].startswith('s'):
            # Sub-experiment member ids look like "s1960-r2i1p1f1": split off
            # the start year and keep only the trailing variant label.
            fileparts['dcpp_init_year'] = float(fileparts['member_id'].split('-')[0][1:])
            fileparts['member_id'] = fileparts['member_id'].split('-')[-1]
        else:
            fileparts['dcpp_init_year'] = np.nan

    except Exception:
        return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}

    return fileparts


def parse_cmip5_using_directories(file):
    """Extract attributes of a file using information from CMIP5 DRS.

    The file name is matched against the CMIP5 file-name templates to recover
    ``variable``, ``mip_table``, ``model``, ``experiment``,
    ``ensemble_member`` and (when present) ``temporal_subset``. The full path
    is searched for ``frequency``, ``modeling_realm`` and ``version``
    (defaulting to ``'v0'``), and the directories above the experiment
    directory supply ``institute`` and ``product_id``.

    Parameters
    ----------
    file : str or pathlib.Path
        Path of the asset to parse.

    Returns
    -------
    dict
        The parsed attributes, including ``path`` (the ``file`` argument as
        given). If anything goes wrong, a dict holding ``INVALID_ASSET`` and
        ``TRACEBACK`` is returned instead.

    Notes
    -----
    Reference:
    - CMIP5 DRS: https://pcmdi.llnl.gov/mips/cmip5/docs/cmip5_data_reference_syntax.pdf?id=27
    """

    freq_regex = r'/3hr/|/6hr/|/day/|/fx/|/mon/|/monClim/|/subhr/|/yr/'
    # Regex alternatives are tried left to right, so ``landIce`` must come
    # before ``land``; otherwise a ``landIce`` path is reported as ``land``.
    realm_regex = r'aerosol|atmos|landIce|land|ocean|ocnBgchem|seaIce'
    version_regex = r'v\d{4}\d{2}\d{2}|v\d{1}'

    filename_template = (
        '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}_{temporal_subset}.nc'
    )
    gridspec_template = '{variable}_{mip_table}_{model}_{experiment}_{ensemble_member}.nc'

    templates = [filename_template, gridspec_template]
    try:
        # Normalise to a "/"-separated string first: this accepts
        # ``pathlib.Path`` arguments (as the CMIP6 directory parser does) and
        # keeps the "/"-based patterns working on every platform.
        filepath = pathlib.Path(file)
        full_path = filepath.as_posix()

        fileparts = reverse_filename_format(filepath.name, templates)
        fileparts['frequency'] = extract_attr_with_regex(
            full_path, regex=freq_regex, strip_chars='/'
        )
        fileparts['modeling_realm'] = extract_attr_with_regex(full_path, regex=realm_regex)
        fileparts['version'] = extract_attr_with_regex(full_path, regex=version_regex) or 'v0'
        fileparts['path'] = file

        # Split on the LAST occurrence of the experiment name so that a root
        # directory which also contains it does not break the parse.
        leading, _ = filepath.parent.as_posix().rsplit(fileparts['experiment'], 1)
        # Directories above <experiment>: (..., <product>, <institute>, <model>).
        leading_dirs = leading.strip('/').split('/')
        fileparts['institute'] = leading_dirs[-2]
        fileparts['product_id'] = leading_dirs[-3]
    except Exception:
        return {INVALID_ASSET: file, TRACEBACK: traceback.format_exc()}

    return fileparts
22 changes: 22 additions & 0 deletions ecgtools/parsers/utilities.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import re

from intake.source.utils import reverse_format


def extract_attr_with_regex(
input_str: str, regex: str, strip_chars: str = None, ignore_case: bool = True
Expand All @@ -16,3 +18,23 @@ def extract_attr_with_regex(
return match
else:
return None


def reverse_filename_format(filename, templates):
    """
    Recover the template fields encoded in ``filename``.

    Each entry of ``templates`` is tried in order with intake's
    ``reverse_format`` utility, which looks for the arguments that satisfy
    ``template.format(**arguments) == filename``. The first non-empty result
    is returned. Templates that raise ``ValueError`` are skipped. When no
    template yields a result, a message is printed and an empty mapping is
    returned.
    """
    parsed = {}

    for pattern in templates:
        try:
            parsed = reverse_format(pattern, filename)
        except ValueError:
            # This template does not fit the file name; try the next one.
            continue
        if parsed:
            return parsed

    print(f'Failed to parse file: {filename} using patterns: {templates}')
    return parsed
39 changes: 38 additions & 1 deletion tests/parsers/test_cmip.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import pytest

from ecgtools.parsers.cmip import parse_cmip6
from ecgtools.parsers.cmip import (
parse_cmip5_using_directories,
parse_cmip6,
parse_cmip6_using_directories,
)


@pytest.mark.parametrize(
Expand All @@ -18,3 +22,36 @@ def test_parse_cmip6(sample_data_directory, file_path):
assert entry['grid_label'] == 'gn'
assert entry['table_id'] == 'Amon'
assert entry['variable_id'] == 'tasmax'


@pytest.mark.parametrize(
    'file_path',
    [
        'cmip/CMIP6/CMIP/BCC/BCC-ESM1/piControl/r1i1p1f1/Amon/tasmax/gn/v20181214/tasmax/tasmax_Amon_BCC-ESM1_piControl_r1i1p1f1_gn_185001-230012.nc'
    ],
)
def test_parse_cmip6_using_directories(sample_data_directory, file_path):
    """Check the attributes the CMIP6 directory parser returns for a sample path."""
    parsed = parse_cmip6_using_directories(sample_data_directory / file_path)

    # The required keys must be present before individual values are checked.
    assert {'activity_id', 'variable_id', 'table_id'} <= set(parsed)

    expected = {
        'experiment_id': 'piControl',
        'member_id': 'r1i1p1f1',
        'grid_label': 'gn',
        'table_id': 'Amon',
        'variable_id': 'tasmax',
    }
    for key, value in expected.items():
        assert parsed[key] == value


@pytest.mark.parametrize(
    'file_path',
    [
        'cmip/cmip5/output1/CCCma/CanESM2/esmHistorical/mon/ocnBgchem/Omon/r1i1p1/v20111027/fgco2/fgco2_Omon_CanESM2_esmHistorical_r1i1p1_185001-200512.nc'
    ],
)
def test_parse_cmip5_using_directories(sample_data_directory, file_path):
    """Check the attributes the CMIP5 directory parser returns for a sample path."""
    # The parser is handed a plain string rather than a Path object.
    parsed = parse_cmip5_using_directories(str(sample_data_directory / file_path))

    # The required keys must be present before individual values are checked.
    assert {'model', 'variable', 'mip_table'} <= set(parsed)

    expected = {
        'experiment': 'esmHistorical',
        'ensemble_member': 'r1i1p1',
        'mip_table': 'Omon',
        'variable': 'fgco2',
    }
    for key, value in expected.items():
        assert parsed[key] == value