This repository has been archived by the owner on May 12, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 113
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' into CLIMATE-931
- Loading branch information
Showing
11 changed files
with
535 additions
and
143 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
import os | ||
import subprocess | ||
import jinja2 | ||
from metadata_extractor import CORDEXMetadataExtractor, obs4MIPSMetadataExtractor | ||
|
||
# Driver script: pairs obs4MIPs observations with CORDEX model output by
# variable, renders one RCMES configuration file per (group, season) and runs
# ../run_RCMES.py on each of them.

# These should be modified. TODO: domains can also be made into separate group
# CORDEX domain
domain = 'NAM-44'

# The output directory
workdir = '/home/goodman/data_processing/CORDEX/analysis'

# Location of obs4MIPs files
obs_dir = '/proj3/data/obs4mips'

# Location of CORDEX files
models_dir = '/proj3/data/CORDEX/{domain}/*'.format(domain=domain)

# Extract metadata from model and obs files, pairing up files with the same
# variables for separate evaluations. Each group is a tuple of
# (observation metadata, list of matching model metadata).
obs_extractor = obs4MIPSMetadataExtractor(obs_dir)
models_extractor = CORDEXMetadataExtractor(models_dir)
groups = obs_extractor.group(models_extractor, 'variable')

# Configuration file template, to be rendered repeatedly for each evaluation
# run
env = jinja2.Environment(loader=jinja2.FileSystemLoader('./templates'),
                         trim_blocks=True, lstrip_blocks=True)
t = env.get_template('CORDEX.yaml.template')

# Each group represents a single evaluation. Repeat the evaluation for
# three seasons: Summer, Winter, and Annual.
seasons = ['annual', 'winter', 'summer']

# Config files whose run returned a non-zero exit code. This must be created
# once, before the loops: re-creating it after every run would discard all
# previously recorded failures and the final report would only ever reflect
# the last run.
errored = []

for group in groups:
    obs_info, models_info = group
    instrument = obs_info['instrument']
    variable = obs_info['variable']
    for season in seasons:
        configfile_basename = '_'.join([domain, instrument, variable, season]) + '.yaml'
        configfile_dir = os.path.join(workdir, domain, instrument,
                                      variable, season)
        if not os.path.exists(configfile_dir):
            os.makedirs(configfile_dir)
        configfile_path = os.path.join(configfile_dir, configfile_basename)
        with open(configfile_path, 'w') as configfile:
            configfile.write(t.render(obs_info=obs_info, models_info=models_info,
                                      season=season, output_dir=workdir))

        # TODO: Do this in parallel. Will change this once this approach
        # is well tested.
        code = subprocess.call(['python', '../run_RCMES.py', configfile_path])
        if code:
            errored.append(configfile_path)

print("All runs done. The following ended with an error: {}".format(errored))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,245 @@ | ||
import glob | ||
import os | ||
|
||
|
||
class MetadataExtractor(object):
    """Extracts metadata from data filenames.

    Instances of MetadataExtractor are used to extract metadata from
    filenames in bulk. Example usage:

    >>> extractor = MetadataExtractor('/path/to/data')

    Suppose the data in this directory had the following files:
    pr_*.nc, uas_*.nc, vas_*.nc

    All of the metadata lies in the data attribute:

    >>> extractor.data
    [{'filename': /path/to/data/pr_*.nc, 'variable': 'pr'},
     {'filename': /path/to/data/vas_*.nc, 'variable': 'vas'},
     {'filename': /path/to/data/uas_*.nc, 'variable': 'uas'}]

    Results can be narrowed down by specifying values for a field:

    >>> extractor.query(variable='pr')
    [{'filename': /path/to/data/pr_*.nc, 'variable': 'pr'}]

    Finally, metadata from two sets of extractors can be grouped together
    based on common field name as follows:

    >>> extractor.group(extractor2, 'variable')

    This class should only be used as a starting point. We recommend using
    the included obs4MIPSMetadataExtractor and CORDEXMetadataExtractor
    subclasses or creating your own subclass for your usecase.
    """

    def __init__(self, *paths):
        """Store the search paths and extract the metadata immediately.

        :param paths: One or more directories searched for ``*.nc`` files.
            Each path is passed through ``glob``, so wildcards are allowed.
        """
        # Assigning through the property setter triggers the extraction.
        self.paths = paths

    @property
    def data(self):
        """
        The extracted metadata for each file: a list of dicts holding the
        'filename' pattern plus one entry per extracted field.
        """
        return self._data

    @property
    def paths(self):
        """
        Search paths containing the dataset files.
        """
        return self._paths

    @paths.setter
    def paths(self, paths):
        """
        Extracts the metadata from scratch when paths are reset.
        """
        self._paths = paths
        self._extract()

    @property
    def fields(self):
        """
        The names of the fields in the filename, assuming the fully filtered
        filename conforms to the following convention:
        filename = <field[0]>_<field[1]>_..._<field[n]>.nc. Using fewer fields
        than the filename defines is allowed.
        """
        fields = ['variable']
        return fields

    @property
    def files(self):
        """
        Sorted list of unique file patterns (as produced by get_pattern()),
        one per dataset found under the search paths.
        """
        patterns = set()
        for path in self.paths:
            for fname in glob.glob(os.path.join(path, '*.nc')):
                patterns.add(self.get_pattern(fname))
        # Sorting makes the order of `files` (and therefore of `data`)
        # reproducible between runs; a bare set has arbitrary order.
        return sorted(patterns)

    @property
    def variables(self):
        """
        Get the set of variables included across all the datasets.
        """
        return self.get_field('variable')

    @property
    def field_filters(self):
        """
        Override this to filter out specific characters contained in a field.
        Expected shape (see _match_filter()): a dict mapping a field name to
        a list of substrings that are removed before values are compared.
        """
        return dict()

    def query(self, **kwargs):
        """
        Narrow down the list of files by field names.

        Each keyword is a field name; its value is either a single accepted
        value or a list/tuple/set of accepted values. A file is kept only if
        it matches every keyword (after field filters are applied).

        :raises ValueError: If a keyword is not one of self.fields.
        """
        invalid = set(kwargs).difference(self.fields)
        if invalid:
            raise ValueError("Invalid fields: {}. Must be subset of: {}"
                             .format(sorted(invalid), self.fields))
        data = self.data
        for field, value in kwargs.items():
            if isinstance(value, (list, tuple, set, frozenset)):
                accepted = value
            else:
                accepted = [value]
            data = [meta for meta in data
                    if self._match_filter(meta, field) in accepted]
        return data

    def group(self, extractor, field):
        """
        Compare the data of this extractor with another extractor instance
        and group each of their metadata together by given field.

        :returns: A list of (meta, matches) tuples, where meta is an entry of
            this extractor's data and matches is the list of entries of the
            other extractor sharing the same (filtered) field value.
        :raises ValueError: If field is not valid for either extractor.
        """
        # get_field() is called for its validation of the field name.
        self.get_field(field)
        extractor.get_field(field)

        # First we only want to consider values of field which are contained
        # in both extractors. The comparison is done on *filtered* values,
        # exactly like query() does, so that a value which only matches once
        # its filter patterns are stripped is not dropped here.
        subset = set(self._match_filter(meta, field) for meta in self.data)
        other_subset = set(extractor._match_filter(meta, field)
                           for meta in extractor.data)
        intersection = list(subset.intersection(other_subset))

        # Next we will group the datasets in each extractor together by common
        # field values
        results = self.query(**{field: intersection})

        groups = []
        for meta in results:
            val = self._match_filter(meta, field)
            match = extractor.query(**{field: val})
            groups.append((meta, match))

        return groups

    def get_field(self, field):
        """
        Returns the set of (unfiltered) values of the selected field across
        the extracted data.

        :raises ValueError: If field is not one of self.fields.
        """
        if field not in self.fields:
            raise ValueError("Invalid field: {}. Must be one of: {}"
                             .format(field, self.fields))
        sub = set(meta[field] for meta in self.data)
        return sub

    def filter_filename(self, fname):
        """
        Applies a filter to each individual filename before its fields are
        extracted, which is useful if some files within a data set are known
        to not follow conventions, and "fix" them so that they do. The base
        implementation only strips the directory part.
        """
        return os.path.basename(fname)

    def get_pattern(self, fname):
        """
        Used to group multiple file datasets together via glob patterns.
        The most common convention is to split files by time periods, which
        are generally the last field in a filename.

        Only the basename is split on '_'; the directory part is kept as is,
        so underscores in directory names do not corrupt the pattern.
        """
        dirname, basename = os.path.split(fname)
        base = basename.split('_')
        pattern = '_'.join(base[:len(self.fields)] + ['*.nc'])
        return os.path.join(dirname, pattern)

    def _match_filter(self, meta, field):
        """
        Filter (ignore) certain character patterns when matching a field.
        """
        val = meta[field]
        if field in self.field_filters:
            for pattern in self.field_filters[field]:
                val = val.replace(pattern, '')
        return val

    def _extract(self):
        """
        Do the actual metadata extraction from the file patterns listed in
        the files attribute. Each pattern is first passed through
        filter_filename() to remove unwanted characters from the extraction.
        """
        self._data = []
        for pattern in self.files:
            meta = dict(filename=pattern)

            # Perform the actual metadata extraction. The last '_'-separated
            # part (the '*.nc' wildcard) is not a field. zip() stops at the
            # shorter sequence, so surplus parts or fields are ignored.
            parts = self.filter_filename(pattern).split('_')[:-1]
            meta.update(zip(self.fields, parts))
            self._data.append(meta)
|
||
|
||
class obs4MIPSMetadataExtractor(MetadataExtractor):
    """Metadata extractor for obs4MIPs observation filenames."""

    @property
    def instruments(self):
        """
        Get the set of instruments across all the datasets.
        """
        return self.get_field('instrument')

    @property
    def fields(self):
        """
        obs4MIPs fields
        """
        fields = ['variable', 'instrument', 'processing_level', 'version']
        return fields

    @property
    def field_filters(self):
        """
        Field filters for CALIPSO: these substrings are removed from the
        'variable' field before values are compared.
        """
        return dict(variable=['calipso', 'Lidarsr532'])

    def filter_filename(self, fname):
        """
        CALIPSO files have odd naming conventions, so we will use
        a modified version to conform to standard obs4MIPs conventions:
        the directory is stripped and '_obs4MIPs_' is collapsed to '_'.
        """
        fname = os.path.basename(fname)
        fname = fname.replace('_obs4MIPs_', '_')
        return fname

    def get_pattern(self, fname):
        """
        Overridden to deal with CALIPSO filenames.

        A basename made of exactly five '_'-separated parts has its last
        part replaced by '*.nc'; any other basename has its last two parts
        replaced. Only the basename is split, so underscores in directory
        names affect neither the part count nor the resulting pattern.
        """
        dirname, basename = os.path.split(fname)
        base = basename.split('_')
        # NOTE(review): presumably the five-part case is the standard
        # obs4MIPs layout and everything else is CALIPSO - confirm.
        offset = -2 if len(base) != 5 else -1
        pattern = '_'.join(base[:offset] + ['*.nc'])
        return os.path.join(dirname, pattern)
|
||
|
||
class CORDEXMetadataExtractor(MetadataExtractor):
    """Metadata extractor for CORDEX model output filenames."""

    @property
    def models(self):
        """
        Get the set of models across all the datasets.
        """
        return self.get_field('model')

    @property
    def fields(self):
        """
        CORDEX fields, in the order they appear in the filename.
        """
        return [
            'variable',
            'domain',
            'driving_model',
            'experiment',
            'ensemble',
            'model',
            'version',
            'time_step',
        ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
{# Jinja2 template for one RCMES evaluation configuration file (YAML).
   Context variables read below: obs_info (instrument, filename, variable),
   models_info (a sequence whose entries provide domain, variable, model and
   filename), season and output_dir.
   NOTE(review): domain and variable are taken from models_info[0], so this
   presumably expects at least one model entry - confirm against the caller. #}
{% set domain = models_info[0].domain %} | ||
{% set instrument = obs_info.instrument %} | ||
{% set variable = models_info[0].variable %} | ||
{% set basename = [variable, instrument, domain, season]|join('_') %} | ||
workdir: {{ [output_dir, domain, instrument, variable, season]|join('/') }} | ||
output_netcdf_filename: {{ basename }}.nc | ||
|
||
# (RCMES will temporally subset data between month_start and month_end. | ||
# If average_each_year is True (False), seasonal mean in each year is (not) calculated and used for metrics calculation.) | ||
time: | ||
maximum_overlap_period: True | ||
temporal_resolution: monthly | ||
{# "winter" spans months 12 to 2, "summer" months 6 to 8; any other season
   value falls through to the full year (months 1 to 12). #}
{% if season == "winter" %} | ||
month_start: 12 | ||
month_end: 2 | ||
{% elif season == "summer" %} | ||
month_start: 6 | ||
month_end: 8 | ||
{% else %} | ||
month_start: 1 | ||
month_end: 12 | ||
{% endif %} | ||
average_each_year: True | ||
|
||
space: | ||
{# Only the first three characters of the domain are used here. #}
boundary_type: CORDEX {{ domain[:3] }} | ||
|
||
regrid: | ||
regrid_on_reference: True | ||
|
||
{# The observation dataset is listed first, followed by one entry per item
   of models_info. #}
datasets: | ||
- loader_name: local_split | ||
name: {{ instrument }} | ||
file_path: {{ obs_info.filename }} | ||
variable_name: {{ obs_info.variable }} | ||
{% for model_info in models_info %} | ||
- loader_name: local_split | ||
name: {{ model_info.model }} | ||
file_path: {{ model_info.filename }} | ||
variable_name: {{ model_info.variable }} | ||
lat_name: lat | ||
lon_name: lon | ||
{% endfor %} | ||
|
||
number_of_metrics_and_plots: 2 | ||
|
||
metrics1: Map_plot_bias_of_multiyear_climatology | ||
|
||
plots1: | ||
file_name: {{ basename }}_bias | ||
|
||
metrics2: Taylor_diagram_spatial_pattern_of_multiyear_climatology | ||
|
||
plots2: | ||
file_name: {{ basename }}_taylor | ||
|
||
use_subregions: False |
Oops, something went wrong.