Skip to content

Commit

Permalink
initial import
Browse files Browse the repository at this point in the history
  • Loading branch information
nocollier committed Sep 24, 2024
0 parents commit f7e7ac2
Show file tree
Hide file tree
Showing 9 changed files with 465 additions and 0 deletions.
69 changes: 69 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
.ruff_cache/
cover/

# Sphinx documentation
docs/_build/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

#
*.nc
64 changes: 64 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Prototype of ESM Abstraction for Use in Benchmarking

This project is meant to initiate a discussion about an abstract model interface that can be used by benchmarking packages for Earth System Models. The primary goal is to help make our packages agnostic about variable names are used, but we can also use it to help unify a lot of our distinct work.

Nothing is sacred. Names can be changed. We should strive to make it make sense for everyone.

## Intended Usage

To illustrate some simple usage, I made some dummy data in a local directory `_model`. You can run the tests with `pytest` to generate the same setup. Then inside a python interpreter, you can:

```python
m = Model("Test")
m.find_files("./_model")
```

You can also chain it all together:

```python
m = Model("Test").find_files("./_model")
```

In ILAMB, I tend to keep a color associated with each model object to make line plots consistent, but this is optional and defaults to black.

```python
m = Model("Test",color=(1,0,0)).find_files("./_model")
```

If you want to see what the object found, you can just print the variables dictionary:

```python
print(m.variables)
{
'time': ['_model/GPP.nc', '_model/ra.nc', '_model/rh.nc', '_model/tas.nc'],
'lat': ['_model/GPP.nc', '_model/ra.nc', '_model/rh.nc', '_model/tas.nc'],
'lon': ['_model/GPP.nc', '_model/ra.nc', '_model/rh.nc', '_model/tas.nc'],
'tas': ['_model/tas.nc'],
'GPP': ['_model/GPP.nc'],
'rh': ['_model/rh.nc'],
'ra': ['_model/ra.nc'],
}
```

I am not sure how familiar you are with the land model, but you will see I have a variable `GPP` in there which by CMOR standards should be `gpp`. Also, in ILAMB we have observational data for the total respiration `reco` which is the sum of the components, `ra+rh`. So for this model we would add 2 synonyms:

```python
m.add_synonym("gpp = GPP")
m.add_synonym("reco = ra + rh") # this works but will not lazy load :(
```

Once you have these registered, all of the following work:

```python
tas = m.get_variable("tas")
gpp = m.get_variable("gpp")
reco = m.get_variable("reco")
```

## Easy Extensions

I wanted to start with the simplest structure which would solve the synonym problem, but there is a lot of functionality that would be easy to layer in:

- On model initialization, we can search the variables for grid information (`areacella`, `areacello`, `sftlf`, `sftlo`, etc.) and then automatically associate these with the variables when requested via `get_variable()`.
- Provide a `from_yaml()` or `from_dict()` initializer so models can be setup in external files and tools can automatically read them.
- Add a derived model class which uses `intake-esgf` to automatically download files from ESGF.
3 changes: 3 additions & 0 deletions esmodels/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from esmodels.base_model import Model

__all__ = ["Model"]
1 change: 1 addition & 0 deletions esmodels/_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "0.0.0"
139 changes: 139 additions & 0 deletions esmodels/base_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
"""A class for abstracting the variables associated with an earth system model."""

import os
import re
from collections.abc import Sequence
from dataclasses import dataclass, field
from pathlib import Path
from typing import Self

import xarray as xr

from esmodels.exceptions import ParsingError, VarNotInModel


def free_symbols(expression: str) -> list[str]:
"""
Return the free symbols of the expression.
Parameters
----------
expression : str
The expression.
Returns
-------
list[str]
A list of the free symbols in the expression.
"""
return re.findall(r"\w+", expression)


@dataclass
class Model:

name: str
color: tuple[float] = (0.0, 0.0, 0.0)
variables: dict = field(init=False, repr=False, default_factory=dict)
synonyms: dict = field(init=False, repr=False, default_factory=dict)

def find_files(
self,
path: str | Sequence[str],
) -> Self:
"""
Find netCDF files and variables in the provided path(s).
Parameters
----------
path : str or a Sequence of str
The path(s) in which to search for netCDF files.
"""
if isinstance(path, str):
path = [path]
# Walk through all the paths given (following links)...
for file_path in path:
for root, _, files in os.walk(file_path, followlinks=True):
for filename in files:
filepath = Path(root) / Path(filename)
if not filepath.suffix == ".nc":
continue
# Open the dataset and for each variable, insert a key into the
# variables dict appending the path where it was found.
with xr.open_dataset(filepath) as ds:
for key in ds.variables.keys():
if key not in self.variables:
self.variables[key] = []
self.variables[key].append(str(filepath))
# Sort the file lists to make opening easier later
self.variables = {var: sorted(paths) for var, paths in self.variables.items()}
return self

def add_synonym(self, expression: str) -> Self:
"""
Associate variable synonyms or expressions with this model.
Parameters
----------
expression : str
An expression for the synonym of the form, `new_var = var_in_model`.
"""

assert isinstance(expression, str)
if expression.count("=") != 1:
raise ValueError("Add a synonym by providing a string of the form 'a = b'")
key, expression = expression.split("=")
# Check that the free symbols of the expression are variables in this model
for arg in free_symbols(expression):
assert arg in self.variables
self.synonyms[key.strip()] = expression.strip()
return self

def get_variable(
self,
name: str,
) -> xr.Dataset:
"""Search the model database for the specified variable.
Parameters
----------
name : str
The name of the variable we wish to load.
Returns
-------
xr.Dataset
The xarray Dataset.
"""

def _load(symbols: list[str]):
dsd = (
{
sym: (
xr.open_dataset(self.variables[sym][0])
if len(self.variables[sym]) == 1
else xr.open_mfdataset(self.variables[sym])
)[sym]
for sym in symbols
},
)
return dsd[0]

if name in self.variables:
return xr.Dataset(_load([name]))
if name in self.synonyms:
symbols = free_symbols(self.synonyms[name])
ds = _load(symbols)
expr = str(self.synonyms[name])
for sym in symbols:
expr = expr.replace(sym, f"ds['{sym}']")
try:
ds = xr.Dataset({name: eval(expr)})
except Exception as ex:
print(ex)
raise ParsingError(self.name, name, self.synonyms[name])
return ds
raise VarNotInModel(name, self.synonyms[name])
33 changes: 33 additions & 0 deletions esmodels/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Exceptions for esmodels."""

from collections.abc import Sequence


class ESMException(Exception):
pass


class VarNotInModel(ESMException):

def __init__(self, name: str, synonyms: None | str | Sequence[str] = None):
self.name = name
self.synonyms = synonyms

def __str__(self):
msg = f"The variable {self.name} "
if self.synonyms is not None:
msg += "nor its synonyms {self.synonyms} "
msg += " are present in this model."
return msg


class ParsingError(ESMException):

def __init__(self, model: str, name: str, expression: str):
self.model = model
self.name = name
self.expression = expression

def __str__(self):
msg = f"Error parsing expression: model={self.model} variable={self.name} expression='{self.expression}'"
return msg
Empty file added esmodels/tests/__init__.py
Empty file.
74 changes: 74 additions & 0 deletions esmodels/tests/test_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from collections.abc import Sequence
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr

from esmodels import Model


def generate_test_dset(seed: int = 1, ntime=None, nlat=None, nlon=None):
rs = np.random.RandomState(seed)
coords = []
dims = []
if ntime is not None:
time = pd.date_range(start="2000-01-15", periods=ntime, freq="30D")
coords.append(time)
dims.append("time")
if nlat is not None:
lat = np.linspace(-90, 90, nlat + 1)
lat = 0.5 * (lat[1:] + lat[:-1])
coords.append(lat)
dims.append("lat")
if nlon is not None:
lon = np.linspace(-180, 180, nlon + 1)
lon = 0.5 * (lon[1:] + lon[:-1])
coords.append(lon)
dims.append("lon")
ds = xr.Dataset(
data_vars={
"da": xr.DataArray(
rs.rand(*[len(c) for c in coords]) * 1e-8, coords=coords, dims=dims
),
}
)
ds["da"].attrs["units"] = "kg m-2 s-1"
return ds


def setup_test_files(
variables: Sequence[str] = ["tas", "GPP", "rh", "ra"],
model_dir: str | Path = "_model",
):
model_dir = Path(model_dir)
model_dir.mkdir(parents=True, exist_ok=True)
for seed, var in enumerate(variables):
ds = generate_test_dset(seed=seed, ntime=10, nlat=20, nlon=40).rename_vars(
{"da": var}
)
ds.to_netcdf(model_dir / f"{var}.nc")


def test_basic():
setup_test_files()
m = Model("Test").find_files("_model")
v = m.get_variable("tas")
print(v)
assert np.isclose(v["tas"].mean(), 4.955490027173138e-09)


def test_synonyms():
setup_test_files()
m = Model("Test").find_files("_model").add_synonym("gpp = GPP")
v = m.get_variable("gpp")
print(v["gpp"].mean())
assert np.isclose(v["gpp"].mean(), 4.9892081095978596e-09)


def test_expression():
setup_test_files()
m = Model("Test").find_files("_model").add_synonym("reco = ra + rh")
v = m.get_variable("reco")
print(v["reco"].mean())
assert np.isclose(v["reco"].mean(), 9.888192545242514e-09)
Loading

0 comments on commit f7e7ac2

Please sign in to comment.