initial import

rubisco-sfa · Sep 24, 2024 · f7e7ac2 · f7e7ac2
commit f7e7ac2
Show file tree

Hide file tree

Showing 9 changed files with 465 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,69 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache/
+cover/
+
+# Sphinx documentation
+docs/_build/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+#
+*.nc
diff --git a/README.md b/README.md
@@ -0,0 +1,64 @@
+# Prototype of ESM Abstraction for Use in Benchmarking
+
+This project is meant to initiate a discussion about an abstract model interface that can be used by benchmarking packages for Earth System Models. The primary goal is to help make our packages agnostic about variable names are used, but we can also use it to help unify a lot of our distinct work.
+
+Nothing is sacred. Names can be changed. We should strive to make it make sense for everyone.
+
+## Intended Usage
+
+To illustrate some simple usage, I made some dummy data in a local directory `_model`. You can run the tests with `pytest` to generate the same setup. Then inside a python interpreter, you can:
+
+```python
+m = Model("Test")
+m.find_files("./_model")
+```
+
+You can also chain it all together:
+
+```python
+m = Model("Test").find_files("./_model")
+```
+
+In ILAMB, I tend to keep a color associated with each model object to make line plots consistent, but this is optional and defaults to black.
+
+```python
+m = Model("Test",color=(1,0,0)).find_files("./_model")
+```
+
+If you want to see what the object found, you can just print the variables dictionary:
+
+```python
+print(m.variables)
+{
+  'time': ['_model/GPP.nc', '_model/ra.nc', '_model/rh.nc', '_model/tas.nc'],
+  'lat': ['_model/GPP.nc', '_model/ra.nc', '_model/rh.nc', '_model/tas.nc'],
+  'lon': ['_model/GPP.nc', '_model/ra.nc', '_model/rh.nc', '_model/tas.nc'],
+  'tas': ['_model/tas.nc'],
+  'GPP': ['_model/GPP.nc'],
+  'rh': ['_model/rh.nc'],
+  'ra': ['_model/ra.nc'],
+}
+```
+
+I am not sure how familiar you are with the land model, but you will see I have a variable `GPP` in there which by CMOR standards should be `gpp`. Also, in ILAMB we have observational data for the total respiration `reco` which is the sum of the components, `ra+rh`. So for this model we would add 2 synonyms:
+
+```python
+m.add_synonym("gpp = GPP")
+m.add_synonym("reco = ra + rh") # this works but will not lazy load :(
+```
+
+Once you have these registered, all of the following work:
+
+```python
+tas = m.get_variable("tas")
+gpp = m.get_variable("gpp")
+reco = m.get_variable("reco")
+```
+
+## Easy Extensions
+
+I wanted to start with the simplest structure which would solve the synonym problem, but there is a lot of functionality that would be easy to layer in:
+
+- On model initialization, we can search the variables for grid information (`areacella`, `areacello`, `sftlf`, `sftlo`, etc.) and then automatically associate these with the variables when requested via `get_variable()`.
+- Provide a `from_yaml()` or `from_dict()` initializer so models can be setup in external files and tools can automatically read them.
+- Add a derived model class which uses `intake-esgf` to automatically download files from ESGF.
diff --git a/esmodels/__init__.py b/esmodels/__init__.py
@@ -0,0 +1,3 @@
+from esmodels.base_model import Model
+
+__all__ = ["Model"]
diff --git a/esmodels/_version.py b/esmodels/_version.py
@@ -0,0 +1 @@
+__version__ = "0.0.0"
diff --git a/esmodels/base_model.py b/esmodels/base_model.py
@@ -0,0 +1,139 @@
+"""A class for abstracting the variables associated with an earth system model."""
+
+import os
+import re
+from collections.abc import Sequence
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Self
+
+import xarray as xr
+
+from esmodels.exceptions import ParsingError, VarNotInModel
+
+
+def free_symbols(expression: str) -> list[str]:
+    """
+    Return the free symbols of the expression.
+
+    Parameters
+    ----------
+    expression : str
+        The expression.
+
+    Returns
+    -------
+    list[str]
+        A list of the free symbols in the expression.
+
+    """
+    return re.findall(r"\w+", expression)
+
+
+@dataclass
+class Model:
+
+    name: str
+    color: tuple[float] = (0.0, 0.0, 0.0)
+    variables: dict = field(init=False, repr=False, default_factory=dict)
+    synonyms: dict = field(init=False, repr=False, default_factory=dict)
+
+    def find_files(
+        self,
+        path: str | Sequence[str],
+    ) -> Self:
+        """
+        Find netCDF files and variables in the provided path(s).
+
+        Parameters
+        ----------
+        path : str or a Sequence of str
+            The path(s) in which to search for netCDF files.
+
+        """
+        if isinstance(path, str):
+            path = [path]
+        # Walk through all the paths given (following links)...
+        for file_path in path:
+            for root, _, files in os.walk(file_path, followlinks=True):
+                for filename in files:
+                    filepath = Path(root) / Path(filename)
+                    if not filepath.suffix == ".nc":
+                        continue
+                    # Open the dataset and for each variable, insert a key into the
+                    # variables dict appending the path where it was found.
+                    with xr.open_dataset(filepath) as ds:
+                        for key in ds.variables.keys():
+                            if key not in self.variables:
+                                self.variables[key] = []
+                            self.variables[key].append(str(filepath))
+        # Sort the file lists to make opening easier later
+        self.variables = {var: sorted(paths) for var, paths in self.variables.items()}
+        return self
+
+    def add_synonym(self, expression: str) -> Self:
+        """
+        Associate variable synonyms or expressions with this model.
+
+        Parameters
+        ----------
+        expression : str
+            An expression for the synonym of the form, `new_var = var_in_model`.
+
+        """
+
+        assert isinstance(expression, str)
+        if expression.count("=") != 1:
+            raise ValueError("Add a synonym by providing a string of the form 'a = b'")
+        key, expression = expression.split("=")
+        # Check that the free symbols of the expression are variables in this model
+        for arg in free_symbols(expression):
+            assert arg in self.variables
+        self.synonyms[key.strip()] = expression.strip()
+        return self
+
+    def get_variable(
+        self,
+        name: str,
+    ) -> xr.Dataset:
+        """Search the model database for the specified variable.
+
+        Parameters
+        ----------
+        name : str
+            The name of the variable we wish to load.
+
+        Returns
+        -------
+        xr.Dataset
+            The xarray Dataset.
+        """
+
+        def _load(symbols: list[str]):
+            dsd = (
+                {
+                    sym: (
+                        xr.open_dataset(self.variables[sym][0])
+                        if len(self.variables[sym]) == 1
+                        else xr.open_mfdataset(self.variables[sym])
+                    )[sym]
+                    for sym in symbols
+                },
+            )
+            return dsd[0]
+
+        if name in self.variables:
+            return xr.Dataset(_load([name]))
+        if name in self.synonyms:
+            symbols = free_symbols(self.synonyms[name])
+            ds = _load(symbols)
+            expr = str(self.synonyms[name])
+            for sym in symbols:
+                expr = expr.replace(sym, f"ds['{sym}']")
+            try:
+                ds = xr.Dataset({name: eval(expr)})
+            except Exception as ex:
+                print(ex)
+                raise ParsingError(self.name, name, self.synonyms[name])
+            return ds
+        raise VarNotInModel(name, self.synonyms[name])
diff --git a/esmodels/exceptions.py b/esmodels/exceptions.py
@@ -0,0 +1,33 @@
+"""Exceptions for esmodels."""
+
+from collections.abc import Sequence
+
+
+class ESMException(Exception):
+    pass
+
+
+class VarNotInModel(ESMException):
+
+    def __init__(self, name: str, synonyms: None | str | Sequence[str] = None):
+        self.name = name
+        self.synonyms = synonyms
+
+    def __str__(self):
+        msg = f"The variable {self.name} "
+        if self.synonyms is not None:
+            msg += "nor its synonyms {self.synonyms} "
+        msg += " are present in this model."
+        return msg
+
+
+class ParsingError(ESMException):
+
+    def __init__(self, model: str, name: str, expression: str):
+        self.model = model
+        self.name = name
+        self.expression = expression
+
+    def __str__(self):
+        msg = f"Error parsing expression: model={self.model} variable={self.name} expression='{self.expression}'"
+        return msg
diff --git a/esmodels/tests/__init__.py b/esmodels/tests/__init__.py
diff --git a/esmodels/tests/test_basic.py b/esmodels/tests/test_basic.py
@@ -0,0 +1,74 @@
+from collections.abc import Sequence
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import xarray as xr
+
+from esmodels import Model
+
+
+def generate_test_dset(seed: int = 1, ntime=None, nlat=None, nlon=None):
+    rs = np.random.RandomState(seed)
+    coords = []
+    dims = []
+    if ntime is not None:
+        time = pd.date_range(start="2000-01-15", periods=ntime, freq="30D")
+        coords.append(time)
+        dims.append("time")
+    if nlat is not None:
+        lat = np.linspace(-90, 90, nlat + 1)
+        lat = 0.5 * (lat[1:] + lat[:-1])
+        coords.append(lat)
+        dims.append("lat")
+    if nlon is not None:
+        lon = np.linspace(-180, 180, nlon + 1)
+        lon = 0.5 * (lon[1:] + lon[:-1])
+        coords.append(lon)
+        dims.append("lon")
+    ds = xr.Dataset(
+        data_vars={
+            "da": xr.DataArray(
+                rs.rand(*[len(c) for c in coords]) * 1e-8, coords=coords, dims=dims
+            ),
+        }
+    )
+    ds["da"].attrs["units"] = "kg m-2 s-1"
+    return ds
+
+
+def setup_test_files(
+    variables: Sequence[str] = ["tas", "GPP", "rh", "ra"],
+    model_dir: str | Path = "_model",
+):
+    model_dir = Path(model_dir)
+    model_dir.mkdir(parents=True, exist_ok=True)
+    for seed, var in enumerate(variables):
+        ds = generate_test_dset(seed=seed, ntime=10, nlat=20, nlon=40).rename_vars(
+            {"da": var}
+        )
+        ds.to_netcdf(model_dir / f"{var}.nc")
+
+
+def test_basic():
+    setup_test_files()
+    m = Model("Test").find_files("_model")
+    v = m.get_variable("tas")
+    print(v)
+    assert np.isclose(v["tas"].mean(), 4.955490027173138e-09)
+
+
+def test_synonyms():
+    setup_test_files()
+    m = Model("Test").find_files("_model").add_synonym("gpp = GPP")
+    v = m.get_variable("gpp")
+    print(v["gpp"].mean())
+    assert np.isclose(v["gpp"].mean(), 4.9892081095978596e-09)
+
+
+def test_expression():
+    setup_test_files()
+    m = Model("Test").find_files("_model").add_synonym("reco = ra + rh")
+    v = m.get_variable("reco")
+    print(v["reco"].mean())
+    assert np.isclose(v["reco"].mean(), 9.888192545242514e-09)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from esmodels.base_model import Model

		__all__ = ["Model"]