Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make data loading more efficient by opening each source file once only #712

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions docs/source/using_the_ve/data/data.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,12 +105,16 @@ two methods:
{class}`~virtual_ecosystem.core.data.Data` instance just using the standard
dictionary assignment: ``data['var_name'] = data_array``. The Virtual Ecosystem
{mod}`~virtual_ecosystem.core.readers` module provides the
function {func}`~virtual_ecosystem.core.readers.load_to_dataarray` to read data into
a DataArray from supported file formats. This can then be added directly to a Data
instance:
function {func}`~virtual_ecosystem.core.readers.load_to_dataarray` to read a list of
variables from a file in a supported format into DataArrays. The returned value is a
dictionary of DataArrays keyed by variable name; each entry can then be added
directly to a Data instance:

```{code-block} ipython3
data["var_name"] = load_to_dataarray("path/to/file.nc", var_name="temperature")
loaded_data = load_to_dataarray("path/to/file.nc", var_names=["temperature"])
# iterate over the dictionary of variable names and arrays
for var_name, data_array in loaded_data.items():
data[var_name] = data_array
```

1. The {meth}`~virtual_ecosystem.core.data.Data.load_data_config` method takes a
Expand Down Expand Up @@ -186,7 +190,11 @@ configured grid.
```{code-cell} ipython3
# Load data from a file
file_path = Path("../../data/xy_dim.nc")
data["temp"] = load_to_dataarray(file_path, var_name="temp")
loaded_data = load_to_dataarray(file_path, var_names=["temp"])

# iterate over the dictionary of variable names and arrays
for var_name, data_array in loaded_data.items():
data[var_name] = data_array
```

```{code-cell} ipython3
Expand Down
65 changes: 32 additions & 33 deletions tests/core/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,27 +229,27 @@ def test_Data_contains(fixture_data, var_name, expected):


@pytest.mark.parametrize(
argnames=["name", "exp_log"],
argnames=["var_names", "exp_log"],
argvalues=[
pytest.param(
"temp",
["temp"],
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
id="simple_load",
),
pytest.param(
"elev",
["elev"],
(
(INFO, "Loading variable 'elev' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Replacing data array for 'elev'"),
),
id="load_and_replace",
),
],
)
def test_Data_load_to_dataarray_naming(caplog, shared_datadir, name, exp_log):
def test_Data_load_to_dataarray_naming(caplog, shared_datadir, var_names, exp_log):
"""Test the coding of the name handling and replacement."""

# Set up a Data instance to match the example files generated in tests/core/data
Expand All @@ -258,6 +258,8 @@ def test_Data_load_to_dataarray_naming(caplog, shared_datadir, name, exp_log):
from virtual_ecosystem.core.grid import Grid
from virtual_ecosystem.core.readers import load_to_dataarray

caplog.clear()

grid = Grid(
grid_type="square",
cell_nx=10,
Expand All @@ -275,11 +277,14 @@ def test_Data_load_to_dataarray_naming(caplog, shared_datadir, name, exp_log):
# Load the data from file
datafile = shared_datadir / "cellid_coords.nc"

data[name] = load_to_dataarray(file=datafile, var_name=name)
results = load_to_dataarray(file=datafile, var_names=var_names)
for ky, val in results.items():
data[ky] = val

# Check the naming has worked and the data are loaded
assert name in data
assert data[name].sum() == (20 * 100)
for name in var_names:
# Check the naming has worked and the data are loaded
assert name in data
assert data[name].sum() == (20 * 100)

# Check the error reports
log_check(caplog, exp_log)
Expand Down Expand Up @@ -328,7 +333,7 @@ def fixture_load_data_grids(request):
does_not_raise(),
None,
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
20 * 100,
Expand All @@ -340,7 +345,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Grid defines 100 cells, data provides 60",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Grid defines 100 cells, data provides 60"),
),
Expand All @@ -353,7 +358,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Grid defines 100 cells, data provides 200",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Grid defines 100 cells, data provides 200"),
),
Expand All @@ -366,7 +371,7 @@ def fixture_load_data_grids(request):
does_not_raise(),
None,
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
20 * 100,
Expand All @@ -378,7 +383,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"The data cell ids do not provide a one-to-one map onto grid cell ids.",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(
CRITICAL,
Expand All @@ -395,7 +400,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"The data cell ids do not provide a one-to-one map onto grid cell ids.",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(
CRITICAL,
Expand All @@ -412,7 +417,7 @@ def fixture_load_data_grids(request):
does_not_raise(),
None,
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
20 * 100,
Expand All @@ -424,7 +429,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Data XY dimensions do not match square grid",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Data XY dimensions do not match square grid"),
),
Expand All @@ -437,7 +442,7 @@ def fixture_load_data_grids(request):
does_not_raise(),
None,
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
),
20 * 100,
Expand All @@ -449,7 +454,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Mapped points do not cover all cells.",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Mapped points do not cover all cells."),
),
Expand All @@ -462,7 +467,7 @@ def fixture_load_data_grids(request):
pytest.raises(ValueError),
"Mapped points fall outside grid.",
(
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(CRITICAL, "Mapped points fall outside grid."),
),
Expand Down Expand Up @@ -499,6 +504,8 @@ def test_Data_load_to_dataarray_data_handling(
from virtual_ecosystem.core.data import Data
from virtual_ecosystem.core.readers import load_to_dataarray

caplog.clear()

# Skip combinations where the validator does not support this grid
if not (
("__any__" in supported_grids)
Expand All @@ -510,7 +517,8 @@ def test_Data_load_to_dataarray_data_handling(
datafile = shared_datadir / filename

with exp_error as err:
data["temp"] = load_to_dataarray(file=datafile, var_name="temp")
results = load_to_dataarray(file=datafile, var_names=["temp"])
data["temp"] = results["temp"]

# Check the data is in fact loaded and that a simple sum of values matches
assert "temp" in data
Expand All @@ -521,8 +529,6 @@ def test_Data_load_to_dataarray_data_handling(

log_check(caplog, exp_log)

return


@pytest.mark.parametrize(
argnames=["cfg_strings", "exp_error", "exp_msg", "exp_log"],
Expand All @@ -546,13 +552,10 @@ def test_Data_load_to_dataarray_data_handling(
None,
(
(INFO, "Loading data from configuration"),
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(INFO, "Loading variable 'prec' from file:"),
(INFO, "Adding data array for 'prec'"),
(INFO, "Loading variable 'elev' from file:"),
(INFO, "Adding data array for 'elev'"),
(INFO, "Loading variable 'vapd' from file:"),
(INFO, "Adding data array for 'vapd'"),
),
id="valid config",
Expand Down Expand Up @@ -587,14 +590,10 @@ def test_Data_load_to_dataarray_data_handling(
(
(INFO, "Loading data from configuration"),
(ERROR, "Duplicate variable names in data configuration"),
(INFO, "Loading variable 'temp' from file:"),
(INFO, "Loading variables from file"),
(INFO, "Adding data array for 'temp'"),
(INFO, "Loading variable 'prec' from file:"),
(INFO, "Adding data array for 'prec'"),
(INFO, "Loading variable 'elev' from file:"),
(INFO, "Adding data array for 'elev'"),
(INFO, "Loading variable 'elev' from file:"),
(INFO, "Replacing data array for 'elev'"),
(CRITICAL, "Data configuration did not load cleanly - check log"),
),
id="repeated names",
Expand Down
Loading