diff --git a/notebooks/01a-griddap.ipynb b/notebooks/01a-griddap.ipynb index 833267c9..4646084f 100644 --- a/notebooks/01a-griddap.ipynb +++ b/notebooks/01a-griddap.ipynb @@ -6,8 +6,14 @@ "source": [ "# Griddap\n", "\n", - "Erddapy can access gridded datasets, using the server-side subsetting of griddap\n", - "to download only the parts of a dataset that the user requires\n" + "Erddapy can access gridded datasets,\n", + "using the server-side subsetting of griddap or the OPeNDAP response,\n", + "to download only the parts of a dataset that the user requires.\n", + "\n", + "\n", + "In our example we will use a Region of Interest (ROI) to extract data within its bounds.\n", + "First we need to read the ROI with `geopandas`.\n", + "Let's use the South Atlantic Ocean basin from Natural Earth as our ROI." ] }, { @@ -16,14 +22,26 @@ "metadata": {}, "outputs": [], "source": [ - "from erddapy import ERDDAP" + "import geopandas\n", + "import pooch\n", + "\n", + "url = \"https://naturalearth.s3.amazonaws.com/4.1.1/50m_physical/ne_50m_geography_marine_polys.zip\"\n", + "fname = pooch.retrieve(\n", + " url,\n", + " known_hash=\"db6f59e5a747c016451caec2450db6deea25d702dc2fb9c39384c1b909fb7f72\",\n", + ")\n", + "\n", + "oceans = geopandas.read_file(fname)\n", + "\n", + "name = \"South Atlantic Ocean\"\n", + "SA = oceans.loc[oceans[\"name\"] == name]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "When accessing gridded datasets, the kwarg `protocol='griddap'` must be used\n" + "When accessing gridded datasets we need to define the `protocol=\"griddap\"` in our class instantiation." 
] }, { @@ -36,21 +54,24 @@ }, "outputs": [], "source": [ + "from erddapy import ERDDAP\n", + "\n", "e = ERDDAP(\n", " server=\"CSWC\", # CoastWatch West Coast Node\n", - " protocol=\"griddap\",\n", + " protocol=\"griddap\", # Protocol for gridded datasets\n", ")\n", - "e.dataset_id = (\n", - " \"jplAvisoSshMon\" # AVISO Model Output, obs4MIPs NASA-JPL, Global, 1 Degree\n", - ")" + "\n", + "e.dataset_id = \"jplAvisoSshMon_LonPM180\" # AVISO Model Output, obs4MIPs NASA-JPL, Global, 1 Degree" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "CAVEAT: Note that ERDDAP can serve gridded data with longitudes in the 0–360 format or -180–180. You must inspect the dataset and modify your query accordingly.\n", + "\n", "Information on the griddap dataset is fetched with `griddap_initialize`. This\n", - "fills the `variables` and `constraints` properties for that dataset\n" + "fills in the `variables` and `constraints` properties for that dataset.\n" ] }, { @@ -59,10 +80,10 @@ "metadata": {}, "outputs": [], "source": [ - "e.griddap_initialize()\n", - "\n", "import json\n", "\n", + "e.griddap_initialize()\n", + "\n", "print(f\"variables in this dataset:\\n\\n{e.variables}\")\n", "print(\n", " f\"\\nconstraints of this dataset:\\n\\n{json.dumps(e.constraints, indent=1)}\"\n", @@ -77,7 +98,7 @@ "at the most recent timestep and every point of the remaining dimensions.\n", "\n", "This can result in large datasets, the values of the constraints can be changed,\n", - "and variables dropped before data set is downloaded\n" + "and variables dropped before data set is downloaded. Here we will download only one variable from that list." 
] }, { @@ -86,8 +107,16 @@ "metadata": {}, "outputs": [], "source": [ - "e.variables = e.variables[:2]\n", - "print(f\"variables for download:\\n\\n{e.variables}\")" + "e.variables = [e.variables[0]]\n", + "\n", + "print(f\"Downloading {e.variables}.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will reduce the dataset a bit further by requesting only the data that is inside the bounding box of the South Atlantic." ] }, { @@ -96,9 +125,25 @@ "metadata": {}, "outputs": [], "source": [ - "e.constraints[\"latitude_step\"] = 1\n", - "e.constraints[\"longitude_step\"] = 1\n", + "def bounds2contraints(bounds):\n", + " return {\n", + " \"longitude>=\": bounds.minx.squeeze(),\n", + " \"longitude<=\": bounds.maxx.squeeze(),\n", + " \"latitude>=\": bounds.miny.squeeze(),\n", + " \"latitude<=\": bounds.maxy.squeeze(),\n", + " }\n", + "\n", "\n", + "e.constraints.update(bounds2contraints(SA.bounds))\n", + "e.constraints" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "print(f\"\\nconstraints for download:\\n\\n{json.dumps(e.constraints, indent=1)}\")" ] }, @@ -106,8 +151,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Once the query is prepared, data can be downloaded to xarray DataSet, pandas\n", - "DataFrame and more\n" + "Once the query is prepared we can download the data into an `xarray.Dataset` object. 
(We can also download it as a `pandas.DataFrame` or `iris.Cube` object.)" ] }, { @@ -116,6 +160,23 @@ "metadata": {}, "outputs": [], "source": [ + "from urllib.parse import unquote_plus\n", + "\n", + "url = \"https://coastwatch.pfeg.noaa.gov/erddap/griddap/jplAvisoSshMon_LonPM180.nc?sshag%5B(2010-12-16T12:00:00Z):1:(2010-12-16T12:00:00Z)%5D%5B(-60.53346241642455):1:(0.03286652261984102)%5D%5B(-69.09208207871731):1:(19.63485354989288)%5D,nObs%5B(2010-12-16T12:00:00Z):1:(2010-12-16T12:00:00Z)%5D%5B(-60.53346241642455):1:(0.03286652261984102)%5D%5B(-69.09208207871731):1:(19.63485354989288)%5D\"\n", + "\n", + "unquote_plus(url)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": false }, "outputs": [], "source": [ + "%%time\n", + "\n", "ds = e.to_xarray()" ] }, @@ -133,18 +194,73 @@ "metadata": {}, "outputs": [], "source": [ - "ds[\"sshag\"].plot()" + "import cartopy.crs as ccrs\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "fig, ax = plt.subplots(subplot_kw={\"projection\": ccrs.PlateCarree()})\n", + "ds[\"sshag\"].plot(ax=ax)\n", + "ax.coastlines()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "Note that we did not extract the exact ROI but instead we downloaded what is inside its bounds.\n", + "We can refine the data selection using a region mask and download strictly what is inside the ROI.\n", + "A `regionmask` region can be created from the `geopandas` object." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import regionmask\n", + "\n", + "\n", + "region = regionmask.from_geopandas(SA, name=name)\n", + "region.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mask = region.mask(\n", + " ds,\n", + " lon_name=\"longitude\",\n", + " lat_name=\"latitude\",\n", + " method=\"pygeos\",\n", + ")\n", + "\n", + "\n", + "fig, ax = plt.subplots(subplot_kw={\"projection\": ccrs.PlateCarree()})\n", + "ds[\"sshag\"].sel(time=\"2010\").where(mask == region.numbers[0]).plot(ax=ax)\n", + "ax.coastlines()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The main difference is that now we did not load data from river plumes that are not part of the ROI. (Check the Río de la Plata area in both plots.)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Skip subsetting\n", + "### Subset after the request with OPeNDAP\n", + "\n", + "ERDDAP server-side subsetting can be avoided by specifying the OPeNDAP protocol.\n", + "This is a good choice if you intend to use a full dataset or subset it after the request.\n", "\n", - "ERDDAP server-side subsetting can be avoided by specifying the opendap protocol.\n", - "This is a good choice if you intend to use a full dataset or take several slices\n", - "from the same dataset\n" + "Note that most OPeNDAP clients will eagerly download only the coordinates, making a post-request subset almost as fast as a server-side subset." ] }, { @@ -174,16 +290,77 @@ "metadata": {}, "outputs": [], "source": [ + "%%time\n", + "\n", "ds = e.to_xarray()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a quick look at the data from 2015." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "projection = ccrs.PlateCarree()\n", + "\n", + "fig, ax = plt.subplots(subplot_kw={\"projection\": projection})\n", + "sss = ds[\"sss\"].sel(time=\"2015\")\n", + "sss.plot(ax=ax)\n", + "ax.coastlines()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "mask = region.mask(\n", + " ds,\n", + " lon_name=\"longitude\",\n", + " lat_name=\"latitude\",\n", + " method=\"pygeos\",\n", + ")\n", + "\n", + "\n", + "fig, ax = plt.subplots(subplot_kw={\"projection\": ccrs.PlateCarree()})\n", + "ds[\"sss\"].sel(time=\"2015\").where(mask == region.numbers[0]).plot(ax=ax)\n", + "ax.coastlines()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Thanks to `regionmask` we can go a few steps further with the ROI and compute statistics.\n", + "Let's calculate the mean salinity time-series for the South Atlantic and the mean spatial salinity for the time coverage." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sasal = ds.groupby(mask).mean()\n", + "sasal[\"sss\"].plot(marker=\"o\")" + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "ds" + "sasal = ds.groupby(mask).mean(dim=\"time\")\n", + "sasal[\"sss\"].plot()" ] } ], @@ -203,7 +380,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.6" + "version": "3.10.4" } }, "nbformat": 4, diff --git a/requirements-dev.txt b/requirements-dev.txt index 7eb67cb7..04e42307 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,6 +8,7 @@ flake8-builtins flake8-comprehensions flake8-mutable flake8-print +geopandas interrogate isort joblib @@ -16,14 +17,17 @@ mypy nbsphinx netcdf4 pendulum>=2.0.1 +pooch pre-commit pydocstyle +pygeos pylint pytest pytest-cov pytest-flake8 pytest-sugar pytest-vcr +regionmask scitools-iris>=3 setuptools_scm sphinx