From c68646ca1e0a15a2783375ec0adbe94e86e660bd Mon Sep 17 00:00:00 2001 From: Aaron Spring <aaron.spring@mpimet.mpg.de> Date: Tue, 8 Jun 2021 07:55:44 +0000 Subject: [PATCH] showcase different data access options --- CHANGELOG.md | 32 +- environment.yml | 20 +- notebooks/data_access/EWC_catalog.yml | 122 ++++ notebooks/data_access/IRIDL.ipynb | 855 ++++++++++++++++++++++++++ notebooks/data_access/README.md | 13 + notebooks/data_access/intake.ipynb | 565 +++++++++++++++++ notebooks/data_access/wget_curl.ipynb | 186 ++++++ 7 files changed, 1774 insertions(+), 19 deletions(-) create mode 100644 notebooks/data_access/EWC_catalog.yml create mode 100644 notebooks/data_access/IRIDL.ipynb create mode 100644 notebooks/data_access/README.md create mode 100644 notebooks/data_access/intake.ipynb create mode 100644 notebooks/data_access/wget_curl.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index 7785534..8fe0ddc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,28 +1,42 @@ # CHANGELOG +### unreleased + +- Add notebooks showcasing accessing output of different models from different sources: (!2, [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) + - S2S-Project models: + - from European Weather Cloud: + - [`climetlab-s2s-ai-challenge`](https://github.com/ecmwf-lab/climetlab-s2s-ai-challenge/) [recommended], see [`climetlab-s2s-ai-challenge` notebooks](https://github.com/ecmwf-lab/climetlab-s2s-ai-challenge/tree/main/notebooks) + - `curl` & `wget`, see [wget_curl.ipynb](https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/blob/master/notebooks/data_access/wget_curl.ipynb) + - `intake`, see [intake.ipynb](https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/blob/master/notebooks/data_access/intake.ipynb) + - `IRIDL` including overview, see [IRIDL.ipynb](https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/blob/master/notebooks/data_access/IRIDL.ipynb) + - SubX-Project models: `IRIDL` including overview, see 
[IRIDL.ipynb](https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/blob/master/notebooks/data_access/IRIDL.ipynb) + - How to access password-protected S2S-Project output from IRIDL with xarray? see [IRIDL.ipynb](https://renkulab.io/gitlab/aaron.spring/s2s-ai-challenge-template/-/blob/master/notebooks/data_access/IRIDL.ipynb) +- fix `netcdf4` version to `1.5.4` for `opendap` to work lazily with `xarray` (!2, [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) + + ### 2021-05-31: `v0.2` *release* After this `v0.2` release, this CHANGELOG.md will describe all changes made in this template repository. - update `README` how to join competition, please `git pull` if you forked before - find status of your submission in `s2s-ai-competition-scoring-image` https://renkulab.io/gitlab/tasko.olevski/s2s-ai-competition-scoring-image/-/blob/master/README.md -- calculate `RPSS` with respect to climatology (not ECMWF anymore) +- calculate `RPSS` with respect to climatology (not ECMWF anymore) ([Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) - update `RPSS_verification.ipynb` - - update `scorer`: https://renkulab.io/gitlab/tasko.olevski/s2s-ai-competition-scoring-image + - update `scorer`: https://renkulab.io/gitlab/tasko.olevski/s2s-ai-competition-scoring-image ([Tasko Olevski](https://renkulab.io/gitlab/tasko.olevski), [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) - Averaged ECMWF RPSS skill value to beat at least: -0.0070 ### 2021-05-26: `v0.1` *pre-release* -- update `README` how to join competition !4 -- git lfs track zarr: `git lfs track "**/*.zarr/**"` !4 -- add notebooks: !4 +- update `README` how to join competition (!4, [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) +- git lfs track zarr: `git lfs track "**/*.zarr/**"` (!4, [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) +- add notebooks: (!4, [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) - create renku datasets: `renku_datasets_biweekly.ipynb` - RPSS 
verification: `RPSS_verification.ipynb` - ML train and predict based on weatherbench: `ML_train_and_predict.ipynb` - mean bias reduction: `mean_bias_reduction.ipynb` - template for training and predictions: `ML_forecast_template.ipynb` -- add renku dataset `s2s-ai-challenge` with files: !4 +- add renku dataset `s2s-ai-challenge` with files: (!4, [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) - `hindcast-like-observations_2000-2019_biweekly_deterministic.zarr` - `forecast-like-observations_2020_biweekly_deterministic.zarr` - `hindcast-like-observations_2000-2019_biweekly_tercile-edges.nc` @@ -31,10 +45,10 @@ After this `v0.2` release, this CHANGELOG.md will describe all changes made in t - `ecmwf_forecast-input_2020_biweekly_deterministic.zarr` - `ecmwf_hindcast-input_2000-2019_biweekly_deterministic.zarr` - `ecmwf_recalibrated_benchmark_2020_biweekly_terciled.nc` -- add reproducibility section below in training !4 +- add reproducibility section below in training (!4, [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) - how to deal with this dry mask? provide as renku dataset? now implicitly masked in categorized observations `obs_p` -- justify if training takes more than a week !4 -- show RPS for all years. ToDo: take RPSS #4 +- justify if training takes more than a week (!4, [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) +- show RPS for all years. 
~~ToDo: take RPSS~~ (!4, [Aaron Spring](https://renkulab.io/gitlab/aaron.spring)) diff --git a/environment.yml b/environment.yml index b7585bc..8d5ba04 100644 --- a/environment.yml +++ b/environment.yml @@ -5,27 +5,27 @@ dependencies: - xarray # ML - tensorflow - #- pytorch + - pytorch # viz - matplotlib-base # - cartopy # scoring - - xskillscore # includes sklearn + - xskillscore>=0.0.20 # includes sklearn # data access - #- intake - #- fsspec + - intake + - fsspec - zarr - s3fs - #- intake-xarray + - intake-xarray - cfgrib - #- pydap - #- h5netcdf - # - netcdf4#==1.5.1 # see https://github.com/pydata/xarray/issues/4925 + - nc-time-axis + - pydap + - h5netcdf + - netcdf4==1.5.3 - pip - pip: - climetlab >= 0.7.0 - climetlab_s2s_ai_challenge >= 0.6.3 - configargparse # for weatherbench - - netcdf4 # ==1.5.1 # see https://github.com/pydata/xarray/issues/4925 - - git+https://github.com/phausamann/sklearn-xarray.git@develop + - netcdf4==1.5.4 prefix: "/opt/conda" diff --git a/notebooks/data_access/EWC_catalog.yml b/notebooks/data_access/EWC_catalog.yml new file mode 100644 index 0000000..a3af425 --- /dev/null +++ b/notebooks/data_access/EWC_catalog.yml @@ -0,0 +1,122 @@ +plugins: + source: + - module: intake_xarray + +sources: + training-input: + description: climetlab name in AI/ML community naming for hindcasts as input to the ML-model in training period + driver: netcdf + parameters: + model: + description: name of the S2S model + type: str + default: ecmwf + allowed: [ecmwf, eccc, ncep] + param: + description: variable name + type: str + default: tp + allowed: [t2m, ci, gh, lsm, msl, q, rsn, sm100, sm20, sp, sst, st100, st20, t, tcc, tcw, ttr, tp, v, u] + date: + description: initialization weekly thursdays + type: datetime + default: 2020.01.02 + min: 2020.01.02 + max: 2020.12.31 + version: + description: versioning of the data + type: str + default: 0.3.0 + format: + description: data type + type: str + default: netcdf + allowed: [netcdf, grib] + ending: + 
description: data format compatible with format; netcdf -> nc, grib -> grib + type: str + default: nc + allowed: [nc, grib] + xarray_kwargs: + engine: h5netcdf + args: # add simplecache:: for caching: https://filesystem-spec.readthedocs.io/en/latest/features.html#caching-files-locally + urlpath: https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-input/{{version}}/{{format}}/{{model}}-hindcast-{{param}}-{{date.strftime("%Y%m%d")}}.{{ending}} + + test-input: + description: climetlab name in AI/ML community naming for 2020 forecasts as input to ML model in test period 2020 + driver: netcdf + parameters: + model: + description: name of the S2S model + type: str + default: ecmwf + allowed: [ecmwf, eccc, ncep] + param: + description: variable name + type: str + default: tp + allowed: [t2m, ci, gh, lsm, msl, q, rsn, sm100, sm20, sp, sst, st100, st20, t, tcc, tcw, ttr, tp, v, u] + date: + description: initialization weekly thursdays + type: datetime + default: 2020.01.02 + min: 2020.01.02 + max: 2020.12.31 + version: + description: versioning of the data + type: str + default: 0.3.0 + format: + description: data type + type: str + default: netcdf + allowed: [netcdf, grib] + ending: + description: data format compatible with format; netcdf -> nc, grib -> grib + type: str + default: nc + allowed: [nc, grib] + xarray_kwargs: + engine: h5netcdf + args: # add simplecache:: for caching: https://filesystem-spec.readthedocs.io/en/latest/features.html#caching-files-locally + urlpath: https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-input/{{version}}/{{format}}/{{model}}-forecast-{{param}}-{{date.strftime("%Y%m%d")}}.{{ending}} + + training-output-reference: + description: climetlab name in AI/ML community naming for 2020 forecasts as output reference to compare to ML model output to in training period + driver: netcdf + parameters: + param: + description: variable name + type: str + default: tp + allowed: [t2m, tp] + date: + 
description: initialization weekly thursdays + type: datetime + default: 2020.01.02 + min: 2020.01.02 + max: 2020.12.31 + xarray_kwargs: + engine: h5netcdf + args: # add simplecache:: for caching: https://filesystem-spec.readthedocs.io/en/latest/features.html#caching-files-locally + urlpath: https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-output-reference/{{param}}-{{date.strftime("%Y%m%d")}}.nc + + test-output-reference: + description: climetlab name in AI/ML community naming for 2020 forecasts as output reference to compare to ML model output to in test period 2020 + driver: netcdf + parameters: + param: + description: variable name + type: str + default: tp + allowed: [t2m, tp] + date: + description: initialization weekly thursdays + type: datetime + default: 2020.01.02 + min: 2020.01.02 + max: 2020.12.31 + xarray_kwargs: + engine: h5netcdf + args: # add simplecache:: for caching: https://filesystem-spec.readthedocs.io/en/latest/features.html#caching-files-locally + urlpath: https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-output-reference/{{param}}-{{date.strftime("%Y%m%d")}}.nc diff --git a/notebooks/data_access/IRIDL.ipynb b/notebooks/data_access/IRIDL.ipynb new file mode 100644 index 0000000..f1e11df --- /dev/null +++ b/notebooks/data_access/IRIDL.ipynb @@ -0,0 +1,855 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Access from `iridl.ldeo.columbia.edu`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "IRI Data Library (IRIDL) hosts various subseasonal initialized forecast and hindcast simulations:\n", + "- `S2S project`:\n", + " - http://iridl.ldeo.columbia.edu/SOURCES/.ECMWF/.S2S/\n", + " - hindcast/reforecast: one variable, one model:\n", + " - login required\n", + "- `SubX project`:\n", + " - http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/\n", + " - hindcast/reforecast: one variable, one model:\n", + " - login not required\n", + "- 
Notes:\n", + " - Output on IRIDL is not always on the 1.5 degree grid requested for the competition. Also dimension names and coordinates differ.\n", + " - Beware that most models are not only initialized on Thursdays. It is not forbidden to use simulations which are started on other weekdays, but please pay attention that you may only use information available on `forecast_time`, i.e. if the model is initialized on Mondays, you have to use the day 14+3=17 to day 27+3=30 forecast for week 3-4.\n", + "---\n", + "This notebook also provides opendap magic, i.e. commands added to the opendap URL which preprocess data server-side. (not implemented)\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are instructions for configuring xarray to open protected Data Library datasets, after you have created a Data Library account and accepted the terms and conditions for the dataset.\n", + "1. Visit https://iridl.ldeo.columbia.edu/auth/genkey . Log in to the Data Library. Copy the key from the response.\n", + "\n", + "2. Create a file with the following content, substituting the key from step 1 for `\"xxxx\"`:\n", + "`Set-Cookie: __dlauth_id=xxxx; domain=.iridl.ldeo.columbia.edu`\n", + "\n", + "3. Put the following in `~/.daprc`, which is `/home/jovyan/.daprc` on renku, substituting the path to the above file for `/path/to/cookie/file`:\n", + "`HTTP.COOKIEJAR=/path/to/cookie/file`. You may need to copy `.daprc` to `/home/jovyan` on renku, because `/home/jovyan` is not tracked by `git`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing /work/s2s-ai-challenge-template/.daprc\n" + ] + } + ], + "source": [ + "%%writefile /work/s2s-ai-challenge-template/.daprc\n", + "HTTP.COOKIEJAR=/work/s2s-ai-challenge-template/.cookie_iridl" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "!cp /work/s2s-ai-challenge-template/.daprc /home/jovyan" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#%writefile /work/s2s-ai-challenge-template/.cookie_iridl\n", + "#Set-Cookie: __dlauth_id=xxxx; domain=.iridl.ldeo.columbia.edu" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing /work/s2s-ai-challenge-template/.cookie_iridl\n" + ] + } + ], + "source": [ + "%%writefile /work/s2s-ai-challenge-template/.cookie_iridl\n", + "Set-Cookie: __dlauth_id=xxxx; domain=.iridl.ldeo.columbia.edu" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.8/site-packages/xarray/backends/cfgrib_.py:27: UserWarning: Failed to load cfgrib - most likely there is a problem accessing the ecCodes library. 
Try `import cfgrib` to get the full error message\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "<xarray.core.options.set_options at 0x7efe3cb51fd0>" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import xarray as xr\n", + "xr.set_options(display_style='text')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Please be aware that most models are not only initialized on Thursdays.\n", + "It is not forbidden to use simulations which are started on other weekdays,\n", + "but please pay attention that you may only use information available on `forecast_time`,\n", + "i.e. if the model is initialized on Mondays, you have to use the day 14+3=17 to day 27+3=30 forecast for week 3-4." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.8/site-packages/xarray/backends/plugins.py:61: RuntimeWarning: Engine 'cfgrib' loading failed:\n", + "/opt/conda/lib/python3.8/site-packages/gribapi/_bindings.cpython-38-x86_64-linux-gnu.so: undefined symbol: codes_bufr_key_is_header\n", + " warnings.warn(f\"Engine {name!r} loading failed:\\n{ex}\", RuntimeWarning)\n" + ] + } + ], + "source": [ + "ds = xr.open_dataset('https://iridl.ldeo.columbia.edu/SOURCES/.ECMWF/.S2S/.ECMF/.reforecast/.control/.2m_above_ground/.2t/dods',\n", + " chunks='auto', decode_times=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# calendar '360' not recognized, but '360_day'\n", + "if ds.hdate.attrs['calendar'] == '360':\n", + " ds.hdate.attrs['calendar'] = '360_day'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre><xarray.DataArray 't2m' (hdate: 26, forecast_time: 637, lead_time: 46, latitude: 121, longitude: 240)>\n", + 
"dask.array<open_dataset-f89df07098f6ce22c120a08e3f3f29a52t, shape=(26, 637, 46, 121, 240), dtype=float32, chunksize=(8, 91, 15, 46, 60), chunktype=numpy.ndarray>\n", + "Coordinates:\n", + " * latitude (latitude) float32 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 45 days 12...\n", + " * hdate (hdate) object 1995-07-01 00:00:00 ... 2020-07-01 00:00:00\n", + " * forecast_time (forecast_time) datetime64[ns] 2015-05-14 ... 2021-06-17\n", + " * longitude (longitude) float32 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + "Attributes:\n", + " pointwidth: 0\n", + " gribPDSpattern: 04XXXX003D0000\n", + " long_name: 2-meter Temperature\n", + " units: K\n", + " standard_name: air_temperature</pre>" + ], + "text/plain": [ + "<xarray.DataArray 't2m' (hdate: 26, forecast_time: 637, lead_time: 46, latitude: 121, longitude: 240)>\n", + "dask.array<open_dataset-f89df07098f6ce22c120a08e3f3f29a52t, shape=(26, 637, 46, 121, 240), dtype=float32, chunksize=(8, 91, 15, 46, 60), chunktype=numpy.ndarray>\n", + "Coordinates:\n", + " * latitude (latitude) float32 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 45 days 12...\n", + " * hdate (hdate) object 1995-07-01 00:00:00 ... 2020-07-01 00:00:00\n", + " * forecast_time (forecast_time) datetime64[ns] 2015-05-14 ... 2021-06-17\n", + " * longitude (longitude) float32 0.0 1.5 3.0 4.5 ... 
355.5 357.0 358.5\n", + "Attributes:\n", + " pointwidth: 0\n", + " gribPDSpattern: 04XXXX003D0000\n", + " long_name: 2-meter Temperature\n", + " units: K\n", + " standard_name: air_temperature" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = xr.decode_cf(ds).rename({'X':'longitude', 'Y':'latitude', 'S':'forecast_time', 'LA': 'lead_time', '2t':'t2m'})\n", + "ds['t2m']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(88.496735436, 'GB')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.nbytes/1e9,'GB'" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# hdate gets the previous years' reforecast for that dayofyear" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hindcast Availability\n", + "\n", + "- BOM:\tBoM POAMA Ensemble.\n", + "- CMA:\tBeijing Climate Center (BCC) Climate Prediction System for S2S.\n", + "- CNRM:\tCNRM Ensemble Prediction System.\n", + "- ECCC:\tECCC Ensemble Prediction System.\n", + "- ECMF:\tECMWF Ensemble.\n", + "- HMCR:\tHMCR Ensemble.\n", + "- ISAC:\tISAC-CNR Ensemble.\n", + "- JMA:\tJMA Ensemble System.\n", + "- KMA:\tKMA Seasonal Prediction System.\n", + "- NCEP:\tNCEP CFSv2 Ensemble.\n", + "- UKMO:\tUKMO Ensemble Prediction System." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "BOM not on-the-fly forecast_time freq not found \n", + " Coordinates:\n", + " * latitude (latitude) float32 88.1 85.64 83.16 ... -83.16 -85.64 -88.1\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 61 days 12...\n", + " * forecast_time (forecast_time) datetime64[ns] 1981-01-01 ... 
2013-12-26\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0 ... 29.0 30.0 31.0 32.0\n", + " * longitude (longitude) float32 0.0 2.507 5.014 ... 353.5 356.0 358.5 \n", + " Frozen(SortedKeysDict({'latitude': 72, 'lead_time': 62, 'forecast_time': 2376, 'realization': 32, 'longitude': 144})) 195.498364944 GB \n", + "\n", + "CNRM not on-the-fly forecast_time freq not found \n", + " Coordinates:\n", + " * latitude (latitude) float32 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 60 days 12...\n", + " * forecast_time (forecast_time) datetime64[ns] 1993-01-01 ... 2014-12-15\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0 ... 11.0 12.0 13.0 14.0\n", + " * longitude (longitude) float32 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5 \n", + " Frozen(SortedKeysDict({'latitude': 121, 'lead_time': 61, 'forecast_time': 528, 'realization': 14, 'longitude': 240})) 52.377944132 GB \n", + "\n", + "ECCC on-the-fly forecast_time freq:W-THU \n", + " Coordinates:\n", + " * latitude (latitude) float32 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * hdate (hdate) object 1995-07-01 00:00:00 ... 2017-07-01 00:00:00\n", + " * forecast_time (forecast_time) datetime64[ns] 2016-01-07 ... 2021-06-03\n", + " * realization (realization) float32 1.0 2.0 3.0\n", + " * longitude (longitude) float32 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 31 days 12... \n", + " Frozen(SortedKeysDict({'latitude': 121, 'hdate': 23, 'forecast_time': 283, 'realization': 3, 'longitude': 240, 'lead_time': 32})) 72.5842064 GB \n", + "\n", + "ECMF on-the-fly forecast_time freq not found \n", + " Coordinates:\n", + " * latitude (latitude) float32 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 45 days 12...\n", + " * hdate (hdate) object 1995-07-01 00:00:00 ... 
2020-07-01 00:00:00\n", + " * forecast_time (forecast_time) datetime64[ns] 2015-05-14 ... 2021-06-17\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0 ... 7.0 8.0 9.0 10.0\n", + " * longitude (longitude) float32 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5 \n", + " Frozen(SortedKeysDict({'latitude': 121, 'lead_time': 46, 'hdate': 26, 'forecast_time': 637, 'realization': 10, 'longitude': 240})) 884.967290356 GB \n", + "\n", + "HMCR on-the-fly forecast_time freq not found \n", + " Coordinates:\n", + " * latitude (latitude) float32 90.0 87.5 85.0 82.5 ... -85.0 -87.5 -90.0\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 60 days 12...\n", + " * hdate (hdate) object 1985-07-01 00:00:00 ... 2010-07-01 00:00:00\n", + " * forecast_time (forecast_time) datetime64[ns] 2015-01-07 ... 2021-06-03\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0\n", + " * longitude (longitude) float32 0.0 2.507 5.014 ... 353.5 356.0 358.5 \n", + " Frozen(SortedKeysDict({'latitude': 73, 'lead_time': 61, 'hdate': 26, 'forecast_time': 335, 'realization': 9, 'longitude': 144})) 201.0647102 GB \n", + "\n", + "model=ISAC failed due to OSError: [Errno -90] NetCDF: file not found: b'https://iridl.ldeo.columbia.edu/SOURCES/.ECMWF/.S2S/.ISAC/.reforecast/.perturbed/.2m_above_ground/.2t/dods' \n", + "\n", + "JMA not on-the-fly forecast_time freq:D \n", + " Coordinates:\n", + " * latitude (latitude) float32 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1981-01-10T12:00:00 ... 201...\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0\n", + " * longitude (longitude) float32 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " * lead_time (lead_time) timedelta64[ns] 1 days 2 days ... 
32 days 33 days \n", + " Frozen(SortedKeysDict({'latitude': 121, 'forecast_time': 10948, 'realization': 4, 'longitude': 240, 'lead_time': 33})) 167.867087068 GB \n", + "\n", + "KMA on-the-fly forecast_time freq:D \n", + " Coordinates:\n", + " * latitude (latitude) float32 90.0 87.5 85.0 82.5 ... -85.0 -87.5 -90.0\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 59 days 12...\n", + " * hdate (hdate) object 1991-07-01 00:00:00 ... 2010-07-01 00:00:00\n", + " * forecast_time (forecast_time) datetime64[ns] 2016-11-01 ... 2021-06-01\n", + " * realization (realization) float32 1.0 2.0\n", + " * longitude (longitude) float32 0.0 2.507 5.014 ... 353.5 356.0 358.5 \n", + " Frozen(SortedKeysDict({'latitude': 73, 'lead_time': 60, 'hdate': 20, 'forecast_time': 1674, 'realization': 2, 'longitude': 144})) 168.932059708 GB \n", + "\n", + "NCEP not on-the-fly forecast_time freq:D \n", + " Coordinates:\n", + " * latitude (latitude) float32 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 43 days 12...\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-01 ... 2010-12-31\n", + " * realization (realization) float32 1.0 2.0 3.0\n", + " * longitude (longitude) float32 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5 \n", + " Frozen(SortedKeysDict({'latitude': 121, 'lead_time': 44, 'forecast_time': 4383, 'realization': 3, 'longitude': 240})) 67.205101832 GB \n", + "\n", + "UKMO on-the-fly forecast_time freq not found \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 59 days 12...\n", + " * latitude (latitude) float32 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 2016-01-01 ... 2019-05-09\n", + " * realization (realization) float32 1.0 2.0\n", + " * longitude (longitude) float32 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " * hdate (hdate) object 1993-07-01 00:00:00 ... 
2015-07-01 00:00:00 \n", + " Frozen(SortedKeysDict({'lead_time': 60, 'latitude': 121, 'forecast_time': 162, 'realization': 2, 'longitude': 240, 'hdate': 23})) 51.937462612 GB \n", + "\n" + ] + } + ], + "source": [ + "models = ['BOM','CNRM','ECCC','ECMF','HMCR','ISAC','JMA','KMA','NCEP','UKMO']\n", + "for model in models:\n", + " try:\n", + " ds = xr.open_dataset(f'https://iridl.ldeo.columbia.edu/SOURCES/.ECMWF/.S2S/.{model}/.reforecast/.perturbed/.2m_above_ground/.2t/dods',\n", + " chunks='auto', decode_times=False).rename({'S':'forecast_time', 'LA':'lead_time','M':'realization', 'X':'longitude', 'Y':'latitude'})\n", + " # calendar '360' not recognized, but '360_day'\n", + " for c in ['hdate','forecast_time']:\n", + " if c in ds.coords:\n", + " if ds[c].attrs['calendar'] == '360':\n", + " ds[c].attrs['calendar'] = '360_day'\n", + " ds = xr.decode_cf(ds)\n", + " onthefly = True if 'hdate' in ds.coords else False\n", + " forecast_time_freq = xr.infer_freq(ds.forecast_time)\n", + " print(model, 'on-the-fly' if onthefly else 'not on-the-fly',\n", + " 'forecast_time freq:'+forecast_time_freq if forecast_time_freq else 'forecast_time freq not found',\n", + " '\\n',ds.coords,'\\n',ds.sizes,ds.nbytes/1e9,'GB','\\n')\n", + " except Exception as e:\n", + " print(f'model={model} failed due to {type(e).__name__}: {e} \\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SubX" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The access to output from the SubX project does not require login information via cookie." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "ds = xr.open_dataset('http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.CESM/.30LCESM1/.hindcast/.tas/dods',\n", + " chunks='auto', decode_times=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# calendar '360' not recognized, but '360_day'\n", + "if ds.S.attrs['calendar'] == '360':\n", + " ds.S.attrs['calendar'] = '360_day'" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre><xarray.DataArray 't2m' (forecast_time: 887, realization: 10, lead_time: 45, latitude: 181, longitude: 360)>\n", + "dask.array<open_dataset-1bd5755a82e148fd83330ea4db46cbb8tas, shape=(887, 10, 45, 181, 360), dtype=float32, chunksize=(335, 2, 9, 61, 90), chunktype=numpy.ndarray>\n", + "Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 44 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-06 ... 2015-12-30\n", + " * realization (realization) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0\n", + "Attributes:\n", + " pointwidth: 0.0\n", + " standard_name: air_temperature\n", + " long_name: 2-meter Air Temperature\n", + " level_type: 2 meters above ground\n", + " cell_methods: time: mean\n", + " units: Kelvin_scale</pre>" + ], + "text/plain": [ + "<xarray.DataArray 't2m' (forecast_time: 887, realization: 10, lead_time: 45, latitude: 181, longitude: 360)>\n", + "dask.array<open_dataset-1bd5755a82e148fd83330ea4db46cbb8tas, shape=(887, 10, 45, 181, 360), dtype=float32, chunksize=(335, 2, 9, 61, 90), chunktype=numpy.ndarray>\n", + "Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 
44 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-06 ... 2015-12-30\n", + " * realization (realization) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0\n", + "Attributes:\n", + " pointwidth: 0.0\n", + " standard_name: air_temperature\n", + " long_name: 2-meter Air Temperature\n", + " level_type: 2 meters above ground\n", + " cell_methods: time: mean\n", + " units: Kelvin_scale" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds = xr.decode_cf(ds).rename({'X':'longitude', 'Y':'latitude', 'S':'forecast_time', 'L': 'lead_time', 'M':'realization', 'tas':'t2m'})\n", + "ds['t2m']" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(104.03446566, 'GB')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds.nbytes/1e9,'GB'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hindcast Availability" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- center: model\n", + "- CESM:\t30LCESM1 46LCESM1\n", + "- ECCC:\tGEM GEPS6 GEPS5\n", + "- EMC:\tGEFS GEFSv12\n", + "- ESRL:\tFIMr1p1\n", + "- GMAO:\tGEOS_V2p1\n", + "- NCEP:\tCFSv2\n", + "- NRL:\tNESM\n", + "- RSMAS:\tCCSM4" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "30LCESM1 not on-the-fly forecast_time freq:W-WED \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 44 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-06 ... 
2015-12-30\n", + " * realization (realization) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 45, 'latitude': 181, 'forecast_time': 887, 'realization': 10, 'longitude': 360})) 104.03446566 GB \n", + "\n", + "46LCESM1 not on-the-fly forecast_time freq:W-WED \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 44 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-06 ... 2015-12-30\n", + " * realization (realization) float32 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 45, 'latitude': 181, 'forecast_time': 887, 'realization': 10, 'longitude': 360})) 104.03446566 GB \n", + "\n", + "GEM not on-the-fly forecast_time freq:D \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 31 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1995-01-04 ... 2014-12-28\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 32, 'latitude': 181, 'forecast_time': 7299, 'realization': 4, 'longitude': 360})) 243.508714908 GB \n", + "\n", + "GEPS6 not on-the-fly forecast_time freq:D \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 31 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1998-01-03 ... 2017-12-27\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 
357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 32, 'latitude': 181, 'forecast_time': 7299, 'realization': 4, 'longitude': 360})) 243.508714908 GB \n", + "\n", + "GEPS5 not on-the-fly forecast_time freq:D \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 31 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1998-01-03 ... 2017-12-27\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 32, 'latitude': 181, 'forecast_time': 7299, 'realization': 4, 'longitude': 360})) 243.508714908 GB \n", + "\n", + "GEFS not on-the-fly forecast_time freq:W-WED \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 34 days 12...\n", + " * latitude (latitude) float32 90.0 89.0 88.0 87.0 ... -88.0 -89.0 -90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-06 ... 2016-12-28\n", + " * realization (realization) float32 0.0 1.0 2.0 3.0 ... 7.0 8.0 9.0 10.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 35, 'latitude': 181, 'forecast_time': 939, 'realization': 11, 'longitude': 360})) 94.2252796 GB \n", + "\n", + "center=EMC model=GEFSv12 failed due to OSError: [Errno -90] NetCDF: file not found: b'https://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.EMC/.GEFSv12/.hindcast/.tas/dods' \n", + "\n", + "FIMr1p1 not on-the-fly forecast_time freq:W-WED \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 31 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-06 ... 
2017-06-28\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 32, 'latitude': 181, 'forecast_time': 965, 'realization': 4, 'longitude': 360})) 32.194262956 GB \n", + "\n", + "GEOS_V2p1 not on-the-fly forecast_time freq:D \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 44 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-01 ... 2016-12-27\n", + " * realization (realization) float32 1.0 2.0 3.0 4.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 45, 'latitude': 181, 'forecast_time': 6571, 'realization': 4, 'longitude': 360})) 308.279834308 GB \n", + "\n", + "CFSv2 not on-the-fly forecast_time freq:6H \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 43 days 12...\n", + " * latitude (latitude) float32 90.0 89.0 88.0 87.0 ... -88.0 -89.0 -90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-01 ... 2017-09-30\n", + " * realization (realization) int32 1\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 44, 'latitude': 181, 'forecast_time': 27389, 'realization': 1, 'longitude': 360})) 314.101655872 GB \n", + "\n", + "NESM not on-the-fly forecast_time freq:D \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 44 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-02T12:00:00 ... 201...\n", + " * realization (realization) int32 1\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 
357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 45, 'latitude': 181, 'forecast_time': 6574, 'realization': 1, 'longitude': 360})) 77.10518632 GB \n", + "\n", + "CCSM4 not on-the-fly forecast_time freq:D \n", + " Coordinates:\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 12:00:00 ... 44 days 12...\n", + " * latitude (latitude) float32 -90.0 -89.0 -88.0 -87.0 ... 88.0 89.0 90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-01-07 ... 2016-12-31\n", + " * realization (realization) float32 1.0 2.0 3.0\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0 \n", + " Frozen(SortedKeysDict({'lead_time': 45, 'latitude': 181, 'forecast_time': 6569, 'realization': 3, 'longitude': 360})) 231.139516688 GB \n", + "\n" + ] + } + ], + "source": [ + "centers = ['CESM', 'CESM', 'ECCC', 'ECCC', 'ECCC', 'EMC', 'EMC', 'ESRL', 'GMAO' , 'NCEP', 'NRL','RSMAS']\n", + "models = ['30LCESM1','46LCESM1','GEM','GEPS6','GEPS5','GEFS','GEFSv12','FIMr1p1','GEOS_V2p1','CFSv2','NESM','CCSM4']\n", + "for center,model in zip(centers,models):\n", + " try:\n", + " ds = xr.open_dataset(f'https://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.{center}/.{model}/.hindcast/.tas/dods',\n", + " chunks='auto', decode_times=False).rename({'S':'forecast_time', 'L':'lead_time','M':'realization', 'X':'longitude', 'Y':'latitude'})\n", + " # calendar '360' not recognized, but '360_day'\n", + " for c in ['hdate','forecast_time']:\n", + " if c in ds.coords:\n", + " if ds[c].attrs['calendar'] == '360':\n", + " ds[c].attrs['calendar'] = '360_day'\n", + " ds = xr.decode_cf(ds)\n", + " onthefly = True if 'hdate' in ds.coords else False\n", + " forecast_time_freq = xr.infer_freq(ds.forecast_time)\n", + " print(model, 'on-the-fly' if onthefly else 'not on-the-fly',\n", + " 'forecast_time freq:'+forecast_time_freq if forecast_time_freq else 'forecast_time freq not found',\n", + " '\\n',ds.coords,'\\n',ds.sizes,ds.nbytes/1e9,'GB','\\n')\n", + " except Exception as 
e:\n", + " print(f'center={center} model={model} failed due to {type(e).__name__}: {e} \\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Opendap magic\n", + "\n", + "Opendap URLs can be appended for server-side preprocessing.\n", + "\n", + "- https://www.opendap.org/support\n", + "- http://iridl.ldeo.columbia.edu/dochelp/topics/DODS/fnlist.html\n", + "- https://iridl.ldeo.columbia.edu/dochelp/Documentation/funcindex.html?Set-Language=en" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from subprocess import call\n", + "fname = 'GEFS_pra_hc.nc'\n", + "# endless magic commands selecting week 3-4 and aggregating pr to tp with unit conversion\n", + "dset_url = 'http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.EMC/.GEFS/.hindcast/.pr/S/(0000%206%20Jan%201999)/(0000%2028%20Dec%202015)/RANGEEDGES/S/(days%20since%201999-01-01)/streamgridunitconvert/Y/1/20/RANGE/X/-20/10/RANGE/L/(14)/(28)/RANGEEDGES/%5BL%5Daverage/S/(Jun-Aug)/VALUES/SOURCES/.Models/.SubX/.EMC/.GEFS/.hindcast/.dc9915/.pr/Y/1/20/RANGE/X/-20/10/RANGE/L/(14)/(28)/RANGEEDGES/%5BL%5Daverage/S/to366daysample/%5BYR%5Daverage/S/sampleDOY/sub/c%3A/0.001/(m3%20kg-1)/%3Ac/mul/c%3A/1000/(mm%20m-1)/%3Ac/mul/c%3A/86400/(s%20day-1)/%3Ac/mul/c%3A/7.0//units//days/def/%3Ac/mul/data.nc'\n", + "# download data with curl\n", + "call(['curl','-k',dset_url, '-o',fname])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre><xarray.Dataset>\n", + "Dimensions: (forecast_time: 226, latitude: 20, longitude: 31, realization: 11)\n", + "Coordinates:\n", + " * latitude (latitude) float32 1.0 2.0 3.0 4.0 ... 
17.0 18.0 19.0 20.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-06-02 ... 2015-08-26\n", + " * realization (realization) float32 0.0 1.0 2.0 3.0 ... 7.0 8.0 9.0 10.0\n", + " * longitude (longitude) float32 -20.0 -19.0 -18.0 -17.0 ... 8.0 9.0 10.0\n", + " lead_time timedelta64[ns] 14 days\n", + "Data variables:\n", + " tp (realization, forecast_time, latitude, longitude) float64 ...</pre>" + ], + "text/plain": [ + "<xarray.Dataset>\n", + "Dimensions: (forecast_time: 226, latitude: 20, longitude: 31, realization: 11)\n", + "Coordinates:\n", + " * latitude (latitude) float32 1.0 2.0 3.0 4.0 ... 17.0 18.0 19.0 20.0\n", + " * forecast_time (forecast_time) datetime64[ns] 1999-06-02 ... 2015-08-26\n", + " * realization (realization) float32 0.0 1.0 2.0 3.0 ... 7.0 8.0 9.0 10.0\n", + " * longitude (longitude) float32 -20.0 -19.0 -18.0 -17.0 ... 8.0 9.0 10.0\n", + " lead_time timedelta64[ns] 14 days\n", + "Data variables:\n", + " tp (realization, forecast_time, latitude, longitude) float64 ..." + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "ds = xr.open_dataset(fname).rename({'X':'longitude', 'Y':'latitude', 'S':'forecast_time', 'M':'realization', 'aprod':'tp'}).assign_coords(lead_time=pd.Timedelta('14 d'))\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5.080404184 GB\n" + ] + }, + { + "data": { + "text/html": [ + "<pre><xarray.DataArray 'tp' (forecast_time: 886, realization: 11, latitude: 181, longitude: 360)>\n", + "dask.array<open_dataset-187ec1ad5a15edadd11711d7cbe1f114pr, shape=(886, 11, 181, 360), dtype=float64, chunksize=(423, 4, 82, 120), chunktype=numpy.ndarray>\n", + "Coordinates:\n", + " * latitude (latitude) float32 90.0 89.0 88.0 87.0 ... -88.0 -89.0 -90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 2000-01-12 ... 
2016-12-28\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0\n", + " * realization (realization) float32 0.0 1.0 2.0 3.0 ... 7.0 8.0 9.0 10.0\n", + " lead_time timedelta64[ns] 14 days\n", + "Attributes: (12/13)\n", + " pointwidth: 0.0\n", + " standard_name: precipitation_flux\n", + " parameter_template_discipline_category_number: 1\\n0\\n1\\n8\n", + " parameter_discipline_and_category: Meteorological products, ...\n", + " long_name: Total Precipitation\n", + " grid_type: Latitude/longitude\n", + " ... ...\n", + " level: 0.0\n", + " center: US National Weather Servi...\n", + " production_status: Operational products\n", + " level_type: surface\n", + " file_missing_value: 1e+20\n", + " history: Averaged over L[14.5 days...</pre>" + ], + "text/plain": [ + "<xarray.DataArray 'tp' (forecast_time: 886, realization: 11, latitude: 181, longitude: 360)>\n", + "dask.array<open_dataset-187ec1ad5a15edadd11711d7cbe1f114pr, shape=(886, 11, 181, 360), dtype=float64, chunksize=(423, 4, 82, 120), chunktype=numpy.ndarray>\n", + "Coordinates:\n", + " * latitude (latitude) float32 90.0 89.0 88.0 87.0 ... -88.0 -89.0 -90.0\n", + " * forecast_time (forecast_time) datetime64[ns] 2000-01-12 ... 2016-12-28\n", + " * longitude (longitude) float32 0.0 1.0 2.0 3.0 ... 357.0 358.0 359.0\n", + " * realization (realization) float32 0.0 1.0 2.0 3.0 ... 7.0 8.0 9.0 10.0\n", + " lead_time timedelta64[ns] 14 days\n", + "Attributes: (12/13)\n", + " pointwidth: 0.0\n", + " standard_name: precipitation_flux\n", + " parameter_template_discipline_category_number: 1\\n0\\n1\\n8\n", + " parameter_discipline_and_category: Meteorological products, ...\n", + " long_name: Total Precipitation\n", + " grid_type: Latitude/longitude\n", + " ... ...\n", + " level: 0.0\n", + " center: US National Weather Servi...\n", + " production_status: Operational products\n", + " level_type: surface\n", + " file_missing_value: 1e+20\n", + " history: Averaged over L[14.5 days..." 
+ ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# aggregate w34 precip to tp\n", + "ds = xr.open_dataset('http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/.EMC/.GEFS/.hindcast/.pr/S/(0000%206%20Jan%202000)/(0000%2031%20Dec%202019)/RANGEEDGES/L/(14)/(28)/RANGEEDGES/[L]sum/dods',\n", + " chunks='auto').rename({'X':'longitude', 'Y':'latitude', 'S':'forecast_time', 'M':'realization', 'pr':'tp'}).assign_coords(lead_time=pd.Timedelta('14 d'))\n", + "print(ds.nbytes/1e9,'GB')\n", + "ds.tp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + }, + "toc-autonumbering": true + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/data_access/README.md b/notebooks/data_access/README.md new file mode 100644 index 0000000..764b6d4 --- /dev/null +++ b/notebooks/data_access/README.md @@ -0,0 +1,13 @@ +# Data Access + +- European Weather Cloud: + - [`climetlab-s2s-ai-challenge`](https://github.com/ecmwf-lab/climetlab-s2s-ai-challenge) + - `wget`: wget_curl.ipynb + - `curl`: wget_curl.ipynb + - `mouse`: wget_curl.ipynb + - `intake`: intake.ipynb +- [IRI Data Library](https://iridl.ldeo.columbia.edu/): IRIDL.ipynb + - S2S: http://iridl.ldeo.columbia.edu/SOURCES/.ECMWF/.S2S/ (restricted access explained in IRIDL.ipynb) + - SubX: http://iridl.ldeo.columbia.edu/SOURCES/.Models/.SubX/ + - NMME: http://iridl.ldeo.columbia.edu/SOURCES/.Models/.NMME/ +- s2sprediction.net diff --git a/notebooks/data_access/intake.ipynb b/notebooks/data_access/intake.ipynb new file mode 100644 index 
0000000..1fa6e09 --- /dev/null +++ b/notebooks/data_access/intake.ipynb @@ -0,0 +1,565 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Access from EWC via `intake`\n", + "\n", + "Data easily available via `climetlab`: https://github.com/ecmwf-lab/climetlab-s2s-ai-challenge\n", + "Data holdings listed: https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-input/0.3.0/netcdf/index.html\n", + "\n", + "Therefore, S3 data also accessible with `intake-xarray` and cachable with `fsspec`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.8/site-packages/xarray/backends/cfgrib_.py:27: UserWarning: Failed to load cfgrib - most likely there is a problem accessing the ecCodes library. Try `import cfgrib` to get the full error message\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "<xarray.core.options.set_options at 0x7fa0100dcdc0>" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import intake\n", + "import fsspec\n", + "import xarray as xr\n", + "import os, glob\n", + "import pandas as pd\n", + "xr.set_options(display_style='text')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# prevent aiohttp timeout errors\n", + "\n", + "from aiohttp import ClientSession, ClientTimeout\n", + "timeout = ClientTimeout(total=600)\n", + "fsspec.config.conf['https'] = dict(client_kwargs={'timeout': timeout})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# intake\n", + "\n", + "https://github.com/intake/intake-xarray can read and cache `grib` and `netcdf` from catalogs.\n", + "\n", + "Caching via `fsspec`: https://filesystem-spec.readthedocs.io/en/latest/features.html#caching-files-locally" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import intake_xarray\n", + "cache_path = '/work/s2s-ai-challenge-template/data/cache'\n", + "fsspec.config.conf['simplecache'] = {'cache_storage': cache_path, 'same_names':True}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing EWC_catalog.yml\n" + ] + } + ], + "source": [ + "%%writefile EWC_catalog.yml\n", + "plugins:\n", + " source:\n", + " - module: intake_xarray\n", + "\n", + "sources:\n", + " training-input:\n", + " description: climetlab name in AI/ML community naming for hindcasts as input to the ML-model in training period\n", + " driver: netcdf\n", + " parameters:\n", + " model:\n", + " description: name of the S2S model\n", + " type: str\n", + " default: ecmwf\n", + " allowed: [ecmwf, eccc, ncep]\n", + " param:\n", + " description: variable name\n", + " type: str\n", + " default: tp\n", + " allowed: [t2m, ci, gh, lsm, msl, q, rsn, sm100, sm20, sp, sst, st100, st20, t, tcc, tcw, ttr, tp, v, u]\n", + " date:\n", + " description: initialization weekly thursdays\n", + " type: datetime\n", + " default: 2020.01.02\n", + " min: 2020.01.02\n", + " max: 2020.12.31\n", + " version:\n", + " description: versioning of the data\n", + " type: str\n", + " default: 0.3.0\n", + " format:\n", + " description: data type\n", + " type: str\n", + " default: netcdf\n", + " allowed: [netcdf, grib]\n", + " ending:\n", + " description: data format compatible with format; netcdf -> nc, grib -> grib\n", + " type: str\n", + " default: nc\n", + " allowed: [nc, grib]\n", + " xarray_kwargs:\n", + " engine: h5netcdf\n", + " args: # add simplecache:: for caching: https://filesystem-spec.readthedocs.io/en/latest/features.html#caching-files-locally\n", + " urlpath: 
https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-input/{{version}}/{{format}}/{{model}}-hindcast-{{param}}-{{date.strftime(\"%Y%m%d\")}}.{{ending}}\n", + "\n", + " test-input:\n", + " description: climetlab name in AI/ML community naming for 2020 forecasts as input to ML model in test period 2020\n", + " driver: netcdf\n", + " parameters:\n", + " model:\n", + " description: name of the S2S model\n", + " type: str\n", + " default: ecmwf\n", + " allowed: [ecmwf, eccc, ncep]\n", + " param:\n", + " description: variable name\n", + " type: str\n", + " default: tp\n", + " allowed: [t2m, ci, gh, lsm, msl, q, rsn, sm100, sm20, sp, sst, st100, st20, t, tcc, tcw, ttr, tp, v, u]\n", + " date:\n", + " description: initialization weekly thursdays\n", + " type: datetime\n", + " default: 2020.01.02\n", + " min: 2020.01.02\n", + " max: 2020.12.31\n", + " version:\n", + " description: versioning of the data\n", + " type: str\n", + " default: 0.3.0\n", + " format:\n", + " description: data type\n", + " type: str\n", + " default: netcdf\n", + " allowed: [netcdf, grib]\n", + " ending:\n", + " description: data format compatible with format; netcdf -> nc, grib -> grib\n", + " type: str\n", + " default: nc\n", + " allowed: [nc, grib]\n", + " xarray_kwargs:\n", + " engine: h5netcdf\n", + " args: # add simplecache:: for caching: https://filesystem-spec.readthedocs.io/en/latest/features.html#caching-files-locally\n", + " urlpath: https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-input/{{version}}/{{format}}/{{model}}-forecast-{{param}}-{{date.strftime(\"%Y%m%d\")}}.{{ending}}\n", + "\n", + " training-output-reference:\n", + " description: climetlab name in AI/ML community naming for 2020 forecasts as output reference to compare to ML model output to in training period\n", + " driver: netcdf\n", + " parameters:\n", + " param:\n", + " description: variable name\n", + " type: str\n", + " default: tp\n", + " allowed: [t2m, ci, gh, lsm, msl, q, 
rsn, sm100, sm20, sp, sst, st100, st20, t, tcc, tcw, ttr, tp, v, u]\n", + " date:\n", + " description: initialization weekly thursdays\n", + " type: datetime\n", + " default: 2020.01.02\n", + " min: 2020.01.02\n", + " max: 2020.12.31\n", + " xarray_kwargs:\n", + " engine: h5netcdf\n", + " args: # add simplecache:: for caching: https://filesystem-spec.readthedocs.io/en/latest/features.html#caching-files-locally\n", + " urlpath: https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-output-reference/{{param}}-{{date.strftime(\"%Y%m%d\")}}.nc\n", + " \n", + " test-output-reference:\n", + " description: climetlab name in AI/ML community naming for 2020 forecasts as output reference to compare to ML model output to in test period 2020\n", + " driver: netcdf\n", + " parameters:\n", + " param:\n", + " description: variable name\n", + " type: str\n", + " default: tp\n", + " allowed: [t2m, ci, gh, lsm, msl, q, rsn, sm100, sm20, sp, sst, st100, st20, t, tcc, tcw, ttr, tp, v, u]\n", + " date:\n", + " description: initialization weekly thursdays\n", + " type: datetime\n", + " default: 2020.01.02\n", + " min: 2020.01.02\n", + " max: 2020.12.31\n", + " xarray_kwargs:\n", + " engine: h5netcdf\n", + " args: # add simplecache:: for caching: https://filesystem-spec.readthedocs.io/en/latest/features.html#caching-files-locally\n", + " urlpath: https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-output-reference/{{param}}-{{date.strftime(\"%Y%m%d\")}}.nc" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "cat = intake.open_catalog('EWC_catalog.yml')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatetimeIndex(['2020-01-02', '2020-01-09', '2020-01-16', '2020-01-23',\n", + " '2020-01-30', '2020-02-06', '2020-02-13', '2020-02-20',\n", + " '2020-02-27', '2020-03-05', '2020-03-12', '2020-03-19',\n", + " 
'2020-03-26', '2020-04-02', '2020-04-09', '2020-04-16',\n", + " '2020-04-23', '2020-04-30', '2020-05-07', '2020-05-14',\n", + " '2020-05-21', '2020-05-28', '2020-06-04', '2020-06-11',\n", + " '2020-06-18', '2020-06-25', '2020-07-02', '2020-07-09',\n", + " '2020-07-16', '2020-07-23', '2020-07-30', '2020-08-06',\n", + " '2020-08-13', '2020-08-20', '2020-08-27', '2020-09-03',\n", + " '2020-09-10', '2020-09-17', '2020-09-24', '2020-10-01',\n", + " '2020-10-08', '2020-10-15', '2020-10-22', '2020-10-29',\n", + " '2020-11-05', '2020-11-12', '2020-11-19', '2020-11-26',\n", + " '2020-12-03', '2020-12-10', '2020-12-17', '2020-12-24',\n", + " '2020-12-31'],\n", + " dtype='datetime64[ns]', freq='7D')" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# dates for 2020 forecasts and their on-the-fly reforecasts\n", + "dates=pd.date_range(start='2020-01-02',freq='7D',end='2020-12-31')\n", + "dates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `hindcast-input`\n", + "\n", + "on-the-fly hindcasts corresponding to the 2020 forecasts" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.8/site-packages/xarray/backends/plugins.py:61: RuntimeWarning: Engine 'cfgrib' loading failed:\n", + "/opt/conda/lib/python3.8/site-packages/gribapi/_bindings.cpython-38-x86_64-linux-gnu.so: undefined symbol: codes_bufr_key_is_header\n", + " warnings.warn(f\"Engine {name!r} loading failed:\\n{ex}\", RuntimeWarning)\n" + ] + }, + { + "data": { + "text/html": [ + "<pre><xarray.Dataset>\n", + "Dimensions: (forecast_time: 20, latitude: 121, lead_time: 32, longitude: 240, realization: 4)\n", + "Coordinates:\n", + " * realization (realization) int64 0 1 2 3\n", + " * forecast_time (forecast_time) datetime64[ns] 1998-03-12 ... 
2017-03-12\n", + " * lead_time (lead_time) timedelta64[ns] 1 days 2 days ... 31 days 32 days\n", + " * latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " valid_time (forecast_time, lead_time) datetime64[ns] ...\n", + "Data variables:\n", + " tp (realization, forecast_time, lead_time, latitude, longitude) float32 ...\n", + "Attributes:\n", + " GRIB_edition: [2]\n", + " GRIB_centre: cwao\n", + " GRIB_centreDescription: Canadian Meteorological Service - Montreal \n", + " GRIB_subCentre: [0]\n", + " Conventions: CF-1.7\n", + " institution: Canadian Meteorological Service - Montreal \n", + " history: 2021-05-11T10:03 GRIB to CDM+CF via cfgrib-0.9.9...</pre>" + ], + "text/plain": [ + "<xarray.Dataset>\n", + "Dimensions: (forecast_time: 20, latitude: 121, lead_time: 32, longitude: 240, realization: 4)\n", + "Coordinates:\n", + " * realization (realization) int64 0 1 2 3\n", + " * forecast_time (forecast_time) datetime64[ns] 1998-03-12 ... 2017-03-12\n", + " * lead_time (lead_time) timedelta64[ns] 1 days 2 days ... 31 days 32 days\n", + " * latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " valid_time (forecast_time, lead_time) datetime64[ns] ...\n", + "Data variables:\n", + " tp (realization, forecast_time, lead_time, latitude, longitude) float32 ...\n", + "Attributes:\n", + " GRIB_edition: [2]\n", + " GRIB_centre: cwao\n", + " GRIB_centreDescription: Canadian Meteorological Service - Montreal \n", + " GRIB_subCentre: [0]\n", + " Conventions: CF-1.7\n", + " institution: Canadian Meteorological Service - Montreal \n", + " history: 2021-05-11T10:03 GRIB to CDM+CF via cfgrib-0.9.9..." 
+ ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat['training-input'](date=dates[10], param='tp', model='eccc').to_dask()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `forecast-input`\n", + "\n", + "2020" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre><xarray.Dataset>\n", + "Dimensions: (forecast_time: 1, latitude: 121, lead_time: 46, longitude: 240, realization: 51)\n", + "Coordinates:\n", + " * realization (realization) int64 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", + " * forecast_time (forecast_time) datetime64[ns] 2020-03-12\n", + " * lead_time (lead_time) timedelta64[ns] 1 days 2 days ... 45 days 46 days\n", + " * latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " valid_time (forecast_time, lead_time) datetime64[ns] ...\n", + "Data variables:\n", + " t2m (realization, forecast_time, lead_time, latitude, longitude) float32 ...\n", + "Attributes:\n", + " GRIB_edition: [2]\n", + " GRIB_centre: ecmf\n", + " GRIB_centreDescription: European Centre for Medium-Range Weather Forecasts\n", + " GRIB_subCentre: [0]\n", + " Conventions: CF-1.7\n", + " institution: European Centre for Medium-Range Weather Forecasts\n", + " history: 2021-05-10T16:14:36 GRIB to CDM+CF via cfgrib-0....</pre>" + ], + "text/plain": [ + "<xarray.Dataset>\n", + "Dimensions: (forecast_time: 1, latitude: 121, lead_time: 46, longitude: 240, realization: 51)\n", + "Coordinates:\n", + " * realization (realization) int64 0 1 2 3 4 5 6 7 ... 44 45 46 47 48 49 50\n", + " * forecast_time (forecast_time) datetime64[ns] 2020-03-12\n", + " * lead_time (lead_time) timedelta64[ns] 1 days 2 days ... 45 days 46 days\n", + " * latitude (latitude) float64 90.0 88.5 87.0 85.5 ... 
-87.0 -88.5 -90.0\n", + " * longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " valid_time (forecast_time, lead_time) datetime64[ns] ...\n", + "Data variables:\n", + " t2m (realization, forecast_time, lead_time, latitude, longitude) float32 ...\n", + "Attributes:\n", + " GRIB_edition: [2]\n", + " GRIB_centre: ecmf\n", + " GRIB_centreDescription: European Centre for Medium-Range Weather Forecasts\n", + " GRIB_subCentre: [0]\n", + " Conventions: CF-1.7\n", + " institution: European Centre for Medium-Range Weather Forecasts\n", + " history: 2021-05-10T16:14:36 GRIB to CDM+CF via cfgrib-0...." + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat['test-input'](date=dates[10], param='t2m', model='ecmwf').to_dask()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `hindcast-like-observations`\n", + "\n", + "observations matching hindcasts" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre><xarray.Dataset>\n", + "Dimensions: (forecast_time: 1, latitude: 121, lead_time: 47, longitude: 240)\n", + "Coordinates:\n", + " valid_time (lead_time, forecast_time) datetime64[ns] ...\n", + " * latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " * forecast_time (forecast_time) datetime64[ns] 2020-03-12\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 1 days ... 
45 days 46 days\n", + "Data variables:\n", + " t2m (lead_time, forecast_time, latitude, longitude) float32 ...\n", + "Attributes:\n", + " source_dataset_name: temperature daily from NOAA NCEP CPC: Climate Predi...\n", + " source_hosting: IRIDL\n", + " source_url: http://iridl.ldeo.columbia.edu/SOURCES/.NOAA/.NCEP/...\n", + " created_by_software: climetlab-s2s-ai-challenge\n", + " created_by_script: tools/observations/makefile</pre>" + ], + "text/plain": [ + "<xarray.Dataset>\n", + "Dimensions: (forecast_time: 1, latitude: 121, lead_time: 47, longitude: 240)\n", + "Coordinates:\n", + " valid_time (lead_time, forecast_time) datetime64[ns] ...\n", + " * latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " * forecast_time (forecast_time) datetime64[ns] 2020-03-12\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 1 days ... 45 days 46 days\n", + "Data variables:\n", + " t2m (lead_time, forecast_time, latitude, longitude) float32 ...\n", + "Attributes:\n", + " source_dataset_name: temperature daily from NOAA NCEP CPC: Climate Predi...\n", + " source_hosting: IRIDL\n", + " source_url: http://iridl.ldeo.columbia.edu/SOURCES/.NOAA/.NCEP/...\n", + " created_by_software: climetlab-s2s-ai-challenge\n", + " created_by_script: tools/observations/makefile" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat['training-output-reference'](date=dates[10], param='t2m').to_dask()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `forecast-like-observations`\n", + "\n", + "observations matching 2020 forecasts" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre><xarray.Dataset>\n", + "Dimensions: (forecast_time: 1, latitude: 121, lead_time: 47, longitude: 240)\n", + "Coordinates:\n", + " valid_time (lead_time, 
forecast_time) datetime64[ns] ...\n", + " * latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " * forecast_time (forecast_time) datetime64[ns] 2020-03-12\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 1 days ... 45 days 46 days\n", + "Data variables:\n", + " t2m (lead_time, forecast_time, latitude, longitude) float32 ...\n", + "Attributes:\n", + " source_dataset_name: temperature daily from NOAA NCEP CPC: Climate Predi...\n", + " source_hosting: IRIDL\n", + " source_url: http://iridl.ldeo.columbia.edu/SOURCES/.NOAA/.NCEP/...\n", + " created_by_software: climetlab-s2s-ai-challenge\n", + " created_by_script: tools/observations/makefile</pre>" + ], + "text/plain": [ + "<xarray.Dataset>\n", + "Dimensions: (forecast_time: 1, latitude: 121, lead_time: 47, longitude: 240)\n", + "Coordinates:\n", + " valid_time (lead_time, forecast_time) datetime64[ns] ...\n", + " * latitude (latitude) float64 90.0 88.5 87.0 85.5 ... -87.0 -88.5 -90.0\n", + " * longitude (longitude) float64 0.0 1.5 3.0 4.5 ... 355.5 357.0 358.5\n", + " * forecast_time (forecast_time) datetime64[ns] 2020-03-12\n", + " * lead_time (lead_time) timedelta64[ns] 0 days 1 days ... 
45 days 46 days\n", + "Data variables:\n", + " t2m (lead_time, forecast_time, latitude, longitude) float32 ...\n", + "Attributes:\n", + " source_dataset_name: temperature daily from NOAA NCEP CPC: Climate Predi...\n", + " source_hosting: IRIDL\n", + " source_url: http://iridl.ldeo.columbia.edu/SOURCES/.NOAA/.NCEP/...\n", + " created_by_software: climetlab-s2s-ai-challenge\n", + " created_by_script: tools/observations/makefile" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat['test-output-reference'](date=dates[10], param='t2m').to_dask()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/data_access/wget_curl.ipynb b/notebooks/data_access/wget_curl.ipynb new file mode 100644 index 0000000..0f9e43e --- /dev/null +++ b/notebooks/data_access/wget_curl.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Access via `curl` or `wget`\n", + "\n", + "Data easily available via `climetlab`: https://github.com/ecmwf-lab/climetlab-s2s-ai-challenge\n", + "\n", + "Data holdings listed:\n", + "\n", + "- https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-input/0.3.0/netcdf/index.html\n", + "- https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-input/0.3.0/netcdf/index.html\n", + "- https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-output-reference/index.html\n", + "- 
https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-output-reference/index.html\n", + "\n", + "Therefore, the S3 data is also accessible with `curl` or `wget`. Alternatively, you can click on the HTML links and download the files by mouse click." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.8/site-packages/xarray/backends/cfgrib_.py:27: UserWarning: Failed to load cfgrib - most likely there is a problem accessing the ecCodes library. Try `import cfgrib` to get the full error message\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "<xarray.core.options.set_options at 0x7f5170570520>" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import xarray as xr\n", + "import os\n", + "from subprocess import call\n", + "xr.set_options(display_style='text')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# version of the EWC data\n", + "version = '0.3.0'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `hindcast-input`\n", + "\n", + "on-the-fly hindcasts corresponding to the 2020 forecasts" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "parameter = 't2m'\n", + "date = '20200102'\n", + "model = 'ecmwf'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "url = f'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-input/{version}/netcdf/{model}-hindcast-{parameter}-{date}.nc'\n", + "os.system(f'wget {url}')\n", + "\n", + "assert os.path.exists(f'{model}-hindcast-{parameter}-{date}.nc')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `forecast-input`\n", + "\n", + "2020" + ] + }, + {
+ "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "url = f'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-input/{version}/netcdf/{model}-forecast-{parameter}-{date}.nc'\n", + "os.system(f'wget {url}')\n", + "\n", + "assert os.path.exists(f'{model}-forecast-{parameter}-{date}.nc')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `hindcast-like-observations`\n", + "\n", + "CPC observations formatted like training period hindcasts" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "url = f'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/training-output-reference/{parameter}-{date}.nc'\n", + "target = f'training-output-reference-{parameter}-{date}.nc'  # dataset prefix: the test-output-reference file has the same basename\n", + "assert os.system(f'wget -O {target} {url}') == 0, f'wget failed for {url}'\n", + "assert os.path.exists(target)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# `forecast-like-observations`\n", + "\n", + "CPC observations formatted like test period 2020 forecasts" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "url = f'https://storage.ecmwf.europeanweather.cloud/s2s-ai-challenge/data/test-output-reference/{parameter}-{date}.nc'\n", + "target = f'test-output-reference-{parameter}-{date}.nc'  # dataset prefix: the training-output-reference file has the same basename\n", + "assert os.system(f'wget -O {target} {url}') == 0, f'wget failed for {url}'\n", + "assert os.path.exists(target)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} -- GitLab