diff --git a/.gitattributes b/.gitattributes index 98a8bdfc98aa1d9fea9b583cf0a2654298b22e29..fd75cabbc2bd9a519f3f5d41710149da5eebafb1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,4 @@ *.nc filter=lfs diff=lfs merge=lfs -text scoring/terciled_observations.zarr/** filter=lfs diff=lfs merge=lfs -text *.zarr/** filter=lfs diff=lfs merge=lfs -text +**/*.zarr/** filter=lfs diff=lfs merge=lfs -text diff --git a/scoring/scoring_script.py b/scoring/scoring_script.py index 2b12e1906521ce6661bac8fff90552b80e32fb85..a4f0249176479d4e3167990364cc5264f6e51501 100644 --- a/scoring/scoring_script.py +++ b/scoring/scoring_script.py @@ -1,6 +1,7 @@ import xarray as xr import xskillscore as xs import numpy as np + import argparse from pathlib import Path @@ -9,36 +10,23 @@ if __name__ == "__main__": parser.add_argument("prediction", help="The netcdf file with predictions") args = parser.parse_args() - observations_terciled_fin = Path('scoring/terciled_observations.zarr') - benchmark_forecasts_terciled_fin = Path("scoring/ECMWF_rt_2020_cdf_terciled.nc") + cache_path = "scoring" + + observations_terciled_fin = Path(f'{cache_path}/forecast-like-observations_2020_biweekly_terciled.nc') + benchmark_forecasts_terciled_fin = Path(f"{cache_path}/ecmwf_recalibrated_benchmark_2020_biweekly_terciled.nc") - observations_terciled = xr.open_dataset(observations_terciled_fin, engine='zarr').sel(forecast_reference_time='2020').rename({'category_edge':'category'}) - a=observations_terciled['2t'].sel(time=(observations_terciled.tp.forecast_reference_time + observations_terciled.tp.lead_time[0])) - b=observations_terciled['2t'].sel(time=(observations_terciled.tp.forecast_reference_time + observations_terciled.tp.lead_time[1])) - observations_terciled['2t']=xr.concat([a.drop('time'),b.drop('time')],'lead_time')#.rename({'category_edge':'category'}) - observations_terciled = observations_terciled.drop('time') + obs_p = xr.open_dataset(observations_terciled_fin) - 
benchmark_forecasts_terciled = xr.open_dataset(benchmark_forecasts_terciled_fin, chunks={}) - # benchmark_forecasts_terciled = benchmark_forecasts_terciled.rename({'category':'category_edge'}) + fct_p = xr.open_dataset(args.prediction) - ML_predictions_terciled = xr.open_dataset(args.prediction).rename({'category_edge':'category'}) - ML_predictions_terciled = xr.concat([ - ML_predictions_terciled.isel(step=[0,1]).mean('step'), - ML_predictions_terciled.isel(step=[2,3]).mean('step') - ], 'lead_time').assign_coords(lead_time=observations_terciled.lead_time) - ML_predictions_terciled = ML_predictions_terciled.assign_coords(category=observations_terciled.category) + bench_p = xr.open_dataset(benchmark_forecasts_terciled_fin) - for v in ['2t', 'tp']: - if v not in ML_predictions_terciled.data_vars: - raise ValueError(f'Expected both variables in ML_predictions, didnt find {v} in {ML_predictions_terciled.data_vars}') - # same number of dimensions - assert set(ML_predictions_terciled.dims) == set(benchmark_forecasts_terciled.dims) + rps_ML = xs.rps(obs_p, fct_p, category_edges=None, dim='forecast_time', input_distributions='p').compute() + rps_bench = xs.rps(obs_p, bench_p, category_edges=None, dim='forecast_time', input_distributions='p').compute() - rps_ML = xs.rps(observations_terciled, ML_predictions_terciled, category_edges=None, dim=['forecast_reference_time'], input_distributions='c').compute() - rps_benchmark = xs.rps(observations_terciled, benchmark_forecasts_terciled, category_edges=None, dim=['forecast_reference_time'], input_distributions='c').compute() - rpss = 1 - rps_ML / rps_benchmark # positive means ML better than ECMWF benchmark + rpss = (1 - rps_ML / rps_bench) # check for -inf grid cells if (rpss==-np.inf).to_array().any(): @@ -47,15 +35,16 @@ if __name__ == "__main__": # dirty fix rpss = rpss.clip(-1, 1) - mask = xr.ones_like(rpss.isel(lead_time=0,drop=True)).reset_coords(drop=True).tp - boundary_tropics=30 + # what to do with requested grid cells 
where NaN is submitted? also penalize + mask = xr.ones_like(rpss.isel(lead_time=0, drop=True)).reset_coords(drop=True).t2m + boundary_tropics = 30 mask = xr.concat([mask.where(mask.latitude > boundary_tropics), mask.where(np.abs(mask.latitude) <= boundary_tropics), mask.where((mask.latitude < -boundary_tropics) & (mask.latitude > -60))],'area') mask = mask.assign_coords(area=['northern_extratropics', 'tropics', 'southern_extratropics']) - mask.name='area' + mask.name = 'area' - mask = mask.where(rpss.tp.isel(lead_time=0,drop=True).notnull()) + mask = mask.where(rpss.t2m.isel(lead_time=0, drop=True).notnull()) # weighted area mean weights = np.cos(np.deg2rad(np.abs(mask.latitude))) @@ -65,5 +54,7 @@ if __name__ == "__main__": # final score scores = rpss.weighted(weights).mean('latitude').mean('longitude') # spatially weighted score averaged over lead_times and variables to one single value + + # score transferred to leaderboard scores = scores.to_array().mean().reset_coords(drop=True) print(scores)