Skip to content
Snippets Groups Projects

use proper data

Merged Aaron Spring requested to merge AS_use_proper_data into master
2 files
+ 19
27
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 18
27
import xarray as xr
import xskillscore as xs
import numpy as np
import argparse
from pathlib import Path
@@ -9,36 +10,23 @@ if __name__ == "__main__":
parser.add_argument("prediction", help="The netcdf file with predictions")
args = parser.parse_args()
observations_terciled_fin = Path('scoring/terciled_observations.zarr')
benchmark_forecasts_terciled_fin = Path("scoring/ECMWF_rt_2020_cdf_terciled.nc")
cache_path = "scoring"
observations_terciled_fin = Path(f'{cache_path}/forecast-like-observations_2020_biweekly_terciled.nc')
benchmark_forecasts_terciled_fin = Path(f"{cache_path}/ecmwf_recalibrated_benchmark_2020_biweekly_terciled.nc")
observations_terciled = xr.open_dataset(observations_terciled_fin, engine='zarr').sel(forecast_reference_time='2020').rename({'category_edge':'category'})
a=observations_terciled['2t'].sel(time=(observations_terciled.tp.forecast_reference_time + observations_terciled.tp.lead_time[0]))
b=observations_terciled['2t'].sel(time=(observations_terciled.tp.forecast_reference_time + observations_terciled.tp.lead_time[1]))
observations_terciled['2t']=xr.concat([a.drop('time'),b.drop('time')],'lead_time')#.rename({'category_edge':'category'})
observations_terciled = observations_terciled.drop('time')
obs_p = xr.open_dataset(observations_terciled_fin)
benchmark_forecasts_terciled = xr.open_dataset(benchmark_forecasts_terciled_fin, chunks={})
# benchmark_forecasts_terciled = benchmark_forecasts_terciled.rename({'category':'category_edge'})
fct_p = xr.open_dataset(args.prediction)
ML_predictions_terciled = xr.open_dataset(args.prediction).rename({'category_edge':'category'})
ML_predictions_terciled = xr.concat([
ML_predictions_terciled.isel(step=[0,1]).mean('step'),
ML_predictions_terciled.isel(step=[2,3]).mean('step')
], 'lead_time').assign_coords(lead_time=observations_terciled.lead_time)
ML_predictions_terciled = ML_predictions_terciled.assign_coords(category=observations_terciled.category)
bench_p = xr.open_dataset(benchmark_forecasts_terciled_fin)
for v in ['2t', 'tp']:
if v not in ML_predictions_terciled.data_vars:
raise ValueError(f'Expected both variables in ML_predictions, didnt find {v} in {ML_predictions_terciled.data_vars}')
# same number of dimensions
assert set(ML_predictions_terciled.dims) == set(benchmark_forecasts_terciled.dims)
rps_ML = xs.rps(obs_p, fct_p, category_edges=None, dim='forecast_time', input_distributions='p').compute()
rps_bench = xs.rps(obs_p, bench_p, category_edges=None, dim='forecast_time', input_distributions='p').compute()
rps_ML = xs.rps(observations_terciled, ML_predictions_terciled, category_edges=None, dim=['forecast_reference_time'], input_distributions='c').compute()
rps_benchmark = xs.rps(observations_terciled, benchmark_forecasts_terciled, category_edges=None, dim=['forecast_reference_time'], input_distributions='c').compute()
rpss = 1 - rps_ML / rps_benchmark # positive means ML better than ECMWF benchmark
rpss = (1 - rps_ML / rps_bench)
# check for -inf grid cells
if (rpss==-np.inf).to_array().any():
@@ -47,15 +35,16 @@ if __name__ == "__main__":
# dirty fix
rpss = rpss.clip(-1, 1)
mask = xr.ones_like(rpss.isel(lead_time=0,drop=True)).reset_coords(drop=True).tp
boundary_tropics=30
# what to do with requested grid cells where NaN is submitted? also penalize
mask = xr.ones_like(rpss.isel(lead_time=0, drop=True)).reset_coords(drop=True).t2m
boundary_tropics = 30
mask = xr.concat([mask.where(mask.latitude > boundary_tropics),
mask.where(np.abs(mask.latitude) <= boundary_tropics),
mask.where((mask.latitude < -boundary_tropics) & (mask.latitude > -60))],'area')
mask = mask.assign_coords(area=['northern_extratropics', 'tropics', 'southern_extratropics'])
mask.name='area'
mask.name = 'area'
mask = mask.where(rpss.tp.isel(lead_time=0,drop=True).notnull())
mask = mask.where(rpss.t2m.isel(lead_time=0, drop=True).notnull())
# weighted area mean
weights = np.cos(np.deg2rad(np.abs(mask.latitude)))
@@ -65,5 +54,7 @@ if __name__ == "__main__":
# final score
scores = rpss.weighted(weights).mean('latitude').mean('longitude')
# spatially weighted score averaged over lead_times and variables to one single value
# score transfered to leaderboard
scores = scores.to_array().mean().reset_coords(drop=True)
print(scores)
Loading