diff --git a/.gitattributes b/.gitattributes
index 98a8bdfc98aa1d9fea9b583cf0a2654298b22e29..fd75cabbc2bd9a519f3f5d41710149da5eebafb1 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,4 @@
 *.nc filter=lfs diff=lfs merge=lfs -text
 scoring/terciled_observations.zarr/** filter=lfs diff=lfs merge=lfs -text
 *.zarr/** filter=lfs diff=lfs merge=lfs -text
+**/*.zarr/** filter=lfs diff=lfs merge=lfs -text
diff --git a/scoring/scoring_script.py b/scoring/scoring_script.py
index 2b12e1906521ce6661bac8fff90552b80e32fb85..a4f0249176479d4e3167990364cc5264f6e51501 100644
--- a/scoring/scoring_script.py
+++ b/scoring/scoring_script.py
@@ -1,6 +1,7 @@
 import xarray as xr
 import xskillscore as xs
 import numpy as np
+
 import argparse
 from pathlib import Path
 
@@ -9,36 +10,27 @@ if __name__ == "__main__":
     parser.add_argument("prediction", help="The netcdf file with predictions")
     args = parser.parse_args()
 
-    observations_terciled_fin = Path('scoring/terciled_observations.zarr')
-    benchmark_forecasts_terciled_fin = Path("scoring/ECMWF_rt_2020_cdf_terciled.nc")
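+    # cached scoring inputs: terciled 2020 observations and the recalibrated ECMWF benchmark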
+    cache_path = Path("scoring")
+
+    observations_terciled_fin = cache_path / "forecast-like-observations_2020_biweekly_terciled.nc"
+    benchmark_forecasts_terciled_fin = cache_path / "ecmwf_recalibrated_benchmark_2020_biweekly_terciled.nc"
 
-    observations_terciled = xr.open_dataset(observations_terciled_fin, engine='zarr').sel(forecast_reference_time='2020').rename({'category_edge':'category'})
-    a=observations_terciled['2t'].sel(time=(observations_terciled.tp.forecast_reference_time + observations_terciled.tp.lead_time[0]))
-    b=observations_terciled['2t'].sel(time=(observations_terciled.tp.forecast_reference_time + observations_terciled.tp.lead_time[1]))
-    observations_terciled['2t']=xr.concat([a.drop('time'),b.drop('time')],'lead_time')#.rename({'category_edge':'category'})
-    observations_terciled = observations_terciled.drop('time')
+    obs_p = xr.open_dataset(observations_terciled_fin)
 
-    benchmark_forecasts_terciled = xr.open_dataset(benchmark_forecasts_terciled_fin, chunks={})
-    # benchmark_forecasts_terciled = benchmark_forecasts_terciled.rename({'category':'category_edge'})
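+    # submitted forecast; expected to hold tercile probabilities for both t2m and tp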
+    fct_p = xr.open_dataset(args.prediction)
 
-    ML_predictions_terciled = xr.open_dataset(args.prediction).rename({'category_edge':'category'})
-    ML_predictions_terciled = xr.concat([
-            ML_predictions_terciled.isel(step=[0,1]).mean('step'),
-            ML_predictions_terciled.isel(step=[2,3]).mean('step')
-        ], 'lead_time').assign_coords(lead_time=observations_terciled.lead_time)
-    ML_predictions_terciled = ML_predictions_terciled.assign_coords(category=observations_terciled.category)
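+    # recalibrated ECMWF benchmark, already in the same tercile-probability format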
+    bench_p = xr.open_dataset(benchmark_forecasts_terciled_fin)
 
-    for v in ['2t', 'tp']:
-        if v not in ML_predictions_terciled.data_vars:
-            raise ValueError(f'Expected both variables in ML_predictions, didnt find {v} in {ML_predictions_terciled.data_vars}')
 
-    # same number of dimensions
-    assert set(ML_predictions_terciled.dims) == set(benchmark_forecasts_terciled.dims)
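+    # RPS over all 2020 forecast dates; category_edges=None with input_distributions='p' since inputs are tercile probabilities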
+    rps_ML = xs.rps(obs_p, fct_p, category_edges=None, dim='forecast_time', input_distributions='p').compute()
 
+    rps_bench = xs.rps(obs_p, bench_p, category_edges=None, dim='forecast_time', input_distributions='p').compute()
 
-    rps_ML = xs.rps(observations_terciled, ML_predictions_terciled, category_edges=None, dim=['forecast_reference_time'], input_distributions='c').compute()
-    rps_benchmark = xs.rps(observations_terciled, benchmark_forecasts_terciled, category_edges=None, dim=['forecast_reference_time'], input_distributions='c').compute()
-    rpss = 1 - rps_ML / rps_benchmark # positive means ML better than ECMWF benchmark
+    rpss = 1 - rps_ML / rps_bench  # positive means ML better than the ECMWF benchmark
 
     # check for -inf grid cells
     if (rpss==-np.inf).to_array().any():
@@ -47,15 +39,17 @@
         # dirty fix
         rpss = rpss.clip(-1, 1)
 
-    mask = xr.ones_like(rpss.isel(lead_time=0,drop=True)).reset_coords(drop=True).tp
-    boundary_tropics=30
+    # TODO: how to handle requested grid cells where NaN is submitted? for now, also penalize
+    mask = xr.ones_like(rpss.isel(lead_time=0, drop=True)).reset_coords(drop=True).t2m
+    boundary_tropics = 30
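+    # split the score into three latitude bands along a new 'area' dimension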
     mask = xr.concat([mask.where(mask.latitude > boundary_tropics),
                     mask.where(np.abs(mask.latitude) <= boundary_tropics),
                     mask.where((mask.latitude < -boundary_tropics) & (mask.latitude > -60))],'area')
     mask = mask.assign_coords(area=['northern_extratropics', 'tropics', 'southern_extratropics'])
-    mask.name='area'
+    mask.name = 'area'
 
-    mask = mask.where(rpss.tp.isel(lead_time=0,drop=True).notnull())
+    mask = mask.where(rpss.t2m.isel(lead_time=0, drop=True).notnull())
 
     # weighted area mean
     weights = np.cos(np.deg2rad(np.abs(mask.latitude)))
@@ -65,5 +59,7 @@
     # final score
     scores = rpss.weighted(weights).mean('latitude').mean('longitude')
     # spatially weighted score averaged over lead_times and variables to one single value
+
+    # score transferred to the leaderboard
     scores = scores.to_array().mean().reset_coords(drop=True)
     print(scores)