Compare revisions

Andreas Bleuler · Andreas Bleuler · Andreas Bleuler · Andreas Bleuler · Andreas Bleuler · decd7012
--- a/.renku/datasets/286c58b1-dbbc-4caa-a23a-fcb001d5ac51/metadata.yml
+++ b/.renku/datasets/286c58b1-dbbc-4caa-a23a-fcb001d5ac51/metadata.yml
@@ -491,7 +491,7 @@ files:
  - schema:DigitalDocument
  - wfprov:Artifact
  _id: https://renkulab.io/blob/d0b8e0d440de1d96cf57e35c9a47a2699d6f5106/data/covid-19-italy/dati-andamento-nazionale-description.json
-  _label: data/covid-19-italy/dati-andamento-nazionale-description.json@d0b8e0d440de1d96cf57e35c9a47a2699d6f5106
+  _label: data/covid-19-italy/dati-andamento-nazionale-description.json@ba96bbf0f7d54cd9ca144fa283b36c7aa83fd4d6
  _project:
    '@type':
    - prov:Location
@@ -518,7 +518,7 @@ files:
    - schema:DigitalDocument
    - wfprov:Artifact
    _id: https://github.com/blob/bea52c91028eb7131c645d93fb633555c768405a/dati-andamento-nazionale/dati-andamento-nazionale-description.json
-    _label: dati-andamento-nazionale/dati-andamento-nazionale-description.json@bea52c91028eb7131c645d93fb633555c768405a
+    _label: dati-andamento-nazionale/dati-andamento-nazionale-description.json@fe0ee4e99f9927e76be4a7bc3cc8c655e25d2a6d
    _project: null
    added: '2020-03-22T21:42:08.484068+00:00'
    based_on: null
@@ -544,7 +544,7 @@ files:
  - schema:DigitalDocument
  - wfprov:Artifact
  _id: https://renkulab.io/blob/f1d2951e579144735588f5810bdf7f433c41cc13/data/covid-19-italy/dati-regioni-description.json
-  _label: data/covid-19-italy/dati-regioni-description.json@f1d2951e579144735588f5810bdf7f433c41cc13
+  _label: data/covid-19-italy/dati-regioni-description.json@ba96bbf0f7d54cd9ca144fa283b36c7aa83fd4d6
  _project:
    '@type':
    - prov:Location
@@ -571,7 +571,7 @@ files:
    - schema:DigitalDocument
    - wfprov:Artifact
    _id: https://github.com/blob/bea52c91028eb7131c645d93fb633555c768405a/dati-regioni/dati-regioni-description.json
-    _label: dati-regioni/dati-regioni-description.json@bea52c91028eb7131c645d93fb633555c768405a
+    _label: dati-regioni/dati-regioni-description.json@fe0ee4e99f9927e76be4a7bc3cc8c655e25d2a6d
    _project: null
    added: '2020-03-22T21:42:37.536554+00:00'
    based_on: null
@@ -597,12 +597,12 @@ files:
  - schema:DigitalDocument
  - wfprov:Artifact
  _id: https://renkulab.io/blob/2faeb464d32751d89a4d1426b741d65a7fcee263/data/covid-19-italy/dati-province-description.json
-  _label: data/covid-19-italy/dati-province-description.json@2faeb464d32751d89a4d1426b741d65a7fcee263
+  _label: data/covid-19-italy/dati-province-description.json@8eb4980336cf3c05420d344cf68cd4cc234dac42
  _project:
    '@type':
    - prov:Location
    - schema:Project
-    _id: https://renkulab.io/projects/covid-19/covid-19-public-data
+    _id: https://localhost/projects/cramakri/covid-19-dashboard
    created: '2020-03-11T21:43:12.736000+00:00'
    creator:
      '@type':
@@ -624,7 +624,7 @@ files:
    - schema:DigitalDocument
    - wfprov:Artifact
    _id: https://github.com/blob/bea52c91028eb7131c645d93fb633555c768405a/dati-province/dati-province-description.json
-    _label: dati-province/dati-province-description.json@bea52c91028eb7131c645d93fb633555c768405a
+    _label: dati-province/dati-province-description.json@fe0ee4e99f9927e76be4a7bc3cc8c655e25d2a6d
    _project: null
    added: '2020-03-22T21:43:03.933827+00:00'
    based_on: null

--- a/data/covid-19-italy/dati-province-description.json
+++ b/data/covid-19-italy/dati-province-description.json
--- a/notebooks/examples/italian-data-example.ipynb
+++ b/notebooks/examples/italian-data-example.ipynb
+%% Cell type:code id: tags:
+
+``` python
+import json
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+import ipywidgets as widgets
+```
+
+%% Cell type:code id: tags:
+
+``` python
+def translate_columns(df, description_filename):
+    description_file_path = os.path.join(data_folder, description_filename)
+
+    with open(description_file_path, 'r') as description_file:
+        decoded_data = description_file.read().encode().decode('utf-8-sig')
+        descriptions = json.loads(decoded_data)
+        descriptions = { column_dict['Nome campo']: column_dict for column_dict in descriptions}
+
+    def rename_column(col):
+        new_col = descriptions[col]['Field name']
+        if len(new_col) > 30:
+            return "{}...".format(new_col[:30])
+        else:
+            return new_col
+
+    df.rename(columns=rename_column, inplace=True)
+    return df
+
+def set_time_index(df):
+    timestamp = pd.DatetimeIndex(df['date'])
+    df.set_index(timestamp, inplace=True)
+    del df['date']
+    return df
+
+def prepare_dataframe(data_folder, df_filename, description_filename, use_time_index=False):
+    data_file_path = os.path.join(data_folder, df_filename)
+    df = pd.read_csv(data_file_path)
+    df = translate_columns(df, description_filename)
+    if use_time_index:
+        df = set_time_index(df)
+    return df
+```
+
+%% Cell type:code id: tags:
+
+``` python
+data_folder = "../../data/covid-19-italy/"
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df_national_trend = prepare_dataframe(
+    data_folder,
+    "dpc-covid19-ita-andamento-nazionale.csv",
+    "dati-andamento-nazionale-description.json",
+    use_time_index=True
+)
+df_national_trend.tail()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+import math
+df_national_trend["New cases"] = df_national_trend["total_cases"].diff().rolling('5d').mean().apply(lambda x: math.log10(x))
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df_national_trend.plot(y="New cases", kind="bar")
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df_provinces = prepare_dataframe(
+    data_folder,
+    "dpc-covid19-ita-province.csv",
+    "dati-province-description.json",
+    use_time_index=True
+)
+df_hotspots = df_provinces.loc[
+    df_provinces['province'].isin(['Bergamo'])
+].groupby('date').sum().loc[:,['total_cases']]
+df_hotspots = set_time_index(df_hotspots.reset_index())
+df_hotspots['New positive cases'] = df_hotspots['total_cases'].diff()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+ax = df_hotspots.plot(y='New positive cases', kind='bar')
+df_hotspots['New positive cases'].rolling('6d').mean().plot(y='New positive cases')
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df_hotspots.rolling('3d').mean().plot(y='total_cases', kind='bar', logy=True)
+plt.gca().set_title('Total amound of positive tested cases in Bergamo and Lodi');
+```
+
+%% Cell type:code id: tags:
+
+``` python
+provinces = set(df_provinces.loc[df_provinces['region']=='Lombardia']['province'])
+provinces.discard('In fase di definizione/aggiornamento')
+provinces
+```
+
+%% Cell type:code id: tags:
+
+``` python
+def get_growth_factor_series(province, df, N_min=1000):
+    ts = df.loc[
+                (df['province'] == province) & \
+                (df['total_cases'] >= N_min)
+            ] \
+            ['total_cases'] \
+            .rolling('3d') \
+            .mean() \
+            .pct_change() \
+            .add(1.0)
+    return ts.iloc[1:]
+```
+
+%% Cell type:code id: tags:
+
+``` python
+def plot_growth_factors(provinces, df, N_min=1000):
+    plt.figure(figsize=(9, 6))
+    datemin = datemax = df.index[-1]
+    for province in provinces:
+        data = get_growth_factor_series(province, df, N_min=N_min)
+        if len(data) >= 1:
+            data.plot(label=province)
+            datemin = min(data.index[0], datemin)
+
+    ax = plt.gca()
+    plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1.0), frameon=False)
+    plt.plot([datemin, datemax], [1,1], color='gray', alpha=0.2)
+    plt.ylim(0.95, ax.get_ylim()[1])
+    plt.title('Daily growth rate of total cases per province')
+    plt.xlabel('');
+
+def plot_total_cases(provinces, df, N_min=500):
+    plt.figure(figsize=(9, 6))
+    for province in provinces:
+        data = df.loc[
+                (df['province'] == province) & \
+                (df['total_cases'] >= N_min)
+            ] \
+            ['total_cases'] \
+            .rolling('1d') \
+            .mean() \
+            .add(1.0)
+        if len(data) >= 1:
+            data.plot(label=province, logy=True)
+
+    ax = plt.gca()
+    plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1.0), frameon=False)
+    plt.title('Total cases per province')
+    plt.xlabel('');
+```
+
+%% Cell type:code id: tags:
+
+``` python
+plot_growth_factors(provinces, df_provinces)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+plot_total_cases(provinces, df_provinces, N_min=10)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+widgets.Dropdown(
+    options=['1', '2', '3'],
+    value='2',
+    description='Number:',
+    disabled=False,
+)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+provinces
+```
+%% Cell type:code id: tags:
+
+``` python
+import json
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+import ipywidgets as widgets
+```
+
+%% Cell type:code id: tags:
+
+``` python
+def translate_columns(df, description_filename):
+    description_file_path = os.path.join(data_folder, description_filename)
+
+    with open(description_file_path, 'r') as description_file:
+        decoded_data = description_file.read().encode().decode('utf-8-sig')
+        descriptions = json.loads(decoded_data)
+        descriptions = { column_dict['Nome campo']: column_dict for column_dict in descriptions}
+
+    def rename_column(col):
+        new_col = descriptions[col]['Field name']
+        if len(new_col) > 30:
+            return "{}...".format(new_col[:30])
+        else:
+            return new_col
+
+    df.rename(columns=rename_column, inplace=True)
+    return df
+
+def set_time_index(df):
+    timestamp = pd.DatetimeIndex(df['date'])
+    df.set_index(timestamp, inplace=True)
+    del df['date']
+    return df
+
+def prepare_dataframe(data_folder, df_filename, description_filename, use_time_index=False):
+    data_file_path = os.path.join(data_folder, df_filename)
+    df = pd.read_csv(data_file_path)
+    df = translate_columns(df, description_filename)
+    if use_time_index:
+        df = set_time_index(df)
+    return df
+```
+
+%% Cell type:code id: tags:
+
+``` python
+data_folder = "../../data/covid-19-italy/"
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df_national_trend = prepare_dataframe(
+    data_folder,
+    "dpc-covid19-ita-andamento-nazionale.csv",
+    "dati-andamento-nazionale-description.json",
+    use_time_index=True
+)
+df_national_trend.tail()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+import math
+df_national_trend["New cases"] = df_national_trend["total_cases"].diff().rolling('5d').mean().apply(lambda x: math.log10(x))
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df_national_trend.plot(y="New cases", kind="bar")
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df_provinces = prepare_dataframe(
+    data_folder,
+    "dpc-covid19-ita-province.csv",
+    "dati-province-description.json",
+    use_time_index=True
+)
+df_hotspots = df_provinces.loc[
+    df_provinces['province'].isin(['Bergamo'])
+].groupby('date').sum().loc[:,['total_cases']]
+df_hotspots = set_time_index(df_hotspots.reset_index())
+df_hotspots['New positive cases'] = df_hotspots['total_cases'].diff()
+```
+
+%% Cell type:code id: tags:
+
+``` python
+ax = df_hotspots.plot(y='New positive cases', kind='bar')
+df_hotspots['New positive cases'].rolling('6d').mean().plot(y='New positive cases')
+```
+
+%% Cell type:code id: tags:
+
+``` python
+df_hotspots.rolling('3d').mean().plot(y='total_cases', kind='bar', logy=True)
+plt.gca().set_title('Total amound of positive tested cases in Bergamo and Lodi');
+```
+
+%% Cell type:code id: tags:
+
+``` python
+provinces = set(df_provinces.loc[df_provinces['region']=='Lombardia']['province'])
+provinces.discard('In fase di definizione/aggiornamento')
+provinces
+```
+
+%% Cell type:code id: tags:
+
+``` python
+def get_growth_factor_series(province, df, N_min=1000):
+    ts = df.loc[
+                (df['province'] == province) & \
+                (df['total_cases'] >= N_min)
+            ] \
+            ['total_cases'] \
+            .rolling('3d') \
+            .mean() \
+            .pct_change() \
+            .add(1.0)
+    return ts.iloc[1:]
+```
+
+%% Cell type:code id: tags:
+
+``` python
+def plot_growth_factors(provinces, df, N_min=1000):
+    plt.figure(figsize=(9, 6))
+    datemin = datemax = df.index[-1]
+    for province in provinces:
+        data = get_growth_factor_series(province, df, N_min=N_min)
+        if len(data) >= 1:
+            data.plot(label=province)
+            datemin = min(data.index[0], datemin)
+
+    ax = plt.gca()
+    plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1.0), frameon=False)
+    plt.plot([datemin, datemax], [1,1], color='gray', alpha=0.2)
+    plt.ylim(0.95, ax.get_ylim()[1])
+    plt.title('Daily growth rate of total cases per province')
+    plt.xlabel('');
+
+def plot_total_cases(provinces, df, N_min=500):
+    plt.figure(figsize=(9, 6))
+    for province in provinces:
+        data = df.loc[
+                (df['province'] == province) & \
+                (df['total_cases'] >= N_min)
+            ] \
+            ['total_cases'] \
+            .rolling('1d') \
+            .mean() \
+            .add(1.0)
+        if len(data) >= 1:
+            data.plot(label=province, logy=True)
+
+    ax = plt.gca()
+    plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1.0), frameon=False)
+    plt.title('Total cases per province')
+    plt.xlabel('');
+```
+
+%% Cell type:code id: tags:
+
+``` python
+plot_growth_factors(provinces, df_provinces)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+plot_total_cases(provinces, df_provinces, N_min=10)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+widgets.Dropdown(
+    options=['1', '2', '3'],
+    value='2',
+    description='Number:',
+    disabled=False,
+)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+provinces
+```
No results found