In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules (e.g. the local
# covid_19_dashboard package) are reloaded before each cell runs, so edits to them
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Standardizing the various Covid-19 datasets

This notebook demonstrates how to pull data from various datasets together in one place, and why doing so is useful. A lot of information gets lost when numbers are compared across entities that are too large. For example, we have excellent data available for Italy broken down by region (and even by province). We also have data for Switzerland per canton. These datasets, however, each have their own schemas and peculiarities — some upfront work is therefore needed before they can be treated equally. 

We have implemented a set of "converters" to standardize the various datasets to a subset of useful fields. Each converter is aware of the details of its own dataset and produces a view of that dataset that is homogenized with the others. In this way, data of very different origins can be visualized with a few simple commands. 

In [2]:
from pathlib import Path

import altair as alt
import pandas as pd

from covid_19_dashboard import helper, plotting
from covid_19_dashboard.converters import CaseConverter
from covid_19_dashboard.converters.switzerland import OpenZHCaseConverter
from covid_19_dashboard.converters.covidtracking import CovidtrackingCaseConverter
from covid_19_dashboard.converters.spain import SpainCaseConverter

In [3]:
# Run CaseConverter.read_convert on every dataset directory, in this order, and
# keep the resulting frames in a list.
df_list = [
    CaseConverter.read_convert(dataset_dir)
    for dataset_dir in (
        '../../data/openzh-covid-19',
        '../../data/covid-19-italy',
        '../../data/covidtracking/',
        '../../data/covid-19-spain',
    )
]

# Stack the per-dataset frames into one frame with a fresh 0..n-1 index, then
# parse the date column into pandas datetimes.
df_all = pd.concat(df_list, ignore_index=True)
df_all['date'] = pd.to_datetime(df_all['date'])

using: <class 'covid_19_dashboard.converters.covidtracking.CovidtrackingCaseConverter'>
using: <class 'covid_19_dashboard.converters.italy.ItalyCaseConverter'>
using: <class 'covid_19_dashboard.converters.spain.SpainCaseConverter'>
using: <class 'covid_19_dashboard.converters.switzerland.OpenZHCaseConverter'>
using: <class 'covid_19_dashboard.converters.covidtracking.CovidtrackingCaseConverter'>
using: <class 'covid_19_dashboard.converters.italy.ItalyCaseConverter'>
using: <class 'covid_19_dashboard.converters.covidtracking.CovidtrackingCaseConverter'>
using: <class 'covid_19_dashboard.converters.covidtracking.CovidtrackingCaseConverter'>
using: <class 'covid_19_dashboard.converters.italy.ItalyCaseConverter'>
using: <class 'covid_19_dashboard.converters.spain.SpainCaseConverter'>


In [4]:
# Read the Spanish dataset on its own through its dedicated converter class
# (presumably still in the source schema — conversion is a separate `convert` call).
df_esp = SpainCaseConverter.read_data('../../data/covid-19-spain')

In [5]:

# Convert the Spanish frame; the result is not assigned, it is only displayed as
# this cell's output.
SpainCaseConverter.convert(df_esp)

Unnamed: 0,date,country,region_iso,region_label,tested,positive,deceased,population,positive_100k,deceased_100k
0,2020-02-27,ESP,ES-AN,Andalucía,,1,,8409738,0.011891,
1,2020-02-28,ESP,ES-AN,Andalucía,,6,,8409738,0.071346,
2,2020-02-29,ESP,ES-AN,Andalucía,,8,,8409738,0.095128,
3,2020-03-01,ESP,ES-AN,Andalucía,,12,,8409738,0.142692,
4,2020-03-02,ESP,ES-AN,Andalucía,,12,,8409738,0.142692,
...,...,...,...,...,...,...,...,...,...,...
622,2020-03-26,ESP,ES-RI,La Rioja,,995,43.0,315675,315.197592,13.621604
623,2020-03-27,ESP,ES-RI,La Rioja,,1236,55.0,315675,391.541934,17.422982
624,2020-03-28,ESP,ES-RI,La Rioja,,1436,65.0,315675,454.898234,20.590797
625,2020-03-29,ESP,ES-RI,La Rioja,,1629,68.0,315675,516.037063,21.541142


In [8]:
# Regions to compare in the charts; the values are matched against the
# `region_label` column of the combined frame.
regions = [
    'Lombardy',
    'Ticino',
    'Zürich',
    'Madrid',
    'New York',
    'Washington',
    'Louisiana',
]

In [9]:
# Line chart of positive cases over time, one coloured line per selected region.
base = alt.Chart(df_all.loc[df_all['region_label'].isin(regions)])
base.mark_line().encode(
    x=alt.X('date'),
    y=alt.Y('positive', scale=alt.Scale(type='linear')),
    color='region_label',
)

In [10]:
# Build the "since" frame for the selected regions with helper.make_since_df,
# using a threshold of 100 (the x-axis title labels it as the 100th case).
since_df_positive = helper.make_since_df(
    df_all[df_all['region_label'].isin(regions)],
    region_column='region_label',
    start_case=100,
)
base = alt.Chart(since_df_positive).properties(height=300, width=300)

# Same chart twice from the same base: absolute counts, then counts per 100k.
days_log, days_log_100k = (
    plotting.make_region_since_chart(
        base, value_column, 'sinceDay0', 'region_label',
        'Days since 100th case', label, label, 'Region',
    )
    for value_column, label in (('positive', 'Cases'), ('positive_100k', 'Cases/100k'))
)

# Show both charts side by side as the cell output.
alt.hconcat(days_log, days_log_100k)

In [11]:
# Build the "since" frame for the selected regions from the `deceased` column,
# using a threshold of 10 (the x-axis title labels it as the 10th death).
since_df_deceased = helper.make_since_df(
    df_all[df_all['region_label'].isin(regions)],
    column='deceased',
    region_column='region_label',
    start_case=10,
)
base = alt.Chart(since_df_deceased).properties(height=300, width=300)

# Same chart twice from the same base: absolute deaths, then deaths per 100k.
days_log, days_log_100k = (
    plotting.make_region_since_chart(
        base, value_column, 'sinceDay0', 'region_label',
        'Days since 10th death', label, label, 'Region',
    )
    for value_column, label in (('deceased', 'Deaths'), ('deceased_100k', 'Deaths/100k'))
)

# Show both charts side by side as the cell output.
alt.hconcat(days_log, days_log_100k)