Skip to content
Snippets Groups Projects
Commit c46f4060 authored by Rok Roškar's avatar Rok Roškar Committed by Rok Roškar
Browse files

feat: added data from ECDC w/ notebook and converter

parent ed91e09f
No related branches found
No related tags found
1 merge request!129feat: add data from ECDC
......@@ -66,6 +66,7 @@ update:
- renku dataset update covid-19-italy
- renku dataset update covid-19-spain
- renku dataset update covid-19-us-nyt
- renku dataset add covid-19-ecdc -d covid-19-ecdc.csv https://opendata.ecdc.europa.eu/covid19/casedistribution/csv --force
- renku dataset add covid-19-chile -s data/*.csv -s data/covid19_chile.rds https://github.com/itoledor/coronavirus.git --force
- renku dataset add covidtracker https://ocgptweb.azurewebsites.net/CSVDownload -d covidtracker.csv --force
- renku rerun data/covidtracking/states-metadata.json data/covidtracking/states-daily.json
......
......@@ -9,6 +9,7 @@ For each data source, we provide a simple summary notebook with interactive
figures:
* [Summary of global data from JHU CSSE](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/Dashboard.run.ipynb)
* [Global data from ECDC](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-ecdc.run.ipynb)
* [U.S. state-level data from covidtracking.com](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covidtracking.run.ipynb)
* [U.S. county-level data from the New York Times](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-us-nyt.run.ipynb)
* [Regional data for Italy from Italian Civil Protection](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-italy.run.ipynb)
......@@ -115,6 +116,12 @@ useful helper and plotting functions that are used in the sample notebooks.
<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/Dashboard.run.ipynb">notebooks/Dashboard.ipynb</a></td>
</tr>
<tr>
<td><a href="https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide">Covid-19 data collected by the ECDC</a></td>
<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/datasets/78a35752-cc00-443d-8ed8-e37a82599099/">covid-19-ecdc</a></td>
<td><code>data/covid-19-ecdc</code></td>
<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-ecdc.run.ipynb">notebooks/covid-19-ecdc.ipynb</a></td>
</tr>
<tr>
<td><a href="https://covidtracking.com/">covidtracking.com</a></td>
<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/datasets/c8bec148-5332-4602-9dc3-e39bbe92ed67/">covidtracking</a></td>
<td><code>data/covidtracking</code></td>
......@@ -173,6 +180,12 @@ CSSE)](https://github.com/CSSEGISandData/COVID-19). The
[dashboard](covid-19-public-data/files/blob/runs/Dashboard.run.ipynb) summarizes
this data in combination with population data from the world bank.
### Covid-19 Data collected by the ECDC
A global dataset collected by a team of epidemiologists at the [European Center
for Disease Prevention and
Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide).
### Covid tracking crowdsourcing project
[Covid tracking](https://covidtracking.com) is a crowd-sourced dataset for US
......
%% Cell type:code id: tags:
``` python
%load_ext autoreload
%autoreload 2
```
%% Cell type:code id: tags:
``` python
from datetime import datetime
import altair as alt
import pandas as pd
from IPython.display import display, HTML
from covid_19_utils import helper, plotting
from covid_19_utils.converters import CaseConverter
```
%% Cell type:markdown id: tags:
# Covid-19 Global Data from ECDC
This dataset is collected by the European Center for Disease Prevention and Control and can be found [here](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide).
%% Cell type:code id: tags:
``` python
# Attribution footer (data source and project link); the chart cells below
# display it right after each chart.
html_credits=HTML('''
<p style="font-size: smaller">Data Sources:
<a href="https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide">ECDC</a>
<br>
Analysis and Visualization:
<a href="https://renkulab.io/projects/covid-19/covid-19-public-data">Covid-19 Public Data Collaboration Project @ renkulab.io</a>
</p>''')
```
%% Cell type:code id: tags:parameters
``` python
# Relative paths to the ECDC dataset folder and to the atlas folder.
# NOTE(review): this cell is tagged `parameters`, presumably so a notebook
# runner can override these values -- confirm before adding logic here.
data_path = '../data/covid-19-ecdc'
atlas_path = '../data/atlas'
```
%% Cell type:code id: tags:
``` python
# Build a converter from the atlas folder, then read the ECDC data found under
# data_path and convert it; the charts below use the resulting frame `df`.
converter = CaseConverter(atlas_path)
df = converter.read_convert(data_path)
```
%% Cell type:code id: tags:
``` python
# Minimum peak case count a country needs in order to be charted individually.
nthresh = 10000
# Countries left out of the per-country charts regardless of their case counts.
excluded_countries = {'Andorra', 'Iceland', 'San Marino'}
# Only rows dated strictly after this day are plotted.
start_date = datetime.fromisoformat('2020-02-01')

# Highest cumulative case count reached by each country.
country_max_ser = (
    df.set_index(['country_label', 'date'])['positive']
    .groupby(level='country_label')
    .max()
)
countries_over_thresh = [
    label
    for label in country_max_ser[country_max_ser > nthresh].index
    if label not in excluded_countries
]

# Restrict the data to the selected countries and the plotted period.
after_start = df.date > start_date
in_selection = df.country_label.isin(countries_over_thresh)
thresh_df = df.loc[after_start & in_selection]
```
%% Cell type:markdown id: tags:
## Daily deaths globally
%% Cell type:markdown id: tags:
The chart below shows the daily number of covid-19 related deaths reported worldwide since February 1st, 2020.
%% Cell type:code id: tags:
``` python
# Sum the daily deaths over all countries for every date after start_date.
global_daily_deaths = (
    df.loc[df.date > start_date]
    .groupby('date')['deceased_daily']
    .sum()
    .reset_index()
)

# Single line chart of the worldwide daily death count.
chart = (
    alt.Chart(global_daily_deaths)
    .mark_line()
    .encode(
        x=alt.X('date', title='Date'),
        y=alt.Y('deceased_daily', title='Daily deaths'),
    )
)

display(chart)
display(html_credits)
```
%% Cell type:markdown id: tags:
Daily deaths in countries with over 10,000 confirmed cases, ordered by total number of deaths.
%% Cell type:code id: tags:
``` python
# Facet order: countries sorted by their highest cumulative death count,
# largest first.
peak_deaths = thresh_df.groupby(['country_label'])['deceased'].max()
order = peak_deaths.sort_values(ascending=False).index.tolist()

# One small line chart of daily deaths per selected country, five per row.
base = alt.Chart(thresh_df)
chart = (
    base.mark_line()
    .encode(
        x=alt.X('date', title='Date'),
        y=alt.Y('deceased_daily', title='Daily deaths'),
        facet=alt.Facet(
            'country_label',
            sort=alt.SortArray(order),
            columns=5,
            title='',
        ),
    )
    .properties(height=150, width=150)
)

display(chart)
display(html_credits)
```
%% Cell type:code id: tags:
``` python
```
......@@ -4,4 +4,4 @@ __author__ = """Chandrasekhar Ramakrishnan"""
__email__ = "cramakri@ethz.ch"
__version__ = "0.1.0"
from .converters import covidtracking, italy, jhu, nyt, spain, switzerland
from .converters import covidtracking, ecdc, italy, jhu, nyt, spain, switzerland
......@@ -44,19 +44,21 @@ class CaseConverterImpl:
column_list = []
neg_column_list = []
common_columns = [
"date",
"country",
"country_label",
"region_iso",
"region_label",
"admin2",
"admin2_label",
"tested",
"positive",
"admin2",
"country_label",
"country",
"date",
"deceased_100k",
"deceased_daily",
"deceased",
"positive_100k",
"deceased_100k",
"positive_daily",
"positive",
"region_iso",
"region_label",
"tested_100k",
"tested",
]
def __init__(self, atlas_folder):
......@@ -83,7 +85,7 @@ class CaseConverterImpl:
def _set_common_columns(self, df):
"""Use only the common columns; add missing ones when needed."""
try:
# TODO: Int32 (as opposed to int) is an intger type that allows nan
# TODO: Int32 (as opposed to int) is an integer type that allows nan
df["population"] = df.population.astype(int)
except ValueError:
pass
......@@ -103,6 +105,9 @@ class CaseConverterImpl:
return df[self.common_columns]
# def _compute_daily_counts(self, df):
# """
@classmethod
def _register(cls):
    """Store this converter class in ``_converter_registry`` under its class name."""
    registry_key = cls.__name__
    _converter_registry[registry_key] = cls
"""
Covid-19 converters for data from the European Center for Disease Prevention and Control.
Obtained from https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide
"""
from pathlib import Path
import pandas as pd
from . import CaseConverterImpl as CaseConverter
from .. import helper
class ECDCCaseConverter(CaseConverter):
    """
    Converter for worldwide data from the ECDC.

    Renames the ECDC columns to the common converter names, derives cumulative
    case and death totals per country from the daily counts, and hands the
    result to ``_set_common_columns``.
    """

    # Maps the column names of the ECDC CSV to the names used by the converters.
    conversion_dict = {
        "dateRep": "date",
        "cases": "positive_daily",
        "deaths": "deceased_daily",
        "countryterritoryCode": "country",
        "countriesAndTerritories": "country_label",
        "popData2018": "population",
    }

    # ECDC column names that this converter renames.
    column_list = list(conversion_dict.keys())

    def convert(self, df):
        """Convert a raw ECDC frame to the common column layout.

        Args:
            df: DataFrame as returned by ``read_data``; it must contain the
                columns listed in ``conversion_dict``.

        Returns:
            The result of ``_set_common_columns`` applied to the converted
            frame, with rows sorted by country label and date.
        """
        # rename the existing columns
        df_conv = df.rename(columns=ECDCCaseConverter.conversion_dict)

        # convert date; ambiguous dates are parsed with the day first
        df_conv["date"] = pd.to_datetime(df_conv["date"], dayfirst=True)

        # compute totals: the running sums below are only meaningful once the
        # rows are in chronological order within each country
        df_conv = (
            df_conv.set_index(["country_label", "date"]).sort_index().reset_index()
        )
        df_conv["positive"] = df_conv.groupby("country_label")[
            "positive_daily"
        ].cumsum()
        df_conv["deceased"] = df_conv.groupby("country_label")[
            "deceased_daily"
        ].cumsum()

        # replace underscores with spaces; the vectorized string accessor
        # tolerates missing labels and empty frames, unlike a row-wise apply
        df_conv["country_label"] = df_conv["country_label"].str.replace(
            "_", " ", regex=False
        )

        return self._set_common_columns(df_conv)

    @classmethod
    def read_data(cls, path):
        """Read the ECDC data from ``covid-19-ecdc.csv`` inside ``path``.

        Args:
            path: Folder (string or path-like) containing ``covid-19-ecdc.csv``.

        Returns:
            The CSV contents as a pandas DataFrame.
        """
        return pd.read_csv(Path(path) / "covid-19-ecdc.csv")
ECDCCaseConverter._register()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment