feat: added data from ECDC w/ notebook and converter

c46f4060 · Rok Roškar · Rok Roškar · ed91e09f · c46f4060 · c46f4060
Commit c46f4060 authored 4 years ago by Rok Roškar Committed by Rok Roškar 4 years ago
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -66,6 +66,7 @@ update:
    - renku dataset update covid-19-italy
    - renku dataset update covid-19-spain
    - renku dataset update covid-19-us-nyt
+    - renku dataset add covid-19-ecdc -d covid-19-ecdc.csv https://opendata.ecdc.europa.eu/covid19/casedistribution/csv --force
    - renku dataset add covid-19-chile -s data/*.csv  -s data/covid19_chile.rds https://github.com/itoledor/coronavirus.git --force
    - renku dataset add covidtracker https://ocgptweb.azurewebsites.net/CSVDownload -d covidtracker.csv --force
    - renku rerun data/covidtracking/states-metadata.json data/covidtracking/states-daily.json

--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ For each data source, we provide a simple summary notebook with interactive
 figures:

 * [Summary of global data from from JHU CSSE](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/Dashboard.run.ipynb)
+* [Global data from ECDC](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-ecdc.run.ipynb)
 * [U.S. state-level data from covidtracking.com](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covidtracking.run.ipynb)
 * [U.S. county-level data from the New York Times](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-us-nyt.run.ipynb)
 * [Regional data for Italy from italian Civil Protection](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-italy.run.ipynb)
@@ -115,6 +116,12 @@ useful helper and plotting functions that are used in the sample notebooks.
 <td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/Dashboard.run.ipynb">notebooks/Dashboard.ipynb</a></td>
 </tr>
 <tr>
+<td><a href="https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide">Covid-19 data collected by the ECDC</a></td>
+<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/datasets/78a35752-cc00-443d-8ed8-e37a82599099/">covid-19-ecdc</a></td>
+<td><code>data/covid-19-ecdc</code></td>
+<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-ecdc.run.ipynb">notebooks/covid-19-ecdc.ipynb</a></td>
+</tr>
+<tr>
 <td><a href="https://covidtracking.com/">covidtracking.com</a></td>
 <td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/datasets/c8bec148-5332-4602-9dc3-e39bbe92ed67/">covidtracking</a></td>
 <td><code>data/covidtracking</code></td>
@@ -173,6 +180,12 @@ CSSE)](https://github.com/CSSEGISandData/COVID-19). The
 [dashboard](covid-19-public-data/files/blob/runs/Dashboard.run.ipynb) summarizes
 this data in combination with population data from the world bank.

+### Covid-19 Data collected by the ECDC
+
+A global dataset collected by a team of epidemiologists at the [European Center
+for Disease Prevention and
+Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide).
+
 ### Covid tracking crowdsourcing project

 [Covid tracking](https://covidtracking.com) is a crowd-sourced dataset for US

--- a/notebooks/covid-19-ecdc.ipynb
+++ b/notebooks/covid-19-ecdc.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "\n",
+    "import altair as alt\n",
+    "import pandas as pd\n",
+    "\n",
+    "from IPython.display import display, HTML\n",
+    "\n",
+    "from covid_19_utils import helper, plotting\n",
+    "from covid_19_utils.converters import CaseConverter"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Covid-19 Global Data from ECDC\n",
+    "\n",
+    "This dataset is collected by the European Center for Disease Prevention and Control and can be found [here](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "html_credits=HTML('''\n",
+    "<p style=\"font-size: smaller\">Data Sources: \n",
+    "  <a href=\"https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide\">ECDC</a>\n",
+    "  <br>\n",
+    "Analysis and Visualization:\n",
+    "  <a href=\"https://renkulab.io/projects/covid-19/covid-19-public-data\">Covid-19 Public Data Collaboration Project @ renkulab.io</a>\n",
+    "</p>''')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "data_path = '../data/covid-19-ecdc'\n",
+    "atlas_path = '../data/atlas'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "converter = CaseConverter(atlas_path)\n",
+    "df = converter.read_convert(data_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nthresh=10000\n",
+    "country_max_ser = df.set_index(['country_label', 'date'])['positive'].groupby(level='country_label').max()\n",
+    "countries_over_thresh = country_max_ser[country_max_ser>nthresh].index\n",
+    "countries_over_thresh = [c for c in countries_over_thresh if c not in set(['Andorra', 'Iceland', 'San Marino'])]\n",
+    "\n",
+    "start_date = datetime.fromisoformat('2020-02-01')\n",
+    "thresh_df = df.loc[(df.date > start_date) & (df.country_label.isin(countries_over_thresh))]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Daily deaths globally\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The chart below shows the daily number of covid-19 related deaths reported worldwide since February 1st, 2020. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chart = alt.Chart(\n",
+    "    df.loc[df.date > start_date].groupby('date')['deceased_daily'].sum().reset_index()\n",
+    ").mark_line().encode(\n",
+    "    x=alt.X('date', title='Date'),\n",
+    "    y=alt.Y('deceased_daily', title='Daily deaths')\n",
+    ")\n",
+    "display(chart)\n",
+    "display(html_credits)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Daily deaths in countries with over 10,000 cases, ordered by total number of deaths. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "order = thresh_df.groupby(['country_label'])['deceased'].max().sort_values(ascending=False).index.tolist()\n",
+    "\n",
+    "base = alt.Chart(thresh_df)\n",
+    "chart = base.mark_line().encode(\n",
+    "    x=alt.X('date', title='Date'), \n",
+    "    y=alt.Y('deceased_daily', title='Daily deaths'),\n",
+    "    facet=alt.Facet('country_label', sort=alt.SortArray(order), columns=5, title='')\n",
+    ").properties(\n",
+    "    height=150,\n",
+    "    width=150\n",
+    ")\n",
+    "display(chart)\n",
+    "display(html_credits)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "file_extension": ".py",
+  "hide_input": true,
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  },
+  "mimetype": "text/x-python",
+  "name": "python",
+  "npconvert_exporter": "python",
+  "pygments_lexer": "ipython3",
+  "version": 3
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
+%% Cell type:code id: tags:
+
+``` python
+%load_ext autoreload
+%autoreload 2
+```
+
+%% Cell type:code id: tags:
+
+``` python
+from datetime import datetime
+
+import altair as alt
+import pandas as pd
+
+from IPython.display import display, HTML
+
+from covid_19_utils import helper, plotting
+from covid_19_utils.converters import CaseConverter
+```
+
+%% Cell type:markdown id: tags:
+
+# Covid-19 Global Data from ECDC
+
+This dataset is collected by the European Center for Disease Prevention and Control and can be found [here](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide).
+
+%% Cell type:code id: tags:
+
+``` python
+html_credits=HTML('''
+<p style="font-size: smaller">Data Sources:
+  <a href="https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide">ECDC</a>
+  <br>
+Analysis and Visualization:
+  <a href="https://renkulab.io/projects/covid-19/covid-19-public-data">Covid-19 Public Data Collaboration Project @ renkulab.io</a>
+</p>''')
+```
+
+%% Cell type:code id: tags:parameters
+
+``` python
+data_path = '../data/covid-19-ecdc'
+atlas_path = '../data/atlas'
+```
+
+%% Cell type:code id: tags:
+
+``` python
+converter = CaseConverter(atlas_path)
+df = converter.read_convert(data_path)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+nthresh=10000
+country_max_ser = df.set_index(['country_label', 'date'])['positive'].groupby(level='country_label').max()
+countries_over_thresh = country_max_ser[country_max_ser>nthresh].index
+countries_over_thresh = [c for c in countries_over_thresh if c not in set(['Andorra', 'Iceland', 'San Marino'])]
+
+start_date = datetime.fromisoformat('2020-02-01')
+thresh_df = df.loc[(df.date > start_date) & (df.country_label.isin(countries_over_thresh))]
+```
+
+%% Cell type:markdown id: tags:
+
+## Daily deaths globally
+
+%% Cell type:markdown id: tags:
+
+The chart below shows the daily number of covid-19 related deaths reported worldwide since February 1st, 2020.
+
+%% Cell type:code id: tags:
+
+``` python
+chart = alt.Chart(
+    df.loc[df.date > start_date].groupby('date')['deceased_daily'].sum().reset_index()
+).mark_line().encode(
+    x=alt.X('date', title='Date'),
+    y=alt.Y('deceased_daily', title='Daily deaths')
+)
+display(chart)
+display(html_credits)
+```
+
+%% Cell type:markdown id: tags:
+
+Daily deaths in countries with over 10,000 cases, ordered by total number of deaths.
+
+%% Cell type:code id: tags:
+
+``` python
+order = thresh_df.groupby(['country_label'])['deceased'].max().sort_values(ascending=False).index.tolist()
+
+base = alt.Chart(thresh_df)
+chart = base.mark_line().encode(
+    x=alt.X('date', title='Date'),
+    y=alt.Y('deceased_daily', title='Daily deaths'),
+    facet=alt.Facet('country_label', sort=alt.SortArray(order), columns=5, title='')
+).properties(
+    height=150,
+    width=150
+)
+display(chart)
+display(html_credits)
+```
+
+%% Cell type:code id: tags:
+
+``` python
+```
--- a/src/covid-19/covid_19_utils/covid_19_utils/__init__.py
+++ b/src/covid-19/covid_19_utils/covid_19_utils/__init__.py
@@ -4,4 +4,4 @@ __author__ = """Chandrasekhar Ramakrishnan"""
 __email__ = "cramakri@ethz.ch"
 __version__ = "0.1.0"

-from .converters import covidtracking, italy, jhu, nyt, spain, switzerland
+from .converters import covidtracking, ecdc, italy, jhu, nyt, spain, switzerland
--- a/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py
+++ b/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py
@@ -44,19 +44,21 @@ class CaseConverterImpl:
    column_list = []
    neg_column_list = []
    common_columns = [
-        "date",
-        "country",
-        "country_label",
-        "region_iso",
-        "region_label",
-        "admin2",
        "admin2_label",
-        "tested",
-        "positive",
+        "admin2",
+        "country_label",
+        "country",
+        "date",
+        "deceased_100k",
+        "deceased_daily",
        "deceased",
        "positive_100k",
-        "deceased_100k",
+        "positive_daily",
+        "positive",
+        "region_iso",
+        "region_label",
        "tested_100k",
+        "tested",
    ]

    def __init__(self, atlas_folder):
@@ -83,7 +85,7 @@ class CaseConverterImpl:
    def _set_common_columns(self, df):
        """Use only the common columns; add missing ones when needed."""
        try:
-            # TODO: Int32 (as opposed to int) is an intger type that allows nan
+            # TODO: Int32 (as opposed to int) is an integer type that allows nan
            df["population"] = df.population.astype(int)
        except ValueError:
            pass
@@ -103,6 +105,9 @@ class CaseConverterImpl:

        return df[self.common_columns]

+    # def _compute_daily_counts(self, df):
+    #     """
+
    @classmethod
    def _register(cls):
        _converter_registry[cls.__name__] = cls
--- a/src/covid-19/covid_19_utils/covid_19_utils/converters/ecdc.py
+++ b/src/covid-19/covid_19_utils/covid_19_utils/converters/ecdc.py
+"""
+Covid-19 converters for data from the European Center for Disease Prevention and Control.
+Obtained from https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide
+"""
+
+from pathlib import Path
+
+
+import pandas as pd
+
+from . import CaseConverterImpl as CaseConverter
+from .. import helper
+
+
+class ECDCCaseConverter(CaseConverter):
+    """
+    Converter for worldwide data from the ECDC.
+    """
+
+    conversion_dict = {
+        "dateRep": "date",
+        "cases": "positive_daily",
+        "deaths": "deceased_daily",
+        "countryterritoryCode": "country",
+        "countriesAndTerritories": "country_label",
+        "popData2018": "population",
+    }
+    column_list = list(conversion_dict.keys())
+
+    def convert(self, df):
+        # rename the existing columns
+        df_conv = df.rename(columns=ECDCCaseConverter.conversion_dict)
+
+        # convert date
+        df_conv["date"] = pd.to_datetime(df_conv["date"], dayfirst=True)
+
+        # compute totals
+        df_conv = (
+            df_conv.set_index(["country_label", "date"]).sort_index().reset_index()
+        )
+        df_conv['positive'] = df_conv.groupby('country_label')['positive_daily'].cumsum()
+        df_conv['deceased'] = df_conv.groupby('country_label')['deceased_daily'].cumsum()
+
+        # replace underscores with spaces
+        df_conv['country_label'] = df_conv.apply(lambda row: row['country_label'].replace('_', ' '), axis=1)
+        return self._set_common_columns(df_conv)
+
+    @classmethod
+    def read_data(cls, path):
+        """Read in the worldwide ECDC case data."""
+        return pd.read_csv(Path(path) / "covid-19-ecdc.csv")
+
+
+ECDCCaseConverter._register()