diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 17e787234d3a509266283bf4df074b4de7c313dd..5158957d523b6013e77717e9950608bad829b3e6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -66,6 +66,7 @@ update: - renku dataset update covid-19-italy - renku dataset update covid-19-spain - renku dataset update covid-19-us-nyt + - renku dataset add covid-19-ecdc -d covid-19-ecdc.csv https://opendata.ecdc.europa.eu/covid19/casedistribution/csv --force - renku dataset add covid-19-chile -s data/*.csv -s data/covid19_chile.rds https://github.com/itoledor/coronavirus.git --force - renku dataset add covidtracker https://ocgptweb.azurewebsites.net/CSVDownload -d covidtracker.csv --force - renku rerun data/covidtracking/states-metadata.json data/covidtracking/states-daily.json diff --git a/README.md b/README.md index 18760b96c9be5958fd09e1551a35ab9e54e2bb4f..712c0e1e783ccdc741f21616cf2d3f1c65c2b431 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ For each data source, we provide a simple summary notebook with interactive figures: * [Summary of global data from from JHU CSSE](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/Dashboard.run.ipynb) +* [Global data from ECDC](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-ecdc.run.ipynb) * [U.S. state-level data from covidtracking.com](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covidtracking.run.ipynb) * [U.S. county-level data from the New York Times](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-us-nyt.run.ipynb) * [Regional data for Italy from italian Civil Protection](https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-italy.run.ipynb) @@ -115,6 +116,12 @@ useful helper and plotting functions that are used in the sample notebooks. 
<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/Dashboard.run.ipynb">notebooks/Dashboard.ipynb</a></td> </tr> <tr> +<td><a href="https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide">Covid-19 data collected by the ECDC</a></td> +<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/datasets/78a35752-cc00-443d-8ed8-e37a82599099/">covid-19-ecdc</a></td> +<td><code>data/covid-19-ecdc</code></td> +<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/covid-19-ecdc.run.ipynb">notebooks/covid-19-ecdc.ipynb</a></td> +</tr> +<tr> <td><a href="https://covidtracking.com/">covidtracking.com</a></td> <td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/datasets/c8bec148-5332-4602-9dc3-e39bbe92ed67/">covidtracking</a></td> <td><code>data/covidtracking</code></td> @@ -173,6 +180,12 @@ CSSE)](https://github.com/CSSEGISandData/COVID-19). The [dashboard](covid-19-public-data/files/blob/runs/Dashboard.run.ipynb) summarizes this data in combination with population data from the world bank. +### Covid-19 Data collected by the ECDC + +A global dataset collected by a team of epidemiologists at the [European Center +for Disease Prevention and +Control](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide). 
+ ### Covid tracking crowdsourcing project [Covid tracking](https://covidtracking.com) is a crowd-sourced dataset for US diff --git a/notebooks/covid-19-ecdc.ipynb b/notebooks/covid-19-ecdc.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..0bbdfeccd39f5fa7c13777b5508805439461ce29 --- /dev/null +++ b/notebooks/covid-19-ecdc.ipynb @@ -0,0 +1,187 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "\n", + "import altair as alt\n", + "import pandas as pd\n", + "\n", + "from IPython.display import display, HTML\n", + "\n", + "from covid_19_utils import helper, plotting\n", + "from covid_19_utils.converters import CaseConverter" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Covid-19 Global Data from ECDC\n", + "\n", + "This dataset is collected by the European Center for Disease Prevention and Control and can be found [here](https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "html_credits=HTML('''\n", + "<p style=\"font-size: smaller\">Data Sources: \n", + " <a href=\"https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide\">ECDC</a>\n", + " <br>\n", + "Analysis and Visualization:\n", + " <a href=\"https://renkulab.io/projects/covid-19/covid-19-public-data\">Covid-19 Public Data Collaboration Project @ renkulab.io</a>\n", + "</p>''')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "data_path = 
'../data/covid-19-ecdc'\n", + "atlas_path = '../data/atlas'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "converter = CaseConverter(atlas_path)\n", + "df = converter.read_convert(data_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "nthresh=10000\n", + "country_max_ser = df.set_index(['country_label', 'date'])['positive'].groupby(level='country_label').max()\n", + "countries_over_thresh = country_max_ser[country_max_ser>nthresh].index\n", + "countries_over_thresh = [c for c in countries_over_thresh if c not in set(['Andorra', 'Iceland', 'San Marino'])]\n", + "\n", + "start_date = datetime.fromisoformat('2020-02-01')\n", + "thresh_df = df.loc[(df.date > start_date) & (df.country_label.isin(countries_over_thresh))]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Daily deaths globally\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The chart below shows the total number of covid-19 related deaths reported worldwide since February 1st, 2020. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "chart = alt.Chart(\n", + " df.loc[df.date > start_date].groupby('date')['deceased_daily'].sum().reset_index()\n", + ").mark_line().encode(\n", + " x=alt.X('date', title='Date'),\n", + " y=alt.Y('deceased_daily', title='Daily deaths')\n", + ")\n", + "display(chart)\n", + "display(html_credits)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Deaths in countries with over 10,000 cases, ordered by total number of deaths. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "order = thresh_df.groupby(['country_label'])['deceased'].max().sort_values(ascending=False).index.tolist()\n", + "\n", + "base = alt.Chart(thresh_df)\n", + "chart = base.mark_line().encode(\n", + " x=alt.X('date', title='Date'), \n", + " y=alt.Y('deceased_daily', title='Daily deaths'),\n", + " facet=alt.Facet('country_label', sort=alt.SortArray(order), columns=5, title='')\n", + ").properties(\n", + " height=150,\n", + " width=150\n", + ")\n", + "display(chart)\n", + "display(html_credits)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "file_extension": ".py", + "hide_input": true, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "mimetype": "text/x-python", + "name": "python", + "npconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/src/covid-19/covid_19_utils/covid_19_utils/__init__.py b/src/covid-19/covid_19_utils/covid_19_utils/__init__.py index 47beb193cb6e2f97e93682118b6d60f7ed155753..6860bbbe2f3813cb58aca2df35851f5ded050439 100644 --- a/src/covid-19/covid_19_utils/covid_19_utils/__init__.py +++ b/src/covid-19/covid_19_utils/covid_19_utils/__init__.py @@ -4,4 +4,4 @@ __author__ = """Chandrasekhar Ramakrishnan""" __email__ = "cramakri@ethz.ch" __version__ = "0.1.0" -from .converters import covidtracking, italy, jhu, nyt, spain, switzerland +from .converters import covidtracking, ecdc, italy, jhu, nyt, spain, switzerland diff --git 
a/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py b/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py index c39827e7b25fb5dece4ed062c67b68ce327360fb..ef0d0e55b06681f93b407791d6f98f9d4e0a90cf 100644 --- a/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py +++ b/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py @@ -44,19 +44,21 @@ class CaseConverterImpl: column_list = [] neg_column_list = [] common_columns = [ - "date", - "country", - "country_label", - "region_iso", - "region_label", - "admin2", "admin2_label", - "tested", - "positive", + "admin2", + "country_label", + "country", + "date", + "deceased_100k", + "deceased_daily", "deceased", "positive_100k", - "deceased_100k", + "positive_daily", + "positive", + "region_iso", + "region_label", "tested_100k", + "tested", ] def __init__(self, atlas_folder): @@ -83,7 +85,7 @@ class CaseConverterImpl: def _set_common_columns(self, df): """Use only the common columns; add missing ones when needed.""" try: - # TODO: Int32 (as opposed to int) is an intger type that allows nan + # TODO: Int32 (as opposed to int) is an integer type that allows nan df["population"] = df.population.astype(int) except ValueError: pass @@ -103,6 +105,9 @@ class CaseConverterImpl: return df[self.common_columns] + # def _compute_daily_counts(self, df): + # """ + @classmethod def _register(cls): _converter_registry[cls.__name__] = cls diff --git a/src/covid-19/covid_19_utils/covid_19_utils/converters/ecdc.py b/src/covid-19/covid_19_utils/covid_19_utils/converters/ecdc.py new file mode 100644 index 0000000000000000000000000000000000000000..4a0f6367acb0ca5ea560dc7f9e44fa9d7c993496 --- /dev/null +++ b/src/covid-19/covid_19_utils/covid_19_utils/converters/ecdc.py @@ -0,0 +1,54 @@ +""" +Covid-19 converters for data from the European Center for Disease Prevention and Control. 
+Obtained from https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide
+"""
+
+from pathlib import Path
+
+import pandas as pd
+
+from . import CaseConverterImpl as CaseConverter
+from .. import helper
+
+
+class ECDCCaseConverter(CaseConverter):
+    """
+    Converter for worldwide data from the ECDC.
+    """
+
+    # maps the ECDC CSV column names to the column names used after conversion
+    conversion_dict = {
+        "dateRep": "date",
+        "cases": "positive_daily",
+        "deaths": "deceased_daily",
+        "countryterritoryCode": "country",
+        "countriesAndTerritories": "country_label",
+        "popData2018": "population",
+    }
+    column_list = list(conversion_dict.keys())
+
+    def convert(self, df):
+        """Rename the ECDC columns, add cumulative totals, keep the common columns."""
+        df_conv = df.rename(columns=self.conversion_dict)
+
+        # dates are parsed day-first
+        df_conv["date"] = pd.to_datetime(df_conv["date"], dayfirst=True)
+
+        # sort by country and date so the cumulative sums run chronologically
+        df_conv = (
+            df_conv.set_index(["country_label", "date"]).sort_index().reset_index()
+        )
+        df_conv["positive"] = df_conv.groupby("country_label")["positive_daily"].cumsum()
+        df_conv["deceased"] = df_conv.groupby("country_label")["deceased_daily"].cumsum()
+
+        # replace underscores with spaces (vectorized; missing labels stay missing)
+        df_conv["country_label"] = df_conv["country_label"].str.replace("_", " ", regex=False)
+        return self._set_common_columns(df_conv)
+
+    @classmethod
+    def read_data(cls, path):
+        """Read covid-19-ecdc.csv from the folder ``path`` into a DataFrame."""
+        return pd.read_csv(Path(path) / "covid-19-ecdc.csv")
+
+
+ECDCCaseConverter._register()