From 65256793b4997ad991b69121412b6bec219dd559 Mon Sep 17 00:00:00 2001 From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch> Date: Sat, 4 Apr 2020 01:45:33 +0200 Subject: [PATCH] feat: add converter for JHU/CSSE global data --- notebooks/process/standardize_datasets.ipynb | 395 +++++++++++++++++- .../covid_19_dashboard/converters/__init__.py | 7 +- .../covid_19_dashboard/converters/jhu.py | 90 ++++ 3 files changed, 477 insertions(+), 15 deletions(-) create mode 100644 src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py diff --git a/notebooks/process/standardize_datasets.ipynb b/notebooks/process/standardize_datasets.ipynb index 9f5e326aa..1012b2605 100644 --- a/notebooks/process/standardize_datasets.ipynb +++ b/notebooks/process/standardize_datasets.ipynb @@ -2,9 +2,18 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 53, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -23,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -36,12 +45,13 @@ "from covid_19_dashboard.converters import CaseConverter\n", "from covid_19_dashboard.converters.switzerland import OpenZHCaseConverter\n", "from covid_19_dashboard.converters.covidtracking import CovidtrackingCaseConverter\n", - "from covid_19_dashboard.converters.spain import SpainCaseConverter" + "from covid_19_dashboard.converters.spain import SpainCaseConverter\n", + "from covid_19_dashboard.converters.jhu import JhuCsseGlobalCaseConverter" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -50,16 +60,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 85, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using <covid_19_dashboard.converters.switzerland.OpenZHCaseConverter object at 0x11d95fd50> for ../../data/openzh-covid-19\n", + "Using <covid_19_dashboard.converters.italy.ItalyCaseConverter object at 0x11d95fd90> for ../../data/covid-19-italy\n", + "Using <covid_19_dashboard.converters.covidtracking.CovidtrackingCaseConverter object at 0x11d95fcd0> for ../../data/covidtracking/\n", + "Using <covid_19_dashboard.converters.spain.SpainCaseConverter object at 0x11d95fd10> for ../../data/covid-19-spain\n", + "Using <covid_19_dashboard.converters.jhu.JhuCsseGlobalCaseConverter object at 0x11d95fdd0> for ../../data/covid-19_jhu-csse/\n" + ] + } + ], "source": [ "df_list = []\n", "for path in [\n", " '../../data/openzh-covid-19', \n", " '../../data/covid-19-italy', \n", " '../../data/covidtracking/', \n", - " '../../data/covid-19-spain'\n", + " '../../data/covid-19-spain',\n", + " \"../../data/covid-19_jhu-csse/\"\n", "]:\n", " 
df_list.append(converter.read_convert(path))\n", "df_all = pd.concat(df_list).reset_index(drop=True)\n", @@ -68,7 +91,140 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>date</th>\n", + " <th>country</th>\n", + " <th>region_iso</th>\n", + " <th>region_label</th>\n", + " <th>tested</th>\n", + " <th>positive</th>\n", + " <th>deceased</th>\n", + " <th>population</th>\n", + " <th>positive_100k</th>\n", + " <th>deceased_100k</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>2020-02-28</td>\n", + " <td>CHE</td>\n", + " <td>CH-AG</td>\n", + " <td>Aargau</td>\n", + " <td>NaN</td>\n", + " <td>1.0</td>\n", + " <td>NaN</td>\n", + " <td>678207.0</td>\n", + " <td>0.147448</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2020-03-02</td>\n", + " <td>CHE</td>\n", + " <td>CH-AG</td>\n", + " <td>Aargau</td>\n", + " <td>NaN</td>\n", + " <td>2.0</td>\n", + " <td>NaN</td>\n", + " <td>678207.0</td>\n", + " <td>0.294895</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2020-03-03</td>\n", + " <td>CHE</td>\n", + " <td>CH-AG</td>\n", + " <td>Aargau</td>\n", + " <td>NaN</td>\n", + " <td>6.0</td>\n", + " <td>NaN</td>\n", + " <td>678207.0</td>\n", + " <td>0.884686</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2020-03-04</td>\n", + " <td>CHE</td>\n", + " <td>CH-AG</td>\n", + " <td>Aargau</td>\n", + " <td>NaN</td>\n", + " 
<td>7.0</td>\n", + " <td>NaN</td>\n", + " <td>678207.0</td>\n", + " <td>1.032133</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>2020-03-05</td>\n", + " <td>CHE</td>\n", + " <td>CH-AG</td>\n", + " <td>Aargau</td>\n", + " <td>NaN</td>\n", + " <td>9.0</td>\n", + " <td>NaN</td>\n", + " <td>678207.0</td>\n", + " <td>1.327028</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " date country region_iso region_label tested positive deceased \\\n", + "0 2020-02-28 CHE CH-AG Aargau NaN 1.0 NaN \n", + "1 2020-03-02 CHE CH-AG Aargau NaN 2.0 NaN \n", + "2 2020-03-03 CHE CH-AG Aargau NaN 6.0 NaN \n", + "3 2020-03-04 CHE CH-AG Aargau NaN 7.0 NaN \n", + "4 2020-03-05 CHE CH-AG Aargau NaN 9.0 NaN \n", + "\n", + " population positive_100k deceased_100k \n", + "0 678207.0 0.147448 NaN \n", + "1 678207.0 0.294895 NaN \n", + "2 678207.0 0.884686 NaN \n", + "3 678207.0 1.032133 NaN \n", + "4 678207.0 1.327028 NaN " + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_all.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -77,9 +233,226 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>date</th>\n", + " <th>country</th>\n", + " <th>region_iso</th>\n", + " <th>region_label</th>\n", + " <th>tested</th>\n", + " <th>positive</th>\n", + " 
<th>deceased</th>\n", + " <th>population</th>\n", + " <th>positive_100k</th>\n", + " <th>deceased_100k</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>2020-02-27</td>\n", + " <td>ESP</td>\n", + " <td>ES-AN</td>\n", + " <td>AndalucÃa</td>\n", + " <td>None</td>\n", + " <td>1</td>\n", + " <td>NaN</td>\n", + " <td>8409738</td>\n", + " <td>0.011891</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>2020-02-28</td>\n", + " <td>ESP</td>\n", + " <td>ES-AN</td>\n", + " <td>AndalucÃa</td>\n", + " <td>None</td>\n", + " <td>6</td>\n", + " <td>NaN</td>\n", + " <td>8409738</td>\n", + " <td>0.071346</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2020-02-29</td>\n", + " <td>ESP</td>\n", + " <td>ES-AN</td>\n", + " <td>AndalucÃa</td>\n", + " <td>None</td>\n", + " <td>8</td>\n", + " <td>NaN</td>\n", + " <td>8409738</td>\n", + " <td>0.095128</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>2020-03-01</td>\n", + " <td>ESP</td>\n", + " <td>ES-AN</td>\n", + " <td>AndalucÃa</td>\n", + " <td>None</td>\n", + " <td>12</td>\n", + " <td>NaN</td>\n", + " <td>8409738</td>\n", + " <td>0.142692</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>2020-03-02</td>\n", + " <td>ESP</td>\n", + " <td>ES-AN</td>\n", + " <td>AndalucÃa</td>\n", + " <td>None</td>\n", + " <td>12</td>\n", + " <td>NaN</td>\n", + " <td>8409738</td>\n", + " <td>0.142692</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>622</th>\n", + " <td>2020-03-26</td>\n", + " <td>ESP</td>\n", + " <td>ES-RI</td>\n", + " <td>La Rioja</td>\n", + " <td>None</td>\n", + " <td>995</td>\n", + " 
<td>43.0</td>\n", + " <td>315675</td>\n", + " <td>315.197592</td>\n", + " <td>13.621604</td>\n", + " </tr>\n", + " <tr>\n", + " <th>623</th>\n", + " <td>2020-03-27</td>\n", + " <td>ESP</td>\n", + " <td>ES-RI</td>\n", + " <td>La Rioja</td>\n", + " <td>None</td>\n", + " <td>1236</td>\n", + " <td>55.0</td>\n", + " <td>315675</td>\n", + " <td>391.541934</td>\n", + " <td>17.422982</td>\n", + " </tr>\n", + " <tr>\n", + " <th>624</th>\n", + " <td>2020-03-28</td>\n", + " <td>ESP</td>\n", + " <td>ES-RI</td>\n", + " <td>La Rioja</td>\n", + " <td>None</td>\n", + " <td>1436</td>\n", + " <td>65.0</td>\n", + " <td>315675</td>\n", + " <td>454.898234</td>\n", + " <td>20.590797</td>\n", + " </tr>\n", + " <tr>\n", + " <th>625</th>\n", + " <td>2020-03-29</td>\n", + " <td>ESP</td>\n", + " <td>ES-RI</td>\n", + " <td>La Rioja</td>\n", + " <td>None</td>\n", + " <td>1629</td>\n", + " <td>68.0</td>\n", + " <td>315675</td>\n", + " <td>516.037063</td>\n", + " <td>21.541142</td>\n", + " </tr>\n", + " <tr>\n", + " <th>626</th>\n", + " <td>2020-03-30</td>\n", + " <td>ESP</td>\n", + " <td>ES-RI</td>\n", + " <td>La Rioja</td>\n", + " <td>None</td>\n", + " <td>1733</td>\n", + " <td>71.0</td>\n", + " <td>315675</td>\n", + " <td>548.982339</td>\n", + " <td>22.491486</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>627 rows × 10 columns</p>\n", + "</div>" + ], + "text/plain": [ + " date country region_iso region_label tested positive deceased \\\n", + "0 2020-02-27 ESP ES-AN AndalucÃa None 1 NaN \n", + "1 2020-02-28 ESP ES-AN AndalucÃa None 6 NaN \n", + "2 2020-02-29 ESP ES-AN AndalucÃa None 8 NaN \n", + "3 2020-03-01 ESP ES-AN AndalucÃa None 12 NaN \n", + "4 2020-03-02 ESP ES-AN AndalucÃa None 12 NaN \n", + ".. ... ... ... ... ... ... ... 
\n", + "622 2020-03-26 ESP ES-RI La Rioja None 995 43.0 \n", + "623 2020-03-27 ESP ES-RI La Rioja None 1236 55.0 \n", + "624 2020-03-28 ESP ES-RI La Rioja None 1436 65.0 \n", + "625 2020-03-29 ESP ES-RI La Rioja None 1629 68.0 \n", + "626 2020-03-30 ESP ES-RI La Rioja None 1733 71.0 \n", + "\n", + " population positive_100k deceased_100k \n", + "0 8409738 0.011891 NaN \n", + "1 8409738 0.071346 NaN \n", + "2 8409738 0.095128 NaN \n", + "3 8409738 0.142692 NaN \n", + "4 8409738 0.142692 NaN \n", + ".. ... ... ... \n", + "622 315675 315.197592 13.621604 \n", + "623 315675 391.541934 17.422982 \n", + "624 315675 454.898234 20.590797 \n", + "625 315675 516.037063 21.541142 \n", + "626 315675 548.982339 22.491486 \n", + "\n", + "[627 rows x 10 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "\n", "SpainCaseConverter.convert(df_esp)" diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py index 7a8e3e037..2ba8c6510 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py @@ -71,14 +71,13 @@ class CaseConverterImpl: """Initialze the converter with the path to the atlas""" self.atlas_folder = atlas_folder - @classmethod - def can_convert(cls, path): + def can_convert(self, path): """Returns true if the class can convert the Dataframe.""" try: - df = cls.read_data(path) + df = self.read_data(path) except (FileNotFoundError, ValueError): return False - return all([col in df.columns for col in cls.column_list]) + return all([col in df.columns for col in self.__class__.column_list]) @classmethod def read_data(cls, path): diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py new file mode 100644 
index 000000000..45eed0201
--- /dev/null
+++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py
@@ -0,0 +1,105 @@
+"""
+Converters for covid-19 data from JHU-CSSE.
+"""
+
+import zipfile
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+from . import CaseConverterImpl
+from .. import helper
+
+# JHU/CSSE country names whose spelling differs from the World Bank data.
+_REGION_JHU_WB_MAP = {
+    'Brunei': 'Brunei Darussalam',
+    'Czechia': 'Czech Republic',
+    'Egypt': 'Egypt, Arab Rep.',
+    'Hong Kong SAR': 'Hong Kong SAR, China',
+    'Iran': 'Iran, Islamic Rep.',
+    'Korea, South': 'Korea, Rep.',
+    'Macao SAR': 'Macao SAR, China',
+    'Russia': 'Russian Federation',
+    'Slovakia': 'Slovak Republic',
+    'Saint Martin': 'St. Martin (French part)',
+    'US': 'United States',
+}
+
+
+class JhuCsseGlobalCaseConverter(CaseConverterImpl):
+    """
+    Converter for global covid-19 case data from JHU/CSSE
+    and made available at https://github.com/CSSEGISandData/COVID-19
+    """
+
+    def convert(self, df):
+        """Aggregate the case counts per country and add population data.
+
+        ``df`` is expected in the layout produced by ``read_data``: one row
+        per province, country and date with ``positive`` and ``deceased``
+        columns. Countries that cannot be matched to a World Bank country
+        code are dropped from the result.
+        """
+        # combine subregions at the level of country; only the case counts
+        # are summed so the textual Province/State column is never touched
+        counts = df.groupby(['date', 'Country/Region'])[
+            ['positive', 'deceased']].sum()
+        df = counts.reset_index().rename(
+            {"Country/Region": "region_label"}, axis=1)
+
+        # get population data from worldbank; the context managers close
+        # the archive again once the table has been read
+        zip_path = Path(self.atlas_folder) / "worldbank" / "SP.POP.TOTL.zip"
+        with zipfile.ZipFile(zip_path) as zf:
+            with zf.open("API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv") as f:
+                pop_df = pd.read_csv(f, skiprows=4)
+
+        # Fix the names of countries
+        df['region_label'] = df['region_label'].replace(_REGION_JHU_WB_MAP)
+        country_code_map = dict(
+            zip(pop_df['Country Name'], pop_df['Country Code']))
+        # labels without a World Bank entry become NaN and are dropped below
+        df['country'] = df['region_label'].map(country_code_map)
+        pop_ser = pop_df.set_index('Country Code')['2018']
+        merged = df.loc[df['country'].isin(pop_ser.index)].copy()
+
+        # TODO Also consult worldmap data for countries like Taiwan
+        merged['population'] = merged['country'].map(pop_ser)
+
+        # calculate incidence rates
+        merged["positive_100k"] = (
+            merged["positive"] / merged["population"] * 100000)
+        merged["deceased_100k"] = (
+            merged["deceased"] / merged["population"] * 100000)
+        merged['region_iso'] = merged['country']
+        merged['tested'] = np.nan
+
+        return merged[CaseConverterImpl.common_columns]
+
+    def read_ser(self, path, name):
+        """Read in the path and return as a column named name"""
+        df = pd.read_csv(path)
+        df = df.drop(['Lat', 'Long'], axis=1)
+        df = df.set_index(['Province/State', 'Country/Region'])
+        # the remaining column headers are the dates of the time series
+        df.columns = pd.to_datetime(df.columns)
+        # stack turns the date columns into a third, unnamed index level
+        ser = df.stack()
+        ser.name = name
+        return ser
+
+    def read_data(self, folder):
+        """Read in the data."""
+        paths = [('time_series_covid19_confirmed_global.csv', 'positive'),
+                 ('time_series_covid19_deaths_global.csv', 'deceased')]
+        sers = [self.read_ser(Path(folder) / p, name) for p, name in paths]
+        df = pd.concat(sers, axis=1)
+        # "level_2" is the unnamed date level created by stack in read_ser
+        df = df.reset_index().rename({"level_2": "date"}, axis=1)
+        return df
+
+
+# TODO Implement converter for the daily reports
+
+JhuCsseGlobalCaseConverter._register()
-- 
GitLab