From 3c1447296fbaabcc53fd419f41e68d94d83cdd64 Mon Sep 17 00:00:00 2001 From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch> Date: Thu, 12 Mar 2020 23:09:08 +0000 Subject: [PATCH] feat: notebook to convert values to rates --- .../{Play.ipynb => Preprocessing-Play.ipynb} | 244 +++++++++++++++++- notebooks/ToRates.ipynb | 220 ++++++++++++++++ 2 files changed, 451 insertions(+), 13 deletions(-) rename notebooks/{Play.ipynb => Preprocessing-Play.ipynb} (69%) create mode 100644 notebooks/ToRates.ipynb diff --git a/notebooks/Play.ipynb b/notebooks/Preprocessing-Play.ipynb similarity index 69% rename from notebooks/Play.ipynb rename to notebooks/Preprocessing-Play.ipynb index 5ab6e9cc..71d7c4c0 100644 --- a/notebooks/Play.ipynb +++ b/notebooks/Preprocessing-Play.ipynb @@ -34,7 +34,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -47,7 +47,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -61,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -260,7 +260,7 @@ "Australia 107 3 21" ] }, - "execution_count": 23, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -279,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -288,9 +288,16 @@ "pop_df = pd.read_csv(zf.open(\"API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv\"), skiprows=4)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is 2018 pop data for all countries/regions except Eritrea" + ] + }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -407,13 +414,12 @@ "[2 rows x 65 columns]" ] }, - "execution_count": 53, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# There is 2018 pop data 
for all countries/regions except Eritrea\n", "pop_df[pd.isna(pop_df['2018'])]" ] }, @@ -421,12 +427,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Fix the country/region names that differ" + "Fix the country/region names that differ between the World Bank population data and the JHU CSSE data." ] }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -445,7 +451,7 @@ " 'St. Martin (French part)': 'Saint Martin',\n", " 'Brunei Darussalam': 'Brunei'\n", "}\n", - "current_pop_ser = pop_df[['Country Name', '2018']].copy().replace(region_wb_jhu_map).set_index('Country Name')\n", + "current_pop_ser = pop_df[['Country Name', '2018']].copy().replace(region_wb_jhu_map).set_index('Country Name')['2018']\n", "data_pop_ser = current_pop_ser[current_pop_ser.index.isin(current_totals_df.index)]" ] }, @@ -458,7 +464,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -545,7 +551,7 @@ "Saint Barthelemy 1 0 0" ] }, - "execution_count": 112, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -554,6 +560,218 @@ "current_totals_df[current_totals_df.index.isin(data_pop_ser.index) == False]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Compute rates per 100,000 for regions with more than 100 cases" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>confirmed</th>\n", + " <th>deaths</th>\n", + " 
<th>recovered</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>Italy</th>\n", + " <td>16.794282</td>\n", + " <td>1.044161</td>\n", + " <td>1.198055</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Republic of Korea</th>\n", + " <td>14.550136</td>\n", + " <td>0.104580</td>\n", + " <td>0.478355</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Iran (Islamic Republic of)</th>\n", + " <td>9.831264</td>\n", + " <td>0.355745</td>\n", + " <td>3.338620</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Norway</th>\n", + " <td>7.526810</td>\n", + " <td>0.000000</td>\n", + " <td>0.018817</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Bahrain</th>\n", + " <td>7.008874</td>\n", + " <td>0.000000</td>\n", + " <td>1.401775</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Mainland China</th>\n", + " <td>5.798468</td>\n", + " <td>0.225169</td>\n", + " <td>4.315697</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Switzerland</th>\n", + " <td>5.765250</td>\n", + " <td>0.035226</td>\n", + " <td>0.035226</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Denmark</th>\n", + " <td>4.519231</td>\n", + " <td>0.000000</td>\n", + " <td>0.017249</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Spain</th>\n", + " <td>3.627705</td>\n", + " <td>0.074908</td>\n", + " <td>0.068488</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Sweden</th>\n", + " <td>3.486143</td>\n", + " <td>0.000000</td>\n", + " <td>0.009820</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Singapore</th>\n", + " <td>2.837546</td>\n", + " <td>0.000000</td>\n", + " <td>1.383303</td>\n", + " </tr>\n", + " <tr>\n", + " <th>France</th>\n", + " <td>2.663194</td>\n", + " <td>0.049263</td>\n", + " <td>0.017914</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Belgium</th>\n", + " <td>2.337580</td>\n", + " <td>0.000000</td>\n", + " <td>0.008755</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Netherlands</th>\n", + " <td>2.216932</td>\n", + " <td>0.023214</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Austria</th>\n", + " <td>2.057186</td>\n", + " 
<td>0.000000</td>\n", + " <td>0.045213</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Germany</th>\n", + " <td>1.756947</td>\n", + " <td>0.002412</td>\n", + " <td>0.021706</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Hong Kong SAR</th>\n", + " <td>1.610522</td>\n", + " <td>0.040263</td>\n", + " <td>0.872366</td>\n", + " </tr>\n", + " <tr>\n", + " <th>UK</th>\n", + " <td>0.574531</td>\n", + " <td>0.009024</td>\n", + " <td>0.027072</td>\n", + " </tr>\n", + " <tr>\n", + " <th>US</th>\n", + " <td>0.510442</td>\n", + " <td>0.017117</td>\n", + " <td>0.004585</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Japan</th>\n", + " <td>0.459183</td>\n", + " <td>0.007903</td>\n", + " <td>0.079824</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Australia</th>\n", + " <td>0.428131</td>\n", + " <td>0.012004</td>\n", + " <td>0.084026</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Malaysia</th>\n", + " <td>0.409153</td>\n", + " <td>0.000000</td>\n", + " <td>0.076121</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " confirmed deaths recovered\n", + "Italy 16.794282 1.044161 1.198055\n", + "Republic of Korea 14.550136 0.104580 0.478355\n", + "Iran (Islamic Republic of) 9.831264 0.355745 3.338620\n", + "Norway 7.526810 0.000000 0.018817\n", + "Bahrain 7.008874 0.000000 1.401775\n", + "Mainland China 5.798468 0.225169 4.315697\n", + "Switzerland 5.765250 0.035226 0.035226\n", + "Denmark 4.519231 0.000000 0.017249\n", + "Spain 3.627705 0.074908 0.068488\n", + "Sweden 3.486143 0.000000 0.009820\n", + "Singapore 2.837546 0.000000 1.383303\n", + "France 2.663194 0.049263 0.017914\n", + "Belgium 2.337580 0.000000 0.008755\n", + "Netherlands 2.216932 0.023214 0.000000\n", + "Austria 2.057186 0.000000 0.045213\n", + "Germany 1.756947 0.002412 0.021706\n", + "Hong Kong SAR 1.610522 0.040263 0.872366\n", + "UK 0.574531 0.009024 0.027072\n", + "US 0.510442 0.017117 0.004585\n", + "Japan 0.459183 0.007903 0.079824\n", + "Australia 0.428131 0.012004 0.084026\n", + 
"Malaysia 0.409153 0.000000 0.076121" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "current_per_100000_df = current_totals_df[current_totals_df['confirmed'] > 100]\n", + "current_per_100000_df = current_per_100000_df.div(data_pop_ser, 'index').mul(100000).dropna()\n", + "current_per_100000_df.sort_values('confirmed', ascending=False)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/notebooks/ToRates.ipynb b/notebooks/ToRates.ipynb new file mode 100644 index 00000000..6a200ae1 --- /dev/null +++ b/notebooks/ToRates.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convert Series to Rates per 100,000" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "ts_folder = \"../data/covid-19_jhu-csse/\"\n", + "wb_path = \"../data/worldbank/SP.POP.TOTL.zip\"\n", + "out_folder = None\n", + "PAPERMILL_OUTPUT_PATH = None" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [ + "parameters" + ] + }, + "source": [ + "## Read in JHU CSSE data\n", + "\n", + "I will switch to [xarray](http://xarray.pydata.org/en/stable/), but ATM, it's easier like this..." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def read_jhu_covid_region_df(name):\n", + " filename = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n", + " df = pd.read_csv(filename)\n", + " df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n", + " df.columns = pd.to_datetime(df.columns)\n", + " region_df = df.groupby(level='Country/Region').sum()\n", + " loc_df = df.reset_index([2,3]).groupby(level='Country/Region').mean()[['Long', 'Lat']]\n", + " return region_df.join(loc_df).set_index(['Long', 'Lat'], append=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "frames_map = {\n", + " \"confirmed\": read_jhu_covid_region_df(\"Confirmed\"),\n", + " \"deaths\": read_jhu_covid_region_df(\"Deaths\"),\n", + " \"recovered\": read_jhu_covid_region_df(\"Recovered\")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "frames_map['confirmed'].sort_values(frames_map['confirmed'].columns[-1], ascending=False).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Read in World Bank data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import zipfile\n", + "zf = zipfile.ZipFile(wb_path)\n", + "pop_df = pd.read_csv(zf.open(\"API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv\"), skiprows=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is 2018 population data for all countries/regions except Eritrea." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pop_df[pd.isna(pop_df['2018'])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Fix the country/region names that differ between the World Bank population data and the JHU CSSE data."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "region_wb_jhu_map = {\n", + " 'China': 'Mainland China',\n", + " 'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',\n", + " 'Korea, Rep.': 'Republic of Korea',\n", + " 'United States': 'US',\n", + " 'United Kingdom': 'UK',\n", + " 'Hong Kong SAR, China': 'Hong Kong SAR',\n", + " 'Egypt, Arab Rep.': 'Egypt',\n", + " 'Vietnam': 'Viet Nam',\n", + " 'Macao SAR, China': 'Macao SAR',\n", + " 'Slovak Republic': 'Slovakia',\n", + " 'Moldova': 'Republic of Moldova',\n", + " 'St. Martin (French part)': 'Saint Martin',\n", + " 'Brunei Darussalam': 'Brunei'\n", + "}\n", + "current_pop_ser = pop_df[['Country Name', '2018']].copy().replace(region_wb_jhu_map).set_index('Country Name')['2018']\n", + "data_pop_ser = current_pop_ser[current_pop_ser.index.isin(frames_map['confirmed'].index.levels[0])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Some JHU CSSE regions still have no matching World Bank population entry after this renaming; they are dropped when the rates are computed."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Compute rates per 100,000 for regions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def cases_to_rates_df(df):\n", + " per_100000_df = df.reset_index([1, 2], drop=True)\n", + " per_100000_df = per_100000_df.div(data_pop_ser, 'index').mul(100000).dropna()\n", + " per_100000_df.index.name = 'Country/Region'\n", + " return per_100000_df\n", + " \n", + "def frames_to_rates(frames_map):\n", + " return {k: cases_to_rates_df(v) for k,v in frames_map.items()}\n", + "\n", + "\n", + "rates_map = frames_to_rates(frames_map)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "if PAPERMILL_OUTPUT_PATH:\n", + " for k, v in rates_map.items():\n", + " out_path = os.path.join(out_folder, f\"ts_rates_19-covid-{k}.csv\")\n", + " v.reset_index().to_csv(out_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} -- GitLab