From 3c1447296fbaabcc53fd419f41e68d94d83cdd64 Mon Sep 17 00:00:00 2001
From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch>
Date: Thu, 12 Mar 2020 23:09:08 +0000
Subject: [PATCH] feat: notebook to convert values to rates

---
 .../{Play.ipynb => Preprocessing-Play.ipynb}  | 244 +++++++++++++++++-
 notebooks/ToRates.ipynb                       | 220 ++++++++++++++++
 2 files changed, 451 insertions(+), 13 deletions(-)
 rename notebooks/{Play.ipynb => Preprocessing-Play.ipynb} (69%)
 create mode 100644 notebooks/ToRates.ipynb

diff --git a/notebooks/Play.ipynb b/notebooks/Preprocessing-Play.ipynb
similarity index 69%
rename from notebooks/Play.ipynb
rename to notebooks/Preprocessing-Play.ipynb
index 5ab6e9cc..71d7c4c0 100644
--- a/notebooks/Play.ipynb
+++ b/notebooks/Preprocessing-Play.ipynb
@@ -34,7 +34,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -47,7 +47,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -260,7 +260,7 @@
        "Australia                         107       3         21"
       ]
      },
-     "execution_count": 23,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -279,7 +279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -288,9 +288,16 @@
     "pop_df = pd.read_csv(zf.open(\"API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv\"), skiprows=4)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There is 2018 population data for all countries/regions except Eritrea."
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 53,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -407,13 +414,12 @@
        "[2 rows x 65 columns]"
       ]
      },
-     "execution_count": 53,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# There is 2018 pop data for all countries/regions except Eritrea\n",
     "pop_df[pd.isna(pop_df['2018'])]"
    ]
   },
@@ -421,12 +427,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Fix the country/region names that differ"
+    "Fix the country/region names that differ between the World Bank population data and the JHU CSSE data."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -445,7 +451,7 @@
     "     'St. Martin (French part)': 'Saint Martin',\n",
     "     'Brunei Darussalam': 'Brunei'\n",
     "}\n",
-    "current_pop_ser = pop_df[['Country Name', '2018']].copy().replace(region_wb_jhu_map).set_index('Country Name')\n",
+    "current_pop_ser = pop_df[['Country Name', '2018']].copy().replace(region_wb_jhu_map).set_index('Country Name')['2018']\n",
     "data_pop_ser = current_pop_ser[current_pop_ser.index.isin(current_totals_df.index)]"
    ]
   },
@@ -458,7 +464,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
@@ -545,7 +551,7 @@
        "Saint Barthelemy                        1       0          0"
       ]
      },
-     "execution_count": 112,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -554,6 +560,218 @@
     "current_totals_df[current_totals_df.index.isin(data_pop_ser.index) == False]"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Compute rates per 100,000 inhabitants for regions with more than 100 confirmed cases"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>confirmed</th>\n",
+       "      <th>deaths</th>\n",
+       "      <th>recovered</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Italy</th>\n",
+       "      <td>16.794282</td>\n",
+       "      <td>1.044161</td>\n",
+       "      <td>1.198055</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Republic of Korea</th>\n",
+       "      <td>14.550136</td>\n",
+       "      <td>0.104580</td>\n",
+       "      <td>0.478355</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Iran (Islamic Republic of)</th>\n",
+       "      <td>9.831264</td>\n",
+       "      <td>0.355745</td>\n",
+       "      <td>3.338620</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Norway</th>\n",
+       "      <td>7.526810</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.018817</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Bahrain</th>\n",
+       "      <td>7.008874</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1.401775</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Mainland China</th>\n",
+       "      <td>5.798468</td>\n",
+       "      <td>0.225169</td>\n",
+       "      <td>4.315697</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Switzerland</th>\n",
+       "      <td>5.765250</td>\n",
+       "      <td>0.035226</td>\n",
+       "      <td>0.035226</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Denmark</th>\n",
+       "      <td>4.519231</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.017249</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Spain</th>\n",
+       "      <td>3.627705</td>\n",
+       "      <td>0.074908</td>\n",
+       "      <td>0.068488</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sweden</th>\n",
+       "      <td>3.486143</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.009820</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Singapore</th>\n",
+       "      <td>2.837546</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>1.383303</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>France</th>\n",
+       "      <td>2.663194</td>\n",
+       "      <td>0.049263</td>\n",
+       "      <td>0.017914</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Belgium</th>\n",
+       "      <td>2.337580</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.008755</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Netherlands</th>\n",
+       "      <td>2.216932</td>\n",
+       "      <td>0.023214</td>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Austria</th>\n",
+       "      <td>2.057186</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.045213</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Germany</th>\n",
+       "      <td>1.756947</td>\n",
+       "      <td>0.002412</td>\n",
+       "      <td>0.021706</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Hong Kong SAR</th>\n",
+       "      <td>1.610522</td>\n",
+       "      <td>0.040263</td>\n",
+       "      <td>0.872366</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>UK</th>\n",
+       "      <td>0.574531</td>\n",
+       "      <td>0.009024</td>\n",
+       "      <td>0.027072</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>US</th>\n",
+       "      <td>0.510442</td>\n",
+       "      <td>0.017117</td>\n",
+       "      <td>0.004585</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Japan</th>\n",
+       "      <td>0.459183</td>\n",
+       "      <td>0.007903</td>\n",
+       "      <td>0.079824</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Australia</th>\n",
+       "      <td>0.428131</td>\n",
+       "      <td>0.012004</td>\n",
+       "      <td>0.084026</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Malaysia</th>\n",
+       "      <td>0.409153</td>\n",
+       "      <td>0.000000</td>\n",
+       "      <td>0.076121</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                            confirmed    deaths  recovered\n",
+       "Italy                       16.794282  1.044161   1.198055\n",
+       "Republic of Korea           14.550136  0.104580   0.478355\n",
+       "Iran (Islamic Republic of)   9.831264  0.355745   3.338620\n",
+       "Norway                       7.526810  0.000000   0.018817\n",
+       "Bahrain                      7.008874  0.000000   1.401775\n",
+       "Mainland China               5.798468  0.225169   4.315697\n",
+       "Switzerland                  5.765250  0.035226   0.035226\n",
+       "Denmark                      4.519231  0.000000   0.017249\n",
+       "Spain                        3.627705  0.074908   0.068488\n",
+       "Sweden                       3.486143  0.000000   0.009820\n",
+       "Singapore                    2.837546  0.000000   1.383303\n",
+       "France                       2.663194  0.049263   0.017914\n",
+       "Belgium                      2.337580  0.000000   0.008755\n",
+       "Netherlands                  2.216932  0.023214   0.000000\n",
+       "Austria                      2.057186  0.000000   0.045213\n",
+       "Germany                      1.756947  0.002412   0.021706\n",
+       "Hong Kong SAR                1.610522  0.040263   0.872366\n",
+       "UK                           0.574531  0.009024   0.027072\n",
+       "US                           0.510442  0.017117   0.004585\n",
+       "Japan                        0.459183  0.007903   0.079824\n",
+       "Australia                    0.428131  0.012004   0.084026\n",
+       "Malaysia                     0.409153  0.000000   0.076121"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Keep regions with more than 100 confirmed cases, then scale their counts by population.\n",
+    "current_per_100000_df = current_totals_df[current_totals_df['confirmed'] > 100].div(data_pop_ser, axis='index').mul(100000).dropna()\n",
+    "current_per_100000_df.sort_values(by='confirmed', ascending=False)"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/notebooks/ToRates.ipynb b/notebooks/ToRates.ipynb
new file mode 100644
index 00000000..6a200ae1
--- /dev/null
+++ b/notebooks/ToRates.ipynb
@@ -0,0 +1,220 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Convert Series to Rates per 100,000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "ts_folder = \"../data/covid-19_jhu-csse/\"  # folder holding the JHU CSSE time_series_19-covid-*.csv files\n",
+    "wb_path = \"../data/worldbank/SP.POP.TOTL.zip\"  # zip archive with the World Bank population CSV\n",
+    "out_folder = None  # folder the ts_rates_19-covid-*.csv files are written to\n",
+    "PAPERMILL_OUTPUT_PATH = None  # rates are only written when this is truthy; presumably injected by papermill -- confirm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "tags": [
+     "parameters"
+    ]
+   },
+   "source": [
+    "## Read in JHU CSSE data\n",
+    "\n",
+    "I plan to switch to [xarray](http://xarray.pydata.org/en/stable/), but at the moment plain pandas DataFrames are easier to work with."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_jhu_covid_region_df(name):\n",
+    "    \"\"\"Read the JHU CSSE time series `name` and aggregate it to one row per Country/Region.\"\"\"\n",
+    "    csv_path = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n",
+    "    cases_df = pd.read_csv(csv_path).set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n",
+    "    cases_df.columns = pd.to_datetime(cases_df.columns)\n",
+    "    totals_df = cases_df.groupby(level='Country/Region').sum()  # case counts summed over provinces\n",
+    "    coords_df = cases_df.reset_index([2, 3]).groupby(level='Country/Region').mean()[['Long', 'Lat']]  # mean position\n",
+    "    return totals_df.join(coords_df).set_index(['Long', 'Lat'], append=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "frames_map = dict(\n",
+    "    confirmed=read_jhu_covid_region_df(\"Confirmed\"),\n",
+    "    deaths=read_jhu_covid_region_df(\"Deaths\"),\n",
+    "    recovered=read_jhu_covid_region_df(\"Recovered\"),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "frames_map['confirmed'].sort_values(frames_map['confirmed'].columns[-1], ascending=False).head()  # first rows when sorted descending by the last date column"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read in World Bank data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import zipfile\n",
+    "with zipfile.ZipFile(wb_path) as zf, zf.open(\"API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv\") as pop_csv:\n",
+    "    pop_df = pd.read_csv(pop_csv, skiprows=4)  # context managers close the archive and member once loaded"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "There is 2018 population data for all countries/regions except Eritrea."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pop_df[pop_df['2018'].isna()]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Fix the country/region names that differ between the World Bank population data and the JHU CSSE data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "region_wb_jhu_map = {  # World Bank country name -> JHU CSSE region name\n",
+    "    'China': 'Mainland China',\n",
+    "    'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',\n",
+    "    'Korea, Rep.': 'Republic of Korea',\n",
+    "    'United States': 'US',\n",
+    "    'United Kingdom': 'UK',\n",
+    "    'Hong Kong SAR, China': 'Hong Kong SAR',\n",
+    "    'Egypt, Arab Rep.': 'Egypt',\n",
+    "    'Vietnam': 'Viet Nam',\n",
+    "    'Macao SAR, China': 'Macao SAR',\n",
+    "    'Slovak Republic': 'Slovakia',\n",
+    "    'Moldova': 'Republic of Moldova',\n",
+    "    'St. Martin (French part)': 'Saint Martin',\n",
+    "    'Brunei Darussalam': 'Brunei',\n",
+    "}\n",
+    "current_pop_ser = pop_df[['Country Name', '2018']].replace(region_wb_jhu_map).set_index('Country Name')['2018']\n",
+    "data_pop_ser = current_pop_ser.loc[current_pop_ser.index.isin(frames_map['confirmed'].index.levels[0])]  # only regions present in the JHU data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Some JHU CSSE regions cannot be matched to a World Bank population entry; these regions are ignored (they drop out when the rates are computed)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compute rates per 100,000 inhabitants for each region"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def cases_to_rates_df(df):\n",
+    "    \"\"\"Divide the case counts in `df` by `data_pop_ser` and scale to cases per 100,000.\"\"\"\n",
+    "    by_region_df = df.reset_index([1, 2], drop=True)  # drop index levels 1 and 2, keeping only level 0\n",
+    "    rates_df = by_region_df.div(data_pop_ser, axis='index').mul(100000).dropna()  # rows containing NaN are removed\n",
+    "    return rates_df.rename_axis('Country/Region')\n",
+    "    \n",
+    "def frames_to_rates(frames_map):\n",
+    "    \"\"\"Convert every frame in `frames_map` to rates per 100,000, keeping the keys.\"\"\"\n",
+    "    return {name: cases_to_rates_df(frame) for name, frame in frames_map.items()}\n",
+    "\n",
+    "rates_map = frames_to_rates(frames_map)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "if PAPERMILL_OUTPUT_PATH and out_folder:  # out_folder defaults to None, which os.path.join cannot handle\n",
+    "    for k, v in rates_map.items():\n",
+    "        out_path = os.path.join(out_folder, f\"ts_rates_19-covid-{k}.csv\")\n",
+    "        v.reset_index().to_csv(out_path)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
-- 
GitLab