ToRates.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert Series to Rates per 100,000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import zipfile\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Default values for the variables of this cell (the cell is tagged 'parameters').\n",
    "\n",
    "# Folder holding the JHU CSSE files named time_series_19-covid-<name>.csv\n",
    "ts_folder = \"../data/covid-19_jhu-csse/\"\n",
    "# Zip archive from which the World Bank population CSV is read\n",
    "wb_path = \"../data/worldbank/SP.POP.TOTL.zip\"\n",
    "# CSV with 'name_jhu' and 'pop_est' columns, used for regions missing from the World Bank data\n",
    "geodata_path = \"../data/geodata/geo_data.csv\"\n",
    "# Folder the rate CSVs are written to by the last cell\n",
    "out_folder = None\n",
    "# The last cell only writes its CSV output when this is truthy\n",
    "PAPERMILL_OUTPUT_PATH = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "source": [
    "## Read in JHU CSSE data\n",
    "\n",
    "I will switch to [xarray](http://xarray.pydata.org/en/stable/) later, but at the moment it is easier to work with plain pandas data frames."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_jhu_covid_region_df(name, folder=None):\n",
    "    \"\"\"Load one JHU CSSE time series and aggregate it per country/region.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    name : str\n",
    "        Series name used in the file name, e.g. ``\"Confirmed\"``.\n",
    "    folder : str, optional\n",
    "        Folder containing the CSV files. Defaults to the notebook-level ``ts_folder``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per country/region, indexed by ``(Country/Region, Long, Lat)``, with one\n",
    "        datetime-labelled column per date. Counts are summed over the rows of a\n",
    "        country/region and the coordinates are the mean of those rows' coordinates.\n",
    "    \"\"\"\n",
    "    if folder is None:\n",
    "        folder = ts_folder\n",
    "    filename = os.path.join(folder, f\"time_series_19-covid-{name}.csv\")\n",
    "    df = pd.read_csv(filename)\n",
    "    df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n",
    "    df.columns = pd.to_datetime(df.columns)\n",
    "    region_df = df.groupby(level='Country/Region').sum()\n",
    "    # Select the coordinate columns before averaging so the date columns are not\n",
    "    # averaged only to be thrown away.\n",
    "    loc_df = (\n",
    "        df.reset_index(['Lat', 'Long'])\n",
    "        .groupby(level='Country/Region')[['Long', 'Lat']]\n",
    "        .mean()\n",
    "    )\n",
    "    return region_df.join(loc_df).set_index(['Long', 'Lat'], append=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the three JHU CSSE series, keyed by the lower-cased series name.\n",
    "frames_map = {\n",
    "    series_name.lower(): read_jhu_covid_region_df(series_name)\n",
    "    for series_name in (\"Confirmed\", \"Deaths\", \"Recovered\")\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read in World Bank data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Name of the population table inside the World Bank archive.\n",
    "# NOTE(review): the numeric suffix looks specific to this download - confirm when refreshing the data.\n",
    "wb_member = \"API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv\"\n",
    "\n",
    "# Context managers close both the archive and the member handle once the table is read.\n",
    "with zipfile.ZipFile(wb_path) as zf, zf.open(wb_member) as wb_file:\n",
    "    # The first four lines of the file are skipped before the header row.\n",
    "    pop_df = pd.read_csv(wb_file, skiprows=4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There is 2018 population data for all countries/regions except Eritrea; the next cell lists the rows where it is missing."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows that have no 2018 population value\n",
    "pop_df.loc[pop_df['2018'].isna()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fix the country/region names that differ between the World Bank population data and the JHU CSSE data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# World Bank country name -> name used in the JHU CSSE data\n",
    "region_wb_jhu_map = {\n",
    "    'Brunei Darussalam': 'Brunei',\n",
    "    'Czech Republic': 'Czechia',\n",
    "    'Egypt, Arab Rep.': 'Egypt',\n",
    "    'Hong Kong SAR, China': 'Hong Kong SAR',\n",
    "    'Iran, Islamic Rep.': 'Iran',\n",
    "    'Korea, Rep.': 'Korea, South',\n",
    "    'Macao SAR, China': 'Macao SAR',\n",
    "    'Russian Federation': 'Russia',\n",
    "    'Slovak Republic': 'Slovakia',\n",
    "    'St. Martin (French part)': 'Saint Martin',\n",
    "    'United States': 'US',\n",
    "}\n",
    "\n",
    "# 2018 population indexed by the (renamed) country name\n",
    "current_pop_ser = (\n",
    "    pop_df[['Country Name', '2018']]\n",
    "    .replace(region_wb_jhu_map)\n",
    "    .set_index('Country Name')['2018']\n",
    ")\n",
    "\n",
    "# Keep only the countries/regions that occur in the JHU CSSE data\n",
    "jhu_regions = frames_map['confirmed'].index.levels[0]\n",
    "data_pop_ser = current_pop_ser[current_pop_ser.index.isin(jhu_regions)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use this to find the name in the series\n",
    "# current_pop_ser[current_pop_ser.index.str.contains('Czech')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are some regions that we cannot resolve, but we will just ignore these."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the mask from the row labels themselves rather than from index.levels, which only\n",
    "# holds the distinct level values and is not guaranteed to line up with the rows.\n",
    "confirmed_regions = frames_map['confirmed'].index.get_level_values('Country/Region')\n",
    "frames_map['confirmed'].loc[~confirmed_regions.isin(data_pop_ser.index)].iloc[:, -2:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read in geodata to get additional population numbers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Geodata table keyed by the JHU region name; the 'Unnamed: 0' column is discarded.\n",
    "geodata_raw_df = pd.read_csv(geodata_path)\n",
    "geodata_df = geodata_raw_df.drop(columns='Unnamed: 0').set_index('name_jhu')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add in populations for missing countries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regions in the JHU CSSE data that have no population entry yet. The mask is built from\n",
    "# the row labels so that it always lines up with the rows of the frame.\n",
    "confirmed_regions = frames_map['confirmed'].index.get_level_values('Country/Region')\n",
    "missing_countries = (\n",
    "    confirmed_regions[~confirmed_regions.isin(data_pop_ser.index)]\n",
    "    .to_series()\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "missing_geodata_df = geodata_df.loc[geodata_df.index.isin(missing_countries)]\n",
    "display(missing_geodata_df)\n",
    "\n",
    "# Series.append was removed in pandas 2.0; pd.concat gives the same result.\n",
    "data_pop_ser = pd.concat([data_pop_ser, missing_geodata_df['pop_est']])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute rates per 100,000 for regions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cases_to_rates_df(df, pop_ser=None):\n",
    "    \"\"\"Convert case counts to rates per 100,000 inhabitants.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Counts with a three-level row index whose first level is the country/region;\n",
    "        the second and third levels are dropped.\n",
    "    pop_ser : pandas.Series, optional\n",
    "        Population indexed by country/region. Defaults to the notebook-level\n",
    "        ``data_pop_ser``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Rates indexed by ``Country/Region``. Rows containing any missing value after the\n",
    "        division (e.g. regions without a population entry) are dropped.\n",
    "    \"\"\"\n",
    "    if pop_ser is None:\n",
    "        pop_ser = data_pop_ser\n",
    "    per_100000_df = df.reset_index([1, 2], drop=True)\n",
    "    # Divide row-wise: each region's counts by that region's population.\n",
    "    per_100000_df = per_100000_df.div(pop_ser, axis='index').mul(100000).dropna()\n",
    "    per_100000_df.index.name = 'Country/Region'\n",
    "    return per_100000_df\n",
    "    \n",
    "def frames_to_rates(frames_map):\n",
    "    \"\"\"Apply ``cases_to_rates_df`` to every frame of the mapping, keeping its keys.\"\"\"\n",
    "    rates_by_name = {}\n",
    "    for series_name, cases_df in frames_map.items():\n",
    "        rates_by_name[series_name] = cases_to_rates_df(cases_df)\n",
    "    return rates_by_name\n",
    "\n",
    "\n",
    "rates_map = frames_to_rates(frames_map)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Files are only written when PAPERMILL_OUTPUT_PATH is set (presumably by papermill).\n",
    "if PAPERMILL_OUTPUT_PATH:\n",
    "    # Fail with a clear message instead of a TypeError from os.path.join(None, ...).\n",
    "    if out_folder is None:\n",
    "        raise ValueError(\"out_folder must be set to write the rate CSV files\")\n",
    "    # Create the target folder up front so to_csv does not fail on a missing directory.\n",
    "    os.makedirs(out_folder, exist_ok=True)\n",
    "    for k, v in rates_map.items():\n",
    "        out_path = os.path.join(out_folder, f\"ts_rates_19-covid-{k}.csv\")\n",
    "        v.reset_index().to_csv(out_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}