{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Convert Series to Rates per 100,000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Folder holding the JHU CSSE time_series_19-covid-*.csv files\n",
    "ts_folder = \"../data/covid-19_jhu-csse/\"\n",
    "# Zip archive with the World Bank total-population table\n",
    "wb_path = \"../data/worldbank/SP.POP.TOTL.zip\"\n",
    "# Folder the rate CSVs are written to (only used when PAPERMILL_OUTPUT_PATH is set)\n",
    "out_folder = None\n",
    "# NOTE(review): presumably injected by papermill on a headless run -- confirm\n",
    "PAPERMILL_OUTPUT_PATH = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read in JHU CSSE data\n",
    "\n",
    "I plan to switch to [xarray](http://xarray.pydata.org/en/stable/), but at the moment it is easier to work with pandas DataFrames."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_jhu_covid_region_df(name):\n",
    "    \"\"\"Load one JHU CSSE time series and aggregate it per country/region.\n",
    "\n",
    "    Reads ``time_series_19-covid-{name}.csv`` from ``ts_folder``, sums the\n",
    "    counts of all provinces/states of each country/region and attaches the\n",
    "    (unweighted) mean longitude/latitude of those provinces/states.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    name : str\n",
    "        Series name as it appears in the file name, e.g. ``\"Confirmed\"``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per country/region, indexed by\n",
    "        ``('Country/Region', 'Long', 'Lat')``; the column labels are the\n",
    "        CSV's remaining headers parsed with ``pd.to_datetime``.\n",
    "    \"\"\"\n",
    "    filename = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n",
    "    df = pd.read_csv(filename)\n",
    "    df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n",
    "    df.columns = pd.to_datetime(df.columns)\n",
    "    region_df = df.groupby(level='Country/Region').sum()\n",
    "    # Lat/Long are index levels: turn them back into columns and average\n",
    "    # only those two columns instead of every date column.\n",
    "    loc_df = (\n",
    "        df.reset_index(['Lat', 'Long'])\n",
    "        .groupby(level='Country/Region')[['Long', 'Lat']]\n",
    "        .mean()\n",
    "    )\n",
    "    return region_df.join(loc_df).set_index(['Long', 'Lat'], append=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "frames_map = {\n",
    "    \"confirmed\": read_jhu_covid_region_df(\"Confirmed\"),\n",
    "    \"deaths\": read_jhu_covid_region_df(\"Deaths\"),\n",
    "    \"recovered\": read_jhu_covid_region_df(\"Recovered\")\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Regions with the highest confirmed counts in the last (presumably most recent) column\n",
    "confirmed_df = frames_map['confirmed']\n",
    "confirmed_df.sort_values(confirmed_df.columns[-1], ascending=False).head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read in World Bank data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs":
[],
   "source": [
    "import zipfile\n",
    "\n",
    "# Name of the population table inside the World Bank archive\n",
    "wb_csv_name = \"API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv\"\n",
    "\n",
    "# The context managers close the archive and its member once the CSV is parsed.\n",
    "# skiprows=4 skips the lines that precede the header row.\n",
    "with zipfile.ZipFile(wb_path) as zf, zf.open(wb_csv_name) as wb_csv:\n",
    "    pop_df = pd.read_csv(wb_csv, skiprows=4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There is 2018 population data for all countries/regions except Eritrea:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pop_df[pop_df['2018'].isna()]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fix the country/region names that differ between the World Bank population data and the JHU CSSE data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# World Bank name -> JHU CSSE name\n",
    "region_wb_jhu_map = {\n",
    "    'China': 'Mainland China',\n",
    "    'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',\n",
    "    'Korea, Rep.': 'Republic of Korea',\n",
    "    'United States': 'US',\n",
    "    'United Kingdom': 'UK',\n",
    "    'Hong Kong SAR, China': 'Hong Kong SAR',\n",
    "    'Egypt, Arab Rep.': 'Egypt',\n",
    "    'Vietnam': 'Viet Nam',\n",
    "    'Macao SAR, China': 'Macao SAR',\n",
    "    'Slovak Republic': 'Slovakia',\n",
    "    'Moldova': 'Republic of Moldova',\n",
    "    'St. Martin (French part)': 'Saint Martin',\n",
    "    'Brunei Darussalam': 'Brunei'\n",
    "}\n",
    "\n",
    "# 2018 population indexed by country name. Only the index labels are\n",
    "# renamed, so the numeric population values are never touched.\n",
    "current_pop_ser = pop_df.set_index('Country Name')['2018'].rename(index=region_wb_jhu_map)\n",
    "\n",
    "# Keep only the regions that actually occur in the JHU CSSE data\n",
    "jhu_regions = frames_map['confirmed'].index.get_level_values('Country/Region')\n",
    "data_pop_ser = current_pop_ser[current_pop_ser.index.isin(jhu_regions)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Some regions cannot be resolved to a World Bank population entry; we simply ignore these."
]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute rates per 100,000 for regions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PER_CAPITA_SCALE = 100000  # rates are expressed per 100,000 inhabitants\n",
    "\n",
    "\n",
    "def cases_to_rates_df(df):\n",
    "    \"\"\"Convert absolute counts per region into rates per 100,000 inhabitants.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Counts whose first index level is the region; index levels 1 and 2\n",
    "        (Long and Lat for the frames built in this notebook) are dropped.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Counts divided row-wise by ``data_pop_ser`` and multiplied by\n",
    "        100,000, indexed by ``'Country/Region'``. Rows containing any NaN\n",
    "        are dropped, which removes regions without a population entry.\n",
    "    \"\"\"\n",
    "    counts_df = df.reset_index([1, 2], drop=True)\n",
    "    rates_df = (\n",
    "        counts_df\n",
    "        .div(data_pop_ser, axis='index')\n",
    "        .mul(PER_CAPITA_SCALE)\n",
    "        .dropna()\n",
    "    )\n",
    "    # Set the index name explicitly after the alignment with the population series.\n",
    "    return rates_df.rename_axis('Country/Region')\n",
    "\n",
    "\n",
    "def frames_to_rates(frames_map):\n",
    "    \"\"\"Apply ``cases_to_rates_df`` to every frame, keeping the dict keys.\"\"\"\n",
    "    return {k: cases_to_rates_df(v) for k, v in frames_map.items()}\n",
    "\n",
    "\n",
    "rates_map = frames_to_rates(frames_map)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Files are only written when PAPERMILL_OUTPUT_PATH is set.\n",
    "if PAPERMILL_OUTPUT_PATH:\n",
    "    if not out_folder:\n",
    "        # Fail with a clear message instead of a TypeError from os.path.join(None, ...)\n",
    "        raise ValueError(\"out_folder must be set to write the rate tables\")\n",
    "    os.makedirs(out_folder, exist_ok=True)\n",
    "    for k, v in rates_map.items():\n",
    "        out_path = os.path.join(out_folder, f\"ts_rates_19-covid-{k}.csv\")\n",
    "        # NOTE(review): the fresh integer index is written as an unnamed first\n",
    "        # column; kept unchanged in case existing readers rely on it.\n",
    "        v.reset_index().to_csv(out_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}