fix: use geodata.csv to fill in missing population info

70676e66 · Chandrasekhar Ramakrishnan · 7d00456b · 70676e66
Commit 70676e66 authored 5 years ago by Chandrasekhar Ramakrishnan
--- a/notebooks/ToRates.ipynb
+++ b/notebooks/ToRates.ipynb
@@ -29,6 +29,7 @@
   "source": [
    "ts_folder = \"../data/covid-19_jhu-csse/\"\n",
    "wb_path = \"../data/worldbank/SP.POP.TOTL.zip\"\n",
+    "geodata_path = \"../data/geodata/geo_data.csv\"\n",
    "out_folder = None\n",
    "PAPERMILL_OUTPUT_PATH = None"
   ]
@@ -75,15 +76,6 @@
    "}"
   ]
  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "frames_map['confirmed'].sort_values(frames_map['confirmed'].columns[-1], ascending=False).head()"
-   ]
-  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -176,6 +168,44 @@
    "].iloc[:,-2:]"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Read in geodata to get additional population numbers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geodata_df = pd.read_csv(geodata_path).drop('Unnamed: 0', axis=1).set_index('name_jhu')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Add in populations for missing countries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "missing_countries = frames_map['confirmed'].loc[\n",
+    "    frames_map['confirmed'].index.levels[0].isin(data_pop_ser.index) == False\n",
+    "].iloc[:,-2:].reset_index()['Country/Region']\n",
+    "\n",
+    "display(geodata_df.loc[geodata_df.index.isin(missing_countries)])\n",
+    "\n",
+    "data_pop_ser = data_pop_ser.append(geodata_df.loc[geodata_df.index.isin(missing_countries), 'pop_est'])"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},

 %% Cell type:markdown id: tags:

 # Convert Series to Rates per 100,000

 %% Cell type:code id: tags:

 ``` python
 import pandas as pd
 import os
 ```

 %% Cell type:code id: tags:parameters

 ``` python
 ts_folder = "../data/covid-19_jhu-csse/"
 wb_path = "../data/worldbank/SP.POP.TOTL.zip"
+geodata_path = "../data/geodata/geo_data.csv"
 out_folder = None
 PAPERMILL_OUTPUT_PATH = None
 ```

 %% Cell type:markdown id: tags:parameters

 ## Read in JHU CSSE data

 I plan to switch to [xarray](http://xarray.pydata.org/en/stable/), but at the moment it is easier to work with pandas.

 %% Cell type:code id: tags:

 ``` python
 def read_jhu_covid_region_df(name):
    """Load one JHU CSSE time series and aggregate it per country/region.

    Reads ``time_series_19-covid-{name}.csv`` from ``ts_folder``, sums the
    value columns over all rows that share a 'Country/Region', and attaches
    the mean 'Long' and 'Lat' of those rows as additional index levels.

    Returns a DataFrame indexed by ('Country/Region', 'Long', 'Lat') whose
    column labels have been parsed with ``pd.to_datetime``.
    """
    csv_path = os.path.join(ts_folder, f"time_series_19-covid-{name}.csv")
    index_cols = ['Country/Region', 'Province/State', 'Lat', 'Long']
    raw = pd.read_csv(csv_path).set_index(index_cols)
    # Everything not moved into the index is a per-date column.
    raw.columns = pd.to_datetime(raw.columns)

    totals = raw.groupby(level='Country/Region').sum()

    # Index levels 2 and 3 are 'Lat' and 'Long'; turn them back into columns
    # so they can be averaged per country/region.
    coords = raw.reset_index([2, 3]).groupby(level='Country/Region').mean()
    coords = coords[['Long', 'Lat']]

    return totals.join(coords).set_index(['Long', 'Lat'], append=True)
 ```

 %% Cell type:code id: tags:

 ``` python
 # One aggregated frame per JHU CSSE series, keyed by the lower-case series name.
 frames_map = {
    series.lower(): read_jhu_covid_region_df(series)
    for series in ("Confirmed", "Deaths", "Recovered")
 }
 ```

-%% Cell type:code id: tags:
-
-``` python
-frames_map['confirmed'].sort_values(frames_map['confirmed'].columns[-1], ascending=False).head()
-```
-
 %% Cell type:markdown id: tags:

 # Read in World Bank data

 %% Cell type:code id: tags:

 ``` python
 import zipfile

 # Read the World Bank population CSV directly out of the zip archive.
 # The with-statement closes both the archive and the member handle as soon
 # as the frame has been loaded, instead of leaving them open.
 with zipfile.ZipFile(wb_path) as zf, \
        zf.open("API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv") as pop_csv:
    # skiprows=4 drops the first four lines of the file
    # (presumably a preamble before the header row -- TODO confirm).
    pop_df = pd.read_csv(pop_csv, skiprows=4)
 ```

 %% Cell type:markdown id: tags:

 2018 population data is available for every country/region except Eritrea.

 %% Cell type:code id: tags:

 ``` python
 # Show the rows that have no value in the '2018' column.
 pop_df[pop_df['2018'].isna()]
 ```

 %% Cell type:markdown id: tags:

 Fix the country/region names that differ between the World Bank population data and the JHU CSSE data.

 %% Cell type:code id: tags:

 ``` python
 # World Bank country names (keys) mapped to the names used by JHU CSSE (values).
 region_wb_jhu_map = {
     'Brunei Darussalam': 'Brunei',
     'Czech Republic': 'Czechia',
     'Egypt, Arab Rep.': 'Egypt',
     'Hong Kong SAR, China': 'Hong Kong SAR',
     'Iran, Islamic Rep.': 'Iran',
     'Korea, Rep.': 'Korea, South',
     'Macao SAR, China': 'Macao SAR',
     'Russian Federation': 'Russia',
     'Slovak Republic': 'Slovakia',
     'St. Martin (French part)': 'Saint Martin',
     'United States': 'US'
 }

 # '2018' values indexed by country name, after applying the renames above.
 current_pop_ser = (
     pop_df[['Country Name', '2018']]
     .copy()
     .replace(region_wb_jhu_map)
     .set_index('Country Name')['2018']
 )

 # Keep only the entries whose name occurs among the JHU countries/regions.
 data_pop_ser = current_pop_ser[
     current_pop_ser.index.isin(frames_map['confirmed'].index.levels[0])
 ]
 ```

 %% Cell type:code id: tags:

 ``` python
 # Use this to find the name in the series
 # current_pop_ser[current_pop_ser.index.str.contains('Czech')]
 ```

 %% Cell type:markdown id: tags:

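 Some regions cannot be matched to the World Bank population data. Where possible, their populations are filled in from the geodata file below; any regions that remain unmatched are dropped when the rates are computed.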

 %% Cell type:code id: tags:

 ``` python
 # Last two columns for the regions that have no entry in data_pop_ser.
 frames_map['confirmed'].loc[
    ~frames_map['confirmed'].index.levels[0].isin(data_pop_ser.index)
 ].iloc[:,-2:]
 ```

 %% Cell type:markdown id: tags:

+# Read in geodata to get additional population numbers
+
+%% Cell type:code id: tags:
+
+``` python
+geodata_df = pd.read_csv(geodata_path).drop('Unnamed: 0', axis=1).set_index('name_jhu')
+```
+
+%% Cell type:markdown id: tags:
+
+Add in populations for missing countries
+
+%% Cell type:code id: tags:
+
+``` python
+missing_countries = frames_map['confirmed'].loc[
+    frames_map['confirmed'].index.levels[0].isin(data_pop_ser.index) == False
+].iloc[:,-2:].reset_index()['Country/Region']
+
+display(geodata_df.loc[geodata_df.index.isin(missing_countries)])
+
+data_pop_ser = data_pop_ser.append(geodata_df.loc[geodata_df.index.isin(missing_countries), 'pop_est'])
+```
+
+%% Cell type:markdown id: tags:
+
 # Compute rates per 100,000 for regions

 %% Cell type:code id: tags:

 ``` python
 def cases_to_rates_df(df):
    """Convert per-region counts into rates per 100,000.

    Index levels 1 and 2 of ``df`` are discarded, every row is divided by
    the entry of ``data_pop_ser`` with the same index label and multiplied
    by 100000. Rows that contain a missing value afterwards (for example
    because no matching ``data_pop_ser`` entry exists) are dropped.

    Returns the resulting DataFrame with its index named 'Country/Region'.
    """
    by_region = df.reset_index([1, 2], drop=True)
    rates = by_region.div(data_pop_ser, axis='index')
    rates = rates.mul(100000).dropna()
    rates.index.name = 'Country/Region'
    return rates

 def frames_to_rates(frames_map):
    """Return a dict with the same keys whose values are cases_to_rates_df(frame)."""
    return {
        series_name: cases_to_rates_df(frame)
        for series_name, frame in frames_map.items()
    }


 rates_map = frames_to_rates(frames_map)
 ```

 %% Cell type:code id: tags:

 ``` python
 # Write one CSV per series, but only when PAPERMILL_OUTPUT_PATH is set.
 # NOTE(review): the guard tests PAPERMILL_OUTPUT_PATH while the destination
 # comes from out_folder (default None) -- confirm both are always set together.
 if PAPERMILL_OUTPUT_PATH:
    for series_name, rates_df in rates_map.items():
        csv_name = f"ts_rates_19-covid-{series_name}.csv"
        rates_df.reset_index().to_csv(os.path.join(out_folder, csv_name))
 ```