From 70676e6668125582822151bc12644b5250aa652e Mon Sep 17 00:00:00 2001
From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch>
Date: Sun, 15 Mar 2020 16:08:27 +0000
Subject: [PATCH] fix: use geodata.csv to fill in missing population info

---
 notebooks/ToRates.ipynb | 48 +++++++++++++++++++++++++++++++++--------
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/notebooks/ToRates.ipynb b/notebooks/ToRates.ipynb
index d0aa13c5..cfc1c6bc 100644
--- a/notebooks/ToRates.ipynb
+++ b/notebooks/ToRates.ipynb
@@ -29,6 +29,7 @@
    "source": [
     "ts_folder = \"../data/covid-19_jhu-csse/\"\n",
     "wb_path = \"../data/worldbank/SP.POP.TOTL.zip\"\n",
+    "geodata_path = \"../data/geodata/geo_data.csv\"\n",
     "out_folder = None\n",
     "PAPERMILL_OUTPUT_PATH = None"
    ]
@@ -75,15 +76,6 @@
     "}"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "frames_map['confirmed'].sort_values(frames_map['confirmed'].columns[-1], ascending=False).head()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -176,6 +168,44 @@
     "].iloc[:,-2:]"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Read in geodata to get additional population numbers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "geodata_df = pd.read_csv(geodata_path).drop(columns='Unnamed: 0').set_index('name_jhu')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Add in populations from `geodata_df` for countries that have no entry in `data_pop_ser`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Per-row mask: index.levels[0] holds unique level values and need not align with the rows.\n",
+    "is_missing = ~frames_map['confirmed'].index.get_level_values(0).isin(data_pop_ser.index)\n",
+    "missing_countries = frames_map['confirmed'].loc[is_missing].reset_index()['Country/Region']\n",
+    "\n",
+    "display(geodata_df.loc[geodata_df.index.isin(missing_countries)])\n",
+    "# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.\n",
+    "data_pop_ser = pd.concat([data_pop_ser, geodata_df.loc[geodata_df.index.isin(missing_countries), 'pop_est']])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
-- 
GitLab