From db521fa7912e960c02fb9aa2f053150b24c5874b Mon Sep 17 00:00:00 2001
From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch>
Date: Sun, 15 Mar 2020 16:33:38 +0000
Subject: [PATCH] fix: Use geodata for centroids in Dashboard

---
 notebooks/Dashboard.ipynb | 76 +++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/notebooks/Dashboard.ipynb b/notebooks/Dashboard.ipynb
index b7f64b2b..695dac12 100644
--- a/notebooks/Dashboard.ipynb
+++ b/notebooks/Dashboard.ipynb
@@ -23,6 +23,7 @@
    "source": [
     "ts_folder = \"../data/covid-19_jhu-csse/\"\n",
     "rates_folder = \"../data/covid-19_rates/\"\n",
+    "geodata_path = \"../data/geodata/geo_data.csv\"\n",
     "out_folder = None\n",
     "PAPERMILL_OUTPUT_PATH = None"
    ]
@@ -84,18 +85,10 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Compile data needed for the visualizations"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Compute geospatial coordinates\n",
-    "country_coords_df = jhu_frames_map['confirmed'].reset_index([2,3])[['Lat', 'Long']]\n",
-    "country_coords_df = country_coords_df.groupby(level='Country/Region').mean()"
+    "geodata_df = pd.read_csv(geodata_path)\n",
+    "geodata_df = geodata_df.drop('Unnamed: 0', axis=1)\n",
+    "geodata_df = geodata_df.rename({'name_jhu':'Country/Region'}, axis=1)\n",
+    "geodata_df = geodata_df.set_index('Country/Region')"
    ]
   },
   {
@@ -105,7 +98,11 @@
    "outputs": [],
    "source": [
     "# Identify countries with 100 or more cases\n",
-    "case_count_ser = jhu_frames_map['confirmed'].iloc[:,-1].groupby(level='Country/Region').sum()\n",
+    "def latest_jhu_country_ser(name):\n",
+    "    return jhu_frames_map[name].iloc[:,-1].groupby(level='Country/Region').sum()\n",
+    "\n",
+    "\n",
+    "case_count_ser = latest_jhu_country_ser('confirmed')\n",
     "countries_over_thresh = case_count_ser[case_count_ser > 99].index"
    ]
   },
@@ -115,7 +112,9 @@
    "source": [
     "# Questions About COVID-19 and Its Spread\n",
     "\n",
-    "These plots should be taken with a large grain of salt. I am not an epidemiologist, so the analyses shown here are completely naive. There are large discrepencies in the data from different countries for a variety of reasons (rates of testing, demographics, etc.) so that make direct comparisons inaccurate. Nonetheless, I think there is a lot of interesting information in this data."
+    "Understanding the spread, distribution, and deadliness of COVID-19 is difficult, despite the data available about it. Differences in rates of testing, quality of data, demographics, etc. make it difficult to compare data between countries. \n",
+    "\n",
+    "All this needs to be considered when looking at the plots below. But despite those caveats, I found it helpful to plot the raw data, even though direct comparisons between countries might be inaccurate."
    ]
   },
   {
@@ -125,7 +124,7 @@
    "outputs": [],
    "source": [
     "data_ts = jhu_frames_map['confirmed'].iloc[:,-1].name.strftime(\"%b %d %Y\")\n",
-    "display(HTML(f\"<em>Data up to {data_ts}</em>\"))"
+    "display(HTML(f\"<em>Data up to {data_ts}; countries with 100 or more confirmed cases.</em>\"))"
    ]
   },
   {
@@ -151,16 +150,27 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "def latest_rates_ser(name):\n",
+    "    return rates_frames_map[name].iloc[:,-1]\n",
+    "\n",
+    "\n",
     "# Compile the basic df\n",
     "map_df = pd.concat([\n",
-    "    rates_frames_map['confirmed'].iloc[:,-1],\n",
-    "    rates_frames_map['deaths'].iloc[:,-1],\n",
-    "    rates_frames_map['recovered'].iloc[:,-1],\n",
-    "    country_coords_df], axis=1)\n",
+    "    latest_rates_ser('confirmed'),\n",
+    "    latest_rates_ser('deaths'),\n",
+    "    latest_rates_ser('recovered')], axis=1)\n",
+    "nominal_df = pd.concat([\n",
+    "    latest_jhu_country_ser('confirmed'),\n",
+    "    latest_jhu_country_ser('deaths'),\n",
+    "    latest_jhu_country_ser('recovered')], axis=1)\n",
+    "map_df = pd.concat([map_df, nominal_df, geodata_df[['Longitude', 'Latitude']]], axis=1)\n",
     "# Restrict to countries with 100 or more cases\n",
     "map_df = map_df.loc[countries_over_thresh].dropna()\n",
     "map_df = map_df.reset_index()\n",
-    "map_df.columns = ['Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long']"
+    "map_df.columns = ['Country/Region', \n",
+    "                  'Confirmed/100k', 'Deaths/100k', 'Recovered/100k', \n",
+    "                  'Confirmed', 'Deaths', 'Recovered',\n",
+    "                  'Long', 'Lat']"
    ]
   },
   {
@@ -169,7 +179,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def map_of_variable(map_df, variable):\n",
+    "def map_of_variable(map_df, variable, title):\n",
     "    # Data generators for the background\n",
     "    sphere = alt.sphere()\n",
     "    graticule = alt.graticule()\n",
@@ -187,11 +197,13 @@
     "            latitude='Lat:Q',\n",
     "            size=alt.Size(f'{variable}:Q', title=\"Cases\"),\n",
     "            color=alt.value('steelblue'),\n",
-    "            tooltip=[\"Country/Region:N\", \"Confirmed:Q\", \"Deaths:Q\", \"Recovered:Q\"]\n",
+    "            tooltip=[\"Country/Region:N\", \n",
+    "                     \"Confirmed:Q\", \"Deaths:Q\", \"Recovered:Q\",\n",
+    "                     \"Confirmed/100k:Q\", \"Deaths/100k:Q\", \"Recovered/100k:Q\"]\n",
     "        )\n",
     "    ).project(\n",
     "        'naturalEarth1'\n",
-    "    ).properties(width=600, height=400, title=f\"{variable} cases per 100,000\"\n",
+    "    ).properties(width=600, height=400, title=f\"{title} cases per 100k inhabitants\"\n",
     "    ).configure_view(stroke=None)\n",
     "    return p"
    ]
@@ -202,11 +214,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "display(map_of_variable(map_df, 'Confirmed'))\n",
+    "display(map_of_variable(map_df, 'Confirmed/100k', 'Confirmed'))\n",
     "display(HTML('''\n",
-    "<p style=\"font-size: smaller\">Data Source: \n",
-    "  <a href=\"https://github.com/CSSEGISandData/COVID-19\">JHU CSSE</a> and\n",
-    "  <a href=\"https://data.worldbank.org/indicator/SP.POP.TOTL\">World Bank</a>\n",
+    "<p style=\"font-size: smaller\">Data Sources: \n",
+    "  <a href=\"https://github.com/CSSEGISandData/COVID-19\">JHU CSSE</a>,\n",
+    "  <a href=\"https://data.worldbank.org/indicator/SP.POP.TOTL\">World Bank</a>,\n",
+    "  <a href=\"https://worldmap.harvard.edu/data/geonode:country_centroids_az8\">Harvard Worldmap</a>\n",
     "</p>'''))"
    ]
   },
@@ -217,8 +230,11 @@
    "outputs": [],
    "source": [
     "bars = alt.Chart(map_df).mark_bar().encode(\n",
-    "    x='Confirmed:Q',\n",
-    "    y=alt.Y(\"Country/Region:N\", sort='-x')\n",
+    "    x='Confirmed/100k:Q',\n",
+    "    y=alt.Y(\"Country/Region:N\", sort='-x'),\n",
+    "    tooltip=[\"Country/Region:N\", \n",
+    "         \"Confirmed:Q\", \"Deaths:Q\", \"Recovered:Q\",\n",
+    "         \"Confirmed/100k:Q\", \"Deaths/100k:Q\", \"Recovered/100k:Q\"]\n",
     ")\n",
     "\n",
     "text = bars.mark_text(\n",
@@ -226,7 +242,7 @@
     "    baseline='middle',\n",
     "    dx=3  # Nudges text to right so it doesn't appear on top of the bar\n",
     ").encode(\n",
-    "    text=alt.Text('Confirmed:Q', format=\".3\")\n",
+    "    text=alt.Text('Confirmed/100k:Q', format=\".3\")\n",
     ")\n",
     "\n",
     "(bars + text).properties(height=900)"
-- 
GitLab