From db521fa7912e960c02fb9aa2f053150b24c5874b Mon Sep 17 00:00:00 2001 From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch> Date: Sun, 15 Mar 2020 16:33:38 +0000 Subject: [PATCH] fix: Use geodata for centroids in Dashboard --- notebooks/Dashboard.ipynb | 76 +++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/notebooks/Dashboard.ipynb b/notebooks/Dashboard.ipynb index b7f64b2b..695dac12 100644 --- a/notebooks/Dashboard.ipynb +++ b/notebooks/Dashboard.ipynb @@ -23,6 +23,7 @@ "source": [ "ts_folder = \"../data/covid-19_jhu-csse/\"\n", "rates_folder = \"../data/covid-19_rates/\"\n", + "geodata_path = \"../data/geodata/geo_data.csv\"\n", "out_folder = None\n", "PAPERMILL_OUTPUT_PATH = None" ] @@ -84,18 +85,10 @@ "metadata": {}, "outputs": [], "source": [ - "# Compile data needed for the visualizations" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Compute geospatial coordinates\n", - "country_coords_df = jhu_frames_map['confirmed'].reset_index([2,3])[['Lat', 'Long']]\n", - "country_coords_df = country_coords_df.groupby(level='Country/Region').mean()" + "geodata_df = pd.read_csv(geodata_path)\n", + "geodata_df = geodata_df.drop('Unnamed: 0', axis=1)\n", + "geodata_df = geodata_df.rename({'name_jhu':'Country/Region'}, axis=1)\n", + "geodata_df = geodata_df.set_index('Country/Region')" ] }, { @@ -105,7 +98,11 @@ "outputs": [], "source": [ "# Identify countries with 100 or more cases\n", - "case_count_ser = jhu_frames_map['confirmed'].iloc[:,-1].groupby(level='Country/Region').sum()\n", + "def latest_jhu_country_ser(name):\n", + " return jhu_frames_map[name].iloc[:,-1].groupby(level='Country/Region').sum()\n", + "\n", + "\n", + "case_count_ser = latest_jhu_country_ser('confirmed')\n", "countries_over_thresh = case_count_ser[case_count_ser > 99].index" ] }, @@ -115,7 +112,9 @@ "source": [ "# Questions About COVID-19 and Its Spread\n", "\n", - "These plots 
should be taken with a large grain of salt. I am not an epidemiologist, so the analyses shown here are completely naive. There are large discrepencies in the data from different countries for a variety of reasons (rates of testing, demographics, etc.) so that make direct comparisons inaccurate. Nonetheless, I think there is a lot of interesting information in this data." + "Understanding the spread, distribution, and deadliness of COVID-19 is difficult, despite the data available about it. Differences in rates of testing, quality of data, demographics, etc. make it difficult to compare data between countries. \n", + "\n", + "All this needs to be considered when looking at the plots below. But despite those caveats, I found it helpful to plot the raw data, even though direct comparisons between countries might not be accurate." ] }, { @@ -125,7 +124,7 @@ "outputs": [], "source": [ "data_ts = jhu_frames_map['confirmed'].iloc[:,-1].name.strftime(\"%b %d %Y\")\n", - "display(HTML(f\"<em>Data up to {data_ts}</em>\"))" + "display(HTML(f\"<em>Data up to {data_ts}; countries with 100 or more confirmed cases.</em>\"))" ] }, { @@ -151,16 +150,27 @@ "metadata": {}, "outputs": [], "source": [ + "def latest_rates_ser(name):\n", + " return rates_frames_map[name].iloc[:,-1]\n", + "\n", + "\n", "# Compile the basic df\n", "map_df = pd.concat([\n", - " rates_frames_map['confirmed'].iloc[:,-1],\n", - " rates_frames_map['deaths'].iloc[:,-1],\n", - " rates_frames_map['recovered'].iloc[:,-1],\n", - " country_coords_df], axis=1)\n", + " latest_rates_ser('confirmed'),\n", + " latest_rates_ser('deaths'),\n", + " latest_rates_ser('recovered')], axis=1)\n", + "nominal_df = pd.concat([\n", + " latest_jhu_country_ser('confirmed'),\n", + " latest_jhu_country_ser('deaths'),\n", + " latest_jhu_country_ser('recovered')], axis=1)\n", + "map_df = pd.concat([map_df, nominal_df, geodata_df[['Longitude', 'Latitude']]], axis=1)\n", "# Restrict to countries with 100 or more cases\n", "map_df = 
map_df.loc[countries_over_thresh].dropna()\n", "map_df = map_df.reset_index()\n", - "map_df.columns = ['Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Lat', 'Long']" + "map_df.columns = ['Country/Region', \n", + " 'Confirmed/100k', 'Deaths/100k', 'Recovered/100k', \n", + " 'Confirmed', 'Deaths', 'Recovered',\n", + " 'Long', 'Lat']" ] }, { @@ -169,7 +179,7 @@ "metadata": {}, "outputs": [], "source": [ - "def map_of_variable(map_df, variable):\n", + "def map_of_variable(map_df, variable, title):\n", " # Data generators for the background\n", " sphere = alt.sphere()\n", " graticule = alt.graticule()\n", @@ -187,11 +197,13 @@ " latitude='Lat:Q',\n", " size=alt.Size(f'{variable}:Q', title=\"Cases\"),\n", " color=alt.value('steelblue'),\n", - " tooltip=[\"Country/Region:N\", \"Confirmed:Q\", \"Deaths:Q\", \"Recovered:Q\"]\n", + " tooltip=[\"Country/Region:N\", \n", + " \"Confirmed:Q\", \"Deaths:Q\", \"Recovered:Q\",\n", + " \"Confirmed/100k:Q\", \"Deaths/100k:Q\", \"Recovered/100k:Q\"]\n", " )\n", " ).project(\n", " 'naturalEarth1'\n", - " ).properties(width=600, height=400, title=f\"{variable} cases per 100,000\"\n", + " ).properties(width=600, height=400, title=f\"{title} cases per 100k inhabitants\"\n", " ).configure_view(stroke=None)\n", " return p" ] @@ -202,11 +214,12 @@ "metadata": {}, "outputs": [], "source": [ - "display(map_of_variable(map_df, 'Confirmed'))\n", + "display(map_of_variable(map_df, 'Confirmed/100k', 'Confirmed'))\n", "display(HTML('''\n", - "<p style=\"font-size: smaller\">Data Source: \n", - " <a href=\"https://github.com/CSSEGISandData/COVID-19\">JHU CSSE</a> and\n", - " <a href=\"https://data.worldbank.org/indicator/SP.POP.TOTL\">World Bank</a>\n", + "<p style=\"font-size: smaller\">Data Sources: \n", + " <a href=\"https://github.com/CSSEGISandData/COVID-19\">JHU CSSE</a>,\n", + " <a href=\"https://data.worldbank.org/indicator/SP.POP.TOTL\">World Bank</a>,\n", + " <a 
href=\"https://worldmap.harvard.edu/data/geonode:country_centroids_az8\">Harvard Worldmap</a>\n", "</p>'''))" ] }, @@ -217,8 +230,11 @@ "outputs": [], "source": [ "bars = alt.Chart(map_df).mark_bar().encode(\n", - " x='Confirmed:Q',\n", - " y=alt.Y(\"Country/Region:N\", sort='-x')\n", + " x='Confirmed/100k:Q',\n", + " y=alt.Y(\"Country/Region:N\", sort='-x'),\n", + " tooltip=[\"Country/Region:N\", \n", + " \"Confirmed:Q\", \"Deaths:Q\", \"Recovered:Q\",\n", + " \"Confirmed/100k:Q\", \"Deaths/100k:Q\", \"Recovered/100k:Q\"]\n", ")\n", "\n", "text = bars.mark_text(\n", @@ -226,7 +242,7 @@ " baseline='middle',\n", " dx=3 # Nudges text to right so it doesn't appear on top of the bar\n", ").encode(\n", - " text=alt.Text('Confirmed:Q', format=\".3\")\n", + " text=alt.Text('Confirmed/100k:Q', format=\".3\")\n", ")\n", "\n", "(bars + text).properties(height=900)" -- GitLab