# %% [markdown]
# # Extract the Geographic Info
#
# Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations.

# %%
import os

import pandas as pd

# %% tags=["parameters"]
# Papermill parameter defaults. The "parameters" tag belongs on this code cell
# (not on a markdown cell) so papermill injects its overrides right after it.
ts_folder = "../data/covid-19_jhu-csse/"
worldmap_path = "../data/worldmap/country_centroids.csv"
out_folder = None
# NOTE(review): presumably filled in by papermill's --inject-paths option; stays
# None for interactive runs, which disables the save step at the end.
PAPERMILL_OUTPUT_PATH = None

# %% [markdown]
# ## Read in JHU CSSE data

# %%
def read_jhu_covid_region_df(name, folder=None):
    """Load one JHU CSSE time series and total it per country/region.

    Parameters
    ----------
    name : str
        Series name used to build the file name
        ``time_series_19-covid-{name}.csv`` (for example ``"Confirmed"``).
    folder : str, optional
        Directory that holds the CSV file. When omitted, the notebook-level
        ``ts_folder`` parameter is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Country/Region`` with one datetime column per date; the
        values are the sums over all rows belonging to that country/region.
    """
    if folder is None:
        folder = ts_folder
    filename = os.path.join(folder, f"time_series_19-covid-{name}.csv")
    df = pd.read_csv(filename)
    # Move every non-date column into the index so that only date columns remain.
    df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])
    df.columns = pd.to_datetime(df.columns)
    return df.groupby(level='Country/Region').sum()

# %%
confirmed_df = read_jhu_covid_region_df("Confirmed")

# %% [markdown]
# ## Read in Harvard country centroids
# %%
country_centroids_df = pd.read_csv(worldmap_path)
# Keep only the columns the visualizations need. The explicit .copy() makes the
# selection an independent frame, so adding 'name_jhu' below does not raise
# SettingWithCopyWarning.
country_centroids_df = country_centroids_df[
    ['name', 'name_long', 'region_un', 'subregion', 'region_wb',
     'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude']
].copy()
# Start from the Harvard long name; the differing names are patched further down.
country_centroids_df['name_jhu'] = country_centroids_df['name_long']

# %%
country_centroids_df.columns

# %% [markdown]
# Fix names that differ between JHU CSSE and Harvard data

# %%
# Harvard 'name_long' value -> name used in the JHU CSSE 'Country/Region' column.
region_hd_jhu_map = {
    'Brunei Darussalam': 'Brunei',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Czech Republic': 'Czechia',
    'Hong Kong': 'Hong Kong SAR',
    'Republic of Korea': 'Korea, South',
    'Macao': 'Macao SAR',
    'Russian Federation': 'Russia',
    'Taiwan': 'Taiwan*',
    'United States': 'US'
}
country_centroids_df['name_jhu'] = country_centroids_df['name_jhu'].replace(region_hd_jhu_map)

# %%
# Use this to find the name in the series
# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]

# %% [markdown]
# There are some regions that we cannot resolve, but we will just ignore these.
# %%
# JHU regions that have no matching 'name_jhu' in the centroid table,
# shown with their two most recent date columns.
confirmed_df.loc[
    ~confirmed_df.index.isin(country_centroids_df['name_jhu'])
].iloc[:, -2:]

# %% [markdown]
# ## Save the result

# %%
# Only write the CSV when PAPERMILL_OUTPUT_PATH is set; interactive runs leave
# it at None and skip the save.
if PAPERMILL_OUTPUT_PATH:
    if not out_folder:
        # out_folder defaults to None; fail with a clear message instead of the
        # TypeError that os.path.join(None, ...) would raise.
        raise ValueError(
            "out_folder must be set when PAPERMILL_OUTPUT_PATH is provided"
        )
    out_path = os.path.join(out_folder, "geo_data.csv")
    country_centroids_df.to_csv(out_path)