{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract the Geographic Info\n",
    "\n",
    "Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Notebook parameters (this is the cell tagged \"parameters\").\n",
    "ts_folder = \"../data/covid-19_jhu-csse/\"\n",
    "worldmap_path = \"../data/worldmap/country_centroids.csv\"\n",
    "out_folder = None\n",
    "PAPERMILL_OUTPUT_PATH = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read in JHU CSSE data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_jhu_covid_region_df(name):\n",
    "    \"\"\"Load one JHU CSSE time series and total it per country/region.\n",
    "\n",
    "    Reads ``time_series_19-covid-{name}.csv`` from the notebook-level\n",
    "    ``ts_folder`` and sums the rows that share a 'Country/Region'.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    name : str\n",
    "        Series name used in the file name, e.g. 'Confirmed'.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Indexed by 'Country/Region'; the column labels are the CSV's\n",
    "        remaining column headers converted with ``pd.to_datetime``.\n",
    "    \"\"\"\n",
    "    filename = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n",
    "    df = pd.read_csv(filename)\n",
    "    # Move the non-date columns into the index so that every remaining\n",
    "    # column header can be parsed as a date.\n",
    "    df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n",
    "    df.columns = pd.to_datetime(df.columns)\n",
    "    region_df = df.groupby(level='Country/Region').sum()\n",
    "    return region_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "confirmed_df = read_jhu_covid_region_df(\"Confirmed\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Read in Harvard country centroids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "centroid_columns = [\n",
    "    'name', 'name_long', 'region_un', 'subregion', 'region_wb',\n",
    "    'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude'\n",
    "]\n",
    "# Take an explicit copy of the column subset before adding a column to it.\n",
    "country_centroids_df = pd.read_csv(worldmap_path)[centroid_columns].copy()\n",
    "country_centroids_df['name_jhu'] = country_centroids_df['name_long']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "country_centroids_df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fix names that differ between JHU CSSE and Harvard data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "region_hd_jhu_map = {\n",
    "    'Brunei Darussalam': 'Brunei',\n",
    "    \"Côte d'Ivoire\": \"Cote d'Ivoire\",\n",
    "    'Czech Republic': 'Czechia',\n",
    "    'Hong Kong': 'Hong Kong SAR',\n",
    "    'Republic of Korea': 'Korea, South',\n",
    "    'Macao': 'Macao SAR',\n",
    "    'Russian Federation': 'Russia',\n",
    "    'Taiwan': 'Taiwan*',\n",
    "    'United States': 'US'\n",
    "}\n",
    "# Derive from name_long rather than name_jhu so that re-running this cell\n",
    "# always starts from the untouched Harvard names.\n",
    "country_centroids_df['name_jhu'] = country_centroids_df['name_long'].replace(region_hd_jhu_map)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use this to find the name in the series\n",
    "# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "There are some regions that we cannot resolve, but we will just ignore these."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# JHU regions without a matching name_jhu; show only the last two columns.\n",
    "confirmed_df.loc[\n",
    "    ~confirmed_df.index.isin(country_centroids_df['name_jhu'])\n",
    "].iloc[:, -2:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if PAPERMILL_OUTPUT_PATH:\n",
    "    # out_folder defaults to None; fail with a clear message instead of the\n",
    "    # TypeError that os.path.join(None, ...) would raise.\n",
    "    if out_folder is None:\n",
    "        raise ValueError('out_folder must be set when PAPERMILL_OUTPUT_PATH is set')\n",
    "    out_path = os.path.join(out_folder, \"geo_data.csv\")\n",
    "    country_centroids_df.to_csv(out_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}