{ "cells": [ { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.019915, "end_time": "2020-03-18T17:39:45.438621", "exception": false, "start_time": "2020-03-18T17:39:45.418706", "status": "completed" }, "tags": [] }, "source": [ "# Extract the Geographic Info\n", "\n", "Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "papermill": { "duration": 0.356977, "end_time": "2020-03-18T17:39:45.806137", "exception": false, "start_time": "2020-03-18T17:39:45.449160", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "import os" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "papermill": { "duration": 0.020489, "end_time": "2020-03-18T17:39:45.839942", "exception": false, "start_time": "2020-03-18T17:39:45.819453", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "ts_folder = \"../data/covid-19_jhu-csse/\"\n", "worldmap_path = \"../data/worldmap/country_centroids.csv\"\n", "out_folder = None\n", "PAPERMILL_OUTPUT_PATH = None" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.010294, "end_time": "2020-03-18T17:39:45.862217", "exception": false, "start_time": "2020-03-18T17:39:45.851923", "status": "completed" }, "tags": [ "parameters" ] }, "source": [ "## Read in JHU CSSE data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "papermill": { "duration": 0.023154, "end_time": "2020-03-18T17:39:45.909658", "exception": false, "start_time": "2020-03-18T17:39:45.886504", "status": "completed" }, "tags": [ "injected-parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "PAPERMILL_INPUT_PATH = \"notebooks/process/CompileGeoData.ipynb\"\n", "PAPERMILL_OUTPUT_PATH = \"runs/CompileGeoData.run.ipynb\"\n", "ts_folder = \"./data/covid-19_jhu-csse/\"\n", 
"worldmap_path = \"./data/worldmap/country_centroids.csv\"\n", "out_folder = \"./data/geodata/\"\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "papermill": { "duration": 0.022631, "end_time": "2020-03-18T17:39:45.942649", "exception": false, "start_time": "2020-03-18T17:39:45.920018", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def read_jhu_covid_region_df(name):\n", " filename = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n", " df = pd.read_csv(filename)\n", " df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n", " df.columns = pd.to_datetime(df.columns)\n", " region_df = df.groupby(level='Country/Region').sum()\n", " return region_df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "papermill": { "duration": 0.05949, "end_time": "2020-03-18T17:39:46.014614", "exception": false, "start_time": "2020-03-18T17:39:45.955124", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "confirmed_df = read_jhu_covid_region_df(\"Confirmed\")" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.010077, "end_time": "2020-03-18T17:39:46.041211", "exception": false, "start_time": "2020-03-18T17:39:46.031134", "status": "completed" }, "tags": [] }, "source": [ "## Read in Harvard country centroids" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "papermill": { "duration": 0.043439, "end_time": "2020-03-18T17:39:46.094285", "exception": false, "start_time": "2020-03-18T17:39:46.050846", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "country_centroids_df = pd.read_csv(worldmap_path)\n", "country_centroids_df = country_centroids_df[['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude']]\n", "country_centroids_df['name_jhu'] = country_centroids_df['name_long'] " ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "papermill": { 
"duration": 0.034514, "end_time": "2020-03-18T17:39:46.142439", "exception": false, "start_time": "2020-03-18T17:39:46.107925", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est',\n", " 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude', 'name_jhu'],\n", " dtype='object')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "country_centroids_df.columns" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.009928, "end_time": "2020-03-18T17:39:46.166476", "exception": false, "start_time": "2020-03-18T17:39:46.156548", "status": "completed" }, "tags": [] }, "source": [ "Fix names that differ between the JHU CSSE and Harvard data by mapping each Harvard name to the one JHU CSSE uses." ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "papermill": { "duration": 0.026025, "end_time": "2020-03-18T17:39:46.202465", "exception": false, "start_time": "2020-03-18T17:39:46.176440", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "region_hd_jhu_map = {\n", " 'Brunei Darussalam': 'Brunei',\n", " \"Côte d'Ivoire\": \"Cote d'Ivoire\",\n", " 'Czech Republic': 'Czechia',\n", " 'Hong Kong': 'Hong Kong SAR',\n", " 'Republic of Korea': 'Korea, South',\n", " 'Macao': 'Macao SAR',\n", " 'Russian Federation': 'Russia',\n", " 'Taiwan': 'Taiwan*',\n", " 'United States': 'US'\n", "}\n", "country_centroids_df['name_jhu'] = country_centroids_df['name_jhu'].replace(region_hd_jhu_map)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "papermill": { "duration": 0.018618, "end_time": "2020-03-18T17:39:46.232915", "exception": false, "start_time": "2020-03-18T17:39:46.214297", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Use this to find the name in the series\n", "# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 
0.009895, "end_time": "2020-03-18T17:39:46.253760", "exception": false, "start_time": "2020-03-18T17:39:46.243865", "status": "completed" }, "tags": [] }, "source": [ "There are some regions that we cannot resolve, but we will just ignore these." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "papermill": { "duration": 0.034717, "end_time": "2020-03-18T17:39:46.298658", "exception": false, "start_time": "2020-03-18T17:39:46.263941", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>2020-03-16</th>\n", " <th>2020-03-17</th>\n", " </tr>\n", " <tr>\n", " <th>Country/Region</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Congo (Brazzaville)</th>\n", " <td>1</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>Congo (Kinshasa)</th>\n", " <td>2</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>Cruise Ship</th>\n", " <td>696</td>\n", " <td>696</td>\n", " </tr>\n", " <tr>\n", " <th>Eswatini</th>\n", " <td>1</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>Holy See</th>\n", " <td>1</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>Martinique</th>\n", " <td>15</td>\n", " <td>16</td>\n", " </tr>\n", " <tr>\n", " <th>North Macedonia</th>\n", " <td>18</td>\n", " <td>26</td>\n", " </tr>\n", " <tr>\n", " <th>Republic of the Congo</th>\n", " <td>1</td>\n", " <td>1</td>\n", " </tr>\n", " <tr>\n", " <th>The Bahamas</th>\n", " <td>1</td>\n", " <td>1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 2020-03-16 2020-03-17\n", "Country/Region \n", "Congo 
(Brazzaville) 1 1\n", "Congo (Kinshasa) 2 3\n", "Cruise Ship 696 696\n", "Eswatini 1 1\n", "Holy See 1 1\n", "Martinique 15 16\n", "North Macedonia 18 26\n", "Republic of the Congo 1 1\n", "The Bahamas 1 1" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "confirmed_df.loc[\n", " (confirmed_df.index.isin(country_centroids_df['name_jhu']) == False)\n", "].iloc[:,-2:]" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.010396, "end_time": "2020-03-18T17:39:46.322850", "exception": false, "start_time": "2020-03-18T17:39:46.312454", "status": "completed" }, "tags": [] }, "source": [ "## Save the result" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "papermill": { "duration": 0.028961, "end_time": "2020-03-18T17:39:46.362455", "exception": false, "start_time": "2020-03-18T17:39:46.333494", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "if PAPERMILL_OUTPUT_PATH:\n", " out_path = os.path.join(out_folder, f\"geo_data.csv\")\n", " country_centroids_df.to_csv(out_path)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" }, "papermill": { "duration": 2.135836, "end_time": "2020-03-18T17:39:46.683682", "environment_variables": {}, "exception": null, "input_path": "notebooks/process/CompileGeoData.ipynb", "output_path": "runs/CompileGeoData.run.ipynb", "parameters": { "PAPERMILL_INPUT_PATH": "notebooks/process/CompileGeoData.ipynb", "PAPERMILL_OUTPUT_PATH": "runs/CompileGeoData.run.ipynb", "out_folder": "./data/geodata/", "ts_folder": "./data/covid-19_jhu-csse/", "worldmap_path": "./data/worldmap/country_centroids.csv" }, "start_time": "2020-03-18T17:39:44.547846", "version": "1.1.0" } }, 
"nbformat": 4, "nbformat_minor": 4 }