{ "cells": [ { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.022323, "end_time": "2020-03-26T13:22:44.646598", "exception": false, "start_time": "2020-03-26T13:22:44.624275", "status": "completed" }, "tags": [] }, "source": [ "# Extract the Geographic Info\n", "\n", "Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "papermill": { "duration": 0.337871, "end_time": "2020-03-26T13:22:44.995055", "exception": false, "start_time": "2020-03-26T13:22:44.657184", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "import pandas as pd\n", "import os" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "papermill": { "duration": 0.024696, "end_time": "2020-03-26T13:22:45.036188", "exception": false, "start_time": "2020-03-26T13:22:45.011492", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "ts_folder = \"../../data/covid-19_jhu-csse/\"\n", "worldmap_path = \"../../data/worldmap/country_centroids.csv\"\n", "out_folder = None\n", "PAPERMILL_OUTPUT_PATH = None" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.008821, "end_time": "2020-03-26T13:22:45.057062", "exception": false, "start_time": "2020-03-26T13:22:45.048241", "status": "completed" }, "tags": [ "parameters" ] }, "source": [ "## Read in JHU CSSE data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "papermill": { "duration": 0.024941, "end_time": "2020-03-26T13:22:45.090116", "exception": false, "start_time": "2020-03-26T13:22:45.065175", "status": "completed" }, "tags": [ "injected-parameters" ] }, "outputs": [], "source": [ "# Parameters\n", "PAPERMILL_INPUT_PATH = \"/tmp/g3lga9o8/notebooks/process/CompileGeoData.ipynb\"\n", "PAPERMILL_OUTPUT_PATH = \"runs/CompileGeoData.run.ipynb\"\n", "ts_folder = \"/tmp/g3lga9o8/data/covid-19_jhu-csse\"\n", "worldmap_path = \"/tmp/g3lga9o8/data/worldmap/country_centroids.csv\"\n", "out_folder = \"/tmp/g3lga9o8/data/geodata\"\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "papermill": { "duration": 0.025233, "end_time": "2020-03-26T13:22:45.135369", "exception": false, "start_time": "2020-03-26T13:22:45.110136", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "def read_jhu_covid_region_df(name):\n", " filename = os.path.join(ts_folder, f\"time_series_covid19_{name}_global.csv\")\n", " df = pd.read_csv(filename)\n", " df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n", " df.columns = pd.to_datetime(df.columns)\n", " region_df = df.groupby(level='Country/Region').sum()\n", " return region_df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "papermill": { "duration": 0.054123, "end_time": "2020-03-26T13:22:45.202307", "exception": false, "start_time": "2020-03-26T13:22:45.148184", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "confirmed_df = read_jhu_covid_region_df(\"confirmed\")" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.007699, "end_time": "2020-03-26T13:22:45.217884", "exception": false, "start_time": "2020-03-26T13:22:45.210185", "status": "completed" }, "tags": [] }, "source": [ "# Read in Harvard country centroids" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "papermill": { "duration": 0.031588, "end_time": "2020-03-26T13:22:45.257211", "exception": false, "start_time": "2020-03-26T13:22:45.225623", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "country_centroids_df = pd.read_csv(worldmap_path)\n", "country_centroids_df = country_centroids_df[['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude']]\n", "country_centroids_df['name_jhu'] = country_centroids_df['name_long'] " ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "papermill": { "duration": 0.037705, "end_time": "2020-03-26T13:22:45.303604", "exception": false, "start_time": "2020-03-26T13:22:45.265899", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "Index(['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est',\n", " 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude', 'name_jhu'],\n", " dtype='object')" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "country_centroids_df.columns" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.009762, "end_time": "2020-03-26T13:22:45.332521", "exception": false, "start_time": "2020-03-26T13:22:45.322759", "status": "completed" }, "tags": [] }, "source": [ "Fix names that differ between JHU CSSE and Harvard data" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "papermill": { "duration": 0.030831, "end_time": "2020-03-26T13:22:45.371494", "exception": false, "start_time": "2020-03-26T13:22:45.340663", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "region_hd_jhu_map = {\n", " 'Brunei Darussalam': 'Brunei',\n", " \"Côte d'Ivoire\": \"Cote d'Ivoire\",\n", " 'Czech Republic': 'Czechia',\n", " 'Hong Kong': 'Hong Kong SAR',\n", " 'Republic of Korea': 'Korea, South',\n", " 'Macao': 'Macao SAR',\n", " 'Russian Federation': 'Russia',\n", " 'Taiwan': 'Taiwan*',\n", " 'United States': 'US'\n", "}\n", "country_centroids_df['name_jhu'] = country_centroids_df['name_jhu'].replace(region_hd_jhu_map)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "papermill": { "duration": 0.026196, "end_time": "2020-03-26T13:22:45.411424", "exception": false, "start_time": "2020-03-26T13:22:45.385228", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Use this to find the name in the series\n", "# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.009748, "end_time": "2020-03-26T13:22:45.435849", "exception": false, "start_time": "2020-03-26T13:22:45.426101", "status": "completed" }, "tags": [] }, "source": [ "There are some regions that we cannot resolve, but we will just ignore these." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "papermill": { "duration": 0.04638, "end_time": "2020-03-26T13:22:45.491053", "exception": false, "start_time": "2020-03-26T13:22:45.444673", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>2020-03-24</th>\n", " <th>2020-03-25</th>\n", " </tr>\n", " <tr>\n", " <th>Country/Region</th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>Cabo Verde</th>\n", " <td>3</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>Congo (Brazzaville)</th>\n", " <td>4</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>Congo (Kinshasa)</th>\n", " <td>45</td>\n", " <td>48</td>\n", " </tr>\n", " <tr>\n", " <th>Diamond Princess</th>\n", " <td>712</td>\n", " <td>712</td>\n", " </tr>\n", " <tr>\n", " <th>Eswatini</th>\n", " <td>4</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>Gambia</th>\n", " <td>3</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>Holy See</th>\n", " <td>4</td>\n", " <td>4</td>\n", " </tr>\n", " <tr>\n", " <th>Laos</th>\n", " <td>2</td>\n", " <td>3</td>\n", " </tr>\n", " <tr>\n", " <th>North Macedonia</th>\n", " <td>148</td>\n", " <td>177</td>\n", " </tr>\n", " <tr>\n", " <th>West Bank and Gaza</th>\n", " <td>59</td>\n", " <td>-1</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " 2020-03-24 2020-03-25\n", "Country/Region \n", "Cabo Verde 3 4\n", "Congo (Brazzaville) 4 4\n", "Congo (Kinshasa) 45 48\n", "Diamond Princess 712 712\n", "Eswatini 4 4\n", "Gambia 3 3\n", "Holy See 4 4\n", "Laos 2 3\n", "North Macedonia 148 177\n", "West Bank and Gaza 59 -1" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "confirmed_df.loc[\n", " (confirmed_df.index.isin(country_centroids_df['name_jhu']) == False)\n", "].iloc[:,-2:]" ] }, { "cell_type": "markdown", "metadata": { "papermill": { "duration": 0.009932, "end_time": "2020-03-26T13:22:45.518476", "exception": false, "start_time": "2020-03-26T13:22:45.508544", "status": "completed" }, "tags": [] }, "source": [ "# Save the result" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "papermill": { "duration": 0.037962, "end_time": "2020-03-26T13:22:45.565000", "exception": false, "start_time": "2020-03-26T13:22:45.527038", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "if PAPERMILL_OUTPUT_PATH:\n", " out_path = os.path.join(out_folder, f\"geo_data.csv\")\n", " country_centroids_df.to_csv(out_path)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" }, "papermill": { "duration": 2.257336, "end_time": "2020-03-26T13:22:46.017593", "environment_variables": {}, "exception": null, "input_path": "/tmp/g3lga9o8/notebooks/process/CompileGeoData.ipynb", "output_path": "runs/CompileGeoData.run.ipynb", "parameters": { "PAPERMILL_INPUT_PATH": "/tmp/g3lga9o8/notebooks/process/CompileGeoData.ipynb", "PAPERMILL_OUTPUT_PATH": "runs/CompileGeoData.run.ipynb", "out_folder": "/tmp/g3lga9o8/data/geodata", "ts_folder": "/tmp/g3lga9o8/data/covid-19_jhu-csse", "worldmap_path": "/tmp/g3lga9o8/data/worldmap/country_centroids.csv" }, "start_time": "2020-03-26T13:22:43.760257", "version": "1.1.0" } }, "nbformat": 4, "nbformat_minor": 4 }