{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.022249,
     "end_time": "2020-03-17T10:09:07.647548",
     "exception": false,
     "start_time": "2020-03-17T10:09:07.625299",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Extract the Geographic Info\n",
    "\n",
    "Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "papermill": {
     "duration": 0.378966,
     "end_time": "2020-03-17T10:09:08.037220",
     "exception": false,
     "start_time": "2020-03-17T10:09:07.658254",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "papermill": {
     "duration": 0.021222,
     "end_time": "2020-03-17T10:09:08.076492",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.055270",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "ts_folder = \"../data/covid-19_jhu-csse/\"\n",
    "worldmap_path = \"../data/worldmap/country_centroids.csv\"\n",
    "out_folder = None\n",
    "PAPERMILL_OUTPUT_PATH = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.008239,
     "end_time": "2020-03-17T10:09:08.093667",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.085428",
     "status": "completed"
    },
    "tags": [
     "parameters"
    ]
   },
   "source": [
    "## Read in JHU CSSE data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "papermill": {
     "duration": 0.02409,
     "end_time": "2020-03-17T10:09:08.124799",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.100709",
     "status": "completed"
    },
    "tags": [
     "injected-parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "PAPERMILL_INPUT_PATH = \"/tmp/9d0h4tlx/notebooks/CompileGeoData.ipynb\"\n",
    "PAPERMILL_OUTPUT_PATH = \"runs/CompileGeoData.run.ipynb\"\n",
    "ts_folder = \"/tmp/9d0h4tlx/data/covid-19_jhu-csse\"\n",
    "worldmap_path = \"/tmp/9d0h4tlx/data/worldmap/country_centroids.csv\"\n",
    "out_folder = \"data/geodata\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "papermill": {
     "duration": 0.027104,
     "end_time": "2020-03-17T10:09:08.168406",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.141302",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "def read_jhu_covid_region_df(name):\n",
    "    filename = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n",
    "    df = pd.read_csv(filename)\n",
    "    df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n",
    "    df.columns = pd.to_datetime(df.columns)\n",
    "    region_df = df.groupby(level='Country/Region').sum()\n",
    "    return region_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "papermill": {
     "duration": 0.0591,
     "end_time": "2020-03-17T10:09:08.238956",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.179856",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "confirmed_df = read_jhu_covid_region_df(\"Confirmed\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.010019,
     "end_time": "2020-03-17T10:09:08.263998",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.253979",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Read in Harvard country centroids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "papermill": {
     "duration": 0.047618,
     "end_time": "2020-03-17T10:09:08.319197",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.271579",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "country_centroids_df = pd.read_csv(worldmap_path)\n",
    "country_centroids_df = country_centroids_df[['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude']]\n",
    "country_centroids_df['name_jhu'] = country_centroids_df['name_long'] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "papermill": {
     "duration": 0.035439,
     "end_time": "2020-03-17T10:09:08.368648",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.333209",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est',\n",
       "       'gdp_md_est', 'income_grp', 'Longitude', 'Latitude', 'name_jhu'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "country_centroids_df.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.007979,
     "end_time": "2020-03-17T10:09:08.387999",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.380020",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "Fix names that differ between JHU CSSE and Harvard data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "papermill": {
     "duration": 0.026191,
     "end_time": "2020-03-17T10:09:08.421750",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.395559",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "region_hd_jhu_map = {\n",
    "     'Brunei Darussalam': 'Brunei',\n",
    "     \"Côte d'Ivoire\": \"Cote d'Ivoire\",\n",
    "     'Czech Republic': 'Czechia',\n",
    "     'Hong Kong': 'Hong Kong SAR',\n",
    "     'Republic of Korea': 'Korea, South',\n",
    "     'Macao': 'Macao SAR',\n",
    "     'Russian Federation': 'Russia',\n",
    "     'Taiwan': 'Taiwan*',\n",
    "     'United States': 'US'\n",
    "}\n",
    "country_centroids_df['name_jhu'] = country_centroids_df['name_jhu'].replace(region_hd_jhu_map)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "papermill": {
     "duration": 0.019254,
     "end_time": "2020-03-17T10:09:08.453053",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.433799",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Use this to find the name in the series\n",
    "# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.0076,
     "end_time": "2020-03-17T10:09:08.470246",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.462646",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "There are some regions that we cannot resolve, but we will just ignore these."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "papermill": {
     "duration": 0.044332,
     "end_time": "2020-03-17T10:09:08.521943",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.477611",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>2020-03-15</th>\n",
       "      <th>2020-03-16</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Country/Region</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Congo (Brazzaville)</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Congo (Kinshasa)</th>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Cruise Ship</th>\n",
       "      <td>696</td>\n",
       "      <td>696</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Eswatini</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>French Guiana</th>\n",
       "      <td>0</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Guadeloupe</th>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Holy See</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Martinique</th>\n",
       "      <td>9</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Mayotte</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>North Macedonia</th>\n",
       "      <td>14</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Republic of the Congo</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Reunion</th>\n",
       "      <td>7</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>The Bahamas</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>occupied Palestinian territory</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                2020-03-15  2020-03-16\n",
       "Country/Region                                        \n",
       "Congo (Brazzaville)                      1           1\n",
       "Congo (Kinshasa)                         2           2\n",
       "Cruise Ship                            696         696\n",
       "Eswatini                                 1           1\n",
       "French Guiana                            0          11\n",
       "Guadeloupe                               3           6\n",
       "Holy See                                 1           1\n",
       "Martinique                               9          15\n",
       "Mayotte                                  0           1\n",
       "North Macedonia                         14          18\n",
       "Republic of the Congo                    0           1\n",
       "Reunion                                  7           9\n",
       "The Bahamas                              0           1\n",
       "occupied Palestinian territory           0           0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "confirmed_df.loc[\n",
    "    (confirmed_df.index.isin(country_centroids_df['name_jhu']) == False)\n",
    "].iloc[:,-2:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.009942,
     "end_time": "2020-03-17T10:09:08.549521",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.539579",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Save the result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "papermill": {
     "duration": 0.031984,
     "end_time": "2020-03-17T10:09:08.589686",
     "exception": false,
     "start_time": "2020-03-17T10:09:08.557702",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "if PAPERMILL_OUTPUT_PATH:\n",
    "    out_path = os.path.join(out_folder, f\"geo_data.csv\")\n",
    "    country_centroids_df.to_csv(out_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "papermill": {
   "duration": 2.13034,
   "end_time": "2020-03-17T10:09:08.914717",
   "environment_variables": {},
   "exception": null,
   "input_path": "/tmp/9d0h4tlx/notebooks/CompileGeoData.ipynb",
   "output_path": "runs/CompileGeoData.run.ipynb",
   "parameters": {
    "PAPERMILL_INPUT_PATH": "/tmp/9d0h4tlx/notebooks/CompileGeoData.ipynb",
    "PAPERMILL_OUTPUT_PATH": "runs/CompileGeoData.run.ipynb",
    "out_folder": "data/geodata",
    "ts_folder": "/tmp/9d0h4tlx/data/covid-19_jhu-csse",
    "worldmap_path": "/tmp/9d0h4tlx/data/worldmap/country_centroids.csv"
   },
   "start_time": "2020-03-17T10:09:06.784377",
   "version": "1.1.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}