Newer
Older
Chandrasekhar Ramakrishnan
committed
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.019915,
"end_time": "2020-03-18T17:39:45.438621",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:45.418706",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"source": [
"# Extract the Geographic Info\n",
"\n",
"Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.356977,
"end_time": "2020-03-18T17:39:45.806137",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:45.449160",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.020489,
"end_time": "2020-03-18T17:39:45.839942",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:45.819453",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Default parameter values; the injected-parameters cell further down\n",
"# reassigns these names when the notebook is executed through papermill.\n",
"ts_folder = \"../data/covid-19_jhu-csse/\"\n",
"worldmap_path = \"../data/worldmap/country_centroids.csv\"\n",
"out_folder = None\n",
"# The final save cell only runs when PAPERMILL_OUTPUT_PATH is truthy.\n",
"PAPERMILL_OUTPUT_PATH = None"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.010294,
"end_time": "2020-03-18T17:39:45.862217",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:45.851923",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": [
"parameters"
]
},
"source": [
"## Read in JHU CSSE data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.023154,
"end_time": "2020-03-18T17:39:45.909658",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:45.886504",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": [
"injected-parameters"
]
},
"outputs": [],
"source": [
"# Parameters\n",
Chandrasekhar Ramakrishnan
committed
"PAPERMILL_INPUT_PATH = \"notebooks/process/CompileGeoData.ipynb\"\n",
Chandrasekhar Ramakrishnan
committed
"PAPERMILL_OUTPUT_PATH = \"runs/CompileGeoData.run.ipynb\"\n",
Chandrasekhar Ramakrishnan
committed
"ts_folder = \"./data/covid-19_jhu-csse/\"\n",
"worldmap_path = \"./data/worldmap/country_centroids.csv\"\n",
"out_folder = \"./data/geodata/\"\n"
Chandrasekhar Ramakrishnan
committed
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.022631,
"end_time": "2020-03-18T17:39:45.942649",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:45.920018",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"def read_jhu_covid_region_df(name):\n",
"    \"\"\"Load one JHU CSSE time series and total it per country/region.\n",
"\n",
"    Reads ``time_series_19-covid-{name}.csv`` from the notebook-level\n",
"    ``ts_folder``, moves the four identifying columns into the index,\n",
"    parses the remaining column labels as timestamps and sums all rows\n",
"    that share a ``Country/Region`` value.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    name : str\n",
"        Series name used in the file name, e.g. ``\"Confirmed\"``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    pandas.DataFrame\n",
"        One row per ``Country/Region``, one column per parsed date label.\n",
"    \"\"\"\n",
"    csv_path = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n",
"    id_columns = ['Country/Region', 'Province/State', 'Lat', 'Long']\n",
"    series_df = pd.read_csv(csv_path).set_index(id_columns)\n",
"    # Every column left after set_index is parsed as a timestamp.\n",
"    series_df.columns = pd.to_datetime(series_df.columns)\n",
"    return series_df.groupby(level='Country/Region').sum()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.05949,
"end_time": "2020-03-18T17:39:46.014614",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:45.955124",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Per-Country/Region totals from the \"Confirmed\" time series file.\n",
"confirmed_df = read_jhu_covid_region_df(\"Confirmed\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.010077,
"end_time": "2020-03-18T17:39:46.041211",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.031134",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"source": [
"## Read in Harvard country centroids"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.043439,
"end_time": "2020-03-18T17:39:46.094285",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.050846",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Load the centroids, keep a fixed subset of columns, and seed `name_jhu`\n",
"# with the Harvard long name (adjusted to JHU spellings in a later cell).\n",
"country_centroids_df = (\n",
"    pd.read_csv(worldmap_path)[[\n",
"        'name', 'name_long', 'region_un', 'subregion', 'region_wb',\n",
"        'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude',\n",
"    ]]\n",
"    .assign(name_jhu=lambda frame: frame['name_long'])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.034514,
"end_time": "2020-03-18T17:39:46.142439",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.107925",
Chandrasekhar Ramakrishnan
committed
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est',\n",
" 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude', 'name_jhu'],\n",
" dtype='object')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Display the retained columns, including the added `name_jhu`.\n",
"country_centroids_df.columns"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.009928,
"end_time": "2020-03-18T17:39:46.166476",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.156548",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"source": [
"Map the Harvard country names to the spellings used in the JHU CSSE data wherever the two differ."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.026025,
"end_time": "2020-03-18T17:39:46.202465",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.176440",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Harvard `name_long` spelling -> spelling used for JHU CSSE `Country/Region`.\n",
"region_hd_jhu_map = {\n",
"    \"Brunei Darussalam\": \"Brunei\",\n",
"    \"Côte d'Ivoire\": \"Cote d'Ivoire\",\n",
"    \"Czech Republic\": \"Czechia\",\n",
"    \"Hong Kong\": \"Hong Kong SAR\",\n",
"    \"Republic of Korea\": \"Korea, South\",\n",
"    \"Macao\": \"Macao SAR\",\n",
"    \"Russian Federation\": \"Russia\",\n",
"    \"Taiwan\": \"Taiwan*\",\n",
"    \"United States\": \"US\",\n",
"}\n",
"# Values without an entry in the map are left unchanged.\n",
"country_centroids_df['name_jhu'] = (\n",
"    country_centroids_df['name_jhu'].replace(to_replace=region_hd_jhu_map)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.018618,
"end_time": "2020-03-18T17:39:46.232915",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.214297",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Use this to find the name in the series\n",
"# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.009895,
"end_time": "2020-03-18T17:39:46.253760",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.243865",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"source": [
"Some JHU CSSE regions still have no matching entry in the Harvard centroids. They are listed below and are simply ignored."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.034717,
"end_time": "2020-03-18T17:39:46.298658",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.263941",
Chandrasekhar Ramakrishnan
committed
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
Chandrasekhar Ramakrishnan
committed
" </tr>\n",
" <tr>\n",
" <th>Country/Region</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Congo (Brazzaville)</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
Chandrasekhar Ramakrishnan
committed
" <th>Congo (Kinshasa)</th>\n",
Chandrasekhar Ramakrishnan
committed
" </tr>\n",
" <tr>\n",
" <th>Cruise Ship</th>\n",
" <td>696</td>\n",
" <td>696</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
Chandrasekhar Ramakrishnan
committed
" <th>Holy See</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Martinique</th>\n",
Chandrasekhar Ramakrishnan
committed
" </tr>\n",
" <tr>\n",
" <th>North Macedonia</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Republic of the Congo</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>The Bahamas</th>\n",
" <td>1</td>\n",
Chandrasekhar Ramakrishnan
committed
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 2020-03-16 2020-03-17\n",
"Country/Region \n",
"Congo (Brazzaville) 1 1\n",
"Congo (Kinshasa) 2 3\n",
"Cruise Ship 696 696\n",
"Eswatini 1 1\n",
"Holy See 1 1\n",
"Martinique 15 16\n",
"North Macedonia 18 26\n",
"Republic of the Congo 1 1\n",
"The Bahamas 1 1"
Chandrasekhar Ramakrishnan
committed
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# JHU regions whose name is absent from `name_jhu`; show only the last\n",
"# two date columns. The mask is negated with `~` rather than `== False`.\n",
"confirmed_df.loc[\n",
"    ~confirmed_df.index.isin(country_centroids_df['name_jhu'])\n",
"].iloc[:, -2:]"
]
},
{
"cell_type": "markdown",
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.010396,
"end_time": "2020-03-18T17:39:46.322850",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.312454",
Chandrasekhar Ramakrishnan
committed
"status": "completed"
},
"tags": []
},
"source": [
"## Save the result"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 0.028961,
"end_time": "2020-03-18T17:39:46.362455",
Chandrasekhar Ramakrishnan
committed
"exception": false,
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:46.333494",
Chandrasekhar Ramakrishnan
committed
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Write the CSV only for papermill runs; PAPERMILL_OUTPUT_PATH defaults to\n",
"# None, so interactive runs skip this step.\n",
"if PAPERMILL_OUTPUT_PATH:\n",
"    # out_folder also defaults to None; fail with a clear message instead of\n",
"    # the TypeError os.path.join would raise.\n",
"    if not out_folder:\n",
"        raise ValueError(\"out_folder must be set when running under papermill\")\n",
"    # to_csv does not create missing directories.\n",
"    os.makedirs(out_folder, exist_ok=True)\n",
"    out_path = os.path.join(out_folder, \"geo_data.csv\")\n",
"    country_centroids_df.to_csv(out_path)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
},
"papermill": {
Chandrasekhar Ramakrishnan
committed
"duration": 2.135836,
"end_time": "2020-03-18T17:39:46.683682",
Chandrasekhar Ramakrishnan
committed
"environment_variables": {},
"exception": null,
Chandrasekhar Ramakrishnan
committed
"input_path": "notebooks/process/CompileGeoData.ipynb",
Chandrasekhar Ramakrishnan
committed
"output_path": "runs/CompileGeoData.run.ipynb",
"parameters": {
Chandrasekhar Ramakrishnan
committed
"PAPERMILL_INPUT_PATH": "notebooks/process/CompileGeoData.ipynb",
Chandrasekhar Ramakrishnan
committed
"PAPERMILL_OUTPUT_PATH": "runs/CompileGeoData.run.ipynb",
Chandrasekhar Ramakrishnan
committed
"out_folder": "./data/geodata/",
"ts_folder": "./data/covid-19_jhu-csse/",
"worldmap_path": "./data/worldmap/country_centroids.csv"
Chandrasekhar Ramakrishnan
committed
},
Chandrasekhar Ramakrishnan
committed
"start_time": "2020-03-18T17:39:44.547846",
Chandrasekhar Ramakrishnan
committed
"version": "1.1.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}