From c47b83e6dcc95fa8fbfb422e23126839e15f5edf Mon Sep 17 00:00:00 2001
From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch>
Date: Sun, 15 Mar 2020 15:40:47 +0000
Subject: [PATCH] renku run papermill -p ts_folder ./data/covid-19_jhu-csse/ -p
 worldmap_path ./data/worldmap/country_centroids.csv -p out_folder
 ./data/geodata/ --inject-paths notebooks/CompileGeoData.ipynb
 runs/CompileGeoData.run.ipynb

---
 .gitattributes                                |   1 +
 ...1f74e51d4e54bc522007a2030ec2_papermill.cwl | 115 +++++
 data/geodata/geo_data.csv                     |   3 +
 runs/CompileGeoData.run.ipynb                 | 481 ++++++++++++++++++
 4 files changed, 600 insertions(+)
 create mode 100644 .renku/workflow/73781f74e51d4e54bc522007a2030ec2_papermill.cwl
 create mode 100644 data/geodata/geo_data.csv
 create mode 100644 runs/CompileGeoData.run.ipynb

diff --git a/.gitattributes b/.gitattributes
index 4243bf16..9cc936b0 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -7,3 +7,4 @@ data/covid-19_rates/ts_rates_19-covid-deaths.csv filter=lfs diff=lfs merge=lfs -
 data/covid-19_rates/ts_rates_19-covid-recovered.csv filter=lfs diff=lfs merge=lfs -text
 data/covid-19_rates/ts_rates_19-covid-confirmed.csv filter=lfs diff=lfs merge=lfs -text
 data/worldmap/country_centroids.csv filter=lfs diff=lfs merge=lfs -text
+data/geodata/** filter=lfs diff=lfs merge=lfs -text
diff --git a/.renku/workflow/73781f74e51d4e54bc522007a2030ec2_papermill.cwl b/.renku/workflow/73781f74e51d4e54bc522007a2030ec2_papermill.cwl
new file mode 100644
index 00000000..4b4f9ba2
--- /dev/null
+++ b/.renku/workflow/73781f74e51d4e54bc522007a2030ec2_papermill.cwl
@@ -0,0 +1,115 @@
+arguments: []
+baseCommand:
+- papermill
+class: CommandLineTool
+cwlVersion: v1.0
+hints: []
+inputs:
+  input_1:
+    default: ts_folder
+    inputBinding:
+      position: 1
+      prefix: -p
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+  input_2:
+    default:
+      class: Directory
+      listing: []
+      path: ../../data/covid-19_jhu-csse
+    inputBinding:
+      position: 2
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: Directory
+  input_3:
+    default: worldmap_path
+    inputBinding:
+      position: 3
+      prefix: -p
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+  input_4:
+    default:
+      class: File
+      path: ../../data/worldmap/country_centroids.csv
+    inputBinding:
+      position: 4
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: File
+  input_5:
+    default: out_folder
+    inputBinding:
+      position: 5
+      prefix: -p
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+  input_6:
+    default: data/geodata
+    inputBinding:
+      position: 6
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+  input_7:
+    default:
+      class: File
+      path: ../../notebooks/CompileGeoData.ipynb
+    inputBinding:
+      position: 7
+      prefix: --inject-paths
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: File
+  input_8:
+    default: runs/CompileGeoData.run.ipynb
+    inputBinding:
+      position: 8
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+outputs:
+  output_0:
+    outputBinding:
+      glob: $(inputs.input_8)
+    streamable: false
+    type: File
+  output_1:
+    outputBinding:
+      glob: $(inputs.input_6)
+    streamable: false
+    type: Directory
+permanentFailCodes: []
+requirements:
+- class: InlineJavascriptRequirement
+- class: InitialWorkDirRequirement
+  listing:
+  - entry: '$({"listing": [], "class": "Directory"})'
+    entryname: runs
+    writable: true
+  - entry: '$({"listing": [], "class": "Directory"})'
+    entryname: data/geodata
+    writable: true
+  - entry: $(inputs.input_2)
+    entryname: data/covid-19_jhu-csse
+    writable: false
+  - entry: $(inputs.input_4)
+    entryname: data/worldmap/country_centroids.csv
+    writable: false
+  - entry: $(inputs.input_7)
+    entryname: notebooks/CompileGeoData.ipynb
+    writable: false
+successCodes: []
+temporaryFailCodes: []
diff --git a/data/geodata/geo_data.csv b/data/geodata/geo_data.csv
new file mode 100644
index 00000000..37d63a78
--- /dev/null
+++ b/data/geodata/geo_data.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:10381a7c8b37736eb65cf9c510c8a3fb3cdbd11b3c3c7a53f5a1d5a9e8e5074a
+size 34026
diff --git a/runs/CompileGeoData.run.ipynb b/runs/CompileGeoData.run.ipynb
new file mode 100644
index 00000000..3d7acfa8
--- /dev/null
+++ b/runs/CompileGeoData.run.ipynb
@@ -0,0 +1,481 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.023649,
+     "end_time": "2020-03-15T15:40:45.758452",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:45.734803",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "# Extract the Geographic Info\n",
+    "\n",
+    "Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "papermill": {
+     "duration": 0.297633,
+     "end_time": "2020-03-15T15:40:46.067382",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:45.769749",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "papermill": {
+     "duration": 0.018676,
+     "end_time": "2020-03-15T15:40:46.104734",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.086058",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "ts_folder = \"../data/covid-19_jhu-csse/\"\n",
+    "worldmap_path = \"../data/worldmap/country_centroids.csv\"\n",
+    "out_folder = None\n",
+    "PAPERMILL_OUTPUT_PATH = None"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.008977,
+     "end_time": "2020-03-15T15:40:46.123525",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.114548",
+     "status": "completed"
+    },
+    "tags": [
+     "parameters"
+    ]
+   },
+   "source": [
+    "## Read in JHU CSSE data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "papermill": {
+     "duration": 0.021077,
+     "end_time": "2020-03-15T15:40:46.152428",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.131351",
+     "status": "completed"
+    },
+    "tags": [
+     "injected-parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Parameters\n",
+    "PAPERMILL_INPUT_PATH = \"notebooks/CompileGeoData.ipynb\"\n",
+    "PAPERMILL_OUTPUT_PATH = \"runs/CompileGeoData.run.ipynb\"\n",
+    "ts_folder = \"./data/covid-19_jhu-csse/\"\n",
+    "worldmap_path = \"./data/worldmap/country_centroids.csv\"\n",
+    "out_folder = \"./data/geodata/\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "papermill": {
+     "duration": 0.031105,
+     "end_time": "2020-03-15T15:40:46.195383",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.164278",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def read_jhu_covid_region_df(name):\n",
+    "    filename = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n",
+    "    df = pd.read_csv(filename)\n",
+    "    df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n",
+    "    df.columns = pd.to_datetime(df.columns)\n",
+    "    region_df = df.groupby(level='Country/Region').sum()\n",
+    "    return region_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "papermill": {
+     "duration": 0.064486,
+     "end_time": "2020-03-15T15:40:46.279720",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.215234",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "confirmed_df = read_jhu_covid_region_df(\"Confirmed\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.009811,
+     "end_time": "2020-03-15T15:40:46.307002",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.297191",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "# Read in Harvard country centroids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "papermill": {
+     "duration": 0.047941,
+     "end_time": "2020-03-15T15:40:46.362544",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.314603",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "country_centroids_df = pd.read_csv(worldmap_path)\n",
+    "country_centroids_df = country_centroids_df[['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude']]\n",
+    "country_centroids_df['name_jhu'] = country_centroids_df['name_long'] "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "papermill": {
+     "duration": 0.046373,
+     "end_time": "2020-03-15T15:40:46.423601",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.377228",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est',\n",
+       "       'gdp_md_est', 'income_grp', 'Longitude', 'Latitude', 'name_jhu'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "country_centroids_df.columns"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.009953,
+     "end_time": "2020-03-15T15:40:46.450454",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.440501",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "Fix names that differ between JHU CSSE and Harvard data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "papermill": {
+     "duration": 0.029858,
+     "end_time": "2020-03-15T15:40:46.488135",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.458277",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "region_hd_jhu_map = {\n",
+    "     'Brunei Darussalam': 'Brunei',\n",
+    "     \"Côte d'Ivoire\": \"Cote d'Ivoire\",\n",
+    "     'Czech Republic': 'Czechia',\n",
+    "     'Hong Kong': 'Hong Kong SAR',\n",
+    "     'Republic of Korea': 'Korea, South',\n",
+    "     'Macao': 'Macao SAR',\n",
+    "     'Russian Federation': 'Russia',\n",
+    "     'Taiwan': 'Taiwan*',\n",
+    "     'United States': 'US'\n",
+    "}\n",
+    "country_centroids_df['name_jhu'] = country_centroids_df['name_jhu'].replace(region_hd_jhu_map)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "papermill": {
+     "duration": 0.024985,
+     "end_time": "2020-03-15T15:40:46.527221",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.502236",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Use this to find the name in the series\n",
+    "# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.010235,
+     "end_time": "2020-03-15T15:40:46.557408",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.547173",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "There are some regions that we cannot resolve, but we will just ignore these."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "papermill": {
+     "duration": 0.044948,
+     "end_time": "2020-03-15T15:40:46.610278",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.565330",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>2020-03-12</th>\n",
+       "      <th>2020-03-13</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Country/Region</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Congo (Kinshasa)</th>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Cruise Ship</th>\n",
+       "      <td>696</td>\n",
+       "      <td>696</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>French Guiana</th>\n",
+       "      <td>5</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Guadeloupe</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Holy See</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Martinique</th>\n",
+       "      <td>3</td>\n",
+       "      <td>3</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>North Macedonia</th>\n",
+       "      <td>7</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Reunion</th>\n",
+       "      <td>1</td>\n",
+       "      <td>5</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                  2020-03-12  2020-03-13\n",
+       "Country/Region                          \n",
+       "Congo (Kinshasa)           1           2\n",
+       "Cruise Ship              696         696\n",
+       "French Guiana              5           5\n",
+       "Guadeloupe                 0           1\n",
+       "Holy See                   1           1\n",
+       "Martinique                 3           3\n",
+       "North Macedonia            7          14\n",
+       "Reunion                    1           5"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "confirmed_df.loc[\n",
+    "    (confirmed_df.index.isin(country_centroids_df['name_jhu']) == False)\n",
+    "].iloc[:,-2:]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.010609,
+     "end_time": "2020-03-15T15:40:46.641179",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.630570",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "# Save the result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {
+    "papermill": {
+     "duration": 0.036281,
+     "end_time": "2020-03-15T15:40:46.685590",
+     "exception": false,
+     "start_time": "2020-03-15T15:40:46.649309",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "if PAPERMILL_OUTPUT_PATH:\n",
+    "    out_path = os.path.join(out_folder, f\"geo_data.csv\")\n",
+    "    country_centroids_df.to_csv(out_path)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  },
+  "papermill": {
+   "duration": 2.13544,
+   "end_time": "2020-03-15T15:40:47.019858",
+   "environment_variables": {},
+   "exception": null,
+   "input_path": "notebooks/CompileGeoData.ipynb",
+   "output_path": "runs/CompileGeoData.run.ipynb",
+   "parameters": {
+    "PAPERMILL_INPUT_PATH": "notebooks/CompileGeoData.ipynb",
+    "PAPERMILL_OUTPUT_PATH": "runs/CompileGeoData.run.ipynb",
+    "out_folder": "./data/geodata/",
+    "ts_folder": "./data/covid-19_jhu-csse/",
+    "worldmap_path": "./data/worldmap/country_centroids.csv"
+   },
+   "start_time": "2020-03-15T15:40:44.884418",
+   "version": "1.1.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
-- 
GitLab