From c47b83e6dcc95fa8fbfb422e23126839e15f5edf Mon Sep 17 00:00:00 2001 From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch> Date: Sun, 15 Mar 2020 15:40:47 +0000 Subject: [PATCH] renku run papermill -p ts_folder ./data/covid-19_jhu-csse/ -p worldmap_path ./data/worldmap/country_centroids.csv -p out_folder ./data/geodata/ --inject-paths notebooks/CompileGeoData.ipynb runs/CompileGeoData.run.ipynb --- .gitattributes | 1 + ...1f74e51d4e54bc522007a2030ec2_papermill.cwl | 115 +++++ data/geodata/geo_data.csv | 3 + runs/CompileGeoData.run.ipynb | 481 ++++++++++++++++++ 4 files changed, 600 insertions(+) create mode 100644 .renku/workflow/73781f74e51d4e54bc522007a2030ec2_papermill.cwl create mode 100644 data/geodata/geo_data.csv create mode 100644 runs/CompileGeoData.run.ipynb diff --git a/.gitattributes b/.gitattributes index 4243bf16..9cc936b0 100644 --- a/.gitattributes +++ b/.gitattributes @@ -7,3 +7,4 @@ data/covid-19_rates/ts_rates_19-covid-deaths.csv filter=lfs diff=lfs merge=lfs - data/covid-19_rates/ts_rates_19-covid-recovered.csv filter=lfs diff=lfs merge=lfs -text data/covid-19_rates/ts_rates_19-covid-confirmed.csv filter=lfs diff=lfs merge=lfs -text data/worldmap/country_centroids.csv filter=lfs diff=lfs merge=lfs -text +data/geodata/** filter=lfs diff=lfs merge=lfs -text diff --git a/.renku/workflow/73781f74e51d4e54bc522007a2030ec2_papermill.cwl b/.renku/workflow/73781f74e51d4e54bc522007a2030ec2_papermill.cwl new file mode 100644 index 00000000..4b4f9ba2 --- /dev/null +++ b/.renku/workflow/73781f74e51d4e54bc522007a2030ec2_papermill.cwl @@ -0,0 +1,115 @@ +arguments: [] +baseCommand: +- papermill +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: ts_folder + inputBinding: + position: 1 + prefix: -p + separate: true + shellQuote: true + streamable: false + type: string + input_2: + default: + class: Directory + listing: [] + path: ../../data/covid-19_jhu-csse + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: Directory + input_3: + default: worldmap_path + inputBinding: + position: 3 + prefix: -p + separate: true + shellQuote: true + streamable: false + type: string + input_4: + default: + class: File + path: ../../data/worldmap/country_centroids.csv + inputBinding: + position: 4 + separate: true + shellQuote: true + streamable: false + type: File + input_5: + default: out_folder + inputBinding: + position: 5 + prefix: -p + separate: true + shellQuote: true + streamable: false + type: string + input_6: + default: data/geodata + inputBinding: + position: 6 + separate: true + shellQuote: true + streamable: false + type: string + input_7: + default: + class: File + path: ../../notebooks/CompileGeoData.ipynb + inputBinding: + position: 7 + prefix: --inject-paths + separate: true + shellQuote: true + streamable: false + type: File + input_8: + default: runs/CompileGeoData.run.ipynb + inputBinding: + position: 8 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: $(inputs.input_8) + streamable: false + type: File + output_1: + outputBinding: + glob: $(inputs.input_6) + streamable: false + type: Directory +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: runs + writable: true + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/geodata + writable: true + - entry: $(inputs.input_2) + entryname: data/covid-19_jhu-csse + writable: false + - entry: $(inputs.input_4) + entryname: data/worldmap/country_centroids.csv + writable: false + - entry: $(inputs.input_7) + entryname: notebooks/CompileGeoData.ipynb + writable: false +successCodes: [] +temporaryFailCodes: [] diff --git a/data/geodata/geo_data.csv b/data/geodata/geo_data.csv new file mode 100644 index 00000000..37d63a78 --- /dev/null +++ b/data/geodata/geo_data.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10381a7c8b37736eb65cf9c510c8a3fb3cdbd11b3c3c7a53f5a1d5a9e8e5074a +size 34026 diff --git a/runs/CompileGeoData.run.ipynb b/runs/CompileGeoData.run.ipynb new file mode 100644 index 00000000..3d7acfa8 --- /dev/null +++ b/runs/CompileGeoData.run.ipynb @@ -0,0 +1,481 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.023649, + "end_time": "2020-03-15T15:40:45.758452", + "exception": false, + "start_time": "2020-03-15T15:40:45.734803", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Extract the Geographic Info\n", + "\n", + "Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "papermill": { + "duration": 0.297633, + "end_time": "2020-03-15T15:40:46.067382", + "exception": false, + "start_time": "2020-03-15T15:40:45.769749", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "papermill": { + "duration": 0.018676, + "end_time": "2020-03-15T15:40:46.104734", + "exception": false, + "start_time": "2020-03-15T15:40:46.086058", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "ts_folder = \"../data/covid-19_jhu-csse/\"\n", + "worldmap_path = \"../data/worldmap/country_centroids.csv\"\n", + "out_folder = None\n", + "PAPERMILL_OUTPUT_PATH = None" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.008977, + "end_time": "2020-03-15T15:40:46.123525", + "exception": false, + "start_time": "2020-03-15T15:40:46.114548", + "status": "completed" + }, + "tags": [ + "parameters" + ] + }, + "source": [ + "## Read in JHU CSSE data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "papermill": { + "duration": 0.021077, + "end_time": "2020-03-15T15:40:46.152428", + "exception": false, + "start_time": "2020-03-15T15:40:46.131351", + "status": "completed" + }, + "tags": [ + "injected-parameters" + ] + }, + "outputs": [], + "source": [ + "# Parameters\n", + "PAPERMILL_INPUT_PATH = \"notebooks/CompileGeoData.ipynb\"\n", + "PAPERMILL_OUTPUT_PATH = \"runs/CompileGeoData.run.ipynb\"\n", + "ts_folder = \"./data/covid-19_jhu-csse/\"\n", + "worldmap_path = \"./data/worldmap/country_centroids.csv\"\n", + "out_folder = \"./data/geodata/\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "papermill": { + "duration": 0.031105, + "end_time": "2020-03-15T15:40:46.195383", + "exception": false, + "start_time": "2020-03-15T15:40:46.164278", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def read_jhu_covid_region_df(name):\n", + " filename = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n", + " df = pd.read_csv(filename)\n", + " df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n", + " df.columns = pd.to_datetime(df.columns)\n", + " region_df = df.groupby(level='Country/Region').sum()\n", + " return region_df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "papermill": { + "duration": 0.064486, + "end_time": "2020-03-15T15:40:46.279720", + "exception": false, + "start_time": "2020-03-15T15:40:46.215234", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "confirmed_df = read_jhu_covid_region_df(\"Confirmed\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.009811, + "end_time": "2020-03-15T15:40:46.307002", + "exception": false, + "start_time": "2020-03-15T15:40:46.297191", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Read in Harvard country centroids" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "papermill": { + "duration": 0.047941, + "end_time": "2020-03-15T15:40:46.362544", + "exception": false, + "start_time": "2020-03-15T15:40:46.314603", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "country_centroids_df = pd.read_csv(worldmap_path)\n", + "country_centroids_df = country_centroids_df[['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude']]\n", + "country_centroids_df['name_jhu'] = country_centroids_df['name_long'] " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "papermill": { + "duration": 0.046373, + "end_time": "2020-03-15T15:40:46.423601", + "exception": false, + "start_time": "2020-03-15T15:40:46.377228", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est',\n", + " 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude', 'name_jhu'],\n", + " dtype='object')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "country_centroids_df.columns" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.009953, + "end_time": "2020-03-15T15:40:46.450454", + "exception": false, + "start_time": "2020-03-15T15:40:46.440501", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Fix names that differ between JHU CSSE and Harvard data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "papermill": { + "duration": 0.029858, + "end_time": "2020-03-15T15:40:46.488135", + "exception": false, + "start_time": "2020-03-15T15:40:46.458277", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "region_hd_jhu_map = {\n", + " 'Brunei Darussalam': 'Brunei',\n", + " \"Côte d'Ivoire\": \"Cote d'Ivoire\",\n", + " 'Czech Republic': 'Czechia',\n", + " 'Hong Kong': 'Hong Kong SAR',\n", + " 'Republic of Korea': 'Korea, South',\n", + " 'Macao': 'Macao SAR',\n", + " 'Russian Federation': 'Russia',\n", + " 'Taiwan': 'Taiwan*',\n", + " 'United States': 'US'\n", + "}\n", + "country_centroids_df['name_jhu'] = country_centroids_df['name_jhu'].replace(region_hd_jhu_map)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "papermill": { + "duration": 0.024985, + "end_time": "2020-03-15T15:40:46.527221", + "exception": false, + "start_time": "2020-03-15T15:40:46.502236", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Use this to find the name in the series\n", + "# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.010235, + "end_time": "2020-03-15T15:40:46.557408", + "exception": false, + "start_time": "2020-03-15T15:40:46.547173", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "There are some regions that we cannot resolve, but we will just ignore these." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "papermill": { + "duration": 0.044948, + "end_time": "2020-03-15T15:40:46.610278", + "exception": false, + "start_time": "2020-03-15T15:40:46.565330", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>2020-03-12</th>\n", + " <th>2020-03-13</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Country/Region</th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>Congo (Kinshasa)</th>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Cruise Ship</th>\n", + " <td>696</td>\n", + " <td>696</td>\n", + " </tr>\n", + " <tr>\n", + " <th>French Guiana</th>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Guadeloupe</th>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Holy See</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Martinique</th>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>North Macedonia</th>\n", + " <td>7</td>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Reunion</th>\n", + " <td>1</td>\n", + " <td>5</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " 2020-03-12 2020-03-13\n", + "Country/Region \n", + "Congo (Kinshasa) 1 2\n", + "Cruise Ship 696 696\n", + "French Guiana 5 5\n", + "Guadeloupe 0 1\n", + "Holy See 1 1\n", + "Martinique 3 3\n", + "North Macedonia 7 14\n", + "Reunion 1 5" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "confirmed_df.loc[\n", + " (confirmed_df.index.isin(country_centroids_df['name_jhu']) == False)\n", + "].iloc[:,-2:]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.010609, + "end_time": "2020-03-15T15:40:46.641179", + "exception": false, + "start_time": "2020-03-15T15:40:46.630570", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Save the result" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "papermill": { + "duration": 0.036281, + "end_time": "2020-03-15T15:40:46.685590", + "exception": false, + "start_time": "2020-03-15T15:40:46.649309", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "if PAPERMILL_OUTPUT_PATH:\n", + " out_path = os.path.join(out_folder, f\"geo_data.csv\")\n", + " country_centroids_df.to_csv(out_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "papermill": { + "duration": 2.13544, + "end_time": "2020-03-15T15:40:47.019858", + "environment_variables": {}, + "exception": null, + "input_path": "notebooks/CompileGeoData.ipynb", + "output_path": "runs/CompileGeoData.run.ipynb", + "parameters": { + "PAPERMILL_INPUT_PATH": "notebooks/CompileGeoData.ipynb", + "PAPERMILL_OUTPUT_PATH": "runs/CompileGeoData.run.ipynb", + "out_folder": "./data/geodata/", + "ts_folder": "./data/covid-19_jhu-csse/", + "worldmap_path": "./data/worldmap/country_centroids.csv" + }, + "start_time": "2020-03-15T15:40:44.884418", + "version": "1.1.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file -- GitLab