diff --git a/.gitattributes b/.gitattributes index d7d532ecd9b5561134ea6debb2e565af16445ebd..9afe52d034d59b89e2bbd51ff6fafb187a426869 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2,3 +2,4 @@ data/covid-19_jhu-csse/time_series_19-covid-Confirmed.csv filter=lfs diff=lfs me data/covid-19_jhu-csse/time_series_19-covid-Deaths.csv filter=lfs diff=lfs merge=lfs -text data/covid-19_jhu-csse/time_series_19-covid-Recovered.csv filter=lfs diff=lfs merge=lfs -text data/worldbank/SP.POP.TOTL.zip filter=lfs diff=lfs merge=lfs -text +data/covid-19_rates/** filter=lfs diff=lfs merge=lfs -text diff --git a/.renku/workflow/a38f8d703e0c4c55a2e3f49bbf15466e_papermill.cwl b/.renku/workflow/a38f8d703e0c4c55a2e3f49bbf15466e_papermill.cwl new file mode 100644 index 0000000000000000000000000000000000000000..ee8c3ca5ce03d4f3c0ec3a8f6aefcba58f7aa740 --- /dev/null +++ b/.renku/workflow/a38f8d703e0c4c55a2e3f49bbf15466e_papermill.cwl @@ -0,0 +1,115 @@ +arguments: [] +baseCommand: +- papermill +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: ts_folder + inputBinding: + position: 1 + prefix: -p + separate: true + shellQuote: true + streamable: false + type: string + input_2: + default: + class: Directory + listing: [] + path: ../../data/covid-19_jhu-csse + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: Directory + input_3: + default: wb_path + inputBinding: + position: 3 + prefix: -p + separate: true + shellQuote: true + streamable: false + type: string + input_4: + default: + class: File + path: ../../data/worldbank/SP.POP.TOTL.zip + inputBinding: + position: 4 + separate: true + shellQuote: true + streamable: false + type: File + input_5: + default: out_folder + inputBinding: + position: 5 + prefix: -p + separate: true + shellQuote: true + streamable: false + type: string + input_6: + default: data/covid-19_rates + inputBinding: + position: 6 + separate: true + shellQuote: true + streamable: false + type: string + input_7: + default: + class: File + path: ../../notebooks/ToRates.ipynb + inputBinding: + position: 7 + prefix: --inject-paths + separate: true + shellQuote: true + streamable: false + type: File + input_8: + default: runs/ToRates.run.ipynb + inputBinding: + position: 8 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: $(inputs.input_8) + streamable: false + type: File + output_1: + outputBinding: + glob: $(inputs.input_6) + streamable: false + type: Directory +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: runs + writable: true + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/covid-19_rates + writable: true + - entry: $(inputs.input_2) + entryname: data/covid-19_jhu-csse + writable: false + - entry: $(inputs.input_4) + entryname: data/worldbank/SP.POP.TOTL.zip + writable: false + - entry: $(inputs.input_7) + entryname: notebooks/ToRates.ipynb + writable: false +successCodes: [] +temporaryFailCodes: [] diff --git a/data/covid-19_rates/ts_rates_19-covid-confirmed.csv b/data/covid-19_rates/ts_rates_19-covid-confirmed.csv new file mode 100644 index 0000000000000000000000000000000000000000..68469b73899f1a13e5dfc305c01e517f2372300f --- /dev/null +++ b/data/covid-19_rates/ts_rates_19-covid-confirmed.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:953941d2e4414e15f0e840d13c806b56d0cfda23f402d2d8daf840ef9e6ba666 +size 55521 diff --git a/data/covid-19_rates/ts_rates_19-covid-deaths.csv b/data/covid-19_rates/ts_rates_19-covid-deaths.csv new file mode 100644 index 0000000000000000000000000000000000000000..927fd93f7b7d3b87e3dc685bc8e283c4d007d41d --- /dev/null +++ b/data/covid-19_rates/ts_rates_19-covid-deaths.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d07625575b61e275be3ba1b85e04dc96984df348680d2c3f5866a4443144138 +size 28842 diff --git a/data/covid-19_rates/ts_rates_19-covid-recovered.csv b/data/covid-19_rates/ts_rates_19-covid-recovered.csv new file mode 100644 index 0000000000000000000000000000000000000000..c35fa7b034cd61892a7d26c0bb897288662cf439 --- /dev/null +++ b/data/covid-19_rates/ts_rates_19-covid-recovered.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eae1d5842d547e04eec2706bb61a5f5c019f244150dbab899c675ad67f166524 +size 38957 diff --git a/runs/ToRates.run.ipynb b/runs/ToRates.run.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..2925f4eefd7a6bff1c917d01551b49198f9d0e50 --- /dev/null +++ b/runs/ToRates.run.ipynb @@ -0,0 +1,827 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.019737, + "end_time": "2020-03-13T14:54:48.276145", + "exception": false, + "start_time": "2020-03-13T14:54:48.256408", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Convert Series to Rates per 100,000" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "papermill": { + "duration": 0.327112, + "end_time": "2020-03-13T14:54:48.613995", + "exception": false, + "start_time": "2020-03-13T14:54:48.286883", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "papermill": { + "duration": 0.025647, + "end_time": "2020-03-13T14:54:48.658686", + "exception": false, + "start_time": "2020-03-13T14:54:48.633039", + "status": "completed" + }, + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "ts_folder = \"../data/covid-19_jhu-csse/\"\n", + "wb_path = \"../data/worldbank/SP.POP.TOTL.zip\"\n", + "out_folder = None\n", + "PAPERMILL_OUTPUT_PATH = None" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "papermill": { + "duration": 0.027378, + "end_time": "2020-03-13T14:54:48.697099", + "exception": false, + "start_time": "2020-03-13T14:54:48.669721", + "status": "completed" + }, + "tags": [ + "injected-parameters" + ] + }, + "outputs": [], + "source": [ + "# Parameters\n", + "PAPERMILL_INPUT_PATH = \"notebooks/ToRates.ipynb\"\n", + "PAPERMILL_OUTPUT_PATH = \"runs/ToRates.run.ipynb\"\n", + "ts_folder = \"./data/covid-19_jhu-csse/\"\n", + "wb_path = \"./data/worldbank/SP.POP.TOTL.zip\"\n", + "out_folder = \"./data/covid-19_rates/\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.009984, + "end_time": "2020-03-13T14:54:48.724400", + "exception": false, + "start_time": "2020-03-13T14:54:48.714416", + "status": "completed" + }, + "tags": [ + "parameters" + ] + }, + "source": [ + "## Read in JHU CSSE data\n", + "\n", + "I will switch to [xarray](http://xarray.pydata.org/en/stable/), but ATM, it's easier like this..." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "papermill": { + "duration": 0.044299, + "end_time": "2020-03-13T14:54:48.778200", + "exception": false, + "start_time": "2020-03-13T14:54:48.733901", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def read_jhu_covid_region_df(name):\n", + " filename = os.path.join(ts_folder, f\"time_series_19-covid-{name}.csv\")\n", + " df = pd.read_csv(filename)\n", + " df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])\n", + " df.columns = pd.to_datetime(df.columns)\n", + " region_df = df.groupby(level='Country/Region').sum()\n", + " loc_df = df.reset_index([2,3]).groupby(level='Country/Region').mean()[['Long', 'Lat']]\n", + " return region_df.join(loc_df).set_index(['Long', 'Lat'], append=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "papermill": { + "duration": 0.126546, + "end_time": "2020-03-13T14:54:48.922552", + "exception": false, + "start_time": "2020-03-13T14:54:48.796006", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "frames_map = {\n", + " \"confirmed\": read_jhu_covid_region_df(\"Confirmed\"),\n", + " \"deaths\": read_jhu_covid_region_df(\"Deaths\"),\n", + " \"recovered\": read_jhu_covid_region_df(\"Recovered\")\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "papermill": { + "duration": 0.05437, + "end_time": "2020-03-13T14:54:48.984940", + "exception": false, + "start_time": "2020-03-13T14:54:48.930570", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th>2020-01-22</th>\n", + " <th>2020-01-23</th>\n", + " <th>2020-01-24</th>\n", + " <th>2020-01-25</th>\n", + " <th>2020-01-26</th>\n", + " <th>2020-01-27</th>\n", + " <th>2020-01-28</th>\n", + " <th>2020-01-29</th>\n", + " <th>2020-01-30</th>\n", + " <th>2020-01-31</th>\n", + " <th>...</th>\n", + " <th>2020-03-01</th>\n", + " <th>2020-03-02</th>\n", + " <th>2020-03-03</th>\n", + " <th>2020-03-04</th>\n", + " <th>2020-03-05</th>\n", + " <th>2020-03-06</th>\n", + " <th>2020-03-07</th>\n", + " <th>2020-03-08</th>\n", + " <th>2020-03-09</th>\n", + " <th>2020-03-10</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Country/Region</th>\n", + " <th>Long</th>\n", + " <th>Lat</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>Mainland China</th>\n", + " <th>111.542903</th>\n", + " <th>33.406936</th>\n", + " <td>547</td>\n", + " <td>639</td>\n", + " <td>916</td>\n", + " <td>1399</td>\n", + " <td>2062</td>\n", + " <td>2863</td>\n", + " <td>5494</td>\n", + " <td>6070</td>\n", + " <td>8124</td>\n", + " <td>9783</td>\n", + " <td>...</td>\n", + " <td>79826</td>\n", + " <td>80026</td>\n", + " <td>80151</td>\n", + " <td>80271</td>\n", + " <td>80422</td>\n", + " <td>80573</td>\n", + " <td>80652</td>\n", + " <td>80699</td>\n", + " <td>80735</td>\n", + " <td>80757</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Italy</th>\n", + " <th>12.000000</th>\n", + " <th>43.000000</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>...</td>\n", + " <td>1694</td>\n", + " <td>2036</td>\n", + " <td>2502</td>\n", + " <td>3089</td>\n", + " <td>3858</td>\n", + " <td>4636</td>\n", + " <td>5883</td>\n", + " <td>7375</td>\n", + " <td>9172</td>\n", + " <td>10149</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Iran (Islamic Republic of)</th>\n", + " <th>53.000000</th>\n", + " <th>32.000000</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>978</td>\n", + " <td>1501</td>\n", + " <td>2336</td>\n", + " <td>2922</td>\n", + " <td>3513</td>\n", + " <td>4747</td>\n", + " <td>5823</td>\n", + " <td>6566</td>\n", + " <td>7161</td>\n", + " <td>8042</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Republic of Korea</th>\n", + " <th>128.000000</th>\n", + " <th>36.000000</th>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>2</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>4</td>\n", + " <td>11</td>\n", + " <td>...</td>\n", + " <td>3736</td>\n", + " <td>4335</td>\n", + " <td>5186</td>\n", + " <td>5621</td>\n", + " <td>6088</td>\n", + " <td>6593</td>\n", + " <td>7041</td>\n", + " <td>7314</td>\n", + " <td>7478</td>\n", + " <td>7513</td>\n", + " </tr>\n", + " <tr>\n", + " <th>France</th>\n", + " <th>2.000000</th>\n", + " <th>47.000000</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>2</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " <td>4</td>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>5</td>\n", + " <td>...</td>\n", + " <td>130</td>\n", + " <td>191</td>\n", + " <td>204</td>\n", + " <td>285</td>\n", + " <td>377</td>\n", + " <td>653</td>\n", + " <td>949</td>\n", + " <td>1126</td>\n", + " <td>1209</td>\n", + " <td>1784</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 49 columns</p>\n", + "</div>" + ], + "text/plain": [ + " 2020-01-22 2020-01-23 \\\n", + "Country/Region Long Lat \n", + "Mainland China 111.542903 33.406936 547 639 \n", + "Italy 12.000000 43.000000 0 0 \n", + "Iran (Islamic Republic of) 53.000000 32.000000 0 0 \n", + "Republic of Korea 128.000000 36.000000 1 1 \n", + "France 2.000000 47.000000 0 0 \n", + "\n", + " 2020-01-24 2020-01-25 \\\n", + "Country/Region Long Lat \n", + "Mainland China 111.542903 33.406936 916 1399 \n", + "Italy 12.000000 43.000000 0 0 \n", + "Iran (Islamic Republic of) 53.000000 32.000000 0 0 \n", + "Republic of Korea 128.000000 36.000000 2 2 \n", + "France 2.000000 47.000000 2 3 \n", + "\n", + " 2020-01-26 2020-01-27 \\\n", + "Country/Region Long Lat \n", + "Mainland China 111.542903 33.406936 2062 2863 \n", + "Italy 12.000000 43.000000 0 0 \n", + "Iran (Islamic Republic of) 53.000000 32.000000 0 0 \n", + "Republic of Korea 128.000000 36.000000 3 4 \n", + "France 2.000000 47.000000 3 3 \n", + "\n", + " 2020-01-28 2020-01-29 \\\n", + "Country/Region Long Lat \n", + "Mainland China 111.542903 33.406936 5494 6070 \n", + "Italy 12.000000 43.000000 0 0 \n", + "Iran (Islamic Republic of) 53.000000 32.000000 0 0 \n", + "Republic of Korea 128.000000 36.000000 4 4 \n", + "France 2.000000 47.000000 4 5 \n", + "\n", + " 2020-01-30 2020-01-31 ... \\\n", + "Country/Region Long Lat ... \n", + "Mainland China 111.542903 33.406936 8124 9783 ... \n", + "Italy 12.000000 43.000000 0 2 ... \n", + "Iran (Islamic Republic of) 53.000000 32.000000 0 0 ... \n", + "Republic of Korea 128.000000 36.000000 4 11 ... \n", + "France 2.000000 47.000000 5 5 ... \n", + "\n", + " 2020-03-01 2020-03-02 \\\n", + "Country/Region Long Lat \n", + "Mainland China 111.542903 33.406936 79826 80026 \n", + "Italy 12.000000 43.000000 1694 2036 \n", + "Iran (Islamic Republic of) 53.000000 32.000000 978 1501 \n", + "Republic of Korea 128.000000 36.000000 3736 4335 \n", + "France 2.000000 47.000000 130 191 \n", + "\n", + " 2020-03-03 2020-03-04 \\\n", + "Country/Region Long Lat \n", + "Mainland China 111.542903 33.406936 80151 80271 \n", + "Italy 12.000000 43.000000 2502 3089 \n", + "Iran (Islamic Republic of) 53.000000 32.000000 2336 2922 \n", + "Republic of Korea 128.000000 36.000000 5186 5621 \n", + "France 2.000000 47.000000 204 285 \n", + "\n", + " 2020-03-05 2020-03-06 \\\n", + "Country/Region Long Lat \n", + "Mainland China 111.542903 33.406936 80422 80573 \n", + "Italy 12.000000 43.000000 3858 4636 \n", + "Iran (Islamic Republic of) 53.000000 32.000000 3513 4747 \n", + "Republic of Korea 128.000000 36.000000 6088 6593 \n", + "France 2.000000 47.000000 377 653 \n", + "\n", + " 2020-03-07 2020-03-08 \\\n", + "Country/Region Long Lat \n", + "Mainland China 111.542903 33.406936 80652 80699 \n", + "Italy 12.000000 43.000000 5883 7375 \n", + "Iran (Islamic Republic of) 53.000000 32.000000 5823 6566 \n", + "Republic of Korea 128.000000 36.000000 7041 7314 \n", + "France 2.000000 47.000000 949 1126 \n", + "\n", + " 2020-03-09 2020-03-10 \n", + "Country/Region Long Lat \n", + "Mainland China 111.542903 33.406936 80735 80757 \n", + "Italy 12.000000 43.000000 9172 10149 \n", + "Iran (Islamic Republic of) 53.000000 32.000000 7161 8042 \n", + "Republic of Korea 128.000000 36.000000 7478 7513 \n", + "France 2.000000 47.000000 1209 1784 \n", + "\n", + "[5 rows x 49 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "frames_map['confirmed'].sort_values(frames_map['confirmed'].columns[-1], ascending=False).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.011038, + "end_time": "2020-03-13T14:54:49.012475", + "exception": false, + "start_time": "2020-03-13T14:54:49.001437", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Read in World Bank data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "papermill": { + "duration": 0.043639, + "end_time": "2020-03-13T14:54:49.064213", + "exception": false, + "start_time": "2020-03-13T14:54:49.020574", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import zipfile\n", + "zf = zipfile.ZipFile(wb_path)\n", + "pop_df = pd.read_csv(zf.open(\"API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv\"), skiprows=4)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.010307, + "end_time": "2020-03-13T14:54:49.091394", + "exception": false, + "start_time": "2020-03-13T14:54:49.081087", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "There is 2018 pop data for all countries/regions except Eritrea" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "papermill": { + "duration": 0.055137, + "end_time": "2020-03-13T14:54:49.154951", + "exception": false, + "start_time": "2020-03-13T14:54:49.099814", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country Name</th>\n", + " <th>Country Code</th>\n", + " <th>Indicator Name</th>\n", + " <th>Indicator Code</th>\n", + " <th>1960</th>\n", + " <th>1961</th>\n", + " <th>1962</th>\n", + " <th>1963</th>\n", + " <th>1964</th>\n", + " <th>1965</th>\n", + " <th>...</th>\n", + " <th>2011</th>\n", + " <th>2012</th>\n", + " <th>2013</th>\n", + " <th>2014</th>\n", + " <th>2015</th>\n", + " <th>2016</th>\n", + " <th>2017</th>\n", + " <th>2018</th>\n", + " <th>2019</th>\n", + " <th>Unnamed: 64</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>67</th>\n", + " <td>Eritrea</td>\n", + " <td>ERI</td>\n", + " <td>Population, total</td>\n", + " <td>SP.POP.TOTL</td>\n", + " <td>1007590.0</td>\n", + " <td>1033328.0</td>\n", + " <td>1060486.0</td>\n", + " <td>1088854.0</td>\n", + " <td>1118159.0</td>\n", + " <td>1148189.0</td>\n", + " <td>...</td>\n", + " <td>3213972.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>108</th>\n", + " <td>Not classified</td>\n", + " <td>INX</td>\n", + " <td>Population, total</td>\n", + " <td>SP.POP.TOTL</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>...</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>2 rows × 65 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Country Name Country Code Indicator Name Indicator Code 1960 \\\n", + "67 Eritrea ERI Population, total SP.POP.TOTL 1007590.0 \n", + "108 Not classified INX Population, total SP.POP.TOTL NaN \n", + "\n", + " 1961 1962 1963 1964 1965 ... 2011 \\\n", + "67 1033328.0 1060486.0 1088854.0 1118159.0 1148189.0 ... 3213972.0 \n", + "108 NaN NaN NaN NaN NaN ... NaN \n", + "\n", + " 2012 2013 2014 2015 2016 2017 2018 2019 Unnamed: 64 \n", + "67 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "108 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", + "\n", + "[2 rows x 65 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pop_df[pd.isna(pop_df['2018'])]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.009197, + "end_time": "2020-03-13T14:54:49.179422", + "exception": false, + "start_time": "2020-03-13T14:54:49.170225", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "Fix the country/region names that differ between the World Bank population data and the JHU CSSE data." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "papermill": { + "duration": 0.048983, + "end_time": "2020-03-13T14:54:49.237289", + "exception": false, + "start_time": "2020-03-13T14:54:49.188306", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "region_wb_jhu_map = {\n", + " 'China': 'Mainland China',\n", + " 'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',\n", + " 'Korea, Rep.': 'Republic of Korea',\n", + " 'United States': 'US',\n", + " 'United Kingdom': 'UK',\n", + " 'Hong Kong SAR, China': 'Hong Kong SAR',\n", + " 'Egypt, Arab Rep.': 'Egypt',\n", + " 'Vietnam': 'Viet Nam',\n", + " 'Macao SAR, China': 'Macao SAR',\n", + " 'Slovak Republic': 'Slovakia',\n", + " 'Moldova': 'Republic of Moldova',\n", + " 'St. Martin (French part)': 'Saint Martin',\n", + " 'Brunei Darussalam': 'Brunei'\n", + "}\n", + "current_pop_ser = pop_df[['Country Name', '2018']].copy().replace(region_wb_jhu_map).set_index('Country Name')['2018']\n", + "data_pop_ser = current_pop_ser[current_pop_ser.index.isin(frames_map['confirmed'].index.levels[0])]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.010552, + "end_time": "2020-03-13T14:54:49.263442", + "exception": false, + "start_time": "2020-03-13T14:54:49.252890", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "There are some regions that we cannot resolve, but we will just ignore these." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.008224, + "end_time": "2020-03-13T14:54:49.280239", + "exception": false, + "start_time": "2020-03-13T14:54:49.272015", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Compute rates per 100,000 for regions" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "papermill": { + "duration": 0.081057, + "end_time": "2020-03-13T14:54:49.370031", + "exception": false, + "start_time": "2020-03-13T14:54:49.288974", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def cases_to_rates_df(df):\n", + " per_100000_df = df.reset_index([1, 2], drop=True)\n", + " per_100000_df = per_100000_df.div(data_pop_ser, 'index').mul(100000).dropna()\n", + " per_100000_df.index.name = 'Country/Region'\n", + " return per_100000_df\n", + " \n", + "def frames_to_rates(frames_map):\n", + " return {k: cases_to_rates_df(v) for k,v in frames_map.items()}\n", + "\n", + "\n", + "rates_map = frames_to_rates(frames_map)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "papermill": { + "duration": 0.065887, + "end_time": "2020-03-13T14:54:49.458887", + "exception": false, + "start_time": "2020-03-13T14:54:49.393000", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "if PAPERMILL_OUTPUT_PATH:\n", + " for k, v in rates_map.items():\n", + " out_path = os.path.join(out_folder, f\"ts_rates_19-covid-{k}.csv\")\n", + " v.reset_index().to_csv(out_path)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "papermill": { + "duration": 2.37776, + "end_time": "2020-03-13T14:54:49.786899", + "environment_variables": {}, + "exception": null, + "input_path": "notebooks/ToRates.ipynb", + "output_path": "runs/ToRates.run.ipynb", + "parameters": { + "PAPERMILL_INPUT_PATH": "notebooks/ToRates.ipynb", + "PAPERMILL_OUTPUT_PATH": "runs/ToRates.run.ipynb", + "out_folder": "./data/covid-19_rates/", + "ts_folder": "./data/covid-19_jhu-csse/", + "wb_path": "./data/worldbank/SP.POP.TOTL.zip" + }, + "start_time": "2020-03-13T14:54:47.409139", + "version": "1.1.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file