From c716f42a4d21119f7533d93459ed56fab1c67f00 Mon Sep 17 00:00:00 2001
From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch>
Date: Wed, 18 Mar 2020 21:45:37 +0000
Subject: [PATCH] renku run papermill -p out_folder ./data/covidtracking/
 --inject-paths notebooks/process/download-covidtracking-data.ipynb
 runs/download-covidtracking-data.runs.ipynb

---
 .gitattributes                                |   1 +
 ...474d38144c5d882c0036bd1059c2_papermill.cwl |  70 +++
 data/covidtracking/states-daily.json          |   3 +
 data/covidtracking/states-metadata.json       |   3 +
 runs/download-covidtracking-data.runs.ipynb   | 426 ++++++++++++++++++
 5 files changed, 503 insertions(+)
 create mode 100644 .renku/workflow/ff5f474d38144c5d882c0036bd1059c2_papermill.cwl
 create mode 100644 data/covidtracking/states-daily.json
 create mode 100644 data/covidtracking/states-metadata.json
 create mode 100644 runs/download-covidtracking-data.runs.ipynb

diff --git a/.gitattributes b/.gitattributes
index 5deb648b..5a581cd3 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -16,3 +16,4 @@ data/openzh-covid-19/COVID19_Fallzahlen_Kanton_TG_total.csv filter=lfs diff=lfs
 data/openzh-covid-19/COVID19_Fallzahlen_Kanton_BL_total.csv filter=lfs diff=lfs merge=lfs -text
 data/openzh-covid-19/COVID19_Fallzahlen_Kanton_ZH_total.csv filter=lfs diff=lfs merge=lfs -text
 data/openzh-covid-19/COVID19_Fallzahlen_Kanton_BE_total.csv filter=lfs diff=lfs merge=lfs -text
+data/covidtracking/** filter=lfs diff=lfs merge=lfs -text
diff --git a/.renku/workflow/ff5f474d38144c5d882c0036bd1059c2_papermill.cwl b/.renku/workflow/ff5f474d38144c5d882c0036bd1059c2_papermill.cwl
new file mode 100644
index 00000000..8ab0a3b5
--- /dev/null
+++ b/.renku/workflow/ff5f474d38144c5d882c0036bd1059c2_papermill.cwl
@@ -0,0 +1,70 @@
+arguments: []
+baseCommand:
+- papermill
+class: CommandLineTool
+cwlVersion: v1.0
+hints: []
+inputs:
+  input_1:
+    default: out_folder
+    inputBinding:
+      position: 1
+      prefix: -p
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+  input_2:
+    default: data/covidtracking
+    inputBinding:
+      position: 2
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+  input_3:
+    default:
+      class: File
+      path: ../../notebooks/process/download-covidtracking-data.ipynb
+    inputBinding:
+      position: 3
+      prefix: --inject-paths
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: File
+  input_4:
+    default: runs/download-covidtracking-data.runs.ipynb
+    inputBinding:
+      position: 4
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+outputs:
+  output_0:
+    outputBinding:
+      glob: $(inputs.input_4)
+    streamable: false
+    type: File
+  output_1:
+    outputBinding:
+      glob: $(inputs.input_2)
+    streamable: false
+    type: Directory
+permanentFailCodes: []
+requirements:
+- class: InlineJavascriptRequirement
+- class: InitialWorkDirRequirement
+  listing:
+  - entry: '$({"listing": [], "class": "Directory"})'
+    entryname: runs
+    writable: true
+  - entry: '$({"listing": [], "class": "Directory"})'
+    entryname: data/covidtracking
+    writable: true
+  - entry: $(inputs.input_3)
+    entryname: notebooks/process/download-covidtracking-data.ipynb
+    writable: false
+successCodes: []
+temporaryFailCodes: []
diff --git a/data/covidtracking/states-daily.json b/data/covidtracking/states-daily.json
new file mode 100644
index 00000000..df5a0190
--- /dev/null
+++ b/data/covidtracking/states-daily.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1d3f6f266f353ca5f3e0f78a256b9fe2eba974db337fd67d054d9bd633b17bf
+size 93953
diff --git a/data/covidtracking/states-metadata.json b/data/covidtracking/states-metadata.json
new file mode 100644
index 00000000..7d4bcdac
--- /dev/null
+++ b/data/covidtracking/states-metadata.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff2228b277b9cf60b8a3cb9aac9e1c31aa8cefe455579f28058cec6ae338215a
+size 20123
diff --git a/runs/download-covidtracking-data.runs.ipynb b/runs/download-covidtracking-data.runs.ipynb
new file mode 100644
index 00000000..e759b144
--- /dev/null
+++ b/runs/download-covidtracking-data.runs.ipynb
@@ -0,0 +1,426 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "papermill": {
+     "duration": 0.470882,
+     "end_time": "2020-03-18T21:45:34.449674",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:33.978792",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "import os\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "papermill": {
+     "duration": 0.016909,
+     "end_time": "2020-03-18T21:45:34.477193",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:34.460284",
+     "status": "completed"
+    },
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "out_folder = \"../data/covidtracking/\"\n",
+    "PAPERMILL_OUTPUT_PATH = None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "papermill": {
+     "duration": 0.019263,
+     "end_time": "2020-03-18T21:45:34.503659",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:34.484396",
+     "status": "completed"
+    },
+    "tags": [
+     "injected-parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Parameters\n",
+    "PAPERMILL_INPUT_PATH = \"notebooks/process/download-covidtracking-data.ipynb\"\n",
+    "PAPERMILL_OUTPUT_PATH = \"runs/download-covidtracking-data.runs.ipynb\"\n",
+    "out_folder = \"./data/covidtracking/\"\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.007558,
+     "end_time": "2020-03-18T21:45:34.520426",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:34.512868",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "# Download state metadata\n",
+    "\n",
+    "Download a dataset of URLs for data for each US state and several territories. See [Google Doc](https://docs.google.com/spreadsheets/d/18oVRrHj3c183mHmq3m89_163yuYltLNlOmPerQ18E8w/htmlview?sle=true)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "papermill": {
+     "duration": 1.425345,
+     "end_time": "2020-03-18T21:45:35.952888",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:34.527543",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "url = 'http://covidtracking.com/api/states/info'\n",
+    "r = requests.get(url, allow_redirects=True)\n",
+    "states_metadata_json = r.content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "papermill": {
+     "duration": 0.019056,
+     "end_time": "2020-03-18T21:45:35.983865",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:35.964809",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# save the result\n",
+    "if PAPERMILL_OUTPUT_PATH:\n",
+    "    out_path = os.path.join(out_folder, 'states-metadata.json')\n",
+    "    with open(out_path, 'wb') as f:\n",
+    "        f.write(states_metadata_json)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "papermill": {
+     "duration": 0.057062,
+     "end_time": "2020-03-18T21:45:36.050251",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:35.993189",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "56 states and territories have metadata\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>state</th>\n",
+       "      <th>dataSite</th>\n",
+       "      <th>covid19Site</th>\n",
+       "      <th>twitter</th>\n",
+       "      <th>pui</th>\n",
+       "      <th>pum</th>\n",
+       "      <th>notes</th>\n",
+       "      <th>name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>AK</td>\n",
+       "      <td>http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...</td>\n",
+       "      <td>http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...</td>\n",
+       "      <td>@Alaska_DHSS</td>\n",
+       "      <td>All data</td>\n",
+       "      <td>False</td>\n",
+       "      <td>Unclear if their reported number means \"person...</td>\n",
+       "      <td>Alaska</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>AL</td>\n",
+       "      <td>http://www.alabamapublichealth.gov/infectiousd...</td>\n",
+       "      <td>http://www.alabamapublichealth.gov/infectiousd...</td>\n",
+       "      <td>@alpublichealth</td>\n",
+       "      <td>No data</td>\n",
+       "      <td>False</td>\n",
+       "      <td>Last negative count from 3/16.</td>\n",
+       "      <td>Alabama</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  state                                           dataSite  \\\n",
+       "0    AK  http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...   \n",
+       "1    AL  http://www.alabamapublichealth.gov/infectiousd...   \n",
+       "\n",
+       "                                         covid19Site          twitter  \\\n",
+       "0  http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...     @Alaska_DHSS   \n",
+       "1  http://www.alabamapublichealth.gov/infectiousd...  @alpublichealth   \n",
+       "\n",
+       "        pui    pum                                              notes     name  \n",
+       "0  All data  False  Unclear if their reported number means \"person...   Alaska  \n",
+       "1   No data  False                     Last negative count from 3/16.  Alabama  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "metadata_df = pd.read_json(states_metadata_json)\n",
+    "print(len(metadata_df), \"states and territories have metadata\")\n",
+    "metadata_df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.011241,
+     "end_time": "2020-03-18T21:45:36.070581",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:36.059340",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "# Download daily state data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "papermill": {
+     "duration": 1.154443,
+     "end_time": "2020-03-18T21:45:37.237753",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:36.083310",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "url = 'https://covidtracking.com/api/states/daily'\n",
+    "r = requests.get(url, allow_redirects=True)\n",
+    "states_daily_json = r.content"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "papermill": {
+     "duration": 0.020027,
+     "end_time": "2020-03-18T21:45:37.268813",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:37.248786",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# save the result\n",
+    "if PAPERMILL_OUTPUT_PATH:\n",
+    "    out_path = os.path.join(out_folder, 'states-daily.json')\n",
+    "    with open(out_path, 'wb') as f:\n",
+    "        f.write(states_daily_json)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "papermill": {
+     "duration": 0.055853,
+     "end_time": "2020-03-18T21:45:37.334581",
+     "exception": false,
+     "start_time": "2020-03-18T21:45:37.278728",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "701 data points\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>date</th>\n",
+       "      <th>state</th>\n",
+       "      <th>positive</th>\n",
+       "      <th>negative</th>\n",
+       "      <th>pending</th>\n",
+       "      <th>death</th>\n",
+       "      <th>total</th>\n",
+       "      <th>dateChecked</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>20200318</td>\n",
+       "      <td>AK</td>\n",
+       "      <td>6.0</td>\n",
+       "      <td>406.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>412.0</td>\n",
+       "      <td>2020-03-18T20:00:00Z</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>20200318</td>\n",
+       "      <td>AL</td>\n",
+       "      <td>46.0</td>\n",
+       "      <td>28.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0.0</td>\n",
+       "      <td>74.0</td>\n",
+       "      <td>2020-03-18T20:00:00Z</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "       date state  positive  negative  pending  death  total  \\\n",
+       "0  20200318    AK       6.0     406.0      NaN    NaN  412.0   \n",
+       "1  20200318    AL      46.0      28.0      NaN    0.0   74.0   \n",
+       "\n",
+       "            dateChecked  \n",
+       "0  2020-03-18T20:00:00Z  \n",
+       "1  2020-03-18T20:00:00Z  "
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data_df = pd.read_json(states_daily_json)\n",
+    "print(len(data_df), \"data points\")\n",
+    "data_df.head(2)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  },
+  "papermill": {
+   "duration": 4.518691,
+   "end_time": "2020-03-18T21:45:37.656675",
+   "environment_variables": {},
+   "exception": null,
+   "input_path": "notebooks/process/download-covidtracking-data.ipynb",
+   "output_path": "runs/download-covidtracking-data.runs.ipynb",
+   "parameters": {
+    "PAPERMILL_INPUT_PATH": "notebooks/process/download-covidtracking-data.ipynb",
+    "PAPERMILL_OUTPUT_PATH": "runs/download-covidtracking-data.runs.ipynb",
+    "out_folder": "./data/covidtracking/"
+   },
+   "start_time": "2020-03-18T21:45:33.137984",
+   "version": "1.1.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
-- 
GitLab