From c716f42a4d21119f7533d93459ed56fab1c67f00 Mon Sep 17 00:00:00 2001 From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch> Date: Wed, 18 Mar 2020 21:45:37 +0000 Subject: [PATCH] renku run papermill -p out_folder ./data/covidtracking/ --inject-paths notebooks/process/download-covidtracking-data.ipynb runs/download-covidtracking-data.runs.ipynb --- .gitattributes | 1 + ...474d38144c5d882c0036bd1059c2_papermill.cwl | 70 +++ data/covidtracking/states-daily.json | 3 + data/covidtracking/states-metadata.json | 3 + runs/download-covidtracking-data.runs.ipynb | 426 ++++++++++++++++++ 5 files changed, 503 insertions(+) create mode 100644 .renku/workflow/ff5f474d38144c5d882c0036bd1059c2_papermill.cwl create mode 100644 data/covidtracking/states-daily.json create mode 100644 data/covidtracking/states-metadata.json create mode 100644 runs/download-covidtracking-data.runs.ipynb diff --git a/.gitattributes b/.gitattributes index 5deb648b..5a581cd3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -16,3 +16,4 @@ data/openzh-covid-19/COVID19_Fallzahlen_Kanton_TG_total.csv filter=lfs diff=lfs data/openzh-covid-19/COVID19_Fallzahlen_Kanton_BL_total.csv filter=lfs diff=lfs merge=lfs -text data/openzh-covid-19/COVID19_Fallzahlen_Kanton_ZH_total.csv filter=lfs diff=lfs merge=lfs -text data/openzh-covid-19/COVID19_Fallzahlen_Kanton_BE_total.csv filter=lfs diff=lfs merge=lfs -text +data/covidtracking/** filter=lfs diff=lfs merge=lfs -text diff --git a/.renku/workflow/ff5f474d38144c5d882c0036bd1059c2_papermill.cwl b/.renku/workflow/ff5f474d38144c5d882c0036bd1059c2_papermill.cwl new file mode 100644 index 00000000..8ab0a3b5 --- /dev/null +++ b/.renku/workflow/ff5f474d38144c5d882c0036bd1059c2_papermill.cwl @@ -0,0 +1,70 @@ +arguments: [] +baseCommand: +- papermill +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: out_folder + inputBinding: + position: 1 + prefix: -p + separate: true + shellQuote: true + streamable: false + type: string + input_2: + default: data/covidtracking + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: string + input_3: + default: + class: File + path: ../../notebooks/process/download-covidtracking-data.ipynb + inputBinding: + position: 3 + prefix: --inject-paths + separate: true + shellQuote: true + streamable: false + type: File + input_4: + default: runs/download-covidtracking-data.runs.ipynb + inputBinding: + position: 4 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: $(inputs.input_4) + streamable: false + type: File + output_1: + outputBinding: + glob: $(inputs.input_2) + streamable: false + type: Directory +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: runs + writable: true + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/covidtracking + writable: true + - entry: $(inputs.input_3) + entryname: notebooks/process/download-covidtracking-data.ipynb + writable: false +successCodes: [] +temporaryFailCodes: [] diff --git a/data/covidtracking/states-daily.json b/data/covidtracking/states-daily.json new file mode 100644 index 00000000..df5a0190 --- /dev/null +++ b/data/covidtracking/states-daily.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1d3f6f266f353ca5f3e0f78a256b9fe2eba974db337fd67d054d9bd633b17bf +size 93953 diff --git a/data/covidtracking/states-metadata.json b/data/covidtracking/states-metadata.json new file mode 100644 index 00000000..7d4bcdac --- /dev/null +++ b/data/covidtracking/states-metadata.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff2228b277b9cf60b8a3cb9aac9e1c31aa8cefe455579f28058cec6ae338215a +size 20123 diff --git a/runs/download-covidtracking-data.runs.ipynb b/runs/download-covidtracking-data.runs.ipynb new file mode 100644 index 00000000..e759b144 --- /dev/null +++ b/runs/download-covidtracking-data.runs.ipynb @@ -0,0 +1,426 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "papermill": { + "duration": 0.470882, + "end_time": "2020-03-18T21:45:34.449674", + "exception": false, + "start_time": "2020-03-18T21:45:33.978792", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import requests\n", + "import os\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "papermill": { + "duration": 0.016909, + "end_time": "2020-03-18T21:45:34.477193", + "exception": false, + "start_time": "2020-03-18T21:45:34.460284", + "status": "completed" + }, + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "out_folder = \"../data/covidtracking/\"\n", + "PAPERMILL_OUTPUT_PATH = None" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "papermill": { + "duration": 0.019263, + "end_time": "2020-03-18T21:45:34.503659", + "exception": false, + "start_time": "2020-03-18T21:45:34.484396", + "status": "completed" + }, + "tags": [ + "injected-parameters" + ] + }, + "outputs": [], + "source": [ + "# Parameters\n", + "PAPERMILL_INPUT_PATH = \"notebooks/process/download-covidtracking-data.ipynb\"\n", + "PAPERMILL_OUTPUT_PATH = \"runs/download-covidtracking-data.runs.ipynb\"\n", + "out_folder = \"./data/covidtracking/\"\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.007558, + "end_time": "2020-03-18T21:45:34.520426", + "exception": false, + "start_time": "2020-03-18T21:45:34.512868", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Download state metadata\n", + "\n", + "Download a dataset of URLs for data for each US state and several territories. See [Google Doc](https://docs.google.com/spreadsheets/d/18oVRrHj3c183mHmq3m89_163yuYltLNlOmPerQ18E8w/htmlview?sle=true)." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "papermill": { + "duration": 1.425345, + "end_time": "2020-03-18T21:45:35.952888", + "exception": false, + "start_time": "2020-03-18T21:45:34.527543", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "url = 'http://covidtracking.com/api/states/info'\n", + "r = requests.get(url, allow_redirects=True)\n", + "states_metadata_json = r.content" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "papermill": { + "duration": 0.019056, + "end_time": "2020-03-18T21:45:35.983865", + "exception": false, + "start_time": "2020-03-18T21:45:35.964809", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# save the result\n", + "if PAPERMILL_OUTPUT_PATH:\n", + " out_path = os.path.join(out_folder, 'states-metadata.json')\n", + " with open(out_path, 'wb') as f:\n", + " f.write(states_metadata_json)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "papermill": { + "duration": 0.057062, + "end_time": "2020-03-18T21:45:36.050251", + "exception": false, + "start_time": "2020-03-18T21:45:35.993189", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "56 states and territories have metadata\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>state</th>\n", + " <th>dataSite</th>\n", + " <th>covid19Site</th>\n", + " <th>twitter</th>\n", + " <th>pui</th>\n", + " <th>pum</th>\n", + " <th>notes</th>\n", + " <th>name</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>AK</td>\n", + " <td>http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...</td>\n", + " <td>http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...</td>\n", + " <td>@Alaska_DHSS</td>\n", + " <td>All data</td>\n", + " <td>False</td>\n", + " <td>Unclear if their reported number means \"person...</td>\n", + " <td>Alaska</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>AL</td>\n", + " <td>http://www.alabamapublichealth.gov/infectiousd...</td>\n", + " <td>http://www.alabamapublichealth.gov/infectiousd...</td>\n", + " <td>@alpublichealth</td>\n", + " <td>No data</td>\n", + " <td>False</td>\n", + " <td>Last negative count from 3/16.</td>\n", + " <td>Alabama</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " state dataSite \\\n", + "0 AK http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-... \n", + "1 AL http://www.alabamapublichealth.gov/infectiousd... \n", + "\n", + " covid19Site twitter \\\n", + "0 http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-... @Alaska_DHSS \n", + "1 http://www.alabamapublichealth.gov/infectiousd... @alpublichealth \n", + "\n", + " pui pum notes name \n", + "0 All data False Unclear if their reported number means \"person... Alaska \n", + "1 No data False Last negative count from 3/16. Alabama " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metadata_df = pd.read_json(states_metadata_json)\n", + "print(len(metadata_df), \"states and territories have metadata\")\n", + "metadata_df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.011241, + "end_time": "2020-03-18T21:45:36.070581", + "exception": false, + "start_time": "2020-03-18T21:45:36.059340", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Download daily state data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "papermill": { + "duration": 1.154443, + "end_time": "2020-03-18T21:45:37.237753", + "exception": false, + "start_time": "2020-03-18T21:45:36.083310", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "url = 'https://covidtracking.com/api/states/daily'\n", + "r = requests.get(url, allow_redirects=True)\n", + "states_daily_json = r.content" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "papermill": { + "duration": 0.020027, + "end_time": "2020-03-18T21:45:37.268813", + "exception": false, + "start_time": "2020-03-18T21:45:37.248786", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# save the result\n", + "if PAPERMILL_OUTPUT_PATH:\n", + " out_path = os.path.join(out_folder, 'states-daily.json')\n", + " with open(out_path, 'wb') as f:\n", + " f.write(states_daily_json)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "papermill": { + "duration": 0.055853, + "end_time": "2020-03-18T21:45:37.334581", + "exception": false, + "start_time": "2020-03-18T21:45:37.278728", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "701 data points\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>date</th>\n", + " <th>state</th>\n", + " <th>positive</th>\n", + " <th>negative</th>\n", + " <th>pending</th>\n", + " <th>death</th>\n", + " <th>total</th>\n", + " <th>dateChecked</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>20200318</td>\n", + " <td>AK</td>\n", + " <td>6.0</td>\n", + " <td>406.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>412.0</td>\n", + " <td>2020-03-18T20:00:00Z</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>20200318</td>\n", + " <td>AL</td>\n", + " <td>46.0</td>\n", + " <td>28.0</td>\n", + " <td>NaN</td>\n", + " <td>0.0</td>\n", + " <td>74.0</td>\n", + " <td>2020-03-18T20:00:00Z</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " date state positive negative pending death total \\\n", + "0 20200318 AK 6.0 406.0 NaN NaN 412.0 \n", + "1 20200318 AL 46.0 28.0 NaN 0.0 74.0 \n", + "\n", + " dateChecked \n", + "0 2020-03-18T20:00:00Z \n", + "1 2020-03-18T20:00:00Z " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data_df = pd.read_json(states_daily_json)\n", + "print(len(data_df), \"data points\")\n", + "data_df.head(2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + }, + "papermill": { + "duration": 4.518691, + "end_time": "2020-03-18T21:45:37.656675", + "environment_variables": {}, + "exception": null, + "input_path": "notebooks/process/download-covidtracking-data.ipynb", + "output_path": "runs/download-covidtracking-data.runs.ipynb", + "parameters": { + "PAPERMILL_INPUT_PATH": "notebooks/process/download-covidtracking-data.ipynb", + "PAPERMILL_OUTPUT_PATH": "runs/download-covidtracking-data.runs.ipynb", + "out_folder": "./data/covidtracking/" + }, + "start_time": "2020-03-18T21:45:33.137984", + "version": "1.1.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file -- GitLab