Skip to content
Snippets Groups Projects
download-covidtracking-data.runs.ipynb 11.7 KiB
Newer Older
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "papermill": {
     "duration": 0.582742,
     "end_time": "2020-03-20T09:12:16.167813",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import io\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import requests"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "papermill": {
     "duration": 0.017681,
     "end_time": "2020-03-20T09:12:16.194097",
     "status": "completed"
    },
    "tags": [
     "parameters"
    ]
   },
   "outputs": [],
   "source": [
    "out_folder = \"../data/covidtracking/\"\n",
    "PAPERMILL_OUTPUT_PATH = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "papermill": {
     "duration": 0.019941,
     "end_time": "2020-03-20T09:12:16.222810",
     "status": "completed"
    },
    "tags": [
     "injected-parameters"
    ]
   },
   "outputs": [],
   "source": [
    "# Parameters\n",
    "PAPERMILL_INPUT_PATH = \"/tmp/tq93huw7/notebooks/process/download-covidtracking-data.ipynb\"\n",
    "PAPERMILL_OUTPUT_PATH = \"runs/download-covidtracking-data.runs.ipynb\"\n",
     "duration": 0.006852,
     "end_time": "2020-03-20T09:12:16.239661",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Download state metadata\n",
    "\n",
    "Download the metadata published for each US state and several territories, including the URLs of each state's COVID-19 data sources. See the source [Google Sheet](https://docs.google.com/spreadsheets/d/18oVRrHj3c183mHmq3m89_163yuYltLNlOmPerQ18E8w/htmlview?sle=true)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "papermill": {
     "duration": 1.543652,
     "end_time": "2020-03-20T09:12:17.791818",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Fetch the per-state metadata (same https scheme as the daily endpoint below)\n",
    "url = 'https://covidtracking.com/api/states/info'\n",
    "# timeout: never hang an unattended run on a stalled connection\n",
    "r = requests.get(url, allow_redirects=True, timeout=30)\n",
    "# fail fast so an HTTP error page is never saved or parsed as data\n",
    "r.raise_for_status()\n",
    "states_metadata_json = r.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "papermill": {
     "duration": 0.019479,
     "end_time": "2020-03-20T09:12:17.820781",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# save the result (only when PAPERMILL_OUTPUT_PATH is set; it defaults to None)\n",
    "if PAPERMILL_OUTPUT_PATH:\n",
    "    # create the target folder first so open() cannot fail on a fresh checkout\n",
    "    os.makedirs(out_folder, exist_ok=True)\n",
    "    out_path = os.path.join(out_folder, 'states-metadata.json')\n",
    "    with open(out_path, 'wb') as f:\n",
    "        f.write(states_metadata_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "papermill": {
     "duration": 0.067162,
     "end_time": "2020-03-20T09:12:17.897197",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "56 states and territories have metadata\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>state</th>\n",
       "      <th>twitter</th>\n",
       "      <th>pui</th>\n",
       "      <th>pum</th>\n",
       "      <th>notes</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>AK</td>\n",
       "      <td>http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...</td>\n",
       "      <td>http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...</td>\n",
       "      <td>@Alaska_DHSS</td>\n",
       "      <td>All data</td>\n",
       "      <td>False</td>\n",
       "      <td>Unclear if their reported number means \"person...</td>\n",
       "      <td>Alaska</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>AL</td>\n",
       "      <td>http://www.alabamapublichealth.gov/infectiousd...</td>\n",
       "      <td>https://alpublichealth.maps.arcgis.com/apps/op...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>@alpublichealth</td>\n",
       "      <td>No data</td>\n",
       "      <td>False</td>\n",
       "      <td>Last negative count from 3/16.</td>\n",
       "      <td>Alabama</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "0    AK  http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...   \n",
       "1    AL  http://www.alabamapublichealth.gov/infectiousd...   \n",
       "\n",
       "                                         covid19Site  covid19SiteSecondary  \\\n",
       "0  http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-...                   NaN   \n",
       "1  https://alpublichealth.maps.arcgis.com/apps/op...                   NaN   \n",
       "\n",
       "           twitter       pui    pum  \\\n",
       "0     @Alaska_DHSS  All data  False   \n",
       "1  @alpublichealth   No data  False   \n",
       "                                               notes     name  \n",
       "0  Unclear if their reported number means \"person...   Alaska  \n",
       "1                     Last negative count from 3/16.  Alabama  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Wrap the raw bytes in a file-like object: passing literal JSON to\n",
    "# pd.read_json is deprecated in recent pandas releases.\n",
    "metadata_df = pd.read_json(io.BytesIO(states_metadata_json))\n",
    "print(len(metadata_df), \"states and territories have metadata\")\n",
    "metadata_df.head(2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.010887,
     "end_time": "2020-03-20T09:12:17.919775",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "# Download daily state data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "papermill": {
     "duration": 0.134228,
     "end_time": "2020-03-20T09:12:18.065352",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Fetch the daily per-state figures\n",
    "url = 'https://covidtracking.com/api/states/daily'\n",
    "# timeout: never hang an unattended run on a stalled connection\n",
    "r = requests.get(url, allow_redirects=True, timeout=30)\n",
    "# fail fast so an HTTP error page is never saved or parsed as data\n",
    "r.raise_for_status()\n",
    "states_daily_json = r.content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "papermill": {
     "duration": 0.02218,
     "end_time": "2020-03-20T09:12:18.097912",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# save the result (only when PAPERMILL_OUTPUT_PATH is set; it defaults to None)\n",
    "if PAPERMILL_OUTPUT_PATH:\n",
    "    # create the target folder first so open() cannot fail on a fresh checkout\n",
    "    os.makedirs(out_folder, exist_ok=True)\n",
    "    out_path = os.path.join(out_folder, 'states-daily.json')\n",
    "    with open(out_path, 'wb') as f:\n",
    "        f.write(states_daily_json)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "papermill": {
     "duration": 0.058368,
     "end_time": "2020-03-20T09:12:18.166166",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>state</th>\n",
       "      <th>positive</th>\n",
       "      <th>negative</th>\n",
       "      <th>pending</th>\n",
       "      <th>death</th>\n",
       "      <th>total</th>\n",
       "      <th>dateChecked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>28.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       date state  positive  negative  pending  death  total  \\\n",
       "0  20200319    AK         6     400.0      NaN    NaN    406   \n",
       "1  20200319    AL        68      28.0      NaN    0.0     96   \n",
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Wrap the raw bytes in a file-like object: passing literal JSON to\n",
    "# pd.read_json is deprecated in recent pandas releases.\n",
    "data_df = pd.read_json(io.BytesIO(states_daily_json))\n",
    "print(len(data_df), \"data points\")\n",
    "data_df.head(2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "duration": 3.92686,
   "end_time": "2020-03-20T09:12:18.486365",
   "input_path": "/tmp/tq93huw7/notebooks/process/download-covidtracking-data.ipynb",
   "output_path": "runs/download-covidtracking-data.runs.ipynb",
   "parameters": {
    "PAPERMILL_INPUT_PATH": "/tmp/tq93huw7/notebooks/process/download-covidtracking-data.ipynb",
    "PAPERMILL_OUTPUT_PATH": "runs/download-covidtracking-data.runs.ipynb",