# Packages used below: jsonlite (fromJSON), ggplot2 (plots) and
# data.table (keyed tables, := and fread).
library(jsonlite)
library(ggplot2)
# Set the repr.plot.* options to a 14 x 8 plot size
# (presumably read by the notebook front end that renders the plots).
options(repr.plot.width=14, repr.plot.height=8)
library(data.table)

Look at the metadata

# Load the states metadata JSON file and preview its first rows.
metadata_df <- jsonlite::fromJSON(
  txt = "../../data/covidtracking/states-metadata.json"
)
head(metadata_df, n = 6L)
##   state
## 1    AK
## 2    AL
## 3    AR
## 4    AS
## 5    AZ
## 6    CA
##                                                                                                                     covid19SiteOld
## 1                                                                    http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/default.aspx
## 2                                                      http://www.alabamapublichealth.gov/infectiousdiseases/2019-coronavirus.html
## 3                                                      https://www.healthy.arkansas.gov/programs-services/topics/novel-coronavirus
## 4                                    http://www.samoagovt.ws/2020/03/ministry-of-health-coronavirus-covid-19-update-14-march-2020/
## 5 https://www.azdhs.gov/preparedness/epidemiology-disease-control/infectious-disease-epidemiology/index.php#novel-coronavirus-home
## 6                                                       https://www.cdph.ca.gov/Programs/CID/DCDC/Pages/Immunization/ncov2019.aspx
##                                                                                                                        covid19Site
## 1                                                                 http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/monitoring.aspx
## 2                            https://alpublichealth.maps.arcgis.com/apps/opsdashboard/index.html#/6d2771faa9da4a2786a509d82c8cf0f7
## 3                                                      https://www.healthy.arkansas.gov/programs-services/topics/novel-coronavirus
## 4                                                                                    http://www.samoagovt.ws/category/latest-news/
## 5 https://www.azdhs.gov/preparedness/epidemiology-disease-control/infectious-disease-epidemiology/index.php#novel-coronavirus-home
## 6                                                 https://www.latimes.com/projects/california-coronavirus-cases-tracking-outbreak/
##                                                         covid19SiteSecondary
## 1              http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/default.aspx
## 2                                                                       <NA>
## 3                                                                       <NA>
## 4                                         https://www.facebook.com/amsamgov/
## 5                                                                       <NA>
## 6 https://www.cdph.ca.gov/Programs/CID/DCDC/Pages/Immunization/ncov2019.aspx
##           twitter            pui   pum
## 1    @Alaska_DHSS       All data FALSE
## 2 @alpublichealth        No data FALSE
## 3         @adhpio       All data  TRUE
## 4            <NA>        No Data FALSE
## 5          @azdhs       All data FALSE
## 6 @CAPublicHealth Only positives FALSE
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        notes
## 1                                                                                                                                                                                                                                                           Unclear if their reported number means "persons tested" or "specimens tested." We count them as "persons tested" because the header indicates this is the case. Total tests are taken from the annotations on the charts on the page. Negatives are calculated as totals – positives. Negatives reported on site have decreased at various times, without explanation. Latest update time taken from [this page](http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/default.aspx).
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Last negative count from 3/16. Last update time taken from [main page](http://www.alabamapublichealth.gov/infectiousdiseases/2019-coronavirus.html).
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Pending = "PUIs"
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           American Samoa: No data, no confirmed cases yet.
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            Negative = “Ruled Out”. Negatives are from public labs only. Our total is slightly higher than the state’s site because we also add in the additional positives from private labs, listed in the second table. 
## 6 We report positives from the [LA Times](https://www.latimes.com/projects/california-coronavirus-cases-tracking-outbreak/), which is aggregated from county-level data and is ahead of the official California data site. When using LA Times data for positives, we do not add deaths to the positive count: they are already included. Previously we added them in because the California state site had not been including them in the positive count. Negative numbers are inferred from Totals - Positives, taken from the [latest press release](https://www.cdph.ca.gov/Programs/OPA/Pages/NR20-028.aspx). We are not reporting pending numbers until they are reported regularly, or from a dashboard.  We assume 1 test=1 person. 
##             name
## 1         Alaska
## 2        Alabama
## 3       Arkansas
## 4 American Samoa
## 5        Arizona
## 6     California

Look at the data

# Load the states-daily JSON file and preview its first rows.
data_df <- jsonlite::fromJSON(
  txt = "../../data/covidtracking/states-daily.json"
)
head(data_df, n = 6L)
##       date state positive negative pending hospitalized death total
## 1 20200320    AK       12      686      NA           NA    NA   698
## 2 20200320    AL       81       28      NA           NA     0   109
## 3 20200320    AR       96      351     203           NA    NA   650
## 4 20200320    AS        0       NA      NA           NA     0     0
## 5 20200320    AZ       65      211     101           NA     0   377
## 6 20200320    CA     1063    10424      NA           NA    20 11487
##            dateChecked
## 1 2020-03-20T20:00:00Z
## 2 2020-03-20T20:00:00Z
## 3 2020-03-20T20:00:00Z
## 4 2020-03-20T20:00:00Z
## 5 2020-03-20T20:00:00Z
## 6 2020-03-20T20:00:00Z
# The date column arrives as a yyyymmdd value (e.g. 20200320); turn it into
# text first so it can be parsed into a proper Date.
date_text <- as.character(data_df[["date"]])
data_df[["date"]] <- as.Date(date_text, format = "%Y%m%d")
head(data_df, n = 6L)
##         date state positive negative pending hospitalized death total
## 1 2020-03-20    AK       12      686      NA           NA    NA   698
## 2 2020-03-20    AL       81       28      NA           NA     0   109
## 3 2020-03-20    AR       96      351     203           NA    NA   650
## 4 2020-03-20    AS        0       NA      NA           NA     0     0
## 5 2020-03-20    AZ       65      211     101           NA     0   377
## 6 2020-03-20    CA     1063    10424      NA           NA    20 11487
##            dateChecked
## 1 2020-03-20T20:00:00Z
## 2 2020-03-20T20:00:00Z
## 3 2020-03-20T20:00:00Z
## 4 2020-03-20T20:00:00Z
## 5 2020-03-20T20:00:00Z
## 6 2020-03-20T20:00:00Z

Daily counts and totals

# Prepare the table used to compute daily differences.
# data_df itself becomes a data.table; tdf is a second data.table built from
# it and keyed (sorted) by state and then date, so that consecutive rows
# within a state are consecutive reporting days.
data_df <- data.table(data_df)
tdf <- data.table(data_df)
setkeyv(tdf, c("state", "date"))
head(tdf, n = 6L)
##          date state positive negative pending hospitalized death total
## 1: 2020-03-06    AK        0        8       1           NA    NA     9
## 2: 2020-03-07    AK        0       12       2           NA    NA    14
## 3: 2020-03-08    AK        0       14       6           NA    NA    20
## 4: 2020-03-09    AK        0       23       9           NA    NA    32
## 5: 2020-03-10    AK        0       23       9           NA    NA    32
## 6: 2020-03-11    AK        0       46      14           NA    NA    60
##             dateChecked
## 1: 2020-03-06T21:00:00Z
## 2: 2020-03-07T21:00:00Z
## 3: 2020-03-08T20:00:00Z
## 4: 2020-03-09T20:00:00Z
## 5: 2020-03-10T20:00:00Z
## 6: 2020-03-11T20:00:00Z
# Daily new positives: the cumulative count minus the previous row's count
# within the same state. The first row of every state has no predecessor
# and therefore gets NA.
tdf[, diff_positive := positive - shift(positive, n = 1L, type = "lag"),
    by = "state"]
# Keep only the Washington rows for the plot below.
tdf_state <- tdf[state == "WA", ]
tdf_state
##           date state positive negative pending hospitalized death total
##  1: 2020-03-04    WA       39       NA      NA           NA    NA    39
##  2: 2020-03-05    WA       70       NA      NA           NA    NA    70
##  3: 2020-03-06    WA       79      370      NA           NA    NA   449
##  4: 2020-03-07    WA      102      370      66           NA    NA   538
##  5: 2020-03-08    WA      102      640      60           NA    NA   802
##  6: 2020-03-09    WA      136     1110      NA           NA    NA  1246
##  7: 2020-03-10    WA      162     1110      NA           NA    NA  1272
##  8: 2020-03-11    WA      267     2175      NA           NA    24  2466
##  9: 2020-03-12    WA      337     3037      NA           NA    29  3403
## 10: 2020-03-13    WA      457     4350      NA           NA    31  4807
## 11: 2020-03-14    WA      568     6001      NA           NA    37  6569
## 12: 2020-03-15    WA      642     7122      NA           NA    40  7764
## 13: 2020-03-16    WA      769     9451      NA           NA    42 10220
## 14: 2020-03-17    WA      904    11582      NA           NA    48 12486
## 15: 2020-03-18    WA     1012    13117      NA           NA    52 14129
## 16: 2020-03-19    WA     1187    15918      NA           NA    66 17105
## 17: 2020-03-20    WA     1376    19336      NA           NA    74 20712
##              dateChecked diff_positive
##  1: 2020-03-04T21:00:00Z            NA
##  2: 2020-03-05T21:00:00Z            31
##  3: 2020-03-06T21:00:00Z             9
##  4: 2020-03-07T21:00:00Z            23
##  5: 2020-03-08T20:00:00Z             0
##  6: 2020-03-09T20:00:00Z            34
##  7: 2020-03-10T20:00:00Z            26
##  8: 2020-03-11T20:00:00Z           105
##  9: 2020-03-12T20:00:00Z            70
## 10: 2020-03-13T20:00:00Z           120
## 11: 2020-03-14T20:00:00Z           111
## 12: 2020-03-15T20:00:00Z            74
## 13: 2020-03-16T20:00:00Z           127
## 14: 2020-03-17T20:00:00Z           135
## 15: 2020-03-18T20:00:00Z           108
## 16: 2020-03-19T20:00:00Z           175
## 17: 2020-03-20T20:00:00Z           189
# Bar chart of the daily positive differences for the selected state.
ggplot(tdf_state, aes(x = date, y = diff_positive)) +
  geom_bar(stat = "identity", position = position_dodge(width = 1)) +
  ggtitle("WA") +
  ylab("Daily positive")
## Warning: Removed 1 rows containing missing values (geom_bar).

Counts per 100k

# Make sure data_df is a data.table, then read the population table
# and key it by the two-letter state code column ST.
data_df <- as.data.table(data_df)
pop_df <- fread(input = "../../data/geodata/us_pop_fung_2019.csv")
setkeyv(pop_df, "ST")
head(pop_df, n = 6L)
##         State ST Population
## 1:     Alaska AK     731545
## 2:    Alabama AL    4903185
## 3:   Arkansas AR    3017825
## 4:    Arizona AZ    7278717
## 5: California CA   39512223
## 6:   Colorado CO    5758736
# Latest reporting date present in the data and the rows recorded on it.
# na.rm = TRUE: a single date that failed to parse (NA) would otherwise make
# max() return NA, and the filter below would then silently select no rows.
most_recent_test_date <- max(data_df$date, na.rm = TRUE)
most_recent_df <- data_df[date == most_recent_test_date]
paste("Most recent test date", most_recent_test_date)
## [1] "Most recent test date 2020-03-20"
# Report how many rows (states/territories) exist for the most recent date.
sprintf("%d states/territories have data on this date.", nrow(most_recent_df))
## [1] "56 states/territories have data on this date."
# Attach population figures by matching the data's `state` code to the
# population table's `ST` code. all = TRUE keeps unmatched rows from both
# sides, so entries without a population row carry NA in State/Population.
most_recent_df <- merge(
  x = most_recent_df,
  y = pop_df,
  by.x = "state",
  by.y = "ST",
  all = TRUE
)
head(most_recent_df, n = 6L)
##    state       date positive negative pending hospitalized death total
## 1:    AK 2020-03-20       12      686      NA           NA    NA   698
## 2:    AL 2020-03-20       81       28      NA           NA     0   109
## 3:    AR 2020-03-20       96      351     203           NA    NA   650
## 4:    AS 2020-03-20        0       NA      NA           NA     0     0
## 5:    AZ 2020-03-20       65      211     101           NA     0   377
## 6:    CA 2020-03-20     1063    10424      NA           NA    20 11487
##             dateChecked      State Population
## 1: 2020-03-20T20:00:00Z     Alaska     731545
## 2: 2020-03-20T20:00:00Z    Alabama    4903185
## 3: 2020-03-20T20:00:00Z   Arkansas    3017825
## 4: 2020-03-20T20:00:00Z       <NA>         NA
## 5: 2020-03-20T20:00:00Z    Arizona    7278717
## 6: 2020-03-20T20:00:00Z California   39512223
# Total tests per 100,000 residents; NA wherever Population is NA.
most_recent_df[, total_100k := (total / Population) * 1e5]
head(most_recent_df[, list(date, state, total_100k)], n = 6L)
##          date state total_100k
## 1: 2020-03-20    AK  95.414499
## 2: 2020-03-20    AL   2.223045
## 3: 2020-03-20    AR  21.538691
## 4: 2020-03-20    AS         NA
## 5: 2020-03-20    AZ   5.179484
## 6: 2020-03-20    CA  29.072017
# Key the table on total_100k, which sorts it in ascending order
# (rows with NA come first).
setkeyv(most_recent_df, "total_100k")
head(most_recent_df, n = 6L)
##    state       date positive negative pending hospitalized death total
## 1:    AS 2020-03-20        0       NA      NA           NA     0     0
## 2:    GU 2020-03-20       14       86      NA           NA    NA   100
## 3:    MP 2020-03-20        0       NA      NA           NA     0     0
## 4:    PR 2020-03-20       14      114      52           NA    NA   180
## 5:    VI 2020-03-20        3       NA      NA           NA     0     3
## 6:    AL 2020-03-20       81       28      NA           NA     0   109
##             dateChecked   State Population total_100k
## 1: 2020-03-20T20:00:00Z    <NA>         NA         NA
## 2: 2020-03-20T20:00:00Z    <NA>         NA         NA
## 3: 2020-03-20T20:00:00Z    <NA>         NA         NA
## 4: 2020-03-20T20:00:00Z    <NA>         NA         NA
## 5: 2020-03-20T20:00:00Z    <NA>         NA         NA
## 6: 2020-03-20T20:00:00Z Alabama    4903185   2.223045
# Turn state into a factor whose level order is the current row order, so
# the x axis follows the table's sort order instead of alphabetical order.
most_recent_df[, state := factor(state, levels = state)]
# Bar chart of tests per 100k by state, with vertical axis labels.
ggplot(most_recent_df, aes(x = state, y = total_100k)) +
  geom_bar(stat = "identity", position = position_dodge(width = 1)) +
  ggtitle("Tests per 100k") +
  ylab("total/100k") +
  theme(axis.text.x = element_text(angle = 90))
## Warning: Removed 5 rows containing missing values (geom_bar).