# Packages: JSON parsing, plotting, and tabular manipulation.
library(jsonlite)
library(ggplot2)
library(data.table)

# Plot size options (`repr.*`); presumably picked up by the notebook
# front end when rendering figures -- confirm.
options(repr.plot.width = 14, repr.plot.height = 8)

# Read the per-state metadata file and preview its first rows.
metadata_df <- jsonlite::fromJSON(
  txt = "../../data/covidtracking/states-metadata.json"
)
head(metadata_df, n = 6L)
## state
## 1 AK
## 2 AL
## 3 AR
## 4 AS
## 5 AZ
## 6 CA
## covid19SiteOld
## 1 http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/default.aspx
## 2 http://www.alabamapublichealth.gov/infectiousdiseases/2019-coronavirus.html
## 3 https://www.healthy.arkansas.gov/programs-services/topics/novel-coronavirus
## 4 http://www.samoagovt.ws/2020/03/ministry-of-health-coronavirus-covid-19-update-14-march-2020/
## 5 https://www.azdhs.gov/preparedness/epidemiology-disease-control/infectious-disease-epidemiology/index.php#novel-coronavirus-home
## 6 https://www.cdph.ca.gov/Programs/CID/DCDC/Pages/Immunization/ncov2019.aspx
## covid19Site
## 1 http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/monitoring.aspx
## 2 https://alpublichealth.maps.arcgis.com/apps/opsdashboard/index.html#/6d2771faa9da4a2786a509d82c8cf0f7
## 3 https://www.healthy.arkansas.gov/programs-services/topics/novel-coronavirus
## 4 http://www.samoagovt.ws/category/latest-news/
## 5 https://www.azdhs.gov/preparedness/epidemiology-disease-control/infectious-disease-epidemiology/index.php#novel-coronavirus-home
## 6 https://www.latimes.com/projects/california-coronavirus-cases-tracking-outbreak/
## covid19SiteSecondary
## 1 http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/default.aspx
## 2 <NA>
## 3 <NA>
## 4 https://www.facebook.com/amsamgov/
## 5 <NA>
## 6 https://www.cdph.ca.gov/Programs/CID/DCDC/Pages/Immunization/ncov2019.aspx
## twitter pui pum
## 1 @Alaska_DHSS All data FALSE
## 2 @alpublichealth No data FALSE
## 3 @adhpio All data TRUE
## 4 <NA> No Data FALSE
## 5 @azdhs All data FALSE
## 6 @CAPublicHealth Only positives FALSE
## notes
## 1 Unclear if their reported number means "persons tested" or "specimens tested." We count them as "persons tested" because the header indicates this is the case. Total tests are taken from the annotations on the charts on the page. Negatives are calculated as totals – positives. Negatives reported on site have decreased at various times, without explanation. Latest update time taken from [this page](http://dhss.alaska.gov/dph/Epi/id/Pages/COVID-19/default.aspx).
## 2 Last negative count from 3/16. Last update time taken from [main page](http://www.alabamapublichealth.gov/infectiousdiseases/2019-coronavirus.html).
## 3 Pending = "PUIs"
## 4 American Samoa: No data, no confirmed cases yet.
## 5 Negative = “Ruled Out”. Negatives are from public labs only. Our total is slightly higher than the state’s site because we also add in the additional positives from private labs, listed in the second table.
## 6 We report positives from the [LA Times](https://www.latimes.com/projects/california-coronavirus-cases-tracking-outbreak/), which is aggregated from county-level data and is ahead of the official California data site. When using LA Times data for positives, we do not add deaths to the positive count: they are already included. Previously we added them in because the California state site had not been including them in the positive count. Negative numbers are inferred from Totals - Positives, taken from the [latest press release](https://www.cdph.ca.gov/Programs/OPA/Pages/NR20-028.aspx). We are not reporting pending numbers until they are reported regularly, or from a dashboard. We assume 1 test=1 person.
## name
## 1 Alaska
## 2 Alabama
## 3 Arkansas
## 4 American Samoa
## 5 Arizona
## 6 California
# Read the per-state daily counts file and preview its first rows.
data_df <- jsonlite::fromJSON(
  txt = "../../data/covidtracking/states-daily.json"
)
head(data_df, n = 6L)
## date state positive negative pending hospitalized death total
## 1 20200320 AK 12 686 NA NA NA 698
## 2 20200320 AL 81 28 NA NA 0 109
## 3 20200320 AR 96 351 203 NA NA 650
## 4 20200320 AS 0 NA NA NA 0 0
## 5 20200320 AZ 65 211 101 NA 0 377
## 6 20200320 CA 1063 10424 NA NA 20 11487
## dateChecked
## 1 2020-03-20T20:00:00Z
## 2 2020-03-20T20:00:00Z
## 3 2020-03-20T20:00:00Z
## 4 2020-03-20T20:00:00Z
## 5 2020-03-20T20:00:00Z
## 6 2020-03-20T20:00:00Z
# `date` arrives as a yyyymmdd number (e.g. 20200320): take its text form
# and parse that into a Date.
data_df[["date"]] <- as.Date(
  as.character(data_df[["date"]]),
  format = "%Y%m%d"
)
head(data_df, n = 6L)
## date state positive negative pending hospitalized death total
## 1 2020-03-20 AK 12 686 NA NA NA 698
## 2 2020-03-20 AL 81 28 NA NA 0 109
## 3 2020-03-20 AR 96 351 203 NA NA 650
## 4 2020-03-20 AS 0 NA NA NA 0 0
## 5 2020-03-20 AZ 65 211 101 NA 0 377
## 6 2020-03-20 CA 1063 10424 NA NA 20 11487
## dateChecked
## 1 2020-03-20T20:00:00Z
## 2 2020-03-20T20:00:00Z
## 3 2020-03-20T20:00:00Z
## 4 2020-03-20T20:00:00Z
## 5 2020-03-20T20:00:00Z
## 6 2020-03-20T20:00:00Z
# Compute daily differences.
# Convert the daily data to a data.table, build a second table `tdf` from
# it, and key `tdf` on (state, date) so its rows are sorted by state and,
# within a state, by date.
data_df <- as.data.table(data_df)
tdf <- data.table(data_df)
setkeyv(tdf, c("state", "date"))
head(tdf, n = 6L)
## date state positive negative pending hospitalized death total
## 1: 2020-03-06 AK 0 8 1 NA NA 9
## 2: 2020-03-07 AK 0 12 2 NA NA 14
## 3: 2020-03-08 AK 0 14 6 NA NA 20
## 4: 2020-03-09 AK 0 23 9 NA NA 32
## 5: 2020-03-10 AK 0 23 9 NA NA 32
## 6: 2020-03-11 AK 0 46 14 NA NA 60
## dateChecked
## 1: 2020-03-06T21:00:00Z
## 2: 2020-03-07T21:00:00Z
## 3: 2020-03-08T20:00:00Z
## 4: 2020-03-09T20:00:00Z
## 5: 2020-03-10T20:00:00Z
## 6: 2020-03-11T20:00:00Z
# Per state, subtract the previous row's `positive` from the current one.
# The first row of each state has no predecessor, so its difference is NA.
# Relies on `tdf` being ordered by date within each state.
tdf[
  ,
  diff_positive := positive - shift(positive, n = 1L, type = "lag"),
  by = "state"
]

# Keep only the Washington rows and display them.
tdf_state <- tdf[state == "WA", ]
tdf_state
## date state positive negative pending hospitalized death total
## 1: 2020-03-04 WA 39 NA NA NA NA 39
## 2: 2020-03-05 WA 70 NA NA NA NA 70
## 3: 2020-03-06 WA 79 370 NA NA NA 449
## 4: 2020-03-07 WA 102 370 66 NA NA 538
## 5: 2020-03-08 WA 102 640 60 NA NA 802
## 6: 2020-03-09 WA 136 1110 NA NA NA 1246
## 7: 2020-03-10 WA 162 1110 NA NA NA 1272
## 8: 2020-03-11 WA 267 2175 NA NA 24 2466
## 9: 2020-03-12 WA 337 3037 NA NA 29 3403
## 10: 2020-03-13 WA 457 4350 NA NA 31 4807
## 11: 2020-03-14 WA 568 6001 NA NA 37 6569
## 12: 2020-03-15 WA 642 7122 NA NA 40 7764
## 13: 2020-03-16 WA 769 9451 NA NA 42 10220
## 14: 2020-03-17 WA 904 11582 NA NA 48 12486
## 15: 2020-03-18 WA 1012 13117 NA NA 52 14129
## 16: 2020-03-19 WA 1187 15918 NA NA 66 17105
## 17: 2020-03-20 WA 1376 19336 NA NA 74 20712
## dateChecked diff_positive
## 1: 2020-03-04T21:00:00Z NA
## 2: 2020-03-05T21:00:00Z 31
## 3: 2020-03-06T21:00:00Z 9
## 4: 2020-03-07T21:00:00Z 23
## 5: 2020-03-08T20:00:00Z 0
## 6: 2020-03-09T20:00:00Z 34
## 7: 2020-03-10T20:00:00Z 26
## 8: 2020-03-11T20:00:00Z 105
## 9: 2020-03-12T20:00:00Z 70
## 10: 2020-03-13T20:00:00Z 120
## 11: 2020-03-14T20:00:00Z 111
## 12: 2020-03-15T20:00:00Z 74
## 13: 2020-03-16T20:00:00Z 127
## 14: 2020-03-17T20:00:00Z 135
## 15: 2020-03-18T20:00:00Z 108
## 16: 2020-03-19T20:00:00Z 175
## 17: 2020-03-20T20:00:00Z 189
# Bar chart of the day-over-day change in positives for WA, one bar per date.
ggplot(tdf_state, mapping = aes(x = date, y = diff_positive)) +
  geom_bar(stat = "identity", position = position_dodge(width = 1)) +
  ggtitle("WA") +
  ylab("Daily positive")
## Warning: Removed 1 rows containing missing values (geom_bar).
# Ensure `data_df` is a data.table, then load the population table and key
# it by the `ST` column (two-letter codes in the preview below).
data_df <- data.table::as.data.table(data_df)
pop_df <- data.table::fread(input = "../../data/geodata/us_pop_fung_2019.csv")
setkeyv(pop_df, "ST")
head(pop_df, n = 6L)
## State ST Population
## 1: Alaska AK 731545
## 2: Alabama AL 4903185
## 3: Arkansas AR 3017825
## 4: Arizona AZ 7278717
## 5: California CA 39512223
## 6: Colorado CO 5758736
# Find the latest date present in the data and keep only that date's rows.
most_recent_test_date <- max(data_df[["date"]])
most_recent_df <- data_df[date == most_recent_test_date, ]
paste("Most recent test date", most_recent_test_date, sep = " ")
## [1] "Most recent test date 2020-03-20"
# Report how many rows `most_recent_df` currently holds.
paste(
  nrow(most_recent_df),
  "states/territories have data on this date.",
  sep = " "
)
## [1] "56 states/territories have data on this date."
# Attach population figures to the latest-day testing rows, matching the
# testing table's `state` code against the population table's `ST` code.
#
# This is a left join (all.x = TRUE): every testing row is kept, and codes
# with no entry in `pop_df` get NA for `State` and `Population`. A full outer
# join (all = TRUE) would also add a row for any population entry that has no
# testing data on this date, with NA for date and all counts; those rows
# would then flow into the per-100k table and the plot axis.
most_recent_df <- merge(
  most_recent_df, pop_df,
  by.x = "state", by.y = "ST",
  all.x = TRUE
)
head(most_recent_df)
## state date positive negative pending hospitalized death total
## 1: AK 2020-03-20 12 686 NA NA NA 698
## 2: AL 2020-03-20 81 28 NA NA 0 109
## 3: AR 2020-03-20 96 351 203 NA NA 650
## 4: AS 2020-03-20 0 NA NA NA 0 0
## 5: AZ 2020-03-20 65 211 101 NA 0 377
## 6: CA 2020-03-20 1063 10424 NA NA 20 11487
## dateChecked State Population
## 1: 2020-03-20T20:00:00Z Alaska 731545
## 2: 2020-03-20T20:00:00Z Alabama 4903185
## 3: 2020-03-20T20:00:00Z Arkansas 3017825
## 4: 2020-03-20T20:00:00Z <NA> NA
## 5: 2020-03-20T20:00:00Z Arizona 7278717
## 6: 2020-03-20T20:00:00Z California 39512223
# Scale `total` to a rate per 100,000 population. Rows with a missing
# Population come out as NA.
most_recent_df[, `:=`(total_100k = (total / Population) * 1e5)]
head(most_recent_df[, list(date, state, total_100k)], n = 6L)
## date state total_100k
## 1: 2020-03-20 AK 95.414499
## 2: 2020-03-20 AL 2.223045
## 3: 2020-03-20 AR 21.538691
## 4: 2020-03-20 AS NA
## 5: 2020-03-20 AZ 5.179484
## 6: 2020-03-20 CA 29.072017
# Key the table on the per-100k rate, which sorts it ascending by that
# column; rows with an NA rate are placed first.
setkeyv(most_recent_df, "total_100k")
head(most_recent_df, n = 6L)
## state date positive negative pending hospitalized death total
## 1: AS 2020-03-20 0 NA NA NA 0 0
## 2: GU 2020-03-20 14 86 NA NA NA 100
## 3: MP 2020-03-20 0 NA NA NA 0 0
## 4: PR 2020-03-20 14 114 52 NA NA 180
## 5: VI 2020-03-20 3 NA NA NA 0 3
## 6: AL 2020-03-20 81 28 NA NA 0 109
## dateChecked State Population total_100k
## 1: 2020-03-20T20:00:00Z <NA> NA NA
## 2: 2020-03-20T20:00:00Z <NA> NA NA
## 3: 2020-03-20T20:00:00Z <NA> NA NA
## 4: 2020-03-20T20:00:00Z <NA> NA NA
## 5: 2020-03-20T20:00:00Z <NA> NA NA
## 6: 2020-03-20T20:00:00Z Alabama 4903185 2.223045
# Turn `state` into a factor whose levels follow the current row order, so
# the x axis uses that order rather than alphabetical state codes.
most_recent_df[, state := factor(state, levels = state)]

# Bar chart of the per-100k rate by state, with x labels rotated 90 degrees.
ggplot(most_recent_df, mapping = aes(x = state, y = total_100k)) +
  geom_bar(stat = "identity", position = position_dodge(width = 1)) +
  ggtitle("Tests per 100k") +
  ylab("total/100k") +
  theme(axis.text.x = element_text(angle = 90))
## Warning: Removed 5 rows containing missing values (geom_bar).