From 602219adde240205f7cfe80aca760218848fc01c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robert=20Iv=C3=A1nek?= <robert.ivanek@unibas.ch> Date: Wed, 1 Apr 2020 18:13:03 +0200 Subject: [PATCH] Added example for processing Swiss data in R. Rmd document. --- notebooks/examples-Rmd/visualise_swiss.Rmd | 302 +++++++++++++++++++++ 1 file changed, 302 insertions(+) create mode 100644 notebooks/examples-Rmd/visualise_swiss.Rmd diff --git a/notebooks/examples-Rmd/visualise_swiss.Rmd b/notebooks/examples-Rmd/visualise_swiss.Rmd new file mode 100644 index 0000000..e356927 --- /dev/null +++ b/notebooks/examples-Rmd/visualise_swiss.Rmd @@ -0,0 +1,302 @@ +--- +title: "COVID-19 Cases in Switzerland" +output: + github_document: default +keep_md: true +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, + warning=FALSE, + message=FALSE) +library(RColorBrewer) +library(tidyverse) +theme_set(theme_bw(base_size=14)) +colours <- colorRampPalette(brewer.pal(11, "Spectral"))(26) +names(colours) <- c("AG", "AI", "AR", "BE", "BL", "BS", "FR", "GE", "GL", + "GR", "JU", "LU", "NE", "NW", "OW", "SG", "SH", "SO", + "SZ", "TG", "TI", "UR", "VD", "VS", "ZG", "ZH") +``` + +# Data clean-up + +Read in the data. + +```{r readIn} +fls <- list.files("../../data/openzh-covid-19", pattern="Kanton_.*total.csv", full.names=TRUE) +dat <- lapply(fls, read_csv) +dat <- do.call(bind_rows, dat) +``` + +Fill-in data for missing dates. Decide how to handle incomplete data from last (current) day in case of summary stats for whole Switzerland. + +- `removeLast=TRUE` removes last (current day) +- `removeLast=FALSE` transfer the values from previous day for missing cantons + +```{r removeLast} +# only change next line +removeLast <- FALSE +# following lines define max date and offset +# for filtering out last day if requested +maxDate <- max(dat$date) +if (removeLast) { + offset <- 0 +} else { + offset <- 1 +} +``` + +Original data in the repo: + +| Field Name | Description | Format | +|----------------------------|--------------------------------------------------------------------------------------|------------| +| date | Date of notification | YYYY-MM-DD | +| time | Time of notification | HH:MM | +| abbreviation_canton_and_fl | Abbreviation of the reporting canton | Text | +| ncumul_tested | Reported number of tests performed as of date | Number | +| ncumul_conf | Reported number of confirmed cases as of date | Number | +| ncumul_hosp * | _Reported number of hospitalized patients on date_ | Number | +| ncumul_ICU * | _Reported number of hospitalized patients in ICUs on date_ | Number | +| ncumul_vent * | _Reported number of patients requiring invasive ventilation on date)_ | Number | +| ncumul_released | Reported number of patients released from hospitals or reported recovered as of date | Number | +| ncumul_deceased | Reported number of deceased as of date | Number | +| source | Source of the information | href | +| ncumul_ICF | ? | Number | +| ninst_ICU_intub | ? | Number | +| ncumul_deceased_suspect | ? | Number | +| TotalPosTests1 | ? | Number | +| TotalCured | ? | Number | + + +```{r cleanData} +dat <- dat %>% + rename(n_hosp=ncumul_hosp, # rename columns which contain count and cumulative count + n_ICU=ncumul_ICU, + n_vent=ncumul_vent) %>% + group_by(abbreviation_canton_and_fl, date) %>% # group by region and date + summarize_if(is.numeric, max, na.rm=TRUE) %>% # select max value (if there are several per day) + mutate_if(is.numeric, na_if, -Inf) %>% # replace the -Inf with NA to enable fill + complete(date=seq.Date(min(date), max(date), by=1)) %>% # add missing dates (values filled with NA's) + fill(starts_with("ncumul_")) %>% # fill NA's with previous value + mutate_at(vars(starts_with("ncumul_")), replace_na, 0) %>% # replace NA's with 0's only fro cumulative values + filter(abbreviation_canton_and_fl != "FL") # remove FL (Lichtenstein) + +# # replace all NA's with 0's +# dat <- dat %>% +# mutate_if(is.numeric, replace_na, 0) +``` + +Data after clean-up and summary: + +| Field Name | Description | Format | +|----------------------------|--------------------------------------------------------------------------------------|----------| +| abbreviation_canton_and_fl | Abbreviation of the reporting canton | `<chr>` | +| date | Date of notification | `<date>` | +| ncumul_tested | Reported number of tests performed as of date | `<dbl>` | +| ncumul_conf | Reported number of confirmed cases as of date | `<dbl>` | +| n_hosp * | _Reported number of hospitalized patients on date_ | `<dbl>` | +| n_ICU * | _Reported number of hospitalized patients in ICUs on date_ | `<dbl>` | +| n_vent * | _Reported number of patients requiring invasive ventilation on date)_ | `<dbl>` | +| ncumul_released | Reported number of patients released from hospitals or reported recovered as of date | `<dbl>` | +| ncumul_deceased | Reported number of deceased as of date | `<dbl>` | +| ncumul_ICF | ? | `<dbl>` | +| ninst_ICU_intub | ? | `<dbl>` | +| ncumul_deceased_suspect | ? | `<dbl>` | +| TotalPosTests1 | ? | `<dbl>` | +| TotalCured | ? | `<dbl>` | + + +## Select data for one canton (BS) + +```{r dataOneCanton} +dat %>% + filter(abbreviation_canton_and_fl=="BS") %>% # filter specific region + as.data.frame() +``` + + +# Plotting cumulative number of confirmed cases + +## For specific canton, Basel Stadt (BS) as an example. + +```{r plotOneCanton} +dat %>% + filter(abbreviation_canton_and_fl=="BS") %>% # filter specific region + ggplot(aes(x=date, y=ncumul_conf, colour=abbreviation_canton_and_fl)) + + geom_line(size=1) + scale_y_log10() + + scale_colour_manual(values=colours, guide=FALSE) + + labs(x="", y="No. of confirmed cases", title="Canton Basel Stadt (BS)") +``` + +## For several cantons, Basel Land (BL) and Basel Stadt (BS) as an example. + +```{r plotMultipleCantons} +dat %>% + filter(abbreviation_canton_and_fl %in% c("BL","BS")) %>% # filter specific region(s) + ggplot(aes(x=date, y=ncumul_conf)) + + geom_line(aes(colour=abbreviation_canton_and_fl), size=1) + scale_y_log10() + + scale_colour_manual(values=colours) + + labs(x="", y="No. of confirmed cases", title="Cantons Basel Land (BL) and Stadt (BS)") + + theme(legend.title=element_blank()) +``` + +## For all cantons. + +```{r plotAllCantons} +dat %>% + ggplot(aes(x=date, y=ncumul_conf)) + + geom_line(aes(colour=abbreviation_canton_and_fl), size=1) + scale_y_log10() + + scale_colour_manual(values=colours) + + labs(x="", y="No. of confirmed cases") + + theme(legend.title=element_blank()) +``` + + +## For whole Switzerland (summary) + +```{r plotSwitzerland} +dat %>% + mutate(n=1) %>% + complete(date=seq.Date(min(date), maxDate, by=1)) %>% # add missing dates (values filled with NA's) + fill(-c(date, abbreviation_canton_and_fl, n)) %>% # fill NA's with previous value + mutate_if(is.numeric, replace_na, 0) %>% # replace NA's at the beginning of the time-series + group_by(date) %>% # group by date and + summarize_if(is.numeric, sum) %>% # sum-up all cases per date from all regions + filter(date < maxDate+offset) %>% # if set, remove incomplete data for last day + mutate(status=ifelse(date > max(date[n==26]), "incomplete", "complete")) %>% + ggplot(aes(x=date, y=ncumul_conf, linetype=status)) + + geom_line(colour="steelblue", size=1) + scale_y_log10() + + scale_linetype(c("solid", "dotted"), guide=FALSE) + + labs(x="", y="No. of confirmed cases", title="Switzerland") +``` + +# Doubling rate + +Function to calculate doubling rate + +```{r doublingFun} +doubling <- function(x, days) { + vapply(seq_along(x), function(y, x, days) { + if (y <= days) return(NA) + if (x[y]<1) return(NA) + round((days*log(2))/(log(x[y]/x[y-days])), 2) + }, FUN.VALUE=numeric(1), x=x, days=days) +} +``` + + +## Per canton (simple) + +```{r doublingPerCantonSimple} +dat %>% + arrange(date) %>% # order by date + group_by(abbreviation_canton_and_fl) %>% # group by region + summarize_at(vars(starts_with("ncumul")), function(x) { # count number of days to double + sum(x >= x[length(x)]/2)-1 + }) %>% + rename_at(vars(starts_with("ncumul")), + str_replace, pattern="ncumul", "daysToDouble") %>% + select(c(daysToDouble_conf, daysToDouble_released, daysToDouble_deceased)) %>% + as.data.frame() +``` + +### Per canton + +```{r doublingPerCantonTable} +dat %>% + summarize_at(vars(starts_with("ncumul")), function(x) { # count number of days to double + doubling(x, days=5)[length(x)] + }) %>% + rename_at(vars(starts_with("ncumul")), + str_replace, pattern="ncumul", "daysToDouble") %>% + select(c(daysToDouble_conf, daysToDouble_released, daysToDouble_deceased)) %>% + as.data.frame() +``` + +### Plot per canton + +```{r doublingPerCantonHeatmap} +dat %>% + mutate_at(vars(starts_with("ncumul")), doubling, days=5) %>% + select(-c(ncumul_tested, starts_with("n_"))) %>% + filter(!(abbreviation_canton_and_fl=="SZ" & date=="2020-03-19")) %>% + ggplot(aes(x=date, y=fct_rev(abbreviation_canton_and_fl), fill=ncumul_conf)) + + geom_tile(color= "white",size=0.1) + + xlim(as.Date("2020-03-01"), NA) + + scale_fill_viridis_c(name="Doubling rate", + option="A", na.value="transparent", + direction=-1, trans="log2") + # log2 transformation to increase resolution at lower end + theme_minimal(base_size = 10) + + labs(title="Doubling rate of confirmed cases in cantons", x="", y="") + + theme(legend.position = "bottom") +``` + + +## Whole Switzerland (simple) + +```{r doublingSwitzerlandTableLast} +dat %>% + mutate(n=1) %>% + complete(date=seq.Date(min(date), maxDate, by=1)) %>% # add missing dates (values filled with NA's) + fill(-c(date, abbreviation_canton_and_fl, n)) %>% # fill NA's with previous value + mutate_if(is.numeric, replace_na, 0) %>% # replace NA's at the beginning of the time-series + arrange(date) %>% # order by date + group_by(date) %>% # group by date and + summarize_if(is.numeric, sum) %>% # sum-up all cases per date from all regions + filter(date < maxDate+offset) %>% # if set, remove incomplete data for last day + summarize_at(vars(starts_with("ncumul")), function(x) { # count number of days to double + sum(x >= x[length(x)]/2) -1 + }) %>% + rename_at(vars(starts_with("ncumul")), + str_replace, pattern="ncumul", "daysToDouble") %>% + select(c(daysToDouble_conf, daysToDouble_released, daysToDouble_deceased)) %>% + as.data.frame() +``` + +## Whole Switzerland + +```{r doublingSwitzerlandTable} +dat %>% + mutate(n=1) %>% + complete(date=seq.Date(min(date), maxDate, by=1)) %>% # add missing dates (values filled with NA's) + fill(-c(date, abbreviation_canton_and_fl, n)) %>% # fill NA's with previous value + mutate_if(is.numeric, replace_na, 0) %>% # replace NA's at the beginning of the time-series + arrange(date) %>% # order by date + group_by(date) %>% # group by date and + summarize_if(is.numeric, sum) %>% # sum-up all cases per date from all regions + filter(date < maxDate+0 ) %>% # remove incomplete data for last day !!! + summarize_at(vars(starts_with("ncumul")), function(x) { # count number of days to double + doubling(x, days=5)[length(x)] + }) %>% + rename_at(vars(starts_with("ncumul")), + str_replace, pattern="ncumul", "daysToDouble") %>% + select(c(daysToDouble_conf, daysToDouble_released, daysToDouble_deceased)) %>% + as.data.frame() +``` + +```{r} +dat %>% + mutate(n=1) %>% + complete(date=seq.Date(min(date), maxDate, by=1)) %>% # add missing dates (values filled with NA's) + fill(-c(date, abbreviation_canton_and_fl, n)) %>% # fill NA's with previous value + mutate_if(is.numeric, replace_na, 0) %>% # replace NA's at the beginning of the time-series + arrange(date) %>% # order by date + group_by(date) %>% # group by date and + summarize_if(is.numeric, sum) %>% # sum-up all cases per date from all regions + filter(date < maxDate+offset) %>% # if set, remove incomplete data for last day + mutate_at(vars(starts_with("ncumul")), doubling, days=5) %>% + select(-c(ncumul_tested, starts_with("n_"))) %>% + mutate(status=ifelse(date > max(date[n==26]), "incomplete", "complete")) %>% + ggplot(aes(x=date, y=ncumul_conf, fill=ncumul_conf, alpha=status)) + + geom_bar(stat="identity") + + xlim(as.Date("2020-03-01"), NA) + + scale_alpha_manual(values=c(1, 0.1), guide=FALSE) + + scale_fill_viridis_c(name="Doubling rate", + option="A", na.value="transparent", + direction=-1, trans="log2") + # log2 transformation to increase resolution at lower end + theme_minimal(base_size = 10) + + labs(title="Doubling rate of confirmed cases in Switzerland", x="", y="") + + theme(legend.position = "bottom") + +``` + -- GitLab