From 602219adde240205f7cfe80aca760218848fc01c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Robert=20Iv=C3=A1nek?= <robert.ivanek@unibas.ch>
Date: Wed, 1 Apr 2020 18:13:03 +0200
Subject: [PATCH] Added example for processing Swiss data in R. Rmd document.

---
 notebooks/examples-Rmd/visualise_swiss.Rmd | 302 +++++++++++++++++++++
 1 file changed, 302 insertions(+)
 create mode 100644 notebooks/examples-Rmd/visualise_swiss.Rmd

diff --git a/notebooks/examples-Rmd/visualise_swiss.Rmd b/notebooks/examples-Rmd/visualise_swiss.Rmd
new file mode 100644
index 0000000..e356927
--- /dev/null
+++ b/notebooks/examples-Rmd/visualise_swiss.Rmd
@@ -0,0 +1,302 @@
+---
+title: "COVID-19 Cases in Switzerland"
+output: 
+  github_document: default
+keep_md: true
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, 
+                      warning=FALSE, 
+                      message=FALSE)
+library(RColorBrewer)
+library(tidyverse)
+theme_set(theme_bw(base_size=14))
+colours <- colorRampPalette(brewer.pal(11, "Spectral"))(26)
+names(colours) <- c("AG", "AI", "AR", "BE", "BL", "BS", "FR", "GE", "GL", 
+                    "GR", "JU", "LU", "NE", "NW", "OW", "SG", "SH", "SO", 
+                    "SZ", "TG", "TI", "UR", "VD", "VS", "ZG", "ZH")
+```
+
+# Data clean-up
+
+Read in the data.
+
+```{r readIn}
+fls <- list.files("../../data/openzh-covid-19", pattern="Kanton_.*total.csv", full.names=TRUE)
+dat <- lapply(fls, read_csv)
+dat <- do.call(bind_rows, dat)
+```
+
+Fill-in data for missing dates. Decide how to handle incomplete data from last (current) day in case of summary stats for whole Switzerland.
+
+- `removeLast=TRUE` removes last (current day)
+- `removeLast=FALSE` transfer the values from previous day for missing cantons
+
+```{r removeLast}
+# only change next line
+removeLast <- FALSE
+# following lines define max date and offset 
+# for filtering out last day if requested
+maxDate <- max(dat$date)
+if (removeLast) {
+  offset <- 0
+} else {
+  offset <- 1
+}
+```
+
+Original data in the repo:
+
+| Field Name                 | Description                                                                          | Format     |
+|----------------------------|--------------------------------------------------------------------------------------|------------|
+| date                       | Date of notification                                                                 | YYYY-MM-DD |
+| time                       | Time of notification                                                                 | HH:MM      |
+| abbreviation_canton_and_fl | Abbreviation of the reporting canton                                                 | Text       |
+| ncumul_tested              | Reported number of tests performed as of date                                        | Number     |
+| ncumul_conf                | Reported number of confirmed cases as of date                                        | Number     |
+| ncumul_hosp *              | _Reported number of hospitalized patients on date_                                   | Number     |
+| ncumul_ICU  *              | _Reported number of hospitalized patients in ICUs on date_                           | Number     |
+| ncumul_vent *              | _Reported number of patients requiring invasive ventilation on date)_                | Number     |
+| ncumul_released            | Reported number of patients released from hospitals or reported recovered as of date | Number     |
+| ncumul_deceased            | Reported number of deceased as of date                                               | Number     |
+| source                     | Source of the information                                                            | href       |
+| ncumul_ICF                 | ?                                                                                    | Number     |
+| ninst_ICU_intub            | ?                                                                                    | Number     |
+| ncumul_deceased_suspect    | ?                                                                                    | Number     |
+| TotalPosTests1             | ?                                                                                    | Number     |
+| TotalCured                 | ?                                                                                    | Number     |
+ 
+
+```{r cleanData}
+dat <- dat %>% 
+  rename(n_hosp=ncumul_hosp,                                  # rename columns which contain count and cumulative count
+         n_ICU=ncumul_ICU,
+         n_vent=ncumul_vent) %>%
+  group_by(abbreviation_canton_and_fl, date) %>%              # group by region and date
+  summarize_if(is.numeric, max, na.rm=TRUE) %>%               # select max value (if there are several per day)
+  mutate_if(is.numeric, na_if, -Inf) %>%                      # replace the -Inf with NA to enable fill
+  complete(date=seq.Date(min(date), max(date), by=1)) %>%     # add missing dates (values filled with NA's)
+  fill(starts_with("ncumul_")) %>%                            # fill NA's with previous value
+  mutate_at(vars(starts_with("ncumul_")), replace_na, 0) %>%  # replace NA's with 0's only fro cumulative values
+  filter(abbreviation_canton_and_fl != "FL")                  # remove FL (Lichtenstein)
+
+# # replace all NA's with 0's
+# dat <- dat %>%
+#   mutate_if(is.numeric, replace_na, 0) 
+```
+
+Data after clean-up and summary:
+
+| Field Name                 | Description                                                                          | Format   |
+|----------------------------|--------------------------------------------------------------------------------------|----------|
+| abbreviation_canton_and_fl | Abbreviation of the reporting canton                                                 | `<chr>`  |
+| date                       | Date of notification                                                                 | `<date>` |
+| ncumul_tested              | Reported number of tests performed as of date                                        | `<dbl>`  |
+| ncumul_conf                | Reported number of confirmed cases as of date                                        | `<dbl>`  |
+| n_hosp *                   | _Reported number of hospitalized patients on date_                                   | `<dbl>`  |
+| n_ICU  *                   | _Reported number of hospitalized patients in ICUs on date_                           | `<dbl>`  |
+| n_vent *                   | _Reported number of patients requiring invasive ventilation on date)_                | `<dbl>`  |
+| ncumul_released            | Reported number of patients released from hospitals or reported recovered as of date | `<dbl>`  |
+| ncumul_deceased            | Reported number of deceased as of date                                               | `<dbl>`  |
+| ncumul_ICF                 | ?                                                                                    | `<dbl>`  |
+| ninst_ICU_intub            | ?                                                                                    | `<dbl>`  |
+| ncumul_deceased_suspect    | ?                                                                                    | `<dbl>`  |
+| TotalPosTests1             | ?                                                                                    | `<dbl>`  |
+| TotalCured                 | ?                                                                                    | `<dbl>`  |
+
+
+## Select data for one canton (BS)
+
+```{r dataOneCanton}
+dat %>%
+  filter(abbreviation_canton_and_fl=="BS") %>%               # filter specific region
+  as.data.frame()
+```
+
+
+# Plotting cumulative number of confirmed cases
+
+## For specific canton,  Basel Stadt (BS) as an example.
+
+```{r plotOneCanton}
+dat %>%
+  filter(abbreviation_canton_and_fl=="BS") %>%               # filter specific region
+  ggplot(aes(x=date, y=ncumul_conf, colour=abbreviation_canton_and_fl)) +
+  geom_line(size=1) + scale_y_log10() +
+  scale_colour_manual(values=colours, guide=FALSE) +
+  labs(x="", y="No. of confirmed cases", title="Canton Basel Stadt (BS)")
+```
+
+## For several cantons,  Basel Land (BL) and Basel Stadt (BS) as an example.
+
+```{r plotMultipleCantons}
+dat %>%
+  filter(abbreviation_canton_and_fl %in% c("BL","BS")) %>%    # filter specific region(s)
+  ggplot(aes(x=date, y=ncumul_conf)) +
+  geom_line(aes(colour=abbreviation_canton_and_fl), size=1) + scale_y_log10() +
+  scale_colour_manual(values=colours) +
+  labs(x="", y="No. of confirmed cases", title="Cantons Basel Land (BL) and Stadt (BS)") +
+  theme(legend.title=element_blank())
+```
+
+## For all cantons.
+
+```{r plotAllCantons}
+dat %>%
+  ggplot(aes(x=date, y=ncumul_conf)) +
+  geom_line(aes(colour=abbreviation_canton_and_fl), size=1) + scale_y_log10() +
+  scale_colour_manual(values=colours) +
+  labs(x="", y="No. of confirmed cases") +
+  theme(legend.title=element_blank())
+```
+
+
+## For whole Switzerland (summary)
+
+```{r plotSwitzerland}
+dat %>%
+  mutate(n=1) %>%
+  complete(date=seq.Date(min(date), maxDate, by=1)) %>%   # add missing dates (values filled with NA's) 
+  fill(-c(date, abbreviation_canton_and_fl, n)) %>%       # fill NA's with previous value
+  mutate_if(is.numeric, replace_na, 0) %>%                # replace NA's at the beginning of the time-series
+  group_by(date) %>%                                      # group by date and
+  summarize_if(is.numeric, sum) %>%                       # sum-up all cases per date from all regions
+  filter(date < maxDate+offset) %>%                       # if set, remove incomplete data for last day
+  mutate(status=ifelse(date > max(date[n==26]), "incomplete", "complete")) %>%
+  ggplot(aes(x=date, y=ncumul_conf, linetype=status)) +
+  geom_line(colour="steelblue", size=1) + scale_y_log10() +
+  scale_linetype(c("solid", "dotted"), guide=FALSE) +
+  labs(x="", y="No. of confirmed cases", title="Switzerland")
+```
+
+# Doubling rate
+
+Function to calculate doubling rate
+
+```{r doublingFun}
+doubling <- function(x, days) {
+  vapply(seq_along(x), function(y, x, days) {
+    if (y <= days) return(NA)
+    if (x[y]<1) return(NA)
+    round((days*log(2))/(log(x[y]/x[y-days])), 2)
+  }, FUN.VALUE=numeric(1), x=x, days=days)
+}
+```
+
+
+## Per canton (simple)
+
+```{r doublingPerCantonSimple}
+dat %>% 
+  arrange(date) %>%                                              # order by date
+  group_by(abbreviation_canton_and_fl) %>%                       # group by region
+  summarize_at(vars(starts_with("ncumul")), function(x) {        # count number of days to double
+    sum(x >= x[length(x)]/2)-1
+    }) %>% 
+  rename_at(vars(starts_with("ncumul")), 
+            str_replace, pattern="ncumul", "daysToDouble") %>%
+  select(c(daysToDouble_conf, daysToDouble_released, daysToDouble_deceased)) %>%
+  as.data.frame()
+```
+
+### Per canton 
+
+```{r doublingPerCantonTable}
+dat %>% 
+  summarize_at(vars(starts_with("ncumul")), function(x) {        # count number of days to double
+    doubling(x, days=5)[length(x)]
+    }) %>% 
+  rename_at(vars(starts_with("ncumul")), 
+            str_replace, pattern="ncumul", "daysToDouble") %>%
+  select(c(daysToDouble_conf, daysToDouble_released, daysToDouble_deceased)) %>%
+  as.data.frame()
+```
+
+### Plot per canton
+
+```{r doublingPerCantonHeatmap}
+dat %>% 
+  mutate_at(vars(starts_with("ncumul")), doubling, days=5) %>% 
+  select(-c(ncumul_tested, starts_with("n_"))) %>%
+  filter(!(abbreviation_canton_and_fl=="SZ" & date=="2020-03-19")) %>%
+  ggplot(aes(x=date, y=fct_rev(abbreviation_canton_and_fl), fill=ncumul_conf)) +
+  geom_tile(color= "white",size=0.1) + 
+  xlim(as.Date("2020-03-01"), NA) +
+  scale_fill_viridis_c(name="Doubling rate",
+                       option="A", na.value="transparent",
+                       direction=-1, trans="log2") +             # log2 transformation to increase resolution at lower end
+  theme_minimal(base_size = 10) + 
+  labs(title="Doubling rate of confirmed cases in cantons", x="", y="") +
+  theme(legend.position = "bottom") 
+```
+
+
+## Whole Switzerland (simple)
+
+```{r doublingSwitzerlandTableLast}
+dat %>% 
+  mutate(n=1) %>%
+  complete(date=seq.Date(min(date), maxDate, by=1)) %>%          # add missing dates (values filled with NA's) 
+  fill(-c(date, abbreviation_canton_and_fl, n)) %>%              # fill NA's with previous value
+  mutate_if(is.numeric, replace_na, 0) %>%                       # replace NA's at the beginning of the time-series
+  arrange(date) %>%                                              # order by date
+  group_by(date) %>%                                             # group by date and
+  summarize_if(is.numeric, sum) %>%                              # sum-up all cases per date from all regions
+  filter(date < maxDate+offset) %>%                              # if set, remove incomplete data for last day
+  summarize_at(vars(starts_with("ncumul")), function(x) {        # count number of days to double
+    sum(x >= x[length(x)]/2) -1
+    }) %>% 
+  rename_at(vars(starts_with("ncumul")), 
+            str_replace, pattern="ncumul", "daysToDouble") %>%
+  select(c(daysToDouble_conf, daysToDouble_released, daysToDouble_deceased)) %>%
+  as.data.frame()
+```
+
+## Whole Switzerland
+
+```{r doublingSwitzerlandTable}
+dat %>% 
+  mutate(n=1) %>%
+  complete(date=seq.Date(min(date), maxDate, by=1)) %>%                   # add missing dates (values filled with NA's) 
+  fill(-c(date, abbreviation_canton_and_fl, n)) %>%                       # fill NA's with previous value
+  mutate_if(is.numeric, replace_na, 0) %>%                                # replace NA's at the beginning of the time-series
+  arrange(date) %>%                                                       # order by date
+  group_by(date) %>%                                                      # group by date and
+  summarize_if(is.numeric, sum) %>%                                       # sum-up all cases per date from all regions
+  filter(date < maxDate+0     ) %>%                                       # remove incomplete data for last day !!! 
+  summarize_at(vars(starts_with("ncumul")), function(x) {                 # count number of days to double
+    doubling(x, days=5)[length(x)]
+    }) %>% 
+  rename_at(vars(starts_with("ncumul")), 
+            str_replace, pattern="ncumul", "daysToDouble") %>%
+  select(c(daysToDouble_conf, daysToDouble_released, daysToDouble_deceased)) %>%
+  as.data.frame()
+```
+
+```{r}
+dat %>% 
+  mutate(n=1) %>%
+  complete(date=seq.Date(min(date), maxDate, by=1)) %>%                   # add missing dates (values filled with NA's) 
+  fill(-c(date, abbreviation_canton_and_fl, n)) %>%                       # fill NA's with previous value
+  mutate_if(is.numeric, replace_na, 0) %>%                                # replace NA's at the beginning of the time-series
+  arrange(date) %>%                                                       # order by date
+  group_by(date) %>%                                                      # group by date and
+  summarize_if(is.numeric, sum) %>%                                       # sum-up all cases per date from all regions
+  filter(date < maxDate+offset) %>%                                       # if set, remove incomplete data for last day
+  mutate_at(vars(starts_with("ncumul")), doubling, days=5) %>% 
+  select(-c(ncumul_tested, starts_with("n_"))) %>%
+  mutate(status=ifelse(date > max(date[n==26]), "incomplete", "complete")) %>%
+  ggplot(aes(x=date, y=ncumul_conf, fill=ncumul_conf, alpha=status)) +
+  geom_bar(stat="identity") +
+  xlim(as.Date("2020-03-01"), NA) +
+  scale_alpha_manual(values=c(1, 0.1), guide=FALSE) +
+  scale_fill_viridis_c(name="Doubling rate",
+                       option="A", na.value="transparent",
+                       direction=-1, trans="log2") +             # log2 transformation to increase resolution at lower end
+  theme_minimal(base_size = 10) + 
+  labs(title="Doubling rate of confirmed cases in Switzerland", x="", y="") +
+  theme(legend.position = "bottom") 
+
+```
+
-- 
GitLab