Andreas Bleuler
--- a/notebooks/datasets_summary.ipynb

+ 1

− 1
+++ b/notebooks/datasets_summary.ipynb

+ 1

− 1
 %% Cell type:code id: tags:

 ``` python
 %load_ext autoreload
 %autoreload 2
 ```

 %% Cell type:code id: tags:

 ``` python
 from IPython.display import display, HTML
 # HTML footer crediting the project behind the analysis; displayed below the charts in this notebook.
 renkulab_credits = HTML("""<p style="font-size: smaller">Analysis and Visualization:
  <a href="https://renkulab.io/projects/covid-19/covid-19-public-data">
  Covid-19 Public Data Collaboration Project @ renkulab.io
  </a>
  </p>
 """)
 # Data-source footer for charts that are based on the ECDC dataset only.
 ecdc_credits=HTML('''
 <p style="font-size: smaller">Data Sources:
  <a href="https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide">ECDC</a>
 </p>
 '''
 )
 # Data-source footer listing every source; displayed below the charts built from the combined regional data.
 all_credits=HTML("""
 <p style="font-size: smaller">Data Sources:
  <a href="https://www.ecdc.europa.eu/en/publications-data/download-todays-data-geographic-distribution-covid-19-cases-worldwide">ECDC</a>,
  <a href="https://github.com/pcm-dpc/COVID-19">Italian Civil Protection</a>,
  <a href="https://github.com/nytimes/covid-19-data">The New York Times</a>,
  <a href="https://github.com/datadista/datasets">Datadista</a>.
 </p>
 """)
 ```

 %% Cell type:markdown id: tags:

 # Global and Regional Covid-19 Case Count Data

 In this project we collect data about the ongoing Covid-19 pandemic from a variety of sources. Some aggregate worldwide country-level statistics while others provide fine-grained data about regions of individual countries.

 Here we summarize some of the available data using the simple standardized views we have built for these datasets, which allow us to use the same analysis tools across the heterogeneous datasets.

 %% Cell type:code id: tags:

 ``` python
 # Standard python imports - we will be using pandas for data wrangling and altair for visualization
 from datetime import datetime
 from pathlib import Path

 import altair as alt
 import pandas as pd
 ```

 %% Cell type:code id: tags:

 ``` python
 # Importing our custom tools for the datasets in this repository.
 from covid_19_utils import helper, plotting
 from covid_19_utils.converters import CaseConverter
 ```

-%% Cell type:code id: tags:
+%% Cell type:code id: tags:parameters

 ``` python
 # Input locations: `data_path` is the directory the case-count datasets are read from,
 # `atlas_path` is the atlas dataset handed to the CaseConverter.
 # NOTE(review): this cell carries the `parameters` tag, presumably so that a notebook
 # runner can override these values - confirm against the execution setup.
 data_path = "../data"
 atlas_path = "../data/atlas"
 ```

 %% Cell type:code id: tags:

 ``` python
 # Instantiating the CaseConverter to read in the data.
 # It takes a path to the atlas dataset as an argument - the atlas holds various population and country metadata.
 converter = CaseConverter(atlas_path)
 ```

 %% Cell type:markdown id: tags:

 ## Global deaths attributed to Covid-19

 How deadly has the pandemic been over the past few months? Where has it proven to be the deadliest? We investigate the data collected by the [European Centre for Disease Prevention and Control](https://www.ecdc.europa.eu/en) to gain some basic understanding of these questions.

 %% Cell type:code id: tags:

 ``` python
 # read in the data from the European Centre for Disease Prevention and Control
 global_df = converter.read_convert(Path(data_path) / "covid-19-ecdc")
 ```

 %% Cell type:code id: tags:

 ``` python
 # Plot the global daily deaths from Feb 1st, 2020 onwards, together with a 7-day mean
 start_date = datetime.fromisoformat('2020-02-01')

 # Add up all countries for each date. numeric_only=True restricts the sum to the
 # numeric columns: recent pandas versions no longer drop text columns (such as the
 # country labels) silently, so without it they would be summed/concatenated as well.
 global_daily_totals = global_df.groupby('date').sum(numeric_only=True).reset_index()

 # basic chart - restricted to the dates on or after the start date
 base = alt.Chart(global_daily_totals.loc[global_daily_totals.date >= start_date])

 chart = base.mark_line().encode(
    x=alt.X('date', title='Date'),
    y=alt.Y('deceased_daily', title='Daily deaths')
 )

 # 7-day mean - we use the altair window function to compute this directly;
 # the frame spans the three rows before and the three rows after each row
 running_mean = base.transform_window(
    mean_deceased='mean(deceased_daily)',
    frame=[-3, 3],
 ).mark_area(opacity=0.3, color='orange').encode(
    x='date',
    y='mean_deceased:Q'
 )

 # combine the charts (daily line, shaded mean area and the mean drawn again as a line)
 # and set the size and title
 combined = (
    chart +
    running_mean +
    running_mean.mark_line(color='orange')
 ).properties(
    width=600,
    height=350,
    title="Daily global deaths and 7-day mean"
 )

 display(combined)
 display(ecdc_credits)
 display(renkulab_credits)
 ```

 %% Output




 %% Cell type:markdown id: tags:

 ### Top ten countries by total number of deaths attributed to Covid-19

 %% Cell type:code id: tags:

 ``` python
 # Rank the countries by their maximum `deceased` value, largest first
 deaths_by_country = global_df.groupby("country_label")['deceased'].max()
 country_death_sort = deaths_by_country.sort_values(ascending=False).index.to_list()
 ```

 %% Cell type:code id: tags:

 ``` python
 # columns shown in the summary tables, and human-readable headers for display
 columns = ["deceased", "deceased_100k", "positive", "positive_100k"]
 pretty_print_columns = dict(
    deceased="Deaths",
    deceased_daily="Daily deaths",
    deceased_100k="Deaths per 100k",
    positive="Positive cases",
    positive_daily="Daily positive cases",
    positive_100k="Positive cases per 100k",
    country_label="Country",
    county_state="County, State",
    region_label="Region",
 )

 # global data indexed by country and ordered by the country ranking computed above
 global_sorted = global_df.set_index(["country_label"]).loc[country_death_sort]

 # keep only the rows of the most recent date, then show the first ten countries
 is_latest = global_sorted.date == global_sorted.date.max()
 latest_table = global_sorted.loc[is_latest][columns].reset_index()
 latest_table.rename(columns=pretty_print_columns).head(10)
 ```

 %% Output

                        Country  Deaths  Deaths per 100k  Positive cases  \
    0  United States of America   42539        13.002211          787752
    1                     Italy   24114        39.903174          181228
    2                     Spain   20852        44.628268          200210
    3                    France   20265        30.252028          114657
    4            United Kingdom   16509        24.829674          124743
    5                   Belgium    5828        51.024035           39983
    6                      Iran    5209         6.367950           83505
    7                     China    4636         0.332871           83849
    8                   Germany    4598         5.544574          143457
    9               Netherlands    3751        21.768883           33405
    
       Positive cases per 100k
    0               240.779466
    1               299.891035
    2               428.497294
    3               171.162438
    4               187.614518
    5               350.050446
    6               102.084016
    7                 6.020478
    8               172.989985
    9               193.865516

 %% Cell type:markdown id: tags:

 ## Growth of deaths in hardest-hit regions

 For the top three countries in the list above (the U.S., Italy and Spain) we have regional datasets - in each of them, specific regions have contributed disproportionately to the national death count.

 %% Cell type:code id: tags:

 ``` python
 # read in the regional data for the U.S., Italy and Spain
 us_df = converter.read_convert(Path(data_path) / "covid-19-us-nyt")
 spain_df = converter.read_convert(Path(data_path) / "covid-19-spain")
 italy_df = converter.read_convert(Path(data_path) / "covid-19-italy/dpc-covid19-ita-regioni.csv")

 # In the U.S. there are counties with the same name in different states - create a unique
 # "<county>, <state>" key. The two label columns are zipped directly instead of using a
 # row-wise DataFrame.apply: the formatting of every value is unchanged, but no Series is
 # built per row and an empty frame simply yields an empty column.
 us_df['county_state'] = [
    f"{county}, {state}"
    for county, state in zip(us_df['admin2_label'], us_df['region_label'])
 ]
 ```

 %% Cell type:code id: tags:

 ``` python
 # In each country, find the five most affected regions.
 # Note that in these data we have two levels of sub-regions.
 # For the U.S. data this means that "regions" are "states" and "admin2" refers to "counties".
 ```

 %% Cell type:code id: tags:

 ``` python
 # we define a simple function to return the top five "keys" based on a particular "field"

 def get_top5(df, key="region_label", field="deceased", n=5):
    """Return the rows of the ``n`` groups with the largest ``field`` value.

    The frame is grouped by ``key`` and reduced to the last entry of each
    group (in the frame's row order), so the result reflects the most recent
    values only if the rows of each group are ordered chronologically -
    NOTE(review): confirm that the converter delivers them in date order.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing at least the ``key`` and ``field`` columns.
    key : str
        Column identifying the groups (e.g. regions) to rank.
    field : str
        Column the groups are ranked by, in descending order.
    n : int
        Number of groups to return; defaults to 5, the original behaviour.

    Returns
    -------
    pandas.DataFrame
        At most ``n`` rows, one per group, with ``key`` as a regular column
        and a fresh 0-based index.
    """
    latest = df.groupby(key).last()
    ranked = latest.sort_values(by=field, ascending=False).reset_index()
    return ranked.iloc[:n]
 ```

 %% Cell type:code id: tags:

 ``` python
 # get the top 5 entries by deaths: counties for the U.S., regions for Spain and Italy
 top5_us = get_top5(us_df, key="county_state")
 top5_spain = get_top5(spain_df)
 top5_italy = get_top5(italy_df)

 # date format shared by all the "based on data from" captions below
 date_format = "%b. %d, %Y"
 # NOTE(review): `date` is not read by any of the cells visible here - kept in case another cell uses it
 date = top5_spain.iloc[0].date.strftime(date_format)


 def _top5_table_html(top5, key_column):
    """Render ``key_column`` plus the summary ``columns`` of a top-5 frame as an HTML table."""
    table = top5[[key_column] + columns].rename(columns=pretty_print_columns)
    return table.to_html(index=False)


 # we will use HTML renderings of these DataFrames to make them look nice on the final notebook
 top5_us_html = _top5_table_html(top5_us, 'county_state')
 top5_spain_html = _top5_table_html(top5_spain, 'region_label')
 top5_italy_html = _top5_table_html(top5_italy, 'region_label')

 # display some generic text - we use HTML display for consistency with the rest of the notebook
 display(
    HTML(
        """
        <h3>
        Cumulative deaths attributed to Covid-19 in U.S., Spain and Italy
        </h3>
        """
    )
 )

 # one table per country; each caption shows the date of the first (top-ranked) row of that table
 display(
    HTML(
        f"""
        <h4>United States</h4>
        <p style="font-size: smaller">(based on data from {top5_us.iloc[0].date.strftime(date_format)})</p>
        {top5_us_html}

        <h4>Spain</h4>
        <p style="font-size: smaller">(based on data from {top5_spain.iloc[0].date.strftime(date_format)})</p>
        {top5_spain_html}

        <h4>Italy</h4>
        <p style="font-size: smaller">(based on data from {top5_italy.iloc[0].date.strftime(date_format)})</p>
        {top5_italy_html}
        """
    )
 )


 ```

 %% Output



 %% Cell type:code id: tags:

 ``` python
 #
 # Build a single DataFrame holding all three regional datasets so they can be plotted together
 #

 # give every dataset the same column to select regions by
 for frame, label_column in (
    (us_df, 'county_state'),
    (spain_df, 'region_label'),
    (italy_df, 'region_label'),
 ):
    frame['plot_key'] = frame[label_column]

 # collect the names of the top-5 regions of every country in one list
 top5_regions = [
    *top5_us.county_state.to_list(),
    *top5_spain.region_label.to_list(),
    *top5_italy.region_label.to_list(),
 ]
 combined_df = pd.concat([us_df, spain_df, italy_df])
 ```

 %% Cell type:code id: tags:

 ``` python
 # generate a simple plot comparing the per-capita cumulative deaths of the selected regions

 top5_rows = combined_df[combined_df.plot_key.isin(top5_regions)]
 base = alt.Chart(top5_rows)
 chart_title = 'Total deaths per 100k population in the top 5 regions from the U.S., Spain and Italy'

 deaths_100k = plotting.generate_region_chart(
    base,
    column='deceased_100k',
    region_column='plot_key',
    ytitle='Deaths per 100k population',
    legend_title='Region',
    tooltip_title='Deaths/100k'
 )
 # size and title first, then left-align the title
 deaths_100k = deaths_100k.properties(height=350, width=600, title=chart_title)
 deaths_100k = deaths_100k.configure_title(anchor='start')

 display(deaths_100k)
 display(all_credits)
 display(renkulab_credits)
 ```

 %% Output




 %% Cell type:code id: tags:

 ``` python
 # Now for a slightly more complicated plot, we make use of some of the helper functions

 # Making a plot of the evolution of the death count since the 10th death in each region.
 # The helper receives the rows of the top-5 regions, the 'deceased' column and the
 # 'plot_key' column identifying each region.
 # NOTE(review): the threshold of 10 deaths is not passed here - presumably it is the
 # helper's default; confirm in helper.make_since_df.
 since_10th_death = helper.make_since_df(
    combined_df[combined_df.plot_key.isin(top5_regions)],
    'deceased',
    'plot_key'
 )

 # base chart with a logarithmic y-axis scale; the marks are added by the plotting helper below
 base = alt.Chart(
    since_10th_death,
    title="Top 5 regions from each country: total deaths since 10th death"
 ).encode(
    alt.Y(scale=alt.Scale(type='log'))
 )

 # The arguments are positional: the plotted column, the 'sinceDay0' column, the region
 # column, followed by four display labels.
 # NOTE(review): 'Cases' looks like a label for a chart of deaths - verify which label the
 # seventh argument of plotting.make_region_since_chart sets and whether it should read 'Deaths'.
 line_chart = plotting.make_region_since_chart(
    base,
    'deceased',
    'sinceDay0',
    'plot_key',
    'Days since 10th death',
    'Cumulative deaths',
    'Cases',
    'Region'
 ).properties(
    width=600,
    height=350
 ).configure_title(
    anchor='start'
 )
 display(line_chart)
 display(all_credits)
 display(renkulab_credits)
 ```

 %% Output




 %% Cell type:code id: tags:

 ``` python
 []
 ```