In [1]:
import pandas as pd
import os
from IPython.display import display, HTML, Markdown

In [2]:
# Default input locations, given relative to the working directory.
# A later cell in this notebook reassigns ts_folder, rates_folder,
# geodata_path and PAPERMILL_OUTPUT_PATH.
# NOTE(review): this looks like the papermill "parameters" cell — confirm
# before changing the form of these assignments.
ts_folder = "../data/covid-19_jhu-csse/"  # folder read by read_jhu_covid_df
rates_folder = "../data/covid-19_rates/"  # folder read by read_rates_covid_df
geodata_path = "../data/geodata/geo_data.csv"  # CSV loaded into geodata_df
out_folder = None  # not used in the cells shown here
PAPERMILL_OUTPUT_PATH = None

In [3]:
# Parameters
# NOTE(review): these values override the defaults assigned in the previous
# cell with absolute /tmp paths; presumably this cell was injected by a
# papermill run rather than written by hand — confirm before editing.
PAPERMILL_INPUT_PATH = "/tmp/e5179t23/notebooks/Dashboard.ipynb"
PAPERMILL_OUTPUT_PATH = "runs/Dashboard.run.ipynb"
ts_folder = "/tmp/e5179t23/data/covid-19_jhu-csse"
rates_folder = "/tmp/e5179t23/data/covid-19_rates"
geodata_path = "/tmp/e5179t23/data/geodata/geo_data.csv"


In [4]:
# Read in the data

In [5]:
def read_jhu_covid_df(name, folder=None):
    """Load one JHU CSSE COVID-19 time-series CSV into a DataFrame.

    Parameters
    ----------
    name : str
        Suffix of the file to load; the file read is
        ``time_series_19-covid-{name}.csv``.
    folder : str, optional
        Directory that contains the file. When omitted, the notebook-level
        ``ts_folder`` setting is used, so existing one-argument calls keep
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ('Province/State', 'Country/Region', 'Lat', 'Long');
        the labels of all remaining columns are converted with
        ``pd.to_datetime``.
    """
    # Fall back to the notebook configuration only when no folder is given,
    # so the reader can also be used (and checked) without that global.
    if folder is None:
        folder = ts_folder
    filename = os.path.join(folder, f"time_series_19-covid-{name}.csv")
    df = pd.read_csv(filename)
    df = df.set_index(['Province/State', 'Country/Region', 'Lat', 'Long'])
    # Everything left after moving the location fields to the index is a
    # per-date column, so the labels can be parsed as dates in one go.
    df.columns = pd.to_datetime(df.columns)
    return df


# Load the three JHU series; each key is the lower-cased file suffix.
jhu_frames_map = {
    series.lower(): read_jhu_covid_df(series)
    for series in ("Confirmed", "Deaths", "Recovered")
}

In [6]:
def read_rates_covid_df(name, folder=None):
    """Load one per-country rates time-series CSV into a DataFrame.

    Parameters
    ----------
    name : str
        Suffix of the file to load; the file read is
        ``ts_rates_19-covid-{name}.csv``.
    folder : str, optional
        Directory that contains the file. When omitted, the notebook-level
        ``rates_folder`` setting is used, so existing one-argument calls keep
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by 'Country/Region'; the labels of all remaining
        columns are converted with ``pd.to_datetime``.

    Raises
    ------
    KeyError
        If the CSV has no "Unnamed: 0" column to drop.
    """
    # Fall back to the notebook configuration only when no folder is given,
    # so the reader can also be used (and checked) without that global.
    if folder is None:
        folder = rates_folder
    filename = os.path.join(folder, f"ts_rates_19-covid-{name}.csv")
    # "Unnamed: 0" is the label pandas gives a column whose header is empty;
    # it is discarded before the country column becomes the index.
    df = pd.read_csv(filename).drop("Unnamed: 0", axis=1)
    df = df.set_index(['Country/Region'])
    df.columns = pd.to_datetime(df.columns)
    return df


# Load the three rates series, keyed by the same name used in the file name.
rates_frames_map = {
    series: read_rates_covid_df(series)
    for series in ("confirmed", "deaths", "recovered")
}

In [7]:
# Load the geodata table and key it by country under the same
# 'Country/Region' label that the other frames use.
geodata_df = (
    pd.read_csv(geodata_path)
    .drop(columns='Unnamed: 0')
    .rename(columns={'name_jhu': 'Country/Region'})
    .set_index('Country/Region')
)

In [8]:
# Identify countries with 100 or more cases
def latest_jhu_country_ser(name):
    """Return the last column of a JHU frame, summed per country.

    ``name`` is a key of ``jhu_frames_map``. Rows are grouped on the
    'Country/Region' index level and summed, giving one value per country.
    """
    frame = jhu_frames_map[name]
    latest_column = frame.iloc[:, -1]
    per_country = latest_column.groupby(level='Country/Region')
    return per_country.sum()


# Latest confirmed count per country, then the labels of the countries whose
# count is greater than 99.
case_count_ser = latest_jhu_country_ser('confirmed')
over_thresh_mask = case_count_ser.gt(99)
countries_over_thresh = case_count_ser.loc[over_thresh_mask].index

# Questions About COVID-19 and Its Spread

Understanding the spread, distribution, and deadliness of COVID-19 is difficult, despite the data available about it. Differences in rates of testing, quality of data, demographics, etc. make it difficult to compare data between countries. 

All this needs to be considered when looking at the plots below. But despite those caveats, I found it helpful to plot the raw data, even though direct comparisons between countries might not be accurate.

In [9]:
# Caption the report with the label of the last column of the confirmed frame.
latest_date = jhu_frames_map['confirmed'].columns[-1]
data_ts = latest_date.strftime("%b %d %Y")
caption_html = f"<em>Data up to {data_ts}; countries with 100 or more confirmed cases.</em>"
display(HTML(caption_html))

## How are cases per 100,000 distributed geographically?

In [10]:
import altair as alt
from vega_datasets import data

In [11]:
def latest_rates_ser(name):
    """Return the last column of the rates frame stored under ``name``.

    ``name`` is a key of ``rates_frames_map``.
    """
    rates_frame = rates_frames_map[name]
    return rates_frame.iloc[:, -1]


# Assemble the plotting table: latest rates, latest counts and coordinates,
# placed side by side on the country index.
series_kinds = ('confirmed', 'deaths', 'recovered')
latest_rates_df = pd.concat(
    [latest_rates_ser(kind) for kind in series_kinds], axis=1)
nominal_df = pd.concat(
    [latest_jhu_country_ser(kind) for kind in series_kinds], axis=1)
coordinates_df = geodata_df[['Longitude', 'Latitude']]

# Keep only the countries over the case threshold, drop rows with any missing
# value, and turn the country index back into an ordinary column.
map_df = (
    pd.concat([latest_rates_df, nominal_df, coordinates_df], axis=1)
    .loc[countries_over_thresh]
    .dropna()
    .reset_index()
)

# Columns are relabelled by position, in the order they were concatenated.
map_df.columns = [
    'Country/Region',
    'Confirmed/100k', 'Deaths/100k', 'Recovered/100k',
    'Confirmed', 'Deaths', 'Recovered',
    'Long', 'Lat',
]

In [12]:
def map_of_variable(map_df, variable, title):
    """Build a world map with one circle per row of ``map_df``.

    Parameters
    ----------
    map_df : pandas.DataFrame
        Must provide the 'Long' and 'Lat' columns, the column named by
        ``variable``, and the columns listed in the tooltip.
    variable : str
        Name of the quantitative column that drives the circle size.
    title : str
        Prefix of the chart title ("<title> cases per 100k inhabitants").

    Returns
    -------
    A layered Altair chart in the 'naturalEarth1' projection.
    """
    # Background generators and the source of the country outlines.
    globe = alt.sphere()
    grid = alt.graticule()
    countries = alt.topo_feature(data.world_110m.url, 'countries')

    # Fields shown when hovering over a circle.
    tooltip_fields = [
        "Country/Region:N",
        "Confirmed:Q", "Deaths:Q", "Recovered:Q",
        "Confirmed/100k:Q", "Deaths/100k:Q", "Recovered/100k:Q",
    ]

    # Layers are listed in the order they are passed to alt.layer.
    ocean_layer = alt.Chart(globe).mark_geoshape(fill='#cae6ef')
    grid_layer = alt.Chart(grid).mark_geoshape(stroke='white', strokeWidth=0.5)
    land_layer = alt.Chart(countries).mark_geoshape(fill='#dddddd', stroke='#aaaaaa')
    circle_layer = alt.Chart(map_df).mark_circle(opacity=0.6).encode(
        longitude='Long:Q',
        latitude='Lat:Q',
        size=alt.Size(f'{variable}:Q', title="Cases"),
        color=alt.value('steelblue'),
        tooltip=tooltip_fields,
    )

    chart = alt.layer(ocean_layer, grid_layer, land_layer, circle_layer)
    chart = chart.project('naturalEarth1')
    chart = chart.properties(
        width=600,
        height=400,
        title=f"{title} cases per 100k inhabitants",
    )
    return chart.configure_view(stroke=None)

In [13]:
# Render the confirmed-rate map, followed by the data-source attribution.
confirmed_map = map_of_variable(map_df, 'Confirmed/100k', 'Confirmed')
display(confirmed_map)

sources_html = '''
<p style="font-size: smaller">Data Sources: 
  <a href="https://github.com/CSSEGISandData/COVID-19">JHU CSSE</a>,
  <a href="https://data.worldbank.org/indicator/SP.POP.TOTL">World Bank</a>,
  <a href="https://worldmap.harvard.edu/data/geonode:country_centroids_az8">Harvard Worldmap</a>
</p>'''
display(HTML(sources_html))

In [14]:
# Fields shown when hovering over a bar.
bar_tooltip = [
    "Country/Region:N",
    "Confirmed:Q", "Deaths:Q", "Recovered:Q",
    "Confirmed/100k:Q", "Deaths/100k:Q", "Recovered/100k:Q",
]

# One horizontal bar per country; the y axis is sorted by the x value ('-x').
bars = alt.Chart(map_df).mark_bar().encode(
    x='Confirmed/100k:Q',
    y=alt.Y("Country/Region:N", sort='-x'),
    tooltip=bar_tooltip,
)

# Value labels reuse the bar encodings; dx nudges the text to the right so it
# does not sit on top of the bar.
label_style = dict(align='left', baseline='middle', dx=3)
text = bars.mark_text(**label_style).encode(
    text=alt.Text('Confirmed/100k:Q', format=".3")
)

(bars + text).properties(height=900)