In [1]:
import pandas as pd
import altair as alt
from IPython.display import display, HTML

In [2]:
# Attribution footer (data source and analysis project links); it is passed to
# display() after each chart in this notebook.
_credits_markup = '''
<p style="font-size: smaller">Data Sources: 
  <a href="https://covidtracking.com">The COVID Tracking Project</a>
<br>
Analysis and Visualization:
  <a href="https://renkulab.io/projects/covid-19/covid-19-public-data">Covid-19 Public Data Collaboration Project</a>
</p>'''
html_credits = HTML(_credits_markup)

In [21]:
# Read population data, indexed by the 'ST' column.
# NOTE(review): the lookups below assume 'ST' holds the same state codes as the
# 'state' column of the covidtracking data, one row per code - confirm.
pop_df = pd.read_csv('../data/geodata/us_pop_fung_2019.csv').set_index('ST')

# Read state-level data
data_df = pd.read_json('../data/covidtracking/states-daily.json')
data_df['date'] = pd.to_datetime(data_df['date'], format="%Y%m%d")
data_df['ratio'] = data_df['positive']/data_df['total']

# Compute daily differences.
# Rows are ordered newest-first within each state, so diff(periods=-1)
# subtracts the following row (the previous reported date) from each row.
tdf = data_df.sort_values(['state', 'date'], ascending=[True, False]).set_index(['state', 'date'])
diffs_df = tdf[['positive', 'negative', 'death']].groupby(level='state').diff(periods=-1).dropna(how='all')
tdf_diff = tdf.join(diffs_df, rsuffix='_diff').reset_index()

# Incidence rates per 100k inhabitants.
# Look the population up per row instead of aligning a duplicate-labelled
# state index against pop_df; states missing from pop_df simply yield NaN.
state_population = tdf_diff['state'].map(pop_df['Population'])
tdf_diff['positive_diff_100k'] = (tdf_diff['positive_diff'] / state_population) * 100000
tdf_diff['death_diff_100k'] = (tdf_diff['death_diff'] / state_population) * 100000

# "Normalizing" the totals
tdf_diff['total_10'] = tdf_diff['total']/10.

# Daily totals over all states.
# numeric_only=True keeps string columns (e.g. 'state') out of the sum.
daily_totals = tdf_diff.groupby('date').sum(numeric_only=True).reset_index()
# A sum of per-state ratios is meaningless; recompute from the summed counts.
daily_totals['ratio'] = daily_totals['positive'] / daily_totals['total']
# NOTE: the summed '*_100k' columns are sums of state rates, not national rates.

# National daily totals
nation_df = data_df.groupby('date').sum(numeric_only=True)
nation_df['ratio'] = nation_df['positive'] / nation_df['total']
nation_df['state'] = 'All US'
nation_df = nation_df.reset_index()

# Covid-19 Cases in U.S.

The U.S. case data are obtained from https://covidtracking.com (The COVID Tracking Project), a public, crowd-sourced COVID-19 dataset.

### Growth trends

In [22]:
# Compute theoretical trends of doubling every day, 3 days, week
days = {'day':[1,2,3,4,5,10,15,20, 50, 100]}
startCase = 10


def _doubling_curve(day_values, start_case, period_days, label):
    """Build one theoretical growth curve.

    Parameters
    ----------
    day_values : list of int
        Day numbers (1-based) at which the curve is evaluated.
    start_case : number
        Count on day 1.
    period_days : number
        Number of days it takes the count to double.
    label : str
        Value stored in the 'doubling period' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'day', 'case' and 'doubling period'.
    """
    curve = pd.DataFrame({'day': day_values})
    # Use a float base: with integers, 2**99 (day 100 of the daily curve)
    # silently overflows int64.
    curve['case'] = start_case * 2.0 ** ((curve['day'] - 1) / period_days)
    curve['doubling period'] = label
    return curve


logRuleDay_df = _doubling_curve(days['day'], startCase, 1, 'every day')
logRule3Days_df = _doubling_curve(days['day'], startCase, 3, 'three days')
logRuleWeek_df = _doubling_curve(days['day'], startCase, 7, 'every week')

logRules_df = pd.concat([logRuleDay_df, logRule3Days_df, logRuleWeek_df])
logRules_df = logRules_df.reset_index()

In [37]:
# make dataframe for text labels on chart - hand edit these label locations
textLabels_df = pd.DataFrame(
    [[10,6000,'doubles every day'],
     [19,700,'doubles every 3 days'],
     [23,50, 'doubles every week']],
    columns =['labelX', 'labelY','labelText']
)

# keep only the state rows with >=10 deaths, ordered by state and then by increasing date
death10_df = data_df.loc[data_df['death']>=10].sort_values(by=['state','date'])

# add the national ('All US') rows to that dataframe
nationdeath10_df = nation_df.loc[nation_df['death']>=10]
death10_df = pd.concat([death10_df, nationdeath10_df])

death10_df = death10_df.reset_index()

# make a list of the states with 10 or more deaths
state_list = list(set(death10_df['state']))

# add a column for the number of days since the 10th death for each state:
# a running row counter per state (rows are already in date order), which
# also works when the frame is empty
death10_df['sinceDay0'] = death10_df.groupby('state').cumcount().astype('int32')

# Now create plotlines for each state since 10 deaths
lineChart = alt.Chart(death10_df,title='US States: Cumulative Deaths Since 10th Death').mark_line(interpolate='basis').encode(
    alt.X('sinceDay0:Q', axis=alt.Axis(title='Days Since 10th Death')),
    alt.Y('death:Q',
         axis = alt.Axis(title='Cumulative Deaths'),
         scale=alt.Scale(type='log')),
    tooltip=['state', 'sinceDay0', 'death', 'positive'],
    color = 'state'
)

# Create a layer with the reference lines for doubling every day, every
# three days and every week (logRules_df comes from the previous cell)
ruleChart = alt.Chart(logRules_df).mark_line(opacity=0.2,clip=True).encode(
    alt.X('day:Q',
            scale=alt.Scale(domain=[1,23])),
    alt.Y('case', scale=alt.Scale(type='log',domain=[10,10000]),
         ),
    color = 'doubling period',
    tooltip = ['doubling period'])

# create a layer for the state labels
# 1) make dataframe with each state's max days
# 2) make a chart layer with text of state name to right of each state's rightmost point
last_day_per_state = death10_df.groupby(['state'])['sinceDay0'].transform('max')
stateLabels_df = death10_df[death10_df['sinceDay0'] == last_day_per_state]
labelChart = alt.Chart(stateLabels_df).mark_text(align='left', baseline='middle', dx=10).encode(
    x='sinceDay0',
    y='death',
    text='state',
    color='state')

# now put the text labels layer on top of state labels Chart
labelChart = labelChart + alt.Chart(textLabels_df).mark_text(align='right', baseline='bottom', dx=0, size=18,opacity=0.5).encode(
    x='labelX',
    y='labelY',
    text='labelText')


## Create some tooltip behavior - show Y values on mouseover
# Step 1: Selection that chooses nearest point based on value on x-axis
# NOTE(review): alt.selection(type='single') / add_selection are the Altair 4
# spellings; newer Altair prefers selection_point / add_params - confirm the
# pinned Altair version before changing.
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['sinceDay0'])

# Step 2: Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart().mark_point().encode(
    x="sinceDay0:Q",
    opacity=alt.value(0),
).add_selection(
    nearest
)

# Step 3: Add text, show the death count when it's the nearest point to
# mouseover, else show blank
text = lineChart.mark_text(align='center', dx=3, dy=-20).encode(
    text=alt.condition(nearest, 'death', alt.value(' '))
)


# Finally, lets show the chart!
# properties() returns a new chart, so its result must be kept for the
# requested size to take effect.
chart = alt.layer(
    lineChart, ruleChart, labelChart, selectors, text, data=death10_df
).properties(width=800, height=600)

display(chart)
display(html_credits)

In [41]:
# make dataframe with lines to indicate doubling every day, 3 days, week
days = {'day':[1,2,3,4,5,10,15,20, 50, 100]}
startCase = 100
logRuleDay_df = pd.DataFrame(days, columns=['day'])
# float base: an integer 2**99 (day 100) silently overflows int64
logRuleDay_df['case'] = startCase * 2.0 ** (logRuleDay_df['day']-1)
logRuleDay_df['doubling period'] = 'every day'

logRule3Days_df = pd.DataFrame(days, columns=['day'])
logRule3Days_df['case'] = startCase * 2.0 ** ((logRule3Days_df['day']-1)/3)
logRule3Days_df['doubling period'] = 'three days'

logRuleWeek_df = pd.DataFrame(days, columns=['day'])
logRuleWeek_df['case'] = startCase * 2.0 ** ((logRuleWeek_df['day']-1)/7)
logRuleWeek_df['doubling period'] = 'every week'

logRules_df = pd.concat([logRuleDay_df, logRule3Days_df, logRuleWeek_df])
logRules_df = logRules_df.reset_index()

# make dataframe for text labels on chart - hand edit these label locations
textLabels_df = pd.DataFrame(
    [[9,30000,'doubles every day'],
     [28,31000,'doubles every 3 days'],
     [19,300, 'doubles every week']],
    columns =['labelX', 'labelY','labelText']
)

# make dataframe with only points >=100 positives
positive100_df = data_df.loc[data_df['positive']>=100]

# add the national ('All US') rows to that dataframe
nationpos100_df = nation_df.loc[nation_df['positive']>=100]
positive100_df = pd.concat([positive100_df, nationpos100_df])

# order positive100 dataframe by state and then increasing order of date
positive100_df = positive100_df.sort_values(by=['state','date'])
positive100_df = positive100_df.reset_index()

# add a column for the number of days since the 100th case for each state:
# a running row counter per state (rows are already in date order), which
# also works when the frame is empty
positive100_df['sinceDay0'] = positive100_df.groupby('state').cumcount().astype('int32')


# Now create plotlines for each state since the 100th case
lineChart = alt.Chart(positive100_df, title="US States: total cases since 100th case").mark_line(interpolate='basis').encode(
    alt.X('sinceDay0:Q', axis=alt.Axis(title='Days since 100th case')),
    alt.Y('positive:Q',
          axis = alt.Axis(title='Cumulative positive cases'),
          scale=alt.Scale(type='log')),
    tooltip=['state', 'sinceDay0', 'death', 'positive'],
    color = 'state'
)

# Create a layer with the reference lines for doubling every day, every
# three days and every week
ruleChart = alt.Chart(logRules_df).mark_line(opacity=0.2,clip=True).encode(
    alt.X('day:Q',
            scale=alt.Scale(domain=[1,30])),
    alt.Y('case', scale=alt.Scale(domain=[100,100000], type='log'),
         ),
    color = 'doubling period')

# create a layer for the state labels
# 1) make dataframe with each state's max days
# 2) make a chart layer with text of state name to right of each state's rightmost point
last_day_per_state = positive100_df.groupby(['state'])['sinceDay0'].transform('max')
stateLabels_df = positive100_df[positive100_df['sinceDay0'] == last_day_per_state]
labelChart = alt.Chart(stateLabels_df).mark_text(align='left', baseline='middle', dx=10).encode(
    x='sinceDay0',
    y='positive',
    text='state',
    color='state')

# now put the text labels layer on top of state labels Chart
labelChart = labelChart + alt.Chart(textLabels_df).mark_text(align='right', baseline='bottom', dx=0, size=18,opacity=0.5).encode(
    x='labelX',
    y='labelY',
    text='labelText')

# Create some tooltip behavior
# Step 1: Selection that chooses nearest point based on value on x-axis
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['sinceDay0'])

# Step 2: Transparent selectors across the chart. This is what tells us
# the x-value of the cursor
selectors = alt.Chart().mark_point().encode(
    x="sinceDay0:Q",
    opacity=alt.value(0),
).add_selection(
    nearest
)

# Step 3: Add text, show the positive count when it's the nearest point to
# mouseover, else show blank
text = lineChart.mark_text(align='center', dx=3, dy=-20).encode(
    text=alt.condition(nearest, 'positive', alt.value(' '))
)


# Finally, lets show the chart!
# The layers without their own data (the selectors) must read this cell's
# frame, positive100_df, not the death10_df frame of the deaths chart.
# properties() returns a new chart, so its result must be kept for the
# requested size to take effect.
chart = alt.layer(
    lineChart, ruleChart, labelChart, selectors, text, data=positive100_df
).properties(width=800, height=600)
display(chart)
display(html_credits)

### Daily Cumulative Totals

Cumulative and daily reported totals of positive cases and deaths, summed over all states.

In [4]:
# Shared bar-chart template: one bar per date, with the x-axis title blanked.
date_axis = alt.X('date', axis=alt.Axis(title=''))
base = alt.Chart(daily_totals).mark_bar(size=15).encode(date_axis)

# Four panels, each plotting a different column of the daily totals.
cumulative = base.encode(alt.Y('positive', title = 'Cumulative cases'))
cumulative_deaths = base.encode(alt.Y('death', title = 'Cumulative deaths'))
rates = base.encode(alt.Y('positive_diff', title='Daily cases'))
rates_deaths = base.encode(alt.Y('death_diff', title='Daily deaths'))

# Top row: cases (cumulative, daily); bottom row: deaths (cumulative, daily).
cases_row = alt.hconcat(cumulative, rates)
deaths_row = alt.hconcat(cumulative_deaths, rates_deaths)
chart = alt.vconcat(
    cases_row,
    deaths_row,
    title='Cumulative Covid-19 cases in the U.S.',
)
chart = chart.configure_title(anchor='middle')

display(chart)
display(html_credits)

### Total tests and positives per 100k population

In [5]:
# Select the rows of the latest date present in the data, indexed by state.
most_recent_test_date = data_df['date'].max()
on_latest_date = data_df['date'] == most_recent_test_date
most_recent_df = data_df.loc[on_latest_date].set_index('state')
print("Most recent test date", most_recent_test_date)
print(len(most_recent_df), "states/territories have data on this date.")

# Express totals and positives per 100k inhabitants; the division aligns the
# state index with the index of pop_df.
population = pop_df['Population']
for count_col in ('total', 'positive'):
    most_recent_df[count_col + '/100k'] = (most_recent_df[count_col] / population) * 100000
most_recent_df = most_recent_df.reset_index()

Most recent test date 2020-03-26 00:00:00
56 states/territories have data on this date.


In [47]:
# Order states by tests per 100k (descending); sort=None on the x encoding
# keeps that row order instead of re-sorting the axis.
viz_df = most_recent_df.sort_values(by='total/100k', ascending=False)
state_axis = alt.X('state', sort=None)
chart = alt.Chart(viz_df, title="Cases per 100k").encode(state_axis)

# Bars: tests per 100k. Orange points: positives per 100k.
tests_axis = alt.Axis(title='COVID-19 Tests/100k, Positive Cases/100k')
tests = chart.mark_bar().encode(alt.Y('total/100k', axis=tests_axis))
positives = chart.mark_point(
    color='orange', filled=True, size=100, opacity=1
).encode(alt.Y('positive/100k'))

display(tests + positives)
display(html_credits)

## Counts and rates by state

Here we take a look at the three states with the highest per-capita incidence of covid-19. The bars show the daily new positive tests; the red and orange curves represent the total tests (divided by 10) and the total positive tests, respectively.

In [7]:
# produce the charts for a few states

def _state_overview_chart(state):
    """Build the overview chart for one state.

    The chart layers daily positive differences (bars) with the cumulative
    curves for total tests divided by 10 (red) and positive tests (orange).
    The bars and the curves use independent y scales.

    Parameters
    ----------
    state : str
        Value matched against the 'state' column of ``tdf_diff``.

    Returns
    -------
    alt.LayerChart
        The layered chart for that state.
    """
    state_df = tdf_diff[tdf_diff['state'] == state]

    base = alt.Chart(state_df, title=state).encode(
        alt.X('date', axis=alt.Axis(title='Date'))
    ).properties(width=250, height=150)
    dailies = base.mark_bar(size=10).encode(alt.Y('positive_diff', axis=alt.Axis(title='Daily positive')))

    totals = base.mark_line(color='red').encode(alt.Y('total_10', axis=alt.Axis(title='Total/10')))
    positives = totals.mark_line(color='orange').encode(alt.Y('positive', axis=alt.Axis(title='Positive')))
    cumulative = totals + positives

    return alt.layer(dailies, cumulative).resolve_scale(y='independent')


# Building the charts inside a function keeps the temporaries from overwriting
# the `base`, `cumulative` and `positives` variables of earlier cells.
charts = [_state_overview_chart(state) for state in ['NY', 'WA', 'NM']]

display(alt.hconcat(*charts))
display(html_credits)