In [1]:
import pandas as pd

# Read in JHU CSSE data

I plan to switch to [xarray](http://xarray.pydata.org/en/stable/) eventually, but for now it is easier to work with plain pandas DataFrames.

In [2]:
def read_jhu_covid_df(name, data_dir="../data/covid-19_jhu-csse"):
    """Load one JHU CSSE time-series CSV into a DataFrame.

    Parameters
    ----------
    name : str
        Series name embedded in the file name (the notebook uses
        "Confirmed", "Deaths" and "Recovered").
    data_dir : str, optional
        Directory that contains the ``time_series_19-covid-<name>.csv``
        files. Defaults to the location the notebook originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ('Province/State', 'Country/Region', 'Lat', 'Long')
        whose remaining column labels have been passed through
        ``pd.to_datetime``.
    """
    filename = f"{data_dir}/time_series_19-covid-{name}.csv"
    df = pd.read_csv(filename)
    # Move the four descriptive columns into the index so that only the
    # per-date value columns remain as columns.
    df = df.set_index(['Province/State', 'Country/Region', 'Lat', 'Long'])
    # Assumes every remaining header is a date string that pandas can parse.
    df.columns = pd.to_datetime(df.columns)
    return df

In [3]:
# Load the three JHU CSSE series, keyed by a lower-case label; the second
# element of each pair is the series name used in the CSV file name.
frames_map = {
    key: read_jhu_covid_df(label)
    for key, label in [
        ("confirmed", "Confirmed"),
        ("deaths", "Deaths"),
        ("recovered", "Recovered"),
    ]
}

In [4]:
def current_region_totals_df(frames_map):
    """Build a table of the most recent per-region totals.

    For each frame in ``frames_map`` the rows are summed per
    'Country/Region' index level, the last column is taken, and the
    resulting series is sorted in descending order and named after the
    frame's key. The series are then joined column-wise.

    Parameters
    ----------
    frames_map : dict
        Maps a column label to a DataFrame that has a 'Country/Region'
        index level.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``frames_map``, one row per region.
    """
    latest_by_series = []
    for label, frame in frames_map.items():
        region_sums = frame.groupby(level='Country/Region').sum()
        # The last column holds the most recent values in the frame.
        latest = region_sums.iloc[:, -1].sort_values(ascending=False)
        latest.name = label
        latest_by_series.append(latest)
    return pd.concat(latest_by_series, axis=1)

In [5]:
# Latest totals per country/region for every loaded series.
current_totals_df = current_region_totals_df(frames_map)
# Display only the regions with more than 100 confirmed cases.
current_totals_df.loc[lambda totals: totals['confirmed'] > 100]

Unnamed: 0,confirmed,deaths,recovered
Mainland China,80757,3136,60106
Italy,10149,631,724
Iran (Islamic Republic of),8042,291,2731
Republic of Korea,7513,54,247
France,1784,33,12
Spain,1695,35,32
US,1670,56,15
Germany,1457,2,18
Others,696,6,40
Japan,581,10,101


# Read in World Bank data

In [6]:
import zipfile

# Open the archive and the CSV member as context managers so that both
# handles are closed as soon as the frame has been parsed.
with zipfile.ZipFile("../data/worldbank/SP.POP.TOTL.zip") as zf:
    with zf.open("API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv") as pop_csv:
        # The first four lines are skipped; presumably a preamble that
        # precedes the header row in the World Bank export (verify).
        pop_df = pd.read_csv(pop_csv, skiprows=4)

There is 2018 population data for every country/region except Eritrea (the only other row without a 2018 value is the "Not classified" placeholder).

In [7]:
# Show the rows that have no population value for 2018.
pop_df.loc[pop_df['2018'].isna()]

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
67,Eritrea,ERI,"Population, total",SP.POP.TOTL,1007590.0,1033328.0,1060486.0,1088854.0,1118159.0,1148189.0,...,3213972.0,,,,,,,,,
108,Not classified,INX,"Population, total",SP.POP.TOTL,,,,,,,...,,,,,,,,,,


Fix the country/region names that differ between the World Bank population data and the JHU CSSE data.

In [16]:
# World Bank country name -> name used for the same region in the JHU CSSE data.
region_wb_jhu_map = {
    'China': 'Mainland China',
    'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',
    'Korea, Rep.': 'Republic of Korea',
    'United States': 'US',
    'United Kingdom': 'UK',
    'Hong Kong SAR, China': 'Hong Kong SAR',
    'Egypt, Arab Rep.': 'Egypt',
    'Vietnam': 'Viet Nam',
    'Macao SAR, China': 'Macao SAR',
    'Slovak Republic': 'Slovakia',
    'Moldova': 'Republic of Moldova',
    'St. Martin (French part)': 'Saint Martin',
    'Brunei Darussalam': 'Brunei',
}

# 2018 population indexed by country name, with the World Bank names
# translated to their JHU spelling where the two differ.
current_pop_ser = (
    pop_df
    .set_index('Country Name')['2018']
    .rename(index=region_wb_jhu_map)
)

# Keep only the populations of regions that occur in the case totals.
data_pop_ser = current_pop_ser.loc[
    current_pop_ser.index.isin(current_totals_df.index)
]

There are some regions that we cannot resolve, but we will just ignore these.

In [17]:
# Regions in the case totals that have no matching population entry.
# `~` negates the boolean mask; this replaces the non-idiomatic `== False`.
current_totals_df[~current_totals_df.index.isin(data_pop_ser.index)]

Unnamed: 0,confirmed,deaths,recovered
Others,696,6,40
Taipei and environs,47,1,17
occupied Palestinian territory,25,0,0
French Guiana,5,0,0
Martinique,2,0,0
Holy See,1,0,0
Saint Barthelemy,1,0,0


# Compute rates per 100,000 for regions with more than 100 cases

In [23]:
# Restrict to regions with more than 100 confirmed cases, divide each row by
# that region's population, scale to "per 100,000", and drop rows that end
# up with missing values.
current_per_100000_df = (
    current_totals_df
    .loc[lambda totals: totals['confirmed'] > 100]
    .div(data_pop_ser, axis='index')
    .mul(100000)
    .dropna()
)
# Display with the highest confirmed rate first.
current_per_100000_df.sort_values(by='confirmed', ascending=False)

Unnamed: 0,confirmed,deaths,recovered
Italy,16.794282,1.044161,1.198055
Republic of Korea,14.550136,0.10458,0.478355
Iran (Islamic Republic of),9.831264,0.355745,3.33862
Norway,7.52681,0.0,0.018817
Bahrain,7.008874,0.0,1.401775
Mainland China,5.798468,0.225169,4.315697
Switzerland,5.76525,0.035226,0.035226
Denmark,4.519231,0.0,0.017249
Spain,3.627705,0.074908,0.068488
Sweden,3.486143,0.0,0.00982
