Skip to content
Snippets Groups Projects
Commit e30ded37 authored by Chandrasekhar Ramakrishnan's avatar Chandrasekhar Ramakrishnan Committed by renku 0.9.1
Browse files

renku update --with-siblings

parent 89f1819d
No related branches found
No related tags found
No related merge requests found
class: Workflow
cwlVersion: v1.0
hints: []
inputs:
input_1:
default: ts_folder
streamable: false
type: string
input_10:
default:
class: Directory
listing: []
path: ../../data/covid-19_jhu-csse
streamable: false
type: Directory
input_11:
default: wb_path
streamable: false
type: string
input_12:
default:
class: File
path: ../../data/worldbank/SP.POP.TOTL.zip
streamable: false
type: File
input_13:
default: geodata_path
streamable: false
type: string
input_14:
default:
class: File
path: ../../data/geodata/geo_data.csv
streamable: false
type: File
input_15:
default: out_folder
streamable: false
type: string
input_16:
default: data/covid-19_rates
streamable: false
type: string
input_17:
default:
class: File
path: ../../notebooks/ToRates.ipynb
streamable: false
type: File
input_2:
default:
class: Directory
listing: []
path: ../../data/covid-19_jhu-csse
streamable: false
type: Directory
input_3:
default: rates_folder
streamable: false
type: string
input_4:
default: geodata_path
streamable: false
type: string
input_5:
default:
class: File
path: ../../data/geodata/geo_data.csv
streamable: false
type: File
input_6:
default:
class: File
path: ../../notebooks/Dashboard.ipynb
streamable: false
type: File
input_7:
default: runs/Dashboard.run.ipynb
streamable: false
type: string
input_8:
default: ts_folder
streamable: false
type: string
input_9:
default: runs/ToRates.run.ipynb
streamable: false
type: string
outputs:
output_0:
outputSource: step_2/output_0
streamable: false
type: File
output_1:
outputSource: step_1/output_0
streamable: false
type: File
output_2:
outputSource: step_2/output_1
streamable: false
type: Directory
requirements: []
steps:
step_1:
in:
input_1: input_1
input_2: input_2
input_3: input_3
input_4: step_2/output_1
input_5: input_4
input_6: input_5
input_7: input_6
input_8: input_7
out:
- output_0
run: 4cc7ffe9d5a045efb048ef2222a40ffa_papermill.cwl
step_2:
in:
input_1: input_8
input_10: input_9
input_2: input_10
input_3: input_11
input_4: input_12
input_5: input_13
input_6: input_14
input_7: input_15
input_8: input_16
input_9: input_17
out:
- output_0
- output_1
run: 2c413376f8aa4ba1a325212655d423e5_papermill.cwl
source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
source diff could not be displayed: it is stored in LFS. Options to address this: view the blob.
source diff could not be displayed: it is too large. Options to address this: view the blob.
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Convert Series to Rates per 100,000 # Convert Series to Rates per 100,000
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import pandas as pd import pandas as pd
import os import os
``` ```
%% Cell type:code id: tags:parameters %% Cell type:code id: tags:parameters
``` python ``` python
ts_folder = "../data/covid-19_jhu-csse/" ts_folder = "../data/covid-19_jhu-csse/"
wb_path = "../data/worldbank/SP.POP.TOTL.zip" wb_path = "../data/worldbank/SP.POP.TOTL.zip"
geodata_path = "../data/geodata/geo_data.csv" geodata_path = "../data/geodata/geo_data.csv"
out_folder = None out_folder = None
PAPERMILL_OUTPUT_PATH = None PAPERMILL_OUTPUT_PATH = None
``` ```
%% Cell type:code id: tags:injected-parameters %% Cell type:code id: tags:injected-parameters
``` python ``` python
# Parameters # Parameters
PAPERMILL_INPUT_PATH = "/tmp/sj0uwmdy/notebooks/ToRates.ipynb" PAPERMILL_INPUT_PATH = "/tmp/fsb4wn_r/notebooks/ToRates.ipynb"
PAPERMILL_OUTPUT_PATH = "runs/ToRates.run.ipynb" PAPERMILL_OUTPUT_PATH = "runs/ToRates.run.ipynb"
ts_folder = "/tmp/sj0uwmdy/data/covid-19_jhu-csse" ts_folder = "/tmp/fsb4wn_r/data/covid-19_jhu-csse"
wb_path = "/tmp/sj0uwmdy/data/worldbank/SP.POP.TOTL.zip" wb_path = "/tmp/fsb4wn_r/data/worldbank/SP.POP.TOTL.zip"
geodata_path = "/tmp/sj0uwmdy/data/geodata/geo_data.csv" geodata_path = "/tmp/fsb4wn_r/data/geodata/geo_data.csv"
out_folder = "data/covid-19_rates" out_folder = "data/covid-19_rates"
``` ```
%% Cell type:markdown id: tags:parameters %% Cell type:markdown id: tags:parameters
## Read in JHU CSSE data ## Read in JHU CSSE data
I will switch to [xarray](http://xarray.pydata.org/en/stable/), but ATM, it's easier like this... I will switch to [xarray](http://xarray.pydata.org/en/stable/), but ATM, it's easier like this...
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def read_jhu_covid_region_df(name, folder=None):
    """Load one JHU CSSE time series and aggregate it per country/region.

    Reads ``time_series_19-covid-{name}.csv``, sums the counts of all
    provinces/states that share a country/region, and appends the mean
    longitude/latitude of those provinces as extra index levels.

    Parameters
    ----------
    name : str
        Series name used in the file name (the notebook passes
        "Confirmed", "Deaths" and "Recovered").
    folder : str, optional
        Directory that contains the CSV files. When omitted, the
        notebook-level ``ts_folder`` parameter is used.

    Returns
    -------
    pandas.DataFrame
        One row per country/region, indexed by
        ``('Country/Region', 'Long', 'Lat')``, with one column per date.
    """
    if folder is None:
        # Fall back to the papermill parameter so existing calls keep working.
        folder = ts_folder
    filename = os.path.join(folder, f"time_series_19-covid-{name}.csv")
    df = pd.read_csv(filename)
    df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])
    # Every remaining column is a date; parse the labels so they sort and
    # slice chronologically.
    df.columns = pd.to_datetime(df.columns)
    region_df = df.groupby(level='Country/Region').sum()
    # Only the coordinates are averaged; averaging the date columns as well
    # would be wasted work because they are discarded anyway.
    loc_df = (
        df.reset_index(['Lat', 'Long'])
        .groupby(level='Country/Region')[['Long', 'Lat']]
        .mean()
    )
    return region_df.join(loc_df).set_index(['Long', 'Lat'], append=True)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# One aggregated per-country frame for each JHU CSSE series, keyed by the
# lower-case series name used throughout the rest of the notebook.
frames_map = {
    key: read_jhu_covid_region_df(series)
    for key, series in (
        ("confirmed", "Confirmed"),
        ("deaths", "Deaths"),
        ("recovered", "Recovered"),
    )
}
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Read in World Bank data # Read in World Bank data
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
import zipfile

# Open the World Bank archive and the CSV member with context managers so
# both handles are closed as soon as the table has been parsed.
with zipfile.ZipFile(wb_path) as zf:
    with zf.open("API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv") as pop_csv:
        # skiprows=4: the column header is taken from the fifth line of the file.
        pop_df = pd.read_csv(pop_csv, skiprows=4)
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
There is 2018 population data for all countries/regions except Eritrea; the "Not classified" placeholder row has no 2018 value either (see the output below).
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Show the rows that have no population figure for 2018.
pop_df[pop_df['2018'].isna()]
``` ```
%% Output %% Output
Country Name Country Code Indicator Name Indicator Code 1960 \ Country Name Country Code Indicator Name Indicator Code 1960 \
67 Eritrea ERI Population, total SP.POP.TOTL 1007590.0 67 Eritrea ERI Population, total SP.POP.TOTL 1007590.0
108 Not classified INX Population, total SP.POP.TOTL NaN 108 Not classified INX Population, total SP.POP.TOTL NaN
1961 1962 1963 1964 1965 ... 2011 \ 1961 1962 1963 1964 1965 ... 2011 \
67 1033328.0 1060486.0 1088854.0 1118159.0 1148189.0 ... 3213972.0 67 1033328.0 1060486.0 1088854.0 1118159.0 1148189.0 ... 3213972.0
108 NaN NaN NaN NaN NaN ... NaN 108 NaN NaN NaN NaN NaN ... NaN
2012 2013 2014 2015 2016 2017 2018 2019 Unnamed: 64 2012 2013 2014 2015 2016 2017 2018 2019 Unnamed: 64
67 NaN NaN NaN NaN NaN NaN NaN NaN NaN 67 NaN NaN NaN NaN NaN NaN NaN NaN NaN
108 NaN NaN NaN NaN NaN NaN NaN NaN NaN 108 NaN NaN NaN NaN NaN NaN NaN NaN NaN
[2 rows x 65 columns] [2 rows x 65 columns]
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Fix the country/region names that differ between the World Bank population data and the JHU CSSE data. Fix the country/region names that differ between the World Bank population data and the JHU CSSE data.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# World Bank country name -> name used for the same region in the JHU CSSE data.
region_wb_jhu_map = {
    'Brunei Darussalam': 'Brunei',
    'Czech Republic': 'Czechia',
    'Egypt, Arab Rep.': 'Egypt',
    'Hong Kong SAR, China': 'Hong Kong SAR',
    'Iran, Islamic Rep.': 'Iran',
    'Korea, Rep.': 'Korea, South',
    'Macao SAR, China': 'Macao SAR',
    'Russian Federation': 'Russia',
    'Slovak Republic': 'Slovakia',
    'St. Martin (French part)': 'Saint Martin',
    'United States': 'US'
}

# 2018 population indexed by the (renamed) country name.
current_pop_ser = (
    pop_df[['Country Name', '2018']]
    .copy()
    .replace(region_wb_jhu_map)
    .set_index('Country Name')['2018']
)

# Keep only the regions that also occur in the JHU CSSE frames.
data_pop_ser = current_pop_ser.loc[
    current_pop_ser.index.isin(frames_map['confirmed'].index.levels[0])
]
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Use this to find the name in the series # Use this to find the name in the series
# current_pop_ser[current_pop_ser.index.str.contains('Czech')] # current_pop_ser[current_pop_ser.index.str.contains('Czech')]
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
There are some regions that we cannot resolve, but we will just ignore these. There are some regions that we cannot resolve, but we will just ignore these.
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Show the last two dates for every region without a population figure.
# The mask is built from the per-row level values: `index.levels[0]` holds the
# unique level values, which only lines up with the rows as long as there is
# exactly one row per region in level order.
frames_map['confirmed'].loc[
    ~frames_map['confirmed'].index.get_level_values(0).isin(data_pop_ser.index)
].iloc[:, -2:]
``` ```
%% Output %% Output
2020-03-13 00:00:00 \ 2020-03-13 00:00:00 \
Country/Region Long Lat Country/Region Long Lat
Congo (Kinshasa) 21.7587 -4.0383 2 Congo (Kinshasa) 21.7587 -4.0383 2
Cruise Ship 139.6380 35.4437 696 Cruise Ship 139.6380 35.4437 696
French Guiana -53.1258 3.9339 5 French Guiana -53.1258 3.9339 5
Guadeloupe -61.5510 16.2650 1 Guadeloupe -61.5510 16.2650 1
Guernsey -2.5800 49.4500 0 Guernsey -2.5800 49.4500 0
Holy See 12.4534 41.9029 1 Holy See 12.4534 41.9029 1
Jersey -2.1100 49.1900 0 Jersey -2.1100 49.1900 0
Martinique -61.0242 14.6415 3 Martinique -61.0242 14.6415 3
Reunion 55.5364 -21.1151 5 Reunion 55.5364 -21.1151 5
Saint Lucia -60.9789 13.9094 0 Saint Lucia -60.9789 13.9094 0
Saint Vincent and the Grenadines -61.2872 12.9843 0 Saint Vincent and the Grenadines -61.2872 12.9843 0
Taiwan* 121.0000 23.7000 50 Taiwan* 121.0000 23.7000 50
Venezuela -66.5897 6.4238 0 Venezuela -66.5897 6.4238 0
occupied Palestinian territory 35.2332 31.9522 0 occupied Palestinian territory 35.2332 31.9522 0
2020-03-14 00:00:00 2020-03-14 00:00:00
Country/Region Long Lat Country/Region Long Lat
Congo (Kinshasa) 21.7587 -4.0383 2 Congo (Kinshasa) 21.7587 -4.0383 2
Cruise Ship 139.6380 35.4437 696 Cruise Ship 139.6380 35.4437 696
French Guiana -53.1258 3.9339 5 French Guiana -53.1258 3.9339 5
Guadeloupe -61.5510 16.2650 1 Guadeloupe -61.5510 16.2650 1
Guernsey -2.5800 49.4500 1 Guernsey -2.5800 49.4500 1
Holy See 12.4534 41.9029 1 Holy See 12.4534 41.9029 1
Jersey -2.1100 49.1900 2 Jersey -2.1100 49.1900 2
Martinique -61.0242 14.6415 9 Martinique -61.0242 14.6415 9
Reunion 55.5364 -21.1151 6 Reunion 55.5364 -21.1151 6
Saint Lucia -60.9789 13.9094 1 Saint Lucia -60.9789 13.9094 1
Saint Vincent and the Grenadines -61.2872 12.9843 1 Saint Vincent and the Grenadines -61.2872 12.9843 1
Taiwan* 121.0000 23.7000 53 Taiwan* 121.0000 23.7000 53
Venezuela -66.5897 6.4238 2 Venezuela -66.5897 6.4238 2
occupied Palestinian territory 35.2332 31.9522 0 occupied Palestinian territory 35.2332 31.9522 0
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Read in geodata to get additional population numbers # Read in geodata to get additional population numbers
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
geodata_df = pd.read_csv(geodata_path).drop('Unnamed: 0', axis=1).set_index('name_jhu') geodata_df = pd.read_csv(geodata_path).drop('Unnamed: 0', axis=1).set_index('name_jhu')
``` ```
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
Add in populations for missing countries Add in populations for missing countries
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Regions present in the JHU CSSE data that have no World Bank population.
# The mask uses the per-row level values rather than `index.levels[0]`, which
# holds unique level values and is not guaranteed to match the rows.
missing_countries = frames_map['confirmed'].loc[
    ~frames_map['confirmed'].index.get_level_values(0).isin(data_pop_ser.index)
].iloc[:, -2:].reset_index()['Country/Region']
display(geodata_df.loc[geodata_df.index.isin(missing_countries)])
# Extend the population series with the geodata estimates for those regions.
# `Series.append` was removed in pandas 2.0; `pd.concat` is the replacement.
data_pop_ser = pd.concat(
    [data_pop_ser, geodata_df.loc[geodata_df.index.isin(missing_countries), 'pop_est']]
)
``` ```
%% Output %% Output
%% Cell type:markdown id: tags: %% Cell type:markdown id: tags:
# Compute rates per 100,000 for regions # Compute rates per 100,000 for regions
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
def cases_to_rates_df(df, pop_ser=None):
    """Convert absolute case counts into rates per 100,000 inhabitants.

    Parameters
    ----------
    df : pandas.DataFrame
        Case counts whose index has the region as level 0 followed by two
        further levels (longitude and latitude in this notebook); the two
        extra levels are dropped.
    pop_ser : pandas.Series, optional
        Population indexed by region name. When omitted, the notebook-level
        ``data_pop_ser`` is used.

    Returns
    -------
    pandas.DataFrame
        Rates per 100,000, indexed by ``'Country/Region'``. Rows containing
        any missing value, such as regions absent from ``pop_ser``, are
        dropped.
    """
    if pop_ser is None:
        # Fall back to the notebook global so existing calls keep working.
        pop_ser = data_pop_ser
    per_100000_df = df.reset_index([1, 2], drop=True)
    # Row-wise division aligns on the region name; regions without a
    # population become NaN and are removed by dropna().
    per_100000_df = per_100000_df.div(pop_ser, axis='index').mul(100000).dropna()
    per_100000_df.index.name = 'Country/Region'
    return per_100000_df
def frames_to_rates(frames_map):
    """Return a dict with the same keys whose frames hold rates per 100,000."""
    rates = {}
    for series_name, cases_df in frames_map.items():
        rates[series_name] = cases_to_rates_df(cases_df)
    return rates


rates_map = frames_to_rates(frames_map)
``` ```
%% Cell type:code id: tags: %% Cell type:code id: tags:
``` python ``` python
# Only write the CSV files when the notebook is executed through papermill
# (PAPERMILL_OUTPUT_PATH is None in the default parameters cell).
if PAPERMILL_OUTPUT_PATH:
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(out_folder, exist_ok=True)
    for k, v in rates_map.items():
        out_path = os.path.join(out_folder, f"ts_rates_19-covid-{k}.csv")
        v.reset_index().to_csv(out_path)
``` ```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment