# Extract the Geographic Info

Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations.

In [1]:
import pandas as pd
import os

In [2]:
# Default parameters for interactive runs; the "# Parameters" cell further down
# overrides them when the notebook is executed with injected values.
ts_folder = "../data/covid-19_jhu-csse/"  # folder holding the JHU CSSE time-series CSVs
worldmap_path = "../data/worldmap/country_centroids.csv"  # Harvard country centroids CSV
out_folder = None  # destination folder for geo_data.csv; only used when saving is enabled
PAPERMILL_OUTPUT_PATH = None  # while None, the final save cell is skipped

## Read in JHU CSSE data

In [3]:
# Parameters
# NOTE(review): this looks like the cell papermill injects at execution time; it
# re-assigns the defaults defined in the previous code cell — confirm it is not
# meant to be edited by hand.
PAPERMILL_INPUT_PATH = "notebooks/process/CompileGeoData.ipynb"
PAPERMILL_OUTPUT_PATH = "runs/CompileGeoData.run.ipynb"
ts_folder = "./data/covid-19_jhu-csse/"
worldmap_path = "./data/worldmap/country_centroids.csv"
out_folder = "./data/geodata/"


In [4]:
def read_jhu_covid_region_df(name, folder=None):
    """Load one JHU CSSE time series and aggregate it per country/region.

    Parameters
    ----------
    name : str
        Series name inserted into the file name
        ``time_series_19-covid-{name}.csv`` (e.g. ``"Confirmed"``).
    folder : str or path-like, optional
        Directory containing the CSV file. When omitted, the notebook-level
        ``ts_folder`` parameter is used (looked up at call time), which keeps
        existing single-argument calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per ``Country/Region`` (rows sharing a country/region are
        summed) and one datetime-labelled column per date column in the file.
    """
    if folder is None:
        folder = ts_folder
    filename = os.path.join(folder, f"time_series_19-covid-{name}.csv")
    df = pd.read_csv(filename)
    # Move the descriptive columns into the index so only date columns remain.
    df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])
    df.columns = pd.to_datetime(df.columns)
    return df.groupby(level='Country/Region').sum()

In [5]:
# Confirmed-case counts, one row per country/region.
confirmed_df = read_jhu_covid_region_df(name="Confirmed")

## Read in Harvard country centroids

In [6]:
# Load the centroids, keep only the columns used downstream, and seed the
# JHU-facing name column with the Harvard long name.
country_centroids_df = (
    pd.read_csv(worldmap_path)
    .loc[:, [
        'name', 'name_long', 'region_un', 'subregion', 'region_wb',
        'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude',
    ]]
    .assign(name_jhu=lambda frame: frame['name_long'])
)

In [7]:
# Show the columns kept from the centroids file plus the added name_jhu column.
country_centroids_df.columns

Index(['name', 'name_long', 'region_un', 'subregion', 'region_wb', 'pop_est',
 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude', 'name_jhu'],
 dtype='object')

Fix names that differ between JHU CSSE and Harvard data

In [8]:
# Harvard name -> name used by JHU CSSE, for the regions whose names differ.
region_hd_jhu_map = {
    'Brunei Darussalam': 'Brunei',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Czech Republic': 'Czechia',
    'Hong Kong': 'Hong Kong SAR',
    'Republic of Korea': 'Korea, South',
    'Macao': 'Macao SAR',
    'Russian Federation': 'Russia',
    'Taiwan': 'Taiwan*',
    'United States': 'US'
}
# Translate the listed names; every other value is passed through unchanged.
country_centroids_df['name_jhu'] = country_centroids_df['name_jhu'].map(
    lambda harvard_name: region_hd_jhu_map.get(harvard_name, harvard_name)
)

In [9]:
# Uncomment the line below to look up how a region is named in the Harvard data
# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]

Some JHU CSSE regions have no matching name in the Harvard data. We cannot resolve these, so we simply ignore them.

In [10]:
# JHU regions with no matching name_jhu entry; show their two most recent dates.
confirmed_df.loc[
    ~confirmed_df.index.isin(country_centroids_df['name_jhu'])
].iloc[:, -2:]

Unnamed: 0_level_0,2020-03-16,2020-03-17
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Congo (Brazzaville),1,1
Congo (Kinshasa),2,3
Cruise Ship,696,696
Eswatini,1,1
Holy See,1,1
Martinique,15,16
North Macedonia,18,26
Republic of the Congo,1,1
The Bahamas,1,1


## Save the result

In [11]:
# Write the result only when PAPERMILL_OUTPUT_PATH is set; it defaults to None,
# so interactive runs skip the save.
if PAPERMILL_OUTPUT_PATH:
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(out_folder, exist_ok=True)
    out_path = os.path.join(out_folder, "geo_data.csv")
    country_centroids_df.to_csv(out_path)