# Extract the Geographic Info

Use the Harvard [country_centroids.csv](https://worldmap.harvard.edu/data/geonode:country_centroids_az8) data to extract the geographic info we need for the visualizations.

In [None]:
import pandas as pd
import os

In [None]:
# Folder containing the JHU CSSE time-series CSVs; read_jhu_covid_region_df
# joins it with "time_series_19-covid-<name>.csv".
ts_folder = "../data/covid-19_jhu-csse/"
# Path of the Harvard country centroids CSV loaded further down.
worldmap_path = "../data/worldmap/country_centroids.csv"
# Folder that geo_data.csv is written into by the final cell. The save step
# joins it with the file name, so it must not be None when saving is enabled.
out_folder = None
# A truthy value enables the final save step; with the default None nothing is
# written. NOTE(review): presumably injected by papermill at run time - confirm.
PAPERMILL_OUTPUT_PATH = None

## Read in JHU CSSE data

In [None]:
def read_jhu_covid_region_df(name, folder=None):
    """Load one JHU CSSE time series and aggregate it per country/region.

    Reads ``time_series_19-covid-{name}.csv`` from ``folder``, moves the four
    descriptive columns ('Country/Region', 'Province/State', 'Lat', 'Long')
    into the index, parses the remaining column labels with
    ``pd.to_datetime`` and sums all rows that share a 'Country/Region'.

    Parameters
    ----------
    name : str
        Series name used in the file name, e.g. ``"Confirmed"``.
    folder : str or None, optional
        Directory that holds the CSV. When None (the default) the notebook
        level ``ts_folder`` setting is used, which keeps existing one-argument
        calls working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per 'Country/Region' (the index), one column per parsed
        column label, holding the per-region sums.
    """
    if folder is None:
        # Fall back to the notebook-wide configuration value.
        folder = ts_folder
    filename = os.path.join(folder, f"time_series_19-covid-{name}.csv")
    df = pd.read_csv(filename)
    # Everything that is not a data column goes into the index so that only
    # the time-series columns remain as columns.
    df = df.set_index(['Country/Region', 'Province/State', 'Lat', 'Long'])
    df.columns = pd.to_datetime(df.columns)
    # Collapse province-level rows into a single row per country/region.
    region_df = df.groupby(level='Country/Region').sum()
    return region_df

In [None]:
# Per-country/region sums of the "Confirmed" JHU CSSE time series.
confirmed_df = read_jhu_covid_region_df(name="Confirmed")

## Read in Harvard country centroids

In [None]:
# Load the centroids file, keep a fixed subset of its columns, and add a
# 'name_jhu' column that starts out as a copy of 'name_long'.
country_centroids_df = (
    pd.read_csv(worldmap_path)
    .loc[:, [
        'name', 'name_long', 'region_un', 'subregion', 'region_wb',
        'pop_est', 'gdp_md_est', 'income_grp', 'Longitude', 'Latitude',
    ]]
    .assign(name_jhu=lambda centroids: centroids['name_long'])
)

In [None]:
# Display the column labels of the trimmed centroids table.
country_centroids_df.columns

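Some regions are named differently in the Harvard data and the JHU CSSE data. Map the Harvard names to the names JHU CSSE uses so the two tables can be matched.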

In [None]:
# Harvard spelling (as found in 'name_long') -> spelling used by JHU CSSE.
region_hd_jhu_map = dict([
    ('Brunei Darussalam', 'Brunei'),
    ("Côte d'Ivoire", "Cote d'Ivoire"),
    ('Czech Republic', 'Czechia'),
    ('Hong Kong', 'Hong Kong SAR'),
    ('Republic of Korea', 'Korea, South'),
    ('Macao', 'Macao SAR'),
    ('Russian Federation', 'Russia'),
    ('Taiwan', 'Taiwan*'),
    ('United States', 'US'),
])
# Values without an entry in the map are left untouched.
country_centroids_df['name_jhu'] = country_centroids_df['name_jhu'].replace(
    to_replace=region_hd_jhu_map
)

In [None]:
# Uncomment the line below to look up how a region is spelled in the Harvard data
# country_centroids_df[country_centroids_df['name'].str.contains('Macao')]

Some JHU CSSE regions still have no matching entry in the Harvard data. They are listed below, and we simply ignore them.

In [None]:
# Rows of confirmed_df whose region name does not appear in 'name_jhu',
# i.e. regions without a centroid match; only the last two columns are shown.
# `~` negates the boolean mask (replaces the non-idiomatic `== False`).
confirmed_df.loc[
    ~confirmed_df.index.isin(country_centroids_df['name_jhu'])
].iloc[:, -2:]

## Save the result

In [None]:
# Write the table only when PAPERMILL_OUTPUT_PATH is truthy; otherwise this
# cell does nothing.
if PAPERMILL_OUTPUT_PATH:
    if out_folder is None:
        # Fail with an explicit message instead of the opaque TypeError that
        # os.path.join(None, ...) would raise.
        raise ValueError(
            "out_folder must be set when PAPERMILL_OUTPUT_PATH is set"
        )
    out_path = os.path.join(out_folder, "geo_data.csv")
    country_centroids_df.to_csv(out_path)