From c90274b2e9dfb0168acc904dd1362100618d3a60 Mon Sep 17 00:00:00 2001
From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch>
Date: Wed, 8 Apr 2020 12:48:48 +0000
Subject: [PATCH] feat: use Harvard WorldMap for regions without population
 data from the World Bank

---
 .../covid_19_dashboard/converters/jhu.py      | 31 ++++++++++++++++---
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py
index 2236705ca..a6b53cdaf 100644
--- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py
+++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py
@@ -28,32 +28,55 @@ class JhuCsseGlobalCaseConverter(CaseConverterImpl):
         pop_df = pd.read_csv(
             zf.open("API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv"), skiprows=4
         )
-
+        pop_df = pop_df[["Country Name", "Country Code", "2018"]]
         # Fix the names of countries
         region_jhu_wb_map = {
+            "Bahamas": "Bahamas, The",
             "Brunei": "Brunei Darussalam",
+            "Burma": "Myanmar",
+            "Congo (Brazzaville)": "Congo, Rep.",
+            "Congo (Kinshasa)": "Congo, Dem. Rep.",
             "Czechia": "Czech Republic",
             "Egypt": "Egypt, Arab Rep.",
+            "Gambia": "Gambia, The",
             "Hong Kong SAR": "Hong Kong SAR, China",
             "Iran": "Iran, Islamic Rep.",
             "Korea, South": "Korea, Rep.",
+            "Kyrgyzstan": "Kyrgyz Republic",
+            "Laos": "Lao PDR",
             "Macao SAR": "Macao SAR, China",
             "Russia": "Russian Federation",
+            "Saint Kitts and Nevis": "St. Kitts and Nevis",
+            "Saint Lucia": "St. Lucia",
+            "Saint Vincent and the Grenadines": "St. Vincent and the Grenadines",
             "Slovakia": "Slovak Republic",
             "Saint Martin": "St. Martin (French part)",
+            "Syria": "Syrian Arab Republic",
+            'Taiwan*': 'Taiwan',
+            "Venezuela": "Venezuela, RB",
             "US": "United States",
         }
         df = df.replace(region_jhu_wb_map)
+
+        # Add rows from the Harvard WorldMap file for regions whose names are absent from the World Bank table
+        missing_countries = pd.unique(df.loc[df["region_label"].isin(pop_df["Country Name"]) == False, "region_label"])
+        worldmap_df = pd.read_csv(self.atlas_folder / "worldmap" / "country_centroids.csv")
+        worldmap_df = worldmap_df[['name', 'sov_a3', 'pop_est']]
+        worldmap_df = worldmap_df.rename({"name": "Country Name",
+                                          "sov_a3": "Country Code",
+                                          "pop_est": "2018"}, axis=1)
+        worldmap_df = worldmap_df.loc[worldmap_df["Country Name"].isin(missing_countries)]
+        pop_df = pop_df.append(worldmap_df)
+
+        pop_ser = pop_df.set_index("Country Code")["2018"]
         country_code_map = {
             r["Country Name"]: r["Country Code"]
             for i, r in pop_df[["Country Name", "Country Code"]].iterrows()
         }
         df["country"] = df["region_label"].replace(country_code_map)
         df['country_label'] = df['region_label']
-        pop_ser = pop_df.set_index("Country Code")["2018"]
-        merged = df.loc[df["country"].isin(pop_ser.index)].copy()
 
-        # TODO Also consult for worldmap data for countries like Taiwan
+        merged = df.loc[df["country"].isin(pop_ser.index)].copy()
         merged["population"] = merged.apply(lambda r: pop_ser.loc[r["country"]], axis=1)
         merged['region_iso'] = merged['country']
         return self._set_common_columns(merged)
-- 
GitLab