From afa34c41921cb7f21172914f6ba82c6a8ba6d0ec Mon Sep 17 00:00:00 2001
From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch>
Date: Sun, 12 Apr 2020 21:12:47 +0000
Subject: [PATCH] fix: clean datatypes in JHU converter

---
 .../covid_19_utils/covid_19_utils/converters/__init__.py   | 3 ++-
 .../covid_19_utils/covid_19_utils/converters/jhu.py        | 7 +++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py b/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py
index 5135c5477..c2f957b37 100644
--- a/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py
+++ b/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py
@@ -83,7 +83,8 @@ class CaseConverterImpl:
     def _set_common_columns(self, df):
         """Use only the common columns; add missing ones when needed."""
         try:
-            df["population"] = df.population.astype(int)
+            # Int32 (as opposed to int32) is an integer type that allows nan
+            df["population"] = df.population.astype("Int32")
         except ValueError:
             pass
 
diff --git a/src/covid-19/covid_19_utils/covid_19_utils/converters/jhu.py b/src/covid-19/covid_19_utils/covid_19_utils/converters/jhu.py
index 0be24407c..8c6488c37 100644
--- a/src/covid-19/covid_19_utils/covid_19_utils/converters/jhu.py
+++ b/src/covid-19/covid_19_utils/covid_19_utils/converters/jhu.py
@@ -18,6 +18,8 @@ class JhuCsseGlobalCaseConverter(CaseConverterImpl):
     and made available at https://github.com/CSSEGISandData/COVID-19
     """
 
+    column_list = ["date", "Country/Region"]
+
     def convert(self, df):
         # combine subregions at the level of country
         df = df.groupby(["date", "Country/Region"]).sum().reset_index()
@@ -77,7 +79,8 @@ class JhuCsseGlobalCaseConverter(CaseConverterImpl):
         ]
         pop_df = pop_df.append(worldmap_df)
 
-        pop_ser = pop_df.set_index("Country Code")["2018"]
+        # Collapse duplicate country codes, keeping the max value per code
+        pop_ser = pop_df.groupby("Country Code").max()["2018"]
         country_code_map = {
             r["Country Name"]: r["Country Code"]
             for i, r in pop_df[["Country Name", "Country Code"]].iterrows()
@@ -86,7 +89,7 @@ class JhuCsseGlobalCaseConverter(CaseConverterImpl):
         df["country_label"] = df["region_label"]
 
         merged = df.loc[df["country"].isin(pop_ser.index)].copy()
-        merged["population"] = merged.apply(lambda r: pop_ser.loc[r["country"]], axis=1)
+        merged["population"] = merged.apply(lambda r: pop_ser.loc[r["country"]], axis=1).astype("Int32")
         merged["region_iso"] = merged["country"]
         merged["tested"] = np.nan
         return self._set_common_columns(merged)
-- 
GitLab