From afa34c41921cb7f21172914f6ba82c6a8ba6d0ec Mon Sep 17 00:00:00 2001 From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch> Date: Sun, 12 Apr 2020 21:12:47 +0000 Subject: [PATCH] fix: clean datatypes in JHU converter --- .../covid_19_utils/covid_19_utils/converters/__init__.py | 3 ++- .../covid_19_utils/covid_19_utils/converters/jhu.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py b/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py index 5135c5477..c2f957b37 100644 --- a/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py +++ b/src/covid-19/covid_19_utils/covid_19_utils/converters/__init__.py @@ -83,7 +83,8 @@ class CaseConverterImpl: def _set_common_columns(self, df): """Use only the common columns; add missing ones when needed.""" try: - df["population"] = df.population.astype(int) + # Int32 (as opposed to int32) is an integer type that allows nan + df["population"] = df.population.astype("Int32") except ValueError: pass diff --git a/src/covid-19/covid_19_utils/covid_19_utils/converters/jhu.py b/src/covid-19/covid_19_utils/covid_19_utils/converters/jhu.py index 0be24407c..8c6488c37 100644 --- a/src/covid-19/covid_19_utils/covid_19_utils/converters/jhu.py +++ b/src/covid-19/covid_19_utils/covid_19_utils/converters/jhu.py @@ -18,6 +18,8 @@ class JhuCsseGlobalCaseConverter(CaseConverterImpl): and made available at https://github.com/CSSEGISandData/COVID-19 """ + column_list = ["date", "Country/Region"] + def convert(self, df): # combine subregions at the level of country df = df.groupby(["date", "Country/Region"]).sum().reset_index() @@ -77,7 +79,8 @@ class JhuCsseGlobalCaseConverter(CaseConverterImpl): ] pop_df = pop_df.append(worldmap_df) - pop_ser = pop_df.set_index("Country Code")["2018"] + # Get rid of any duplicates + pop_ser = pop_df.groupby("Country Code").max()["2018"] country_code_map = { r["Country Name"]: r["Country Code"] for i, r in 
pop_df[["Country Name", "Country Code"]].iterrows() @@ -86,7 +89,7 @@ class JhuCsseGlobalCaseConverter(CaseConverterImpl): df["country_label"] = df["region_label"] merged = df.loc[df["country"].isin(pop_ser.index)].copy() - merged["population"] = merged.apply(lambda r: pop_ser.loc[r["country"]], axis=1) + merged["population"] = merged.apply(lambda r: pop_ser.loc[r["country"]], axis=1).astype("Int32") merged["region_iso"] = merged["country"] merged["tested"] = np.nan return self._set_common_columns(merged) -- GitLab