From adfb0109d85fca2e36b679e13ec96884211911c4 Mon Sep 17 00:00:00 2001 From: Rok Roskar <rokroskar@gmail.com> Date: Tue, 7 Apr 2020 12:15:10 +0200 Subject: [PATCH] chore: refactor incidence calculation and add country labels --- notebooks/process/wikidata-pop-data.ipynb | 67 ++++++++++++++----- .../covid_19_dashboard/converters/__init__.py | 19 ++++++ .../converters/covidtracking.py | 12 +--- .../covid_19_dashboard/converters/italy.py | 8 +-- .../covid_19_dashboard/converters/jhu.py | 10 +-- .../covid_19_dashboard/converters/spain.py | 8 +-- .../converters/switzerland.py | 10 +-- .../covid_19_dashboard/helper.py | 8 ++- 8 files changed, 85 insertions(+), 57 deletions(-) diff --git a/notebooks/process/wikidata-pop-data.ipynb b/notebooks/process/wikidata-pop-data.ipynb index 1eefbd38d..e19ba2701 100644 --- a/notebooks/process/wikidata-pop-data.ipynb +++ b/notebooks/process/wikidata-pop-data.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -21,7 +21,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "tags": [ "parameters" @@ -35,15 +35,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def write_population_data(iso_code, df):\n", - " out_path = os.path.join(out_folder, f\"{iso_code.lower()}-population.csv\")\n", - " print(f\"Writing {len(df)} rows to {out_path}\")\n", " if PAPERMILL_OUTPUT_PATH is None:\n", " return\n", + " out_path = os.path.join(out_folder, f\"{iso_code.lower()}-population.csv\")\n", + " print(f\"Writing {len(df)} rows to {out_path}\")\n", " df.to_csv(out_path)" ] }, @@ -56,9 +56,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " region_iso region_label country_label istatid population\n0 
IT-34 Veneto Italy 05 4926818\n1 IT-25 Lombardy Italy 03 10067494", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>region_iso</th>\n <th>region_label</th>\n <th>country_label</th>\n <th>istatid</th>\n <th>population</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>IT-34</td>\n <td>Veneto</td>\n <td>Italy</td>\n <td>05</td>\n <td>4926818</td>\n </tr>\n <tr>\n <th>1</th>\n <td>IT-25</td>\n <td>Lombardy</td>\n <td>Italy</td>\n <td>03</td>\n <td>10067494</td>\n </tr>\n </tbody>\n</table>\n</div>" + }, + "metadata": {}, + "execution_count": 4 + } + ], "source": [ "iso_code = \"ITA\"\n", "pops = helper.get_region_populations(\n", @@ -80,9 +90,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " region_iso region_label country_label population\n0 CH-GE Canton of Geneva Switzerland 499480\n1 CH-JU Canton of Jura Switzerland 73419", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>region_iso</th>\n <th>region_label</th>\n <th>country_label</th>\n <th>population</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>CH-GE</td>\n <td>Canton of Geneva</td>\n <td>Switzerland</td>\n <td>499480</td>\n </tr>\n <tr>\n <th>1</th>\n <td>CH-JU</td>\n <td>Canton of Jura</td>\n <td>Switzerland</td>\n <td>73419</td>\n </tr>\n </tbody>\n</table>\n</div>" + }, + "metadata": 
{}, + "execution_count": 5 + } + ], "source": [ "iso_code = \"CHE\"\n", "pops = helper.get_region_populations(iso_code)\n", @@ -100,9 +120,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": " region_iso region_label country_label population\n0 US-DE Delaware United States of America 945934\n1 US-OR Oregon United States of America 4028977", + "text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>region_iso</th>\n <th>region_label</th>\n <th>country_label</th>\n <th>population</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>US-DE</td>\n <td>Delaware</td>\n <td>United States of America</td>\n <td>945934</td>\n </tr>\n <tr>\n <th>1</th>\n <td>US-OR</td>\n <td>Oregon</td>\n <td>United States of America</td>\n <td>4028977</td>\n </tr>\n </tbody>\n</table>\n</div>" + }, + "metadata": {}, + "execution_count": 6 + } + ], "source": [ "iso_code = \"USA\"\n", "pops = helper.get_region_populations(iso_code)\n", @@ -110,13 +140,20 @@ "write_population_data(iso_code, df)\n", "df.head(2)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.7.7 64-bit ('.venv': venv)", "language": "python", - "name": "python3" + "name": "python37764bitvenvvenv814492364d964019a25eb1cf3dc3e99c" }, "language_info": { "codemirror_mode": { @@ -128,9 +165,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.7-final" } }, "nbformat": 4, "nbformat_minor": 4 -} 
+} \ No newline at end of file diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py index 12eb114ce..52dea73b3 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py @@ -46,8 +46,11 @@ class CaseConverterImpl: common_columns = [ "date", "country", + "country_label", "region_iso", "region_label", + "admin2", + "admin2_label", "tested", "positive", "deceased", @@ -75,6 +78,22 @@ class CaseConverterImpl: "{}".format(", ".join([str(c) for c in _converter_registry])) ) + def _set_common_columns(self, df): + """Use only the common columns; add missing ones when needed.""" + try: + df["population"] = df.population.astype(int) + except ValueError: + pass + + df["positive_100k"] = df["positive"] / df["population"] * 100000 + df["deceased_100k"] = df["deceased"] / df["population"] * 100000 + + for column in self.common_columns: + if column not in df.columns: + df[column] = "" + + return df[self.common_columns] + @classmethod def _register(cls): _converter_registry[cls.__name__] = cls diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py index 52e53f52b..df5d0b395 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py @@ -53,21 +53,13 @@ class CovidtrackingCaseConverter(CaseConverter): df_conv["region_iso"] = df_conv.apply(lambda row: f'US-{row["state"]}', axis=1) # get population data for US states through right-join type operation - metadata = pd.read_csv( - self.atlas_folder / "wikidata" / "usa-population.csv" - ).rename(columns={"regionLabel": "region_label"}) - + metadata = pd.read_csv(self.atlas_folder / "wikidata" 
/ "usa-population.csv") merged = pd.merge(df_conv, metadata, on="region_iso", how="right") # add country information merged["country"] = "USA" - # calculate incidence rates - merged["population"] = merged.population.astype(int) - merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 - merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000 - - return merged[CaseConverter.common_columns] + return self._set_common_columns(merged) @classmethod def read_data(cls, path): diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py index 2bd385e04..48a854588 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py @@ -36,14 +36,8 @@ class ItalyCaseConverter(CaseConverter): metadata["istatid"] = metadata.istatid.astype(int) df_conv = _correct_trentino(df_conv) merged = pd.merge(df_conv, metadata, on="istatid").drop_duplicates() - merged = merged.rename(columns={"regionLabel": "region_label"}) - # calculate incidence rates - merged["population"] = merged.population.astype(int) - merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 - merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000 - - return merged[CaseConverter.common_columns] + return self._set_common_columns(merged) def read_data(self, path): """Read in the data for Italy.""" diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py index e4c89a3ef..67efbb968 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py @@ -54,14 +54,8 @@ class JhuCsseGlobalCaseConverter(CaseConverterImpl): # TODO Also consult for worldmap data for countries 
like Taiwan merged["population"] = merged.apply(lambda r: pop_ser.loc[r["country"]], axis=1) - - # calculate incidence rates - merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 - merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000 - merged["region_iso"] = merged["country"] - merged["tested"] = np.nan - - return merged[CaseConverterImpl.common_columns] + merged['region_iso'] = merged['country'] + return self._set_common_columns(merged) def read_ser(self, path, name): """Read in the path and return as a column named name""" diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/spain.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/spain.py index e9ca026e1..3fcbb2259 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/spain.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/spain.py @@ -67,13 +67,9 @@ class SpainCaseConverter(CaseConverter): # calculate incidence rates merged = df_conv.merge(pd.DataFrame(region_populations)) - merged["population"] = merged.population.astype(int) - merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 - merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000 - merged['country'] = 'ESP' - merged['tested'] = None - return merged[CaseConverter.common_columns] + + return self._set_common_columns(merged) def read_data(self, path): """Read data for Spain.""" diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py index 3154e074b..b8f19e776 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py @@ -52,16 +52,10 @@ class OpenZHCaseConverter(CaseConverter): # standardize the canton names - merged["regionLabel"] = merged.apply( - lambda row: 
_standardize_canton_name(row["regionLabel"]), axis=1 + merged["region_label"] = merged.apply( + lambda row: _standardize_canton_name(row["region_label"]), axis=1 ) - merged = merged.rename(columns={"regionLabel": "region_label"}) - # calculate incidence rates - merged["population"] = merged.population.astype(int) - merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 - merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000 - - return merged[CaseConverter.common_columns] + return self._set_common_columns(merged) def read_data(self, path): """Read in the swiss cantonal data.""" diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py index 5629bf9a3..986695d6d 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py @@ -162,7 +162,7 @@ def get_region_populations( endpoint_url = "https://query.wikidata.org/sparql" query = """ - SELECT DISTINCT ?region_iso ?regionLabel {additional_fields} (MAX(?population_cnt) as ?population) + SELECT DISTINCT ?region_iso ?regionLabel ?countryLabel {additional_fields} (MAX(?population_cnt) as ?population) {{ # select country by its iso-3 ?country wdt:P298 "{country_iso3}" . @@ -178,7 +178,7 @@ def get_region_populations( SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{label_lang}" }} }} - GROUP BY ?region_iso ?regionLabel {additional_fields} + GROUP BY ?region_iso ?regionLabel ?countryLabel {additional_fields} """ def get_results(endpoint_url, query): @@ -204,7 +204,9 @@ def get_region_populations( res = [] for binding in results["results"]["bindings"]: - res.append({k: v["value"] for k, v in binding.items()}) + res.append( + {k.replace("Label", "_label"): v["value"] for k, v in binding.items()} + ) return res -- GitLab