From adfb0109d85fca2e36b679e13ec96884211911c4 Mon Sep 17 00:00:00 2001
From: Rok Roskar <rokroskar@gmail.com>
Date: Tue, 7 Apr 2020 12:15:10 +0200
Subject: [PATCH] chore: refactor incidence calculation and add country labels

---
 notebooks/process/wikidata-pop-data.ipynb     | 67 ++++++++++++++-----
 .../covid_19_dashboard/converters/__init__.py | 19 ++++++
 .../converters/covidtracking.py               | 12 +---
 .../covid_19_dashboard/converters/italy.py    |  8 +--
 .../covid_19_dashboard/converters/jhu.py      | 10 +--
 .../covid_19_dashboard/converters/spain.py    |  8 +--
 .../converters/switzerland.py                 | 10 +--
 .../covid_19_dashboard/helper.py              |  8 ++-
 8 files changed, 85 insertions(+), 57 deletions(-)

diff --git a/notebooks/process/wikidata-pop-data.ipynb b/notebooks/process/wikidata-pop-data.ipynb
index 1eefbd38d..e19ba2701 100644
--- a/notebooks/process/wikidata-pop-data.ipynb
+++ b/notebooks/process/wikidata-pop-data.ipynb
@@ -9,7 +9,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -21,7 +21,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {
     "tags": [
      "parameters"
@@ -35,15 +35,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
     "def write_population_data(iso_code, df):\n",
-    "    out_path = os.path.join(out_folder, f\"{iso_code.lower()}-population.csv\")\n",
-    "    print(f\"Writing {len(df)} rows to {out_path}\")\n",
     "    if PAPERMILL_OUTPUT_PATH is None:\n",
     "        return\n",
+    "    out_path = os.path.join(out_folder, f\"{iso_code.lower()}-population.csv\")\n",
+    "    print(f\"Writing {len(df)} rows to {out_path}\")\n",
     "    df.to_csv(out_path)"
    ]
   },
@@ -56,9 +56,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": "  region_iso region_label country_label istatid population\n0      IT-34       Veneto         Italy      05    4926818\n1      IT-25     Lombardy         Italy      03   10067494",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>region_iso</th>\n      <th>region_label</th>\n      <th>country_label</th>\n      <th>istatid</th>\n      <th>population</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>IT-34</td>\n      <td>Veneto</td>\n      <td>Italy</td>\n      <td>05</td>\n      <td>4926818</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>IT-25</td>\n      <td>Lombardy</td>\n      <td>Italy</td>\n      <td>03</td>\n      <td>10067494</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "metadata": {},
+     "execution_count": 4
+    }
+   ],
    "source": [
     "iso_code = \"ITA\"\n",
     "pops = helper.get_region_populations(\n",
@@ -80,9 +90,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": "  region_iso      region_label country_label population\n0      CH-GE  Canton of Geneva   Switzerland     499480\n1      CH-JU    Canton of Jura   Switzerland      73419",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>region_iso</th>\n      <th>region_label</th>\n      <th>country_label</th>\n      <th>population</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>CH-GE</td>\n      <td>Canton of Geneva</td>\n      <td>Switzerland</td>\n      <td>499480</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>CH-JU</td>\n      <td>Canton of Jura</td>\n      <td>Switzerland</td>\n      <td>73419</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "metadata": {},
+     "execution_count": 5
+    }
+   ],
    "source": [
     "iso_code = \"CHE\"\n",
     "pops = helper.get_region_populations(iso_code)\n",
@@ -100,9 +120,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": "  region_iso region_label             country_label population\n0      US-DE     Delaware  United States of America     945934\n1      US-OR       Oregon  United States of America    4028977",
+      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>region_iso</th>\n      <th>region_label</th>\n      <th>country_label</th>\n      <th>population</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>US-DE</td>\n      <td>Delaware</td>\n      <td>United States of America</td>\n      <td>945934</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>US-OR</td>\n      <td>Oregon</td>\n      <td>United States of America</td>\n      <td>4028977</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
+     },
+     "metadata": {},
+     "execution_count": 6
+    }
+   ],
    "source": [
     "iso_code = \"USA\"\n",
     "pops = helper.get_region_populations(iso_code)\n",
@@ -110,13 +140,20 @@
     "write_population_data(iso_code, df)\n",
     "df.head(2)"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -128,9 +165,9 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.3"
+   "version": "3.7.7-final"
   }
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py
index 12eb114ce..52dea73b3 100644
--- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py
+++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py
@@ -46,8 +46,11 @@ class CaseConverterImpl:
     common_columns = [
         "date",
         "country",
+        "country_label",
         "region_iso",
         "region_label",
+        "admin2",
+        "admin2_label",
         "tested",
         "positive",
         "deceased",
@@ -75,6 +78,22 @@ class CaseConverterImpl:
             "{}".format(", ".join([str(c) for c in _converter_registry]))
         )
 
+    def _set_common_columns(self, df):
+        """Add per-100k rates, fill missing common columns, return only those."""
+        try:  # best effort: if the cast fails, population is left unchanged
+            df["population"] = df.population.astype(int)
+        except ValueError:
+            pass
+        # "positive", "deceased" and "population" must already be present in df.
+        df["positive_100k"] = df["positive"] / df["population"] * 100000
+        df["deceased_100k"] = df["deceased"] / df["population"] * 100000
+        # Columns the converter did not provide are added as empty strings.
+        for column in self.common_columns:
+            if column not in df.columns:
+                df[column] = ""
+
+        return df[self.common_columns]
+
     @classmethod
     def _register(cls):
         _converter_registry[cls.__name__] = cls
diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py
index 52e53f52b..df5d0b395 100644
--- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py
+++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py
@@ -53,21 +53,13 @@ class CovidtrackingCaseConverter(CaseConverter):
         df_conv["region_iso"] = df_conv.apply(lambda row: f'US-{row["state"]}', axis=1)
 
         # get population data for US states through right-join type operation
-        metadata = pd.read_csv(
-            self.atlas_folder / "wikidata" / "usa-population.csv"
-        ).rename(columns={"regionLabel": "region_label"})
-
+        metadata = pd.read_csv(self.atlas_folder / "wikidata" / "usa-population.csv")
         merged = pd.merge(df_conv, metadata, on="region_iso", how="right")
 
         # add country information
         merged["country"] = "USA"
 
-        # calculate incidence rates
-        merged["population"] = merged.population.astype(int)
-        merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
-        merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000
-
-        return merged[CaseConverter.common_columns]
+        return self._set_common_columns(merged)
 
     @classmethod
     def read_data(cls, path):
diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py
index 2bd385e04..48a854588 100644
--- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py
+++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py
@@ -36,14 +36,8 @@ class ItalyCaseConverter(CaseConverter):
         metadata["istatid"] = metadata.istatid.astype(int)
         df_conv = _correct_trentino(df_conv)
         merged = pd.merge(df_conv, metadata, on="istatid").drop_duplicates()
-        merged = merged.rename(columns={"regionLabel": "region_label"})
 
-        # calculate incidence rates
-        merged["population"] = merged.population.astype(int)
-        merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
-        merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000
-
-        return merged[CaseConverter.common_columns]
+        return self._set_common_columns(merged)
 
     def read_data(self, path):
         """Read in the data for Italy."""
diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py
index e4c89a3ef..67efbb968 100644
--- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py
+++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/jhu.py
@@ -54,14 +54,8 @@ class JhuCsseGlobalCaseConverter(CaseConverterImpl):
 
         # TODO Also consult for worldmap data for countries like Taiwan
         merged["population"] = merged.apply(lambda r: pop_ser.loc[r["country"]], axis=1)
-
-        # calculate incidence rates
-        merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
-        merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000
-        merged["region_iso"] = merged["country"]
-        merged["tested"] = np.nan
-
-        return merged[CaseConverterImpl.common_columns]
+        merged['region_iso'] = merged['country']
+        return self._set_common_columns(merged)
 
     def read_ser(self, path, name):
         """Read in the path and return as a column named name"""
diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/spain.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/spain.py
index e9ca026e1..3fcbb2259 100644
--- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/spain.py
+++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/spain.py
@@ -67,13 +67,9 @@ class SpainCaseConverter(CaseConverter):
 
         # calculate incidence rates
         merged = df_conv.merge(pd.DataFrame(region_populations))
-        merged["population"] = merged.population.astype(int)
-        merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
-        merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000
-
         merged['country'] = 'ESP'
-        merged['tested'] = None
-        return merged[CaseConverter.common_columns]
+
+        return self._set_common_columns(merged)
 
     def read_data(self, path):
         """Read data for Spain."""
diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py
index 3154e074b..b8f19e776 100644
--- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py
+++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py
@@ -52,16 +52,10 @@ class OpenZHCaseConverter(CaseConverter):
 
         # standardize the canton names
         merged["regionLabel"] = merged.apply(
-            lambda row: _standardize_canton_name(row["regionLabel"]), axis=1
+            lambda row: _standardize_canton_name(row["region_label"]), axis=1
         )
-        merged = merged.rename(columns={"regionLabel": "region_label"})
 
-        # calculate incidence rates
-        merged["population"] = merged.population.astype(int)
-        merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
-        merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000
-
-        return merged[CaseConverter.common_columns]
+        return self._set_common_columns(merged)
 
     def read_data(self, path):
         """Read in the swiss cantonal data."""
diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py
index 5629bf9a3..986695d6d 100644
--- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py
+++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py
@@ -162,7 +162,7 @@ def get_region_populations(
     endpoint_url = "https://query.wikidata.org/sparql"
 
     query = """
-    SELECT DISTINCT  ?region_iso ?regionLabel {additional_fields} (MAX(?population_cnt) as ?population)
+    SELECT DISTINCT  ?region_iso ?regionLabel ?countryLabel {additional_fields} (MAX(?population_cnt) as ?population)
     {{
         # select country by its iso-3
         ?country wdt:P298 "{country_iso3}" .
@@ -178,7 +178,7 @@ def get_region_populations(
 
         SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{label_lang}" }}
     }}
-    GROUP BY ?region_iso ?regionLabel {additional_fields}
+    GROUP BY ?region_iso ?regionLabel ?countryLabel {additional_fields}
     """
 
     def get_results(endpoint_url, query):
@@ -204,7 +204,9 @@ def get_region_populations(
 
     res = []
     for binding in results["results"]["bindings"]:
-        res.append({k: v["value"] for k, v in binding.items()})
+        res.append(
+            {k.replace("Label", "_label"): v["value"] for k, v in binding.items()}
+        )
 
     return res
 
-- 
GitLab