From 88bf5be91d30ca1206147c4de6b527fcd370099b Mon Sep 17 00:00:00 2001 From: Rok Roskar <rokroskar@gmail.com> Date: Fri, 3 Apr 2020 18:15:24 +0200 Subject: [PATCH] chore: implement read methods on converter classes --- .../covid_19_dashboard/converters/__init__.py | 29 ++++++++++++----- .../converters/covidtracking.py | 32 ++++++++++++------- .../covid_19_dashboard/converters/italy.py | 32 ++++++++++++------- .../converters/switzerland.py | 32 ++++++++++++++----- .../covid_19_dashboard/helper.py | 7 ++-- .../covid_19_dashboard/italy_utils.py | 2 +- 6 files changed, 92 insertions(+), 42 deletions(-) diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py index b8ee7cfea..7bad5783c 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py @@ -9,7 +9,8 @@ import pandas as pd from .. 
import helper -class CaseConverter(object): + +class CaseConverter(): """Base converter class.""" _converter_registry = [] @@ -26,23 +27,35 @@ class CaseConverter(object): "deceased", "population", "positive_100k", - "deceased_100k" + "deceased_100k", ] @classmethod - def can_convert(cls, df): + def can_convert(cls, path): """Returns true if the class can convert the Dataframe.""" + try: + df = cls.read_data(path) + except (FileNotFoundError, ValueError): + return False return all([col in df.columns for col in cls.column_list]) @classmethod - def convert(cls, df): + def read_convert(cls, path): """Converts the Dataframe into the common format.""" for converter in cls._converter_registry: - if converter.can_convert(df): - return converter.convert(df) - raise NotImplementedError("DataFrame could not be converted") + if converter.can_convert(path): + print(f'Using {converter} for {path}') + return converter.convert(converter.read_data(path)) + raise NotImplementedError(f"{path} could not be read and converted.") + + @classmethod + def read_data(cls, path): + """Read in the data from a directory path.""" + raise NotImplementedError( + "Please use one of these subclasses instead:\n" + "{}".format(", ".join([str(c) for c in cls._converter_registry])) + ) @classmethod def _register(cls): CaseConverter._converter_registry.append(cls) - diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py index da04a74e8..13b6aec15 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py @@ -2,11 +2,15 @@ Covid-19 converters for data from the covidtracking.com (USA). """ +from pathlib import Path + + import pandas as pd from . import CaseConverter from .. 
import helper + class CovidtrackingCaseConverter(CaseConverter): """ Converter for data from the United States, collected by @@ -44,25 +48,23 @@ class CovidtrackingCaseConverter(CaseConverter): def convert(cls, df): # rename the existing columns df_conv = df.rename(columns=cls.conversion_dict) - + # convert date df_conv["date"] = pd.to_datetime(df_conv["date"], format="%Y%m%d") - + # make states iso-3116 2 compliant - df_conv["region_iso"] = df_conv.apply( - lambda row: f'US-{row["state"]}', axis=1 - ) + df_conv["region_iso"] = df_conv.apply(lambda row: f'US-{row["state"]}', axis=1) # get population data for US states through right-join type operation - metadata = pd.DataFrame( - helper.get_region_populations("USA") - ).rename(columns={"regionLabel": "region_label"}) - + metadata = pd.DataFrame(helper.get_region_populations("USA")).rename( + columns={"regionLabel": "region_label"} + ) + merged = pd.merge(df_conv, metadata, on="region_iso", how="right") - + # add country information merged["country"] = "USA" - + # calculate incidence rates merged["population"] = merged.population.astype(int) merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 @@ -70,4 +72,10 @@ class CovidtrackingCaseConverter(CaseConverter): return merged[CaseConverter.common_columns] -CovidtrackingCaseConverter._register() \ No newline at end of file + @classmethod + def read_data(cls, path): + """Read in the covidtracking state-level data.""" + return pd.read_json(Path(path) / "states-daily.json") + + +CovidtrackingCaseConverter._register() diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py index 7e04ddff4..11873554a 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py @@ -2,6 +2,8 @@ Converters for covid-19 data from Italy. 
""" +from pathlib import Path + import pandas as pd from . import CaseConverter @@ -40,29 +42,37 @@ class ItalyCaseConverter(CaseConverter): ) metadata["istatid"] = metadata.istatid.astype(int) df_conv = _correct_trentino(df_conv) - merged = pd.merge(df_conv, metadata, on='istatid').drop_duplicates() + merged = pd.merge(df_conv, metadata, on="istatid").drop_duplicates() merged = merged.rename(columns={"regionLabel": "region_label"}) # calculate incidence rates - merged['population'] = merged.population.astype(int) - merged['positive_100k'] = merged['positive'] / merged['population'] * 100000 - merged['deceased_100k'] = merged['deceased'] / merged['population'] * 100000 + merged["population"] = merged.population.astype(int) + merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 + merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000 return merged[CaseConverter.common_columns] + @classmethod + def read_data(cls, path): + """Read in the data for Italy.""" + df = pd.read_csv(Path(path) / "dpc-covid19-ita-regioni.csv") + return df + def _correct_trentino(df): """Merge Bolzano and Trento.""" # The regional data includes Bolzano and Trento separate regions - they are # two provinces of the Trentino Alto Adige region, so merge them together - df_trentino_alto_adige = df.loc[df['istatid'] == 4].groupby('date').sum().reset_index() - df_trentino_alto_adige['istatid'] = 4 - df_trentino_alto_adige['latitude'] = 46.4337 - df_trentino_alto_adige['longitude'] = 11.1693 - df_trentino_alto_adige['denominazione_regione'] = 'Trentino Alto Adige' - df_trentino_alto_adige['country'] = 'ITA' - df = df[df['istatid'] !=4 ].append(df_trentino_alto_adige) + df_trentino_alto_adige = ( + df.loc[df["istatid"] == 4].groupby("date").sum().reset_index() + ) + df_trentino_alto_adige["istatid"] = 4 + df_trentino_alto_adige["latitude"] = 46.4337 + df_trentino_alto_adige["longitude"] = 11.1693 + df_trentino_alto_adige["denominazione_regione"] = "Trentino 
Alto Adige" + df_trentino_alto_adige["country"] = "ITA" + df = df[df["istatid"] != 4].append(df_trentino_alto_adige) return df diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py index 1fbf87ddc..035e0d7f2 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py @@ -2,11 +2,14 @@ Covid-19 converters for data from Switzerland. """ +from pathlib import Path + import pandas as pd from . import CaseConverter from .. import helper + class OpenZHCaseConverter(CaseConverter): """ Converter for data from Switzerland, collected by @@ -45,25 +48,38 @@ class OpenZHCaseConverter(CaseConverter): # get metadata for swiss cantons metadata = pd.DataFrame(helper.get_region_populations("CHE")) - merged = pd.merge(df_conv, metadata, on='region_iso').drop_duplicates() + merged = pd.merge(df_conv, metadata, on="region_iso").drop_duplicates() merged["country"] = "CHE" # standardize the canton names - merged['regionLabel'] = merged.apply(lambda row: _standardize_canton_name(row['regionLabel']), axis=1) + merged["regionLabel"] = merged.apply( + lambda row: _standardize_canton_name(row["regionLabel"]), axis=1 + ) merged = merged.rename(columns={"regionLabel": "region_label"}) # calculate incidence rates - merged['population'] = merged.population.astype(int) - merged['positive_100k'] = merged['positive'] / merged['population'] * 100000 - merged['deceased_100k'] = merged['deceased'] / merged['population'] * 100000 + merged["population"] = merged.population.astype(int) + merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 + merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000 return merged[CaseConverter.common_columns] + @classmethod + def read_data(cls, path): + """Read in the swiss cantonal data.""" + df_list = 
[] + for f in Path(path).glob("COVID19_Fallzahlen_Kanton_*total.csv"): + df_list.append(pd.read_csv(f)) + df = pd.concat(df_list) + df["date"] = pd.to_datetime(df["date"], dayfirst=True) + return df + def _standardize_canton_name(label): label = label.lower() - if label.startswith('canton of'): - label = label[len('canton of'):].strip() + if label.startswith("canton of"): + label = label[len("canton of") :].strip() return label.capitalize() -OpenZHCaseConverter._register() \ No newline at end of file + +OpenZHCaseConverter._register() diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py index 163e7a863..5629bf9a3 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py @@ -153,7 +153,9 @@ def growth_df(rates_frames_map, geodata_df, name, countries_over_thresh, cutoff) return confirmed_rate_df -def get_region_populations(country_iso3, additional_fields="", additional_query=""): +def get_region_populations( + country_iso3, additional_fields="", additional_query="", label_lang="en" +): import sys from SPARQLWrapper import SPARQLWrapper, JSON @@ -174,7 +176,7 @@ def get_region_populations(country_iso3, additional_fields="", additional_query= ?country wdt:P31 wd:Q3624078 . 
{additional_query} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }} + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{label_lang}" }} }} GROUP BY ?region_iso ?regionLabel {additional_fields} """ @@ -196,6 +198,7 @@ def get_region_populations(country_iso3, additional_fields="", additional_query= country_iso3=country_iso3, additional_fields=additional_fields, additional_query=additional_query, + label_lang=label_lang, ), ) diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/italy_utils.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/italy_utils.py index e8254c90a..912d59df8 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/italy_utils.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/italy_utils.py @@ -18,7 +18,7 @@ def set_time_index(df, drop_hour=True): if drop_hour: lambda_func = lambda x: x.split(" ")[0] else: - labda_func = lambda x: x + lambda_func = lambda x: x timestamp = pd.DatetimeIndex(df["date"].apply(lambda_func)) df.set_index(timestamp, inplace=True) -- GitLab