Commit 88bf5be9, authored by Rok Roškar and committed by Chandrasekhar Ramakrishnan
chore: implement read methods on converter classes

parent 7163637d
2 merge requests: !107 US Census, !103 standardize-data
@@ -9,7 +9,8 @@ import pandas as pd
 from .. import helper


-class CaseConverter(object):
+class CaseConverter():
     """Base converter class."""

     _converter_registry = []
@@ -26,23 +27,35 @@ class CaseConverter(object):
         "deceased",
         "population",
         "positive_100k",
-        "deceased_100k"
+        "deceased_100k",
     ]

     @classmethod
-    def can_convert(cls, df):
+    def can_convert(cls, path):
         """Returns true if the class can convert the Dataframe."""
+        try:
+            df = cls.read_data(path)
+        except (FileNotFoundError, ValueError):
+            return False
         return all([col in df.columns for col in cls.column_list])

     @classmethod
-    def convert(cls, df):
+    def read_convert(cls, path):
         """Converts the Dataframe into the common format."""
         for converter in cls._converter_registry:
-            if converter.can_convert(df):
-                return converter.convert(df)
-        raise NotImplementedError("DataFrame could not be converted")
+            if converter.can_convert(path):
+                print(f'Using {converter} for {path}')
+                return converter.convert(converter.read_data(path))
+        raise NotImplementedError(f"{path} could not be read and converted.")
+
+    @classmethod
+    def read_data(cls, path):
+        """Read in the data from a directory path."""
+        raise NotImplementedError(
+            "Please use one of these subclasses instead:\n"
+            "{}".format(", ".join([str(c) for c in cls._converter_registry]))
+        )

     @classmethod
     def _register(cls):
         CaseConverter._converter_registry.append(cls)
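For context on how the new entry point is meant to be used: callers hand a directory path to the base class, each registered subclass tries its own read_data() on that path, and the first one whose columns check out performs the conversion. A minimal sketch of a call site; the package and module names and the data directory are guesses for illustration, not part of this commit:

import importlib

# Assumed package layout; importing the subclass modules runs their _register() calls.
from covid_19_utils.converters import CaseConverter
for mod in ("covidtracking", "italy", "switzerland"):
    importlib.import_module(f"covid_19_utils.converters.{mod}")

# Dispatch is now purely path-based: can_convert() reads the data itself
# and inspects the resulting columns before convert() is called.
df = CaseConverter.read_convert("data/covidtracking")  # hypothetical data directory
print(df[CaseConverter.common_columns].head())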
@@ -2,11 +2,15 @@
 Covid-19 converters for data from the covidtracking.com (USA).
 """
+from pathlib import Path
+
 import pandas as pd

 from . import CaseConverter
 from .. import helper


 class CovidtrackingCaseConverter(CaseConverter):
     """
     Converter for data from the United States, collected by
@@ -44,25 +48,23 @@ class CovidtrackingCaseConverter(CaseConverter):
     def convert(cls, df):
         # rename the existing columns
         df_conv = df.rename(columns=cls.conversion_dict)

         # convert date
         df_conv["date"] = pd.to_datetime(df_conv["date"], format="%Y%m%d")

         # make states iso-3116 2 compliant
-        df_conv["region_iso"] = df_conv.apply(
-            lambda row: f'US-{row["state"]}', axis=1
-        )
+        df_conv["region_iso"] = df_conv.apply(lambda row: f'US-{row["state"]}', axis=1)

         # get population data for US states through right-join type operation
-        metadata = pd.DataFrame(
-            helper.get_region_populations("USA")
-        ).rename(columns={"regionLabel": "region_label"})
+        metadata = pd.DataFrame(helper.get_region_populations("USA")).rename(
+            columns={"regionLabel": "region_label"}
+        )
         merged = pd.merge(df_conv, metadata, on="region_iso", how="right")

         # add country information
         merged["country"] = "USA"

         # calculate incidence rates
         merged["population"] = merged.population.astype(int)
         merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
@@ -70,4 +72,10 @@ class CovidtrackingCaseConverter(CaseConverter):

         return merged[CaseConverter.common_columns]

-CovidtrackingCaseConverter._register()
\ No newline at end of file
+    @classmethod
+    def read_data(cls, path):
+        """Read in the covidtracking state-level data."""
+        return pd.read_json(Path(path) / "states-daily.json")
+
+
+CovidtrackingCaseConverter._register()
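A small illustration of the two covidtracking-specific transformations in convert() above, namely parsing the integer YYYYMMDD dates and building ISO 3166-2 style region codes; the row values are invented for the example:

import pandas as pd

# One made-up record in the covidtracking layout: integer date plus two-letter state code.
raw = pd.DataFrame({"date": [20200325], "state": ["NY"]})

# Same transformations the converter applies.
raw["date"] = pd.to_datetime(raw["date"], format="%Y%m%d")
raw["region_iso"] = raw.apply(lambda row: f'US-{row["state"]}', axis=1)
print(raw.iloc[0].to_dict())
# {'date': Timestamp('2020-03-25 00:00:00'), 'state': 'NY', 'region_iso': 'US-NY'}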
@@ -2,6 +2,8 @@
 Converters for covid-19 data from Italy.
 """
+from pathlib import Path
+
 import pandas as pd

 from . import CaseConverter
@@ -40,29 +42,37 @@ class ItalyCaseConverter(CaseConverter):
         )
         metadata["istatid"] = metadata.istatid.astype(int)
         df_conv = _correct_trentino(df_conv)
-        merged = pd.merge(df_conv, metadata, on='istatid').drop_duplicates()
+        merged = pd.merge(df_conv, metadata, on="istatid").drop_duplicates()
         merged = merged.rename(columns={"regionLabel": "region_label"})

         # calculate incidence rates
-        merged['population'] = merged.population.astype(int)
-        merged['positive_100k'] = merged['positive'] / merged['population'] * 100000
-        merged['deceased_100k'] = merged['deceased'] / merged['population'] * 100000
+        merged["population"] = merged.population.astype(int)
+        merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
+        merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000

         return merged[CaseConverter.common_columns]

+    @classmethod
+    def read_data(cls, path):
+        """Read in the data for Italy."""
+        df = pd.read_csv(Path(path) / "dpc-covid19-ita-regioni.csv")
+        return df
+

 def _correct_trentino(df):
     """Merge Bolzano and Trento."""
     # The regional data includes Bolzano and Trento separate regions - they are
     # two provinces of the Trentino Alto Adige region, so merge them together
-    df_trentino_alto_adige = df.loc[df['istatid'] == 4].groupby('date').sum().reset_index()
-    df_trentino_alto_adige['istatid'] = 4
-    df_trentino_alto_adige['latitude'] = 46.4337
-    df_trentino_alto_adige['longitude'] = 11.1693
-    df_trentino_alto_adige['denominazione_regione'] = 'Trentino Alto Adige'
-    df_trentino_alto_adige['country'] = 'ITA'
-    df = df[df['istatid'] !=4 ].append(df_trentino_alto_adige)
+    df_trentino_alto_adige = (
+        df.loc[df["istatid"] == 4].groupby("date").sum().reset_index()
+    )
+    df_trentino_alto_adige["istatid"] = 4
+    df_trentino_alto_adige["latitude"] = 46.4337
+    df_trentino_alto_adige["longitude"] = 11.1693
+    df_trentino_alto_adige["denominazione_regione"] = "Trentino Alto Adige"
+    df_trentino_alto_adige["country"] = "ITA"
+    df = df[df["istatid"] != 4].append(df_trentino_alto_adige)
     return df
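To make the Bolzano/Trento handling concrete: both provinces arrive with istatid 4, so grouping by date and summing collapses them into a single Trentino Alto Adige row per day, after which the helper resets istatid and the region metadata. A toy illustration of that aggregation step, with invented values:

import pandas as pd

# Two provinces sharing istatid 4 on the same date, as in the regional feed.
df = pd.DataFrame(
    {
        "date": ["2020-03-25", "2020-03-25"],
        "istatid": [4, 4],
        "positive": [100, 150],
        "deceased": [3, 5],
    }
)

# The same groupby-and-sum used in _correct_trentino: one combined row per date.
# (In the real helper, istatid is then reassigned to 4 and the region renamed.)
combined = df.groupby("date").sum().reset_index()
print(combined[["date", "positive", "deceased"]])  # 250 positives, 8 deceased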
@@ -2,11 +2,14 @@
 Covid-19 converters for data from Switzerland.
 """
+from pathlib import Path
+
 import pandas as pd

 from . import CaseConverter
 from .. import helper


 class OpenZHCaseConverter(CaseConverter):
     """
     Converter for data from Switzerland, collected by
@@ -45,25 +48,38 @@ class OpenZHCaseConverter(CaseConverter):
         # get metadata for swiss cantons
         metadata = pd.DataFrame(helper.get_region_populations("CHE"))

-        merged = pd.merge(df_conv, metadata, on='region_iso').drop_duplicates()
+        merged = pd.merge(df_conv, metadata, on="region_iso").drop_duplicates()
         merged["country"] = "CHE"

         # standardize the canton names
-        merged['regionLabel'] = merged.apply(lambda row: _standardize_canton_name(row['regionLabel']), axis=1)
+        merged["regionLabel"] = merged.apply(
+            lambda row: _standardize_canton_name(row["regionLabel"]), axis=1
+        )
         merged = merged.rename(columns={"regionLabel": "region_label"})

         # calculate incidence rates
-        merged['population'] = merged.population.astype(int)
-        merged['positive_100k'] = merged['positive'] / merged['population'] * 100000
-        merged['deceased_100k'] = merged['deceased'] / merged['population'] * 100000
+        merged["population"] = merged.population.astype(int)
+        merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
+        merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000

         return merged[CaseConverter.common_columns]

+    @classmethod
+    def read_data(cls, path):
+        """Read in the swiss cantonal data."""
+        df_list = []
+        for f in Path(path).glob("COVID19_Fallzahlen_Kanton_*total.csv"):
+            df_list.append(pd.read_csv(f))
+        df = pd.concat(df_list)
+        df["date"] = pd.to_datetime(df["date"], dayfirst=True)
+        return df
+

 def _standardize_canton_name(label):
     label = label.lower()
-    if label.startswith('canton of'):
-        label = label[len('canton of'):].strip()
+    if label.startswith("canton of"):
+        label = label[len("canton of") :].strip()
     return label.capitalize()

-OpenZHCaseConverter._register()
\ No newline at end of file
+
+OpenZHCaseConverter._register()
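The canton-name helper above strips the Wikidata-style "Canton of ..." prefix and title-cases what remains. Two quick examples of the expected behaviour; the inputs are illustrative, not values taken from an actual Wikidata response:

def _standardize_canton_name(label):
    # Same logic as the helper in the diff: drop the "canton of" prefix, capitalize the rest.
    label = label.lower()
    if label.startswith("canton of"):
        label = label[len("canton of") :].strip()
    return label.capitalize()

print(_standardize_canton_name("Canton of Zürich"))  # Zürich
print(_standardize_canton_name("Geneva"))            # Geneva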
@@ -153,7 +153,9 @@ def growth_df(rates_frames_map, geodata_df, name, countries_over_thresh, cutoff)
     return confirmed_rate_df


-def get_region_populations(country_iso3, additional_fields="", additional_query=""):
+def get_region_populations(
+    country_iso3, additional_fields="", additional_query="", label_lang="en"
+):
     import sys

     from SPARQLWrapper import SPARQLWrapper, JSON
@@ -174,7 +176,7 @@ def get_region_populations(country_iso3, additional_fields="", additional_query=
         ?country wdt:P31 wd:Q3624078 .
         {additional_query}

-        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
+        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{label_lang}" }}
     }}
     GROUP BY ?region_iso ?regionLabel {additional_fields}
     """
@@ -196,6 +198,7 @@ def get_region_populations(country_iso3, additional_fields="", additional_query=
             country_iso3=country_iso3,
             additional_fields=additional_fields,
             additional_query=additional_query,
+            label_lang=label_lang,
         ),
     )
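The new label_lang argument only swaps the language passed to Wikidata's label service, so regionLabel comes back localized while the rest of the query is untouched. A hedged usage sketch: the import path is a guess, the call needs network access plus the SPARQLWrapper dependency, and the return value is assumed to be the list of records the converters feed into pd.DataFrame():

from covid_19_utils import helper  # assumed module path

# English labels (the previous hard-coded behaviour) vs. German labels for Swiss cantons.
cantons_en = helper.get_region_populations("CHE")
cantons_de = helper.get_region_populations("CHE", label_lang="de")

print(cantons_en[0]["regionLabel"], "/", cantons_de[0]["regionLabel"])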
@@ -18,7 +18,7 @@ def set_time_index(df, drop_hour=True):
     if drop_hour:
         lambda_func = lambda x: x.split(" ")[0]
     else:
-        labda_func = lambda x: x
+        lambda_func = lambda x: x

     timestamp = pd.DatetimeIndex(df["date"].apply(lambda_func))
     df.set_index(timestamp, inplace=True)
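The one-character fix above matters because the misspelled labda_func left lambda_func unassigned on the drop_hour=False path, so the following DatetimeIndex line raised UnboundLocalError. A small sketch of the corrected behaviour; the function body is reconstructed from the hunk and the trailing return is added here only for convenience:

import pandas as pd

def set_time_index(df, drop_hour=True):
    # Reconstructed from the diff: index the frame by its "date" column,
    # optionally discarding the hour part ("2020-03-25 18:00" -> "2020-03-25").
    if drop_hour:
        lambda_func = lambda x: x.split(" ")[0]
    else:
        lambda_func = lambda x: x
    timestamp = pd.DatetimeIndex(df["date"].apply(lambda_func))
    df.set_index(timestamp, inplace=True)
    return df  # convenience for the example; the original mutates df in place

df = pd.DataFrame({"date": ["2020-03-25 18:00", "2020-03-26 18:00"], "positive": [1, 2]})
print(set_time_index(df, drop_hour=False).index)  # both branches now work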