From 88bf5be91d30ca1206147c4de6b527fcd370099b Mon Sep 17 00:00:00 2001 From: Rok Roskar <rokroskar@gmail.com> Date: Fri, 3 Apr 2020 18:15:24 +0200 Subject: [PATCH] chore: implement read methods on converter classes --- .../covid_19_dashboard/converters/__init__.py | 29 ++++++++++++----- .../converters/covidtracking.py | 32 ++++++++++++------- .../covid_19_dashboard/converters/italy.py | 32 ++++++++++++------- .../converters/switzerland.py | 32 ++++++++++++++----- .../covid_19_dashboard/helper.py | 7 ++-- .../covid_19_dashboard/italy_utils.py | 2 +- 6 files changed, 92 insertions(+), 42 deletions(-) diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py index b8ee7cfea..7bad5783c 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/__init__.py @@ -9,7 +9,8 @@ import pandas as pd from .. 
import helper -class CaseConverter(object): + +class CaseConverter(): """Base converter class.""" _converter_registry = [] @@ -26,23 +27,35 @@ class CaseConverter(object): "deceased", "population", "positive_100k", - "deceased_100k" + "deceased_100k", ] @classmethod - def can_convert(cls, df): + def can_convert(cls, path): """Returns true if the class can convert the Dataframe.""" + try: + df = cls.read_data(path) + except (FileNotFoundError, ValueError): + return False return all([col in df.columns for col in cls.column_list]) @classmethod - def convert(cls, df): + def read_convert(cls, path): """Converts the Dataframe into the common format.""" for converter in cls._converter_registry: - if converter.can_convert(df): - return converter.convert(df) - raise NotImplementedError("DataFrame could not be converted") + if converter.can_convert(path): + print(f'Using {converter} for {path}') + return converter.convert(converter.read_data(path)) + raise NotImplementedError(f"{path} could not be read and converted.") + + @classmethod + def read_data(cls, path): + """Read in the data from a directory path.""" + raise NotImplementedError( + "Please use one of these subclasses instead:\n" + "{}".format(", ".join([str(c) for c in cls._converter_registry])) + ) @classmethod def _register(cls): CaseConverter._converter_registry.append(cls) - diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py index da04a74e8..13b6aec15 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/covidtracking.py @@ -2,11 +2,15 @@ Covid-19 converters for data from the covidtracking.com (USA). """ +from pathlib import Path + + import pandas as pd from . import CaseConverter from .. 
import helper + class CovidtrackingCaseConverter(CaseConverter): """ Converter for data from the United States, collected by @@ -44,25 +48,23 @@ class CovidtrackingCaseConverter(CaseConverter): def convert(cls, df): # rename the existing columns df_conv = df.rename(columns=cls.conversion_dict) - + # convert date df_conv["date"] = pd.to_datetime(df_conv["date"], format="%Y%m%d") - + # make states iso-3116 2 compliant - df_conv["region_iso"] = df_conv.apply( - lambda row: f'US-{row["state"]}', axis=1 - ) + df_conv["region_iso"] = df_conv.apply(lambda row: f'US-{row["state"]}', axis=1) # get population data for US states through right-join type operation - metadata = pd.DataFrame( - helper.get_region_populations("USA") - ).rename(columns={"regionLabel": "region_label"}) - + metadata = pd.DataFrame(helper.get_region_populations("USA")).rename( + columns={"regionLabel": "region_label"} + ) + merged = pd.merge(df_conv, metadata, on="region_iso", how="right") - + # add country information merged["country"] = "USA" - + # calculate incidence rates merged["population"] = merged.population.astype(int) merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 @@ -70,4 +72,10 @@ class CovidtrackingCaseConverter(CaseConverter): return merged[CaseConverter.common_columns] -CovidtrackingCaseConverter._register() \ No newline at end of file + @classmethod + def read_data(cls, path): + """Read in the covidtracking state-level data.""" + return pd.read_json(Path(path) / "states-daily.json") + + +CovidtrackingCaseConverter._register() diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py index 7e04ddff4..11873554a 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/italy.py @@ -2,6 +2,8 @@ Converters for covid-19 data from Italy. 
""" +from pathlib import Path + import pandas as pd from . import CaseConverter @@ -40,29 +42,37 @@ class ItalyCaseConverter(CaseConverter): ) metadata["istatid"] = metadata.istatid.astype(int) df_conv = _correct_trentino(df_conv) - merged = pd.merge(df_conv, metadata, on='istatid').drop_duplicates() + merged = pd.merge(df_conv, metadata, on="istatid").drop_duplicates() merged = merged.rename(columns={"regionLabel": "region_label"}) # calculate incidence rates - merged['population'] = merged.population.astype(int) - merged['positive_100k'] = merged['positive'] / merged['population'] * 100000 - merged['deceased_100k'] = merged['deceased'] / merged['population'] * 100000 + merged["population"] = merged.population.astype(int) + merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 + merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000 return merged[CaseConverter.common_columns] + @classmethod + def read_data(cls, path): + """Read in the data for Italy.""" + df = pd.read_csv(Path(path) / "dpc-covid19-ita-regioni.csv") + return df + def _correct_trentino(df): """Merge Bolzano and Trento.""" # The regional data includes Bolzano and Trento separate regions - they are # two provinces of the Trentino Alto Adige region, so merge them together - df_trentino_alto_adige = df.loc[df['istatid'] == 4].groupby('date').sum().reset_index() - df_trentino_alto_adige['istatid'] = 4 - df_trentino_alto_adige['latitude'] = 46.4337 - df_trentino_alto_adige['longitude'] = 11.1693 - df_trentino_alto_adige['denominazione_regione'] = 'Trentino Alto Adige' - df_trentino_alto_adige['country'] = 'ITA' - df = df[df['istatid'] !=4 ].append(df_trentino_alto_adige) + df_trentino_alto_adige = ( + df.loc[df["istatid"] == 4].groupby("date").sum().reset_index() + ) + df_trentino_alto_adige["istatid"] = 4 + df_trentino_alto_adige["latitude"] = 46.4337 + df_trentino_alto_adige["longitude"] = 11.1693 + df_trentino_alto_adige["denominazione_regione"] = "Trentino 
Alto Adige" + df_trentino_alto_adige["country"] = "ITA" + df = df[df["istatid"] != 4].append(df_trentino_alto_adige) return df diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py index 1fbf87ddc..035e0d7f2 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/converters/switzerland.py @@ -2,11 +2,14 @@ Covid-19 converters for data from Switzerland. """ +from pathlib import Path + import pandas as pd from . import CaseConverter from .. import helper + class OpenZHCaseConverter(CaseConverter): """ Converter for data from Switzerland, collected by @@ -45,25 +48,38 @@ class OpenZHCaseConverter(CaseConverter): # get metadata for swiss cantons metadata = pd.DataFrame(helper.get_region_populations("CHE")) - merged = pd.merge(df_conv, metadata, on='region_iso').drop_duplicates() + merged = pd.merge(df_conv, metadata, on="region_iso").drop_duplicates() merged["country"] = "CHE" # standardize the canton names - merged['regionLabel'] = merged.apply(lambda row: _standardize_canton_name(row['regionLabel']), axis=1) + merged["regionLabel"] = merged.apply( + lambda row: _standardize_canton_name(row["regionLabel"]), axis=1 + ) merged = merged.rename(columns={"regionLabel": "region_label"}) # calculate incidence rates - merged['population'] = merged.population.astype(int) - merged['positive_100k'] = merged['positive'] / merged['population'] * 100000 - merged['deceased_100k'] = merged['deceased'] / merged['population'] * 100000 + merged["population"] = merged.population.astype(int) + merged["positive_100k"] = merged["positive"] / merged["population"] * 100000 + merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000 return merged[CaseConverter.common_columns] + @classmethod + def read_data(cls, path): + """Read in the swiss cantonal data.""" + df_list = 
[] + for f in Path(path).glob("COVID19_Fallzahlen_Kanton_*total.csv"): + df_list.append(pd.read_csv(f)) + df = pd.concat(df_list) + df["date"] = pd.to_datetime(df["date"], dayfirst=True) + return df + def _standardize_canton_name(label): label = label.lower() - if label.startswith('canton of'): - label = label[len('canton of'):].strip() + if label.startswith("canton of"): + label = label[len("canton of") :].strip() return label.capitalize() -OpenZHCaseConverter._register() \ No newline at end of file + +OpenZHCaseConverter._register() diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py index 163e7a863..5629bf9a3 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/helper.py @@ -153,7 +153,9 @@ def growth_df(rates_frames_map, geodata_df, name, countries_over_thresh, cutoff) return confirmed_rate_df -def get_region_populations(country_iso3, additional_fields="", additional_query=""): +def get_region_populations( + country_iso3, additional_fields="", additional_query="", label_lang="en" +): import sys from SPARQLWrapper import SPARQLWrapper, JSON @@ -174,7 +176,7 @@ def get_region_populations(country_iso3, additional_fields="", additional_query= ?country wdt:P31 wd:Q3624078 . 
{additional_query} - SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }} + SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{label_lang}" }} }} GROUP BY ?region_iso ?regionLabel {additional_fields} """ @@ -196,6 +198,7 @@ def get_region_populations(country_iso3, additional_fields="", additional_query= country_iso3=country_iso3, additional_fields=additional_fields, additional_query=additional_query, + label_lang=label_lang, ), ) diff --git a/src/covid-19/covid_19_dashboard/covid_19_dashboard/italy_utils.py b/src/covid-19/covid_19_dashboard/covid_19_dashboard/italy_utils.py index e8254c90a..912d59df8 100644 --- a/src/covid-19/covid_19_dashboard/covid_19_dashboard/italy_utils.py +++ b/src/covid-19/covid_19_dashboard/covid_19_dashboard/italy_utils.py @@ -18,7 +18,7 @@ def set_time_index(df, drop_hour=True): if drop_hour: lambda_func = lambda x: x.split(" ")[0] else: - labda_func = lambda x: x + lambda_func = lambda x: x timestamp = pd.DatetimeIndex(df["date"].apply(lambda_func)) df.set_index(timestamp, inplace=True) -- GitLab