Commit 88bf5be9, authored by Rok Roškar and committed by Chandrasekhar Ramakrishnan
chore: implement read methods on converter classes

parent 7163637d
2 merge requests: !107 US Census, !103 standardize-data
@@ -9,7 +9,8 @@ import pandas as pd
 from .. import helper


-class CaseConverter(object):
+class CaseConverter():
     """Base converter class."""

     _converter_registry = []
@@ -26,23 +27,35 @@ class CaseConverter(object):
         "deceased",
         "population",
         "positive_100k",
-        "deceased_100k"
+        "deceased_100k",
     ]

     @classmethod
-    def can_convert(cls, df):
+    def can_convert(cls, path):
         """Returns true if the class can convert the Dataframe."""
+        try:
+            df = cls.read_data(path)
+        except (FileNotFoundError, ValueError):
+            return False
         return all([col in df.columns for col in cls.column_list])

     @classmethod
-    def convert(cls, df):
+    def read_convert(cls, path):
         """Converts the Dataframe into the common format."""
         for converter in cls._converter_registry:
-            if converter.can_convert(df):
-                return converter.convert(df)
-        raise NotImplementedError("DataFrame could not be converted")
+            if converter.can_convert(path):
+                print(f'Using {converter} for {path}')
+                return converter.convert(converter.read_data(path))
+        raise NotImplementedError(f"{path} could not be read and converted.")
+
+    @classmethod
+    def read_data(cls, path):
+        """Read in the data from a directory path."""
+        raise NotImplementedError(
+            "Please use one of these subclasses instead:\n"
+            "{}".format(", ".join([str(c) for c in cls._converter_registry]))
+        )

     @classmethod
     def _register(cls):
         CaseConverter._converter_registry.append(cls)
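For context on how the new entry point is meant to be used: callers hand a directory path to the base class, each registered subclass tries its own read_data() on that path, and the first one whose columns check out performs the conversion. A minimal sketch of a call site; the package and module names and the data directory are guesses for illustration, not part of this commit:

import importlib

# Assumed package layout; importing the subclass modules runs their _register() calls.
from covid_19_utils.converters import CaseConverter
for mod in ("covidtracking", "italy", "switzerland"):
    importlib.import_module(f"covid_19_utils.converters.{mod}")

# Dispatch is now purely path-based: can_convert() reads the data itself
# and inspects the resulting columns before convert() is called.
df = CaseConverter.read_convert("data/covidtracking")  # hypothetical data directory
print(df[CaseConverter.common_columns].head())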
@@ -2,11 +2,15 @@
 Covid-19 converters for data from the covidtracking.com (USA).
 """
+from pathlib import Path
+
 import pandas as pd

 from . import CaseConverter
 from .. import helper


 class CovidtrackingCaseConverter(CaseConverter):
     """
     Converter for data from the United States, collected by
@@ -44,25 +48,23 @@ class CovidtrackingCaseConverter(CaseConverter):
     def convert(cls, df):
         # rename the existing columns
         df_conv = df.rename(columns=cls.conversion_dict)

         # convert date
         df_conv["date"] = pd.to_datetime(df_conv["date"], format="%Y%m%d")

         # make states iso-3116 2 compliant
-        df_conv["region_iso"] = df_conv.apply(
-            lambda row: f'US-{row["state"]}', axis=1
-        )
+        df_conv["region_iso"] = df_conv.apply(lambda row: f'US-{row["state"]}', axis=1)

         # get population data for US states through right-join type operation
-        metadata = pd.DataFrame(
-            helper.get_region_populations("USA")
-        ).rename(columns={"regionLabel": "region_label"})
+        metadata = pd.DataFrame(helper.get_region_populations("USA")).rename(
+            columns={"regionLabel": "region_label"}
+        )
         merged = pd.merge(df_conv, metadata, on="region_iso", how="right")

         # add country information
         merged["country"] = "USA"

         # calculate incidence rates
         merged["population"] = merged.population.astype(int)
         merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
@@ -70,4 +72,10 @@ class CovidtrackingCaseConverter(CaseConverter):

         return merged[CaseConverter.common_columns]

-CovidtrackingCaseConverter._register()
\ No newline at end of file
+    @classmethod
+    def read_data(cls, path):
+        """Read in the covidtracking state-level data."""
+        return pd.read_json(Path(path) / "states-daily.json")
+
+
+CovidtrackingCaseConverter._register()
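A small illustration of the two covidtracking-specific transformations in convert() above, namely parsing the integer YYYYMMDD dates and building ISO 3166-2 style region codes; the row values are invented for the example:

import pandas as pd

# One made-up record in the covidtracking layout: integer date plus two-letter state code.
raw = pd.DataFrame({"date": [20200325], "state": ["NY"]})

# Same transformations the converter applies.
raw["date"] = pd.to_datetime(raw["date"], format="%Y%m%d")
raw["region_iso"] = raw.apply(lambda row: f'US-{row["state"]}', axis=1)
print(raw.iloc[0].to_dict())
# {'date': Timestamp('2020-03-25 00:00:00'), 'state': 'NY', 'region_iso': 'US-NY'}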
@@ -2,6 +2,8 @@
 Converters for covid-19 data from Italy.
 """
+from pathlib import Path
+
 import pandas as pd

 from . import CaseConverter
@@ -40,29 +42,37 @@ class ItalyCaseConverter(CaseConverter):
         )
         metadata["istatid"] = metadata.istatid.astype(int)
         df_conv = _correct_trentino(df_conv)
-        merged = pd.merge(df_conv, metadata, on='istatid').drop_duplicates()
+        merged = pd.merge(df_conv, metadata, on="istatid").drop_duplicates()
         merged = merged.rename(columns={"regionLabel": "region_label"})

         # calculate incidence rates
-        merged['population'] = merged.population.astype(int)
-        merged['positive_100k'] = merged['positive'] / merged['population'] * 100000
-        merged['deceased_100k'] = merged['deceased'] / merged['population'] * 100000
+        merged["population"] = merged.population.astype(int)
+        merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
+        merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000

         return merged[CaseConverter.common_columns]

+    @classmethod
+    def read_data(cls, path):
+        """Read in the data for Italy."""
+        df = pd.read_csv(Path(path) / "dpc-covid19-ita-regioni.csv")
+        return df
+

 def _correct_trentino(df):
     """Merge Bolzano and Trento."""
     # The regional data includes Bolzano and Trento separate regions - they are
     # two provinces of the Trentino Alto Adige region, so merge them together
-    df_trentino_alto_adige = df.loc[df['istatid'] == 4].groupby('date').sum().reset_index()
-    df_trentino_alto_adige['istatid'] = 4
-    df_trentino_alto_adige['latitude'] = 46.4337
-    df_trentino_alto_adige['longitude'] = 11.1693
-    df_trentino_alto_adige['denominazione_regione'] = 'Trentino Alto Adige'
-    df_trentino_alto_adige['country'] = 'ITA'
-    df = df[df['istatid'] !=4 ].append(df_trentino_alto_adige)
+    df_trentino_alto_adige = (
+        df.loc[df["istatid"] == 4].groupby("date").sum().reset_index()
+    )
+    df_trentino_alto_adige["istatid"] = 4
+    df_trentino_alto_adige["latitude"] = 46.4337
+    df_trentino_alto_adige["longitude"] = 11.1693
+    df_trentino_alto_adige["denominazione_regione"] = "Trentino Alto Adige"
+    df_trentino_alto_adige["country"] = "ITA"
+    df = df[df["istatid"] != 4].append(df_trentino_alto_adige)
     return df
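To make the Bolzano/Trento handling concrete: both provinces arrive with istatid 4, so grouping by date and summing collapses them into a single Trentino Alto Adige row per day, after which the helper resets istatid and the region metadata. A toy illustration of that aggregation step, with invented values:

import pandas as pd

# Two provinces sharing istatid 4 on the same date, as in the regional feed.
df = pd.DataFrame(
    {
        "date": ["2020-03-25", "2020-03-25"],
        "istatid": [4, 4],
        "positive": [100, 150],
        "deceased": [3, 5],
    }
)

# The same groupby-and-sum used in _correct_trentino: one combined row per date.
# (In the real helper, istatid is then reassigned to 4 and the region renamed.)
combined = df.groupby("date").sum().reset_index()
print(combined[["date", "positive", "deceased"]])  # 250 positives, 8 deceased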
@@ -2,11 +2,14 @@
 Covid-19 converters for data from Switzerland.
 """
+from pathlib import Path
+
 import pandas as pd

 from . import CaseConverter
 from .. import helper


 class OpenZHCaseConverter(CaseConverter):
     """
     Converter for data from Switzerland, collected by
@@ -45,25 +48,38 @@ class OpenZHCaseConverter(CaseConverter):
         # get metadata for swiss cantons
         metadata = pd.DataFrame(helper.get_region_populations("CHE"))

-        merged = pd.merge(df_conv, metadata, on='region_iso').drop_duplicates()
+        merged = pd.merge(df_conv, metadata, on="region_iso").drop_duplicates()
         merged["country"] = "CHE"

         # standardize the canton names
-        merged['regionLabel'] = merged.apply(lambda row: _standardize_canton_name(row['regionLabel']), axis=1)
+        merged["regionLabel"] = merged.apply(
+            lambda row: _standardize_canton_name(row["regionLabel"]), axis=1
+        )
         merged = merged.rename(columns={"regionLabel": "region_label"})

         # calculate incidence rates
-        merged['population'] = merged.population.astype(int)
-        merged['positive_100k'] = merged['positive'] / merged['population'] * 100000
-        merged['deceased_100k'] = merged['deceased'] / merged['population'] * 100000
+        merged["population"] = merged.population.astype(int)
+        merged["positive_100k"] = merged["positive"] / merged["population"] * 100000
+        merged["deceased_100k"] = merged["deceased"] / merged["population"] * 100000

         return merged[CaseConverter.common_columns]

+    @classmethod
+    def read_data(cls, path):
+        """Read in the swiss cantonal data."""
+        df_list = []
+        for f in Path(path).glob("COVID19_Fallzahlen_Kanton_*total.csv"):
+            df_list.append(pd.read_csv(f))
+        df = pd.concat(df_list)
+        df["date"] = pd.to_datetime(df["date"], dayfirst=True)
+        return df
+

 def _standardize_canton_name(label):
     label = label.lower()
-    if label.startswith('canton of'):
-        label = label[len('canton of'):].strip()
+    if label.startswith("canton of"):
+        label = label[len("canton of") :].strip()
     return label.capitalize()

-OpenZHCaseConverter._register()
\ No newline at end of file
+
+OpenZHCaseConverter._register()
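The canton-name helper above strips the Wikidata-style "Canton of ..." prefix and title-cases what remains. Two quick examples of the expected behaviour; the inputs are illustrative, not values taken from an actual Wikidata response:

def _standardize_canton_name(label):
    # Same logic as the helper in the diff: drop the "canton of" prefix, capitalize the rest.
    label = label.lower()
    if label.startswith("canton of"):
        label = label[len("canton of") :].strip()
    return label.capitalize()

print(_standardize_canton_name("Canton of Zürich"))  # Zürich
print(_standardize_canton_name("Geneva"))            # Geneva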
@@ -153,7 +153,9 @@ def growth_df(rates_frames_map, geodata_df, name, countries_over_thresh, cutoff)
     return confirmed_rate_df


-def get_region_populations(country_iso3, additional_fields="", additional_query=""):
+def get_region_populations(
+    country_iso3, additional_fields="", additional_query="", label_lang="en"
+):
     import sys

     from SPARQLWrapper import SPARQLWrapper, JSON
@@ -174,7 +176,7 @@ def get_region_populations(country_iso3, additional_fields="", additional_query=
         ?country wdt:P31 wd:Q3624078 .
         {additional_query}

-        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }}
+        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "{label_lang}" }}
     }}
     GROUP BY ?region_iso ?regionLabel {additional_fields}
     """
@@ -196,6 +198,7 @@ def get_region_populations(country_iso3, additional_fields="", additional_query=
             country_iso3=country_iso3,
             additional_fields=additional_fields,
             additional_query=additional_query,
+            label_lang=label_lang,
         ),
     )
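The new label_lang argument only swaps the language passed to Wikidata's label service, so regionLabel comes back localized while the rest of the query is untouched. A hedged usage sketch: the import path is a guess, the call needs network access plus the SPARQLWrapper dependency, and the return value is assumed to be the list of records the converters feed into pd.DataFrame():

from covid_19_utils import helper  # assumed module path

# English labels (the previous hard-coded behaviour) vs. German labels for Swiss cantons.
cantons_en = helper.get_region_populations("CHE")
cantons_de = helper.get_region_populations("CHE", label_lang="de")

print(cantons_en[0]["regionLabel"], "/", cantons_de[0]["regionLabel"])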
@@ -18,7 +18,7 @@ def set_time_index(df, drop_hour=True):
     if drop_hour:
         lambda_func = lambda x: x.split(" ")[0]
     else:
-        labda_func = lambda x: x
+        lambda_func = lambda x: x

     timestamp = pd.DatetimeIndex(df["date"].apply(lambda_func))
     df.set_index(timestamp, inplace=True)
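The one-character fix above matters because the misspelled labda_func left lambda_func unassigned on the drop_hour=False path, so the following DatetimeIndex line raised UnboundLocalError. A small sketch of the corrected behaviour; the function body is reconstructed from the hunk and the trailing return is added here only for convenience:

import pandas as pd

def set_time_index(df, drop_hour=True):
    # Reconstructed from the diff: index the frame by its "date" column,
    # optionally discarding the hour part ("2020-03-25 18:00" -> "2020-03-25").
    if drop_hour:
        lambda_func = lambda x: x.split(" ")[0]
    else:
        lambda_func = lambda x: x
    timestamp = pd.DatetimeIndex(df["date"].apply(lambda_func))
    df.set_index(timestamp, inplace=True)
    return df  # convenience for the example; the original mutates df in place

df = pd.DataFrame({"date": ["2020-03-25 18:00", "2020-03-26 18:00"], "positive": [1, 2]})
print(set_time_index(df, drop_hour=False).index)  # both branches now work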