From e8dd4bed2bd92c7824c1cfd093572fe664740b1a Mon Sep 17 00:00:00 2001 From: Rok Roskar <rokroskar@gmail.com> Date: Mon, 30 Mar 2020 15:35:23 +0200 Subject: [PATCH] italy: refactor utils and papermill-ize the italy notebook --- .gitignore | 2 + README.md | 10 +- ...e.ipynb => italy-covid-19-dashboard.ipynb} | 0 ...ook-example.ipynb => italy-covid-19.ipynb} | 118 +++++++++--------- .../examples/italy-examples/italy_utils.py | 46 ------- .../.github/ISSUE_TEMPLATE.md | 15 --- .../covid_19_dashboard/italy_utils.py | 89 +++++++++++++ 7 files changed, 156 insertions(+), 124 deletions(-) rename notebooks/examples/{italy-examples/italy-dashboard-example.ipynb => italy-covid-19-dashboard.ipynb} (100%) rename notebooks/examples/{italy-examples/italy-notebook-example.ipynb => italy-covid-19.ipynb} (99%) delete mode 100644 notebooks/examples/italy-examples/italy_utils.py delete mode 100644 src/covid-19/covid_19_dashboard/.github/ISSUE_TEMPLATE.md create mode 100644 src/covid-19/covid_19_dashboard/covid_19_dashboard/italy_utils.py diff --git a/.gitignore b/.gitignore index 7112dac..2be4bb4 100644 --- a/.gitignore +++ b/.gitignore @@ -378,3 +378,5 @@ tags .renku.lock .renku/tmp .renku/cache + +.vscode diff --git a/README.md b/README.md index 8d54616..89c4240 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ results from your fork! If you don't know how or just need help with some of the git-heavy aspects of this, shoot us a line [on Discourse](https://renku.discourse.group) or [open an issue](https://renkulab.io/projects/covid-19/covid-19-public-data/collaboration/issues) -and someone will be able to help out. +and someone will be able to help out. The environment image allows you to work in Python or R in JupyterLab or RStudio/Shiny. 
@@ -58,7 +58,7 @@ The environment image allows you to work in Python or R in JupyterLab or RStudio <td><a href="https://github.com/pcm-dpc/COVID-19">Covid-19 data for Italy</a></td> <td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/datasets/286c58b1-dbbc-4caa-a23a-fcb001d5ac51/">covid-19-italy</a></td> <td><code>data/covid-19-italy</code></td> -<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/notebooks/examples/italy-examples/italy-notebook-example.ipynb">notebook</a> +<td><a href="https://renkulab.io/projects/covid-19/covid-19-public-data/files/blob/runs/italy-covid-19.ipynb">notebook</a> </td> </tr> <tr> @@ -92,11 +92,11 @@ this data in combination with population data from the world bank. ### Covid tracking crowdsourcing project -[Covid tracking](https://covidtracking.com) is a crowd-sourced dataset for US state-level data. It is updated by hand by an army of volunteers. +[Covid tracking](https://covidtracking.com) is a crowd-sourced dataset for US state-level data. It is updated by hand by an army of volunteers. ### OpenData Zuerich -The [swiss cantonal data](https://github.com/openZH/covid_19) collected by the Zürich statistical office. Parts are updated manually, others are starting to become automated. +The [swiss cantonal data](https://github.com/openZH/covid_19) collected by the Zürich statistical office. Parts are updated manually, others are starting to become automated. ### Case data for Italy @@ -135,7 +135,7 @@ A collection of tweet-ids related to covid-19 from https://github.com/echen102/C If you are interested in working on this project, we would love to get contributions. We would really like to collect more data sources and make them available here! Please provide ideas for data sources that are relevant to -understanding covid-19. +understanding covid-19. 
If you want to add a new datasource yourself, see the section [Adding a new data source](#adding-a-new-data-source) diff --git a/notebooks/examples/italy-examples/italy-dashboard-example.ipynb b/notebooks/examples/italy-covid-19-dashboard.ipynb similarity index 100% rename from notebooks/examples/italy-examples/italy-dashboard-example.ipynb rename to notebooks/examples/italy-covid-19-dashboard.ipynb diff --git a/notebooks/examples/italy-examples/italy-notebook-example.ipynb b/notebooks/examples/italy-covid-19.ipynb similarity index 99% rename from notebooks/examples/italy-examples/italy-notebook-example.ipynb rename to notebooks/examples/italy-covid-19.ipynb index a12e68a..7ccb963 100644 --- a/notebooks/examples/italy-examples/italy-notebook-example.ipynb +++ b/notebooks/examples/italy-covid-19.ipynb @@ -1,10 +1,28 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# COVID-19 Case data for Italy\n", + "\n", + "Data from [Civil Protection of Italy](https://github.com/pcm-dpc/COVID-19).\n" + ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], "source": [ "%load_ext autoreload\n", "%autoreload 2" @@ -12,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -21,12 +39,12 @@ "import pandas as pd\n", "import altair as alt\n", "\n", - "from italy_utils import * " + "from covid_19_dashboard.italy_utils import get_region_populations, prepare_dataframe" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "tags": [ "parameters" @@ -34,12 +52,12 @@ }, "outputs": [], "source": [ - "data_folder = \"../../../data/covid-19-italy/\"" + "data_folder = \"../../data/covid-19-italy/\"" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -53,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -62,19 +80,26 @@ "df_national[\"New cases per day\"] = df_national[\"total_cases\"].diff()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## National summary of total cases and tests" + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "<div id=\"altair-viz-13c82e6f28564e11aa2669e7cef5118c\"></div>\n", + "<div id=\"altair-viz-b0e9028f399740e6ae252330c1ab787a\"></div>\n", "<script type=\"text/javascript\">\n", " (function(spec, embedOpt){\n", - " const outputDiv = document.getElementById(\"altair-viz-13c82e6f28564e11aa2669e7cef5118c\");\n", + " const outputDiv = document.getElementById(\"altair-viz-b0e9028f399740e6ae252330c1ab787a\");\n", " const paths = {\n", " \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n", " \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n", @@ -122,7 +147,7 @@ "alt.HConcatChart(...)" ] }, - "execution_count": 6, + "execution_count": 11, 
"metadata": {}, "output_type": "execute_result" } @@ -143,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -168,43 +193,13 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Retrieve population data and include it in the dataframe\n", "\n", - "def get_region_populations():\n", - " import sys\n", - " from SPARQLWrapper import SPARQLWrapper, JSON\n", - "\n", - " endpoint_url = \"https://query.wikidata.org/sparql\"\n", - "\n", - " query = \"\"\"SELECT DISTINCT ?istatid (MAX(?population) AS ?maxPopulation)\n", - " WHERE \n", - " { \n", - " ?region wdt:P31 wd:Q16110 .\n", - " ?region wdt:P1082 ?population .\n", - " ?region wdt:P635 ?istatid\n", - "\n", - " }\n", - " GROUP BY ?istatid\n", - " ORDER BY DESC(?maxPopulation)\n", - " \"\"\"\n", - "\n", - "\n", - " def get_results(endpoint_url, query):\n", - " user_agent = \"WDQS-example Python/%s.%s\" % (sys.version_info[0], sys.version_info[1])\n", - " # TODO adjust user agent; see https://w.wiki/CX6\n", - " sparql = SPARQLWrapper(endpoint_url, agent=user_agent)\n", - " sparql.setQuery(query)\n", - " sparql.setReturnFormat(JSON)\n", - " return sparql.query().convert()\n", - "\n", - "\n", - " results = get_results(endpoint_url, query)\n", "\n", - " return {int(result['istatid']['value']): int(result['maxPopulation']['value']) for result in results[\"results\"][\"bindings\"]}\n", "\n", "populations = get_region_populations()\n", "\n", @@ -214,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -223,19 +218,26 @@ "sorted_regions = max_cases.sort_values(by='total_cases', ascending=False)['region']" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cases per region" + ] + }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "metadata": {}, 
"outputs": [ { "data": { "text/html": [ "\n", - "<div id=\"altair-viz-15e14124b6a841b9aa8498e393f469cb\"></div>\n", + "<div id=\"altair-viz-7052d881cde742fd9e6efd03dec87b6a\"></div>\n", "<script type=\"text/javascript\">\n", " (function(spec, embedOpt){\n", - " const outputDiv = document.getElementById(\"altair-viz-15e14124b6a841b9aa8498e393f469cb\");\n", + " const outputDiv = document.getElementById(\"altair-viz-7052d881cde742fd9e6efd03dec87b6a\");\n", " const paths = {\n", " \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n", " \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n", @@ -283,7 +285,7 @@ "alt.FacetChart(...)" ] }, - "execution_count": 10, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -308,17 +310,17 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "<div id=\"altair-viz-9060b987df434411b98238de09c624f7\"></div>\n", + "<div id=\"altair-viz-db198cb7a8a74e879ac65b2f99eb1f88\"></div>\n", "<script type=\"text/javascript\">\n", " (function(spec, embedOpt){\n", - " const outputDiv = document.getElementById(\"altair-viz-9060b987df434411b98238de09c624f7\");\n", + " const outputDiv = document.getElementById(\"altair-viz-db198cb7a8a74e879ac65b2f99eb1f88\");\n", " const paths = {\n", " \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n", " \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n", @@ -366,7 +368,7 @@ "alt.FacetChart(...)" ] }, - "execution_count": 11, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -391,7 +393,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -403,17 +405,17 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", - "<div 
id=\"altair-viz-5ee298d0662e49f4b9aa4bf59a5beff8\"></div>\n", + "<div id=\"altair-viz-55822820a19a4ebbb884079884d1d6fa\"></div>\n", "<script type=\"text/javascript\">\n", " (function(spec, embedOpt){\n", - " const outputDiv = document.getElementById(\"altair-viz-5ee298d0662e49f4b9aa4bf59a5beff8\");\n", + " const outputDiv = document.getElementById(\"altair-viz-55822820a19a4ebbb884079884d1d6fa\");\n", " const paths = {\n", " \"vega\": \"https://cdn.jsdelivr.net/npm//vega@5?noext\",\n", " \"vega-lib\": \"https://cdn.jsdelivr.net/npm//vega-lib?noext\",\n", @@ -461,7 +463,7 @@ "alt.FacetChart(...)" ] }, - "execution_count": 13, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/examples/italy-examples/italy_utils.py b/notebooks/examples/italy-examples/italy_utils.py deleted file mode 100644 index fef759c..0000000 --- a/notebooks/examples/italy-examples/italy_utils.py +++ /dev/null @@ -1,46 +0,0 @@ -import json -import os -import pandas as pd - - -def translate_columns(data_folder, df, description_filename): - description_file_path = os.path.join(data_folder, description_filename) - - with open(description_file_path, 'r') as description_file: - descriptions = json.loads(description_file.read().encode().decode('utf-8-sig')) - descriptions = { column_dict['Nome campo']: column_dict for column_dict in descriptions} - df.rename(columns=lambda col: descriptions[col]['Field name'], inplace=True) - return df - -def set_time_index(df, drop_hour=True): - if drop_hour: - lambda_func = lambda x: x.split(' ')[0] - else: - labda_func = lambda x: x - - timestamp = pd.DatetimeIndex(df['date'].apply(lambda_func)) - df.set_index(timestamp, inplace=True) - del df['date'] - return df - -def prepare_dataframe(data_folder, df_filename, description_filename, use_time_index=False): - data_file_path = os.path.join(data_folder, df_filename) - df = pd.read_csv(data_file_path) - del df['note_it'] - del df['note_en'] - df = 
translate_columns(data_folder, df, description_filename) - if use_time_index: - df = set_time_index(df) - return df - -def get_province_structure(df_provinces): - """Extract the province/region structure from the province dataframe.""" - - def get_province_list(region): - """Get list of provinces for a given region.""" - provinces = set(df_provinces.loc[df_provinces['region']==region]['province']) - provinces.discard('In fase di definizione/aggiornamento') - return list(provinces) - - regions = df_provinces['region'].unique() - return {region: get_province_list(region) for region in regions} \ No newline at end of file diff --git a/src/covid-19/covid_19_dashboard/.github/ISSUE_TEMPLATE.md b/src/covid-19/covid_19_dashboard/.github/ISSUE_TEMPLATE.md deleted file mode 100644 index 127e9a1..0000000 --- a/src/covid-19/covid_19_dashboard/.github/ISSUE_TEMPLATE.md +++ /dev/null @@ -1,15 +0,0 @@ -* COVID-19 Dashboard version: -* Python version: -* Operating System: - -### Description - -Describe what you were trying to get done. -Tell us what happened, what went wrong, and what you expected to happen. - -### What I Did - -``` -Paste the command(s) you ran and the output. -If there was a crash, please include the traceback here. 
import json
import os

import pandas as pd


def translate_columns(data_folder, df, description_filename):
    """Rename the Italian column names of *df* to their English equivalents.

    The description file is a JSON array with one record per column, in which
    ``'Nome campo'`` holds the Italian column name and ``'Field name'`` the
    English translation.

    :param data_folder: directory that contains ``description_filename``.
    :param df: dataframe to rename; renamed in place.
    :param description_filename: name of the JSON description file.
    :returns: the same dataframe, with columns renamed.
    """
    description_file_path = os.path.join(data_folder, description_filename)

    # The upstream file starts with a UTF-8 BOM; the 'utf-8-sig' codec
    # strips it (the original read/encode/decode round-trip did the same).
    with open(description_file_path, "r", encoding="utf-8-sig") as description_file:
        descriptions = json.load(description_file)
    descriptions = {
        column_dict["Nome campo"]: column_dict for column_dict in descriptions
    }
    df.rename(columns=lambda col: descriptions[col]["Field name"], inplace=True)
    return df


def set_time_index(df, drop_hour=True):
    """Replace the ``'date'`` column of *df* with a ``DatetimeIndex``.

    :param df: dataframe with a string ``'date'`` column; modified in place.
    :param drop_hour: when True, keep only the date part of each timestamp
        (the text before the first space).
    :returns: the same dataframe, indexed by timestamp, ``'date'`` removed.
    """
    if drop_hour:
        lambda_func = lambda x: x.split(" ")[0]
    else:
        # BUG FIX: this branch previously assigned to a misspelled name
        # ('labda_func'), so calling with drop_hour=False raised NameError.
        lambda_func = lambda x: x

    timestamp = pd.DatetimeIndex(df["date"].apply(lambda_func))
    df.set_index(timestamp, inplace=True)
    del df["date"]
    return df


def prepare_dataframe(
    data_folder, df_filename, description_filename, use_time_index=False
):
    """Load an Italian covid-19 CSV and translate it to English columns.

    :param data_folder: directory containing the CSV and the description file.
    :param df_filename: name of the CSV file to load.
    :param description_filename: JSON file used by :func:`translate_columns`.
    :param use_time_index: when True, also index the result by date via
        :func:`set_time_index`.
    :returns: the prepared dataframe.
    """
    data_file_path = os.path.join(data_folder, df_filename)
    df = pd.read_csv(data_file_path)
    # Free-text note columns are not useful for analysis.
    del df["note_it"]
    del df["note_en"]
    df = translate_columns(data_folder, df, description_filename)
    if use_time_index:
        df = set_time_index(df)
    return df


def get_province_structure(df_provinces):
    """Extract the province/region structure from the province dataframe.

    :param df_provinces: dataframe with ``'region'`` and ``'province'`` columns.
    :returns: dict mapping each region name to its list of provinces; the
        placeholder entry 'In fase di definizione/aggiornamento' is dropped.
    """

    def get_province_list(region):
        """Get list of provinces for a given region."""
        provinces = set(df_provinces.loc[df_provinces["region"] == region]["province"])
        provinces.discard("In fase di definizione/aggiornamento")
        return list(provinces)

    regions = df_provinces["region"].unique()
    return {region: get_province_list(region) for region in regions}


def get_region_populations():
    """Query Wikidata for the population of each Italian region.

    Performs a live SPARQL query against the Wikidata endpoint; requires
    network access and the ``SPARQLWrapper`` package.

    :returns: dict mapping the integer ISTAT region id to the most recently
        reported (maximum) population value.
    """
    import sys

    from SPARQLWrapper import SPARQLWrapper, JSON

    endpoint_url = "https://query.wikidata.org/sparql"

    # wd:Q16110 = "region of Italy"; P1082 = population; P635 = ISTAT id.
    query = """SELECT DISTINCT ?istatid (MAX(?population) AS ?maxPopulation)
    WHERE
    {
        ?region wdt:P31 wd:Q16110 .
        ?region wdt:P1082 ?population .
        ?region wdt:P635 ?istatid
    }
    GROUP BY ?istatid
    ORDER BY DESC(?maxPopulation)
    """

    def get_results(endpoint_url, query):
        user_agent = "WDQS-example Python/%s.%s" % (
            sys.version_info[0],
            sys.version_info[1],
        )
        # TODO adjust user agent; see https://w.wiki/CX6
        sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        return sparql.query().convert()

    results = get_results(endpoint_url, query)

    return {
        int(result["istatid"]["value"]): int(result["maxPopulation"]["value"])
        for result in results["results"]["bindings"]
    }