From 8877d3c1bcd753483d45564209732fc8c8eb83a8 Mon Sep 17 00:00:00 2001 From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch> Date: Fri, 3 Apr 2020 21:33:26 +0000 Subject: [PATCH] renku run papermill -p out_folder ./data/atlas/wikidata --inject-paths notebooks/process/wikidata-pop-data.ipynb runs/wikidata-pop-data.run.ipynb --- .gitattributes | 1 + ...5b7a4c8945e4bdb9601e5be88822_papermill.cwl | 70 +++ data/atlas/wikidata/che-population.csv | 3 + data/atlas/wikidata/ita-population.csv | 3 + data/atlas/wikidata/usa-population.csv | 3 + runs/wikidata-pop-data.run.ipynb | 448 ++++++++++++++++++ 6 files changed, 528 insertions(+) create mode 100644 .renku/workflow/1c505b7a4c8945e4bdb9601e5be88822_papermill.cwl create mode 100644 data/atlas/wikidata/che-population.csv create mode 100644 data/atlas/wikidata/ita-population.csv create mode 100644 data/atlas/wikidata/usa-population.csv create mode 100644 runs/wikidata-pop-data.run.ipynb diff --git a/.gitattributes b/.gitattributes index 349040c3e..8d0f84a41 100644 --- a/.gitattributes +++ b/.gitattributes @@ -376,3 +376,4 @@ data/covid-19_jhu-csse/csse_covid_19_daily_reports/04-07-2020.csv filter=lfs dif data/covid-19_jhu-csse/csse_covid_19_daily_reports/04-08-2020.csv filter=lfs diff=lfs merge=lfs -text data/atlas/worldbank/SP.POP.TOTL.zip filter=lfs diff=lfs merge=lfs -text data/atlas/worldmap/country_centroids.csv filter=lfs diff=lfs merge=lfs -text +data/atlas/wikidata/** filter=lfs diff=lfs merge=lfs -text diff --git a/.renku/workflow/1c505b7a4c8945e4bdb9601e5be88822_papermill.cwl b/.renku/workflow/1c505b7a4c8945e4bdb9601e5be88822_papermill.cwl new file mode 100644 index 000000000..1494f7705 --- /dev/null +++ b/.renku/workflow/1c505b7a4c8945e4bdb9601e5be88822_papermill.cwl @@ -0,0 +1,70 @@ +arguments: [] +baseCommand: +- papermill +class: CommandLineTool +cwlVersion: v1.0 +hints: [] +inputs: + input_1: + default: out_folder + inputBinding: + position: 1 + prefix: -p + separate: true + shellQuote: true + streamable: false + type: string + input_2: + default: data/atlas/wikidata + inputBinding: + position: 2 + separate: true + shellQuote: true + streamable: false + type: string + input_3: + default: + class: File + path: ../../notebooks/process/wikidata-pop-data.ipynb + inputBinding: + position: 3 + prefix: --inject-paths + separate: true + shellQuote: true + streamable: false + type: File + input_4: + default: runs/wikidata-pop-data.run.ipynb + inputBinding: + position: 4 + separate: true + shellQuote: true + streamable: false + type: string +outputs: + output_0: + outputBinding: + glob: $(inputs.input_4) + streamable: false + type: File + output_1: + outputBinding: + glob: $(inputs.input_2) + streamable: false + type: Directory +permanentFailCodes: [] +requirements: +- class: InlineJavascriptRequirement +- class: InitialWorkDirRequirement + listing: + - entry: '$({"listing": [], "class": "Directory"})' + entryname: runs + writable: true + - entry: '$({"listing": [], "class": "Directory"})' + entryname: data/atlas/wikidata + writable: true + - entry: $(inputs.input_3) + entryname: notebooks/process/wikidata-pop-data.ipynb + writable: false +successCodes: [] +temporaryFailCodes: [] diff --git a/data/atlas/wikidata/che-population.csv b/data/atlas/wikidata/che-population.csv new file mode 100644 index 000000000..bf74a2ff0 --- /dev/null +++ b/data/atlas/wikidata/che-population.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ef111e3d106d90faa83ec77894ec886222dc565f535a6aeb625a8efb6152b7 +size 823 diff --git a/data/atlas/wikidata/ita-population.csv b/data/atlas/wikidata/ita-population.csv new file mode 100644 index 000000000..db6ea7c38 --- /dev/null +++ b/data/atlas/wikidata/ita-population.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e40c8f362efc8a1619ad19b2b9761730d8daa85eaf8b38b75ad904257f68a79 +size 631 diff --git a/data/atlas/wikidata/usa-population.csv b/data/atlas/wikidata/usa-population.csv new file mode 100644 index 000000000..b9633408d --- /dev/null +++ b/data/atlas/wikidata/usa-population.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d846fe3e0df640a2d38df9631532a59fbbad753dad58bfe5f806d4b372dfeb27 +size 1350 diff --git a/runs/wikidata-pop-data.run.ipynb b/runs/wikidata-pop-data.run.ipynb new file mode 100644 index 000000000..b4e317e4b --- /dev/null +++ b/runs/wikidata-pop-data.run.ipynb @@ -0,0 +1,448 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.012354, + "end_time": "2020-04-03T21:33:19.377218", + "exception": false, + "start_time": "2020-04-03T21:33:19.364864", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "# Gather Population Data from Wikidata" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "papermill": { + "duration": 3.985074, + "end_time": "2020-04-03T21:33:23.369368", + "exception": false, + "start_time": "2020-04-03T21:33:19.384294", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "\n", + "from covid_19_dashboard import helper" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "papermill": { + "duration": 0.015991, + "end_time": "2020-04-03T21:33:23.392938", + "exception": false, + "start_time": "2020-04-03T21:33:23.376947", + "status": "completed" + }, + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "out_folder = '../../data/atlas/wikidata'\n", + "PAPERMILL_OUTPUT_PATH = None" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "papermill": { + "duration": 0.015632, + "end_time": "2020-04-03T21:33:23.416629", + "exception": false, + "start_time": "2020-04-03T21:33:23.400997", + "status": "completed" + }, + "tags": [ + "injected-parameters" + ] + }, + "outputs": [], + "source": [ + "# Parameters\n", + "PAPERMILL_INPUT_PATH = \"notebooks/process/wikidata-pop-data.ipynb\"\n", + "PAPERMILL_OUTPUT_PATH = \"runs/wikidata-pop-data.run.ipynb\"\n", + "out_folder = \"./data/atlas/wikidata\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "papermill": { + "duration": 0.016971, + "end_time": "2020-04-03T21:33:23.440680", + "exception": false, + "start_time": "2020-04-03T21:33:23.423709", + "status": "completed" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "def write_population_data(iso_code, df):\n", + " out_path = os.path.join(out_folder, f\"{iso_code.lower()}-population.csv\")\n", + " print(f\"Writing {len(df)} rows to {out_path}\")\n", + " if PAPERMILL_OUTPUT_PATH is None:\n", + " return\n", + " df.to_csv(out_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.006952, + "end_time": "2020-04-03T21:33:23.455153", + "exception": false, + "start_time": "2020-04-03T21:33:23.448201", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Italy" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "papermill": { + "duration": 1.075272, + "end_time": "2020-04-03T21:33:24.536917", + "exception": false, + "start_time": "2020-04-03T21:33:23.461645", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing 20 rows to ./data/atlas/wikidata/ita-population.csv\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>region_iso</th>\n", + " <th>regionLabel</th>\n", + " <th>istatid</th>\n", + " <th>population</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>IT-52</td>\n", + " <td>Tuscany</td>\n", + " <td>09</td>\n", + " <td>3729641</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>IT-55</td>\n", + " <td>Umbria</td>\n", + " <td>10</td>\n", + " <td>882015</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " region_iso regionLabel istatid population\n", + "0 IT-52 Tuscany 09 3729641\n", + "1 IT-55 Umbria 10 882015" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iso_code = \"ITA\"\n", + "pops = helper.get_region_populations(\n", + " iso_code,\n", + " additional_fields=\"?istatid\",\n", + " additional_query=\"?region wdt:P635 ?istatid .\",\n", + ")\n", + "df = pd.DataFrame(pops)\n", + "write_population_data(iso_code, df)\n", + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.008965, + "end_time": "2020-04-03T21:33:24.555296", + "exception": false, + "start_time": "2020-04-03T21:33:24.546331", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## Switzerland" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "papermill": { + "duration": 0.566085, + "end_time": "2020-04-03T21:33:25.129545", + "exception": false, + "start_time": "2020-04-03T21:33:24.563460", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing 26 rows to ./data/atlas/wikidata/che-population.csv\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>region_iso</th>\n", + " <th>regionLabel</th>\n", + " <th>population</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>CH-SZ</td>\n", + " <td>Canton of Schwyz</td>\n", + " <td>159165</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>CH-TG</td>\n", + " <td>Thurgau</td>\n", + " <td>276472</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " region_iso regionLabel population\n", + "0 CH-SZ Canton of Schwyz 159165\n", + "1 CH-TG Thurgau 276472" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iso_code = \"CHE\"\n", + "pops = helper.get_region_populations(iso_code)\n", + "df = pd.DataFrame(pops)\n", + "write_population_data(iso_code, df)\n", + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "papermill": { + "duration": 0.009881, + "end_time": "2020-04-03T21:33:25.149605", + "exception": false, + "start_time": "2020-04-03T21:33:25.139724", + "status": "completed" + }, + "tags": [] + }, + "source": [ + "## United States" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "papermill": { + "duration": 0.952179, + "end_time": "2020-04-03T21:33:26.110326", + "exception": false, + "start_time": "2020-04-03T21:33:25.158147", + "status": "completed" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing 50 rows to ./data/atlas/wikidata/usa-population.csv\n" + ] + }, + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>region_iso</th>\n", + " <th>regionLabel</th>\n", + " <th>population</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>US-NH</td>\n", + " <td>New Hampshire</td>\n", + " <td>1330608</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>US-GA</td>\n", + " <td>Georgia</td>\n", + " <td>10214860</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " region_iso regionLabel population\n", + "0 US-NH New Hampshire 1330608\n", + "1 US-GA Georgia 10214860" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "iso_code = \"USA\"\n", + "pops = helper.get_region_populations(iso_code)\n", + "df = pd.DataFrame(pops)\n", + "write_population_data(iso_code, df)\n", + "df.head(2)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "papermill": { + "duration": 8.557795, + "end_time": "2020-04-03T21:33:26.443798", + "environment_variables": {}, + "exception": null, + "input_path": "notebooks/process/wikidata-pop-data.ipynb", + "output_path": "runs/wikidata-pop-data.run.ipynb", + "parameters": { + "PAPERMILL_INPUT_PATH": "notebooks/process/wikidata-pop-data.ipynb", + "PAPERMILL_OUTPUT_PATH": "runs/wikidata-pop-data.run.ipynb", + "out_folder": "./data/atlas/wikidata" + }, + "start_time": "2020-04-03T21:33:17.886003", + "version": "1.1.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file -- GitLab