From 8877d3c1bcd753483d45564209732fc8c8eb83a8 Mon Sep 17 00:00:00 2001
From: Chandrasekhar Ramakrishnan <cramakri@ethz.ch>
Date: Fri, 3 Apr 2020 21:33:26 +0000
Subject: [PATCH] renku run papermill -p out_folder ./data/atlas/wikidata
 --inject-paths notebooks/process/wikidata-pop-data.ipynb
 runs/wikidata-pop-data.run.ipynb

---
 .gitattributes                                |   1 +
 ...5b7a4c8945e4bdb9601e5be88822_papermill.cwl |  70 +++
 data/atlas/wikidata/che-population.csv        |   3 +
 data/atlas/wikidata/ita-population.csv        |   3 +
 data/atlas/wikidata/usa-population.csv        |   3 +
 runs/wikidata-pop-data.run.ipynb              | 448 ++++++++++++++++++
 6 files changed, 528 insertions(+)
 create mode 100644 .renku/workflow/1c505b7a4c8945e4bdb9601e5be88822_papermill.cwl
 create mode 100644 data/atlas/wikidata/che-population.csv
 create mode 100644 data/atlas/wikidata/ita-population.csv
 create mode 100644 data/atlas/wikidata/usa-population.csv
 create mode 100644 runs/wikidata-pop-data.run.ipynb

diff --git a/.gitattributes b/.gitattributes
index 349040c3e..8d0f84a41 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -376,3 +376,4 @@ data/covid-19_jhu-csse/csse_covid_19_daily_reports/04-07-2020.csv filter=lfs dif
 data/covid-19_jhu-csse/csse_covid_19_daily_reports/04-08-2020.csv filter=lfs diff=lfs merge=lfs -text
 data/atlas/worldbank/SP.POP.TOTL.zip filter=lfs diff=lfs merge=lfs -text
 data/atlas/worldmap/country_centroids.csv filter=lfs diff=lfs merge=lfs -text
+data/atlas/wikidata/** filter=lfs diff=lfs merge=lfs -text
diff --git a/.renku/workflow/1c505b7a4c8945e4bdb9601e5be88822_papermill.cwl b/.renku/workflow/1c505b7a4c8945e4bdb9601e5be88822_papermill.cwl
new file mode 100644
index 000000000..1494f7705
--- /dev/null
+++ b/.renku/workflow/1c505b7a4c8945e4bdb9601e5be88822_papermill.cwl
@@ -0,0 +1,70 @@
+arguments: []
+baseCommand:
+- papermill
+class: CommandLineTool
+cwlVersion: v1.0
+hints: []
+inputs:
+  input_1:
+    default: out_folder
+    inputBinding:
+      position: 1
+      prefix: -p
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+  input_2:
+    default: data/atlas/wikidata
+    inputBinding:
+      position: 2
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+  input_3:
+    default:
+      class: File
+      path: ../../notebooks/process/wikidata-pop-data.ipynb
+    inputBinding:
+      position: 3
+      prefix: --inject-paths
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: File
+  input_4:
+    default: runs/wikidata-pop-data.run.ipynb
+    inputBinding:
+      position: 4
+      separate: true
+      shellQuote: true
+    streamable: false
+    type: string
+outputs:
+  output_0:
+    outputBinding:
+      glob: $(inputs.input_4)
+    streamable: false
+    type: File
+  output_1:
+    outputBinding:
+      glob: $(inputs.input_2)
+    streamable: false
+    type: Directory
+permanentFailCodes: []
+requirements:
+- class: InlineJavascriptRequirement
+- class: InitialWorkDirRequirement
+  listing:
+  - entry: '$({"listing": [], "class": "Directory"})'
+    entryname: runs
+    writable: true
+  - entry: '$({"listing": [], "class": "Directory"})'
+    entryname: data/atlas/wikidata
+    writable: true
+  - entry: $(inputs.input_3)
+    entryname: notebooks/process/wikidata-pop-data.ipynb
+    writable: false
+successCodes: []
+temporaryFailCodes: []
diff --git a/data/atlas/wikidata/che-population.csv b/data/atlas/wikidata/che-population.csv
new file mode 100644
index 000000000..bf74a2ff0
--- /dev/null
+++ b/data/atlas/wikidata/che-population.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2ef111e3d106d90faa83ec77894ec886222dc565f535a6aeb625a8efb6152b7
+size 823
diff --git a/data/atlas/wikidata/ita-population.csv b/data/atlas/wikidata/ita-population.csv
new file mode 100644
index 000000000..db6ea7c38
--- /dev/null
+++ b/data/atlas/wikidata/ita-population.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4e40c8f362efc8a1619ad19b2b9761730d8daa85eaf8b38b75ad904257f68a79
+size 631
diff --git a/data/atlas/wikidata/usa-population.csv b/data/atlas/wikidata/usa-population.csv
new file mode 100644
index 000000000..b9633408d
--- /dev/null
+++ b/data/atlas/wikidata/usa-population.csv
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d846fe3e0df640a2d38df9631532a59fbbad753dad58bfe5f806d4b372dfeb27
+size 1350
diff --git a/runs/wikidata-pop-data.run.ipynb b/runs/wikidata-pop-data.run.ipynb
new file mode 100644
index 000000000..b4e317e4b
--- /dev/null
+++ b/runs/wikidata-pop-data.run.ipynb
@@ -0,0 +1,448 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.012354,
+     "end_time": "2020-04-03T21:33:19.377218",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:19.364864",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "# Gather Population Data from Wikidata"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "papermill": {
+     "duration": 3.985074,
+     "end_time": "2020-04-03T21:33:23.369368",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:19.384294",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import pandas as pd\n",
+    "\n",
+    "from covid_19_dashboard import helper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "papermill": {
+     "duration": 0.015991,
+     "end_time": "2020-04-03T21:33:23.392938",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:23.376947",
+     "status": "completed"
+    },
+    "tags": [
+     "parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "out_folder = '../../data/atlas/wikidata'\n",
+    "PAPERMILL_OUTPUT_PATH = None"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "papermill": {
+     "duration": 0.015632,
+     "end_time": "2020-04-03T21:33:23.416629",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:23.400997",
+     "status": "completed"
+    },
+    "tags": [
+     "injected-parameters"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "# Parameters\n",
+    "PAPERMILL_INPUT_PATH = \"notebooks/process/wikidata-pop-data.ipynb\"\n",
+    "PAPERMILL_OUTPUT_PATH = \"runs/wikidata-pop-data.run.ipynb\"\n",
+    "out_folder = \"./data/atlas/wikidata\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "papermill": {
+     "duration": 0.016971,
+     "end_time": "2020-04-03T21:33:23.440680",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:23.423709",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def write_population_data(iso_code, df):\n",
+    "    out_path = os.path.join(out_folder, f\"{iso_code.lower()}-population.csv\")\n",
+    "    print(f\"Writing {len(df)} rows to {out_path}\")\n",
+    "    if PAPERMILL_OUTPUT_PATH is None:\n",
+    "        return\n",
+    "    df.to_csv(out_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.006952,
+     "end_time": "2020-04-03T21:33:23.455153",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:23.448201",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## Italy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "papermill": {
+     "duration": 1.075272,
+     "end_time": "2020-04-03T21:33:24.536917",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:23.461645",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Writing 20 rows to ./data/atlas/wikidata/ita-population.csv\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>region_iso</th>\n",
+       "      <th>regionLabel</th>\n",
+       "      <th>istatid</th>\n",
+       "      <th>population</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>IT-52</td>\n",
+       "      <td>Tuscany</td>\n",
+       "      <td>09</td>\n",
+       "      <td>3729641</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>IT-55</td>\n",
+       "      <td>Umbria</td>\n",
+       "      <td>10</td>\n",
+       "      <td>882015</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  region_iso regionLabel istatid population\n",
+       "0      IT-52     Tuscany      09    3729641\n",
+       "1      IT-55      Umbria      10     882015"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "iso_code = \"ITA\"\n",
+    "pops = helper.get_region_populations(\n",
+    "    iso_code,\n",
+    "    additional_fields=\"?istatid\",\n",
+    "    additional_query=\"?region wdt:P635 ?istatid .\",\n",
+    ")\n",
+    "df = pd.DataFrame(pops)\n",
+    "write_population_data(iso_code, df)\n",
+    "df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.008965,
+     "end_time": "2020-04-03T21:33:24.555296",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:24.546331",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## Switzerland"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "papermill": {
+     "duration": 0.566085,
+     "end_time": "2020-04-03T21:33:25.129545",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:24.563460",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Writing 26 rows to ./data/atlas/wikidata/che-population.csv\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>region_iso</th>\n",
+       "      <th>regionLabel</th>\n",
+       "      <th>population</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>CH-SZ</td>\n",
+       "      <td>Canton of Schwyz</td>\n",
+       "      <td>159165</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>CH-TG</td>\n",
+       "      <td>Thurgau</td>\n",
+       "      <td>276472</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  region_iso       regionLabel population\n",
+       "0      CH-SZ  Canton of Schwyz     159165\n",
+       "1      CH-TG           Thurgau     276472"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "iso_code = \"CHE\"\n",
+    "pops = helper.get_region_populations(iso_code)\n",
+    "df = pd.DataFrame(pops)\n",
+    "write_population_data(iso_code, df)\n",
+    "df.head(2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "papermill": {
+     "duration": 0.009881,
+     "end_time": "2020-04-03T21:33:25.149605",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:25.139724",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "source": [
+    "## United States"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "papermill": {
+     "duration": 0.952179,
+     "end_time": "2020-04-03T21:33:26.110326",
+     "exception": false,
+     "start_time": "2020-04-03T21:33:25.158147",
+     "status": "completed"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Writing 50 rows to ./data/atlas/wikidata/usa-population.csv\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>region_iso</th>\n",
+       "      <th>regionLabel</th>\n",
+       "      <th>population</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>US-NH</td>\n",
+       "      <td>New Hampshire</td>\n",
+       "      <td>1330608</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>US-GA</td>\n",
+       "      <td>Georgia</td>\n",
+       "      <td>10214860</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "  region_iso    regionLabel population\n",
+       "0      US-NH  New Hampshire    1330608\n",
+       "1      US-GA        Georgia   10214860"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "iso_code = \"USA\"\n",
+    "pops = helper.get_region_populations(iso_code)\n",
+    "df = pd.DataFrame(pops)\n",
+    "write_population_data(iso_code, df)\n",
+    "df.head(2)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  },
+  "papermill": {
+   "duration": 8.557795,
+   "end_time": "2020-04-03T21:33:26.443798",
+   "environment_variables": {},
+   "exception": null,
+   "input_path": "notebooks/process/wikidata-pop-data.ipynb",
+   "output_path": "runs/wikidata-pop-data.run.ipynb",
+   "parameters": {
+    "PAPERMILL_INPUT_PATH": "notebooks/process/wikidata-pop-data.ipynb",
+    "PAPERMILL_OUTPUT_PATH": "runs/wikidata-pop-data.run.ipynb",
+    "out_folder": "./data/atlas/wikidata"
+   },
+   "start_time": "2020-04-03T21:33:17.886003",
+   "version": "1.1.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
-- 
GitLab