def current_region_totals_df(frames_map):
    """Build a table of the latest per-region totals, one column per metric.

    For every frame the rows are summed per 'Country/Region' index level and
    only the last column of the summed frame is kept.

    Args:
        frames_map: Mapping of metric name (e.g. "confirmed") to a DataFrame
            whose index has a level named 'Country/Region'.
            NOTE(review): presumably the columns are dates in chronological
            order, so the last column is the most recent day - confirm
            against read_jhu_covid_df.

    Returns:
        A DataFrame indexed by country/region with one column per key of
        ``frames_map``, in the mapping's iteration order. An empty mapping
        yields an empty DataFrame.
    """
    # pd.concat raises ValueError on an empty list; an empty table is a more
    # useful answer for "no metrics supplied".
    if not frames_map:
        return pd.DataFrame()

    # Each series is sorted descending before concatenation. With pandas'
    # default non-sorting join the rows should follow the first metric's
    # order - TODO confirm on the pandas version in use.
    sers = [
        df.groupby(level='Country/Region').sum().iloc[:, -1]
        .sort_values(ascending=False)
        .rename(name)
        for name, df in frames_map.items()
    ]
    return pd.concat(sers, axis=1)
<th></th>\n", + " <th>confirmed</th>\n", + " <th>deaths</th>\n", + " <th>recovered</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>Mainland China</th>\n", + " <td>80757</td>\n", + " <td>3136</td>\n", + " <td>60106</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Italy</th>\n", + " <td>10149</td>\n", + " <td>631</td>\n", + " <td>724</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Iran (Islamic Republic of)</th>\n", + " <td>8042</td>\n", + " <td>291</td>\n", + " <td>2731</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Republic of Korea</th>\n", + " <td>7513</td>\n", + " <td>54</td>\n", + " <td>247</td>\n", + " </tr>\n", + " <tr>\n", + " <th>France</th>\n", + " <td>1784</td>\n", + " <td>33</td>\n", + " <td>12</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Spain</th>\n", + " <td>1695</td>\n", + " <td>35</td>\n", + " <td>32</td>\n", + " </tr>\n", + " <tr>\n", + " <th>US</th>\n", + " <td>1670</td>\n", + " <td>56</td>\n", + " <td>15</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Germany</th>\n", + " <td>1457</td>\n", + " <td>2</td>\n", + " <td>18</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Others</th>\n", + " <td>696</td>\n", + " <td>6</td>\n", + " <td>40</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Japan</th>\n", + " <td>581</td>\n", + " <td>10</td>\n", + " <td>101</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Switzerland</th>\n", + " <td>491</td>\n", + " <td>3</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Norway</th>\n", + " <td>400</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>UK</th>\n", + " <td>382</td>\n", + " <td>6</td>\n", + " <td>18</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Netherlands</th>\n", + " <td>382</td>\n", + " <td>4</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Sweden</th>\n", + " <td>355</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>Belgium</th>\n", + " <td>267</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " 
# %%
# Per-region totals table built from the frames in `frames_map`.
current_totals_df = current_region_totals_df(frames_map)

# Display only the regions whose confirmed count is above 100.
current_totals_df.loc[current_totals_df['confirmed'].gt(100)]
24, "metadata": {}, "outputs": [], "source": [ @@ -83,6 +287,279 @@ "zf = zipfile.ZipFile(\"../data/worldbank/SP.POP.TOTL.zip\")\n", "pop_df = pd.read_csv(zf.open(\"API_SP.POP.TOTL_DS2_en_csv_v2_821007.csv\"), skiprows=4)" ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Country Name</th>\n", + " <th>Country Code</th>\n", + " <th>Indicator Name</th>\n", + " <th>Indicator Code</th>\n", + " <th>1960</th>\n", + " <th>1961</th>\n", + " <th>1962</th>\n", + " <th>1963</th>\n", + " <th>1964</th>\n", + " <th>1965</th>\n", + " <th>...</th>\n", + " <th>2011</th>\n", + " <th>2012</th>\n", + " <th>2013</th>\n", + " <th>2014</th>\n", + " <th>2015</th>\n", + " <th>2016</th>\n", + " <th>2017</th>\n", + " <th>2018</th>\n", + " <th>2019</th>\n", + " <th>Unnamed: 64</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>67</th>\n", + " <td>Eritrea</td>\n", + " <td>ERI</td>\n", + " <td>Population, total</td>\n", + " <td>SP.POP.TOTL</td>\n", + " <td>1007590.0</td>\n", + " <td>1033328.0</td>\n", + " <td>1060486.0</td>\n", + " <td>1088854.0</td>\n", + " <td>1118159.0</td>\n", + " <td>1148189.0</td>\n", + " <td>...</td>\n", + " <td>3213972.0</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " <td>NaN</td>\n", + " </tr>\n", + " <tr>\n", + " <th>108</th>\n", + " <td>Not classified</td>\n", + " <td>INX</td>\n", + " <td>Population, total</td>\n", + " 
# %%
# Rows of the population table with no 2018 value. In the recorded output
# these are Eritrea and the "Not classified" row.
pop_df[pop_df['2018'].isna()]

# %% [markdown]
# Fix the country/region names that differ

# %%
# Country name as spelled in the population table -> name used in the index
# of current_totals_df.
region_wb_jhu_map = {
    'China': 'Mainland China',
    'Iran, Islamic Rep.': 'Iran (Islamic Republic of)',
    'Korea, Rep.': 'Republic of Korea',
    'United States': 'US',
    'United Kingdom': 'UK',
    'Hong Kong SAR, China': 'Hong Kong SAR',
    'Egypt, Arab Rep.': 'Egypt',
    'Vietnam': 'Viet Nam',
    'Macao SAR, China': 'Macao SAR',
    'Slovak Republic': 'Slovakia',
    'Moldova': 'Republic of Moldova',
    'St. Martin (French part)': 'Saint Martin',
    'Brunei Darussalam': 'Brunei',
}

# Translate only the name column; the numeric '2018' column is left alone.
# NOTE: despite the `_ser` suffix, both objects below are single-column
# DataFrames ('2018') indexed by country name.
current_pop_ser = (
    pop_df[['Country Name', '2018']]
    .assign(**{'Country Name': pop_df['Country Name'].replace(region_wb_jhu_map)})
    .set_index('Country Name')
)

# Keep only the regions that also appear in the totals table.
data_pop_ser = current_pop_ser[current_pop_ser.index.isin(current_totals_df.index)]

# %% [markdown]
# There are some regions that we cannot resolve, but we will just ignore these.

# %%
# Regions in the totals table that have no matching population row.
current_totals_df[~current_totals_df.index.isin(data_pop_ser.index)]