Added Bayesian R^2

bd3f959c · Mirko Birbaumer · 1f4bd733 · bd3f959c
Commit bd3f959c authored 3 weeks ago by Mirko Birbaumer
--- a/notebooks/Linear Regression/LR_7_2.ipynb
+++ b/notebooks/Linear Regression/LR_7_2.ipynb
@@ -10,7 +10,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 16,
   "id": "6645d58f-73f2-4bb4-ae9f-17a0b334a652",
   "metadata": {},
   "outputs": [],
@@ -27,7 +27,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 17,
   "id": "a76946c6-bb24-4032-9b40-35a2303dec27",
   "metadata": {
    "scrolled": true
@@ -65,9 +65,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 18,
   "id": "97ef4d21-6ffb-4b9e-a973-94c6815600b3",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
   "outputs": [
    {
     "name": "stderr",
@@ -120,7 +122,37 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 19,
+   "id": "da7dd39e-0b36-4742-bc45-cc495b55cac8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "r2        0.825992\n",
+       "r2_std    0.000000\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_trz.predict(idata_trz, kind=\"response\")\n",
+    "y_pred = az.extract(idata_trz.posterior_predictive).Verkauf.values.T\n",
+    "y_obs = werbung.Verkauf.values\n",
+    "# number of predictor variables\n",
+    "p=3\n",
+    "# number of observations\n",
+    "n = len(y_obs)\n",
+    "pm.r2_score( y_pred, y_obs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
   "id": "27f1854b-e66e-432e-aa61-66b928e73f16",
   "metadata": {},
   "outputs": [],
@@ -137,9 +169,11 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 10,
   "id": "45172e19-c1df-48ed-a524-cb8408a603e4",
-   "metadata": {},
+   "metadata": {
+    "scrolled": true
+   },
   "outputs": [
    {
     "name": "stdout",
@@ -162,6 +196,29 @@
    "print(\"R²:\", r2)"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "12b11a54-925b-4c1b-b3a2-4f6b479a11b0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "r2        0.897365\n",
+       "r2_std    0.000000\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pm.r2_score(y_obs, y_pred )"
+   ]
+  },
  {
   "cell_type": "markdown",
   "id": "67580aab-6f1d-47ac-bace-3212033a0e40",
@@ -172,7 +229,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
   "id": "fd86c259-21a6-4ebc-a2c9-6d7165f6a13a",
   "metadata": {},
   "outputs": [
@@ -227,7 +284,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 12,
   "id": "125cbdce-ae89-456c-945b-b603d8cf3d7d",
   "metadata": {},
   "outputs": [],
@@ -243,7 +300,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 13,
   "id": "4990d7e2-8761-469b-aca8-7dc283fd8c99",
   "metadata": {},
   "outputs": [
@@ -276,10 +333,33 @@
    "Die beiden Modelle haben beinahe den identischen $R^2$-Wert, d.h., wir können auf die Prädiktorvariable `Zeitung` ohne Einbusse an der Güte des Modells verzichten."
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "28c5351e-08dd-420d-8d2b-56ec4c4173e9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "r2        0.897365\n",
+       "r2_std    0.000000\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pm.r2_score(y_obs, y_pred )"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "d07bbeb8-05d2-4e61-a806-b9dbce147722",
+   "id": "8c724547-2917-4675-b39e-41f42a306a9f",
   "metadata": {},
   "outputs": [],
   "source": []

 %% Cell type:markdown id:a8e187e3-0d7b-44a5-bc5b-6a85fa8592b6 tags:

 # Beispiel 7.2

 %% Cell type:code id:6645d58f-73f2-4bb4-ae9f-17a0b334a652 tags:

 ``` python
 import arviz as az
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import pymc as pm
 import xarray as xr
 from scipy.interpolate import PchipInterpolator
 from scipy.stats import linregress
 ```

 %% Cell type:code id:a76946c6-bb24-4032-9b40-35a2303dec27 tags:

 ``` python
 import pandas as pd

 werbung = pd.read_csv("./Daten/Werbung.csv").drop(["Unnamed: 0"], axis=1)

 werbung.head()
 print(werbung.head())
 ```

 %% Output

          TV  Radio  Zeitung  Verkauf
    0  230.1   37.8     69.2     22.1
    1   44.5   39.3     45.1     10.4
    2   17.2   45.9     69.3      9.3
    3  151.5   41.3     58.5     18.5
    4  180.8   10.8     58.4     12.9

 %% Cell type:markdown id:c83160ba-4388-46e2-9655-94a97ddfc42f tags:

 Wir ermitteln den $ R^{2} $-Wert zuerst für das lineare Regressionsmodell mit den Prädiktorvariablen `TV`, `Radio` und `Zeitung`:

 %% Cell type:code id:97ef4d21-6ffb-4b9e-a973-94c6815600b3 tags:

 ``` python
 import bambi as bmb
 model_trz = bmb.Model("Verkauf ~ TV + Zeitung + Radio", werbung)
 idata_trz = model_trz.fit(random_seed=123)
 ```

 %% Output

    Auto-assigning NUTS sampler...
    Initializing NUTS using jitter+adapt_diag...
    Multiprocess sampling (4 chains in 4 jobs)
    NUTS: [sigma, Intercept, TV, Zeitung, Radio]



    Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 5 seconds.

+%% Cell type:code id:da7dd39e-0b36-4742-bc45-cc495b55cac8 tags:
+
+``` python
+model_trz.predict(idata_trz, kind="response")
+y_pred = az.extract(idata_trz.posterior_predictive).Verkauf.values.T
+y_obs = werbung.Verkauf.values
+# number of predictor variables
+p=3
+# number of observations
+n = len(y_obs)
+pm.r2_score( y_pred, y_obs)
+```
+
+%% Output
+
+    r2        0.825992
+    r2_std    0.000000
+    dtype: float64
+
 %% Cell type:code id:27f1854b-e66e-432e-aa61-66b928e73f16 tags:

 ``` python
 # Extract posterior means of coefficients
 beta_0 = idata_trz.posterior["Intercept"].mean().item()
 beta_tv = idata_trz.posterior["TV"].mean().item()
 beta_zeitung = idata_trz.posterior["Zeitung"].mean().item()
 beta_radio = idata_trz.posterior["Radio"].mean().item()

 # Compute predictions (ŷ)
 y_pred = beta_0 + beta_tv * werbung["TV"] + beta_zeitung * werbung["Zeitung"] + beta_radio * werbung["Radio"]
 ```

 %% Cell type:code id:45172e19-c1df-48ed-a524-cb8408a603e4 tags:

 ``` python
 # Actual y values
 y_obs = werbung["Verkauf"].values

 # Compute total sum of squares (TSS) and residual sum of squares (RSS)
 ss_total = np.sum((y_obs - np.mean(y_obs))**2)
 ss_residual = np.sum((y_obs - y_pred) ** 2)

 # Compute R²
 r2 = 1 - (ss_residual / ss_total)
 print("R²:", r2)
 ```

 %% Output

    R²: 0.8972105391174441

+%% Cell type:code id:12b11a54-925b-4c1b-b3a2-4f6b479a11b0 tags:
+
+``` python
+pm.r2_score(y_obs, y_pred )
+```
+
+%% Output
+
+    r2        0.897365
+    r2_std    0.000000
+    dtype: float64
+
 %% Cell type:markdown id:67580aab-6f1d-47ac-bace-3212033a0e40 tags:

 Nun bestimmen wir den $R^2$ Wert für das lineare Regressionsmodell mit den Prädiktorvariablen `TV` und `Radio`:

 %% Cell type:code id:fd86c259-21a6-4ebc-a2c9-6d7165f6a13a tags:

 ``` python
 import bambi as bmb
 model_tr = bmb.Model("Verkauf ~ TV + Radio", werbung)
 idata_tr = model_trz.fit(random_seed=123)
 ```

 %% Output

    Auto-assigning NUTS sampler...
    Initializing NUTS using jitter+adapt_diag...
    Multiprocess sampling (4 chains in 4 jobs)
    NUTS: [sigma, Intercept, TV, Zeitung, Radio]



    Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 5 seconds.

 %% Cell type:code id:125cbdce-ae89-456c-945b-b603d8cf3d7d tags:

 ``` python
 # Extract posterior means of coefficients
 beta_0 = idata_trz.posterior["Intercept"].mean().item()
 beta_tv = idata_trz.posterior["TV"].mean().item()
 beta_radio = idata_trz.posterior["Radio"].mean().item()

 # Compute predictions (ŷ)
 y_pred = beta_0 + beta_tv * werbung["TV"] + beta_radio * werbung["Radio"]
 ```

 %% Cell type:code id:4990d7e2-8761-469b-aca8-7dc283fd8c99 tags:

 ``` python
 # Actual y values
 y_obs = werbung["Verkauf"].values

 # Compute total sum of squares (TSS) and residual sum of squares (RSS)
 ss_total = np.sum((y_obs - np.mean(y_obs))**2)
 ss_residual = np.sum((y_obs - y_pred) ** 2)

 # Compute R²
 r2 = 1 - (ss_residual / ss_total)
 print("R²:", r2)
 ```

 %% Output

    R²: 0.897153312886268

 %% Cell type:markdown id:c52232c9-90eb-4fed-ae84-35b29d30f6c6 tags:

 Die beiden Modelle haben beinahe den identischen $R^2$-Wert, d.h., wir können auf die Prädiktorvariable `Zeitung` ohne Einbusse an der Güte des Modells verzichten.

-%% Cell type:code id:d07bbeb8-05d2-4e61-a806-b9dbce147722 tags:
+%% Cell type:code id:28c5351e-08dd-420d-8d2b-56ec4c4173e9 tags:
+
+``` python
+pm.r2_score(y_obs, y_pred )
+```
+
+%% Output
+
+    r2        0.897365
+    r2_std    0.000000
+    dtype: float64
+
+%% Cell type:code id:8c724547-2917-4675-b39e-41f42a306a9f tags:

 ``` python
 ```