From d2f4c1ed333ccc7ac7cf7a00b527db2859a92d9d Mon Sep 17 00:00:00 2001
From: Jeanette Lee <jeanette.lee@stud.hslu.ch>
Date: Sun, 13 Mar 2022 17:50:38 +0000
Subject: [PATCH] New

---
 notebooks/Block_1/kNearestNeighbor.ipynb | 284 +++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 notebooks/Block_1/kNearestNeighbor.ipynb

diff --git a/notebooks/Block_1/kNearestNeighbor.ipynb b/notebooks/Block_1/kNearestNeighbor.ipynb
new file mode 100644
index 0000000..5be6fd5
--- /dev/null
+++ b/notebooks/Block_1/kNearestNeighbor.ipynb
@@ -0,0 +1,284 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "70449bad-7bb6-44fe-ac93-ec22041346ff",
+   "metadata": {},
+   "source": [
+    "# K Nearest Neighbor - L2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f6914a2-3658-473d-9f65-c8b3ed7a0c96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "class KNearestNeighbor():\n",
+    "    \"\"\" a kNN classifier with L2 distance \"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        pass\n",
+    "\n",
+    "    def train(self, X, y):\n",
+    "        \"\"\"\n",
+    "        Train the classifier. For k-nearest neighbors this is just\n",
+    "        memorizing the training data.\n",
+    "\n",
+    "        Inputs:\n",
+    "        - X: A numpy array of shape (num_train, D) containing the training data\n",
+    "          consisting of num_train samples each of dimension D.\n",
+    "        - y: A numpy array of shape (num_train,) containing the training labels,\n",
+    "          where y[i] is the label for X[i].\n",
+    "        \"\"\"\n",
+    "        self.X_train = X.astype('float')\n",
+    "        self.y_train = y\n",
+    "\n",
+    "    def predict(self, X, k=1, num_loops=0):\n",
+    "        \"\"\"\n",
+    "        Predict labels for test data using this classifier.\n",
+    "\n",
+    "        Inputs:\n",
+    "        - X: A numpy array of shape (num_test, D) containing test data consisting\n",
+    "          of num_test samples each of dimension D.\n",
+    "        - k: The number of nearest neighbors that vote for the predicted labels.\n",
+    "        - num_loops: Determines which implementation to use to compute distances\n",
+    "          between training points and testing points.\n",
+    "\n",
+    "        Returns:\n",
+    "        - y: A numpy array of shape (num_test,) containing predicted labels for the\n",
+    "          test data, where y[i] is the predicted label for the test point X[i].\n",
\n", + " \"\"\"\n", + " if num_loops == 0:\n", + " dists = self.compute_distances_no_loops(X)\n", + " elif num_loops == 1:\n", + " dists = self.compute_distances_one_loop(X)\n", + " elif num_loops == 2:\n", + " dists = self.compute_distances_two_loops(X)\n", + " else:\n", + " raise ValueError('Invalid value %d for num_loops' % num_loops)\n", + "\n", + " return self.predict_labels(dists, k=k)\n", + "\n", + " def compute_distances_two_loops(self, X):\n", + " \"\"\"\n", + " Compute the distance between each test point in X and each \n", + " training point in self.X_train using a nested loop over both \n", + " the training data and the test data.\n", + "\n", + " Inputs:\n", + " - X: A numpy array of shape (num_test, D) containing test data.\n", + "\n", + " Returns:\n", + " - dists: A numpy array of shape (num_test, num_train) where \n", + " dists[i, j] is the L2 (Euclidean) distance between the ith test \n", + " point and the jth training point.\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train)) # return matrix of size num_test x num_train filled with 0s\n", + " X = X.astype('float')\n", + " for i in range(num_test):\n", + " for j in range(num_train):\n", + " dists[i, j] = np.sqrt(np.sum(np.square(self.X_train[j,:] - X[i,:]))) # assign value for cell (i, j) in matrix dists\n", + " \n", + " return dists\n", + "\n", + " def compute_distances_one_loop(self, X):\n", + " \"\"\"\n", + " Compute the distance between each test point in X and each training point\n", + " in self.X_train using a single loop over the test data.\n", + "\n", + " Input / Output: Same as compute_distances_two_loops\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train))\n", + " X = X.astype('float')\n", + " for i in range(num_test):\n", + " dists[i, :] = np.sqrt(np.sum(np.square(self.X_train - X[i,:]), axis = 1))\n", + " \n", + " \n", + " return dists\n", + "\n", + " def compute_distances_no_loops(self, X):\n", + " \"\"\"\n", + " Compute the distance between each test point in X and each training point\n", + " in self.X_train using no explicit loops.\n", + "\n", + " Input / Output: Same as compute_distances_two_loops\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train)) \n", + " X=X.astype('float')\n", + " \n", + " # Most \"elegant\" solution leads however to memory issues\n", + " # dists = np.sqrt(np.square((self.X_train[:, np.newaxis, :] - X)).sum(axis=2)).T\n", + " # split (p-q)^2 to p^2 + q^2 - 2pq\n", + " dists = np.sqrt((X**2).sum(axis=1)[:, np.newaxis] + (self.X_train**2).sum(axis=1) - 2 * X.dot(self.X_train.T))\n", + " \n", + " \n", + " \n", + " return dists\n", + "\n", + " def predict_labels(self, dists, k=1):\n", + " \"\"\"\n", + " Given a matrix of distances between test points and training points,\n", + " predict a label for each test point.\n", + "\n", + " Inputs:\n", + " - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n", + " gives the distance betwen the ith test point and the jth training point.\n", + "\n", + " Returns:\n", + " - y: A numpy array of shape (num_test,) containing predicted labels for the\n", + " test data, where y[i] is the predicted label for the test point X[i]. 
\n", + " \"\"\"\n", + " num_test = dists.shape[0]\n", + " y_pred = np.zeros(num_test, dtype='float64') # array with num_test elements all = 0\n", + " for i in range(num_test):\n", + " # A list of length k storing the labels of the k nearest neighbors to\n", + " # the ith test point.\n", + " closest_y = []\n", + " # get the k indices with smallest distances\n", + " min_indices = np.argsort(dists[i,:])[:k] \n", + " closest_y = np.bincount(self.y_train[min_indices])\n", + " # predict the label of the nearest example\n", + " y_pred[i] = np.argmax(closest_y) \n", + "\n", + " return y_pred" + ] + }, + { + "cell_type": "markdown", + "id": "cca1c5c5-4888-4576-9c5c-101106e1ca70", + "metadata": {}, + "source": [ + "# K Nearest Neighbor - L1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c785482f-f6bd-4dfa-9484-f82ea3b860ca", + "metadata": {}, + "outputs": [], + "source": [ + "class KNearestNeighbor_L1(KNearestNeighbor):\n", + " \"\"\" a kNN classifier with L1 distance \"\"\"\n", + "\n", + " def __init__(self):\n", + " super().__init__()\n", + " \n", + "\n", + " def compute_distances_one_loop(self, X):\n", + " \"\"\"\n", + " We overwrite the compute_distance_one_loop method of the parent class \n", + " KNearestNeighbor. \n", + " Compute the distance between each test point in X and each training point\n", + " in self.X_train using one loop and the L1 distance measure.\n", + "\n", + " Input / Output: Same as compute_distances_two_loops\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train))\n", + " X = X.astype('float')\n", + " for i in range(num_test):\n", + " dists[i, :] = (np.sum(np.abs(self.X_train - X[i,:]), axis = 1))\n", + " return dists \n", + " \n", + " def compute_distances_two_loops(self, X):\n", + " \"\"\"\n", + " Compute the distance between each test point in X and each \n", + " training point in self.X_train using a nested loop over both \n", + " the training data and the test data.\n", + "\n", + " Inputs:\n", + " - X: A numpy array of shape (num_test, D) containing test data.\n", + "\n", + " Returns:\n", + " - dists: A numpy array of shape (num_test, num_train) where \n", + " dists[i, j] is the L1 distance between the ith test \n", + " point and the jth training point.\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train))\n", + " X = X.astype('float')\n", + " for i in range(num_test):\n", + " for j in range(num_train):\n", + " dists[i, j] = np.sum(np.abs(self.X_train[j,:] - X[i,:]))\n", + " \n", + " return dists" + ] + }, + { + "cell_type": "markdown", + "id": "a25745ac-cd12-41f2-949e-748eaeeb613d", + "metadata": {}, + "source": [ + "# k-Fold Cross Validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6887a07-fc90-4f81-a176-a4c0269a8b5f", + "metadata": {}, + "outputs": [], + "source": [ + "# k-fold cross validation\n", + "def k_fold_cv(num_folds, k_choices):\n", + " X_train_folds = []\n", + " y_train_folds = []\n", + " \n", + " num_train = X_train.shape[0]\n", + " fold_size = np.ceil(num_train/num_folds).astype('int')\n", + " \n", + " X_train_folds = np.split(X_train, [(i + 1)*fold_size for i in np.arange(num_folds)])\n", + " y_train_folds = np.split(y_train, [(i + 1)*fold_size for i in np.arange(num_folds)])\n", + " \n", + " k_to_accuracies = {}\n", + " \n", + " for k in k_choices:\n", + " k_to_accuracies[k] = []\n", + " classifier = KNearestNeighbor()\n", + " for i in 
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--
GitLab