From d2f4c1ed333ccc7ac7cf7a00b527db2859a92d9d Mon Sep 17 00:00:00 2001
From: Jeanette Lee <jeanette.lee@stud.hslu.ch>
Date: Sun, 13 Mar 2022 17:50:38 +0000
Subject: [PATCH] New

---
 notebooks/Block_1/kNearestNeighbor.ipynb | 284 +++++++++++++++++++++++
 1 file changed, 284 insertions(+)
 create mode 100644 notebooks/Block_1/kNearestNeighbor.ipynb

diff --git a/notebooks/Block_1/kNearestNeighbor.ipynb b/notebooks/Block_1/kNearestNeighbor.ipynb
new file mode 100644
index 0000000..5be6fd5
--- /dev/null
+++ b/notebooks/Block_1/kNearestNeighbor.ipynb
@@ -0,0 +1,284 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "70449bad-7bb6-44fe-ac93-ec22041346ff",
+   "metadata": {},
+   "source": [
+    "# K Nearest Neighbor - L2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4f6914a2-3658-473d-9f65-c8b3ed7a0c96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "class KNearestNeighbor():\n",
+    "    \"\"\" a kNN classifier with L2 distance \"\"\"\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        pass\n",
+    "\n",
+    "    def train(self, X, y):\n",
+    "        \"\"\"\n",
+    "        Train the classifier. For k-nearest neighbors this is just\n",
+    "        memorizing the training data.\n",
+    "\n",
+    "        Inputs:\n",
+    "        - X: A numpy array of shape (num_train, D) containing the training data\n",
+    "          consisting of num_train samples each of dimension D.\n",
+    "        - y: A numpy array of shape (num_train,) containing the training labels,\n",
+    "          where y[i] is the label for X[i].\n",
+    "        \"\"\"\n",
+    "        self.X_train = X.astype('float')\n",
+    "        self.y_train = y\n",
+    "\n",
+    "    def predict(self, X, k=1, num_loops=0):\n",
+    "        \"\"\"\n",
+    "        Predict labels for test data using this classifier.\n",
+    "\n",
+    "        Inputs:\n",
+    "        - X: A numpy array of shape (num_test, D) containing test data consisting\n",
+    "          of num_test samples each of dimension D.\n",
+    "        - k: The number of nearest neighbors that vote for the predicted labels.\n",
+    "        - num_loops: Determines which implementation to use to compute distances\n",
+    "          between training points and testing points.\n",
+    "\n",
+    "        Returns:\n",
+    "        - y: A numpy array of shape (num_test,) containing predicted labels for the\n",
+    "          test data, where y[i] is the predicted label for the test point X[i].\n",
\n", + " \"\"\"\n", + " if num_loops == 0:\n", + " dists = self.compute_distances_no_loops(X)\n", + " elif num_loops == 1:\n", + " dists = self.compute_distances_one_loop(X)\n", + " elif num_loops == 2:\n", + " dists = self.compute_distances_two_loops(X)\n", + " else:\n", + " raise ValueError('Invalid value %d for num_loops' % num_loops)\n", + "\n", + " return self.predict_labels(dists, k=k)\n", + "\n", + " def compute_distances_two_loops(self, X):\n", + " \"\"\"\n", + " Compute the distance between each test point in X and each \n", + " training point in self.X_train using a nested loop over both \n", + " the training data and the test data.\n", + "\n", + " Inputs:\n", + " - X: A numpy array of shape (num_test, D) containing test data.\n", + "\n", + " Returns:\n", + " - dists: A numpy array of shape (num_test, num_train) where \n", + " dists[i, j] is the L2 (Euclidean) distance between the ith test \n", + " point and the jth training point.\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train)) # return matrix of size num_test x num_train filled with 0s\n", + " X = X.astype('float')\n", + " for i in range(num_test):\n", + " for j in range(num_train):\n", + " dists[i, j] = np.sqrt(np.sum(np.square(self.X_train[j,:] - X[i,:]))) # assign value for cell (i, j) in matrix dists\n", + " \n", + " return dists\n", + "\n", + " def compute_distances_one_loop(self, X):\n", + " \"\"\"\n", + " Compute the distance between each test point in X and each training point\n", + " in self.X_train using a single loop over the test data.\n", + "\n", + " Input / Output: Same as compute_distances_two_loops\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train))\n", + " X = X.astype('float')\n", + " for i in range(num_test):\n", + " dists[i, :] = np.sqrt(np.sum(np.square(self.X_train - X[i,:]), axis = 1))\n", + " \n", + " \n", + " return dists\n", + "\n", + " def compute_distances_no_loops(self, X):\n", + " \"\"\"\n", + " Compute the distance between each test point in X and each training point\n", + " in self.X_train using no explicit loops.\n", + "\n", + " Input / Output: Same as compute_distances_two_loops\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train)) \n", + " X=X.astype('float')\n", + " \n", + " # Most \"elegant\" solution leads however to memory issues\n", + " # dists = np.sqrt(np.square((self.X_train[:, np.newaxis, :] - X)).sum(axis=2)).T\n", + " # split (p-q)^2 to p^2 + q^2 - 2pq\n", + " dists = np.sqrt((X**2).sum(axis=1)[:, np.newaxis] + (self.X_train**2).sum(axis=1) - 2 * X.dot(self.X_train.T))\n", + " \n", + " \n", + " \n", + " return dists\n", + "\n", + " def predict_labels(self, dists, k=1):\n", + " \"\"\"\n", + " Given a matrix of distances between test points and training points,\n", + " predict a label for each test point.\n", + "\n", + " Inputs:\n", + " - dists: A numpy array of shape (num_test, num_train) where dists[i, j]\n", + " gives the distance betwen the ith test point and the jth training point.\n", + "\n", + " Returns:\n", + " - y: A numpy array of shape (num_test,) containing predicted labels for the\n", + " test data, where y[i] is the predicted label for the test point X[i]. 
\n", + " \"\"\"\n", + " num_test = dists.shape[0]\n", + " y_pred = np.zeros(num_test, dtype='float64') # array with num_test elements all = 0\n", + " for i in range(num_test):\n", + " # A list of length k storing the labels of the k nearest neighbors to\n", + " # the ith test point.\n", + " closest_y = []\n", + " # get the k indices with smallest distances\n", + " min_indices = np.argsort(dists[i,:])[:k] \n", + " closest_y = np.bincount(self.y_train[min_indices])\n", + " # predict the label of the nearest example\n", + " y_pred[i] = np.argmax(closest_y) \n", + "\n", + " return y_pred" + ] + }, + { + "cell_type": "markdown", + "id": "cca1c5c5-4888-4576-9c5c-101106e1ca70", + "metadata": {}, + "source": [ + "# K Nearest Neighbor - L1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c785482f-f6bd-4dfa-9484-f82ea3b860ca", + "metadata": {}, + "outputs": [], + "source": [ + "class KNearestNeighbor_L1(KNearestNeighbor):\n", + " \"\"\" a kNN classifier with L1 distance \"\"\"\n", + "\n", + " def __init__(self):\n", + " super().__init__()\n", + " \n", + "\n", + " def compute_distances_one_loop(self, X):\n", + " \"\"\"\n", + " We overwrite the compute_distance_one_loop method of the parent class \n", + " KNearestNeighbor. \n", + " Compute the distance between each test point in X and each training point\n", + " in self.X_train using one loop and the L1 distance measure.\n", + "\n", + " Input / Output: Same as compute_distances_two_loops\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train))\n", + " X = X.astype('float')\n", + " for i in range(num_test):\n", + " dists[i, :] = (np.sum(np.abs(self.X_train - X[i,:]), axis = 1))\n", + " return dists \n", + " \n", + " def compute_distances_two_loops(self, X):\n", + " \"\"\"\n", + " Compute the distance between each test point in X and each \n", + " training point in self.X_train using a nested loop over both \n", + " the training data and the test data.\n", + "\n", + " Inputs:\n", + " - X: A numpy array of shape (num_test, D) containing test data.\n", + "\n", + " Returns:\n", + " - dists: A numpy array of shape (num_test, num_train) where \n", + " dists[i, j] is the L1 distance between the ith test \n", + " point and the jth training point.\n", + " \"\"\"\n", + " num_test = X.shape[0]\n", + " num_train = self.X_train.shape[0]\n", + " dists = np.zeros((num_test, num_train))\n", + " X = X.astype('float')\n", + " for i in range(num_test):\n", + " for j in range(num_train):\n", + " dists[i, j] = np.sum(np.abs(self.X_train[j,:] - X[i,:]))\n", + " \n", + " return dists" + ] + }, + { + "cell_type": "markdown", + "id": "a25745ac-cd12-41f2-949e-748eaeeb613d", + "metadata": {}, + "source": [ + "# k-Fold Cross Validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6887a07-fc90-4f81-a176-a4c0269a8b5f", + "metadata": {}, + "outputs": [], + "source": [ + "# k-fold cross validation\n", + "def k_fold_cv(num_folds, k_choices):\n", + " X_train_folds = []\n", + " y_train_folds = []\n", + " \n", + " num_train = X_train.shape[0]\n", + " fold_size = np.ceil(num_train/num_folds).astype('int')\n", + " \n", + " X_train_folds = np.split(X_train, [(i + 1)*fold_size for i in np.arange(num_folds)])\n", + " y_train_folds = np.split(y_train, [(i + 1)*fold_size for i in np.arange(num_folds)])\n", + " \n", + " k_to_accuracies = {}\n", + " \n", + " for k in k_choices:\n", + " k_to_accuracies[k] = []\n", + " classifier = KNearestNeighbor()\n", + " for i in 
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--
GitLab