import numpy as np # NumPy for math

def generate_data(samples=1000, features=3, classes=3):
    """Generate a synthetic classification dataset of noisy class clusters.

    Each class gets a random center drawn uniformly from [-10, 10) in every
    feature dimension. Each sample is assigned a random class and placed at
    that class's center plus unit-variance Gaussian noise.

    Args:
        samples: Number of samples to generate.
        features: Number of features per sample.
        classes: Number of distinct classes (labels are 0..classes-1).

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape ``(samples, features)`` and
        ``y`` has shape ``(samples,)`` with integer class labels.
    """
    centers = np.random.uniform(-10, 10, size=(classes, features))  # one random center per class
    y = np.random.randint(0, classes, size=samples)                 # random class label per sample
    # Draw all the noise in one call so X is always 2-D, even when samples == 0.
    noise = np.random.normal(scale=1.0, size=(samples, features))
    X = centers[y] + noise                                          # look up each sample's center and add noise around it
    return X, y

def split_data(X, y, test_size=0.2):
    """Randomly split samples and labels into training and testing sets.

    Args:
        X: Array-like of samples; the first axis indexes samples.
        y: Array-like of labels, one per sample in ``X``.
        test_size: Fraction of the samples (between 0 and 1) to put in the
            test set.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` of NumPy arrays.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths, or if
            ``test_size`` is outside ``[0, 1]``.
    """
    # Convert up front so plain Python lists can be indexed with index lists below.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f'X and y must have the same number of samples, got {len(X)} and {len(y)}'
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f'test_size must be between 0 and 1, got {test_size}')

    # Shuffle the sample indices so the split is random.
    indices = list(range(len(X)))
    np.random.shuffle(indices)

    # The first (1 - test_size) share of shuffled indices trains; the rest tests.
    split_index = int(len(X) * (1 - test_size))
    train_index = indices[:split_index]  # indices for the training set
    test_index = indices[split_index:]   # indices for the testing set

    return X[train_index], X[test_index], y[train_index], y[test_index]

import matplotlib.pyplot as plt # for plotting

def visualize_data(X, y):
    """Show a scatter plot of the first two feature columns, colored by label.

    Args:
        X: 2-D array of samples; only columns 0 and 1 are plotted.
        y: Per-sample values passed to the scatter plot as colors.
    """
    first_feature = X[:, 0]   # horizontal coordinate of every sample
    second_feature = X[:, 1]  # vertical coordinate of every sample
    plt.scatter(first_feature, second_feature, c=y, cmap='viridis')

    # Title and axis labels.
    plt.title('Training Data')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')

    plt.show()  # display the figure

from collections import Counter # to find label frequencies

def knn(X_train, y_train, X_test, k=3):
    """Classify test samples with the k-nearest-neighbors majority vote.

    For every test sample, the Euclidean distance to each training sample is
    computed, the ``k`` closest training samples are selected, and the most
    frequent label among them is the prediction. When several labels tie for
    the highest count, the one belonging to the nearest neighbor among them
    wins (first encountered in distance order).

    Args:
        X_train: Array-like of training samples; the first axis indexes samples.
        y_train: Array-like of training labels, one per training sample.
        X_test: Array-like of samples to classify.
        k: Number of neighbors that vote. If ``k`` exceeds the number of
            training samples, all training samples vote.

    Returns:
        NumPy array with one predicted label per test sample.

    Raises:
        ValueError: If ``k`` is less than 1, the training set is empty, or
            ``X_train`` and ``y_train`` have different lengths.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test)

    if k < 1:
        raise ValueError(f'k must be at least 1, got {k}')
    if len(X_train) == 0:
        raise ValueError('X_train must contain at least one sample')
    if len(X_train) != len(y_train):
        raise ValueError(
            f'X_train and y_train must have the same length, got {len(X_train)} and {len(y_train)}'
        )

    # One row per training sample, so the distance to every training sample
    # is a single vectorized norm instead of a Python loop.
    train_rows = X_train.reshape(len(X_train), -1)

    predictions = []
    for sample in X_test:
        distances = np.linalg.norm(train_rows - np.ravel(sample), axis=1)  # Euclidean distance to every training sample
        k_indices = np.argsort(distances)[:k]                              # indices of the k nearest neighbors
        k_nearest_labels = y_train[k_indices].tolist()                     # labels of the k nearest neighbors, nearest first
        most_common = Counter(k_nearest_labels).most_common(1)[0][0]       # majority label among the neighbors
        predictions.append(most_common)
    return np.array(predictions)

# Build a synthetic dataset and plot its first two features.
X, y = generate_data()
visualize_data(X, y)

# Hold out part of the data for evaluation.
X_train, X_test, y_train, y_test = split_data(X, y)

# Classify the held-out samples with k-nearest neighbors.
predictions = knn(X_train, y_train, X_test)

# Accuracy is the fraction of held-out samples whose prediction matches the true label.
is_correct = predictions == y_test
accuracy = np.mean(is_correct)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 98.50%

K-Nearest Neighbors From Scratch¶

Background¶

Use cases¶

Implementation¶

Summary¶

Author and License¶