import numpy as np # import NumPy for math and arrays

def generate_data(n_samples=1000, n_features=3, n_classes=3):
    """Generate a synthetic multi-class dataset of Gaussian blobs.

    Each class gets a random mean drawn uniformly from [-5, 5) per feature
    and an isotropic covariance (identity scaled by a random factor in
    [0.5, 1.5)). Small Gaussian noise (std 0.1) is added on top.

    Exactly ``n_samples`` rows are returned: when ``n_samples`` is not a
    multiple of ``n_classes`` the remainder is spread over the first classes
    (one extra sample each) instead of being silently dropped.

    Parameters
    ----------
    n_samples : int
        Total number of samples to generate (>= 0).
    n_features : int
        Number of features per sample.
    n_classes : int
        Number of classes (>= 1); labels are ``0 .. n_classes - 1``.

    Returns
    -------
    tuple of (ndarray, ndarray)
        Feature matrix of shape ``(n_samples, n_features)`` and the label
        vector of shape ``(n_samples,)``, grouped by class (not shuffled).

    Raises
    ------
    ValueError
        If ``n_classes`` is smaller than 1 or ``n_samples`` is negative.
    """
    if n_classes < 1:
        raise ValueError("n_classes must be at least 1")
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")

    # Split n_samples as evenly as possible; the first `extra` classes
    # receive one additional sample so the total is exactly n_samples.
    base, extra = divmod(n_samples, n_classes)

    X = []  # per-class feature blocks
    y = []  # labels, in the same order as the rows of X

    for i in range(n_classes):
        class_size = base + (1 if i < extra else 0)
        mean = np.random.uniform(-5, 5, size=n_features)               # random class centre
        covariance = np.eye(n_features) * np.random.uniform(0.5, 1.5)  # randomly scaled identity covariance
        samples = np.random.multivariate_normal(mean, covariance, size=class_size)
        samples += np.random.normal(0, 0.1, samples.shape)             # small additive noise
        X.append(samples)
        y.extend([i] * class_size)
    return np.vstack(X), np.array(y)

def split_data(X, y, test_size=0.2):
    """Randomly split features and labels into train and test subsets.

    Parameters
    ----------
    X : array-like
        Samples, indexed along the first axis.
    y : array-like
        Labels, one per sample in ``X``.
    test_size : float
        Fraction of the samples (between 0 and 1) placed in the test set.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside
        ``[0, 1]``.
    """
    # Accept plain lists as well as arrays; index-array selection below
    # only works on ndarrays.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError("X and y must contain the same number of samples")
    if not 0 <= test_size <= 1:
        raise ValueError("test_size must be between 0 and 1")

    indices = list(range(len(X)))  # one index per sample
    np.random.shuffle(indices)     # shuffled in place for a random split

    # The tiny offset guards against float error in (1 - test_size):
    # e.g. 10 * (1 - 0.9) evaluates to 0.9999999999999998, which would
    # otherwise truncate to 0 training samples instead of 1.
    split_index = int(len(X) * (1 - test_size) + 1e-9)
    train_index = indices[:split_index]  # indices for the training set
    test_index = indices[split_index:]   # indices for the testing set

    return X[train_index], X[test_index], y[train_index], y[test_index]

def fit(X, y):
    """Estimate per-class feature statistics from labelled data.

    For every distinct label in ``y`` this collects the rows of ``X`` that
    carry that label and records their per-feature mean, per-feature
    variance, and the share of all rows that belong to the class.

    Returns a dict keyed by label; each value is a dict with the keys
    ``"mean"``, ``"var"`` and ``"prior"``.
    """
    total_rows = X.shape[0]
    stats = {}
    for label in np.unique(y):
        members = X[y == label]  # rows belonging to this class
        stats[label] = dict(
            mean=np.mean(members, axis=0),          # per-feature average
            var=np.var(members, axis=0),            # per-feature variance
            prior=members.shape[0] / total_rows,    # relative class frequency
        )
    return stats

def pdf(x, mean, var):
    """Evaluate a Gaussian density at ``x`` for the given mean and variance.

    Works element-wise, so ``x``, ``mean`` and ``var`` may be arrays that
    broadcast against each other. A small epsilon is added to both
    variance-dependent terms so a zero variance does not divide by zero.
    """
    eps = 1e-6  # keeps both divisors strictly positive when var == 0
    squared_deviation = (x - mean) ** 2
    exponent = -squared_deviation / (2 * var + eps)
    normaliser = np.sqrt(2 * np.pi * var + eps)
    return np.exp(exponent) / normaliser

def predict(X, stats):
    """Predict a class label for every sample in ``X``.

    For each class the score is ``log(prior) + sum(log density)`` over the
    features, using the same Gaussian density (and the same epsilon
    smoothing) as ``pdf``. The class with the highest score wins.

    Two details matter here:

    * The log density is computed directly in log space rather than as
      ``log(pdf(...))``. Far from a class mean the density underflows to
      0, its log becomes ``-inf`` for every class, and the comparison
      degenerates into a tie.
    * The winning *label* (the key in ``stats``) is returned, not the
      position of the class in ``stats``, so labels that are not
      ``0 .. k-1`` are predicted correctly.

    Parameters
    ----------
    X : iterable of array-like
        Samples to classify, one per row.
    stats : dict
        Mapping ``label -> {"mean": ..., "var": ..., "prior": ...}``.

    Returns
    -------
    ndarray
        Predicted labels, one per sample.
    """
    eps = 1e-6  # same smoothing constant as used in pdf()

    # Per-class terms that do not depend on the sample are computed once.
    labels = []
    class_terms = []
    for label, params in stats.items():
        mean = np.asarray(params["mean"], dtype=float)
        var = np.asarray(params["var"], dtype=float)
        labels.append(label)
        class_terms.append((
            np.log(params["prior"]),              # log prior
            mean,
            2 * var + eps,                        # divisor of the exponent
            0.5 * np.log(2 * np.pi * var + eps),  # log of the normaliser
        ))

    predictions = []
    for sample in X:
        scores = [
            log_prior + np.sum(-(sample - mean) ** 2 / scale - log_norm)
            for log_prior, mean, scale, log_norm in class_terms
        ]
        # argmax yields a position; translate it back to the class label.
        predictions.append(labels[int(np.argmax(scores))])
    return np.array(predictions)

# Build a synthetic dataset and hold part of it out for evaluation
X, y = generate_data()
X_train, X_test, y_train, y_test = split_data(X, y)

# Estimate the per-class statistics from the training portion
model_stats = fit(X_train, y_train)

# Classify the held-out samples
predictions = predict(X_test, model_stats)

# Accuracy: fraction of held-out samples whose prediction equals the true label
matches = predictions == y_test
accuracy = np.mean(matches)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.96

Naive Bayes Classifier From Scratch

Background

Bayes’ Theorem

The Naive Assumption

Types of Naive Bayes Classifiers

Step 1: Data Generation

Step 2: Model Fitting

Step 3: Probability Density Function

Step 4: Predict

Step 5: Using the Code

Summary

Author and License