import numpy as np                                   # linear algebra and arrays
from sklearn.model_selection import train_test_split # splitting data into train/test sets
from sklearn.preprocessing import PolynomialFeatures # adding polynomial features to input data
from sklearn.linear_model import LinearRegression    # model used in example
from sklearn.metrics import mean_squared_error       # evaluation metric
import matplotlib.pyplot as plt                      # plotting data points and results

# Fixed seed: without it every run draws different data and a different split,
# so the plots and MSE values of the demonstration cannot be reproduced.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)  # local generator; NumPy's global random state is left untouched

X = np.sort(5 * rng.random((100, 1)), axis=0)          # 100 uniform samples from [0, 5), sorted ascending, shape (100, 1)
y = np.sin(X).ravel() + rng.normal(0, 0.2, X.shape[0]) # sin(X) flattened to 1-D plus Gaussian noise (mean 0, std 0.2)

# hold out 20% of the samples as the test set; seeded so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

def plot_model(degree):
    """Fit a polynomial regression of the given degree and plot it against the data.

    Expands the module-level ``X_train`` into polynomial features, fits a
    ``LinearRegression`` on them, scores the fit on both the training and the
    test split with mean squared error, and shows a figure containing both
    splits and the fitted curve over the interval [0, 5].

    Args:
        degree: Degree passed to ``PolynomialFeatures``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Fit the feature expansion on the training inputs, then reuse it for every other input.
    expander = PolynomialFeatures(degree).fit(X_train)
    train_features = expander.transform(X_train)
    test_features = expander.transform(X_test)

    # Ordinary least squares on the expanded training features.
    regressor = LinearRegression().fit(train_features, y_train)

    # Error on the data the model was fitted on vs. the held-out data.
    train_mse = mean_squared_error(y_train, regressor.predict(train_features))
    test_mse = mean_squared_error(y_test, regressor.predict(test_features))

    # Dense, evenly spaced column of inputs so the fitted curve is drawn smoothly.
    grid = np.linspace(0, 5, 100)[:, np.newaxis]
    curve = regressor.predict(expander.transform(grid))

    plt.figure(figsize=(8, 6))
    for inputs, targets, colour, name in (
        (X_train, y_train, "blue", "train"),
        (X_test, y_test, "green", "test"),
    ):
        plt.scatter(inputs, targets, color=colour, label=name)
    plt.plot(grid, curve, color="red", label="Model Predictions")
    plt.title(f"Degree {degree} | Training MSE: {train_mse:.2f} | Testing MSE: {test_mse:.2f}")
    plt.legend()
    plt.show()

# Degrees chosen to contrast an underfit (1), a well-fit (4), and an overfit (15) model.
for degree in (1, 4, 15):
    plot_model(degree)  # fit and display one figure per degree

Underfitting vs Overfitting

Overview

Underfitting

Overfitting

Demonstration

Summary

Author and License