Imports¶

In [1]:
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt

# Set plotting options
params = {'text.usetex' : True,
          'font.size' : 16,
          'font.family' : 'lmodern'
          }
plt.rcParams.update(params) 

Network class¶

In [2]:
def relu(x):
    """Rectified linear unit
    
    Input:
    x - numpy array of shape (m, n) where n is the number
        of points and m is the dimension
    
    Return:
    numpy array of shape (m, n) of the pointwise ReLU
    """
    return np.maximum(x, 0)


def grad_relu(x):
    """Heaviside function
    
    Input:
    x - numpy array of shape (m, n) where n is the number
        of points and m is the dimension
    
    Return:
    numpy array of shape (m, n) of the pointwise Heaviside
    """
    return (x > 0).astype(float)


class TwoLayerNetwork:
    
    def __init__(self, m0, m1, m2):
        """Initialize the network with random weights.
        
        Input:
        m0 - int for the input dimension
        m1 - int for the hidden layer dimension
        m2 - int for the output dimension
        """
        self.m0 = m0
        self.m1 = m1
        self.m2 = m2
        # Weight matrices
        self.W1 = np.random.randn(m1, m0)
        self.W2 = np.random.randn(m2, m1)
        # Save b1, b2 as column vectors for correct
        # broadcasting during prediction.
        self.b1 = np.random.randn(m1, 1)
        self.b2 = np.random.randn(m2, 1)
   

    def predict(self, X):
        """Evaluate the network on test points.
        
        Input:
        X - numpy array of shape (m0, n) where m0 is the input
            dimension and n is the number of points
        
        Return:
        y2 - numpy array of shape (m2, n) where m2 is the 
             output dimension of the network
        """
        y1 = self.W1 @ X + self.b1
        y2 = self.W2 @ relu(y1) + self.b2
        return y2
     
        
    def grad_loss(self, X, y):
        """Compute the gradient of the loss on the training set.
        
        The loss is given by
        1/n sum_{i=1}^n ||f(x_i) - y_i||^2
        
        Input:
        X - numpy array of shape (m0, n)
        y - numpy array of shape (m2, n)
        
        Return:
        loss - float for the mean-squared error of the network on 
               the training points
        grad_W1 - numpy array of shape (m1, m0)
        grad_W2 - numpy array of shape (m2, m1)
        grad_b1 - numpy array of shape (m1, 1)
        grad_b2 - numpy array of shape (m2, 1)
        """
        n = X.shape[1]  # Number of training samples
        fx = self.predict(X)  # shape (m2, n)
        
        # Pre-compute matrices
        R = fx - y  # residuals, shape (m2, n)
        V = self.W2.T @ R  # shape (m1, n)
        Z = self.W1 @ X + self.b1  # pre-activations, shape (m1, n)
        S = relu(Z)  # hidden activations, shape (m1, n)
        D = grad_relu(Z)  # shape (m1, n)
        
        # np.mean averages over all n*m2 entries of R, so multiply by m2
        # to recover (1/n) sum_i ||f(x_i) - y_i||^2
        loss = self.m2 * np.mean(R*R)
        
        # Gradients (same shapes as the corresponding parameters)
        grad_W1 = (2/n) * (V * D) @ X.T  # shape (m1, m0)
        
        grad_W2 = (2/n) * (R @ S.T)  # shape (m2, m1)
        
        grad_b1 = 2 * np.mean(V * D, axis=1, keepdims=True)  # shape (m1, 1)
        
        grad_b2 = 2 * np.mean(R, axis=1, keepdims=True)  # shape (m2, 1)
        
        return loss, grad_W1, grad_W2, grad_b1, grad_b2
       
        
    def train(self, X, y, lr, epochs):
        """Train the network using gradient descent.
        Updates the network parameters in place.
        
        Input:
        X - numpy array of shape (m0, n)
        y - numpy array of shape (m2, n)
        lr - float > 0 that specifies the learning rate
        epochs - int > 0 that specifies the number of iterations
        
        Return:
        loss - float for the final mean-squared error of the network 
               on the training points
        """   
        for t in tqdm(range(epochs)):  
            # Compute loss function and gradients
            loss, grad_W1, grad_W2, grad_b1, grad_b2 = self.grad_loss(X, y)
            
            # Update parameters
            self.W1 -= lr*grad_W1
            self.W2 -= lr*grad_W2
            self.b1 -= lr*grad_b1
            self.b2 -= lr*grad_b2        
        return loss
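
A quick sanity check on the backpropagation formulas: with residuals $R = f(X) - y$, pre-activations $Z = W_1 X + b_1$, and hidden activations $S = \mathrm{relu}(Z)$, the gradients computed in grad_loss are
$$\nabla_{W_2} L = \tfrac{2}{n} R S^{\top}, \qquad \nabla_{W_1} L = \tfrac{2}{n}\left[(W_2^{\top} R)\odot \mathrm{relu}'(Z)\right] X^{\top},$$
with the bias gradients given by the corresponding row sums scaled by $2/n$. The cell below is a minimal finite-difference check of these formulas on a tiny random network; the helpers fd_check and loss_only are introduced here for illustration only and are not part of the class.

In [ ]:
# Finite-difference check of grad_loss (an added sketch, not part of the
# original notebook). Agreement is approximate: a perturbation that
# crosses a ReLU kink can make the two values differ slightly.
def loss_only(net, X, y):
    return net.grad_loss(X, y)[0]

def fd_check(net, X, y, eps=1e-6):
    _, gW1, gW2, gb1, gb2 = net.grad_loss(X, y)
    params = [('W1', net.W1, gW1), ('W2', net.W2, gW2),
              ('b1', net.b1, gb1), ('b2', net.b2, gb2)]
    for name, param, grad in params:
        # Perturb one random entry and compare the centered difference
        # quotient of the loss with the analytic gradient entry.
        idx = tuple(np.random.randint(s) for s in param.shape)
        old = param[idx]
        param[idx] = old + eps
        loss_plus = loss_only(net, X, y)
        param[idx] = old - eps
        loss_minus = loss_only(net, X, y)
        param[idx] = old  # restore the original value
        fd = (loss_plus - loss_minus) / (2*eps)
        print(f'{name}{idx}: analytic {grad[idx]:.6f}, finite difference {fd:.6f}')

check_net = TwoLayerNetwork(2, 5, 3)
Xc = np.random.randn(2, 7)
yc = np.random.randn(3, 7)
fd_check(check_net, Xc, yc)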

Test example 1¶

Approximate the function $f(x) = x^2$.

In [16]:
# Approximating the function x^2
nn = TwoLayerNetwork(1, 15, 1)

# Generate training, validation, and test data
ntrain = 1000
Xtrain = 5*np.random.randn(1, ntrain)
ytrain = Xtrain * Xtrain

nval = 1000
Xval = 5*np.random.randn(1, nval)
yval = Xval * Xval

ntest = 1000
Xtest = np.linspace(-10, 10, ntest).reshape((1, ntest))
ytest = Xtest * Xtest
In [17]:
# Train the network and compute validation and training loss
epochs = 2500
lr = 1e-3

train_loss = []
val_loss = []

# Gradient descent update
for t in tqdm(range(epochs)):  
    # Compute loss function and gradients
    loss_t, grad_W1, grad_W2, grad_b1, grad_b2 = nn.grad_loss(Xtrain, ytrain)
    train_loss.append(loss_t)
    
    # Compute validation loss
    loss_t, _, _, _, _ = nn.grad_loss(Xval, yval)
    val_loss.append(loss_t)
    
    # Update parameters
    nn.W1 -= lr*grad_W1
    nn.W2 -= lr*grad_W2
    nn.b1 -= lr*grad_b1
    nn.b2 -= lr*grad_b2    
    
# Get the prediction on the test data
ypred = nn.predict(Xtest)
100%|██████████| 2500/2500 [00:06<00:00, 387.49it/s]
In [19]:
# Plot results
fig, ax = plt.subplots(1, 2, figsize=(10,4))

ax[0].set_title('Training and Validation Loss')
ax[0].set_xlabel('Iteration')
ax[0].set_ylabel('Loss (MSE)')
ax[0].semilogy(np.arange(len(train_loss)), train_loss, label='Training', lw=3)
ax[0].semilogy(np.arange(len(val_loss)), val_loss, label='Validation', lw=3)
ax[0].legend()
ax[0].grid()

ax[1].set_title('True and Predicted Data')
ax[1].plot(Xtest.flatten(), ypred.flatten(), label='Prediction', lw=3)
ax[1].plot(Xtest.flatten(), ytest.flatten(), label='Truth', lw=3)
ax[1].legend()

plt.tight_layout()
plt.savefig('test_example_loss.png')
[Figure: training and validation loss on a log scale (left) and the true vs. predicted values of $f(x) = x^2$ on the test grid (right).]
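
The right panel shows the fit qualitatively; the mean-squared error on the held-out test grid quantifies it. The cell below is a small added check (not part of the original run), reusing ypred and ytest from the cells above.

In [ ]:
# Test MSE on the uniform grid over [-10, 10] (an added check).
test_mse = np.mean((ypred - ytest)**2)
print(f'Test MSE: {test_mse:.4f}')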

Test example 2¶

Approximate the vector-valued function $f(x_1, x_2, x_3) = \left(x_1 x_2 x_3,\ \max(x_1, x_2, x_3)\right)$.

In [6]:
# f(x) = [x1*x2*x3, max(x1, x2, x3)] nonlinear function
m0 = 3
m1 = 20
m2 = 2

nn = TwoLayerNetwork(m0, m1, m2)
In [7]:
# Training data
n = 10000
X = 10*(np.random.rand(3, n) - 0.5)  # Uniform [-5, 5]
y = np.vstack([X.prod(axis=0), X.max(axis=0)])
In [8]:
nn.train(X, y, 1e-3, 200000)  # Note that this may take 20-30 minutes.
100%|██████████| 200000/200000 [21:25<00:00, 155.59it/s] 
Out[8]:
12.815207909936653
In [9]:
# Evaluate the network on a test point
test_point = np.array([1, 2, 3]).reshape((-1, 1))
prediction = nn.predict(test_point)

prod_pred = prediction[0][0]  # true product: 1*2*3 = 6
max_pred = prediction[1][0]   # true max: max(1, 2, 3) = 3

print(prediction)
[[5.93894474]
 [3.63720082]]
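
A single test point only gives a rough impression of the fit. The cell below is an added sketch (not part of the original run) that evaluates the trained network on a fresh batch of uniform test points, drawn the same way as the training data, and reports the mean-squared error separately for the product and max outputs; the names Xtest2, ytest2, and ypred2 are introduced here.

In [ ]:
# Evaluate on a fresh batch of uniform test points (an added sketch).
ntest = 2000
Xtest2 = 10*(np.random.rand(3, ntest) - 0.5)  # Uniform [-5, 5], as in training
ytest2 = np.vstack([Xtest2.prod(axis=0), Xtest2.max(axis=0)])
ypred2 = nn.predict(Xtest2)

# Per-output MSE: row 0 is the product, row 1 is the max
mse_per_output = np.mean((ypred2 - ytest2)**2, axis=1)
print(f'Product MSE: {mse_per_output[0]:.3f}')
print(f'Max MSE:     {mse_per_output[1]:.3f}')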

Plot ReLU and gradient¶

In [10]:
# Plot ReLU and gradient

x = np.linspace(-1, 1, 101)
y = relu(x)

fig, ax = plt.subplots(1,2, figsize=(10,4))
ax[0].plot(x,y, lw=4, c='k')
ax[0].set_xticks([-1, 0, 1])
ax[0].set_yticks([0, 1], ["0", "1"])
ax[0].set_ylim([-0.2, 1])
ax[0].set_title('$\\texttt{relu}$')



xn = np.linspace(-1, 0, 101)
yn = grad_relu(xn)

xp = np.linspace(0, 1, 101)[1:]
yp = grad_relu(xp)

ax[1].plot(xn,yn, lw=4, c='k')
ax[1].plot(xp,yp, lw=4, c='k')
ax[1].plot([0,0], [0,1], lw=4, c='k', ls=':')
ax[1].set_xticks([-1, 0, 1])
ax[1].set_yticks([0, 1], ["0", "1"])
ax[1].set_ylim([-0.2, 1.2])
ax[1].set_title('$\\texttt{relu}^{\\prime}$')

plt.tight_layout()
#plt.savefig('relu.png')
[Figure: the ReLU function (left) and its derivative, the Heaviside step function (right).]