I've recently implemented a neural network from scratch and am now focusing on visualizing the optimization process. Specifically, I'm interested in creating a 3D visualization of the loss landscape that clearly illustrates the gradient descent trajectory as it converges to a local minimum.
My architecture is quite simple: I'm using the MNIST dataset and my model predicts handwritten digits. The input layer consists of 784 neurons, the hidden layer of 128 neurons, and the output layer of 10 neurons.
I understand that, to start, I have to store the weights and biases W1, W2, b1, b2 and then pass them along with the losses to my gradient descent visualizer, but I'm not sure what to do next.
If anyone has an idea or has implemented a 3D gradient descent visualization before, I would be very grateful for any hints and suggestions.
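Here is the rough plan I've pieced together so far, in case it helps frame the question. It is a minimal, untested sketch: flatten_params, set_params_from_vector and plot_loss_landscape are placeholder names I made up, I'd probably snapshot the parameters once per epoch (not every batch) so the history stays small, and I'd evaluate the loss on only a small data subset for speed. The idea is to flatten each stored parameter set into one long vector, take two directions from a PCA of the trajectory, evaluate the loss on a grid spanned by those directions around the final parameters, and draw the surface together with the descent path.

import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (needed for the 3D projection on older Matplotlib)

def flatten_params(p):
    # Concatenate all weights and biases into a single 1-D vector.
    return np.concatenate([p["W1"].ravel(), p["b1"].ravel(),
                           p["W2"].ravel(), p["b2"].ravel()])

def set_params_from_vector(vec, template):
    # Inverse of flatten_params: rebuild a params dict with the shapes of `template`.
    out, i = {}, 0
    for key in ["W1", "b1", "W2", "b2"]:
        n = template[key].size
        out[key] = vec[i:i + n].reshape(template[key].shape)
        i += n
    return out

def plot_loss_landscape(model, params_history, X, y, grid_size=25):
    # Trajectory as a (num_snapshots, num_parameters) matrix.
    traj = np.stack([flatten_params(p) for p in params_history])
    center = traj[-1]  # final parameters, treated as the (local) minimum

    # Two principal directions of the trajectory (PCA via SVD, NumPy only).
    _, _, Vt = np.linalg.svd(traj - center, full_matrices=False)
    d1, d2 = Vt[0], Vt[1]
    coords = (traj - center) @ np.stack([d1, d2], axis=1)  # path projected onto the plane

    # Evaluate the loss on a grid spanned by d1 and d2 around the final parameters.
    a = np.linspace(coords[:, 0].min(), coords[:, 0].max(), grid_size)
    b = np.linspace(coords[:, 1].min(), coords[:, 1].max(), grid_size)
    A, B = np.meshgrid(a, b)
    Z = np.zeros_like(A)
    for i in range(grid_size):
        for j in range(grid_size):
            p = set_params_from_vector(center + A[i, j] * d1 + B[i, j] * d2,
                                       params_history[-1])
            Z[i, j] = model.compute_loss(y, model.forward_propagation(X, p)["a2"])

    # Loss at every visited point, so the path sits on (or near) the surface.
    path_z = [model.compute_loss(y, model.forward_propagation(X, p)["a2"])
              for p in params_history]

    fig = plt.figure()
    ax = fig.add_subplot(projection="3d")
    ax.plot_surface(A, B, Z, cmap="viridis", alpha=0.6)
    ax.plot(coords[:, 0], coords[:, 1], path_z, "r.-", label="descent path")
    ax.set(xlabel="direction 1", ylabel="direction 2", zlabel="loss")
    ax.legend()
    plt.savefig("charts/loss_landscape_3d.png")

Projecting onto the top two PCA directions of the trajectory is borrowed from the "visualizing the loss landscape" literature; two random directions would also work as a fallback, the surface just wouldn't follow the path as closely.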
Code:
import numpy as np
import struct
from array import array
from os.path import join
import matplotlib.pyplot as plt
class MnistDataloader(object):
def __init__(self, training_images_filepath,training_labels_filepath,
test_images_filepath, test_labels_filepath):
self.training_images_filepath = training_images_filepath
self.training_labels_filepath = training_labels_filepath
self.test_images_filepath = test_images_filepath
self.test_labels_filepath = test_labels_filepath
def read_images_labels(self, images_filepath, labels_filepath):
labels = []
with open(labels_filepath, 'rb') as file:
magic, size = struct.unpack(">II", file.read(8))
if magic != 2049:
raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
labels = array("B", file.read())
with open(images_filepath, 'rb') as file:
magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
if magic != 2051:
raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
image_data = array("B", file.read())
images = []
for i in range(size):
images.append([0] * rows * cols)
for i in range(size):
img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
img = img.reshape(28, 28)
images[i][:] = img
return images, labels
def load_data(self):
x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
return (x_train, y_train),(x_test, y_test)
def preprocess_data(self, x_train, y_train, x_test, y_test):
x_train = np.array(x_train, dtype='float32') # vector (60000, 28, 28)
x_test = np.array(x_test, dtype='float32') # vector (10000, 28, 28)
y_train = np.array(y_train, dtype='int32') # vector (60000)
y_test = np.array(y_test, dtype='int32') # vector (10000)
x_train = x_train / 255.0
x_test = x_test / 255.0
x_train = x_train.reshape(x_train.shape[0], -1) # vector (60000, 784)
x_test = x_test.reshape(x_test.shape[0], -1)
def to_one_hot(y, num_classes=10): # one hot encoding
return np.eye(num_classes)[y]
y_train_onehot = to_one_hot(y_train) # vector (60000, 10)
y_test_onehot = to_one_hot(y_test) # vector (10000, 10)
return x_train, y_train_onehot, x_test, y_test_onehot
class miniModel():
def __init__(self):
pass
def initialize_parameters(self, input_size, hidden_size, output_size):
np.random.seed(np.random.randint(1000))
# 1. Uniform distribution
# W1 = np.random.uniform(-1/np.sqrt(input_size), 1/np.sqrt(input_size),
# size=(hidden_size, input_size))
# W2 = np.random.uniform(-1/np.sqrt(hidden_size), 1/np.sqrt(hidden_size),
# size=(output_size, hidden_size))
# 2. Xavier Normal
# W1 = np.random.randn(hidden_size, input_size) * np.sqrt(2.0/input_size)
# W2 = np.random.randn(output_size, hidden_size) * np.sqrt(2.0/hidden_size)
# 3. Xavier Uniform
# W1 = np.random.uniform(-np.sqrt(6)/np.sqrt(input_size + hidden_size),
# np.sqrt(6)/np.sqrt(input_size + hidden_size),
# size=(hidden_size, input_size))
# W2 = np.random.uniform(-np.sqrt(6)/np.sqrt(hidden_size + output_size),
# np.sqrt(6)/np.sqrt(hidden_size + output_size),
# size=(output_size, hidden_size))
# 4. He uniform
# W1 = np.random.uniform(-np.sqrt(6/input_size), np.sqrt(6/input_size), size = (hidden_size, input_size))
# W2 = np.random.uniform(-np.sqrt(6/input_size), np.sqrt(6/input_size), size = (output_size, hidden_size))
# 5. He normal
W1 = np.random.randn(hidden_size, input_size) * np.sqrt(2/input_size)
        W2 = np.random.randn(output_size, hidden_size) * np.sqrt(2/hidden_size)  # fan-in for this layer is hidden_size
b1 = np.zeros((1, hidden_size))
b2 = np.zeros((1, output_size))
return {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
def relu(self, x):
return np.maximum(0, x)
def softmax(self, x):
exps = np.exp(x - np.max(x, axis = 1, keepdims=True))
return exps/np.sum(exps, axis=1, keepdims=True)
# Linear combination calculations
    # hidden layer activation z1 = W1 * X + b1, ReLU a1 = ReLU(z1)
# output layer activation z2 = W2 * a1 + b2, softmax a2 = softmax(z2)
def forward_propagation(self, X, params):
# Hidden Layer
z1 = np.dot(X, params["W1"].T) + params["b1"]
a1 = self.relu(z1)
# Output Layer
z2 = np.dot(a1, params["W2"].T) + params["b2"]
a2 = self.softmax(z2)
return {"z1":z1, "a1":a1, "z2":z2, "a2":a2}
# Cross-Entropy loss function
def compute_loss(self, y_true, y_pred):
m = y_true.shape[0]
return -np.sum(y_true * np.log(y_pred + 1e-15))/m
# backward_propagation
def backward_propagation(self, X, y_true, params, forward_cache):
m = X.shape[0] # m = 64
dz2 = forward_cache["a2"] - y_true # (m, 10)
dW2 = np.dot(forward_cache["a1"].T, dz2) / m # (128, 10)
        db2 = np.sum(dz2, axis=0, keepdims=True) / m # (1, 10)
dz1 = np.dot(dz2, params["W2"]) * (forward_cache["z1"] > 0) # (m, 128)
dW1 = np.dot(X.T, dz1) / m # (784, 128)
        db1 = np.sum(dz1, axis=0, keepdims=True) / m # (1, 128)
return {"dW1": dW1.T, "db1": db1, "dW2": dW2.T, "db2": db2}
def update_parameters(self, params, grads, learning_rate=0.01):
params["W1"] -= learning_rate * grads["dW1"]
params["b1"] -= learning_rate * grads["db1"]
params["W2"] -= learning_rate * grads["dW2"]
params["b2"] -= learning_rate * grads["db2"]
return params
def show_loss_accuracy_graph(self, loss_accuracy_history, epochs):
epochs_history = list(range(1, epochs+1))
loss = [l[0] for l in loss_accuracy_history]
accuracy = [a[1] for a in loss_accuracy_history]
figure, axis = plt.subplots(2)
axis[0].plot(epochs_history, loss, 'tab:green')
axis[0].set_title(f"Loss over {epochs} epochs")
        axis[0].set_xticks(np.arange(1, epochs + 1, 1))
axis[0].grid(True)
axis[0].set(xlabel = "Epochs", ylabel = "Loss")
axis[1].plot(epochs_history, accuracy)
axis[1].set_title(f"Accuracy over {epochs} epochs")
        axis[1].set_xticks(np.arange(1, epochs + 1, 1))
axis[1].grid(True)
axis[1].set(xlabel = "Epochs", ylabel = "Accuracy")
plt.tight_layout()
plt.savefig('charts/loss_accuracy.png')
def train(self, X, y, params, epochs=10, batch_size=64, learning_rate=0.01):
loss_accuracy_history = []
params_history = [] # Track parameter history
for epoch in range(epochs):
permutation = np.random.permutation(X.shape[0])
X_shuffled = X[permutation]
y_shuffled = y[permutation]
for i in range(0, X.shape[0], batch_size):
X_batch = X_shuffled[i:i+batch_size]
y_batch = y_shuffled[i:i+batch_size]
cache = self.forward_propagation(X_batch, params)
grads = self.backward_propagation(X_batch, y_batch, params, cache)
params = self.update_parameters(params, grads, learning_rate)
                # Store a snapshot of the current parameters (every batch; once per epoch would keep the history much smaller)
params_history.append({
'W1': params['W1'].copy(),
'W2': params['W2'].copy(),
'b1': params['b1'].copy(),
'b2': params['b2'].copy()
})
cache = self.forward_propagation(X, params)
            loss = self.compute_loss(y, cache["a2"])
predictions = np.argmax(cache["a2"], axis=1)
accuracy = np.mean(predictions == np.argmax(y, axis=1))
loss_accuracy_history.append([loss, accuracy])
print(f"Epoch {epoch+1}/{epochs} | Loss: {loss:.4f} | Accuracy: {accuracy*100:.2f}%")
self.show_loss_accuracy_graph(loss_accuracy_history, epochs)
        return params, params_history
def evaluate(self, X_test, y_test, params):
cache = self.forward_propagation(X_test, params)
predictions = np.argmax(cache["a2"], axis=1)
accuracy = np.mean(predictions == np.argmax(y_test, axis=1))
print(f"Test Accuracy: {accuracy * 100:.2f}%")
# Paths to the downloaded data
input_path = 'data'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')
# Loading and processing data
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()
x_train, y_train_onehot, x_test, y_test_onehot = mnist_dataloader.preprocess_data(x_train, y_train, x_test, y_test)
# Running the model
model = miniModel()
params = model.initialize_parameters(784, 128, 10)
params, params_history = model.train(x_train, y_train_onehot, params, epochs=20, batch_size=64, learning_rate=0.01)
model.evaluate(x_test, y_test_onehot, params)
sample_image = x_test[20]
predicted_digit = np.argmax(model.forward_propagation(sample_image.reshape(1, -1), params)["a2"])
print(f"Predicted: {predicted_digit}, True: {np.argmax(y_test_onehot[0])}")