
I've recently implemented a neural network from scratch and am now focusing on visualizing the optimization process. Specifically, I'm interested in creating a 3D visualization of the loss landscape that clearly illustrates the gradient descent trajectory as it converges to a local minimum.

My architecture is quite simple: I'm using the MNIST dataset and my model predicts handwritten digits. The input layer has 784 neurons, the hidden layer 128 neurons, and the output layer 10 neurons.

I understand that to start I have to store the weights and biases W1, W2, b1, b2 and then pass them, along with the losses, to my gradient descent visualizer. I'm not sure what to do next; a rough sketch of what I have in mind is at the bottom, after my training code.

If anyone has an idea or has implemented a 3D gradient descent visualization before, I would be very grateful for any hints and suggestions.

Code:

import numpy as np 
import struct
from array import array
from os.path import join
import matplotlib.pyplot as plt

class MnistDataloader(object):
    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath
    
    def read_images_labels(self, images_filepath, labels_filepath):        
        labels = []
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())        
        
        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())        
        images = []
        for i in range(size):
            images.append([0] * rows * cols)
        for i in range(size):
            img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
            img = img.reshape(28, 28)
            images[i][:] = img            
        
        return images, labels
            
    def load_data(self):
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)  


    def preprocess_data(self, x_train, y_train, x_test, y_test):
        x_train = np.array(x_train, dtype='float32') # vector (60000, 28, 28)
        x_test = np.array(x_test, dtype='float32') # vector (10000, 28, 28)
        y_train = np.array(y_train, dtype='int32') # vector (60000)
        y_test = np.array(y_test, dtype='int32') # vector (10000)
        
        x_train = x_train / 255.0
        x_test = x_test / 255.0

        x_train = x_train.reshape(x_train.shape[0], -1) # vector (60000, 784)
        x_test = x_test.reshape(x_test.shape[0], -1)
        
        def to_one_hot(y, num_classes=10): # one hot encoding
            return np.eye(num_classes)[y] 
        
        y_train_onehot = to_one_hot(y_train) # vector (60000, 10)
        y_test_onehot = to_one_hot(y_test) # vector (10000, 10)
        
        return x_train, y_train_onehot, x_test, y_test_onehot
    
class miniModel():
    def __init__(self):
        pass

    def initialize_parameters(self, input_size, hidden_size, output_size):
        np.random.seed(np.random.randint(1000))  # note: re-seeding with a random value, so each run uses a different initialization
        
        # 1. Uniform distribution
        # W1 = np.random.uniform(-1/np.sqrt(input_size), 1/np.sqrt(input_size), 
        #                        size=(hidden_size, input_size))
        # W2 = np.random.uniform(-1/np.sqrt(hidden_size), 1/np.sqrt(hidden_size), 
        #                        size=(output_size, hidden_size))
        
        # 2. Xavier Normal
        # W1 = np.random.randn(hidden_size, input_size) * np.sqrt(2.0/input_size)
        # W2 = np.random.randn(output_size, hidden_size) * np.sqrt(2.0/hidden_size)
        
        # 3. Xavier Uniform
        # W1 = np.random.uniform(-np.sqrt(6)/np.sqrt(input_size + hidden_size),
        #                       np.sqrt(6)/np.sqrt(input_size + hidden_size), 
        #                       size=(hidden_size, input_size))
        # W2 = np.random.uniform(-np.sqrt(6)/np.sqrt(hidden_size + output_size),
        #                       np.sqrt(6)/np.sqrt(hidden_size + output_size), 
        #                       size=(output_size, hidden_size))

        # 4. He uniform
        # W1 = np.random.uniform(-np.sqrt(6/input_size), np.sqrt(6/input_size), size = (hidden_size, input_size))
        # W2 = np.random.uniform(-np.sqrt(6/hidden_size), np.sqrt(6/hidden_size), size = (output_size, hidden_size))
        
        # 5. He normal
        W1 = np.random.randn(hidden_size, input_size) * np.sqrt(2/input_size)
        W2 = np.random.randn(output_size, hidden_size) * np.sqrt(2/hidden_size)
        
        b1 = np.zeros((1, hidden_size))
        b2 = np.zeros((1, output_size))
        
        return {"W1": W1, "b1": b1, "W2": W2, "b2": b2}

    def relu(self, x):
        return np.maximum(0, x)

    def softmax(self, x):
        exps = np.exp(x - np.max(x, axis = 1, keepdims=True))
        return exps/np.sum(exps, axis=1, keepdims=True)

    # Linear combination calculations
    # hiden layer activation z1 = W1 * X + b1 , ReLU a1 = ReLU(z1)
    # output layer activation z2 = W2 * a1 + b2, softmax a2 = softmax(z2)

    def forward_propagation(self, X, params):
        # Hidden Layer
        z1 = np.dot(X, params["W1"].T) + params["b1"]  
        a1 = self.relu(z1)
        
        # Output Layer
        z2 = np.dot(a1, params["W2"].T) + params["b2"] 
        a2 = self.softmax(z2)

        return {"z1":z1, "a1":a1, "z2":z2, "a2":a2}

    # Cross-Entropy loss function
    def compute_loss(self, y_true, y_pred):
        m = y_true.shape[0]
        return -np.sum(y_true * np.log(y_pred + 1e-15))/m

    # backward_propagation
    def backward_propagation(self, X, y_true, params, forward_cache):
        m = X.shape[0] # m = 64
        
        dz2 = forward_cache["a2"] - y_true   # (m, 10)
        dW2 = np.dot(forward_cache["a1"].T, dz2) / m  # (128, 10)
        db2 = np.sum(dz2, axis=0, keepdims=True) / m  # (1, 10)
        
        dz1 = np.dot(dz2, params["W2"]) * (forward_cache["z1"] > 0)  # (m, 128)
        dW1 = np.dot(X.T, dz1) / m   # (784, 128)
        db1 = np.sum(dz1, axis=0, keepdims=True) / m  # (1, 128)
        
        return {"dW1": dW1.T, "db1": db1, "dW2": dW2.T, "db2": db2}

    def update_parameters(self, params, grads, learning_rate=0.01):
        params["W1"] -= learning_rate * grads["dW1"]
        params["b1"] -= learning_rate * grads["db1"]
        params["W2"] -= learning_rate * grads["dW2"]
        params["b2"] -= learning_rate * grads["db2"]
        return params

    def show_loss_accuracy_graph(self, loss_accuracy_history, epochs):
        epochs_history = list(range(1, epochs+1))

        loss = [l[0] for l in loss_accuracy_history]
        accuracy = [a[1] for a in loss_accuracy_history]

        figure, axis = plt.subplots(2)

        axis[0].plot(epochs_history, loss, 'tab:green')
        axis[0].set_title(f"Loss over {epochs} epochs")
        axis[0].set_xticks(np.arange(1, epochs + 1))
        axis[0].grid(True)
        axis[0].set(xlabel = "Epochs", ylabel = "Loss")

        axis[1].plot(epochs_history, accuracy)
        axis[1].set_title(f"Accuracy over {epochs} epochs")
        axis[1].set_xticks(np.arange(1, epochs + 1))
        axis[1].grid(True)
        axis[1].set(xlabel = "Epochs", ylabel = "Accuracy")

        plt.tight_layout()
        plt.savefig('charts/loss_accuracy.png')
    
    def train(self, X, y, params, epochs=10, batch_size=64, learning_rate=0.01):
        loss_accuracy_history = []
        params_history = []  # Track parameter history
        
        for epoch in range(epochs):
            permutation = np.random.permutation(X.shape[0])
            X_shuffled = X[permutation]
            y_shuffled = y[permutation]
            
            for i in range(0, X.shape[0], batch_size):
                X_batch = X_shuffled[i:i+batch_size]
                y_batch = y_shuffled[i:i+batch_size]
                
                cache = self.forward_propagation(X_batch, params)
                grads = self.backward_propagation(X_batch, y_batch, params, cache)
                params = self.update_parameters(params, grads, learning_rate)
                
                # Store current parameters and the mini-batch loss (computed before this update)
                params_history.append({
                    'W1': params['W1'].copy(),
                    'W2': params['W2'].copy(),
                    'b1': params['b1'].copy(),
                    'b2': params['b2'].copy(),
                    'loss': self.compute_loss(y_batch, cache["a2"])
                })
            
            cache = self.forward_propagation(X, params)
            loss = self.compute_loss(y, cache["a2"])
            
            predictions = np.argmax(cache["a2"], axis=1)
            accuracy = np.mean(predictions == np.argmax(y, axis=1))
            loss_accuracy_history.append([loss, accuracy])
            
            print(f"Epoch {epoch+1}/{epochs} | Loss: {loss:.4f} | Accuracy: {accuracy*100:.2f}%")
        
        self.show_loss_accuracy_graph(loss_accuracy_history, epochs)
        
        return params, params_history

    def evaluate(self, X_test, y_test, params):
        cache = self.forward_propagation(X_test, params)
        predictions = np.argmax(cache["a2"], axis=1)
        accuracy = np.mean(predictions == np.argmax(y_test, axis=1))
        print(f"Test Accuracy: {accuracy * 100:.2f}%")
    

# Paths to the downloaded data
input_path = 'data'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')

# Loading and processing data
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()
x_train, y_train_onehot, x_test, y_test_onehot = mnist_dataloader.preprocess_data(x_train, y_train, x_test, y_test)

# Running the model
model = miniModel()
params = model.initialize_parameters(784, 128, 10)
params, params_history = model.train(x_train, y_train_onehot, params, epochs=20, batch_size=64, learning_rate=0.01)
model.evaluate(x_test, y_test_onehot, params)
sample_image = x_test[20] 
predicted_digit = np.argmax(model.forward_propagation(sample_image.reshape(1, -1), params)["a2"])
print(f"Predicted: {predicted_digit}, True: {np.argmax(y_test_onehot[0])}")
