# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import argparse
import glob
import os
import re
import time

import numpy as np
import tensorflow as tf
from azureml.core import Run
from tensorflow.keras import Model, layers

from utils import load_data


# Create TF Model.
class NeuralNet(Model):
    # Set layers.
    def __init__(self):
        super(NeuralNet, self).__init__()
        # First hidden layer.
        self.h1 = layers.Dense(n_h1, activation=tf.nn.relu)
        # Second hidden layer.
        self.h2 = layers.Dense(n_h2, activation=tf.nn.relu)
        # Output layer: one raw logit per class.
        self.out = layers.Dense(n_outputs)

    # Set forward pass.
    def call(self, x, is_training=False):
        x = self.h1(x)
        x = self.h2(x)
        x = self.out(x)
        if not is_training:
            # Apply softmax when not training.
            x = tf.nn.softmax(x)
        return x

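# For illustration: given a batch of flattened 28x28 MNIST images (shape (batch_size, 784),
# matching n_inputs below), NeuralNet returns raw logits of shape (batch_size, 10) during
# training and softmax probabilities of the same shape when is_training=False.
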
def cross_entropy_loss(y, logits):
    # Convert labels to int64 for the tf cross-entropy function.
    y = tf.cast(y, tf.int64)
    # Apply softmax to logits and compute cross-entropy.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    # Average loss across the batch.
    return tf.reduce_mean(loss)

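# Note: the "sparse" variant of the cross-entropy op expects integer class indices (0-9)
# rather than one-hot vectors, which is why the labels are cast to int64 above instead of
# being one-hot encoded.
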
# Accuracy metric.
def accuracy(y_pred, y_true):
    # Predicted class is the index of highest score in prediction vector (i.e. argmax).
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1)

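# Worked example: with y_pred = [[0.1, 0.9], [0.8, 0.2]] and y_true = [1, 1], the argmax
# predictions are [1, 0], so only the first one matches and the accuracy is 0.5.
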
# Optimization process.
def run_optimization(x, y):
    # Wrap computation inside a GradientTape for automatic differentiation.
    with tf.GradientTape() as g:
        # Forward pass.
        logits = neural_net(x, is_training=True)
        # Compute loss.
        loss = cross_entropy_loss(y, logits)

    # Variables to update, i.e. trainable variables.
    trainable_variables = neural_net.trainable_variables

    # Compute gradients.
    gradients = g.gradient(loss, trainable_variables)

    # Update W and b following gradients.
    optimizer.apply_gradients(zip(gradients, trainable_variables))

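# Each call to run_optimization performs one plain SGD step on the current mini-batch:
# every trainable weight w is updated as w <- w - learning_rate * d(loss)/dw.
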
print("TensorFlow version:", tf.__version__)

parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', default='data', help='data folder mounting point')
parser.add_argument('--batch-size', type=int, dest='batch_size', default=128, help='mini batch size for training')
parser.add_argument('--first-layer-neurons', type=int, dest='n_hidden_1', default=128,
                    help='# of neurons in the first layer')
parser.add_argument('--second-layer-neurons', type=int, dest='n_hidden_2', default=128,
                    help='# of neurons in the second layer')
parser.add_argument('--learning-rate', type=float, dest='learning_rate', default=0.01, help='learning rate')
parser.add_argument('--resume-from', type=str, default=None,
                    help='location of the model or checkpoint files from where to resume the training')
args = parser.parse_args()

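# Example invocation (hypothetical script name and paths, for illustration only):
#   python tf_mnist.py --data-folder /mnt/mnist --batch-size 64 --learning-rate 0.001
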
previous_model_location = args.resume_from
# You can also use an environment variable to get the model/checkpoint files location:
# previous_model_location = os.path.expandvars(os.getenv("AZUREML_DATAREFERENCE_MODEL_LOCATION", None))

data_folder = args.data_folder
print('Data folder:', data_folder)

# Load the train and test sets into numpy arrays.
# Note: we scale the pixel intensity values to the 0-1 range (by dividing by 255.0) so the model converges faster.
X_train = load_data(glob.glob(os.path.join(data_folder, '**/train-images-idx3-ubyte.gz'),
                              recursive=True)[0], False) / np.float32(255.0)
X_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-images-idx3-ubyte.gz'),
                             recursive=True)[0], False) / np.float32(255.0)
y_train = load_data(glob.glob(os.path.join(data_folder, '**/train-labels-idx1-ubyte.gz'),
                              recursive=True)[0], True).reshape(-1)
y_test = load_data(glob.glob(os.path.join(data_folder, '**/t10k-labels-idx1-ubyte.gz'),
                             recursive=True)[0], True).reshape(-1)

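# MNIST provides 60,000 training and 10,000 test examples; the print below shows the
# shapes actually loaded (the exact array layout depends on the utils.load_data helper).
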
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

training_set_size = X_train.shape[0]

n_inputs = 28 * 28
n_h1 = args.n_hidden_1
n_h2 = args.n_hidden_2
n_outputs = 10
learning_rate = args.learning_rate
n_epochs = 20
batch_size = args.batch_size

# Build neural network model.
neural_net = NeuralNet()

# Stochastic gradient descent optimizer.
optimizer = tf.optimizers.SGD(learning_rate)

# Start an Azure ML run.
run = Run.get_context()

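# Run.get_context() returns the submitted run when this script is executed by Azure ML;
# when executed locally it falls back to an "offline" run, so the run.log() calls below
# remain safe to make.
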
if previous_model_location:
    # Restore variables from latest checkpoint.
    checkpoint = tf.train.Checkpoint(model=neural_net, optimizer=optimizer)
    checkpoint_file_path = tf.train.latest_checkpoint(previous_model_location)
    checkpoint.restore(checkpoint_file_path)
    checkpoint_filename = os.path.basename(checkpoint_file_path)
    num_found = re.search(r'\d+', checkpoint_filename)
    if num_found:
        start_epoch = int(num_found.group(0))
        print("Resuming from epoch {}".format(str(start_epoch)))

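# Note on resuming: tf.train.Checkpoint.save appends its save counter to the checkpoint
# file name (e.g. "-3" in "-3.index"), and the regex above extracts that number as a
# rough indicator of how far training had progressed.
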
start_time = time.perf_counter()
for epoch in range(0, n_epochs):

    # Randomly shuffle the training set.
    indices = np.random.permutation(training_set_size)
    X_train = X_train[indices]
    y_train = y_train[indices]

    # Batch index.
    b_start = 0
    b_end = b_start + batch_size
    for _ in range(training_set_size // batch_size):
        # Get a batch.
        X_batch, y_batch = X_train[b_start: b_end], y_train[b_start: b_end]

        # Update batch index for the next batch.
        b_start = b_start + batch_size
        b_end = min(b_start + batch_size, training_set_size)

        # Train.
        run_optimization(X_batch, y_batch)

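    # Note: with this batching scheme, any leftover examples that do not fill a complete
    # mini-batch are skipped for the current epoch; the shuffle at the start of the next
    # epoch gives them another chance to be used.
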
    # Evaluate on the last training batch.
    pred = neural_net(X_batch, is_training=False)
    acc_train = accuracy(pred, y_batch)

    # Evaluate on the validation (test) set.
    pred = neural_net(X_test, is_training=False)
    acc_val = accuracy(pred, y_test)

    # Log accuracies.
    run.log('training_acc', float(acc_train))
    run.log('validation_acc', float(acc_val))
    print(epoch, '-- Training accuracy:', acc_train, '-- Validation accuracy:', acc_val)

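    # run.log() streams these named metrics to the Azure ML run history, where they can
    # be charted and compared across runs in the studio UI.
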
    # Save checkpoints in the "./outputs" folder so that they are automatically uploaded into run history.
    checkpoint_dir = './outputs/'
    checkpoint = tf.train.Checkpoint(model=neural_net, optimizer=optimizer)

    if epoch % 2 == 0:
        checkpoint.save(checkpoint_dir)

run.log('final_acc', float(acc_val))
os.makedirs('./outputs/model', exist_ok=True)

# Files saved in the "./outputs" folder are automatically uploaded into run history.
# This is a workaround for https://github.com/tensorflow/tensorflow/issues/33913 and will be fixed once we move past TF 2.1.
neural_net._set_inputs(X_train)
tf.saved_model.save(neural_net, './outputs/model/')

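# After the run completes, './outputs/model/' should contain the SavedModel artifacts
# (saved_model.pb plus a variables/ subfolder), which can then be registered as an
# Azure ML model for deployment.
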
stop_time = time.perf_counter()
training_time = (stop_time - start_time) * 1000
print("Total time in milliseconds for training: {}".format(str(training_time)))