diff --git a/how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-nccl-gloo/pytorch_mnist.py b/how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-nccl-gloo/pytorch_mnist.py
deleted file mode 100644
index 7a9aeb60..00000000
--- a/how-to-use-azureml/ml-frameworks/pytorch/distributed-pytorch-with-nccl-gloo/pytorch_mnist.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# Copyright (c) 2017, PyTorch contributors
-# Modifications copyright (C) Microsoft Corporation
-# Licensed under the BSD license
-# Adapted from https://github.com/Azure/BatchAI/tree/master/recipes/PyTorch/PyTorch-GPU-Distributed-Gloo
-
-from __future__ import print_function
-import argparse
-import os
-import shutil
-import time
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from torchvision import datasets, transforms
-import torch.nn.parallel
-import torch.backends.cudnn as cudnn
-import torch.distributed as dist
-import torch.utils.data
-import torch.utils.data.distributed
-import torchvision.models as models
-
-from azureml.core.run import Run
-# get the Azure ML run object
-run = Run.get_context()
-
-# Training settings
-parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
-parser.add_argument('--batch-size', type=int, default=64, metavar='N',
-                    help='input batch size for training (default: 64)')
-parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
-                    help='input batch size for testing (default: 1000)')
-parser.add_argument('--epochs', type=int, default=10, metavar='N',
-                    help='number of epochs to train (default: 10)')
-parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
-                    help='learning rate (default: 0.01)')
-parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
-                    help='SGD momentum (default: 0.5)')
-parser.add_argument('--seed', type=int, default=1, metavar='S',
-                    help='random seed (default: 1)')
-parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
-                    help='number of data loading workers (default: 4)')
-parser.add_argument('--log-interval', type=int, default=10, metavar='N',
-                    help='how many batches to wait before logging training status')
-parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
-                    metavar='W', help='weight decay (default: 1e-4)')
-parser.add_argument('--world-size', default=1, type=int,
-                    help='number of distributed processes')
-parser.add_argument('--dist-url', type=str,
-                    help='url used to set up distributed training')
-parser.add_argument('--dist-backend', default='nccl', type=str,
-                    help='distributed backend')
-parser.add_argument('--rank', default=-1, type=int,
-                    help='rank of the worker')
-
-best_prec1 = 0
-args = parser.parse_args()
-
-args.distributed = args.world_size >= 2
-
-if args.distributed:
-    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
-                            world_size=args.world_size, rank=args.rank)
-
-train_dataset = datasets.MNIST('data-%d' % args.rank, train=True, download=True,
-                               transform=transforms.Compose([
-                                   transforms.ToTensor(),
-                                   transforms.Normalize((0.1307,), (0.3081,))
-                               ]))
-
-if args.distributed:
-    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
-else:
-    train_sampler = None
-
-train_loader = torch.utils.data.DataLoader(
-    train_dataset,
-    batch_size=args.batch_size, shuffle=(train_sampler is None),
-    num_workers=args.workers, pin_memory=True, sampler=train_sampler)
-
-
-test_loader = torch.utils.data.DataLoader(
-    train_dataset,
-    batch_size=args.batch_size, shuffle=False,
-    num_workers=args.workers, pin_memory=True)
-
-
-class Net(nn.Module):
-    def __init__(self):
-        super(Net, self).__init__()
-        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
-        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
-        self.conv2_drop = nn.Dropout2d()
-        self.fc1 = nn.Linear(320, 50)
-        self.fc2 = nn.Linear(50, 10)
-
-    def forward(self, x):
-        x = F.relu(F.max_pool2d(self.conv1(x), 2))
-        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
-        x = x.view(-1, 320)
-        x = F.relu(self.fc1(x))
-        x = F.dropout(x, training=self.training)
-        x = self.fc2(x)
-        return F.log_softmax(x)
-
-
-model = Net()
-
-if not args.distributed:
-    model = torch.nn.DataParallel(model).cuda()
-else:
-    model.cuda()
-    model = torch.nn.parallel.DistributedDataParallel(model)
-
-# define loss function (criterion) and optimizer
-criterion = nn.CrossEntropyLoss().cuda()
-
-optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
-
-
-def train(epoch):
-    batch_time = AverageMeter()
-    data_time = AverageMeter()
-    losses = AverageMeter()
-    top1 = AverageMeter()
-    top5 = AverageMeter()
-
-    # switch to train mode
-    model.train()
-    end = time.time()
-    for i, (input, target) in enumerate(train_loader):
-        # measure data loading time
-        data_time.update(time.time() - end)
-
-        input, target = input.cuda(), target.cuda()
-
-        # compute output
-        try:
-            output = model(input)
-            loss = criterion(output, target)
-
-            # measure accuracy and record loss
-            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
-            losses.update(loss.item(), input.size(0))
-            top1.update(prec1[0], input.size(0))
-            top5.update(prec5[0], input.size(0))
-
-            # compute gradient and do SGD step
-            optimizer.zero_grad()
-            loss.backward()
-            optimizer.step()
-
-            # measure elapsed time
-            batch_time.update(time.time() - end)
-            end = time.time()
-
-            if i % 10 == 0:
-                run.log("loss", losses.avg)
-                run.log("prec@1", "{0:.3f}".format(top1.avg))
-                run.log("prec@5", "{0:.3f}".format(top5.avg))
-                print('Epoch: [{0}][{1}/{2}]\t'
-                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
-                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
-                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
-                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
-                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
-                                                                      batch_time=batch_time, data_time=data_time,
-                                                                      loss=losses, top1=top1, top5=top5))
-        except:
-            import sys
-            print("Unexpected error:", sys.exc_info()[0])
-
-
-class AverageMeter(object):
-    """Computes and stores the average and current value"""
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.val = 0
-        self.avg = 0
-        self.sum = 0
-        self.count = 0
-
-    def update(self, val, n=1):
-        self.val = val
-        self.sum += val * n
-        self.count += n
-        self.avg = self.sum / self.count
-
-
-def accuracy(output, target, topk=(1,)):
-    """Computes the precision@k for the specified values of k"""
-    maxk = max(topk)
-    batch_size = target.size(0)
-
-    _, pred = output.topk(maxk, 1, True, True)
-    pred = pred.t()
-    correct = pred.eq(target.view(1, -1).expand_as(pred))
-
-    res = []
-    for k in topk:
-        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
-        res.append(correct_k.mul_(100.0 / batch_size))
-    return res
-
-
-for epoch in range(1, args.epochs + 1):
-    train(epoch)
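
For reference, a minimal sketch of the distributed setup the deleted script performed: its `--dist-backend`, `--dist-url`, `--world-size`, and `--rank` arguments were passed straight into `torch.distributed.init_process_group`. The backend, endpoint, and rank values below are hypothetical placeholders, not taken from the diff (on Azure ML the estimator injected the real values as script arguments):

    import torch.distributed as dist

    # Hypothetical stand-ins for what args.dist_backend, args.dist_url,
    # args.world_size, and args.rank carried at run time.
    dist.init_process_group(backend='gloo',                     # or 'nccl' on GPU nodes
                            init_method='tcp://10.0.0.4:6000',  # rank-0 endpoint (placeholder)
                            world_size=2,                       # total number of worker processes
                            rank=0)                             # this process's index, 0..world_size-1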