mirror of https://github.com/Azure/MachineLearningNotebooks.git
synced 2025-12-19 17:17:04 -05:00
Delete pytorch_mnist.py
@@ -1,209 +0,0 @@
# Copyright (c) 2017, PyTorch contributors
# Modifications copyright (C) Microsoft Corporation
# Licensed under the BSD license
# Adapted from https://github.com/Azure/BatchAI/tree/master/recipes/PyTorch/PyTorch-GPU-Distributed-Gloo

from __future__ import print_function
import argparse
import os
import shutil
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed
import torchvision.models as models

from azureml.core.run import Run

# get the Azure ML run object
run = Run.get_context()

# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                    help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                    help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
                    help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                    help='learning rate (default: 0.01)')
parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                    help='SGD momentum (default: 0.5)')
parser.add_argument('--seed', type=int, default=1, metavar='S',
                    help='random seed (default: 1)')
parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                    help='how many batches to wait before logging training status')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--world-size', default=1, type=int,
                    help='number of distributed processes')
parser.add_argument('--dist-url', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
parser.add_argument('--rank', default=-1, type=int,
                    help='rank of the worker')

best_prec1 = 0
args = parser.parse_args()
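
# Illustrative two-worker launch using the flags defined above; the host and
# port are placeholders, not values from the original script:
#   python pytorch_mnist.py --world-size 2 --rank 0 --dist-backend gloo \
#       --dist-url tcp://10.0.0.1:23456
#   python pytorch_mnist.py --world-size 2 --rank 1 --dist-backend gloo \
#       --dist-url tcp://10.0.0.1:23456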

args.distributed = args.world_size >= 2

if args.distributed:
    dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                            world_size=args.world_size, rank=args.rank)
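
# Equivalent rendezvous via environment variables (a sketch, not part of the
# original script): if the launcher exports MASTER_ADDR, MASTER_PORT,
# WORLD_SIZE and RANK, the same group can be formed with
#   dist.init_process_group(backend=args.dist_backend, init_method='env://')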

# per-rank data directory avoids concurrent-download clashes between workers
train_dataset = datasets.MNIST('data-%d' % args.rank, train=True, download=True,
                               transform=transforms.Compose([
                                   transforms.ToTensor(),
                                   transforms.Normalize((0.1307,), (0.3081,))
                               ]))

if args.distributed:
    # shard the training set so each worker sees a distinct partition
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
else:
    train_sampler = None

train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size, shuffle=(train_sampler is None),
    num_workers=args.workers, pin_memory=True, sampler=train_sampler)

# evaluate on the held-out MNIST test split at the test batch size
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data-%d' % args.rank, train=False, download=True,
                   transform=transforms.Compose([transforms.ToTensor(),
                                                 transforms.Normalize((0.1307,), (0.3081,))])),
    batch_size=args.test_batch_size, shuffle=False,
    num_workers=args.workers, pin_memory=True)


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
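
# Shape check for the 320-unit flatten in Net.forward (1x28x28 MNIST input):
# conv1 (k=5) -> 10x24x24 -> pool -> 10x12x12,
# conv2 (k=5) -> 20x8x8 -> pool -> 20x4x4 = 320 features.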
model = Net()

if not args.distributed:
    model = torch.nn.DataParallel(model).cuda()
else:
    model.cuda()
    model = torch.nn.parallel.DistributedDataParallel(model)
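
# Note: DistributedDataParallel all-reduces gradients across workers during
# backward(), so every process applies the same averaged update.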

# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda()

optimizer = torch.optim.SGD(model.parameters(), args.lr,
                            momentum=args.momentum, weight_decay=args.weight_decay)


def train(epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        input, target = input.cuda(), target.cuda()

        try:
            # compute output
            output = model(input)
            loss = criterion(output, target)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            losses.update(loss.item(), input.size(0))
            top1.update(prec1[0], input.size(0))
            top5.update(prec5[0], input.size(0))

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % args.log_interval == 0:
                run.log("loss", losses.avg)
                run.log("prec@1", "{0:.3f}".format(top1.avg))
                run.log("prec@5", "{0:.3f}".format(top5.avg))
                print('Epoch: [{0}][{1}/{2}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                      'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                          epoch, i, len(train_loader),
                          batch_time=batch_time, data_time=data_time,
                          loss=losses, top1=top1, top5=top5))
        except Exception as e:
            # avoid a bare except: report the failing batch instead of masking the error
            print("Unexpected error:", repr(e))


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
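
# Example (illustrative): m = AverageMeter(); m.update(2.0); m.update(4.0)
# leaves m.val == 4.0, m.sum == 6.0, m.count == 2, m.avg == 3.0.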


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        # reshape() stays valid even if the slice is non-contiguous
        correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
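
# Worked toy example (illustrative): for logits [[0.1, 0.9], [0.8, 0.2]] and
# targets [1, 0], both top-1 predictions are correct, so
# accuracy(logits, targets, topk=(1,)) returns [tensor([100.])].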


for epoch in range(1, args.epochs + 1):
    train(epoch)
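
# Sketch of a per-epoch reshuffle for the distributed case (an assumption,
# not in the original script): DistributedSampler only reshuffles when told
# the current epoch, e.g.
#     for epoch in range(1, args.epochs + 1):
#         if train_sampler is not None:
#             train_sampler.set_epoch(epoch)
#         train(epoch)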