# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
# Modifications copyright (C) 2017 Uber Technologies, Inc.
# Additional modifications copyright (C) Microsoft Corporation
# Licensed under the Apache License, Version 2.0
# Script adapted from: https://github.com/uber/horovod/blob/master/examples/tensorflow_word2vec.py
# ======================================
"""Basic word2vec example."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
import zipfile
import argparse

import numpy as np
from six.moves import urllib
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf
import horovod.tensorflow as hvd
from azureml.core.run import Run

# Horovod: initialize Horovod.
hvd.init()
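
# How a script like this is usually launched (illustrative note, not part of
# the original example; script and data file names are placeholders):
#   horovodrun -np 4 python this_script.py --input_data ./text8.zip
# Horovod starts one copy of the script per process/GPU. When submitted through
# Azure ML, the MPI launch is typically handled by the job configuration, so no
# manual horovodrun call is needed.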

parser = argparse.ArgumentParser()
parser.add_argument('--input_data', type=str, help='training data')

args = parser.parse_args()

input_data = args.input_data
print("the input data is at %s" % input_data)

# Step 1: Download the data.
url = 'http://mattmahoney.net/dc/text8.zip'


def maybe_download(filename, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
    if not filename:
        filename = "text8.zip"
    if not os.path.exists(filename):
        print("Downloading the data from http://mattmahoney.net/dc/text8.zip")
        filename, _ = urllib.request.urlretrieve(url, filename)
    else:
        print("Use the data from %s" % input_data)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception(
            'Failed to verify ' + url + '. Can you get to it with a browser?')
    return filename


filename = maybe_download(input_data, 31344016)


# Read the data into a list of strings.
def read_data(filename):
    """Extract the first file enclosed in a zip file as a list of words."""
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data


vocabulary = read_data(filename)
print('Data size', len(vocabulary))

# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 50000


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


data, count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)
del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
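
# For reference (added note; exact values depend on the corpus): build_dataset
# re-encodes the corpus as integer IDs where ID 0 is reserved for 'UNK' and
# smaller IDs belong to more frequent words, so `dictionary` might look like
#   {'UNK': 0, 'the': 1, 'of': 2, ...}
# and `reverse_dictionary` maps those IDs back to words.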


# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
    assert num_skips <= 2 * skip_window
    # Adjust batch_size to match num_skips
    batch_size = batch_size // num_skips * num_skips
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    # Backtrack a little bit to avoid skipping words in the end of a batch
    data_index = random.randint(0, len(data) - span - 1)
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window  # target label at the center of the buffer
        targets_to_avoid = [skip_window]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels


batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]],
          '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
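
# What the printout above illustrates (added note; the actual words depend on
# where the random window landed): with skip_window=1 and num_skips=2 every
# center word w_t yields two (input, label) pairs,
#   (w_t, w_{t-1}) and (w_t, w_{t+1}),
# so a batch of 8 covers 4 consecutive center words.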

# Step 4: Build and train a skip-gram model.

max_batch_size = 128
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
num_sampled = 64  # Number of negative examples to sample.

graph = tf.Graph()

with graph.as_default():

    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[None])
    train_labels = tf.placeholder(tf.int32, shape=[None, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Look up embeddings for inputs.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Construct the variables for the NCE loss
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
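
    # Why NCE here (background note, not from the original script): a full
    # softmax over the 50,000-word vocabulary would compute all 50,000 logits
    # for every example. tf.nn.nce_loss instead scores the true label against
    # num_sampled=64 randomly drawn negative words and trains the model to
    # tell them apart, which is far cheaper and still learns useful embeddings.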

    # Horovod: adjust learning rate based on number of GPUs.
    optimizer = tf.train.GradientDescentOptimizer(1.0 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    optimizer = hvd.DistributedOptimizer(optimizer)

    train_op = optimizer.minimize(loss)
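
    # Note (added for clarity): hvd.DistributedOptimizer wraps the underlying
    # optimizer and averages gradients across all workers with an allreduce
    # before they are applied. Because every worker contributes a batch per
    # step, the effective global batch grows with hvd.size(), which is why the
    # learning rate above is scaled by the same factor.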

    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    similarity = tf.matmul(
        valid_embeddings, normalized_embeddings, transpose_b=True)
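
    # Background note (not in the original): because each row of
    # normalized_embeddings has unit L2 norm, the dot products produced by the
    # matmul above are exactly cosine similarities,
    #   cos(a, b) = (a . b) / (||a|| * ||b||),
    # so sorting a row of `similarity` ranks the whole vocabulary by closeness
    # to that validation word.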

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    bcast = hvd.broadcast_global_variables(0)

# Step 5: Begin training.

# Horovod: adjust number of steps based on number of GPUs.
num_steps = 4000 // hvd.size() + 1

# Horovod: pin GPU to be used to process local rank (one GPU per process)
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = str(hvd.local_rank())

with tf.Session(graph=graph, config=config) as session:
    # We must initialize all variables before we use them.
    init.run()
    bcast.run()
    print('Initialized')
    run = Run.get_context()
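
    # Note (added for clarity): Run.get_context() returns the Azure ML run this
    # script executes under (or an offline run when executed locally), and
    # run.log() below streams the loss metric to the run history so it can be
    # charted in the Azure ML portal.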

    average_loss = 0
    for step in xrange(num_steps):
        # Simulate varying sentence lengths by randomizing the batch size.
        batch_size = random.randint(max_batch_size // 2, max_batch_size)
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including
        # it in the list of returned values for session.run()).
        _, loss_val = session.run([train_op, loss], feed_dict=feed_dict)
        average_loss += loss_val

        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            run.log("Loss", average_loss)
            average_loss = 0
    final_embeddings = normalized_embeddings.eval()

    # Evaluate similarity at the end on worker 0.
    if hvd.rank() == 0:
        sim = similarity.eval()
        for i in xrange(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            top_k = 8  # number of nearest neighbors
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            log_str = 'Nearest to %s:' % valid_word
            for k in xrange(top_k):
                close_word = reverse_dictionary[nearest[k]]
                log_str = '%s %s,' % (log_str, close_word)
            print(log_str)