View the best hyperparameters. Open the Amazon SageMaker console and, in the left navigation pane under Training, choose Hyperparameter tuning jobs. Select the tuning job, then choose Best training job. You should see an improvement in training accuracy (80%) over the result from Step 6 (60%).
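The same information is available without leaving the notebook. A minimal sketch using the SageMaker Python SDK's tuning analytics; the job name below is hypothetical, so substitute the name printed by the tuner.fit() call later in this walkthrough:

import sagemaker
# Hypothetical tuning job name; use your own job's name.
tuning_job_name = 'tf-hpo-2020-01-01-00-00-00'
analytics = sagemaker.HyperparameterTuningJobAnalytics(tuning_job_name)
# One row per training job; sort so the best objective value comes first.
df = analytics.dataframe()
print(df.sort_values('FinalObjectiveValue', ascending=False).head())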
https://github.com/aws/amazon-sagemaker-examples/blob/master/advanced_functionality/tensorflow_bring_your_own/utils/generate_cifar10_tfrecords.py
!pip install ipywidgets
!python generate_cifar10_tfrecords.py --data-dir cifar10
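The training script below expects train.tfrecords, validation.tfrecords, and eval.tfrecords under the data directory, so it is worth confirming the generator produced all three before uploading (assuming the flat layout of the linked script):

!ls cifar10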
import time, os, sys
import sagemaker, boto3
import numpy as np
import pandas as pd
sess = boto3.Session()
sm = sess.client('sagemaker')
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session(boto_session=sess)
datasets = sagemaker_session.upload_data(path='cifar10', key_prefix='datasets/cifar10-dataset')
datasets
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
training_experiment = Experiment.create(
    experiment_name = "sagemaker-training-experiments",
    description     = "Experiment to track cifar10 training trials",
    sagemaker_boto_client=sm)
single_gpu_trial = Trial.create(
    trial_name = 'sagemaker-single-gpu-training',
    experiment_name = training_experiment.experiment_name,
    sagemaker_boto_client = sm,
)
trial_comp_name = 'single-gpu-training-job'
experiment_config = {"ExperimentName": training_experiment.experiment_name,
                     "TrialName": single_gpu_trial.trial_name,
                     "TrialComponentDisplayName": trial_comp_name}
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam, SGD
import argparse
import os
import re
import time
HEIGHT = 32
WIDTH = 32
DEPTH = 3
NUM_CLASSES = 10
def single_example_parser(serialized_example):
    """Parses a single tf.Example into image and label tensors."""
    # Dimensions of the images in the CIFAR-10 dataset.
    # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the
    # input format.
    features = tf.io.parse_single_example(
        serialized_example,
        features={
            'image': tf.io.FixedLenFeature([], tf.string),
            'label': tf.io.FixedLenFeature([], tf.int64),
        })
    image = tf.io.decode_raw(features['image'], tf.uint8)
    image.set_shape([DEPTH * HEIGHT * WIDTH])

    # Reshape from [depth * height * width] to [height, width, depth].
    image = tf.cast(
        tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]),
        tf.float32)
    label = tf.cast(features['label'], tf.int32)

    image = train_preprocess_fn(image)
    label = tf.one_hot(label, NUM_CLASSES)
    return image, label
def train_preprocess_fn(image):
    # Resize the image to add four extra pixels on each side.
    image = tf.image.resize_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8)
    # Randomly crop a [HEIGHT, WIDTH] section of the image.
    image = tf.image.random_crop(image, [HEIGHT, WIDTH, DEPTH])
    # Randomly flip the image horizontally.
    image = tf.image.random_flip_left_right(image)
    return image
def get_dataset(filenames, batch_size):
    """Read the images and labels from 'filenames'."""
    # Repeat infinitely.
    dataset = tf.data.TFRecordDataset(filenames).repeat().shuffle(10000)
    # Parse records.
    dataset = dataset.map(single_example_parser, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Batch it up.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset
def get_model(input_shape, learning_rate, weight_decay, optimizer, momentum):
    input_tensor = Input(shape=input_shape)
    base_model = keras.applications.resnet50.ResNet50(include_top=False,
                                                      weights='imagenet',
                                                      input_tensor=input_tensor,
                                                      input_shape=input_shape,
                                                      classes=None)
    x = Flatten()(base_model.output)
    predictions = Dense(NUM_CLASSES, activation='softmax')(x)
    model = Model(inputs=base_model.input, outputs=predictions)
    return model
def main(args):
    # Hyper-parameters
    epochs = args.epochs
    lr = args.learning_rate
    batch_size = args.batch_size
    momentum = args.momentum
    weight_decay = args.weight_decay
    optimizer = args.optimizer

    # SageMaker options
    training_dir = args.training
    validation_dir = args.validation
    eval_dir = args.eval

    train_dataset = get_dataset(training_dir+'/train.tfrecords', batch_size)
    val_dataset = get_dataset(validation_dir+'/validation.tfrecords', batch_size)
    eval_dataset = get_dataset(eval_dir+'/eval.tfrecords', batch_size)

    input_shape = (HEIGHT, WIDTH, DEPTH)
    model = get_model(input_shape, lr, weight_decay, optimizer, momentum)

    # Optimizer
    if optimizer.lower() == 'sgd':
        opt = SGD(lr=lr, decay=weight_decay, momentum=momentum)
    else:
        opt = Adam(lr=lr, decay=weight_decay)

    # Compile model
    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Train model
    history = model.fit(train_dataset, steps_per_epoch=40000 // batch_size,
                        validation_data=val_dataset,
                        validation_steps=10000 // batch_size,
                        epochs=epochs)

    # Evaluate model performance
    score = model.evaluate(eval_dataset, steps=10000 // batch_size, verbose=1)
    print('Test loss    :', score[0])
    print('Test accuracy:', score[1])

    # Save model to model directory
    model.save(f'{os.environ["SM_MODEL_DIR"]}/{time.strftime("%m%d%H%M%S", time.gmtime())}', save_format='tf')
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # Hyper-parameters
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--learning-rate', type=float, default=0.01)
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--weight-decay', type=float, default=2e-4)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--optimizer', type=str, default='sgd')

    # SageMaker parameters
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--training', type=str, default=os.environ['SM_CHANNEL_TRAINING'])
    parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION'])
    parser.add_argument('--eval', type=str, default=os.environ['SM_CHANNEL_EVAL'])

    args = parser.parse_args()
    main(args)
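Because the script takes its data locations from SM_CHANNEL_* environment variables and plain CLI flags, it can be smoke-tested outside SageMaker before launching a job. A hypothetical one-epoch local run, assuming the TFRecord files generated earlier sit in ./cifar10 and TensorFlow 1.15 is installed locally:

!SM_MODEL_DIR=/tmp/model SM_CHANNEL_TRAINING=cifar10 SM_CHANNEL_VALIDATION=cifar10 SM_CHANNEL_EVAL=cifar10 python cifar10-training-sagemaker.py --epochs 1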
from sagemaker.tensorflow import TensorFlow
hyperparams = {'epochs'       : 30,
               'learning-rate': 0.01,
               'batch-size'   : 256,
               'weight-decay' : 2e-4,
               'momentum'     : 0.9,
               'optimizer'    : 'adam'}
bucket_name = sagemaker_session.default_bucket()
output_path = f's3://{bucket_name}/jobs'
metric_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}]
tf_estimator = TensorFlow(entry_point          = 'cifar10-training-sagemaker.py',
                          output_path          = f'{output_path}/',
                          code_location        = output_path,
                          role                 = role,
                          train_instance_count = 1,
                          train_instance_type  = 'ml.g4dn.xlarge',
                          framework_version    = '1.15.2',
                          py_version           = 'py3',
                          script_mode          = True,
                          metric_definitions   = metric_definitions,
                          sagemaker_session    = sagemaker_session,
                          hyperparameters      = hyperparams)
job_name=f'tensorflow-single-gpu-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}'
tf_estimator.fit({'training'  : datasets,
                  'validation': datasets,
                  'eval'      : datasets},
                 job_name=job_name,
                 experiment_config=experiment_config)
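Once fit() returns, the estimator object carries pointers to the finished job, so you can locate the model artifact without browsing S3. A quick sanity check, using attributes exposed by the SageMaker Python SDK:

# Name of the job just run and the S3 URI of the trained model archive.
print(tf_estimator.latest_training_job.name)
print(tf_estimator.model_data)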
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
hyperparameter_ranges = {
    'epochs'        : IntegerParameter(5, 30),
    'learning-rate' : ContinuousParameter(0.001, 0.1, scaling_type='Logarithmic'),
    'batch-size'    : CategoricalParameter(['128', '256', '512']),
    'momentum'      : ContinuousParameter(0.9, 0.99),
    'optimizer'     : CategoricalParameter(['sgd', 'adam'])
}
objective_metric_name = 'val_acc'
objective_type = 'Maximize'
metric_definitions = [{'Name': 'val_acc', 'Regex': 'val_acc: ([0-9\\.]+)'}]
tf_estimator = TensorFlow(entry_point          = 'cifar10-training-sagemaker.py',
                          output_path          = f'{output_path}/',
                          code_location        = output_path,
                          role                 = role,
                          train_instance_count = 1,
                          train_instance_type  = 'ml.g4dn.xlarge',
                          framework_version    = '1.15.2',
                          py_version           = 'py3',
                          script_mode          = True,
                          metric_definitions   = metric_definitions,
                          sagemaker_session    = sagemaker_session)
tuner = HyperparameterTuner(estimator             = tf_estimator,
                            objective_metric_name = objective_metric_name,
                            hyperparameter_ranges = hyperparameter_ranges,
                            metric_definitions    = metric_definitions,
                            max_jobs              = 16,
                            max_parallel_jobs     = 8,
                            objective_type        = objective_type)
job_name=f'tf-hpo-{time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())}'
tuner.fit({'training'  : datasets,
           'validation': datasets,
           'eval'      : datasets},
          job_name=job_name)
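tuner.fit() starts the tuning job and returns without streaming logs, so to pick up the result in code rather than in the console you can block and then ask for the winning trial (both calls are part of the SageMaker Python SDK's HyperparameterTuner):

# Block until all 16 training jobs complete, then report the best one.
tuner.wait()
print(tuner.best_training_job())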
!aws s3 rm --recursive s3://sagemaker-us-west-2-ACCOUNT_NUMBER/datasets/cifar10-dataset
!aws s3 rm --recursive s3://sagemaker-us-west-2-ACCOUNT_NUMBER/jobs
I'm trying to make use of GPUs for training a network in PyTorch, using a "bring your own" container. With the AWS-provided containers, I can just look for SageMaker environment variables such as SM_NUM_GPUS to configure my code for distributed compute. I'm building my own container with Ubuntu as a base, and when I log the environment variables I …
import sagemaker_containers
...
if __name__ == '__main__':
    ...
    env = sagemaker_containers.training_env()
    print('env', env)
    env_vars = env.to_env_vars()
    env.write_env_vars(env_vars)
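For reference, the SM_* variables are written by the sagemaker-containers training toolkit rather than by the platform itself, so in a bare custom image they may simply be absent. A minimal sketch of a fallback that reads the platform's container contract files directly (the resourceconfig.json path comes from the documented SageMaker training container spec; the rest is illustrative):

import json
import os

# Present only if the training toolkit has populated the environment.
num_gpus = int(os.environ.get('SM_NUM_GPUS', 0))
print('SM_NUM_GPUS:', num_gpus)

# SageMaker mounts the cluster resource configuration at this path.
resource_config_path = '/opt/ml/input/config/resourceconfig.json'
if os.path.exists(resource_config_path):
    with open(resource_config_path) as f:
        resource_config = json.load(f)
    print('current host:', resource_config.get('current_host'))
    print('all hosts   :', resource_config.get('hosts'))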