"""Inference/predict code for CIFAR-10
model must be trained before inference,
train_cifar10.py must be executed beforehand.
"""
from __future__ import print_function
import os
import argparse
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training, iterators, serializers, optimizers, Variable, cuda
from chainer.training import extensions
from CNNSmall import CNNSmall
from CNNMedium import CNNMedium
CIFAR10_LABELS_LIST = [
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck'
]
def main():
    archs = {
        'cnnsmall': CNNSmall,
        'cnnmedium': CNNMedium,
    }

    parser = argparse.ArgumentParser(description='Cifar-10 CNN predict code')
    parser.add_argument('--arch', '-a', choices=archs.keys(),
                        default='cnnsmall', help='Convnet architecture')
    #parser.add_argument('--batchsize', '-b', type=int, default=64,
    #                    help='Number of images in each mini-batch')
    parser.add_argument('--modelpath', '-m',
                        default='result-cifar10-cnnsmall/cnnsmall-cifar10.model',
                        help='Model path to be loaded')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    #print('# Minibatch-size: {}'.format(args.batchsize))
    print('')

    # 1. Setup model
    class_num = 10
    model = archs[args.arch](n_out=class_num)
    classifier_model = L.Classifier(model)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()  # Make a specified GPU current
        classifier_model.to_gpu()  # Copy the model to the GPU
    xp = np if args.gpu < 0 else cuda.cupy

    serializers.load_npz(args.modelpath, model)

    # 2. Load the CIFAR-10 dataset
    train, test = chainer.datasets.get_cifar10()

    basedir = 'images'
    plot_predict_cifar(os.path.join(basedir, 'cifar10_predict.png'), model,
                       train, 4, 5, scale=5., label_list=CIFAR10_LABELS_LIST)
def plot_predict_cifar(filepath, model, data, row, col,
                       scale=3., label_list=None):
    fig_width = data[0][0].shape[1] / 80 * row * scale
    fig_height = data[0][0].shape[2] / 80 * col * scale
    fig, axes = plt.subplots(row,
                             col,
                             figsize=(fig_height, fig_width))
    for i in range(row * col):
        # data[i][0] is the i-th image data with size 32x32
        image, label_index = data[i]
        xp = model.xp  # numpy or cupy, depending on where the model lives
        x = Variable(xp.asarray(image.reshape(1, 3, 32, 32)))  # input data
        #t = Variable(xp.asarray([label_index]))  # labels
        y = model(x)  # Inference result
        prediction = y.data.argmax(axis=1)
        image = image.transpose(1, 2, 0)
        print('Predicted {}-th image, prediction={}, actual={}'
              .format(i, prediction[0], label_index))
        r, c = divmod(i, col)
        axes[r][c].imshow(image)  # cmap='gray' is for black and white pictures.
        if label_list is None:
            axes[r][c].set_title('Predict:{}, Answer: {}'
                                 .format(prediction[0], label_index))
        else:
            pred = int(prediction[0])
            axes[r][c].set_title('Predict:{} {}\nAnswer:{} {}'
                                 .format(pred, label_list[pred],
                                         label_index, label_list[label_index]))
        axes[r][c].axis('off')  # do not show axis values
    plt.tight_layout(pad=0.01)  # automatic padding between subplots
    plt.savefig(filepath)
    print('Result saved to {}'.format(filepath))
if __name__ == '__main__':
    main()
This outputs the result as follows.
You can see that even this small CNN successfully classifies most of the images. Of course, this is just a simple example, and you can improve the model accuracy by tuning the deep neural network!
The computational cost of a convolution layer depends on the following quantities:

\( CH_I \): input channel size
\( CH_O \): output channel size
\( H_I \), \( W_I \): input image height & width
\( k \): kernel size (assuming the same for width & height)

In the CNN definitions above, the channel size is larger for deeper layers. This can be understood by calculating the computational cost of each layer.
When L.Convolution2D with stride=2 is used, the output image size becomes roughly half of the input. This means \( H_I \) and \( W_I \) become small values, so \( CH_I \) and \( CH_O \) can take larger values at a similar cost.
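As a rough sketch (the standard estimate for any convolution layer, not a figure specific to these models), the multiply-accumulate count of one convolution layer is

$$ \text{cost} \approx k^2 \times CH_I \times CH_O \times H_O \times W_O $$

where \( H_O \) and \( W_O \) are the output height & width. With stride=2, \( H_O \) and \( W_O \) are halved, reducing the cost by a factor of about 4, so \( CH_I \) and \( CH_O \) can be made larger while keeping the total cost comparable.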
[TODO: add computational cost table for CNN Medium example]
The only small difference is the dataset preparation for CIFAR-10,
# 3. Load the CIFAR-10 dataset
train, test = chainer.datasets.get_cifar10()
and model setup
from CNNSmall import CNNSmall
from CNNMedium import CNNMedium
archs = {
    'cnnsmall': CNNSmall,
    'cnnmedium': CNNMedium,
}
...
class_num = 10
model = archs[args.arch](n_out=class_num)
The whole source code is the following,
from __future__ import print_function
import argparse
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training, iterators, serializers, optimizers
from chainer.training import extensions
from CNNSmall import CNNSmall
from CNNMedium import CNNMedium
def main():
    archs = {
        'cnnsmall': CNNSmall,
        'cnnmedium': CNNMedium,
    }

    parser = argparse.ArgumentParser(description='Cifar-10 CNN example')
    parser.add_argument('--arch', '-a', choices=archs.keys(),
                        default='cnnsmall', help='Convnet architecture')
    parser.add_argument('--batchsize', '-b', type=int, default=64,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result-cifar10',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    # 1. Setup model
    class_num = 10
    model = archs[args.arch](n_out=class_num)
    classifier_model = L.Classifier(model)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()  # Make a specified GPU current
        classifier_model.to_gpu()  # Copy the model to the GPU

    # 2. Setup an optimizer
    optimizer = optimizers.Adam()
    optimizer.setup(classifier_model)

    # 3. Load the CIFAR-10 dataset
    train, test = chainer.datasets.get_cifar10()

    # 4. Setup an Iterator
    train_iter = iterators.SerialIterator(train, args.batchsize)
    test_iter = iterators.SerialIterator(test, args.batchsize,
                                         repeat=False, shuffle=False)

    # 5. Setup an Updater
    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    # 6. Setup a trainer (and extensions)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(extensions.Evaluator(test_iter, classifier_model, device=args.gpu))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(1, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
    trainer.extend(extensions.PlotReport(
        ['main/loss', 'validation/main/loss'],
        x_key='epoch', file_name='loss.png'))
    trainer.extend(extensions.PlotReport(
        ['main/accuracy', 'validation/main/accuracy'],
        x_key='epoch',
        file_name='accuracy.png'))
    trainer.extend(extensions.ProgressBar())

    # Resume from a snapshot
    if args.resume:
        serializers.load_npz(args.resume, trainer)

    # Run the training
    trainer.run()

    serializers.save_npz('{}/{}-cifar10.model'
                         .format(args.out, args.arch), model)

if __name__ == '__main__':
    main()
See how clean the code is! Chainer abstracts the training process, so the code is reusable for other deep learning training tasks.
As expected, CNNMedium takes a little longer to compute, but it achieves higher accuracy on the training data.
※ It is also important to notice that the validation accuracy is almost the same for CNNSmall and CNNMedium, which suggests CNNMedium may be overfitting to the training data. To avoid overfitting, data augmentation (flipping, rotating, clipping, resizing, adding Gaussian noise, etc., to the input image to increase the effective data size) is often used in practice.
Training CIFAR-100
Again, training CIFAR-100 is quite similar to training CIFAR-10.
See train_cifar100.py. The only difference is the model definition, to set the output class number (the model definition itself is unchanged and can be reused!),
# 1. Setup model
class_num = 100
model = archs[args.arch](n_out=class_num)
and dataset preparation
# 3. Load the CIFAR-100 dataset
train, test = chainer.datasets.get_cifar100()
[hands on] Try running the training code.
Summary
We have learned how to train a CNN with Chainer. CNNs are widely used in many image processing tasks, not only image classification. For example,
Bounding Box detection
SSD, YOLO v2
Semantic segmentation
FCN
Colorization
PaintsChainer
Image generation
GAN
Style transfer
chainer-gogh
Super resolution
SeRanet
etc. Now you are ready to move on to these advanced image processing applications of deep learning!
[hands on]
Try modifying the CNN model, or create your own CNN model and train it to see the computational speed and its performance (accuracy). You may try changing the following:
model depth
channel size of each layer
layer type (e.g. use F.max_pooling_2d instead of L.Convolution2D with stride 2)
activation function (F.relu to F.leaky_relu, F.sigmoid, F.tanh, etc.)
Try inserting another layer, e.g. L.BatchNormalization or F.dropout.
CIFAR-10 and CIFAR-100 are small image datasets with classification labels. They are widely used for easy image classification tasks/benchmarks in the research community.
In Chainer, the CIFAR-10 and CIFAR-100 datasets can be obtained with built-in functions.
Setup code:
from __future__ import print_function
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import chainer
basedir = './src/cnn/images'
CIFAR-10
The chainer.datasets.get_cifar10 method is provided in Chainer to get the CIFAR-10 dataset. The dataset is automatically downloaded from https://www.cs.toronto.edu only the first time, and a cached copy is used from the second time on.
The dataset structure is quite similar to the MNIST dataset; it is a TupleDataset. train[i] represents the i-th data; there are 50,000 training examples. The test data structure is the same, with 10,000 test examples.
print('len(train), type ', len(train), type(train))
print('len(test), type ', len(test), type(test))
len(train), type  50000 <class 'chainer.datasets.tuple_dataset.TupleDataset'>
len(test), type  10000 <class 'chainer.datasets.tuple_dataset.TupleDataset'>
train[i] represents the i-th data, a tuple \( (x_i, y_i) \), where \( x_i \) is the image data and \( y_i \) is the label data.
train[i][0] represents \( x_i \), the CIFAR-10 image data: a 3-dimensional array of shape (3, 32, 32), which represents the RGB channels, width 32 px, and height 32 px, respectively.
train[i][1] represents \( y_i \), the label of the CIFAR-10 image data (a scalar), whose actual label name can be looked up with CIFAR10_LABELS_LIST.
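A quick shape check (a minimal sketch; get_cifar10 scales pixel values to float32 in [0, 1] by default):

train, test = chainer.datasets.get_cifar10()
x0, y0 = train[0]
print(x0.shape, x0.dtype)  # (3, 32, 32) float32, pixel values scaled to [0, 1]
print(y0)                  # an integer label in [0, 10)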
def plot_cifar(filepath, data, row, col, scale=3., label_list=None):
    fig_width = data[0][0].shape[1] / 80 * row * scale
    fig_height = data[0][0].shape[2] / 80 * col * scale
    fig, axes = plt.subplots(row,
                             col,
                             figsize=(fig_height, fig_width))
    for i in range(row * col):
        # data[i][0] is the i-th image data with size 32x32
        image, label_index = data[i]
        image = image.transpose(1, 2, 0)
        r, c = divmod(i, col)
        axes[r][c].imshow(image)  # cmap='gray' is for black and white pictures.
        if label_list is None:
            axes[r][c].set_title('label {}'.format(label_index))
        else:
            axes[r][c].set_title('{}: {}'.format(label_index, label_list[label_index]))
        axes[r][c].axis('off')  # do not show axis values
    plt.tight_layout()  # automatic padding between subplots
    plt.savefig(filepath)
CIFAR-100 is really similar to CIFAR-10. The difference is that the number of class labels is 100. The chainer.datasets.get_cifar100 method is provided in Chainer to get the CIFAR-100 dataset.
The dataset structure is quite similar to the MNIST dataset; it is a TupleDataset.
train[i] represents the i-th data; there are 50,000 training examples. The total amount of training data is the same while the number of class labels has increased, so the training data per class label is fewer than in the CIFAR-10 dataset.
The test data structure is the same, with 10,000 test examples.
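A quick check of the sizes and shapes (the label here is the fine label, one of 100 classes):

train, test = chainer.datasets.get_cifar100()
print(len(train), len(test))  # 50000 10000
x0, y0 = train[0]
print(x0.shape, y0)           # (3, 32, 32) and an integer label in [0, 100)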
The source code is uploaded on GitHub. The sample image is obtained from PEXELS.
What is the difference between a convolutional layer and a linear layer? What kind of intuition lies behind using convolutional layers in deep neural networks?
This hands-on shows some effects of the convolutional layer to provide intuition about what convolutional layers do.
import os
import numpy as np
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
basedir = './src/cnn/images'
def read_rgb_image(imagepath):
    image = cv2.imread(imagepath)  # Height, Width, Channel
    (major, minor, _) = cv2.__version__.split(".")
    if major == '3':
        # version 3 is used, need to convert
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    else:
        # version 2 is used, not necessary to convert
        pass
    return image

def read_gray_image(imagepath):
    image = cv2.imread(imagepath)  # Height, Width, Channel
    (major, minor, _) = cv2.__version__.split(".")
    if major == '3':
        # version 3 is used, need to convert
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        # version 2: cv2.imread also returns BGR order, so convert BGR -> GRAY
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return image
def plot_channels(array, filepath='out.jpg'):
    """Plot each channel component separately

    Args:
        array (numpy.ndarray): 3-D array (height, width, channel)
    """
    ch_number = array.shape[2]
    fig, axes = plt.subplots(1, ch_number)
    for i in range(ch_number):
        # Save each image
        # cv2.imwrite(os.path.join(basedir, 'output_conv1_{}.jpg'.format(i)), array[:, :, i])
        axes[i].set_title('Channel {}'.format(i))
        axes[i].axis('off')
        axes[i].imshow(array[:, :, i], cmap='gray')
    plt.savefig(filepath)
The type of diagram above often appears in the convolutional neural network field. The figure below explains its notation.
A cuboid represents the "image" array, where the image may not be a meaningful picture. The horizontal axis represents the channel number, the vertical axis the image height, and the depth axis the image width, respectively.
Convolution layer – basic usage
The input format of a convolutional layer is in the order (batch index, channel, height, width). Since the OpenCV image format is in the order (height, width, channel), this dimension order needs to be converted before input to a convolution layer.
It can be done using the transpose method, as sketched below.
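A minimal sketch of this conversion (the array shape here is an arbitrary placeholder):

import numpy as np

hwc_image = np.zeros((240, 320, 3), dtype=np.float32)  # OpenCV order (Height, Width, Channel)
chw_image = hwc_image.transpose(2, 0, 1)               # -> (Channel, Height, Width)
batched = chw_image[np.newaxis, :, :, :]               # -> (Batch, Channel, Height, Width)
print(batched.shape)  # (1, 3, 240, 320)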
L.Convolution2D(in_channels, out_channels, ksize)
in_channels: input channel number.
out_channels: output channel number.
ksize: kernel size.
The following parameters are also often set:
pad: padding
stride: stride
To understand the behavior of the convolution layer, I recommend looking at the animations at conv_arithmetic.
import chainer.links as L
# Read image from file, save image with matplotlib using `imshow` function
imagepath = os.path.join(basedir, 'sample.jpeg')
image = read_rgb_image(imagepath)
# Height and Width show the pixel size of this image
# Channel=3 indicates the RGB channels
print('image.shape (Height, Width, Channel) = ', image.shape)
conv1 = L.Convolution2D(None, 3, 5)
# Need to input image of the form (batch index, channel, height, width)
image = image.transpose(2, 0, 1)
image = image[np.newaxis, :, :, :]
# Convert from int to float
image = image.astype(np.float32)
print('image shape', image.shape)
out_image = conv1(image).data
print('shape', out_image.shape)
out_image = out_image[0].transpose(1, 2, 0)
print('shape 2', out_image.shape)
plot_channels(out_image,
filepath=os.path.join(basedir, 'output_conv1.jpg'))
#plt.imshow(image)
#plt.savefig('./src/cnn/images/out.jpg')
As you can see from the result, each convolution filter acts to emphasize/extract the color difference along a specific direction. In this way the "filter", also called a "kernel", can be considered a feature extractor.
Convolution with stride
The default value of the stride is 1. If a larger value is specified, the convolution layer reduces the output image size.
Practically, stride=2 is often used to generate an output image whose height & width are almost half of the input image.
As written in the Chainer docs, the relation between input and output shape is given by the following formula:

$$ w_O = \lfloor (w_I + 2w_P - w_K) / s_X \rfloor + 1 $$

where each symbol means:

\( h \): height
\( w \): width
\( I \): input
\( O \): output
\( P \): padding
\( K \): kernel size
\( s_X \): stride in the horizontal (width) direction
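For example, a 32 px wide input with kernel size 3, padding 1 and stride 2 gives \( \lfloor (32 + 2 \cdot 1 - 3) / 2 \rfloor + 1 = 16 \), roughly half. A quick check (the channel sizes here are arbitrary, purely for illustration):

import numpy as np
import chainer.links as L

conv = L.Convolution2D(3, 8, ksize=3, stride=2, pad=1)
x = np.zeros((1, 3, 32, 32), dtype=np.float32)
print(conv(x).shape)  # (1, 8, 16, 16): height & width are halved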
Max pooling
A convolution layer with stride can be used to capture wider-range features; another popular method is max pooling.
The max pooling function extracts the maximum value within the kernel window and discards the information of the remaining pixels.
This behavior is beneficial for imposing translational symmetry. For example, consider a picture of a dog: even if the image is shifted by one pixel, it should still be recognized as a dog. This translational symmetry can be exploited to reduce the model's computation time and the number of internal parameters for image classification tasks.
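As a toy numeric example (a hypothetical 4x4 input; note that F.max_pooling_2d uses stride = ksize by default):

import numpy as np
from chainer import functions as F

x = np.arange(16, dtype=np.float32).reshape(1, 1, 4, 4)
y = F.max_pooling_2d(x, ksize=2)
print(y.data)  # [[[[ 5.  7.]
               #    [13. 15.]]]] : the maximum of each 2x2 window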
from chainer import functions as F
print('input image.shape (Batch, Channel, Height, Width) = ', image.shape)
out_image = F.max_pooling_2d(image, 2).data
print('out_image.shape', out_image.shape)
out_image = out_image[0].transpose(1, 2, 0)
plot_channels(out_image,
filepath=os.path.join(basedir, 'output_max_pooling.jpg'))
Let's see how this CNN can be used for image classification in the following. Before that, the next post explains the CIFAR-10 and CIFAR-100 datasets, which are famous image classification datasets for research.
import os
import matplotlib.pyplot as plt
import cv2
%matplotlib inline
def readRGBImage(imagepath):
    image = cv2.imread(imagepath)  # Height, Width, Channel
    (major, minor, _) = cv2.__version__.split(".")
    if major == '3':
        # version 3 is used, need to convert
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    else:
        # version 2 is used, not necessary to convert
        pass
    return image
Loading and saving images
cv2.imread for loading an image.
cv2.imwrite for saving an image.
plt.imshow for plotting, and plt.savefig for saving the plotted image.
The OpenCV image format usually has 3 dimensions (or 2 dimensions if the image is grayscale).
The 1st dimension is the height, the 2nd dimension the width, and the 3rd dimension the channel (RGB, YUV, etc.).
To convert the color format, cv2.cvtColor can be used. Details are written in the next section.
# Read image from file, save image with matplotlib using `imshow` function
basedir = './src/cnn/images'
imagepath = os.path.join(basedir, 'sample.jpeg')
#image = cv2.imread(imagepath, cv2.IMREAD_GRAYSCALE)
image = readRGBImage(imagepath)
# Height and Width show the pixel size of this image
# Channel=3 indicates the RGB channels
print('image.shape (Height, Width, Channel) = ', image.shape)
# Save image with openCV.
# This may look like a blue image, because the array is RGB while cv2.imwrite expects BGR.
cv2.imwrite('./src/cnn/images/out.jpg', image)
# bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
# cv2.imwrite('./src/cnn/images/out.jpg', bgr_image)
# Plotting
plt.imshow(image)
plt.savefig('./src/cnn/images/out_plt.png')
Note that OpenCV version 3 reads the image color in the order B, G, R. However, matplotlib deals with the image color in the order R, G, B, so you need to convert the color order; see the readRGBImage function.
If the image is grayscale, the image is a 2-dimensional array:
the 1st dimension is the height and the 2nd dimension the width.
We have already learned how to write training code in Chainer; the last task is to use the trained model to run inference (prediction) on the test MNIST images.
The structure of inference code is usually as follows:
Prepare input data
Instantiate the trained model
Load the trained model
Feed input data into loaded model to get inference result
You have already learned the necessary stuff, so it is easy. See inference_mnist.py for the source code.
Prepare input data
For MNIST, it takes just one line,
# Load the MNIST dataset
train, test = chainer.datasets.get_mnist()
Instantiate the trained model and load the model
# Load trained model
model = mlp.MLP(args.unit, 10)
if args.gpu >= 0:
    chainer.cuda.get_device(args.gpu).use()  # Make a specified GPU current
    model.to_gpu()  # Copy the model to the GPU
xp = np if args.gpu < 0 else cuda.cupy
serializers.load_npz(args.modelpath, model)
Here, note that the model can be loaded after instantiating it. The instantiated model must have the same structure (hidden unit size, layer depth, etc.) as the model saved in the training stage.
Feed input data into loaded model to get inference result
The code below gets the inference result y from the test input data x.
for i in range(len(test)):
    x = Variable(xp.asarray([test[i][0]]))  # test data
    # t = Variable(xp.asarray([test[i][1]]))  # labels
    y = model(x)  # Inference result
Visualize the result
You might want to see the inference result together with the input image to understand it more precisely. The code below draws a plot of the test input images and their inference results.
"""Original code referenced from https://github.com/hido/chainer-handson"""
ROW = 4
COLUMN = 5
# show graphical results of first 20 data to understand what's going on in inference stage
plt.figure(figsize=(15, 10))
for i in range(ROW * COLUMN):
    # Example of predicting the test input one by one.
    x = Variable(xp.asarray([test[i][0]]))  # test data
    # t = Variable(xp.asarray([test[i][1]]))  # labels
    y = model(x)
    np.set_printoptions(precision=2, suppress=True)
    print('{}-th image: answer = {}, predict = {}'.format(i, test[i][1], F.softmax(y).data))
    prediction = y.data.argmax(axis=1)
    example = (test[i][0] * 255).astype(np.int32).reshape(28, 28)
    plt.subplot(ROW, COLUMN, i + 1)
    plt.imshow(example, cmap='gray')
    plt.title("No.{0} / Answer:{1}, Predict:{2}".format(i, test[i][1], prediction))
    plt.axis("off")
plt.tight_layout()
plt.savefig('inference.png')
Even though only 50 hidden units are used, the accuracy of inferring the MNIST digits is quite high.
That's all for the MNIST dataset tutorial. You have now learned the basics of how to use a deep learning framework: how to write training code and how to write inference code with Chainer. You are now ready to go further into specialized categories: Convolutional Neural Networks are used widely, especially in image processing; Recurrent Neural Networks in language processing; etc.
An image processing library for deep learning training. Common data augmentations are implemented. Trained models for bounding box detection and semantic segmentation are also provided.
It was announced at the Deep Learning Summit 2017 that training for the ImageNet classification task took 4.4 hours (ResNet-50, 100 epochs, 128 GPUs), which is the fastest among distributed deep learning frameworks known to date.
Until now, I was implementing the training code in a "primitive" way to explain what kind of operations are going on in deep learning training (※). However, the code can be written in a much cleaner way using the Trainer modules in Chainer.
※ Trainer modules have been implemented since version 1.11, and some open source projects are implemented without Trainer. So it also helps in understanding those codebases to know the training implementation without the Trainer module.
Motivation for using Trainer
We can notice that there are many "typical" operations widely used in machine learning, for example:
Iterating minibatch training, with minibatches sampled randomly
Separating train data & validation data; validation is used only for checking the loss, to prevent overfitting
Outputting the log and saving the trained model at regular intervals
These operations are commonly needed, and Chainer provides these features at the library level so that users don't need to implement them again and again. Trainer will manage the training code for you!
Details are also explained in the official document of Trainer.
Source code with Trainer
Seeing is believing: train_mnist_4_trainer.py is the source code which uses the Trainer module. If I remove the comments, the source code looks like below.
from __future__ import print_function
import argparse
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import training
from chainer.training import extensions
from chainer import serializers
import mlp as mlp
def main():
    parser = argparse.ArgumentParser(description='Chainer example: MNIST')
    parser.add_argument('--batchsize', '-b', type=int, default=100,
                        help='Number of images in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result/4',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--unit', '-u', type=int, default=50,
                        help='Number of units')
    args = parser.parse_args()

    print('GPU: {}'.format(args.gpu))
    print('# unit: {}'.format(args.unit))
    print('# Minibatch-size: {}'.format(args.batchsize))
    print('# epoch: {}'.format(args.epoch))
    print('')

    model = mlp.MLP(args.unit, 10)
    classifier_model = L.Classifier(model)
    if args.gpu >= 0:
        chainer.cuda.get_device(args.gpu).use()  # Make a specified GPU current
        classifier_model.to_gpu()  # Copy the model to the GPU

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(classifier_model)

    train, test = chainer.datasets.get_mnist()

    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    test_iter = chainer.iterators.SerialIterator(test, args.batchsize,
                                                 repeat=False, shuffle=False)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(test_iter, classifier_model, device=args.gpu))
    trainer.extend(extensions.dump_graph('main/loss'))
    trainer.extend(extensions.snapshot(), trigger=(1, 'epoch'))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'validation/main/loss',
         'main/accuracy', 'validation/main/accuracy', 'elapsed_time']))
    trainer.extend(extensions.ProgressBar())

    if args.resume:
        # Resume from a snapshot
        serializers.load_npz(args.resume, trainer)

    trainer.run()
    serializers.save_npz('{}/mlp.model'.format(args.out), model)

if __name__ == '__main__':
    main()
See how clean the code is! Compare the above code with train_mnist_2_predictor_classifier.py. It does not even explicitly contain a for loop, the random permutation for minibatches, or a save call. In addition, it will:
Save a trainer snapshot at regular intervals (including optimizer and model data), so you can pause and resume training.
Print the log in a formatted way, together with a progress bar showing the training status.
Output the training result to a log file as JSON-formatted text.
However, since the code has changed so much from the previous version, you might not understand what's going on. Several modules are used together with the Trainer, so let's see an overview of the role of each module one by one.
There are several Dataset classes, TupleDataset, ImageDataset, etc., and you can even define your custom Dataset class by using DatasetMixin; see the sketch below.
All Datasets follow the common rule that when data is a Dataset instance, data[i] points to the i-th data.
Usually it consists of input data and target data (the answer), where data[i][0] is the i-th input data and data[i][1] is the i-th target data. However, it can be only one element, or even more than 2 elements, depending on the problem.
Role: prepares the input values and provides index access to the data. Specifically, the i-th data can be accessed by data[i], so that an Iterator can handle it.
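For illustration, a minimal custom Dataset sketch using DatasetMixin (the class name and random data are hypothetical):

import numpy as np
from chainer.dataset import DatasetMixin

class MyDataset(DatasetMixin):
    """Each item is a tuple (x, t): a float vector and an integer target."""

    def __init__(self, n=100):
        self.x = np.random.rand(n, 3).astype(np.float32)
        self.t = np.random.randint(0, 2, size=n).astype(np.int32)

    def __len__(self):
        return len(self.x)

    def get_example(self, i):
        # DatasetMixin routes data[i] to this method
        return self.x[i], self.t[i]

data = MyDataset()
print(len(data), data[0])  # index access works just like TupleDataset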
This one line provides almost the same functionality as the following training loop,
# Learning loop
for epoch in six.moves.range(1, n_epoch + 1):
    # training
    perm = np.random.permutation(N)
    for i in six.moves.range(0, N, batchsize):
        x = chainer.Variable(xp.asarray(train[perm[i:i + batchsize]][0]))
        t = chainer.Variable(xp.asarray(train[perm[i:i + batchsize]][1]))

        # Pass the loss function (Classifier defines it) and its arguments
        optimizer.update(classifier_model, x, t)
and the same applies to the validation (test) dataset,
for i in six.moves.range(0, N_test, batchsize):
    index = np.asarray(list(range(i, i + batchsize)))
    x = chainer.Variable(xp.asarray(test[index][0]), volatile='on')
    t = chainer.Variable(xp.asarray(test[index][1]), volatile='on')

    loss = classifier_model(x, t)
Minibatch random sampling, implemented above with np.random.permutation, can be replaced by simply setting the shuffle flag to True or False (the default is True).
Currently, 2 Iterator classes are provided:
SerialIterator is the most basic class.
MultiProcessIterator provides multi-process data preparation support in the background.
Both of them have the same interface and can be used interchangeably.
Role: constructs minibatches from a Dataset (including background preparation support using multiple processes) and passes them to the Updater. A usage sketch follows.
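A minimal usage sketch (the batch size is a placeholder; MNIST is used only because it is built in):

from chainer import datasets, iterators

train, test = datasets.get_mnist()
train_iter = iterators.SerialIterator(train, batch_size=100)  # shuffle=True by default
# Drop-in replacement with background worker processes:
# train_iter = iterators.MultiProcessIterator(train, batch_size=100)

batch = train_iter.next()  # a list of 100 (x, t) tuples
print(len(batch), batch[0][0].shape, batch[0][1])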
Usually, extensions are registered before calling run on the trainer; see below.
Role: manages the training life cycle. Extensions can be registered on it.
Trainer extension
A Trainer extension can be registered by the trainer.extend() function.
The following extensions are used in this example:
Evaluator: calculates the validation loss and accuracy, which are printed out and logged to file.
LogReport: outputs the log file in JSON format, in the directory specified by the out argument of the trainer.
PrintReport: prints the log to standard output (the console) to show the training status.
ProgressBar: shows a progress bar indicating the current progress of training.
snapshot: saves the trainer state (including model and optimizer information) at regular intervals. With this extension, you can pause and resume training.
Role: hooks into the trainer, via triggers, to execute events at specific timings.
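For example, a minimal custom extension can be written as a function decorated with training.make_extension (the extension body here is hypothetical):

from chainer import training

@training.make_extension(trigger=(1, 'epoch'))
def print_epoch(trainer):
    # Called once per epoch; the trainer state is accessible here.
    print('epoch {} finished'.format(trainer.updater.epoch))

# Registered in the same way as the built-in extensions:
# trainer.extend(print_epoch)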
Trainer architecture summary
Refer to the figure above for the training abstraction procedure using the Trainer module.
Advantage of using Trainer module
– Multi-process data preparation using MultiProcessIterator
Python has the GIL, so even if you use multiple threads, they are not executed in "parallel". If the code contains heavy data preprocessing (e.g. data augmentation, or adding noise before feeding the input), you can benefit from using MultiProcessIterator.
– Multiple GPU utilization
– ParallelUpdater or MultiProcessParallelUpdater
– Trainer extensions are useful and reusable once you have made your own extension
– PrintReport
– ProgressBar
– LogReport
— The log is in JSON format, so it is easy to load and, for example, plot a learning-curve graph, as sketched below.
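A sketch of loading the JSON log and plotting a learning curve (the log path assumes the out directory used above; the keys match the PrintReport entries):

import json
import matplotlib.pyplot as plt

with open('result/4/log') as f:
    log = json.load(f)  # LogReport writes a JSON list, one dict per report interval

epochs = [entry['epoch'] for entry in log]
plt.plot(epochs, [entry['main/loss'] for entry in log], label='main/loss')
plt.plot(epochs, [entry['validation/main/loss'] for entry in log],
         label='validation/main/loss')
plt.xlabel('epoch')
plt.legend()
plt.savefig('learning_curve.png')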
Two Chain classes, a "Predictor" and a "Classifier", are used in this framework.
Training phase: the Predictor's output is fed into the Classifier to calculate the loss.
Predict/inference phase: only the Predictor's output is used.
Predictor
The Predictor simply calculates the output based on the input.
# Network definition Chainer v2
# 1. `init_scope()` is used to initialize links for IDE friendly design.
# 2. input size of Linear layer can be omitted
class MLP(chainer.Chain):

    def __init__(self, n_units, n_out):
        super(MLP, self).__init__()
        with self.init_scope():
            # input size of each layer will be inferred when omitted
            self.l1 = L.Linear(n_units)  # n_in -> n_units
            self.l2 = L.Linear(n_units)  # n_units -> n_units
            self.l3 = L.Linear(n_out)    # n_units -> n_out

    def __call__(self, x):
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        return self.l3(h2)
model = mlp.MLP(args.unit, 10)
Classifier
The Classifier "wraps" the predictor's output y to calculate the loss between y and the actual target t.
classifier_model = L.Classifier(model)
optimizer.update(classifier_model, x, t)
which invokes classifier_model(x, t) internally, calculates the loss, and updates the internal parameters by backpropagation.
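Conceptually, the forward pass of L.Classifier does something like the following (a simplified sketch; the real implementation also reports these values and lets you customize lossfun and accfun):

import chainer.functions as F

def classifier_call(predictor, x, t):
    # Roughly what classifier_model(x, t) computes internally
    y = predictor(x)                      # forward through the wrapped predictor
    loss = F.softmax_cross_entropy(y, t)  # default loss function
    accuracy = F.accuracy(y, t)           # computed for reporting/logging
    return loss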
Both the loss calculation for the training phase and the predict code for the inference phase are implemented within one model, and the behavior is managed by a "train flag" (or "test flag"/"predict flag").
# Network definition
class MLP(chainer.Chain):

    def __init__(self, n_units, n_out):
        super(MLP, self).__init__()
        with self.init_scope():
            self.l1 = L.Linear(None, n_units)  # n_in -> n_units
            self.l2 = L.Linear(None, n_units)  # n_units -> n_units
            self.l3 = L.Linear(None, n_out)    # n_units -> n_out
        # Define train flag
        self.train = True

    def __call__(self, x, t=None):
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        y = self.l3(h2)
        if self.train:
            # return loss in training phase
            self.loss = F.softmax_cross_entropy(y, t)
            self.accuracy = F.accuracy(y, t)
            return self.loss
        else:
            # return y in predict/inference phase
            return y
By default, self.train = True, and this model will calculate the loss so that the optimizer can update its internal parameters.
To predict a value, we can set the train flag to False,
model.train = False
y = model(x)
# model.train = True # if necessary
Comparison
The Predictor-Classifier framework has the advantage that the Classifier module is independent and reusable. However, if the loss calculation is complicated, it is difficult to apply this framework.
In the train-flag framework, the training loss calculation and the predict calculation can be independent; you can implement any loss calculation, even one that is very different from the predict calculation.
Basically, you can use the Predictor-Classifier framework if the loss function is typical, and the train-flag framework otherwise.