在cifar10上训练ResNet

CIFAR-10 dataset

CIFAR-10 数据集包含60000幅大小为 32 x 32 的彩色图像，这些图像分别属于10个类，每个类 6000 幅图像。数据集分为5个训练batch和1个测试batch，每个batch包含 10000 幅图像。

数据准备

下载 CIFAR-10

打开终端，在 $CAFFE_ROOT （caffe 的根目录）下输入命令：

$ ./data/cifar10/get_cifar10.sh

下载完成后，会在 $CAFFE_ROOT/data/cifar10/ 中生成数据 batch，不过这些都是二进制文件，需要转换为 LMDB 格式。

生成 LMDB 文件

$ ./examples/cifar10/create_cifar10.sh

这样就生成了 LMDB 文件，同时生成了训练数据 (cifar10_train_lmdb) 的均值文件 mean.binaryproto。均值文件的计算方式是对所有训练样本在每个通道的每个像素位置上求平均，在 CIFAR-10 中，训练数据 size 为 50000 x 3 x 32 x 32 (样本数 x 图像通道数 x 图像高度 x 图像宽度)，则均值的 size 是 3 x 32 x 32。

搭建网络

在 $CAFFE_ROOT/examples/ 目录下新建文件夹 ResNet，并在该文件夹下新建 resnet_cifar10.py。

import modules

import os
import caffe
import numpy as np
from caffe import layers as L, params as P
# Directory containing this script. It is resolved to an absolute path because
# the dataset paths, the generated prototxt paths and the snapshot prefix are
# all derived from it, and they must stay valid no matter which directory the
# script (or, later, the caffe binary) is launched from.
workdir = os.path.dirname(os.path.abspath(__file__))

网络层函数

ResNet 中两个基本的结构如下：

定义层函数：

def conv_BN_scale(ntype, bottom, nout, ks, stride = 1, pad = 0, bias_term = True):
    """Stack Convolution -> BatchNorm -> Scale on top of ``bottom``.

    Args:
        ntype: 'train' makes the BatchNorm layer use the statistics of the
            current batch; any other value makes it use the global statistics.
        bottom: input of the convolution.
        nout: number of convolution output channels (``num_output``).
        ks: convolution kernel size.
        stride: convolution stride.
        pad: convolution padding.
        bias_term: whether the convolution has a bias. When true, a constant
            bias filler and a second param spec (lr_mult = 2, decay_mult = 0)
            are added for it.

    Returns:
        The tuple ``(conv, BN, scale)``. BatchNorm and Scale are created with
        ``in_place = True``.
    """
    # Settings shared by both variants; only the bias-related entries differ.
    conv_kwargs = dict(num_output = nout, kernel_size = ks,
                       stride = stride, pad = pad, bias_term = bias_term,
                       weight_filler = dict(type = 'xavier'),
                       param = [dict(lr_mult = 1, decay_mult = 1)])
    if bias_term:
        conv_kwargs['bias_filler'] = dict(type = 'constant')
        conv_kwargs['param'].append(dict(lr_mult = 2, decay_mult = 0))
    conv = L.Convolution(bottom, **conv_kwargs)

    # Attention: BatchNorm must be configured differently per phase.
    #   training: use_global_stats = False -> mean/variance of each batch
    #             (this is also what caffe does when the field is omitted)
    #   testing:  use_global_stats = True  -> mean/variance over all the data
    use_global_stats = (ntype != 'train')
    BN = L.BatchNorm(conv, batch_norm_param = dict(use_global_stats = use_global_stats),
                     in_place = True)

    scale = L.Scale(BN, scale_param = dict(bias_term = True), in_place = True)

    return conv, BN, scale
def conv_BN_scale_relu(ntype, bottom, nout, ks, stride = 1, pad = 0, bias_term = True):
    """Build the conv/BN/scale stack of ``conv_BN_scale`` and append a ReLU.

    All arguments are forwarded unchanged to ``conv_BN_scale``.

    Returns:
        The tuple ``(conv, BN, scale, relu)``; the ReLU is applied in place on
        the Scale output.
    """
    stack = tuple(conv_BN_scale(ntype, bottom, nout, ks, stride, pad, bias_term))
    activation = L.ReLU(stack[-1], in_place = True)
    return stack + (activation,)
def set_branch(n, branch_name, conv, BN, scale, relu = None):
    """Attach the layers of one branch to ``n`` under ResNet-style names.

    The first three characters of ``branch_name`` (the ``res`` prefix in names
    such as ``res2a_branch1``) are replaced by ``bn`` / ``scale`` for the
    BatchNorm and Scale layers, e.g. ``bn2a_branch1`` and ``scale2a_branch1``.

    Args:
        n: object the layers are set on as attributes (the net specification).
        branch_name: attribute name used for the convolution layer.
        conv: convolution layer handle.
        BN: BatchNorm layer handle.
        scale: Scale layer handle.
        relu: optional ReLU handle, stored as ``<branch_name>_relu``.
    """
    suffix = branch_name[3:]
    setattr(n, branch_name, conv)
    setattr(n, 'bn%s' % suffix, BN)
    setattr(n, 'scale%s' % suffix, scale)
    # Compare against None explicitly: the handle is an arbitrary object and
    # must not be dropped just because it happens to evaluate as falsy.
    if relu is not None:
        setattr(n, '%s_relu' % branch_name, relu)

残差模块

ResNet 论文中在 CIFAR-10 数据集上实验采用的残差模块由两层 3x3 卷积加一个 skip connection构成：

定义残差模块：

def ResNet_block(n, layer_name, ntype, bottom, nout, proj_stride, ks, stride = 1, pad = 0, bias_term = False):
    """Add one residual block (two convolutions plus a shortcut) to ``n``.

    Args:
        n: net specification the layers are registered on.
        layer_name: base name of the block, e.g. ``res2a``; branch layers are
            named ``<layer_name>_branch1`` / ``_branch2a`` / ``_branch2b``.
        ntype: 'train' or test phase, forwarded to the BatchNorm setup.
        bottom: input of the block.
        nout: number of output channels of every convolution in the block.
        proj_stride: 0 means identity shortcut (no 1x1 projection). Any other
            value adds a 1x1 projection with that stride on the shortcut and
            uses the same stride for the first convolution of the main branch:
            1 changes only the channel count, 2 changes the channel count and
            the feature map size.
        ks: kernel size of the two main-branch convolutions.
        stride: stride of the second main-branch convolution.
        pad: padding of the two main-branch convolutions.
        bias_term: whether the convolutions have a bias.

    Returns:
        The in-place ReLU applied to the element-wise sum of both branches.
    """
    if proj_stride == 0:
        # Identity shortcut: the input is added back unchanged.
        scale1 = bottom
        stride1 = 1
    else:
        # Projection shortcut: a 1x1 convolution matches the dimensions.
        branch_name = '%s_branch1' % layer_name
        conv1, BN1, scale1 = conv_BN_scale(ntype, bottom, nout, 1, proj_stride, 0, bias_term)
        set_branch(n, branch_name, conv1, BN1, scale1)
        stride1 = proj_stride

    conv2a, BN2a, scale2a, relu2a = conv_BN_scale_relu(ntype, bottom, nout, ks, stride1, pad, bias_term)
    set_branch(n, '%s_branch2a' % layer_name, conv2a, BN2a, scale2a, relu2a)
    conv2b, BN2b, scale2b = conv_BN_scale(ntype, relu2a, nout, ks, stride, pad, bias_term)
    set_branch(n, '%s_branch2b' % layer_name, conv2b, BN2b, scale2b)

    # Eltwise operations are PROD = 0, SUM = 1, MAX = 2; SUM is the default,
    # so no explicit operation is passed.
    ewise = L.Eltwise(scale1, scale2b)
    setattr(n, layer_name, ewise)
    ewise_relu = L.ReLU(ewise, in_place = True)
    setattr(n, '%s_relu' % layer_name, ewise_relu)
    return ewise_relu

ResNet

论文中定义在 CIFAR-10 数据集上实验的 ResNet 结构为：

layer name	output size	ResNet
conv1	32x32	3x3, 16, stride 1
conv2_x	32x32	$$\left[ \begin{matrix} 3\times3, 16 \\ 3\times3, 16 \end{matrix} \right] \times n$$
conv3_x	16x16	$$\left[ \begin{matrix} 3\times3, 32 \\ 3\times3, 32 \end{matrix} \right] \times n$$
conv4_x	8x8	$$\left[ \begin{matrix} 3\times3, 64 \\ 3\times3, 64 \end{matrix} \right] \times n$$
pool4	1x1	average pool
fc10	10	10-d fc

定义ResNet网络：

def ResNet(ntype, num_blocks = 3):
    """Build the CIFAR-10 ResNet and return it as prototxt text.

    Args:
        ntype: 'train' reads the training LMDB with random mirroring; any
            other value reads the test LMDB without mirroring.
        num_blocks: number of residual blocks per stage (the ``n`` of the
            architecture table). Blocks are suffixed with letters, so at most
            six blocks per stage are supported.

    Returns:
        The network definition serialized with ``str(n.to_proto())``.

    Raises:
        ValueError: if ``num_blocks`` is not between 1 and 6.
    """
    alphabet = ['a','b','c','d','e','f']
    if not 0 < num_blocks <= len(alphabet):
        raise ValueError('num_blocks must be between 1 and %d, got %r'
                         % (len(alphabet), num_blocks))

    # Locations of the LMDB databases and of the mean file.
    datadir = os.path.join(os.path.dirname(workdir), 'cifar10')
    train_file = os.path.join(datadir, 'cifar10_train_lmdb')
    test_file = os.path.join(datadir, 'cifar10_test_lmdb')
    mean_file = os.path.join(datadir, 'mean.binaryproto')

    n = caffe.NetSpec()
    # source: path of the database to read
    # backend: database format
    # ntop: number of outputs; two here, n.data and n.labels
    #       (in caffe, bottom is a layer's input and top is its output)
    # mirror: random horizontal flipping, default False
    if ntype == 'train':
        n.data, n.labels = L.Data(source = train_file, backend = P.Data.LMDB,
                                  batch_size = 128, ntop = 2,
                                  transform_param = dict(mean_file = mean_file,
                                                         mirror = True))
    else:
        # Test data is only evaluated, so it is not flipped.
        n.data, n.labels = L.Data(source = test_file, backend = P.Data.LMDB,
                                  batch_size = 128, ntop = 2,
                                  transform_param = dict(mean_file = mean_file))

    # conv1, input size: [3, 32, 32]; output size: [16, 32, 32]
    n.conv1, n.bn_conv1, n.scale_conv1, n.conv1_relu = conv_BN_scale_relu(ntype, n.data, nout = 16, ks = 3,
                                                                          stride = 1, pad = 1)

    # (stage index, output channels, projection stride of the first block)
    #   conv2_x: [16, 32, 32] -> [16, 32, 32], identity shortcuts only
    #   conv3_x: [16, 32, 32] -> [32, 16, 16]
    #   conv4_x: [32, 16, 16] -> [64, 8, 8]
    # Only the first block of conv3_x / conv4_x changes the dimensions, so
    # only that block needs a stride-2 projection on its shortcut.
    stages = [(2, 16, 0), (3, 32, 2), (4, 64, 2)]

    top = n.conv1_relu
    for stage, nout, first_proj_stride in stages:
        for i in range(num_blocks):
            proj_stride = first_proj_stride if i == 0 else 0
            layer_name = 'res{}{}'.format(stage, alphabet[i])
            top = ResNet_block(n, layer_name, ntype, top, nout = nout, proj_stride = proj_stride,
                               ks = 3, stride = 1, pad = 1)

    # pool4: global average pooling
    n.pool4 = L.Pooling(top, pool=P.Pooling.AVE, global_pooling = True)
    # fc10, InnerProduct
    n.fc10 = L.InnerProduct(n.pool4, num_output = 10,
                            weight_filler = dict(type = 'xavier'),
                            bias_filler = dict(type = 'constant'))
    n.acc = L.Accuracy(n.fc10, n.labels)
    n.loss = L.SoftmaxWithLoss(n.fc10, n.labels)

    return str(n.to_proto())

将网络结构写入 `prototxt` 文件

def write_net():
    """Generate the train and test network definitions inside ``workdir``.

    ``train.prototxt`` receives ``ResNet('train')`` and ``val.prototxt``
    receives ``ResNet('test')``.
    """
    targets = (('train.prototxt', 'train'), ('val.prototxt', 'test'))
    for filename, phase in targets:
        with open(os.path.join(workdir, filename), 'w') as out:
            out.write(ResNet(phase))


if __name__ == '__main__':
    write_net()

生成 `solver.prototxt` 文件

添加 tools 模块

import sys
# $CAFFE_ROOT/examples/pycaffe/tools.py 中定义了 CaffeSolver 类
# 将 $CAFFE_ROOT/examples/pycaffe 加入路径
tools_path = os.path.join(workdir, os.pardir, 'pycaffe')
if tools_path not in sys.path:
    sys.path.append(tools_path)
import tools

修改 `tools.py`

由于采用 multistep 学习策略时，可能有多个 stepvalue 值，若直接以字典赋值会覆盖，故修改 tools.py 文件的 write() 函数。

def write(self, filepath):
    """
    Export solver parameters to INPUT "filepath". Sorted alphabetically.

    Every parameter in ``self.sp`` is written as one ``key: value`` line and
    its value must be a string. The only exception is ``stepvalue``, which
    holds a sequence of integers and is written as one ``stepvalue: N`` line
    per entry, so that a multistep policy can carry several step values.

    Raises:
        TypeError: if a parameter other than ``stepvalue`` is not a string.
    """
    # Build and validate all lines before touching the file, so that an
    # invalid parameter does not leave a truncated solver file behind.
    lines = []
    for key, value in sorted(self.sp.items()):
        if key == 'stepvalue':
            lines.extend('%s: %d\n' % (key, step) for step in value)
        elif isinstance(value, str):
            lines.append('%s: %s\n' % (key, value))
        else:
            raise TypeError('All solver parameters must be strings')

    # The context manager guarantees the file is flushed and closed.
    with open(filepath, 'w') as f:
        f.writelines(lines)

写入 `solver.prototxt`

def write_net():
    """Write the train/test network definitions and the solver file.

    Creates ``train.prototxt``, ``val.prototxt`` and ``solver.prototxt`` in
    ``workdir``. The solver uses a multistep learning-rate policy with the
    step values 32000 and 48000 and runs for 64000 iterations.
    """
    # write train and test net.
    trainnet_path = os.path.join(workdir, 'train.prototxt')
    with open(trainnet_path, 'w') as f:
        f.write(ResNet('train'))
    testnet_path = os.path.join(workdir, 'val.prototxt')
    with open(testnet_path, 'w') as f:
        f.write(ResNet('test'))

    # The snapshot prefix below points into <workdir>/snapshot; make sure that
    # directory exists so the solver has somewhere to write its snapshots.
    snapshot_dir = os.path.join(workdir, 'snapshot')
    if not os.path.isdir(snapshot_dir):
        os.makedirs(snapshot_dir)

    # write solver file.
    solver_path = os.path.join(workdir, 'solver.prototxt')
    solver_prototxt = tools.CaffeSolver(trainnet_prototxt_path = trainnet_path, testnet_prototxt_path = testnet_path)
    solver_prototxt.sp['base_lr'] = '0.1'
    solver_prototxt.sp['lr_policy'] = '"multistep"'
    solver_prototxt.sp['weight_decay'] = '0.0001'
    # A list, not a string: the modified write() emits one line per entry.
    solver_prototxt.sp['stepvalue'] = [32000, 48000]
    solver_prototxt.sp['max_iter'] = '64000'
    solver_prototxt.sp['test_interval'] = '500'  # run test every 500 training iter
    solver_prototxt.sp['display'] = '100'        # display every 100 iter
    solver_prototxt.sp['snapshot'] = '5000'
    solver_prototxt.sp['snapshot_prefix'] = '"%s/snapshot/cifar10_snapshot"' % workdir
    solver_prototxt.sp['solver_mode'] = 'GPU'
    solver_prototxt.write(solver_path)

执行训练

在 $CAFFE_ROOT 目录下运行：

$ ./build/tools/caffe train -solver examples/ResNet/solver.prototxt