6.3. Parameter Initialization

Now that we know how to access the parameters, let's look at how to initialize them properly. We discussed the need for proper initialization in Section 5.4. Deep learning frameworks provide default random initializations for their layers. However, we often want to initialize our weights according to various other protocols. The framework provides the most commonly used protocols and also allows us to create custom initializers.

import torch
from torch import nn

By default, PyTorch initializes weight and bias matrices uniformly by drawing from a range that is computed according to the input and output dimensions. PyTorch's nn.init module provides a variety of preset initialization methods.

net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))
X = torch.rand(size=(2, 4))
net(X).shape
torch.Size([2, 1])
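
To see this default concretely, we can inspect the weights that were just materialized. The check below is a small sketch, assuming the default draws lie within roughly \(\pm 1/\sqrt{\textrm{fan}_{\textrm{in}}}\); with fan_in equal to 4 here, that is about \(\pm 0.5\).

# Sanity check of the assumed default range: every entry of the first layer's
# weight should have magnitude of at most about 0.5 when fan_in is 4
net[0].weight.data.abs().max() <= 0.5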
from mxnet import init, np, npx
from mxnet.gluon import nn

npx.set_np()

By default, MXNet initializes weight parameters by randomly drawing from a uniform distribution \(U(-0.07, 0.07)\), clearing bias parameters to zero. MXNet's init module provides a variety of preset initialization methods.

net = nn.Sequential()
net.add(nn.Dense(8, activation='relu'))
net.add(nn.Dense(1))
net.initialize()  # Use the default initialization method

X = np.random.uniform(size=(2, 4))
net(X).shape
(2, 1)
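
As a quick sketch (not part of the original code), we can confirm that the freshly initialized weights indeed fall inside the stated range:

# All default weights should lie in [-0.07, 0.07]
np.abs(net[0].weight.data()).max() <= 0.07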
import jax
from flax import linen as nn
from jax import numpy as jnp
from d2l import jax as d2l

By default, Flax initializes weights using jax.nn.initializers.lecun_normal, i.e., by drawing samples from a truncated normal distribution centered at 0 with standard deviation \(\sqrt{1 / \textrm{fan}_{\textrm{in}}}\), where fan_in is the number of input units in the weight tensor. The bias parameters are all set to zero. Jax's nn.initializers module provides a variety of preset initialization methods.

net = nn.Sequential([nn.Dense(8), nn.relu, nn.Dense(1)])
X = jax.random.uniform(d2l.get_key(), (2, 4))
params = net.init(d2l.get_key(), X)
net.apply(params, X).shape
(2, 1)
import tensorflow as tf

By default, Keras initializes weight matrices uniformly by drawing from a range that is computed according to the input and output dimensions, and the bias parameters are all set to zero. TensorFlow provides a variety of initialization methods both in the root module and in the keras.initializers module.

net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, activation=tf.nn.relu),
    tf.keras.layers.Dense(1),
])

X = tf.random.uniform((2, 4))
net(X).shape
TensorShape([2, 1])

6.3.1. Built-in Initialization

Let's begin by calling on built-in initializers. The code below initializes all weight parameters as Gaussian random variables with standard deviation 0.01, while bias parameters are cleared to zero.

def init_normal(module):
    if type(module) == nn.Linear:
        nn.init.normal_(module.weight, mean=0, std=0.01)
        nn.init.zeros_(module.bias)

net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]
(tensor([-0.0129, -0.0007, -0.0033,  0.0276]), tensor(0.))
# Here force_reinit ensures that parameters are freshly initialized even if
# they were already initialized previously
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)
net[0].weight.data()[0]
array([ 0.00354961, -0.00614133,  0.0107317 ,  0.01830765])
weight_init = nn.initializers.normal(0.01)
bias_init = nn.initializers.zeros

net = nn.Sequential([nn.Dense(8, kernel_init=weight_init, bias_init=bias_init),
                     nn.relu,
                     nn.Dense(1, kernel_init=weight_init, bias_init=bias_init)])

params = net.init(jax.random.PRNGKey(d2l.get_seed()), X)
layer_0 = params['params']['layers_0']
layer_0['kernel'][:, 0], layer_0['bias'][0]
(Array([ 0.00457076,  0.01890736, -0.0014968 ,  0.00327491], dtype=float32),
 Array(0., dtype=float32))
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4, activation=tf.nn.relu,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01),
        bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1)])

net(X)
net.weights[0], net.weights[1]
(<tf.Variable 'dense_2/kernel:0' shape=(4, 4) dtype=float32, numpy=
 array([[-0.02287503, -0.00437018, -0.00140329, -0.00622254],
        [ 0.00495972,  0.00324918, -0.00965284, -0.00612193],
        [-0.00183808, -0.00826601, -0.00676942,  0.00917007],
        [ 0.00847368, -0.00507652, -0.00761351, -0.00762984]],
       dtype=float32)>,
 <tf.Variable 'dense_2/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)

We can also initialize all the parameters to a given constant value (say, 1).

def init_constant(module):
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 1)
        nn.init.zeros_(module.bias)

net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]
(tensor([1., 1., 1., 1.]), tensor(0.))
net.initialize(init=init.Constant(1), force_reinit=True)
net[0].weight.data()[0]
array([1., 1., 1., 1.])
weight_init = nn.initializers.constant(1)

net = nn.Sequential([nn.Dense(8, kernel_init=weight_init, bias_init=bias_init),
                     nn.relu,
                     nn.Dense(1, kernel_init=weight_init, bias_init=bias_init)])

params = net.init(jax.random.PRNGKey(d2l.get_seed()), X)
layer_0 = params['params']['layers_0']
layer_0['kernel'][:, 0], layer_0['bias'][0]
(Array([1., 1., 1., 1.], dtype=float32), Array(0., dtype=float32))
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4, activation=tf.nn.relu,
        kernel_initializer=tf.keras.initializers.Constant(1),
        bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1),
])

net(X)
net.weights[0], net.weights[1]
(<tf.Variable 'dense_4/kernel:0' shape=(4, 4) dtype=float32, numpy=
 array([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]], dtype=float32)>,
 <tf.Variable 'dense_4/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)

We can also apply different initializers to certain blocks. For example, below we initialize the first layer with the Xavier initializer and initialize the second layer to a constant value of 42.

def init_xavier(module):
    if type(module) == nn.Linear:
        nn.init.xavier_uniform_(module.weight)

def init_42(module):
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 42)

net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)
tensor([-0.0974,  0.1707,  0.5840, -0.5032])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])
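
As an aside (a sketch, not part of the original code), Xavier uniform initialization draws from \(U(-a, a)\) with \(a = \sqrt{6 / (\textrm{fan}_{\textrm{in}} + \textrm{fan}_{\textrm{out}})}\); for the first layer here, fan_in is 4 and fan_out is 8, so the entries should stay within about \(\pm 0.71\).

import math

# Check the first layer's weights against the Xavier uniform bound
bound = math.sqrt(6 / (4 + 8))
net[0].weight.data.abs().max() <= bound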
net[0].weight.initialize(init=init.Xavier(), force_reinit=True)
net[1].initialize(init=init.Constant(42), force_reinit=True)
print(net[0].weight.data()[0])
print(net[1].weight.data())
[-0.26102373  0.15249556 -0.19274211 -0.24742058]
[[42. 42. 42. 42. 42. 42. 42. 42.]]
net = nn.Sequential([nn.Dense(8, kernel_init=nn.initializers.xavier_uniform(),
                              bias_init=bias_init),
                     nn.relu,
                     nn.Dense(1, kernel_init=nn.initializers.constant(42),
                              bias_init=bias_init)])

params = net.init(jax.random.PRNGKey(d2l.get_seed()), X)
params['params']['layers_0']['kernel'][:, 0], params['params']['layers_2']['kernel']
(Array([ 0.38926104, -0.4023119 , -0.41848803, -0.6341998 ], dtype=float32),
 Array([[42.],
        [42.],
        [42.],
        [42.],
        [42.],
        [42.],
        [42.],
        [42.]], dtype=float32))
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4,
        activation=tf.nn.relu,
        kernel_initializer=tf.keras.initializers.GlorotUniform()),
    tf.keras.layers.Dense(
        1, kernel_initializer=tf.keras.initializers.Constant(42)),
])

net(X)
print(net.layers[1].weights[0])
print(net.layers[2].weights[0])
<tf.Variable 'dense_6/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[ 0.54234487,  0.2669801 , -0.2516024 ,  0.1076265 ],
       [ 0.30622882,  0.30598146, -0.4484879 ,  0.07192796],
       [ 0.36688513,  0.3838529 ,  0.40699893,  0.577269  ],
       [-0.2649538 ,  0.43839508, -0.3203209 ,  0.29825717]],
      dtype=float32)>
<tf.Variable 'dense_7/kernel:0' shape=(4, 1) dtype=float32, numpy=
array([[42.],
       [42.],
       [42.],
       [42.]], dtype=float32)>

6.3.1.1. Custom Initialization

Sometimes, the initialization methods we need are not provided by the deep learning framework. In the example below, we define an initializer for any weight parameter \(w\) using the following strange distribution:

(6.3.1)\[\begin{split}\begin{aligned} w \sim \begin{cases} U(5, 10) & \textrm{ with probability } \frac{1}{4} \\ 0 & \textrm{ with probability } \frac{1}{2} \\ U(-10, -5) & \textrm{ with probability } \frac{1}{4} \end{cases} \end{aligned}\end{split}\]

Again, we implement a my_init function to apply to net.

def my_init(module):
    if type(module) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in module.named_parameters()][0])
        nn.init.uniform_(module.weight, -10, 10)
        module.weight.data *= module.weight.data.abs() >= 5

net.apply(my_init)
net[0].weight[:2]
Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])
tensor([[ 0.0000, -7.6364, -0.0000, -6.1206],
        [ 9.3516, -0.0000,  5.1208, -8.4003]], grad_fn=<SliceBackward0>)
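
Since a draw from \(U(-10, 10)\) has magnitude below 5 with probability \(\frac{1}{2}\), zeroing those entries reproduces the distribution in (6.3.1). As a rough sanity check (a sketch), about half of the weights should be exactly zero:

# Fraction of weights that were zeroed out; should be roughly 1/2
(net[0].weight.data == 0).float().mean()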

Note that we always have the option of setting parameters directly.

net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]
tensor([42.0000, -6.6364,  1.0000, -5.1206])

Here we define a subclass of the Initializer class. Usually, we only need to implement the _init_weight function, which takes a tensor argument (data) and assigns to it the desired initialized values.

class MyInit(init.Initializer):
    def _init_weight(self, name, data):
        print('Init', name, data.shape)
        data[:] = np.random.uniform(-10, 10, data.shape)
        data *= np.abs(data) >= 5

net.initialize(MyInit(), force_reinit=True)
net[0].weight.data()[:2]
Init dense0_weight (8, 4)
Init dense1_weight (1, 8)
array([[-6.0683527,  8.991421 , -0.       ,  0.       ],
       [ 6.4198647, -9.728567 , -8.057975 ,  0.       ]])

Note that we always have the option of setting parameters directly.

net[0].weight.data()[:] += 1
net[0].weight.data()[0, 0] = 42
net[0].weight.data()[0]
array([42.      ,  9.991421,  1.      ,  1.      ])

Jax initialization functions take the PRNGKey, shape, and dtype as arguments. Here we implement the function my_init that returns a desired tensor given the shape and data type.

def my_init(key, shape, dtype=jnp.float_):
    data = jax.random.uniform(key, shape, minval=-10, maxval=10)
    return data * (jnp.abs(data) >= 5)

net = nn.Sequential([nn.Dense(8, kernel_init=my_init), nn.relu, nn.Dense(1)])
params = net.init(d2l.get_key(), X)
print(params['params']['layers_0']['kernel'][:, :2])
[[ 0.        -5.891962 ]
 [ 0.        -9.597271 ]
 [-5.809202   6.3091564]
 [ 0.         0.       ]]

When initializing parameters in JAX and Flax, the dictionary of parameters returned has type flax.core.frozen_dict.FrozenDict. It is not advisable in the Jax ecosystem to directly alter the values of an array, hence the data types are generally immutable. One might use params.unfreeze() to make changes, as sketched below.
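
For illustration, here is a minimal sketch of that workflow, assuming flax.core.freeze is used to restore immutability afterwards.

from flax.core import freeze

# Unfreeze into a mutable (nested dict) copy, edit one entry, then freeze again
new_params = params.unfreeze()
new_params['params']['layers_0']['kernel'] = (
    new_params['params']['layers_0']['kernel'].at[0, 0].set(42.0))
new_params = freeze(new_params)
new_params['params']['layers_0']['kernel'][0, 0]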

Here we define a subclass of Initializer and implement the __call__ function that returns a desired tensor given the shape and data type.

class MyInit(tf.keras.initializers.Initializer):
    def __call__(self, shape, dtype=None):
        data = tf.random.uniform(shape, -10, 10, dtype=dtype)
        factor = (tf.abs(data) >= 5)
        factor = tf.cast(factor, tf.float32)
        return data * factor

net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4,
        activation=tf.nn.relu,
        kernel_initializer=MyInit()),
    tf.keras.layers.Dense(1),
])

net(X)
print(net.layers[1].weights[0])
<tf.Variable 'dense_8/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[-8.454213 , -0.       ,  0.       , -0.       ],
       [-9.362183 ,  0.       ,  0.       ,  0.       ],
       [-0.       , -9.406505 , -0.       , -0.       ],
       [ 6.2464294, -0.       , -0.       , -9.80323  ]], dtype=float32)>

Note that we always have the option of setting parameters directly.

net.layers[1].weights[0][:].assign(net.layers[1].weights[0] + 1)
net.layers[1].weights[0][0, 0].assign(42)
net.layers[1].weights[0]
<tf.Variable 'dense_8/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[42.       ,  1.       ,  1.       ,  1.       ],
       [-8.362183 ,  1.       ,  1.       ,  1.       ],
       [ 1.       , -8.406505 ,  1.       ,  1.       ],
       [ 7.2464294,  1.       ,  1.       , -8.80323  ]], dtype=float32)>

6.3.2. Summary

We can initialize parameters using built-in and custom initializers.

6.3.3. Exercises

Look up the online documentation for more built-in initializers.