6.3. Parameter Initialization
Now that we know how to access the parameters, let's look at how to initialize them properly. We discussed the need for proper initialization in Section 5.4. Deep learning frameworks provide default random initializations for their layers. However, we often want to initialize our weights according to various other protocols. Frameworks provide the most commonly used protocols and also allow for creating a custom initializer.
import torch
from torch import nn
By default, PyTorch initializes weight and bias matrices by drawing from a uniform distribution whose range is computed from the input and output dimensions. PyTorch's nn.init module provides a variety of preset initialization methods.
net = nn.Sequential(nn.LazyLinear(8), nn.ReLU(), nn.LazyLinear(1))
X = torch.rand(size=(2, 4))
net(X).shape
torch.Size([2, 1])
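As a quick sanity check (this snippet is not part of the original text, and the exact bound is an assumption about PyTorch's default scheme), the default weights of the materialized first layer should lie within the uniform range \([-1/\sqrt{\textrm{fan}_{\textrm{in}}}, 1/\sqrt{\textrm{fan}_{\textrm{in}}}]\):

# After the forward pass above, the lazy layers are materialized, so net[0]
# behaves like nn.Linear(4, 8). Illustrative check of the assumed default bound.
fan_in = net[0].weight.shape[1]
bound = (1 / fan_in) ** 0.5
print(net[0].weight.data.abs().max() <= bound)  # expected: tensor(True)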
from mxnet import init, np, npx
from mxnet.gluon import nn
npx.set_np()
By default, MXNet initializes weight parameters by randomly drawing from the uniform distribution \(U(-0.07, 0.07)\), clearing bias parameters to zero. MXNet's init module provides a variety of preset initialization methods.
net = nn.Sequential()
net.add(nn.Dense(8, activation='relu'))
net.add(nn.Dense(1))
net.initialize() # Use the default initialization method
X = np.random.uniform(size=(2, 4))
net(X).shape
(2, 1)
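As an illustrative check (not in the original, assuming the stated defaults of uniform weights in \([-0.07, 0.07]\) and zero biases):

# Illustrative check of the stated defaults.
print(np.abs(net[0].weight.data()).max() <= 0.07)
print(np.all(net[0].bias.data() == 0))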
import jax
from flax import linen as nn
from jax import numpy as jnp
from d2l import jax as d2l
By default, Flax initializes weights using jax.nn.initializers.lecun_normal, i.e., by drawing samples from a truncated normal distribution centered at 0 with standard deviation \(\sqrt{1 / \textrm{fan}_{\textrm{in}}}\), where fan_in is the number of input units in the weight tensor. The bias parameters are all set to zero. Jax's nn.initializers module provides a variety of preset initialization methods.
net = nn.Sequential([nn.Dense(8), nn.relu, nn.Dense(1)])
X = jax.random.uniform(d2l.get_key(), (2, 4))
params = net.init(d2l.get_key(), X)
net.apply(params, X).shape
(2, 1)
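Equivalently, the stated defaults can be spelled out explicitly. This is a sketch based on the description above (not part of the original); it assumes lecun_normal weights and zero biases are indeed the defaults:

# Spelling out the assumed defaults explicitly.
net_explicit = nn.Sequential([
    nn.Dense(8, kernel_init=nn.initializers.lecun_normal(),
             bias_init=nn.initializers.zeros),
    nn.relu,
    nn.Dense(1, kernel_init=nn.initializers.lecun_normal(),
             bias_init=nn.initializers.zeros)])
params_explicit = net_explicit.init(d2l.get_key(), X)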
import tensorflow as tf
By default, Keras initializes weight matrices by drawing from a uniform distribution whose range is computed from the input and output dimensions, while bias parameters are all set to zero. TensorFlow provides a variety of initialization methods both in the root module and in the keras.initializers module.
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(4, activation=tf.nn.relu),
    tf.keras.layers.Dense(1),
])
X = tf.random.uniform((2, 4))
net(X).shape
TensorShape([2, 1])
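As an illustrative check (not part of the original, assuming the default is a Glorot uniform kernel with a zero bias), we can inspect the first Dense layer's initializer and bias values:

# net.layers[0] is Flatten, net.layers[1] is the first Dense layer;
# net.weights[1] is that layer's bias vector.
print(type(net.layers[1].kernel_initializer).__name__)  # e.g. 'GlorotUniform'
print(tf.reduce_all(net.weights[1] == 0))                # expected: True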
6.3.1. Built-in Initialization
Let's begin by calling on built-in initializers. The code below initializes all weight parameters as Gaussian random variables with standard deviation 0.01, while clearing bias parameters to zero.
def init_normal(module):
    if type(module) == nn.Linear:
        nn.init.normal_(module.weight, mean=0, std=0.01)
        nn.init.zeros_(module.bias)
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]
(tensor([-0.0129, -0.0007, -0.0033, 0.0276]), tensor(0.))
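Since net.apply visits every submodule recursively, the second linear layer was re-initialized in the same way (a quick check, not in the original):

# The output layer's weights are also small Gaussian values with a zero bias.
net[2].weight.data[0], net[2].bias.data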
# Here force_reinit ensures that parameters are freshly initialized even if
# they were already initialized previously
net.initialize(init=init.Normal(sigma=0.01), force_reinit=True)
net[0].weight.data()[0]
array([ 0.00354961, -0.00614133, 0.0107317 , 0.01830765])
weight_init = nn.initializers.normal(0.01)
bias_init = nn.initializers.zeros
net = nn.Sequential([nn.Dense(8, kernel_init=weight_init, bias_init=bias_init),
                     nn.relu,
                     nn.Dense(1, kernel_init=weight_init, bias_init=bias_init)])
params = net.init(jax.random.PRNGKey(d2l.get_seed()), X)
layer_0 = params['params']['layers_0']
layer_0['kernel'][:, 0], layer_0['bias'][0]
(Array([ 0.00457076, 0.01890736, -0.0014968 , 0.00327491], dtype=float32),
Array(0., dtype=float32))
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4, activation=tf.nn.relu,
        kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01),
        bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1)])
net(X)
net.weights[0], net.weights[1]
(<tf.Variable 'dense_2/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[-0.02287503, -0.00437018, -0.00140329, -0.00622254],
[ 0.00495972, 0.00324918, -0.00965284, -0.00612193],
[-0.00183808, -0.00826601, -0.00676942, 0.00917007],
[ 0.00847368, -0.00507652, -0.00761351, -0.00762984]],
dtype=float32)>,
<tf.Variable 'dense_2/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)
We can also initialize all the parameters to a given constant value (say, 1).
def init_constant(module):
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 1)
        nn.init.zeros_(module.bias)
net.apply(init_constant)
net[0].weight.data[0], net[0].bias.data[0]
(tensor([1., 1., 1., 1.]), tensor(0.))
net.initialize(init=init.Constant(1), force_reinit=True)
net[0].weight.data()[0]
array([1., 1., 1., 1.])
weight_init = nn.initializers.constant(1)
net = nn.Sequential([nn.Dense(8, kernel_init=weight_init, bias_init=bias_init),
                     nn.relu,
                     nn.Dense(1, kernel_init=weight_init, bias_init=bias_init)])
params = net.init(jax.random.PRNGKey(d2l.get_seed()), X)
layer_0 = params['params']['layers_0']
layer_0['kernel'][:, 0], layer_0['bias'][0]
(Array([1., 1., 1., 1.], dtype=float32), Array(0., dtype=float32))
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4, activation=tf.nn.relu,
        kernel_initializer=tf.keras.initializers.Constant(1),
        bias_initializer=tf.zeros_initializer()),
    tf.keras.layers.Dense(1),
])
net(X)
net.weights[0], net.weights[1]
(<tf.Variable 'dense_4/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]], dtype=float32)>,
<tf.Variable 'dense_4/bias:0' shape=(4,) dtype=float32, numpy=array([0., 0., 0., 0.], dtype=float32)>)
We can also apply different initializers to certain blocks. For example, below we initialize the first layer with the Xavier initializer and initialize the second layer to a constant value of 42.
def init_xavier(module):
    if type(module) == nn.Linear:
        nn.init.xavier_uniform_(module.weight)

def init_42(module):
    if type(module) == nn.Linear:
        nn.init.constant_(module.weight, 42)
net[0].apply(init_xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)
tensor([-0.0974, 0.1707, 0.5840, -0.5032])
tensor([[42., 42., 42., 42., 42., 42., 42., 42.]])
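As an illustrative aside (not in the original), Xavier uniform initialization draws from \(U(-a, a)\) with \(a = \sqrt{6 / (\textrm{fan}_{\textrm{in}} + \textrm{fan}_{\textrm{out}})}\); for the first layer here, fan_in = 4 and fan_out = 8:

# The sampled values printed above should all lie within the Xavier bound.
a = (6 / (4 + 8)) ** 0.5
print(net[0].weight.data.abs().max() <= a)  # expected: tensor(True)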
net[0].weight.initialize(init=init.Xavier(), force_reinit=True)
net[1].initialize(init=init.Constant(42), force_reinit=True)
print(net[0].weight.data()[0])
print(net[1].weight.data())
[-0.26102373 0.15249556 -0.19274211 -0.24742058]
[[42. 42. 42. 42. 42. 42. 42. 42.]]
net = nn.Sequential([nn.Dense(8, kernel_init=nn.initializers.xavier_uniform(),
                              bias_init=bias_init),
                     nn.relu,
                     nn.Dense(1, kernel_init=nn.initializers.constant(42),
                              bias_init=bias_init)])
params = net.init(jax.random.PRNGKey(d2l.get_seed()), X)
params['params']['layers_0']['kernel'][:, 0], params['params']['layers_2']['kernel']
(Array([ 0.38926104, -0.4023119 , -0.41848803, -0.6341998 ], dtype=float32),
Array([[42.],
[42.],
[42.],
[42.],
[42.],
[42.],
[42.],
[42.]], dtype=float32))
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4,
        activation=tf.nn.relu,
        kernel_initializer=tf.keras.initializers.GlorotUniform()),
    tf.keras.layers.Dense(
        1, kernel_initializer=tf.keras.initializers.Constant(42)),
])
net(X)
print(net.layers[1].weights[0])
print(net.layers[2].weights[0])
<tf.Variable 'dense_6/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[ 0.54234487, 0.2669801 , -0.2516024 , 0.1076265 ],
[ 0.30622882, 0.30598146, -0.4484879 , 0.07192796],
[ 0.36688513, 0.3838529 , 0.40699893, 0.577269 ],
[-0.2649538 , 0.43839508, -0.3203209 , 0.29825717]],
dtype=float32)>
<tf.Variable 'dense_7/kernel:0' shape=(4, 1) dtype=float32, numpy=
array([[42.],
[42.],
[42.],
[42.]], dtype=float32)>
6.3.1.1. Custom Initialization
Sometimes, the initialization methods we need are not provided by the deep learning framework. In the example below, we define an initializer for any weight parameter \(w\) using the following strange distribution:

\[
w \sim \begin{cases}
    U(5, 10) & \textrm{ with probability } \frac{1}{4} \\
    0 & \textrm{ with probability } \frac{1}{2} \\
    U(-10, -5) & \textrm{ with probability } \frac{1}{4}
\end{cases}
\]
Again, we implement a my_init function to apply to net.
def my_init(module):
    if type(module) == nn.Linear:
        print("Init", *[(name, param.shape)
                        for name, param in module.named_parameters()][0])
        nn.init.uniform_(module.weight, -10, 10)
        module.weight.data *= module.weight.data.abs() >= 5
net.apply(my_init)
net[0].weight[:2]
Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])
tensor([[ 0.0000, -7.6364, -0.0000, -6.1206],
[ 9.3516, -0.0000, 5.1208, -8.4003]], grad_fn=<SliceBackward0>)
Note that we always have the option of setting parameters directly.
net[0].weight.data[:] += 1
net[0].weight.data[0, 0] = 42
net[0].weight.data[0]
tensor([42.0000, -6.6364, 1.0000, -5.1206])
Here we define a subclass of the Initializer class. Usually, we only need to implement the _init_weight function, which takes a tensor argument (data) and assigns the desired initialized values to it.
class MyInit(init.Initializer):
    def _init_weight(self, name, data):
        print('Init', name, data.shape)
        data[:] = np.random.uniform(-10, 10, data.shape)
        data *= np.abs(data) >= 5
net.initialize(MyInit(), force_reinit=True)
net[0].weight.data()[:2]
Init dense0_weight (8, 4)
Init dense1_weight (1, 8)
array([[-6.0683527, 8.991421 , -0. , 0. ],
[ 6.4198647, -9.728567 , -8.057975 , 0. ]])
Note that we always have the option of setting parameters directly.
net[0].weight.data()[:] += 1
net[0].weight.data()[0, 0] = 42
net[0].weight.data()[0]
array([42. , 9.991421, 1. , 1. ])
Initialization functions in Jax take a PRNGKey, a shape, and a dtype as arguments. Here we implement the function my_init, which returns the desired tensor given the shape and data type.
def my_init(key, shape, dtype=jnp.float_):
    data = jax.random.uniform(key, shape, minval=-10, maxval=10)
    return data * (jnp.abs(data) >= 5)
net = nn.Sequential([nn.Dense(8, kernel_init=my_init), nn.relu, nn.Dense(1)])
params = net.init(d2l.get_key(), X)
print(params['params']['layers_0']['kernel'][:, :2])
[[ 0. -5.891962 ]
[ 0. -9.597271 ]
[-5.809202 6.3091564]
[ 0. 0. ]]
When initializing parameters in JAX and Flax, the returned dictionary of parameters is of type flax.core.frozen_dict.FrozenDict. It is not advisable to directly alter the values of an array within the Jax ecosystem, hence the data types are generally immutable. One might use params.unfreeze() to make changes.
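A minimal sketch of such direct editing (not part of the original; it assumes params is a FrozenDict with the layout created above, and uses flax.core.freeze to restore immutability afterwards):

from flax.core import freeze

# Unfreeze to obtain a plain, mutable dictionary, modify it, then freeze again.
mutable_params = params.unfreeze()
mutable_params['params']['layers_0']['kernel'] = (
    mutable_params['params']['layers_0']['kernel'] + 1)
params = freeze(mutable_params)
print(params['params']['layers_0']['kernel'][:, :2])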
Here we define a subclass of Initializer and implement the __call__ function, which returns the desired tensor given the shape and data type.
class MyInit(tf.keras.initializers.Initializer):
    def __call__(self, shape, dtype=None):
        data = tf.random.uniform(shape, -10, 10, dtype=dtype)
        factor = (tf.abs(data) >= 5)
        factor = tf.cast(factor, tf.float32)
        return data * factor
net = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(
        4,
        activation=tf.nn.relu,
        kernel_initializer=MyInit()),
    tf.keras.layers.Dense(1),
])
net(X)
print(net.layers[1].weights[0])
<tf.Variable 'dense_8/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[-8.454213 , -0. , 0. , -0. ],
[-9.362183 , 0. , 0. , 0. ],
[-0. , -9.406505 , -0. , -0. ],
[ 6.2464294, -0. , -0. , -9.80323 ]], dtype=float32)>
Note that we always have the option of setting parameters directly.
net.layers[1].weights[0][:].assign(net.layers[1].weights[0] + 1)
net.layers[1].weights[0][0, 0].assign(42)
net.layers[1].weights[0]
<tf.Variable 'dense_8/kernel:0' shape=(4, 4) dtype=float32, numpy=
array([[42. , 1. , 1. , 1. ],
[-8.362183 , 1. , 1. , 1. ],
[ 1. , -8.406505 , 1. , 1. ],
[ 7.2464294, 1. , 1. , -8.80323 ]], dtype=float32)>
6.3.2. Summary
We can initialize parameters using built-in and custom initializers.