In [118]:
import torch
from torch import nn
from torch.nn import init

In [377]:
?nn.RNN

$$h_t = \text{tanh}(w_{ih} x_t + b_{ih} + w_{hh} h_{(t-1)} + b_{hh})$$

In [120]:
# Smallest possible RNN layer: scalar input, scalar hidden state, ReLU activation
# (chosen instead of tanh so the forward pass can be verified by hand below).
rnn = nn.RNN(input_size=1, hidden_size=1, nonlinearity='relu')

In [121]:
rnn

RNN(1, 1)

In [122]:
rnn.weight_hh_l0, rnn.bias_hh_l0

(Parameter containing:
 tensor([[0.7775]], requires_grad=True), Parameter containing:
 tensor([0.2025], requires_grad=True))

In [123]:
rnn.weight_ih_l0, rnn.bias_ih_l0

(Parameter containing:
 tensor([[-0.5602]], requires_grad=True), Parameter containing:
 tensor([0.0082], requires_grad=True))

In [124]:
# Pin every parameter to a known constant so the forward pass can be checked by hand.
# Init weights: hidden-to-hidden and input-to-hidden both become 0.5.
for weight in (rnn.weight_hh_l0, rnn.weight_ih_l0):
    init.constant_(weight, 0.5)

# Init biases: both bias vectors become 0.
for bias in (rnn.bias_hh_l0, rnn.bias_ih_l0):
    init.constant_(bias, 0.0)

# Display the biases to confirm the in-place initialisation took effect.
rnn.bias_hh_l0, rnn.bias_ih_l0

(Parameter containing:
 tensor([0.], requires_grad=True), Parameter containing:
 tensor([0.], requires_grad=True))

In [125]:
# One time step, batch of one, one feature: shape (1, 1, 1).
# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor constructor.
x = torch.tensor([[[1.0]]], dtype=torch.float32)

In [126]:
x.shape

torch.Size([1, 1, 1])

In [127]:
rnn(x)

(tensor([[[0.5000]]], grad_fn=<StackBackward>),
 tensor([[[0.5000]]], grad_fn=<StackBackward>))

$h_t = relu(W_{ih} \cdot x_t + b_{ih} + W_{hh} \cdot h_{t-1} + b_{hh})$

$0.5 = max(0, 0.5 \cdot 1 + 0 + 0.5 \cdot 0 + 0)$

In [142]:
# Two time steps of the same input, batch of one, one feature: shape (2, 1, 1).
# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor constructor.
x2 = torch.tensor([[[1.0]], [[1.0]]], dtype=torch.float32)

In [143]:
x2.shape

torch.Size([2, 1, 1])

In [146]:
# The layer returns the hidden state at every time step (`hiddens`) together with
# the hidden state after the last step (`ht`); display both.
hiddens, ht = rnn(x2)
hiddens, ht

(tensor([[[0.5000]],
 
         [[0.7500]]], grad_fn=<StackBackward>),
 tensor([[[0.7500]]], grad_fn=<StackBackward>))

Calcul effectué au deuxième pas de temps (avec $h_{t-1} = 0.5$) :

$h_t = relu(W_{ih} \cdot x_t + b_{ih} + W_{hh} \cdot h_{t-1} + b_{hh})$

$0.75 = max(0, 0.5 \cdot 1 + 0 + 0.5 \cdot 0.5 + 0)$

# Analyse d'un "exploding gradient"

In [310]:
# Five time steps of constant input 1; requires_grad lets us read d(h_T)/d(x_t) afterwards.
x3 = torch.ones(5, 1, 1, requires_grad=True)
rnn = nn.RNN(input_size=1, hidden_size=1, nonlinearity='relu')

# Important: weights greater than 1, so the hidden state is scaled up at every step.
for weight in (rnn.weight_hh_l0, rnn.weight_ih_l0):
    init.constant_(weight, 11)
for bias in (rnn.bias_hh_l0, rnn.bias_ih_l0):
    init.constant_(bias, 0.0)

# Make sure no weight gradient is left over before backpropagating.
rnn.weight_hh_l0.grad = None
rnn.weight_ih_l0.grad = None

hiddens, ht = rnn(x3)
# Backpropagate from the final hidden state with an upstream gradient of ones.
ht.backward(gradient=torch.ones_like(ht))
print(hiddens)
rnn.weight_hh_l0.grad, rnn.weight_ih_l0.grad, ht

tensor([[[1.1000e+01]],

        [[1.3200e+02]],

        [[1.4630e+03]],

        [[1.6104e+04]],

        [[1.7716e+05]]], grad_fn=<StackBackward>)


(tensor([[62810.]]),
 tensor([[16105.]]),
 tensor([[[177155.]]], grad_fn=<StackBackward>))

In [313]:
x3.grad

tensor([[[1.6105e+05]],

        [[1.4641e+04]],

        [[1.3310e+03]],

        [[1.2100e+02]],

        [[1.1000e+01]]])

# Analyse d'un "vanishing gradient"

In [350]:
# Twenty time steps of constant input 1, tracked so the gradient at each step can be read back.
x3 = torch.ones(20, 1, 1, requires_grad=True)
rnn = nn.RNN(input_size=1, hidden_size=1, nonlinearity='tanh')

# Init weights -- important: weights below 1.
for weight in (rnn.weight_hh_l0, rnn.weight_ih_l0):
    init.constant_(weight, 0.5)
# Init biases
for bias in (rnn.bias_hh_l0, rnn.bias_ih_l0):
    init.constant_(bias, 0.0)

# Make sure no weight gradient is left over before backpropagating.
rnn.weight_hh_l0.grad = None
rnn.weight_ih_l0.grad = None

hiddens, ht = rnn(x3)
# ht holds a single element, so backward() needs no explicit upstream gradient.
ht.backward()

In [351]:
rnn.weight_hh_l0.grad

tensor([[0.4920]])

In [352]:
rnn.weight_ih_l0.grad

tensor([[0.7152]])

In [353]:
ht

tensor([[[0.6879]]], grad_fn=<StackBackward>)

In [354]:
x3.grad

tensor([[[4.7499e-12]],

        [[1.2079e-11]],

        [[3.9541e-11]],

        [[1.4371e-10]],

        [[5.3918e-10]],

        [[2.0406e-09]],

        [[7.7405e-09]],

        [[2.9380e-08]],

        [[1.1154e-07]],

        [[4.2344e-07]],

        [[1.6076e-06]],

        [[6.1032e-06]],

        [[2.3171e-05]],

        [[8.7967e-05]],

        [[3.3397e-04]],

        [[1.2679e-03]],

        [[4.8136e-03]],

        [[1.8275e-02]],

        [[6.9380e-02]],

        [[2.6340e-01]]])

# Truncated backprop through time (truncated BPTT)

In [376]:
# Same 20-step constant input and tanh RNN as in the vanishing-gradient experiment.
x3 = torch.ones(20, 1, 1, requires_grad=True)
rnn = nn.RNN(input_size=1, hidden_size=1, nonlinearity='tanh')

# Init weights -- important: weights below 1.
for weight in (rnn.weight_hh_l0, rnn.weight_ih_l0):
    init.constant_(weight, 0.5)
# Init biases
for bias in (rnn.bias_hh_l0, rnn.bias_ih_l0):
    init.constant_(bias, 0.0)

# Make sure no weight gradient is left over before backpropagating.
rnn.weight_hh_l0.grad = None
rnn.weight_ih_l0.grad = None

# Here comes the BPTT: run forward + backward separately on four consecutive
# 5-step windows; the gradients of all windows accumulate into x3.grad.
# NOTE(review): no hidden state is passed to rnn(...), so every window starts from
# the layer's default initial state -- confirm that not carrying `ht` over is intended.
for start in range(0, 20, 5):
    hiddens, ht = rnn(x3[start:start + 5])
    ht.backward()
print(x3.grad)

tensor([[[0.0023]],

        [[0.0059]],

        [[0.0194]],

        [[0.0704]],

        [[0.2642]],

        [[0.0023]],

        [[0.0059]],

        [[0.0194]],

        [[0.0704]],

        [[0.2642]],

        [[0.0023]],

        [[0.0059]],

        [[0.0194]],

        [[0.0704]],

        [[0.2642]],

        [[0.0023]],

        [[0.0059]],

        [[0.0194]],

        [[0.0704]],

        [[0.2642]]])


# Augmentons la dimensionnalité

In [366]:
# Same layer type with a 2-dimensional input and a 3-dimensional hidden state.
# The parameters keep their random initial values here (no constant init).
rnn = nn.RNN(input_size=2, hidden_size=3, nonlinearity='tanh')
rnn

RNN(2, 3)

In [367]:
rnn.weight_hh_l0, rnn.bias_hh_l0

(Parameter containing:
 tensor([[-0.2226, -0.1108,  0.5350],
         [ 0.1088,  0.2017,  0.2907],
         [ 0.5027, -0.3554, -0.1394]], requires_grad=True),
 Parameter containing:
 tensor([-0.2575,  0.3207,  0.5624], requires_grad=True))

In [368]:
rnn.weight_ih_l0, rnn.bias_ih_l0

(Parameter containing:
 tensor([[ 0.2224, -0.1436],
         [-0.4591, -0.0053],
         [-0.2355,  0.2791]], requires_grad=True), Parameter containing:
 tensor([0.5437, 0.3929, 0.1203], requires_grad=True))

In [369]:
# Two time steps, batch of one, two features per step: shape (2, 1, 2).
# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor constructor.
x = torch.tensor([[[1.0, 2.0]], [[2.0, 3.0]]], dtype=torch.float32)
rnn(x)

(tensor([[[0.2178, 0.2392, 0.7638]],
 
         [[0.5607, 0.0734, 0.7473]]], grad_fn=<StackBackward>),
 tensor([[[0.5607, 0.0734, 0.7473]]], grad_fn=<StackBackward>))

# Utilisons un LSTM pour contrer le vanishing gradient

In [359]:
# Same 20-step constant input as the vanishing-gradient experiment, this time fed to an LSTM.
x3 = torch.ones(20, 1, 1, requires_grad=True)
rnn = nn.LSTM(input_size=1, hidden_size=1)

# Init weights -- important: weights below 1 (same constants as the tanh RNN above).
for weight in (rnn.weight_hh_l0, rnn.weight_ih_l0):
    init.constant_(weight, 0.5)
# Init biases
for bias in (rnn.bias_hh_l0, rnn.bias_ih_l0):
    init.constant_(bias, 0.0)

# Make sure no weight gradient is left over before backpropagating.
rnn.weight_hh_l0.grad = None
rnn.weight_ih_l0.grad = None

# The LSTM returns its final state as a (hidden state, cell state) pair.
hiddens, (ht, ct) = rnn(x3)
# ht holds a single element, so backward() needs no explicit upstream gradient.
ht.backward()

In [360]:
rnn.weight_hh_l0.grad

tensor([[0.0485],
        [0.1067],
        [0.1296],
        [0.1642]])

In [361]:
rnn.weight_ih_l0.grad

tensor([[0.0783],
        [0.1714],
        [0.2099],
        [0.2621]])

In [362]:
ht

tensor([[[0.6286]]], grad_fn=<StackBackward>)

In [363]:
x3.grad

tensor([[[0.0004]],

        [[0.0005]],

        [[0.0006]],

        [[0.0008]],

        [[0.0010]],

        [[0.0013]],

        [[0.0017]],

        [[0.0023]],

        [[0.0030]],

        [[0.0040]],

        [[0.0052]],

        [[0.0069]],

        [[0.0092]],

        [[0.0122]],

        [[0.0163]],

        [[0.0216]],

        [[0.0288]],

        [[0.0388]],

        [[0.0579]],

        [[0.1483]]])