\begin{align*} h_{t+1} &= x_t @ w_{xh}^T + h_t @ w_{hh}^T\\ &= [3, 100] @ [20, 100]^T + [3, 20] @ [20, 20]^T \\ &= [3, 20] \end{align*}

### nn.RNN

import torch
import torch.nn as nn

# Single-layer Elman RNN: input dim 100, hidden dim 20.
rnn = nn.RNN(100, 20)
print(rnn._parameters.keys())
# Layer-0 parameter shapes: weight_ih is [hidden, input],
# weight_hh is [hidden, hidden], both biases are [hidden].
for pname in ("weight_ih_l0", "weight_hh_l0", "bias_ih_l0", "bias_hh_l0"):
    print(getattr(rnn, pname).shape)

RNN的forward函数与CNN定义的方式有点不太一样，具体见下图

$h_0$如果不写默认就是0，如果写的话，$h_0$的维度是$[layers, batch, hidden\_len]$

import torch
import torch.nn as nn

# One-layer RNN over a 10-step sequence, batch 3, feature dim 100.
rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=1)
x = torch.randn(10, 3, 100)
h0 = torch.zeros(1, 3, 20)  # [num_layers, batch, hidden_size]
out, h_t = rnn(x, h0)
print(out.shape)  # [10, 3, 20]
print(h_t.shape)  # [1, 3, 20]

$h_t$和$out$很容易搞混，我们先看一个2层的RNN模型

$h_t$：最后一个时间戳上面所有的memory状态
$out$：所有时间戳上的最后一个memory状态

import torch
import torch.nn as nn

# Four stacked RNN layers; h_0 omitted, so it defaults to zeros.
rnn = nn.RNN(input_size=100, hidden_size=20, num_layers=4)
x = torch.randn(10, 3, 100)
out, h_t = rnn(x)
# out: last layer's hidden state at every timestep.
print(out.shape)  # [10, 3, 20]
# h_t: every layer's hidden state at the last timestep.
print(h_t.shape)  # [4, 3, 20]

### nn.RNNCell

import torch
import torch.nn as nn

# nn.RNNCell processes one timestep at a time; we drive the loop ourselves.
cell1 = nn.RNNCell(100, 20)
x = torch.randn(10, 3, 100)        # [seq_len, batch, input_size]
h1 = torch.zeros(3, 20)            # initial hidden state [batch, hidden]
for step_input in x:               # iterate over the 10 timesteps
    h1 = cell1(step_input, h1)
print(h1.shape)  # [3, 20]

import torch
import torch.nn as nn

# Two manually stacked RNNCells: layer 1 maps 100 -> 30,
# layer 2 consumes layer 1's hidden state and maps 30 -> 20.
cell1 = nn.RNNCell(100, 30)
cell2 = nn.RNNCell(30, 20)
x = torch.randn(10, 3, 100)        # [seq_len, batch, input_size]
h1 = torch.zeros(3, 30)            # layer-1 hidden state
h2 = torch.zeros(3, 20)            # layer-2 hidden state
for step_input in x:
    h1 = cell1(step_input, h1)     # layer 1 update
    h2 = cell2(h1, h2)             # layer 2 reads layer 1's new state
print(h2.shape)  # [3, 20]