## 概览

crf_log_likelihood：CRF 的损失函数，它由真实路径的分数和所有路径的总分数两部分构成，真实路径的分数应该是所有路径分数里最高的。真实路径的分数由状态分数和转移分数相加得到 $$S_{i}$$，则分数 $$P_{S_{i}}$$ 为 $$e^{S_{i}}$$，所有路径的总分就是所有路径分数取指数后的和。这样，损失函数的公式为：$$L = -\log \frac{P_{RealPath}}{P_{1} + P_{2} + \dots + P_{N}}$$
crf_decode：CRF 的解码模块，输入为 tensor，根据LSTM 等模块的输出加上转移矩阵来得到最佳解码序列和对应的得分。
viterbi_decode：维特比解码模块，输入为数组一类的东西而不是 tensor，在测试或者要对解码后处理时用得到。

- `inputs`（发射/一元分数）：[batch_size, max_seq_len, num_tags]

- `sequence_lengths`（真实序列长度）：[batch_size]

- `transition_params`（转移矩阵）：[num_tags, num_tags]

## crf_log_likelihood

# Example usage: returns the per-example log-likelihood and the transition
# matrix (created inside the call here, since transition_params=None).
log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
    unary_scores, gold_tags, sequence_lengths, transition_params=None)

def crf_log_likelihood(inputs,
                       tag_indices,
                       sequence_lengths,
                       transition_params=None):
  """Computes the log-likelihood of tag sequences in a CRF.
  Args:
    inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
        to use as input to the CRF layer.
    tag_indices: A [batch_size, max_seq_len] matrix of tag indices for which we
        compute the log-likelihood.
    sequence_lengths: A [batch_size] vector of true sequence lengths.
    transition_params: A [num_tags, num_tags] transition matrix, if available.
  Returns:
    log_likelihood: A [batch_size] Tensor containing the log-likelihood of
        each example, given the sequence of tag indices.
    transition_params: A [num_tags, num_tags] transition matrix. This is either
        provided by the caller or created in this function.
  """
  # Number of distinct tags (size of the last dimension of `inputs`).
  num_tags = inputs.get_shape()[2].value
  # If no transition matrix was supplied, create one as a trainable variable.
  if transition_params is None:
    transition_params = vs.get_variable("transitions", [num_tags, num_tags])
  # Score of the given (gold) tag sequence: unary + transition terms.
  sequence_scores = crf_sequence_score(inputs, tag_indices, sequence_lengths,
                                       transition_params)
  # Log of the partition function: logsumexp over the scores of all paths.
  log_norm = crf_log_norm(inputs, sequence_lengths, transition_params)
  # Normalize the scores to get the log-likelihood per example:
  # gold-path score minus logsumexp over all paths (see the derivation below).
  # Details: https://createmomo.github.io/2017/11/11/CRF-Layer-on-the-Top-of-BiLSTM-5/
  log_likelihood = sequence_scores - log_norm
  return log_likelihood, transition_params

$L = -\log \frac{P_{RealPath}}{P_{1} + P_{2} + \dots + P_{N}}$

\begin{aligned} L & = -\log \frac{P_{RealPath}}{P_{1} + P_{2} + \dots + P_{N}} \\ & = -\log \frac{e^{S_{RealPath}}}{e^{S_{1}} + e^{S_{2}} + \dots + e^{S_{N}}} \\ & = -(\log (e^{S_{RealPath}}) - \log (e^{S_{1}} + e^{S_{2}} + \dots + e^{S_{N}})) \\ & = -(S_{RealPath} - \log (e^{S_{1}} + e^{S_{2}} + \dots + e^{S_{N}})) \end{aligned}

def crf_log_norm(inputs, sequence_lengths, transition_params):
  """Computes the normalization for a CRF.
  Args:
    inputs: A [batch_size, max_seq_len, num_tags] tensor of unary potentials
        to use as input to the CRF layer.
    sequence_lengths: A [batch_size] vector of true sequence lengths.
    transition_params: A [num_tags, num_tags] transition matrix.
  Returns:
    log_norm: A [batch_size] vector of normalizers for a CRF.
  """
  # Split up the first and rest of the inputs in preparation for the forward
  # algorithm.
  first_input = array_ops.slice(inputs, [0, 0, 0], [-1, 1, -1])
  first_input = array_ops.squeeze(first_input, [1])

  # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over
  # the "initial state" (the unary potentials).
  def _single_seq_fn():
    log_norm = math_ops.reduce_logsumexp(first_input, [1])
    # Mask log_norm of the sequences with length <= zero.
    log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
                               array_ops.zeros_like(log_norm),
                               log_norm)
    return log_norm

  def _multi_seq_fn():
    """Forward computation of alpha values."""
    rest_of_input = array_ops.slice(inputs, [0, 1, 0], [-1, -1, -1])
    # Compute the alpha values in the forward algorithm in order to get the
    # partition function. The CRF forward pass is expressed as an RNN whose
    # cell performs one step of the forward algorithm.
    forward_cell = CrfForwardRnnCell(transition_params)
    # Sequence length is not allowed to be less than zero; the RNN runs over
    # the remaining (length - 1) steps, clamped at 0.
    sequence_lengths_less_one = math_ops.maximum(
        constant_op.constant(0, dtype=sequence_lengths.dtype),
        sequence_lengths - 1)
    # `alphas` is the final RNN cell state, i.e. the forward alpha values.
    _, alphas = rnn.dynamic_rnn(
        cell=forward_cell,
        inputs=rest_of_input,
        sequence_length=sequence_lengths_less_one,
        initial_state=first_input,
        dtype=dtypes.float32)
    log_norm = math_ops.reduce_logsumexp(alphas, [1])
    # Mask log_norm of the sequences with length <= zero.
    log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
                               array_ops.zeros_like(log_norm),
                               log_norm)
    return log_norm

  # Dispatch on the (dynamic) max sequence length at graph-execution time.
  max_seq_len = array_ops.shape(inputs)[1]
  return control_flow_ops.cond(pred=math_ops.equal(max_seq_len, 1),
                               true_fn=_single_seq_fn,
                               false_fn=_multi_seq_fn)

def __call__(self, inputs, state, scope=None):
  """Build the CrfForwardRnnCell.
  Args:
    inputs: A [batch_size, num_tags] matrix of unary potentials.
    state: A [batch_size, num_tags] matrix containing the previous alpha
        values.
    scope: Unused variable scope of this cell.
  Returns:
    new_alphas, new_alphas: A pair of [batch_size, num_tags] matrices
        values containing the new alpha values.
  """
  # state: [batch_size, num_tags] -> [batch_size, num_tags, 1].
  state = array_ops.expand_dims(state, 2)
  # This addition broadcasts self._transition_params along the zeroth
  # dimension and state along the second dimension. This performs the
  # multiplication of previous alpha values and the current binary potentials
  # in log space.
  transition_scores = state + self._transition_params
  new_alphas = inputs + math_ops.reduce_logsumexp(transition_scores, [1])
  # Both the state and the output of this RNN cell contain the alphas values.
  # The output value is currently unused and simply satisfies the RNN API.
  # This could be useful in the future if we need to compute marginal
  # probabilities, which would require the accumulated alpha values at every
  # time step.
  return new_alphas, new_alphas

def __call__(self, inputs, state, scope=None):
  # NOTE(review): this excerpt is the author's alternative, "merged" version of
  # the cell step; it is truncated (no return statement shown) — confirm
  # against the full source before relying on it.
  # state: [batch_size, num_tags] -> [batch_size, num_tags, 1]
  state = array_ops.expand_dims(state, 2)
  # self._transition_params broadcasts as [1, num_tags, num_tags];
  # state broadcasts as [batch_size, num_tags, 1];
  # their sum is [batch_size, num_tags, num_tags] — the "previous + obs" step.
  # NOTE(review): adding the rank-2 `inputs` ([batch_size, num_tags]) directly
  # to that rank-3 sum does not broadcast in general; `inputs` presumably needs
  # an expand_dims along axis 1 first — verify.
  transition_scores = inputs + state + self._transition_params
  # Here the score computation and the alpha update are fused, so the
  # reduction alone replaces the separate step of the forward cell:
  # new_alphas = inputs + math_ops.reduce_logsumexp(transition_scores, [1])
  new_alphas = math_ops.reduce_logsumexp(transition_scores, [1])

## viterbi_decode

import numpy as np


def viterbi_decode(score, transition_params):
  """Decode the highest scoring sequence of tags with Viterbi.

  Args:
    score: A [seq_len, num_tags] matrix of unary potentials.
    transition_params: A [num_tags, num_tags] matrix of binary potentials.

  Returns:
    viterbi: A [seq_len] list of integers containing the highest scoring tag
        indices.
    viterbi_score: A float containing the score for the Viterbi sequence.
  """
  # trellis[t, j] accumulates the best score of any path that ends in tag j at
  # step t; backpointers[t, j] records which tag at step t-1 achieved it.
  trellis = np.zeros_like(score)
  backpointers = np.zeros_like(score, dtype=np.int32)
  # Step 0 is initialized with the unary potentials alone.
  trellis[0] = score[0]

  for t in range(1, score.shape[0]):
    # Broadcasting: [num_tags, 1] previous scores + [num_tags, num_tags]
    # transitions gives v[i, j] = best-so-far(i) + transition(i -> j).
    v = np.expand_dims(trellis[t - 1], 1) + transition_params
    trellis[t] = score[t] + np.max(v, 0)
    backpointers[t] = np.argmax(v, 0)

  # Backtrack from the best final tag through the stored backpointers.
  # (Leftover debug prints of trellis/backpointers removed.)
  viterbi = [np.argmax(trellis[-1])]
  for bp in reversed(backpointers[1:]):
    viterbi.append(bp[viterbi[-1]])
  viterbi.reverse()

  viterbi_score = np.max(trellis[-1])
  return viterbi, viterbi_score

## crf_decode

# Example usage: decodes the best tag sequence (and its score) as tensors,
# inside the TF graph, from unary scores and the transition matrix.
viterbi_sequence, viterbi_score = tf.contrib.crf.crf_decode(
    unary_scores, transition_params, sequence_lengths)