# 语言模型教程

## 简介

the dog barks STOP
the STOP
STOP

$\sum_{\langle x_1, x_2, \ldots, x_n \rangle \in \mathcal{V}^{+}} p(x_1, x_2, \ldots, x_n) = 1$

## 马尔科夫模型

### Trigram语言模型

Trigram语言模型包括一个词典集合$\mathcal{V}$和参数$q(w \vert u,v)$，其中

$q(w \vert u, v)$可以认为是看到bigram(u,v)之后出现w的概率。Trigram可以计算一个句子$x_1, x_2, \ldots, x_n$（其中$x_n=\text{STOP}$）的概率：$p(x_1, x_2, \ldots, x_n)=\prod_{i=1}^{n} q(x_i \vert x_{i-2}, x_{i-1})$

## 语言模型的平滑

，其中$\beta$是一个0-1之间的数。我们用这个打折后的

ptb.char.test.txt
ptb.char.train.txt
ptb.char.valid.txt
ptb.test.txt
ptb.train.txt
ptb.valid.txt
README

1  aer banknote berlitz calloway centrust cluett fromstein gitano guterman hydro-quebec ipo kia memotec mlx nahb punts rake regatta rubens sim snack-food ssangyong swapo wachter
2  pierre <unk> N years old will join the board as a nonexecutive director nov. N
3  mr. <unk> is chairman of <unk> n.v. the dutch publishing group

，用来表示未登录词，通常的方法是把训练数据中频率低于某个值的都替换成

，而只在测试数据中出现、没有在训练数据中出现的词也都替换成

def ptb_raw_data(data_path=None):
    """Load the raw PTB dataset rooted at `data_path`.

    Builds the vocabulary from the training split, then converts each of the
    train/valid/test text files into a sequence of word ids.

    Returns a tuple (train_data, valid_data, test_data, vocabulary_size).
    """
    paths = {
        split: os.path.join(data_path, "ptb.%s.txt" % split)
        for split in ("train", "valid", "test")
    }

    # The vocabulary is derived from the training data only.
    word_to_id = _build_vocab(paths["train"])
    train_data = _file_to_word_ids(paths["train"], word_to_id)
    valid_data = _file_to_word_ids(paths["valid"], word_to_id)
    test_data = _file_to_word_ids(paths["test"], word_to_id)
    return train_data, valid_data, test_data, len(word_to_id)

def ptb_producer(raw_data, batch_size, num_steps, name=None):
"""Iterator over the PTB data.

raw_data: one of the id arrays returned by ptb_raw_data.
batch_size: the batch size.
num_steps: number of unrolled time steps (the sentence length used in training).
name: optional name for the op scope.

Returns two tensors, each shaped [batch_size, num_steps]; the second is the
first shifted right by one time step. For the sentence "it is a good day"
the first tensor is "it is a good" and the second is "is a good day".
"""
with tf.name_scope(name, "PTBProducer", [raw_data, batch_size, num_steps]):
raw_data = tf.convert_to_tensor(raw_data, name="raw_data", dtype=tf.int32)

# Truncate the flat id sequence so it divides evenly into batch_size rows.
data_len = tf.size(raw_data)
batch_len = data_len // batch_size
data = tf.reshape(raw_data[0 : batch_size * batch_len],
[batch_size, batch_len])

# Number of (x, y) slices per epoch; "- 1" leaves room for the shifted targets.
epoch_size = (batch_len - 1) // num_steps
assertion = tf.assert_positive(
epoch_size,
message="epoch_size == 0, decrease batch_size or num_steps")
with tf.control_dependencies([assertion]):
epoch_size = tf.identity(epoch_size, name="epoch_size")

# Queue yielding slice indices 0 .. epoch_size-1 in order (no shuffling).
i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
x = tf.strided_slice(data, [0, i * num_steps],
[batch_size, (i + 1) * num_steps])
x.set_shape([batch_size, num_steps])
# Targets: the same slice shifted one step to the right.
y = tf.strided_slice(data, [0, i * num_steps + 1],
[batch_size, (i + 1) * num_steps + 1])
y.set_shape([batch_size, num_steps])
return x, y

it is a good day <eos> I am <unk> of that <eos> .....       it is funny <eos>

# Reshape the flat id sequence into batch_size rows of length batch_len.
data = tf.reshape(raw_data[0 : batch_size * batch_len],
[batch_size, batch_len])

# Each epoch yields this many [batch_size, num_steps] slices; the "- 1"
# leaves room for the one-step-shifted targets.
epoch_size = (batch_len - 1) // num_steps

# Dequeue the next slice index i (0, 1, ..., epoch_size - 1, in order).
i = tf.train.range_input_producer(epoch_size, shuffle=False).dequeue()
x = tf.strided_slice(data, [0, i * num_steps],
[batch_size, (i + 1) * num_steps])
x.set_shape([batch_size, num_steps])
# Targets are the inputs shifted right by one time step.
y = tf.strided_slice(data, [0, i * num_steps + 1],
[batch_size, (i + 1) * num_steps + 1])
y.set_shape([batch_size, num_steps])
return x, y

#### 定义模型

# Embedding lookup is pinned to the CPU.
with tf.device("/cpu:0"):
embedding = tf.get_variable(
"embedding", [vocab_size, size], dtype=data_type())
inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

# Dropout on the embedded inputs, during training only.
if is_training and config.keep_prob < 1:
inputs = tf.nn.dropout(inputs, config.keep_prob)

output, state = self._build_rnn_graph(inputs, config, is_training)

# Output projection: map RNN outputs to vocabulary logits.
softmax_w = tf.get_variable(
"softmax_w", [size, vocab_size], dtype=data_type())
softmax_b = tf.get_variable("softmax_b", [vocab_size], dtype=data_type())
logits = tf.nn.xw_plus_b(output, softmax_w, softmax_b)
# Reshape from [batch*time, vocab_size] to [self.batch_size, self.num_steps, vocab_size]
logits = tf.reshape(logits, [self.batch_size, self.num_steps, vocab_size])

# Compute the loss: cross-entropy with uniform weight 1 for every position,
# averaged over the batch but not over time steps.
loss = tf.contrib.seq2seq.sequence_loss(
logits,
input_.targets,
tf.ones([self.batch_size, self.num_steps], dtype=data_type()),
average_across_timesteps=False,
average_across_batch=True)

self._cost = tf.reduce_sum(loss)
self._final_state = state

logits shape是[batch_size, sequence_length, num_decoder_symbols]，dtype是float
targets 真实的label序列，shape是[batch_size, sequence_length]，dtype要求是int
average_across_timesteps 如果是True，那么返回的loss会对timestep这个维度求平均值
average_across_batch 如果是True，返回的loss会对batch这个维度求平均值
softmax_loss_function 如果是None，使用默认的softmax函数，调用的时候也可以自己提供softmax函数
name

for step in range(model.input.epoch_size):
# Construction of feed_dict is omitted here.
vals = session.run(fetches, feed_dict)
cost = vals["cost"]

# Accumulate the total cost and the number of processed time steps,
# then report perplexity = exp(total cost / total steps).
costs += cost
iters += model.input.num_steps
ppl = np.exp(costs / iters)

def _build_rnn_graph(self, inputs, config, is_training):
    """Dispatch to the cuDNN or plain-LSTM graph builder based on rnn_mode."""
    builder = (self._build_rnn_graph_cudnn
               if config.rnn_mode == CUDNN
               else self._build_rnn_graph_lstm)
    return builder(inputs, config, is_training)

def _build_rnn_graph_lstm(self, inputs, config, is_training):
"""Build the manually unrolled multi-layer LSTM graph; returns (output, state)."""
def make_cell():
# One LSTM layer, with output dropout applied during training.
cell = self._get_lstm_cell(config, is_training)
if is_training and config.keep_prob < 1:
cell = tf.contrib.rnn.DropoutWrapper(
cell, output_keep_prob=config.keep_prob)
return cell

cell = tf.contrib.rnn.MultiRNNCell(
[make_cell() for _ in range(config.num_layers)], state_is_tuple=True)

self._initial_state = cell.zero_state(config.batch_size, data_type())
state = self._initial_state

# Manual unroll, equivalent to tf.nn.static_rnn().
# The unrolling is done by hand here purely to show how it works.
# In practice you should use tf.nn.static_rnn() or
# tf.nn.static_state_saving_rnn(), i.e.:
#
# inputs = tf.unstack(inputs, num=self.num_steps, axis=1)
# outputs, state = tf.nn.static_rnn(cell, inputs,
#                                   initial_state=self._initial_state)
outputs = []
with tf.variable_scope("RNN"):
for time_step in range(self.num_steps):
# Reuse the same LSTM weights at every time step after the first.
if time_step > 0: tf.get_variable_scope().reuse_variables()
(cell_output, state) = cell(inputs[:, time_step, :], state)
outputs.append(cell_output)
# Concatenate per-step outputs and flatten to [batch * num_steps, hidden_size].
output = tf.reshape(tf.concat(outputs, 1), [-1, config.hidden_size])
return output, state

def _get_lstm_cell(self, config, is_training):
    """Create a single LSTM cell of the kind selected by config.rnn_mode."""
    mode = config.rnn_mode
    if mode == BASIC:
        # reuse=not is_training lets eval/test graphs share the training weights.
        return tf.contrib.rnn.BasicLSTMCell(
            config.hidden_size, forget_bias=0.0, state_is_tuple=True,
            reuse=not is_training)
    if mode == BLOCK:
        return tf.contrib.rnn.LSTMBlockCell(
            config.hidden_size, forget_bias=0.0)
    raise ValueError("rnn_mode %s not supported" % mode)

def _build_rnn_graph_cudnn(self, inputs, config, is_training):
"""Build the LSTM graph with the fused cuDNN kernel; returns (outputs, state)."""
# CudnnLSTM expects time-major input: [num_steps, batch_size, hidden_size].
inputs = tf.transpose(inputs, [1, 0, 2])
self._cell = tf.contrib.cudnn_rnn.CudnnLSTM(
num_layers=config.num_layers,
num_units=config.hidden_size,
input_size=config.hidden_size,
dropout=1 - config.keep_prob if is_training else 0)
# cuDNN packs all layer weights into one flat parameter buffer.
params_size_t = self._cell.params_size()
self._rnn_params = tf.get_variable(
"lstm_params",
initializer=tf.random_uniform(
[params_size_t], -config.init_scale, config.init_scale),
validate_shape=False)
# Zero initial cell (c) and hidden (h) state for every layer.
c = tf.zeros([config.num_layers, self.batch_size, config.hidden_size],
tf.float32)
h = tf.zeros([config.num_layers, self.batch_size, config.hidden_size],
tf.float32)
self._initial_state = (tf.contrib.rnn.LSTMStateTuple(h=h, c=c),)
outputs, h, c = self._cell(inputs, h, c, self._rnn_params, is_training)
# Back to batch-major, then flatten to [batch * num_steps, hidden_size].
outputs = tf.transpose(outputs, [1, 0, 2])
outputs = tf.reshape(outputs, [-1, config.hidden_size])
return outputs, (tf.contrib.rnn.LSTMStateTuple(h=h, c=c),)