### 代码实现

#### 算法参数

```epsilon = 0.9  # greediness: probability of exploiting the learned policy
alpha = 0.1  # learning rate
gamma = 0.8  # reward discount factor```

`states = range(6)  # state set: the explorer's possible positions 0..5`

```def get_next_state(state, action):
'''对状态执行动作后，得到下一状态'''
global states
# l,r,n = -1,+1,0
if action == 'right' and state != states[-1]:  # 除非最后一个状态（位置），向右就+1
next_state = state + 1
elif action == 'left' and state != states[0]:  # 除非最前一个状态（位置），向左就-1
next_state = state - 1
else:
next_state = state
return next_state```

`actions = ['left', 'right']  # action set`

```def get_valid_actions(state):
'''取当前状态下的合法动作集合，与reward无关！'''
global actions  # ['left', 'right']
valid_actions = set(actions)
if state == states[-1]:  # 最后一个状态（位置），则
valid_actions -= set(['right'])  # 不能向右
if state == states[0]:  # 最前一个状态（位置），则
valid_actions -= set(['left'])  # 不能向左
return list(valid_actions)```

`rewards = [0, 0, 0, 0, 0, 1]  # reward set: only reaching the terminal state pays 1`

#### Q table

Q table是一种记录状态-行为值 (Q value) 的表。常见的Q table都是二维的，但也有三维的Q table。

`q_table = pd.DataFrame(data=[[0 for _ in actions] for _ in states], index=states, columns=actions)  # Q table: one row per state, one column per action, initialised to 0`

#### Q-learning算法实现

```for i in range(13):
# current_state = random.choice(states)
current_state = 0
update_env(current_state)  # 环境相关
total_steps = 0  # 环境相关
while current_state != states[-1]:
if (random.uniform(0, 1) > epsilon) or ((q_table.loc[current_state] == 0).all()):  # 探索
current_action = random.choice(get_valid_actions(current_state))
else:
current_action = q_table.loc[current_state].idxmax()  # 利用（贪婪）
next_state = get_next_state(current_state, current_action)
next_state_q_values = q_table.loc[next_state, get_valid_actions(next_state)]
q_table.loc[current_state, current_action] += alpha * (
rewards[next_state] + gamma * next_state_q_values.max() - q_table.loc[current_state, current_action])
current_state = next_state
update_env(current_state)  # 环境相关
total_steps += 1  # 环境相关
print('\rEpisode {}: total_steps = {}'.format(i, total_steps), end='')  # 环境相关
time.sleep(2)  # 环境相关
print('\r                                ', end='')  # 环境相关
print('
q_table:')
print(q_table)```

#### 更新状态

```def update_env(state):
global states
env = list('-----T')
if state != states[-1]:
env[state] = '0'
print('\r{}'.format(''.join(env)), end='')
time.sleep(0.1)```

### 完整代码

```import pandas as pd
import random
import time
#########参数
epsilon = 0.9  # 贪婪度
alpha = 0.1  # 学习率
gamma = 0.8  # 奖励递减值
#####探索者的状态,即可到达的位置
states = range(6)  # 状态集
actions = ['left', 'right']  # 动作集
rewards = [0, 0, 0, 0, 0, 1]  # 奖励集
q_table = pd.DataFrame(data=[[0 for _ in actions] for _ in states], index=states, columns=actions)
def update_env(state):
global states
env = list('-----T')
if state != states[-1]:
env[state] = '0'
print('\r{}'.format(''.join(env)), end='')
time.sleep(0.1)
def get_next_state(state, action):
'''对状态执行动作后，得到下一状态'''
global states
# l,r,n = -1,+1,0
if action == 'right' and state != states[-1]:  # 除非最后一个状态（位置），向右就+1
next_state = state + 1
elif action == 'left' and state != states[0]:  # 除非最前一个状态（位置），向左就-1
next_state = state - 1
else:
next_state = state
return next_state
def get_valid_actions(state):
'''取当前状态下的合法动作集合，与reward无关！'''
global actions  # ['left', 'right']
valid_actions = set(actions)
if state == states[-1]:  # 最后一个状态（位置），则
valid_actions -= set(['right'])  # 不能向右
if state == states[0]:  # 最前一个状态（位置），则
valid_actions -= set(['left'])  # 不能向左
return list(valid_actions)
for i in range(13):
# current_state = random.choice(states)
current_state = 0
update_env(current_state)  # 环境相关
total_steps = 0  # 环境相关
while current_state != states[-1]:
if (random.uniform(0, 1) > epsilon) or ((q_table.loc[current_state] == 0).all()):  # 探索
current_action = random.choice(get_valid_actions(current_state))
else:
current_action = q_table.loc[current_state].idxmax()  # 利用（贪婪）
next_state = get_next_state(current_state, current_action)
next_state_q_values = q_table.loc[next_state, get_valid_actions(next_state)]
q_table.loc[current_state, current_action] += alpha * (
rewards[next_state] + gamma * next_state_q_values.max() - q_table.loc[current_state, current_action])
current_state = next_state
update_env(current_state)  # 环境相关
total_steps += 1  # 环境相关
print('\rEpisode {}: total_steps = {}'.format(i, total_steps), end='')  # 环境相关
time.sleep(2)  # 环境相关
print('\r                                ', end='')  # 环境相关
print('
q_table:')
print(q_table)```