## Q学习

Q学习（Q-Learning）学习的是在不同状态下执行各个动作的质量（即Q值）。其算法用伪代码定义如下：

```
initialize Q[num_states, num_actions] arbitrarily
observe initial state s
repeat
    select and carry out an action a
    observe reward R and new state s'
    Q[s, a] = Q[s, a] + α(R + γ·max_a' Q[s', a'] - Q[s, a])
    s = s'
until terminated
```

## 实现Q学习

```ROWS = 5
COLUMNS = 6
ENTRANCE = (0, 0)
EXIT = (4, 5)
BARRIERS = list()
BARRIERS.append((1, 1))
BARRIERS.append((2, 1))
BARRIERS.append((3, 1))
BARRIERS.append((4, 1))
BARRIERS.append((0, 3))
BARRIERS.append((1, 3))
BARRIERS.append((3, 3))
BARRIERS.append((4, 3))
BARRIERS.append((3, 4))
BARRIERS.append((1, 5))```

```TIMES = 200  # number of training episodes
R = 0.05  # exploration rate (epsilon): probability of taking a random action
ALPHA = 0.1  # learning rate
GAMMA = 0.9  # discount factor for future rewards
q_values = dict()  # maps (row, col, action) -> learned Q value
results = list()  # NOTE(review): presumably filled by the training loop (not visible here) -- confirm```

```def init_q_values():
for row in range(0, ROWS):
for col in range(0, COLUMNS):
state = State(row, col)
for action in Actions:
q = (state.row, state.col, action)
q_values[q] = 0```

```def move(curr_state, action):
new_state = State(curr_state.row, curr_state.col)
# check borders
if action == Actions.up:
if (new_state.row - 1) >= 0:
new_state.row -= 1
elif action == Actions.down:
if (new_state.row + 1) <= (ROWS - 1):
new_state.row += 1
elif action == Actions.left:
if (new_state.col - 1) >= 0:
new_state.col -= 1
elif action == Actions.right:
if (new_state.col + 1) <= (COLUMNS - 1):
new_state.col += 1
return new_state```

```def explore(curr_state):
rand = random.random()
if rand <= R:
return random.choice(list(Actions))
else:
best = list()
best_action = Actions.up
best_value = -10000000
for action in Actions:
q = (curr_state.row, curr_state.col, action)
if q_values[q] > best_value:
best_action = action
best_value = q_values[q]
best.append(best_action)
# perhaps it has not only one best action
for action in Actions:
q = (curr_state.row, curr_state.col, action)
if action != best_action:
if q_values[q] == best_value:
best.append(action)
return random.choice(best)```

```def update(curr_state, last_action):
q = (curr_state.row, curr_state.col, last_action)
new_state = move(curr_state, last_action)
position = (new_state.row, new_state.col)
reward = -1
if position == EXIT:
reward = 0
elif position in BARRIERS:
reward = -100
old_value = q_values[q]
max_new = max([q_values[(new_state.row, new_state.col, a)] for a in Actions])
q_values[q] = old_value + ALPHA * (reward + (GAMMA * max_new) - old_value)
curr_state.row = new_state.row
curr_state.col = new_state.col```

```Actions.right  # NOTE(review): this looks like the recorded action sequence of the
Actions.right  # learned path from ENTRANCE (0, 0) toward EXIT (4, 5); the training
Actions.down   # loop that produced it is not visible in this chunk -- confirm.
Actions.down
Actions.right
Actions.right
Actions.right
Actions.down
Actions.down
Actions.down```