Cart pole的玩法如下动图所示，目标就是保持一根杆一直竖直朝上。杆由于重力原因会一直倾斜，当杆倾斜到一定程度就会倒下，此时需要朝左或者朝右移动小车保证杆不会倒下来。这和在指尖上竖立一支铅笔类似，只不过cart pole是一维的。

## 搭建基础框架

```import gym
import numpy as np
env = gym.make('CartPole-v1')```

```def play(env, policy):
observation = env.reset()```

```done = False
score = 0
observations = []```

```for _ in range(5000):
observations += [observation.tolist()] # Record the observations for normalization and replay
if done: # If the simulation was over last iteration, exit loop
break
# Pick an action according to the policy matrix
outcome = np.dot(policy, observation)
action = 1 if outcome > 0 else 0
# Make the action, record reward
observation, reward, done, info = env.step(action)
score += reward
return score, observations```

```outcome = np.dot(policy, observation)
action = 1 if outcome > 0 else 0```

```import gym
import numpy as np
env = gym.make('CartPole-v1')
def play(env, policy):
observation = env.reset()
done = False
score = 0
observations = []
for _ in range(5000):
observations += [observation.tolist()] # Record the observations for normalization and replay
if done: # If the simulation was over last iteration, exit loop
break
# Pick an action according to the policy matrix
outcome = np.dot(policy, observation)
action = 1 if outcome > 0 else 0
# Make the action, record reward
observation, reward, done, info = env.step(action)
score += reward
return score, observations```

## 开始第一局游戏

`policy = np.random.rand(1,4)`

```score, observations = play(env, policy)
print('Policy Score', score)```

## 观察智能体

```from flask import Flask
import json
@app.route("/data")
def data():
return json.dumps(observations)
@app.route('/')
def root():
return app.send_static_file('./index.html')
app.run(host='0.0.0.0', port='3000')```

## 策略搜索

`max = (0, [], [])`

```for _ in range(10):
policy = np.random.rand(1,4)
score, observations = play(env, policy)
if score > max[0]:
max = (score, observations, policy)
print('Max Score', max[0])```

```@app.route("/data")
def data():
return json.dumps(observations)```

```@app.route("/data")
def data():
return json.dumps(max[1])```

```import gym
import numpy as np
env = gym.make('CartPole-v1')
def play(env, policy):
observation = env.reset()
done = False
score = 0
observations = []
for _ in range(5000):
observations += [observation.tolist()] # Record the observations for normalization and replay
if done: # If the simulation was over last iteration, exit loop
break
# Pick an action according to the policy matrix
outcome = np.dot(policy, observation)
action = 1 if outcome > 0 else 0
# Make the action, record reward
observation, reward, done, info = env.step(action)
score += reward
return score, observations
max = (0, [], [])
for _ in range(10):
policy = np.random.rand(1,4)
score, observations = play(env, policy)
if score > max[0]:
max = (score, observations, policy)
print('Max Score', max[0])
import json
@app.route("/data")
def data():
return json.dumps(max[1])
@app.route('/')
def root():
return app.send_static_file('./index.html')
app.run(host='0.0.0.0', port='3000')```

## 不足之处

```import gym
import numpy as np
env = gym.make('CartPole-v1')
def play(env, policy):
observation = env.reset()
done = False
score = 0
observations = []
for _ in range(5000):
observations += [observation.tolist()] # Record the observations for normalization and replay
if done: # If the simulation was over last iteration, exit loop
break
# Pick an action according to the policy matrix
outcome = np.dot(policy, observation)
action = 1 if outcome > 0 else 0
# Make the action, record reward
observation, reward, done, info = env.step(action)
score += reward
return score, observations
max = (0, [], [])
# We changed the next two lines!
for _ in range(100):
policy = np.random.rand(1,4) - 0.5
score, observations = play(env, policy)
if score > max[0]:
max = (score, observations, policy)
print('Max Score', max[0])
import json
@app.route("/data")
def data():
return json.dumps(max[1])
@app.route('/')
def root():
return app.send_static_file('./index.html')
app.run(host='0.0.0.0', port='3000')```

```scores = []
for _ in range(100):
score, _  = play(env, max[2])
scores += [score]
print('Average Score (100 trials)', np.mean(scores))```

## 结语

• 找到一个真正的最优策略
• 减少找到最优策略的次数
• 研究如何能找到正确的策略，而不是随机进行选择
• 解决其他环境。