## 审视一下现在的 model，还缺一些推导

human state（\$s^H_t\$）跟 robot action（\$a^R_t\$） 会决定 human action（\$a^H_t\$）
\$s^H_t\$、\$a^R_t\$ 跟 \$a^H_t\$ 会决定下一时刻 t+1 的 human state
…依此类推

## 开始推导成 MDP

\$\$ S = S^H * S^W\$\$

## 程式码

1. 定义 state

1. 定义一些基本的转换机率（为了计算transition matrix）

1. 计算 transition matrix

1. 定义 reward

1. 用 value iteration 算出 optimal policy

```%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% CSCI 699: Computational Human-Robot Interaction %%%%%
%%%%% Fall 2018, University of Southern California    %%%%%
%%%%% Author: Stefanos Nikolaidis, [email protected]   %%%%%
%%%%% Commented by: Po-Jen Lai, [email protected]      %%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Reset the workspace and close any open figure windows before running the script.
clear all
close all

## 定义 state

% Names of the human trust levels, indexed by Label.NO_TRUST / Label.TRUST.
human_states_str = {'no_trust', 'trust'};

% Symbolic indices used throughout the script:
%   BOTTLE/GLASS          - object ids (also the columns of world_states and
%                           the robot action ids, since action k picks object k)
%   ON_TABLE/PICKED_BY_*  - per-object location codes
%   NO_TRUST/TRUST        - human trust levels
Label = struct( ...
    'BOTTLE', 1, 'GLASS', 2, ...
    'ON_TABLE', 1, 'PICKED_BY_ROBOT', 2, 'PICKED_BY_HUMAN', 3, ...
    'NO_TRUST', 1, 'TRUST', 2);

% Printable names for object locations and for the robot's actions.
objstate_str = {'on_table', 'picked_robot', 'picked_human'};
ractions_str = {'pick_bottle', 'pick_glass'};

## 定义一些基本的转换机率（为了计算 transition matrix）

%{no_trust,trust} x {bottle,glass}: probability that the human's trust
%increases when the robot picks the given object.  Trust can only increase
%from the low state, hence the zero second row.
PROB_TRUST_INCREASE = [0.8 0.9;
0.0 0.0];

%{no_trust,trust} x {bottle,glass}: probability that the human intervenes
%and grabs the object before the robot can pick it.
PROB_TRUST_INTERVENE = [0.3 0.8;
0.1 0.2];

%Enumerate every world state as a [bottle_location, glass_location] pair
%(3 locations each -> 9 world states).  NOTE: the original comments said
%"for each object", but the loops actually range over the two objects'
%locations, not over objects.
counter = 1;
for ii = 1:3 %for each bottle location
for jj = 1:3 %for each glass location
world_states(counter,:) = [ii,jj];
counter = counter + 1;
end
end
num_human_states = length(human_states_str);
num_world_states = size(world_states,1); %equals counter-1; derived from the data
num_trust_states = num_human_states;     %keep in sync with human_states_str
num_states = num_world_states*num_trust_states;
num_ractions = length(ractions_str);

## 计算 transition matrix

%Trans(s, a, s') = P(s' | s, a).  Joint states are flattened as
%ss = (sh-1)*num_world_states + sw, i.e. the world state is the fast index.
Trans = zeros(num_states,num_ractions,num_states);

for sh = 1:num_human_states
for sw = 1:num_world_states
for ra = 1:num_ractions
world_state = world_states(sw,:);
ss = (sh-1)*num_world_states + sw; %flattened (trust, world) source state

%Outcome 1: the human does not intervene and the robot picks object ra.
%(Action id ra doubles as the object id, so column ra of the probability
%matrices corresponds to the object being picked.)
%picked by robot
new_world_state = world_state;
new_world_state(ra) =Label.PICKED_BY_ROBOT;
%findWorldState: helper defined elsewhere in the project; maps a
%[bottle_location, glass_location] pair to its row index in world_states.
nsw = findWorldState(new_world_state,world_states);

%Outcome 1a: robot picks, trust unchanged.
nsh = sh; %trust stays the same
nss = (nsh-1)*num_world_states + nsw;
Trans(ss,ra,nss) = (1-PROB_TRUST_INTERVENE(sh,ra))*(1-PROB_TRUST_INCREASE(sh,ra));

%Outcome 1b: robot picks and trust increases (only possible from no_trust).
if sh == 1 %trust can increase only if low
nsh = sh+1;
nss = (nsh-1)*num_world_states + nsw;
Trans(ss,ra,nss) = (1-PROB_TRUST_INTERVENE(sh,ra))*PROB_TRUST_INCREASE(sh,ra);
end

%Outcome 2: the human intervenes and picks object ra first; trust unchanged.
%The three outcome probabilities sum to 1 for every (ss, ra).
%picked by human
new_world_state = world_state;
new_world_state(ra) =Label.PICKED_BY_HUMAN;
nsw = findWorldState(new_world_state,world_states);

nsh = sh; %trust stays the same
nss = (nsh-1)*num_world_states + nsw;
Trans(ss,ra,nss) = PROB_TRUST_INTERVENE(sh,ra);
end
end
end

%Print the nonzero transitions for every state where the requested action
%is feasible (the object is still on the table).
disp(' ')
disp('Transition Matrix')
for ss = 1:num_states
for ra = 1:num_ractions
%Decode the flattened index: ss = (sh-1)*num_world_states + sw.
%The previous floor(ss/(num_world_states+1))+1 was only coincidentally
%correct for exactly 2 trust levels; this form is correct in general.
sh = floor((ss-1)/num_world_states) + 1;
sw = ss - (sh-1)*num_world_states;
nIndices = find(Trans(ss,ra,:)>0);
%do not worry about invalid actions
if (world_states(sw,ra) == Label.ON_TABLE)
%Use {} indexing so strcat receives char arrays and returns a plain
%string (with (), strcat returns a 1x1 cell and disp prints the cell).
str = strcat('if~', human_states_str{sh}, ' and bottle is~', objstate_str{world_states(sw,1)}, ' and glass is~', objstate_str{world_states(sw,2)}, ' and robot does~', ractions_str{ra}, ':');
disp(str);
for nn = 1:length(nIndices)
nss = nIndices(nn);

%Decode the successor state with the same general formula.
nsh = floor((nss-1)/num_world_states) + 1;
nsw = nss - (nsh-1)*num_world_states;
str = strcat('then the prob of ~', human_states_str{nsh}, ' and bottle is~', objstate_str{world_states(nsw,1)}, ' and glass is~', objstate_str{world_states(nsw,2)}, ':', num2str(Trans(ss,ra,nss)));
disp(str);
end
end
end
end

## 定义 reward

%reward function: Rew(ss, ra) = immediate reward for taking action ra in ss.
Rew = zeros(num_states,num_ractions);
for ss = 1:num_states
for ra = 1:num_ractions
%Decode the flattened index (general formula, valid for any state counts;
%the previous floor(ss/(num_world_states+1))+1 only worked for 2 trust levels).
sh = floor((ss-1)/num_world_states) + 1;
sw = ss - (sh-1)*num_world_states;

%bonus for states where the robot picked the glass while the bottle is
%still on the table, i.e. the robot started with the glass
if (world_states(sw,Label.GLASS)==Label.PICKED_BY_ROBOT)&& (world_states(sw,Label.BOTTLE)==Label.ON_TABLE)
Rew(ss,ra) = 5;
end

%if we are not in a final state
if (world_states(sw,1)==Label.ON_TABLE) || (world_states(sw,2)==Label.ON_TABLE)
if world_states(sw,ra)~= Label.ON_TABLE %penalize infeasible actions
Rew(ss,ra) = -1000;
end
end
end
end

%goal: both objects picked by the robot; rewarded at every trust level
goal = findWorldState([Label.PICKED_BY_ROBOT, Label.PICKED_BY_ROBOT],world_states);
for ra = 1:num_ractions
for sh = 1:num_trust_states %was hard-coded 1:2; identical value, consistent
ss = (sh-1)*num_world_states + goal;
Rew(ss,ra) = 10;
end
end

%print reward function
disp(' ')
disp('reward function')
for ss = 1:num_states
for ra = 1:num_ractions
%Decode the flattened index with the general formula (the previous
%floor(ss/(num_world_states+1))+1 only worked for 2 trust levels).
sh = floor((ss-1)/num_world_states) + 1;
sw = ss - (sh-1)*num_world_states;
%Fixes vs original: {} indexing so strcat returns a plain string rather
%than a cell, and a leading space added before 'and robot action is~'.
str = strcat('if~', human_states_str{sh}, ' and bottle is~', objstate_str{world_states(sw,1)}, ' and glass is~', objstate_str{world_states(sw,2)}, ' and robot action is~', ractions_str{ra}, ' then reward is: ', num2str(Rew(ss,ra)));
disp(str);
end
end

## 用 value iteration 算出 optimal policy

%value iteration: finite-horizon backward induction over T steps.
T = 3;                               %planning horizon
V = zeros(num_states,1);             %value of the next time step (t+1)
policy = zeros(num_states,T);        %optimal action per state and time step
new_V = zeros(num_states,1);
Q = zeros(num_states, num_ractions); %action values, kept for inspection
for tt = T:-1:1
for ss = 1:num_states

%Decode the flattened index (general formula; the previous
%floor(ss/(num_world_states+1))+1 only worked for 2 trust levels).
%A leftover "if ss == 12, debug = 1" probe was removed here.
sh = floor((ss-1)/num_world_states) + 1;
sw = ss - (sh-1)*num_world_states;

%terminal state: neither object remains on the table, so value is the
%immediate reward (identical for both actions) and the action is moot
if ((world_states(sw,1)~=Label.ON_TABLE) && (world_states(sw,2)~=Label.ON_TABLE))
new_V(ss) = Rew(ss,1);
policy(ss,tt) = 1;
continue;
end

%Bellman backup: pick the action maximizing immediate reward plus the
%expected value of the successor state.
maxV = -1e6;
maxIndx = -1;
for ra = 1:num_ractions
res = Rew(ss,ra);
for nss = 1:num_states
res = res + Trans(ss,ra,nss)*V(nss);
end
Q(ss,ra) = res;
if res > maxV
maxV = res;
maxIndx = ra;
end
end
new_V(ss) = maxV;
policy(ss,tt) = maxIndx;
end
V = new_V;
end

disp(' ')
disp('policy')
%print policy for the first time step only
for tt = 1
tt
for ss = 1:num_states
%Decode the flattened index with the general formula (the previous
%floor(ss/(num_world_states+1))+1 only worked for 2 trust levels).
sh = floor((ss-1)/num_world_states) + 1;
sw = ss - (sh-1)*num_world_states;
if ((world_states(sw,1) == Label.ON_TABLE)||(world_states(sw,2) == Label.ON_TABLE)) %we care only about feasible states
%Fixes vs original: {} indexing so strcat returns a plain string, and a
%leading space added before 'then robot does: '.
str = strcat('if~', human_states_str{sh}, ' and bottle is~', objstate_str{world_states(sw,1)}, ' and glass is~', objstate_str{world_states(sw,2)}, ' then robot does: ', ractions_str{policy(ss,tt)});
disp(str);
end
end
end```