import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import IPython
My notes are in green
Lecture video
https://www.youtube.com/watch?v=Z4D70gCZwVU&list=PLQqh36zP38-zHvVuJ92xfdypwHwDFgg8k&index=1
Game2: 4x4 grid
- Problem description: how to train an agent that moves up, down, left, and right in a 4x4 grid world so that it reaches the goal point.
imports
Preliminaries: visualization
def show(states):
    fig = plt.Figure()
    ax = fig.subplots()
    ax.matshow(np.zeros([4,4]), cmap='bwr', alpha=0.0)
    sc = ax.scatter(0, 0, color='red', s=500)
    ax.text(0, 0, 'start', ha='center', va='center')
    ax.text(3, 3, 'end', ha='center', va='center')
    # Adding grid lines to the plot
    ax.set_xticks(np.arange(-.5, 4, 1), minor=True)
    ax.set_yticks(np.arange(-.5, 4, 1), minor=True)
    ax.grid(which='minor', color='black', linestyle='-', linewidth=2)
    def update(t):
        sc.set_offsets(states[t])
    ani = FuncAnimation(fig, update, frames=len(states))
    display(IPython.display.HTML(ani.to_jshtml()))
show([[0,0],[0,1],[1,1],[1,2],[1,3],[1,2],[1,3],[1,2],[1,3],[1,2],[1,3]])
Implementing the Env class
- GridWorld: a basic simulation environment frequently used as an example in reinforcement learning (see the short sketch after this list)
- State: each grid cell is one state, and the agent occupies one of these states.
- Action: to move from the current state to the next, the agent takes one of four actions: up, down, left, or right.
- Reward: the reward obtained when the agent takes a particular action in the current state.
- Terminated: a flag indicating that an episode has ended.
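Before the class itself, a minimal sketch (my own illustration, not a cell from the lecture) of the two gymnasium spaces the environment will use:
state_space = gym.spaces.MultiDiscrete([4, 4])   # 16 cells: (x, y) with x, y in {0, 1, 2, 3}
action_space = gym.spaces.Discrete(4)            # 4 actions: 0: x+, 1: y+, 2: x-, 3: y-
state_space.sample(), action_space.sample()      # e.g. (array([2, 1]), 3)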
action = 3
current_state = np.array([1,1])
action_to_direction = {
    0 : np.array([1, 0]),  # x+
    1 : np.array([0, 1]),  # y+
    2 : np.array([-1, 0]), # x-
    3 : np.array([0, -1])  # y-
}
next_state = current_state + action_to_direction[action]
next_state
array([1, 0])
class GridWorld:
    def __init__(self):
        self.reset()
        self.state_space = gym.spaces.MultiDiscrete([4,4])
        self.action_space = gym.spaces.Discrete(4)
        self._action_to_direction = {
            0 : np.array([1, 0]),  # x+
            1 : np.array([0, 1]),  # y+
            2 : np.array([-1, 0]), # x-
            3 : np.array([0, -1])  # y-
        }
    def reset(self):
        self.agent_action = None
        self.agent_state = np.array([0,0])
        return self.agent_state
    def step(self, action):
        direction = self._action_to_direction[action]
        self.agent_state = self.agent_state + direction
        if self.agent_state not in self.state_space:  # the agent left the 4x4 grid
            reward = -10
            terminated = True
            self.agent_state = self.agent_state - 1/2 * direction
        elif np.array_equal(self.agent_state, np.array([3,3])):  # the agent reached the goal
            reward = 100
            terminated = True
        else:
            reward = -1
            terminated = False
        return self.agent_state, reward, terminated
If the agent leaves the grid, the reward is set to -10.
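As a quick sanity check (my own addition, not a cell from the lecture): stepping off the grid from the start state should end the episode immediately with reward -10, with the agent pushed back half a cell.
env = GridWorld()
env.reset()   # agent starts at (0, 0)
env.step(3)   # action 3 moves to y = -1, outside the grid
# expected result: (array([ 0. , -0.5]), -10, True)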
env = GridWorld()
states = []
state = env.reset()
states.append(state)
for t in range(50):
    action = env.action_space.sample()
    state, reward, terminated = env.step(action)
    states.append(state)
    if terminated: break
The case where the agent is clueless
states
[array([0, 0]),
array([0, 1]),
array([0, 0]),
array([1, 0]),
array([2, 0]),
array([ 2. , -0.5])]
show(states)
Implementing the Agent1 class + Run
- The functionality we want to implement:
- .act(): decides the action –> here, just a random action.
- .save_experience(): stores the data –> let's focus on this first.
- .learn(): learns from the data –> skip for now.
- First attempt
class Agent1:
def __init__(self,env):
self.action_space = env.action_space
self.state_space = env.state_space
self.n_experiences = 0
self.n_episodes = 0
self.score = 0
# episode-wise info
self.scores = []
self.playtimes = []
# time-wise info
self.current_state = None
self.action = None
self.reward = None
self.next_state = None
self.terminated = None
# replay_buffer
self.actions = []
self.current_states = []
self.rewards = []
self.next_states = []
self.terminations = []
def act(self):
self.action = self.action_space.sample()
def save_experience(self):
self.actions.append(self.action)
self.current_states.append(self.current_state)
self.rewards.append(self.reward)
self.next_states.append(self.next_state)
self.terminations.append(self.terminated)
self.n_experiences += 1
self.score = self.score + self.reward
def learn(self):
pass
For now, actions are chosen at random.
An episode just means: which game (round) are we on!
env = GridWorld()
agent = Agent1(env)
for _ in range(20):
    ## the essential code
    agent.current_state = env.reset()
    agent.terminated = False
    agent.score = 0
    for t in range(50):
        # step1: agent >> env
        agent.act()
        env.agent_action = agent.action
        # step2: agent << env
        agent.next_state, agent.reward, agent.terminated = env.step(env.agent_action)
        agent.save_experience()
        # step3: learn
        # agent.learn()
        # step4: state update
        agent.current_state = agent.next_state
        # step5:
        if agent.terminated: break
    agent.scores.append(agent.score)
    agent.playtimes.append(t+1)
    agent.n_episodes = agent.n_episodes + 1
    ## the less essential code
    print(
        f"Epsiode: {agent.n_episodes} \t"
        f"Score: {agent.scores[-1]} \t"
        f"Playtime: {agent.playtimes[-1]}"
    )
Epsiode: 1 Score: -10 Playtime: 1
Epsiode: 2 Score: -10 Playtime: 1
Epsiode: 3 Score: -10 Playtime: 1
Epsiode: 4 Score: -17 Playtime: 8
Epsiode: 5 Score: -10 Playtime: 1
Epsiode: 6 Score: -11 Playtime: 2
Epsiode: 7 Score: -22 Playtime: 13
Epsiode: 8 Score: -10 Playtime: 1
Epsiode: 9 Score: -21 Playtime: 12
Epsiode: 10 Score: -10 Playtime: 1
Epsiode: 11 Score: -18 Playtime: 9
Epsiode: 12 Score: -11 Playtime: 2
Epsiode: 13 Score: -11 Playtime: 2
Epsiode: 14 Score: -10 Playtime: 1
Epsiode: 15 Score: -10 Playtime: 1
Epsiode: 16 Score: -12 Playtime: 3
Epsiode: 17 Score: 91 Playtime: 10
Epsiode: 18 Score: -10 Playtime: 1
Epsiode: 19 Score: -12 Playtime: 3
Epsiode: 20 Score: -12 Playtime: 3
if agent.terminated: break means: stop once terminated becomes True.
sum(agent.playtimes[:7])
27
sum(agent.playtimes[:8])
28
Let's pull out only the successful part from above (48 = the cumulative sum of the playtimes).
states = [np.array([0,0])] + agent.next_states[48:60]
show(states)
- A case where the agent got it right purely by luck
Understanding the environment (a one-dimensional understanding)
- Let's run 10,000 games at random.
env = GridWorld()
agent = Agent1(env)
for _ in range(10000):
    ## the essential code
    agent.current_state = env.reset()
    agent.terminated = False
    agent.score = 0
    for t in range(50):
        # step1: agent >> env
        agent.act()
        env.agent_action = agent.action
        # step2: agent << env
        agent.next_state, agent.reward, agent.terminated = env.step(env.agent_action)
        agent.save_experience()
        # step3: learn
        # agent.learn()
        # step4: state update
        agent.current_state = agent.next_state
        # step5:
        if agent.terminated: break
    agent.scores.append(agent.score)
    agent.playtimes.append(t+1)
    agent.n_episodes = agent.n_episodes + 1
agent.n_experiences
32858
- Observing the data
agent.current_states[0], agent.actions[0], agent.rewards[0], agent.next_states[0]
(array([0, 0]), 3, -10, array([ 0. , -0.5]))
agent.current_states[1], agent.actions[1], agent.rewards[1], agent.next_states[1]
(array([0, 0]), 3, -10, array([ 0. , -0.5]))
agent.current_states[2], agent.actions[2], agent.rewards[2], agent.next_states[2]
(array([0, 0]), 0, -1, array([1, 0]))
agent.current_states[3], agent.actions[3], agent.rewards[3], agent.next_states[3]
(array([1, 0]), 3, -10, array([ 1. , -0.5]))
agent.current_states[4], agent.actions[4], agent.rewards[4], agent.next_states[4]
(array([0, 0]), 0, -1, array([1, 0]))
- Recording to understand the environment (1)
q is indexed by (x, y, a)
x, y - think of them as the grid axes
q = np.zeros([4,4,4])
count = np.zeros([4,4,4])
for i in range(agent.n_experiences):
    x,y = agent.current_states[i]
    a = agent.actions[i]
    q[x,y,a] = q[x,y,a] + agent.rewards[i]
    count[x,y,a] = count[x,y,a] + 1
Accumulate the rewards into the (x, y, a) entries of q.
Drawback: we also have to keep track of count.
count[count == 0] = 0.01
q = q/count
q[:,:,3]
array([[-10., -1., -1., -1.],
[-10., -1., -1., -1.],
[-10., -1., -1., -1.],
[-10., -1., -1., 0.]])
for a in range(4):
print(
f"action = {a}\n"
f"action-value function = \n {q[:,:,a]}\n"
)
action = 0
action-value function =
[[ -1. -1. -1. -1.]
[ -1. -1. -1. -1.]
[ -1. -1. -1. 100.]
[-10. -10. -10. 0.]]
action = 1
action-value function =
[[ -1. -1. -1. -10.]
[ -1. -1. -1. -10.]
[ -1. -1. -1. -10.]
[ -1. -1. 100. 0.]]
action = 2
action-value function =
[[-10. -10. -10. -10.]
[ -1. -1. -1. -1.]
[ -1. -1. -1. -1.]
[ -1. -1. -1. 0.]]
action = 3
action-value function =
[[-10. -1. -1. -1.]
[-10. -1. -1. -1.]
[-10. -1. -1. -1.]
[-10. -1. -1. 0.]]
- Recording to understand the environment (2)
Update using the difference between the realistic value and the estimated value.
q = np.zeros([4,4,4])
for i in range(agent.n_experiences):
    x,y = agent.current_states[i]
    a = agent.actions[i]
    q_estimated = q[x,y,a]            # our current understanding of the environment -- the answer we have worked out
    q_realistic = agent.rewards[i]    # the actual answer
    diff = q_realistic - q_estimated  # actual answer minus our answer = error feedback
    q[x,y,a] = q_estimated + 0.05 * diff  ## new answer = old answer + a fraction of the error feedback
Only apply 5% of the error at each step.
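A toy example (my own, not from the lecture) of why this works: repeatedly applying q <- q + 0.05 * (target - q) drives q toward a recency-weighted average of the targets, so no separate count array is needed.
q_toy = 0.0
for target in [-10.0] * 2 + [-1.0] * 98:   # made-up target stream: two -10 steps, then 98 steps of -1
    q_toy = q_toy + 0.05 * (target - q_toy)
round(q_toy, 3)                            # close to -1.0: the recent targets dominate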
for a in range(4):
print(
f"action = {a}\n"
f"action-value function = \n {q[:,:,a]}\n"
)
action = 0
action-value function =
[[-1. -1. -1. -0.99879276]
[-1. -1. -0.99999999 -0.99923914]
[-1. -1. -0.99999572 99.15219633]
[-9.99277183 -9.99788945 -9.9626859 0. ]]
action = 1
action-value function =
[[-1. -1. -1. -9.98910469]
[-1. -1. -0.99999997 -9.99411261]
[-1. -1. -0.99999418 -9.88466698]
[-0.99981905 -0.99974088 99.40794708 0. ]]
action = 2
action-value function =
[[-10. -10. -9.99999978 -9.9923914 ]
[ -1. -1. -0.99999999 -0.99934766]
[ -0.99999998 -0.99999998 -0.99997791 -0.98722072]
[ -0.9990167 -0.99960942 -0.98584013 0. ]]
action = 3
action-value function =
[[-10. -1. -1. -0.99764828]
[-10. -1. -0.99999996 -0.99818028]
[ -9.99999999 -0.99999999 -0.99999716 -0.99645516]
[ -9.98357707 -0.99988595 -0.99645516 0. ]]
A deeper understanding of the environment (a somewhat higher-dimensional understanding)
- The value (= expected reward) of each state when action = 1
q[:,:,1]
array([[-1. , -1. , -1. , -9.98910469],
[-1. , -1. , -0.99999997, -9.99411261],
[-1. , -1. , -0.99999418, -9.88466698],
[-0.99981905, -0.99974088, 99.40794708, 0. ]])
- Analysis 1
q[3,2,1]
99.40794707796658
- Taking action 1 in state (3,2) yields a reward of 100, so the expected reward should be near 100 –> reasonable.
- Analysis 2
q[3,1,1]
-0.9997408802884766
- Taking action 1 in state (3,1) yields a reward of -1, so the expected reward is near -1 –> but is that really reasonable??
- Critique: Analysis 2 looks reasonable at first, but once we examine the data more carefully it is not.
- Imagine this situation
- You are handed a blank sheet of paper.
- On the sheet you may write either 0 or 1 (action = 0 or 1).
- Writing 0 and writing 1 give different rewards.
- After analyzing a huge amount of data, you "found out" that writing 0 pays 0 won and writing 1 pays 100,000 won.
- What is the blank sheet worth then: 50,000 won or 100,000 won? –> Isn't it 100,000 won?
- Intuition: on reflection, the value estimated at \(s=(3,1)\), \(a=1\) is q[3,1,1] = -0.9997128867462345[1], but realistically it makes sense to consider both the immediate reward (-1) and the potential reward (100) at the same time.
[1] That is, the potential value of the next_state has not been taken into account.
q_estimated = q[3,1,1]
q_estimated
-0.9997408802884766
q_realistic = (-1) + 0.99 * 100
q_realistic
98.0
The 0.01 acts as a small penalty..
- Here, 0.99 is a weight that determines how important future rewards are relative to the present (illustrated by the short sketch below).
- The closer it is to 1, the more future rewards matter (i.e., the blank sheet is treated as being worth the full 100,000 won).
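A small illustration (my own, not from the lecture) of what the 0.99 does: a reward of 100 that arrives k steps in the future is worth 0.99**k * 100 today.
# discounting a future reward of 100 by a factor of 0.99 per step
[round(0.99**k * 100, 1) for k in range(5)]   # [100.0, 99.0, 98.0, 97.0, 96.1]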
- That is, if
\[q(s,a) \approx \text{reward}(s,a) + 0.99 \times \max_{a}q(s',a)\]
holds for every \(s\) and \(a\), then \(q(s,a)\) can be regarded as a sound estimate. Written more rigorously:
\[q(s,a) \approx \begin{cases} \text{reward}(s,a) & \text{terminated} \\ \text{reward}(s,a) + 0.99 \times \max_{a}q(s',a) & \text{not terminated}\end{cases}\]
s is the state and a is the action.
In \(q(s,a) \approx \text{reward}(s,a) + 0.99 \times \max_{a}q(s',a)\), the term \(\text{reward}(s,a)\) is what I receive immediately, and \(\max_{a}q(s',a)\) is the largest reward I can still obtain afterwards.
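The rule above can be written as a tiny helper; this is only a sketch, and the name q_target is mine, not the lecture's.
def q_target(reward, q_next_state, terminated, gamma=0.99):
    # reward(s,a) if the episode ended, otherwise reward(s,a) + gamma * max_a q(s',a)
    return reward if terminated else reward + gamma * q_next_state.max()

q_target(-1, q[3, 2, :], terminated=False)   # the immediate -1 plus the discounted best value currently stored for state (3,2)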
q = np.zeros([4,4,4])
for i in range(agent.n_experiences):
    x,y = agent.current_states[i]
    xx,yy = agent.next_states[i]
    a = agent.actions[i]
    q_estimated = q[x,y,a]
    if agent.terminations[i]:
        q_realistic = agent.rewards[i]
    else:
        q_future = q[xx,yy,:].max()
        q_realistic = agent.rewards[i] + 0.99 * q_future
    diff = q_realistic - q_estimated
    q[x,y,a] = q_estimated + 0.05 * diff
for a in range(4):
print(
f"action = {a}\n"
f"action-value function = \n {q[:,:,a]}\n"
)
action = 0
action-value function =
[[88.53307362 90.49709464 92.36408758 88.68673925]
[90.28856398 92.49369954 94.61842445 95.49617968]
[90.91491115 94.21657181 96.89901308 99.15219633]
[-9.99277183 -9.99788945 -9.9626859 0. ]]
action = 1
action-value function =
[[88.55960706 90.3131511 83.87809217 -9.98910469]
[90.472853 92.49913938 92.55929104 -9.99411261]
[92.35065011 94.61963597 96.65724194 -9.88466698]
[93.42457258 96.5945232 99.40794708 0. ]]
action = 2
action-value function =
[[-10. -10. -9.99999978 -9.9923914 ]
[ 86.56190669 88.46124563 89.96848094 80.18849597]
[ 88.03732538 90.28026548 91.62827094 84.50628885]
[ 87.41906298 91.06145181 87.82431486 0. ]]
action = 3
action-value function =
[[-10. 86.5658665 88.21628148 86.74874619]
[-10. 88.40364698 90.19865977 90.75947241]
[ -9.99999999 89.90158238 91.81108837 92.72733049]
[ -9.98357707 88.02167685 91.41860035 0. ]]
Formulating an action strategy
- Suppose the agent is in state (0,0).
q[0,0,:]
array([ 88.53307362, 88.55960706, -10. , -10. ])
- Action 0 or action 1 is advantageous. // Actions 2 and 3 are a disaster.
- Suppose the agent is in state (2,3).
q[2,3,:]
array([99.15219633, -9.88466698, 84.50628885, 92.72733049])
- Action 0 is advantageous.
- Suppose the agent is in state (3,2).
q[3,2,:]
array([-9.9626859 , 99.40794708, 87.82431486, 91.41860035])
- Action 1 is advantageous.
- The optimal action in each state is as follows.
q[0,0,:].argmax()
1
q[2,3,:].argmax()
0
q[3,2,:].argmax()
1
- Let's summarize the strategy (= policy).
policy = np.array(['?????']*16).reshape(4,4)
policy
array([['?????', '?????', '?????', '?????'],
['?????', '?????', '?????', '?????'],
['?????', '?????', '?????', '?????'],
['?????', '?????', '?????', '?????']], dtype='<U5')
directions = {0:'down', 1: 'right', 2:'up', 3:'left'}
for x in range(4):
    for y in range(4):
        policy[x,y] = directions[q[x,y,:].argmax()]
policy
array([['right', 'down', 'down', 'down'],
['right', 'right', 'down', 'down'],
['right', 'right', 'down', 'down'],
['right', 'right', 'right', 'down']], dtype='<U5')
q.max(axis=-1)
array([[88.55960706, 90.49709464, 92.36408758, 88.68673925],
[90.472853 , 92.49913938, 94.61842445, 95.49617968],
[92.35065011, 94.61963597, 96.89901308, 99.15219633],
[93.42457258, 96.5945232 , 99.40794708, 0. ]])
Implementing the Agent2 class + Run
class Agent2(Agent1):
    def __init__(self,env):
        super().__init__(env)
        self.q = np.zeros([4,4,4])
    def learn(self):
        x,y = self.current_state
        xx,yy = self.next_state
        a = self.action
        q_estimated = self.q[x,y,a]
        if self.terminated:
            q_realistic = self.reward
        else:
            q_future = self.q[xx,yy,:].max()
            q_realistic = self.reward + 0.99 * q_future
        diff = q_realistic - q_estimated
        self.q[x,y,a] = q_estimated + 0.05 * diff
    def act(self):
        if self.n_experiences < 3000:
            self.action = self.action_space.sample()
        else:
            x,y = self.current_state
            self.action = self.q[x,y,:].argmax()
Update q every time a new experience is collected.
env = GridWorld()
agent = Agent2(env)
for _ in range(2000):
    ## the essential code
    agent.current_state = env.reset()
    agent.terminated = False
    agent.score = 0
    for t in range(50):
        # step1: agent >> env
        agent.act()
        env.agent_action = agent.action
        # step2: agent << env
        agent.next_state, agent.reward, agent.terminated = env.step(env.agent_action)
        agent.save_experience()
        # step3: learn
        agent.learn()
        # step4: state update
        agent.current_state = agent.next_state
        # step5:
        if agent.terminated: break
    agent.scores.append(agent.score)
    agent.playtimes.append(t+1)
    agent.n_episodes = agent.n_episodes + 1
    ## the less essential code
    if (agent.n_episodes % 100) == 0:
        print(
            f"Epsiode: {agent.n_episodes} \t"
            f"Score: {np.mean(agent.scores[-100:])} \t"
            f"Playtime: {np.mean(agent.playtimes[-100:])}"
        )
Epsiode: 100 Score: -11.76 Playtime: 2.76
Epsiode: 200 Score: -9.53 Playtime: 3.83
Epsiode: 300 Score: -9.0 Playtime: 3.3
Epsiode: 400 Score: -12.1 Playtime: 3.1
Epsiode: 500 Score: -10.38 Playtime: 3.58
Epsiode: 600 Score: -8.94 Playtime: 3.24
Epsiode: 700 Score: -12.16 Playtime: 3.16
Epsiode: 800 Score: -8.94 Playtime: 3.24
Epsiode: 900 Score: -10.02 Playtime: 5.78
Epsiode: 1000 Score: -50.0 Playtime: 50.0
Epsiode: 1100 Score: -50.0 Playtime: 50.0
Epsiode: 1200 Score: -50.0 Playtime: 50.0
Epsiode: 1300 Score: -50.0 Playtime: 50.0
Epsiode: 1400 Score: -50.0 Playtime: 50.0
Epsiode: 1500 Score: -50.0 Playtime: 50.0
Epsiode: 1600 Score: -50.0 Playtime: 50.0
Epsiode: 1700 Score: -50.0 Playtime: 50.0
Epsiode: 1800 Score: -50.0 Playtime: 50.0
Epsiode: 1900 Score: -50.0 Playtime: 50.0
Epsiode: 2000 Score: -50.0 Playtime: 50.0
agent.n_episodes % 100 –> multiples of 100
states = [np.array([0,0])] + agent.next_states[-agent.playtimes[-1]:]
show(states)
agent.q.max(-1).T
array([[88.59212369, 90.00967758, 80.87907551, 57.04075186],
[90.44044671, 89.98145875, 79.93920263, 60.65439373],
[88.59212369, 82.69209965, 67.47304639, 43.63362743],
[47.92350641, 55.86149947, 40.12630608, 0. ]])
The agent gets stuck and the q-table stops updating.
Instead of always following the max, occasionally take a random action and, if it turns out well, go that way.
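This is the epsilon-greedy idea: with probability eps act randomly, otherwise act greedily. A rough check (my own, not a lecture cell) of how fast eps shrinks when it starts at 1 and is multiplied by 0.999 after every episode, as in the run below:
for episode in [200, 1000, 5000]:
    print(episode, round(0.999 ** episode, 3))   # roughly 0.819, 0.368, 0.007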
Implementing the Agent3 class + Run
class Agent3(Agent2):
    def __init__(self,env):
        super().__init__(env)
        self.eps = 0
    def act(self):
        if np.random.rand() < self.eps:
            self.action = self.action_space.sample()
        else:
            x,y = self.current_state
            self.action = self.q[x,y,:].argmax()
env = GridWorld()
agent = Agent3(env)
agent.eps = 1
for _ in range(5000):
    ## the essential code
    agent.current_state = env.reset()
    agent.terminated = False
    agent.score = 0
    for t in range(50):
        # step1: agent >> env
        agent.act()
        env.agent_action = agent.action
        # step2: agent << env
        agent.next_state, agent.reward, agent.terminated = env.step(env.agent_action)
        agent.save_experience()
        # step3: learn
        agent.learn()
        # step4: state update
        agent.current_state = agent.next_state
        # step5:
        if agent.terminated: break
    agent.scores.append(agent.score)
    agent.playtimes.append(t+1)
    agent.n_episodes = agent.n_episodes + 1
    agent.eps = agent.eps * 0.999
    ## the less essential code
    if (agent.n_episodes % 200) == 0:
        print(
            f"Epsiode: {agent.n_episodes} \t"
            f"Score: {np.mean(agent.scores[-100:])} \t"
            f"Playtime: {np.mean(agent.playtimes[-100:])}\t"
            f"Epsilon: {agent.eps : .2f}"
        )
Epsiode: 200 Score: -12.82 Playtime: 3.82 Epsilon: 0.82
Epsiode: 400 Score: -13.66 Playtime: 4.66 Epsilon: 0.67
Epsiode: 600 Score: -11.49 Playtime: 6.89 Epsilon: 0.55
Epsiode: 800 Score: -13.44 Playtime: 12.68 Epsilon: 0.45
Epsiode: 1000 Score: -14.79 Playtime: 15.04 Epsilon: 0.37
Epsiode: 1200 Score: -12.01 Playtime: 15.29 Epsilon: 0.30
Epsiode: 1400 Score: 28.38 Playtime: 12.57 Epsilon: 0.25
Epsiode: 1600 Score: 72.7 Playtime: 6.3 Epsilon: 0.20
Epsiode: 1800 Score: 73.92 Playtime: 6.18 Epsilon: 0.17
Epsiode: 2000 Score: 82.54 Playtime: 6.36 Epsilon: 0.14
Epsiode: 2200 Score: 82.56 Playtime: 6.34 Epsilon: 0.11
Epsiode: 2400 Score: 77.03 Playtime: 6.37 Epsilon: 0.09
Epsiode: 2600 Score: 81.36 Playtime: 6.53 Epsilon: 0.07
Epsiode: 2800 Score: 88.94 Playtime: 6.56 Epsilon: 0.06
Epsiode: 3000 Score: 83.42 Playtime: 6.76 Epsilon: 0.05
Epsiode: 3200 Score: 93.8 Playtime: 6.1 Epsilon: 0.04
Epsiode: 3400 Score: 91.67 Playtime: 6.03 Epsilon: 0.03
Epsiode: 3600 Score: 89.19 Playtime: 6.4 Epsilon: 0.03
Epsiode: 3800 Score: 91.32 Playtime: 6.38 Epsilon: 0.02
Epsiode: 4000 Score: 93.71 Playtime: 6.19 Epsilon: 0.02
Epsiode: 4200 Score: 93.89 Playtime: 6.01 Epsilon: 0.01
Epsiode: 4400 Score: 92.45 Playtime: 6.44 Epsilon: 0.01
Epsiode: 4600 Score: 94.94 Playtime: 6.06 Epsilon: 0.01
Epsiode: 4800 Score: 90.08 Playtime: 6.61 Epsilon: 0.01
Epsiode: 5000 Score: 93.91 Playtime: 5.99 Epsilon: 0.01
states = [np.array([0,0])] + agent.next_states[-agent.playtimes[-1]:]
show(states)