My notes are in green.
Lecture video
https://youtu.be/playlist?list=PLQqh36zP38-zoOHd7w3N5q9Jc5P34Ux8X&si=MdJTHM3a27MCAssp
Environment setup
- Installation (Colab)
!pip install -q swig
!pip install gymnasium
!pip install gymnasium[box2d]
imports
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
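As an optional sanity check that the installation worked (the printed version string below is just an example and will differ on your machine):
print(gym.__version__)          # e.g. 0.29.1
print(gym.spaces.Discrete(2))   # Discrete(2)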
intro
- Reinforcement learning (rough description): given some "(game) environment", the task of learning "what to do" in it.
- DeepMind: Breakout \(\to\) AlphaGo
- The future of reinforcement learning? (If I get good at this, can I make a living off it?)
- Prerequisites (for reinforcement learning)
  - Programming knowledge: Python, understanding of classes // https://guebin.github.io/PP2023/ 10wk-2 and later
  - Basic deep learning knowledge: DNN // https://guebin.github.io/DL2022/ 3wk-02 ~ 4wk-02
  - Mathematical background: Markov processes
Game1: bandit
- Problem description: there are two buttons. Assume pressing button 0 gives a reward of 1 and pressing button 1 gives a reward of 100.
- What action should we take first? \(\to\) ??? At the start we know nothing \(\to\) so let's just press "anything".
- Let's write code that presses a random button.
action_space = ['button0', 'button1']
action = np.random.choice(action_space)
action
'button1'
- Let's write code that gives the reward.
if action == 'button0': # if button0 was pressed
    reward = 1
else: # if button1 was pressed
    reward = 100
reward
100
- Let's press random buttons about 10 times and accumulate data.
for _ in range(10):
    action = np.random.choice(action_space)
    if action == 'button0':
        reward = 1
    else:
        reward = 100
    print(action, reward)
button0 1
button0 1
button0 1
button0 1
button0 1
button1 100
button0 1
button0 1
button1 100
button0 1
- Realization: so this is an "environment" where pressing button0 gives 1 point and pressing button1 gives 100 points? \(\to\) so I am in a situation where the right "action" is to press button1?
  - Reinforcement learning is the discipline that systematizes this \(\to\) step.
for _ in range(10):
    action = action_space[1]
    if action == 'button0':
        reward = 1
    else:
        reward = 100
    print(action, reward)
button1 100
button1 100
button1 100
button1 100
button1 100
button1 100
button1 100
button1 100
button1 100
button1 100
- Game cleared
- Reinforcement learning: understand the environment \(\to\) decide the action
  Sentences used to say that the process above went well:
  - The reinforcement learning was successful.
  - The agent completed the environment's task.
  - The agent learned successfully in the environment.
  - The agent learned the correct action.
  - Game cleared (informal)
- We want to define a criterion that tells us the game has been cleared.
  - First thought: can't we just call it cleared the moment button1 is pressed?
  - Second thought: no, it could have been pressed by accident.
  - Clear condition: if the total reward over the most recent 20 tries is at least 1900, consider the game cleared (a small sketch of this check follows below).[1]
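A small sketch of that check (the helper name is made up here and is not part of the lecture code):
def is_cleared(rewards, window=20, threshold=1900):
    """True if the total reward over the most recent `window` tries reaches `threshold`."""
    return sum(rewards[-window:]) >= threshold

print(is_cleared([100] * 19))             # True: 19 presses of button1 already give 1900
print(is_cleared([1] * 10 + [100] * 10))  # False: 10 + 1000 = 1010 < 1900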
- The ignorant agent – cannot clear the game
[1] Pressing button1 is indeed the right move, but this condition forgives roughly one mistake in every 20 tries (19 × 100 + 1 × 1 = 1901 ≥ 1900).
action_space = [0,1]
rewards = []
for t in range(50): # even 10000 tries would not clear it
    action = np.random.choice(action_space) # the ignorant agent's action (just guess)
    if action == 0:
        reward = 1
        rewards.append(reward)
    else:
        reward = 100
        rewards.append(reward)
    print(
        f"n_try = {t+1}\t"
        f"action= {action}\t"
        f"reward= {reward}\t"
        f"reward20= {sum(rewards[-20:])}\t"
    )
    if np.sum(rewards[-20:])>=1900:
        break
n_try = 1 action= 0 reward= 1 reward20= 1
n_try = 2 action= 0 reward= 1 reward20= 2
n_try = 3 action= 1 reward= 100 reward20= 102
n_try = 4 action= 1 reward= 100 reward20= 202
n_try = 5 action= 1 reward= 100 reward20= 302
n_try = 6 action= 1 reward= 100 reward20= 402
n_try = 7 action= 1 reward= 100 reward20= 502
n_try = 8 action= 0 reward= 1 reward20= 503
n_try = 9 action= 1 reward= 100 reward20= 603
n_try = 10 action= 0 reward= 1 reward20= 604
n_try = 11 action= 0 reward= 1 reward20= 605
n_try = 12 action= 0 reward= 1 reward20= 606
n_try = 13 action= 0 reward= 1 reward20= 607
n_try = 14 action= 0 reward= 1 reward20= 608
n_try = 15 action= 1 reward= 100 reward20= 708
n_try = 16 action= 0 reward= 1 reward20= 709
n_try = 17 action= 0 reward= 1 reward20= 710
n_try = 18 action= 0 reward= 1 reward20= 711
n_try = 19 action= 0 reward= 1 reward20= 712
n_try = 20 action= 0 reward= 1 reward20= 713
n_try = 21 action= 1 reward= 100 reward20= 812
n_try = 22 action= 0 reward= 1 reward20= 812
n_try = 23 action= 0 reward= 1 reward20= 713
n_try = 24 action= 1 reward= 100 reward20= 713
n_try = 25 action= 1 reward= 100 reward20= 713
n_try = 26 action= 0 reward= 1 reward20= 614
n_try = 27 action= 1 reward= 100 reward20= 614
n_try = 28 action= 1 reward= 100 reward20= 713
n_try = 29 action= 1 reward= 100 reward20= 713
n_try = 30 action= 0 reward= 1 reward20= 713
n_try = 31 action= 1 reward= 100 reward20= 812
n_try = 32 action= 0 reward= 1 reward20= 812
n_try = 33 action= 1 reward= 100 reward20= 911
n_try = 34 action= 1 reward= 100 reward20= 1010
n_try = 35 action= 1 reward= 100 reward20= 1010
n_try = 36 action= 0 reward= 1 reward20= 1010
n_try = 37 action= 1 reward= 100 reward20= 1109
n_try = 38 action= 0 reward= 1 reward20= 1109
n_try = 39 action= 0 reward= 1 reward20= 1109
n_try = 40 action= 1 reward= 100 reward20= 1208
n_try = 41 action= 1 reward= 100 reward20= 1208
n_try = 42 action= 0 reward= 1 reward20= 1208
n_try = 43 action= 1 reward= 100 reward20= 1307
n_try = 44 action= 1 reward= 100 reward20= 1307
n_try = 45 action= 0 reward= 1 reward20= 1208
n_try = 46 action= 1 reward= 100 reward20= 1307
n_try = 47 action= 1 reward= 100 reward20= 1307
n_try = 48 action= 1 reward= 100 reward20= 1307
n_try = 49 action= 0 reward= 1 reward20= 1208
n_try = 50 action= 1 reward= 100 reward20= 1307
- The enlightened agent – game cleared
action_space = [0,1]
rewards = []
for t in range(50):
    #action = np.random.choice(action_space) # the ignorant agent's action (just guess)
    action = 1
    if action == 0:
        reward = 1
        rewards.append(reward)
    else:
        reward = 100
        rewards.append(reward)
    print(
        f"n_try = {t+1}\t"
        f"action= {action}\t"
        f"reward= {reward}\t"
        f"reward20= {sum(rewards[-20:])}\t"
    )
    if np.sum(rewards[-20:])>=1900:
        break
n_try = 1 action= 1 reward= 100 reward20= 100
n_try = 2 action= 1 reward= 100 reward20= 200
n_try = 3 action= 1 reward= 100 reward20= 300
n_try = 4 action= 1 reward= 100 reward20= 400
n_try = 5 action= 1 reward= 100 reward20= 500
n_try = 6 action= 1 reward= 100 reward20= 600
n_try = 7 action= 1 reward= 100 reward20= 700
n_try = 8 action= 1 reward= 100 reward20= 800
n_try = 9 action= 1 reward= 100 reward20= 900
n_try = 10 action= 1 reward= 100 reward20= 1000
n_try = 11 action= 1 reward= 100 reward20= 1100
n_try = 12 action= 1 reward= 100 reward20= 1200
n_try = 13 action= 1 reward= 100 reward20= 1300
n_try = 14 action= 1 reward= 100 reward20= 1400
n_try = 15 action= 1 reward= 100 reward20= 1500
n_try = 16 action= 1 reward= 100 reward20= 1600
n_try = 17 action= 1 reward= 100 reward20= 1700
n_try = 18 action= 1 reward= 100 reward20= 1800
n_try = 19 action= 1 reward= 100 reward20= 1900
Fix 1: revising action_space
action_space = gym.spaces.Discrete(2)
action_space
Discrete(2)
type(action_space)
gymnasium.spaces.discrete.Discrete
(What we had specified above was of type list.)
- Advantage 1: sample
for _ in range(10):
    print(action_space.sample())
0
1
1
1
0
1
1
0
1
0
It can do its own sampling!
- Advantage 2: in
0 in action_space # check whether an action is valid -- 0 is a valid action
True
1 in action_space # check whether an action is valid -- 1 is a valid action
True
2 in action_space # check whether an action is valid -- 2 is not a valid action
False
True means the action is valid.
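Two more small conveniences of Discrete that may come in handy (n and contains are standard attributes of Gymnasium spaces):
print(action_space.n)             # 2 -- the number of valid actions
print(action_space.contains(1))   # True -- the same check that `1 in action_space` performs
print(action_space.contains(5))   # False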
- First revision of the code
action_space = gym.spaces.Discrete(2)
rewards = []
for t in range(50):
    action = action_space.sample()
    #action = 1
    if action == 0:
        reward = 1
        rewards.append(reward)
    else:
        reward = 100
        rewards.append(reward)
    print(
        f"n_try = {t+1}\t"
        f"action= {action}\t"
        f"reward= {reward}\t"
        f"reward20= {sum(rewards[-20:])}\t"
    )
    if np.sum(rewards[-20:])>=1900:
        break
n_try = 1 action= 0 reward= 1 reward20= 1
n_try = 2 action= 0 reward= 1 reward20= 2
n_try = 3 action= 1 reward= 100 reward20= 102
n_try = 4 action= 0 reward= 1 reward20= 103
n_try = 5 action= 1 reward= 100 reward20= 203
n_try = 6 action= 1 reward= 100 reward20= 303
n_try = 7 action= 0 reward= 1 reward20= 304
n_try = 8 action= 1 reward= 100 reward20= 404
n_try = 9 action= 0 reward= 1 reward20= 405
n_try = 10 action= 1 reward= 100 reward20= 505
n_try = 11 action= 1 reward= 100 reward20= 605
n_try = 12 action= 0 reward= 1 reward20= 606
n_try = 13 action= 0 reward= 1 reward20= 607
n_try = 14 action= 1 reward= 100 reward20= 707
n_try = 15 action= 1 reward= 100 reward20= 807
n_try = 16 action= 1 reward= 100 reward20= 907
n_try = 17 action= 0 reward= 1 reward20= 908
n_try = 18 action= 1 reward= 100 reward20= 1008
n_try = 19 action= 0 reward= 1 reward20= 1009
n_try = 20 action= 1 reward= 100 reward20= 1109
n_try = 21 action= 0 reward= 1 reward20= 1109
n_try = 22 action= 1 reward= 100 reward20= 1208
n_try = 23 action= 1 reward= 100 reward20= 1208
n_try = 24 action= 1 reward= 100 reward20= 1307
n_try = 25 action= 1 reward= 100 reward20= 1307
n_try = 26 action= 1 reward= 100 reward20= 1307
n_try = 27 action= 1 reward= 100 reward20= 1406
n_try = 28 action= 0 reward= 1 reward20= 1307
n_try = 29 action= 1 reward= 100 reward20= 1406
n_try = 30 action= 0 reward= 1 reward20= 1307
n_try = 31 action= 0 reward= 1 reward20= 1208
n_try = 32 action= 0 reward= 1 reward20= 1208
n_try = 33 action= 0 reward= 1 reward20= 1208
n_try = 34 action= 0 reward= 1 reward20= 1109
n_try = 35 action= 1 reward= 100 reward20= 1109
n_try = 36 action= 0 reward= 1 reward20= 1010
n_try = 37 action= 1 reward= 100 reward20= 1109
n_try = 38 action= 1 reward= 100 reward20= 1109
n_try = 39 action= 1 reward= 100 reward20= 1208
n_try = 40 action= 0 reward= 1 reward20= 1109
n_try = 41 action= 1 reward= 100 reward20= 1208
n_try = 42 action= 1 reward= 100 reward20= 1208
n_try = 43 action= 1 reward= 100 reward20= 1208
n_try = 44 action= 1 reward= 100 reward20= 1208
n_try = 45 action= 1 reward= 100 reward20= 1208
n_try = 46 action= 0 reward= 1 reward20= 1109
n_try = 47 action= 0 reward= 1 reward20= 1010
n_try = 48 action= 1 reward= 100 reward20= 1109
n_try = 49 action= 1 reward= 100 reward20= 1109
n_try = 50 action= 0 reward= 1 reward20= 1109
Fix 2: the Env class
- Declaring the env class
class Bandit:
    def step(self, action):
        if action == 0:
            return 1
        else:
            return 100

action_space = gym.spaces.Discrete(2)
env = Bandit()
rewards = []
for t in range(50):
    #action = action_space.sample()
    action = 1
    reward = env.step(action)
    rewards.append(reward)
    print(
        f"n_try = {t+1}\t"
        f"action= {action}\t"
        f"reward= {reward}\t"
        f"reward20= {sum(rewards[-20:])}\t"
    )
    if np.sum(rewards[-20:])>=1900:
        break
n_try = 1 action= 1 reward= 100 reward20= 100
n_try = 2 action= 1 reward= 100 reward20= 200
n_try = 3 action= 1 reward= 100 reward20= 300
n_try = 4 action= 1 reward= 100 reward20= 400
n_try = 5 action= 1 reward= 100 reward20= 500
n_try = 6 action= 1 reward= 100 reward20= 600
n_try = 7 action= 1 reward= 100 reward20= 700
n_try = 8 action= 1 reward= 100 reward20= 800
n_try = 9 action= 1 reward= 100 reward20= 900
n_try = 10 action= 1 reward= 100 reward20= 1000
n_try = 11 action= 1 reward= 100 reward20= 1100
n_try = 12 action= 1 reward= 100 reward20= 1200
n_try = 13 action= 1 reward= 100 reward20= 1300
n_try = 14 action= 1 reward= 100 reward20= 1400
n_try = 15 action= 1 reward= 100 reward20= 1500
n_try = 16 action= 1 reward= 100 reward20= 1600
n_try = 17 action= 1 reward= 100 reward20= 1700
n_try = 18 action= 1 reward= 100 reward20= 1800
n_try = 19 action= 1 reward= 100 reward20= 1900
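As a side note, real Gymnasium environments follow the same step() idea, except that their step() also returns an observation and termination flags; the environment name below is only an illustration and is not used in this lecture:
env = gym.make("CartPole-v1")   # an arbitrary built-in environment, just for illustration
obs, info = env.reset()         # Gymnasium environments must be reset before the first step
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
print(reward, terminated, truncated)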
Fix 3: the Agent class
- Let's make an Agent class. (It takes an action and keeps the rewards received from the environment.)
class Agent1:
    def __init__(self):
        self.action_space = gym.spaces.Discrete(2)
        self.action = None
        self.reward = None
        self.actions = []
        self.rewards = []
    def act(self):
        self.action = self.action_space.sample() # the ignorant agent
        #self.action = 1 # the enlightened agent
    def save_experience(self):
        self.actions.append(self.action)
        self.rewards.append(self.reward)
— roughly, the code runs like this —
Time 0: init
env = Bandit()
agent = Agent1()
agent.action, agent.reward
(None, None)
Time 1: agent >> env
act plays a role similar to sample.
agent.act()
agent.action, agent.reward
(1, None)
env.agent_action = agent.action
Time 2: agent << env
agent.reward = env.step(env.agent_action)
agent.action, agent.reward, env.agent_action
(1, 100, 1)
agent.actions,agent.rewards
([], [])
agent.save_experience()
agent.actions,agent.rewards
([1], [100])
– full code –
env = Bandit()
agent = Agent1()
for t in range(50):
    ## 1. main code
    # step1: agent >> env
    agent.act()
    env.agent_action = agent.action
    # step2: agent << env
    agent.reward = env.step(env.agent_action)
    agent.save_experience()
    ## 2. non-essential code
    print(
        f"n_try = {t+1}\t"
        f"action= {agent.action}\t"
        f"reward= {agent.reward}\t"
        f"reward20= {sum(agent.rewards[-20:])}\t"
    )
    if np.sum(agent.rewards[-20:])>=1900:
        break
n_try = 1 action= 0 reward= 1 reward20= 1
n_try = 2 action= 1 reward= 100 reward20= 101
n_try = 3 action= 1 reward= 100 reward20= 201
n_try = 4 action= 0 reward= 1 reward20= 202
n_try = 5 action= 1 reward= 100 reward20= 302
n_try = 6 action= 0 reward= 1 reward20= 303
n_try = 7 action= 0 reward= 1 reward20= 304
n_try = 8 action= 1 reward= 100 reward20= 404
n_try = 9 action= 0 reward= 1 reward20= 405
n_try = 10 action= 0 reward= 1 reward20= 406
n_try = 11 action= 1 reward= 100 reward20= 506
n_try = 12 action= 0 reward= 1 reward20= 507
n_try = 13 action= 1 reward= 100 reward20= 607
n_try = 14 action= 1 reward= 100 reward20= 707
n_try = 15 action= 1 reward= 100 reward20= 807
n_try = 16 action= 1 reward= 100 reward20= 907
n_try = 17 action= 1 reward= 100 reward20= 1007
n_try = 18 action= 1 reward= 100 reward20= 1107
n_try = 19 action= 0 reward= 1 reward20= 1108
n_try = 20 action= 1 reward= 100 reward20= 1208
n_try = 21 action= 0 reward= 1 reward20= 1208
n_try = 22 action= 0 reward= 1 reward20= 1109
n_try = 23 action= 0 reward= 1 reward20= 1010
n_try = 24 action= 1 reward= 100 reward20= 1109
n_try = 25 action= 0 reward= 1 reward20= 1010
n_try = 26 action= 0 reward= 1 reward20= 1010
n_try = 27 action= 1 reward= 100 reward20= 1109
n_try = 28 action= 0 reward= 1 reward20= 1010
n_try = 29 action= 0 reward= 1 reward20= 1010
n_try = 30 action= 1 reward= 100 reward20= 1109
n_try = 31 action= 1 reward= 100 reward20= 1109
n_try = 32 action= 1 reward= 100 reward20= 1208
n_try = 33 action= 1 reward= 100 reward20= 1208
n_try = 34 action= 1 reward= 100 reward20= 1208
n_try = 35 action= 0 reward= 1 reward20= 1109
n_try = 36 action= 0 reward= 1 reward20= 1010
n_try = 37 action= 1 reward= 100 reward20= 1010
n_try = 38 action= 1 reward= 100 reward20= 1010
n_try = 39 action= 0 reward= 1 reward20= 1010
n_try = 40 action= 1 reward= 100 reward20= 1010
n_try = 41 action= 0 reward= 1 reward20= 1010
n_try = 42 action= 0 reward= 1 reward20= 1010
n_try = 43 action= 1 reward= 100 reward20= 1109
n_try = 44 action= 1 reward= 100 reward20= 1109
n_try = 45 action= 0 reward= 1 reward20= 1109
n_try = 46 action= 0 reward= 1 reward20= 1109
n_try = 47 action= 1 reward= 100 reward20= 1109
n_try = 48 action= 0 reward= 1 reward20= 1109
n_try = 49 action= 0 reward= 1 reward20= 1109
n_try = 50 action= 1 reward= 100 reward20= 1109
Fix 4: include the learning process
- Thoughts on Game1:
  - Reinforcement learning is really the formalization of the \(\to\) step in "understand the environment \(\to\) decide the action".
  - In the code so far, however, the agent decided the optimal action[1] intuitively the moment it understood the environment (env), so we cannot say the machine learned anything by itself.
- Review of the code so far
  - Declaring the classes
    - declaring the Env class
    - declaring the Agent class
  - Instantiating (initializing) the environment and the agent
  - Running the game by repeating a for loop
    - main code: (1) agent \(\to\) env (2) agent \(\leftarrow\) env
    - non-essential code: displaying the progress, checking the termination condition
- Shape of the code we will build next: we want the agent to look at the data and decide on its own that it should press button1.
  - Declaring the classes
    - declaring the Env class
    - declaring the Agent class // <—- must now include the learning process: revise the act function and add a learn function
  - Instantiating (initializing) the environment and the agent
  - Running the game by repeating a for loop
    - main code: (1) agent \(\to\) env (2) agent \(\leftarrow\) env // <—- add a step in which the agent analyzes the data and learns
    - non-essential code: displaying the progress, checking the termination condition
- How does the agent learn? Let \(q_0\) and \(q_1\) be the average reward observed so far for button 0 and button 1. If the agent presses the buttons as follows
  - probability of pressing button 0: \(\frac{q_0}{q_0+q_1}\)
  - probability of pressing button 1: \(\frac{q_1}{q_0+q_1}\)
  then, as time passes, it will press button 1 most of the time (see the numeric check below).
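A quick numeric check, plugging in the values this bandit will eventually produce (\(q_0=1\), \(q_1=100\)):
q0, q1 = 1.0, 100.0                      # the average rewards the agent ends up observing
prob = np.array([q0, q1]) / (q0 + q1)
print(prob)                              # [0.00990099 0.99009901] -> button1 about 99% of the time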
- Worry: what if \(t=0\)? what if \(t=1\)? … \(\to\) Solution: take random actions for a while to accumulate data, and compute \(q_0, q_1\) only after that.
- Code that uses the accumulated data to understand the environment and draw an action:
[1] i.e., pressing button1
agent.actions = [0,1,1,0,1,0,0]
agent.rewards = [1,101,102,1,99,1,1.2]
actions = np.array(agent.actions)
rewards = np.array(agent.rewards)
q0 = rewards[actions == 0].mean()
q1 = rewards[actions == 1].mean()
agent.q = np.array([q0,q1])
agent.q
array([ 1.05 , 100.66666667])
prob = agent.q / agent.q.sum()
prob
array([0.01032279, 0.98967721])
action = np.random.choice([0,1], p= agent.q / agent.q.sum())
action
1
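Putting the warm-up idea and the proportional rule together, the action-selection logic looks roughly like this (a sketch of what Agent.act in the final code below does; the helper name here is only for illustration):
def choose_action(n_experience, q, action_space, warmup=30):
    """Explore at random during the warm-up period, then sample in proportion to the learned q."""
    if n_experience < warmup:
        return action_space.sample()
    return np.random.choice([0, 1], p=q / q.sum())

space = gym.spaces.Discrete(2)
print(choose_action(5,  np.array([0.0, 0.0]), space))    # warm-up: 0 or 1 at random
print(choose_action(40, np.array([1.0, 100.0]), space))  # after warm-up: almost always 1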
- Final code
class Bandit:
    def step(self, action):
        if action == 0:
            return 1
        else:
            return 100

class Agent:
    def __init__(self):
        self.action_space = gym.spaces.Discrete(2)
        self.action = None
        self.reward = None
        self.actions = []
        self.rewards = []
        self.q = np.array([0,0])
        self.n_experience = 0
    def act(self):
        if self.n_experience < 30:
            # warm-up: not enough data yet, so act at random
            self.action = self.action_space.sample()
        else:
            # press each button with probability proportional to its average reward
            self.action = np.random.choice([0,1], p=self.q / self.q.sum())
    def save_experience(self):
        self.actions.append(self.action)
        self.rewards.append(self.reward)
        self.n_experience += 1
    def learn(self):
        if self.n_experience < 30:
            pass
        else:
            # q0, q1 = average reward observed so far for each button
            actions = np.array(self.actions)
            rewards = np.array(self.rewards)
            q0 = rewards[actions == 0].mean()
            q1 = rewards[actions == 1].mean()
            self.q = np.array([q0,q1])
env = Bandit()
agent = Agent()
for t in range(50):
    ## 1. main code
    # step1: agent >> env
    agent.act()
    env.agent_action = agent.action
    # step2: agent << env
    agent.reward = env.step(env.agent_action)
    agent.save_experience()
    # step3: learn
    agent.learn()
    ## 2. non-essential code
    print(
        f"n_try = {t+1}\t"
        f"action= {agent.action}\t"
        f"reward= {agent.reward}\t"
        f"reward20= {sum(agent.rewards[-20:])}\t"
        f"q = {agent.q}"
    )
    if np.sum(agent.rewards[-20:])>=1900:
        break
n_try = 1 action= 1 reward= 100 reward20= 100 q = [0 0]
n_try = 2 action= 1 reward= 100 reward20= 200 q = [0 0]
n_try = 3 action= 0 reward= 1 reward20= 201 q = [0 0]
n_try = 4 action= 1 reward= 100 reward20= 301 q = [0 0]
n_try = 5 action= 1 reward= 100 reward20= 401 q = [0 0]
n_try = 6 action= 1 reward= 100 reward20= 501 q = [0 0]
n_try = 7 action= 1 reward= 100 reward20= 601 q = [0 0]
n_try = 8 action= 0 reward= 1 reward20= 602 q = [0 0]
n_try = 9 action= 1 reward= 100 reward20= 702 q = [0 0]
n_try = 10 action= 0 reward= 1 reward20= 703 q = [0 0]
n_try = 11 action= 0 reward= 1 reward20= 704 q = [0 0]
n_try = 12 action= 0 reward= 1 reward20= 705 q = [0 0]
n_try = 13 action= 0 reward= 1 reward20= 706 q = [0 0]
n_try = 14 action= 1 reward= 100 reward20= 806 q = [0 0]
n_try = 15 action= 0 reward= 1 reward20= 807 q = [0 0]
n_try = 16 action= 1 reward= 100 reward20= 907 q = [0 0]
n_try = 17 action= 0 reward= 1 reward20= 908 q = [0 0]
n_try = 18 action= 0 reward= 1 reward20= 909 q = [0 0]
n_try = 19 action= 0 reward= 1 reward20= 910 q = [0 0]
n_try = 20 action= 0 reward= 1 reward20= 911 q = [0 0]
n_try = 21 action= 1 reward= 100 reward20= 911 q = [0 0]
n_try = 22 action= 1 reward= 100 reward20= 911 q = [0 0]
n_try = 23 action= 0 reward= 1 reward20= 911 q = [0 0]
n_try = 24 action= 0 reward= 1 reward20= 812 q = [0 0]
n_try = 25 action= 0 reward= 1 reward20= 713 q = [0 0]
n_try = 26 action= 1 reward= 100 reward20= 713 q = [0 0]
n_try = 27 action= 1 reward= 100 reward20= 713 q = [0 0]
n_try = 28 action= 0 reward= 1 reward20= 713 q = [0 0]
n_try = 29 action= 1 reward= 100 reward20= 713 q = [0 0]
n_try = 30 action= 0 reward= 1 reward20= 713 q = [ 1. 100.]
n_try = 31 action= 1 reward= 100 reward20= 812 q = [ 1. 100.]
n_try = 32 action= 1 reward= 100 reward20= 911 q = [ 1. 100.]
n_try = 33 action= 1 reward= 100 reward20= 1010 q = [ 1. 100.]
n_try = 34 action= 1 reward= 100 reward20= 1010 q = [ 1. 100.]
n_try = 35 action= 1 reward= 100 reward20= 1109 q = [ 1. 100.]
n_try = 36 action= 1 reward= 100 reward20= 1109 q = [ 1. 100.]
n_try = 37 action= 1 reward= 100 reward20= 1208 q = [ 1. 100.]
n_try = 38 action= 1 reward= 100 reward20= 1307 q = [ 1. 100.]
n_try = 39 action= 1 reward= 100 reward20= 1406 q = [ 1. 100.]
n_try = 40 action= 1 reward= 100 reward20= 1505 q = [ 1. 100.]
n_try = 41 action= 1 reward= 100 reward20= 1505 q = [ 1. 100.]
n_try = 42 action= 1 reward= 100 reward20= 1505 q = [ 1. 100.]
n_try = 43 action= 1 reward= 100 reward20= 1604 q = [ 1. 100.]
n_try = 44 action= 1 reward= 100 reward20= 1703 q = [ 1. 100.]
n_try = 45 action= 1 reward= 100 reward20= 1802 q = [ 1. 100.]
n_try = 46 action= 1 reward= 100 reward20= 1802 q = [ 1. 100.]
n_try = 47 action= 1 reward= 100 reward20= 1802 q = [ 1. 100.]
n_try = 48 action= 1 reward= 100 reward20= 1901 q = [ 1. 100.]
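Finally, since matplotlib was imported at the top, one quick way to look at the run (just a sketch, assuming the loop above has just finished) is to plot the per-try reward:
plt.plot(agent.rewards, 'o--')
plt.xlabel('try')
plt.ylabel('reward')
plt.title('reward per try: random during warm-up, then almost always 100')
plt.show()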