Dueling Double DQN
Double DQN paper: https://arxiv.org/pdf/1509.06461.pdf
Dueling DQN paper: https://arxiv.org/pdf/1511.06581.pdf
Double DQN
- DQN has a problem of overestimating action values (expected returns).
- The Q values lead the agent to expect a higher return than it will actually receive.
- ⇒ This happens because the Q-learning update uses the maximum Q value over the next state's actions.
- The max operation amplifies this bias: the same (noisy) estimates are used both to select and to evaluate an action, so estimation noise turns into upward bias.
- Performance degrades when, for example, the environment's maximum true value is 0 but the agent's estimated maximum value is positive.
To fix this, two networks are used (sketched right after this list):
- Q Eval : action selection → the online network picks the best action for the next state
- Q Next : action evaluation → the target network evaluates how good that selected action is
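A minimal sketch (not part of the original code) of how this changes the TD target. It assumes batched NumPy arrays; the function and argument names (`double_dqn_target`, `q_eval_next`, `q_next_next`) are hypothetical and only mirror the agent code further below:

import numpy as np

def double_dqn_target(rewards, q_eval_next, q_next_next, dones, gamma=0.99):
    # Vanilla DQN target: r + gamma * max_a Q_next(s', a)  (select and evaluate with one net)
    # Double DQN target: the online net selects the action, the target net evaluates it
    best_actions = np.argmax(q_eval_next, axis=1)                      # selection (online net)
    evaluated = q_next_next[np.arange(len(rewards)), best_actions]     # evaluation (target net)
    return rewards + gamma * evaluated * (1.0 - dones.astype(np.float32))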
Dueling DQN
The network is split into two streams:
- V : Value function → the value of the state itself, i.e., the expected return from that state
- A : Advantage function → how valuable each action is in a given state compared to the other actions
- The Q function combines the two: Q = V + A (in the implementation below, the mean advantage is subtracted so V and A stay identifiable)
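A quick numeric illustration of that aggregation (the values are made up):

import numpy as np

V = np.array([[2.0]])                 # state value, shape (1, 1)
A = np.array([[1.0, -1.0, 0.0]])      # advantages per action, shape (1, n_actions)

# Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))
Q = V + (A - A.mean(axis=1, keepdims=True))
print(Q)  # [[3. 1. 2.]]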
1. Dueling Deep Q Network
import tensorflow as tf
import tensorflow.keras as keras

class DuelingDeepQNetwork(keras.Model):
    def __init__(self, n_actions, fc1_dims, fc2_dims):
        super(DuelingDeepQNetwork, self).__init__()
        self.dense1 = keras.layers.Dense(fc1_dims, activation='relu')
        self.dense2 = keras.layers.Dense(fc2_dims, activation='relu')
        self.V = keras.layers.Dense(1, activation=None)
        self.A = keras.layers.Dense(n_actions, activation=None)

    def call(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        # Value stream -> value of the current state
        V = self.V(x)
        # Advantage stream -> value of each action relative to the others
        A = self.A(x)
        # Aggregate: subtract the mean advantage so V and A stay identifiable
        Q = (V + (A - tf.math.reduce_mean(A, axis=1, keepdims=True)))
        return Q

    def advantage(self, state):
        x = self.dense1(state)
        x = self.dense2(x)
        A = self.A(x)
        return A
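A quick shape check for the network above; the input dimension 8 is an assumption matching LunarLander-v2 observations, and the snippet is only illustrative:

import numpy as np

net = DuelingDeepQNetwork(n_actions=4, fc1_dims=128, fc2_dims=128)
dummy_state = np.zeros((1, 8), dtype=np.float32)   # batch of one observation
print(net(dummy_state).shape)            # (1, 4) -> one Q value per action
print(net.advantage(dummy_state).shape)  # (1, 4) -> one advantage per action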
2. Replay Buffer
Using a replay buffer has the following advantages:
- Reduces the correlation between the samples used for each network update
- Speeds up learning by training on mini-batches
- Prevents forgetting by reusing past transitions
import numpy as np

class ReplayBuffer():
    def __init__(self, max_size, input_shape):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape),
                                     dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape),
                                         dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # np.bool was removed in recent NumPy versions; use the builtin bool
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        # Overwrite the oldest transition once the buffer is full
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        # Sample uniformly from the filled part of the buffer, without replacement
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        states = self.state_memory[batch]
        new_states = self.new_state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        dones = self.terminal_memory[batch]
        return states, actions, rewards, new_states, dones
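A small usage sketch for the buffer; the transition values and `input_shape=(8,)` are made up for illustration:

import numpy as np

buffer = ReplayBuffer(max_size=100000, input_shape=(8,))

# Store a single made-up transition (state, action, reward, next state, done)
state = np.zeros(8, dtype=np.float32)
buffer.store_transition(state, action=1, reward=0.5, state_=state, done=False)

# Sampling requires at least batch_size stored transitions
if buffer.mem_cntr >= 4:
    states, actions, rewards, states_, dones = buffer.sample_buffer(batch_size=4)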
3. Agent
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from dueling_ddqn_lunar import DuelingDeepQNetwork
from dueling_ddqn_replay_buffer_lunar import ReplayBuffer

class Agent():
    def __init__(self, lr, gamma, n_actions, epsilon, batch_size,
                 input_dims, epsilon_dec=1e-3, eps_end=0.01,
                 mem_size=100000, fc1_dims=128,
                 fc2_dims=128, replace=100):
        self.action_space = [i for i in range(n_actions)]
        # gamma = discount factor
        self.gamma = gamma
        # epsilon = probability of taking a random (exploratory) action
        self.epsilon = epsilon
        self.eps_dec = epsilon_dec
        self.eps_min = eps_end
        # replace = number of learning steps between target-network updates
        self.replace = replace
        self.batch_size = batch_size
        self.learn_step_counter = 0
        self.memory = ReplayBuffer(mem_size, input_dims)
        # online network: selects actions and is trained every step
        self.q_eval = DuelingDeepQNetwork(n_actions, fc1_dims, fc2_dims)
        # target network: evaluates the selected actions; its weights are
        # periodically copied from q_eval
        self.q_next = DuelingDeepQNetwork(n_actions, fc1_dims, fc2_dims)
        self.q_eval.compile(optimizer=Adam(learning_rate=lr),
                            loss='mean_squared_error')
        self.q_next.compile(optimizer=Adam(learning_rate=lr),
                            loss='mean_squared_error')

    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, observation):
        # epsilon-greedy exploration
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = np.array([observation])
            # the advantage stream is enough to rank actions for a single state
            actions = self.q_eval.advantage(state)
            # pick the action with the highest advantage
            action = tf.math.argmax(actions, axis=1).numpy()[0]
        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        # periodically copy the online weights into the target network
        if self.learn_step_counter % self.replace == 0:
            self.q_next.set_weights(self.q_eval.get_weights())

        states, actions, rewards, states_, dones = \
            self.memory.sample_buffer(self.batch_size)

        q_pred = self.q_eval(states)
        # target network evaluates the next states
        q_next = self.q_next(states_).numpy()
        q_target = q_pred.numpy()
        # online network selects the best next actions (Double DQN)
        max_actions = tf.math.argmax(self.q_eval(states_), axis=1).numpy()

        for idx, terminal in enumerate(dones):
            q_target[idx, actions[idx]] = rewards[idx] + \
                self.gamma * q_next[idx, max_actions[idx]] * (1 - int(terminal))

        self.q_eval.train_on_batch(states, q_target)

        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > \
            self.eps_min else self.eps_min
        self.learn_step_counter += 1
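To tie everything together, a minimal training loop sketch, assuming the classic OpenAI Gym API (pre-0.26: `env.reset()` returns only the observation, `env.step()` returns a 4-tuple) and illustrative hyperparameters:

import gym
import numpy as np

env = gym.make('LunarLander-v2')
agent = Agent(lr=1e-3, gamma=0.99, n_actions=4, epsilon=1.0,
              batch_size=64, input_dims=[8])

n_games = 500
scores = []
for i in range(n_games):
    observation = env.reset()
    done = False
    score = 0
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_transition(observation, action, reward, observation_, done)
        agent.learn()
        observation = observation_
        score += reward
    scores.append(score)
    print(f'episode {i}, score {score:.1f}, '
          f'avg {np.mean(scores[-100:]):.1f}, epsilon {agent.epsilon:.2f}')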