"""
To know more, visit 莫烦Python: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou
More about Reinforcement learning: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/

Dependencies:
tensorflow: 1.1.0
matplotlib
numpy
"""
import tensorflow as tf
import numpy as np
import gym

tf.set_random_seed(1)
np.random.seed(1)
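
# NOTE: this script targets the TensorFlow 1.x graph API (tf.placeholder, tf.Session,
# tf.set_random_seed), matching the "tensorflow: 1.1.0" dependency above. Running it under
# TensorFlow 2.x would require the tf.compat.v1 namespace plus
# tf.compat.v1.disable_eager_execution(); that migration is not shown here.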

# Hyper Parameters
BATCH_SIZE = 32
LR = 0.01                   # learning rate
EPSILON = 0.9               # greedy policy
GAMMA = 0.9                 # reward discount
TARGET_REPLACE_ITER = 100   # target update frequency
MEMORY_CAPACITY = 2000
MEMORY_COUNTER = 0          # for storing experience
LEARNING_STEP_COUNTER = 0   # for target updating
env = gym.make('CartPole-v0')
env = env.unwrapped
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]
MEMORY = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))     # initialize memory
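# Each row of MEMORY holds one transition laid out as [s (N_STATES values), a, r, s_ (N_STATES values)],
# hence N_STATES * 2 + 2 columns; store_transition() below overwrites rows in ring-buffer
# fashion using MEMORY_COUNTER % MEMORY_CAPACITY.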

# tf placeholders
tf_s = tf.placeholder(tf.float32, [None, N_STATES])
tf_a = tf.placeholder(tf.int32, [None, ])
tf_r = tf.placeholder(tf.float32, [None, ])
tf_s_ = tf.placeholder(tf.float32, [None, N_STATES])

with tf.variable_scope('q'):        # evaluation network
    l_eval = tf.layers.dense(tf_s, 10, tf.nn.relu, kernel_initializer=tf.random_normal_initializer(0, 0.1))
    q = tf.layers.dense(l_eval, N_ACTIONS, kernel_initializer=tf.random_normal_initializer(0, 0.1))

with tf.variable_scope('q_next'):   # target network
    l_target = tf.layers.dense(tf_s_, 10, tf.nn.relu)
    q_next = tf.layers.dense(l_target, N_ACTIONS)
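# The target network mirrors the evaluation network's architecture. It is never updated by
# the optimizer (gradients are stopped through q_next below); its weights are only refreshed
# by the periodic copy from the 'q' scope inside learn().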

q_target = tf.stop_gradient(tf_r + GAMMA * tf.reduce_max(q_next, axis=1))   # shape=(None, ), no gradient needed
a_one_hot = tf.one_hot(tf_a, depth=N_ACTIONS, dtype=tf.float32)
q_wrt_a = tf.reduce_sum(q * a_one_hot, axis=1)                              # shape=(None, ), Q for the chosen action

loss = tf.reduce_mean(tf.squared_difference(q_target, q_wrt_a))
train_op = tf.train.AdamOptimizer(LR).minimize(loss)
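# Standard Q-learning target: q_target = r + GAMMA * max_a' Q_target(s', a'). q_wrt_a selects
# Q(s, a) for the action actually taken via the one-hot mask, so the loss is the mean squared
# TD error between the two.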

sess = tf.Session()
sess.run(tf.global_variables_initializer())


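# Epsilon-greedy action selection: with probability EPSILON (0.9) exploit the current Q
# estimates, otherwise take a random action. EPSILON is kept fixed in this demo; an annealed
# (decaying) exploration schedule is a common refinement that is not shown here.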
def choose_action(s):
    s = s[np.newaxis, :]
    if np.random.uniform() < EPSILON:
        # forward the observation through the eval net and get a Q value for every action
        actions_value = sess.run(q, feed_dict={tf_s: s})
        action = np.argmax(actions_value)
    else:
        action = np.random.randint(0, N_ACTIONS)
    return action


def store_transition(s, a, r, s_):
    global MEMORY_COUNTER
    transition = np.hstack((s, [a, r], s_))
    # replace the old memory with new memory
    index = MEMORY_COUNTER % MEMORY_CAPACITY
    MEMORY[index, :] = transition
    MEMORY_COUNTER += 1


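# learn() does two things: every TARGET_REPLACE_ITER calls it copies the evaluation-net weights
# into the target net, then it samples BATCH_SIZE transitions from MEMORY (np.random.choice
# draws indices with replacement) and runs one gradient step on the TD loss.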
def learn():
    # update target net
    global LEARNING_STEP_COUNTER
    if LEARNING_STEP_COUNTER % TARGET_REPLACE_ITER == 0:
        t_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_next')
        e_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q')
        sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
    LEARNING_STEP_COUNTER += 1

    # learning
    sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
    b_memory = MEMORY[sample_index, :]
    b_s = b_memory[:, :N_STATES]
    b_a = b_memory[:, N_STATES].astype(int)
    b_r = b_memory[:, N_STATES + 1]
    b_s_ = b_memory[:, -N_STATES:]
    sess.run(train_op, {tf_s: b_s, tf_a: b_a, tf_r: b_r, tf_s_: b_s_})

print('\nCollecting experience...')
for i_episode in range(400):
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = choose_action(s)

        # take action
        s_, r, done, info = env.step(a)

        # modify the reward
        x, x_dot, theta, theta_dot = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1 + r2
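        # The shaped reward above replaces CartPole's default +1 per step: r1 is larger when
        # the cart stays near the centre of the track and r2 when the pole stays near upright,
        # giving a more informative learning signal than the constant env reward.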

        store_transition(s, a, r, s_)

        ep_r += r
        if MEMORY_COUNTER > MEMORY_CAPACITY:
            learn()
            if done:
                print('Ep: ', i_episode,
                      '| Ep_r: ', round(ep_r, 2))

        if done:
            break
        s = s_