"""
To know more, visit 莫烦Python: https://morvanzhou.github.io/tutorials/
My Youtube Channel: https://www.youtube.com/user/MorvanZhou
More about Reinforcement learning: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/

Dependencies:
tensorflow: 1.1.0
matplotlib
numpy
"""
import tensorflow as tf
import numpy as np
import gym

tf.set_random_seed(1)
np.random.seed(1)
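
# NOTE: this script targets the TensorFlow 1.x graph API (tf.placeholder, tf.Session,
# tf.set_random_seed), matching the "tensorflow: 1.1.0" dependency above. Running it under
# TensorFlow 2.x would require the tf.compat.v1 namespace plus
# tf.compat.v1.disable_eager_execution(); that migration is not shown here.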

# Hyper Parameters
BATCH_SIZE = 32
LR = 0.01                   # learning rate
EPSILON = 0.9               # greedy policy
GAMMA = 0.9                 # reward discount
TARGET_REPLACE_ITER = 100   # target update frequency
MEMORY_CAPACITY = 2000
MEMORY_COUNTER = 0          # for storing experience
LEARNING_STEP_COUNTER = 0   # for target updating
env = gym.make('CartPole-v0')
env = env.unwrapped
N_ACTIONS = env.action_space.n
N_STATES = env.observation_space.shape[0]
MEMORY = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))     # initialize memory
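# Each row of MEMORY holds one transition laid out as [s (N_STATES values), a, r, s_ (N_STATES values)],
# hence N_STATES * 2 + 2 columns; store_transition() below overwrites rows in ring-buffer
# fashion using MEMORY_COUNTER % MEMORY_CAPACITY.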

# tf placeholders
tf_s = tf.placeholder(tf.float32, [None, N_STATES])
tf_a = tf.placeholder(tf.int32, [None, ])
tf_r = tf.placeholder(tf.float32, [None, ])
tf_s_ = tf.placeholder(tf.float32, [None, N_STATES])

with tf.variable_scope('q'):        # evaluation network
    l_eval = tf.layers.dense(tf_s, 10, tf.nn.relu, kernel_initializer=tf.random_normal_initializer(0, 0.1))
    q = tf.layers.dense(l_eval, N_ACTIONS, kernel_initializer=tf.random_normal_initializer(0, 0.1))

with tf.variable_scope('q_next'):   # target network
    l_target = tf.layers.dense(tf_s_, 10, tf.nn.relu)
    q_next = tf.layers.dense(l_target, N_ACTIONS)
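# The target network mirrors the evaluation network's architecture. It is never updated by
# the optimizer (gradients are stopped through q_next below); its weights are only refreshed
# by the periodic copy from the 'q' scope inside learn().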

q_target = tf.stop_gradient(tf_r + GAMMA * tf.reduce_max(q_next, axis=1))   # shape=(None, ), no gradient needed
a_one_hot = tf.one_hot(tf_a, depth=N_ACTIONS, dtype=tf.float32)
q_wrt_a = tf.reduce_sum(q * a_one_hot, axis=1)                              # shape=(None, ), Q for the chosen action

loss = tf.reduce_mean(tf.squared_difference(q_target, q_wrt_a))
train_op = tf.train.AdamOptimizer(LR).minimize(loss)
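# Standard Q-learning target: q_target = r + GAMMA * max_a' Q_target(s', a'). q_wrt_a selects
# Q(s, a) for the action actually taken via the one-hot mask, so the loss is the mean squared
# TD error between the two.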

sess = tf.Session()
sess.run(tf.global_variables_initializer())


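# Epsilon-greedy action selection: with probability EPSILON (0.9) exploit the current Q
# estimates, otherwise take a random action. EPSILON is kept fixed in this demo; an annealed
# (decaying) exploration schedule is a common refinement that is not shown here.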
def choose_action(s):
    s = s[np.newaxis, :]
    if np.random.uniform() < EPSILON:
        # forward the observation through the eval net and get a Q value for every action
        actions_value = sess.run(q, feed_dict={tf_s: s})
        action = np.argmax(actions_value)
    else:
        action = np.random.randint(0, N_ACTIONS)
    return action


def store_transition(s, a, r, s_):
    global MEMORY_COUNTER
    transition = np.hstack((s, [a, r], s_))
    # replace the old memory with new memory
    index = MEMORY_COUNTER % MEMORY_CAPACITY
    MEMORY[index, :] = transition
    MEMORY_COUNTER += 1


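# learn() does two things: every TARGET_REPLACE_ITER calls it copies the evaluation-net weights
# into the target net, then it samples BATCH_SIZE transitions from MEMORY (np.random.choice
# draws indices with replacement) and runs one gradient step on the TD loss.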
def learn():
    # update target net
    global LEARNING_STEP_COUNTER
    if LEARNING_STEP_COUNTER % TARGET_REPLACE_ITER == 0:
        t_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_next')
        e_params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q')
        sess.run([tf.assign(t, e) for t, e in zip(t_params, e_params)])
    LEARNING_STEP_COUNTER += 1

    # learning
    sample_index = np.random.choice(MEMORY_CAPACITY, BATCH_SIZE)
    b_memory = MEMORY[sample_index, :]
    b_s = b_memory[:, :N_STATES]
    b_a = b_memory[:, N_STATES].astype(int)
    b_r = b_memory[:, N_STATES + 1]
    b_s_ = b_memory[:, -N_STATES:]
    sess.run(train_op, {tf_s: b_s, tf_a: b_a, tf_r: b_r, tf_s_: b_s_})

print('\nCollecting experience...')
for i_episode in range(400):
    s = env.reset()
    ep_r = 0
    while True:
        env.render()
        a = choose_action(s)

        # take action
        s_, r, done, info = env.step(a)

        # modify the reward
        x, x_dot, theta, theta_dot = s_
        r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
        r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
        r = r1 + r2
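        # The shaped reward above replaces CartPole's default +1 per step: r1 is larger when
        # the cart stays near the centre of the track and r2 when the pole stays near upright,
        # giving a more informative learning signal than the constant env reward.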

        store_transition(s, a, r, s_)

        ep_r += r
        if MEMORY_COUNTER > MEMORY_CAPACITY:
            learn()
            if done:
                print('Ep: ', i_episode,
                      '| Ep_r: ', round(ep_r, 2))

        if done:
            break
        s = s_