A2C trading results
fin_rl_a2c_v1.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
fin_rl_policy_gradiant_v1.ipynb → fin_rl_policy_gradient_v1.ipynb
RENAMED
@@ -6,10 +6,10 @@
  "id": "nwaAZRu1NTiI"
  },
  "source": [
- "#
+ "# Policy Gradient\n",
  "\n",
  "\n",
- "#### This version implements
+ "#### This version implements Policy Gradient using a custom enviroment (Unit 4)"
  ]
  },
  {
@@ -24,25 +24,17 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 44,
  "metadata": {
  "id": "LNXxxKojNTiL"
  },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-12-27 12:47:16.481995: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
- "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "import tensorflow as tf\n",
- "from tensorflow.keras import layers\n",
+ "from tensorflow.keras import layers, Model, Input\n",
  "from tensorflow.keras.utils import to_categorical\n",
+ "import tensorflow.keras.backend as K\n",
+ "\n",
  "import gym\n",
  "from gym import spaces\n",
  "from gym.utils import seeding\n",
@@ -66,63 +58,123 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# custom model to be able to run a custom loss with parameters\n",
+ "class CustomModel(tf.keras.Model):\n",
+ " def custom_loss(self,y, y_pred, d_returns):\n",
+ " log_like = y * K.log(y_pred)\n",
+ " # K.print_tensor(d_returns)\n",
+ " return K.sum(-log_like * d_returns )\n",
+ " \n",
+ " def train_step(self, data):\n",
+ " # Unpack the data. Its structure depends on your model and\n",
+ " # on what you pass to `fit()`.\n",
+ " if len(data) == 3:\n",
+ " x, y, sample_weight = data\n",
+ " else:\n",
+ " sample_weight = None\n",
+ " x, y = data\n",
+ "\n",
+ " # check if we passed the d_return\n",
+ " if isinstance(x, tuple):\n",
+ " x, d_return = x\n",
+ "\n",
+ " with tf.GradientTape() as tape:\n",
+ " y_pred = self(x, training=True) # Forward pass\n",
+ " # Compute the loss value.\n",
+ " y = tf.cast(y, tf.float32)\n",
+ " loss = self.custom_loss(y, y_pred, d_return)\n",
+ "\n",
+ " # Compute gradients\n",
+ " trainable_vars = self.trainable_variables\n",
+ " gradients = tape.gradient(loss, trainable_vars)\n",
+ "\n",
+ " # Update weights\n",
+ " self.optimizer.apply_gradients(zip(gradients, trainable_vars))\n",
+ "\n",
+ " # Update the metrics.\n",
+ " # Metrics are configured in `compile()`.\n",
+ " self.compiled_metrics.update_state(y, y_pred, sample_weight=sample_weight)\n",
+ "\n",
+ " # Return a dict mapping metric names to current value.\n",
+ " # Note that it will include the loss (tracked in self.metrics).\n",
+ " return {m.name: m.result() for m in self.metrics}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
  "metadata": {},
  "outputs": [],
  "source": [
  "class Policy:\n",
- " def __init__(self, env=None):\n",
+ " def __init__(self, env=None, action_size=2):\n",
  "\n",
- " self.action_size =
+ " self.action_size = action_size\n",
  "\n",
  " # Hyperparameters\n",
  " self.gamma = 0.95 # Discount rate\n",
- "
- " self.
- " self.epsilon_decay = 0.95 # Decay rate for epsilon\n",
- " self.update_rate = 5 # Number of steps until updating the target network\n",
- " self.batch_size = 200\n",
- " self.learning_rate = 1e-4\n",
+ "\n",
+ " self.learning_rate = 1e-3\n",
  " \n",
- "
- " self.model.summary()\n",
+ " # Construct DQN models\n",
  " self.env = env\n",
+ " self.action_size = action_size\n",
+ " self.action_space = [i for i in range(action_size)]\n",
+ " print(\"action space\",self.action_space)\n",
+ " # self.saved_log_probs = None\n",
+ " self.model= self._build_model()\n",
+ " self.model.summary()\n",
  "\n",
- " self.history = None\n",
- " self.scaler = None\n",
  "\n",
  " def _build_model(self):\n",
- "
- " \n",
- "
- "
- "
- "
- "
+ " x = Input(shape=(4,), name='x_input')\n",
+ " # y_true = Input( shape=(2,), name='y_true' )\n",
+ " d_returns = Input(shape=[1], name='d_returns')\n",
+ "\n",
+ " l = layers.Dense(16, activation = 'relu')(x)\n",
+ " l = layers.Dense(16, activation = 'relu')(l)\n",
+ " y_pred = layers.Dense(self.action_size, activation = 'softmax', name='y_pred')(l)\n",
  " \n",
  " optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)\n",
- "
- " #
- "
+ "\n",
+ " # model_train = Model( inputs=[x], outputs=[y_pred], name='train_only' )\n",
+ " model_train = CustomModel( inputs=x, outputs=y_pred, name='train_only' )\n",
+ " # model_predict = Model( inputs=x, outputs=y_pred, name='predict_only' )\n",
+ " model_train.compile(loss=None, optimizer=optimizer, metrics = ['accuracy'])\n",
+ " # use run_eagerly to print values inside the loss function to debug\n",
+ " # model_train.compile(loss=None, optimizer=optimizer, metrics = ['accuracy'], run_eagerly = True)\n",
+ "\n",
+ " return model_train\n",
  "\n",
  " def act(self, state):\n",
- " probs = self.model.predict([state])\n",
- " action =
- " return action.numpy()[0], tf.math.log(probs[0][action])\n",
+ " probs = self.model.predict(np.array([state]), verbose=0)[0]\n",
+ " action = np.random.choice(self.action_space, p=probs)\n",
  "\n",
- "
+ " return action\n",
+ "\n",
+ " # this implements the reinforce \n",
+ " def learn(self, n_training_episodes=None, max_t=None, print_every=100):\n",
  " # Help us to calculate the score during the training\n",
  " scores_deque = deque(maxlen=100)\n",
  " scores = []\n",
  " # Line 3 of pseudocode\n",
  " for i_episode in range(1, n_training_episodes+1):\n",
- " saved_log_probs = []\n",
+ " # saved_log_probs = []\n",
+ " saved_actions = []\n",
+ " saved_state = []\n",
  " rewards = []\n",
  " state = self.env.reset()\n",
  " # Line 4 of pseudocode\n",
  " for t in range(max_t):\n",
- "
- "
+ " saved_state.append(state)\n",
+ " action = self.act(state)\n",
+ " # action, log_prob = self.act(state)\n",
+ " # saved_log_probs.append(log_prob)\n",
+ " saved_actions.append(action)\n",
  " state, reward, done, _ = self.env.step(action)\n",
  " rewards.append(reward)\n",
  " if done:\n",
@@ -165,7 +217,7 @@
  " ## a normal python list would instead require O(N) to do this.\n",
  " for t in range(n_steps)[::-1]:\n",
  " disc_return_t = (returns[0] if len(returns)>0 else 0)\n",
- " returns.appendleft( gamma*disc_return_t + rewards[t] ) \n",
+ " returns.appendleft( self.gamma*disc_return_t + rewards[t] ) \n",
  " \n",
  " ## standardization of the returns is employed to make training more stable\n",
  " eps = np.finfo(np.float32).eps.item()\n",
@@ -173,22 +225,32 @@
  " # added to the standard deviation of the returns to avoid numerical instabilities \n",
  " returns = np.array(returns)\n",
  " returns = (returns - returns.mean()) / (returns.std() + eps)\n",
+ " # self.saved_log_probs = saved_log_probs\n",
  " \n",
  " # Line 7:\n",
- "
- "
- "
- "
+ " saved_state = np.array(saved_state)\n",
+ " # print(\"Saved state\", saved_state, saved_state.shape)\n",
+ " saved_actions = np.array(to_categorical(saved_actions, num_classes=self.action_size))\n",
+ " # print(\"Saved actions\", saved_actions, saved_actions.shape)\n",
+ " returns = returns.reshape(-1,1)\n",
+ " # print(\"Returns\", returns, returns.shape)\n",
+ " # this is the trick part, we send a tuple so the CustomModel is able to split the x and use \n",
+ " # the returns inside to calculate the custom loss\n",
+ " self.model.train_on_batch(x=(saved_state,returns), y=saved_actions)\n",
+ "\n",
+ " # policy_loss = []\n",
+ " # for action, log_prob, disc_return in zip(saved_actions, saved_log_probs, returns):\n",
+ " # policy_loss.append(-log_prob * disc_return)\n",
+ " # policy_loss = torch.cat(policy_loss).sum()\n",
  " \n",
- " # Line 8: gradient descent
+ " # # Line 8: PyTorch prefers gradient descent \n",
  " # optimizer.zero_grad()\n",
- " policy_loss.backward()\n",
- " self.model.train_on_batch()\n",
+ " # policy_loss.backward()\n",
  " # optimizer.step()\n",
  " \n",
  " if i_episode % print_every == 0:\n",
  " print('Episode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))\n",
- "
+ " \n",
  " return scores\n",
  "\n",
  "\n",
@@ -213,7 +275,7 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 47,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -445,15 +507,15 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 48,
  "metadata": {},
  "outputs": [
  {
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "
- "
+ "3067\n",
+ "1918\n"
  ]
  }
  ],
@@ -476,17 +538,75 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 49,
  "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "action space [0, 1, 2]\n",
+ "Model: \"train_only\"\n",
+ "_________________________________________________________________\n",
+ " Layer (type) Output Shape Param # \n",
+ "=================================================================\n",
+ " x_input (InputLayer) [(None, 4)] 0 \n",
+ " \n",
+ " dense_22 (Dense) (None, 16) 80 \n",
+ " \n",
+ " dense_23 (Dense) (None, 16) 272 \n",
+ " \n",
+ " y_pred (Dense) (None, 3) 51 \n",
+ " \n",
+ "=================================================================\n",
+ "Total params: 403\n",
+ "Trainable params: 403\n",
+ "Non-trainable params: 0\n",
+ "_________________________________________________________________\n",
+ "Episode 100\tAverage Score: -180.05\n",
+ "Episode 200\tAverage Score: -164.72\n",
+ "Episode 300\tAverage Score: -81.03\n",
+ "Episode 400\tAverage Score: -117.40\n",
+ "Episode 500\tAverage Score: -182.76\n",
+ "Episode 600\tAverage Score: -92.27\n",
+ "Episode 700\tAverage Score: -207.78\n",
+ "Episode 800\tAverage Score: -232.02\n",
+ "Episode 900\tAverage Score: -29.72\n",
+ "Episode 1000\tAverage Score: -44.37\n",
+ "Episode 1100\tAverage Score: -60.61\n",
+ "Episode 1200\tAverage Score: -67.30\n",
+ "Episode 1300\tAverage Score: -36.28\n",
+ "Episode 1400\tAverage Score: -60.42\n",
+ "Episode 1500\tAverage Score: -93.99\n",
+ "Episode 1600\tAverage Score: -70.92\n",
+ "Episode 1700\tAverage Score: -88.01\n",
+ "Episode 1800\tAverage Score: -21.69\n",
+ "Episode 1900\tAverage Score: -66.15\n",
+ "Episode 2000\tAverage Score: -96.49\n",
+ "Episode 2100\tAverage Score: -33.40\n",
+ "Episode 2200\tAverage Score: -25.62\n",
+ "Episode 2300\tAverage Score: -46.25\n",
+ "Episode 2400\tAverage Score: -63.88\n",
+ "Episode 2500\tAverage Score: -29.43\n",
+ "Episode 2600\tAverage Score: -19.85\n",
+ "Episode 2700\tAverage Score: -53.53\n",
+ "Episode 2800\tAverage Score: -42.98\n",
+ "Episode 2900\tAverage Score: -50.12\n",
+ "Episode 3000\tAverage Score: -27.25\n"
+ ]
+ }
+ ],
  "source": [
  "# create env\n",
  "max_steps = 20 \n",
  "env = CustTradingEnv(df=eth_train, max_steps=max_steps)\n",
  "\n",
- "model = Policy(env=env)\n",
- "#
- "
+ "model = Policy(env=env, action_size=3)\n",
+ "# model.learn(total_steps=6_000)\n",
+ "\n",
+ "model.learn(n_training_episodes=3000, max_t=20, print_every=100)\n",
+ "# model.learn(n_training_episodes=1000, max_t=1000, print_every=100)\n",
+ "env.close()\n"
  ]
  },
  {
@@ -495,8 +615,8 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "model.save(\"./alt/
- "joblib.dump(env.get_scaler(),\"./alt/
+ "model.save(\"./alt/fin_rl_policy_gradient_v1\")\n",
+ "joblib.dump(env.get_scaler(),\"./alt/fin_rl_policy_gradient_v1.h5_scaler\")\n"
  ]
  },
  {
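The heart of this diff is the CustomModel.train_step override plus the (states, discounted returns) tuple passed as x, which lets a plain Keras model apply the REINFORCE loss -sum(log pi(a|s) * G_t) without a second loss input. Below is a self-contained sketch of that pattern only, assuming TF 2.x; the 4-feature observation, 3 actions, and the random stand-in batch are placeholders for illustration, not the notebook's CustTradingEnv data.

# Standalone sketch (illustration, not the notebook's code): the same
# custom-train_step trick used by CustomModel above, with the REINFORCE
# loss -sum(log pi(a|s) * G_t). Observation size 4, 3 actions and the
# random stand-in batch are assumptions.
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Input
from tensorflow.keras.utils import to_categorical

class ReinforceModel(tf.keras.Model):
    def train_step(self, data):
        # train_on_batch hands back what we passed in: x=(states, returns), y=one-hot actions
        if len(data) == 3:
            (states, d_returns), actions, _ = data
        else:
            (states, d_returns), actions = data
        with tf.GradientTape() as tape:
            probs = self(states, training=True)                       # pi(a|s)
            log_like = tf.cast(actions, tf.float32) * tf.math.log(probs)
            loss = tf.reduce_sum(-log_like * d_returns)               # policy-gradient loss
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return {"loss": loss}

obs_dim, n_actions = 4, 3
x = Input(shape=(obs_dim,), name="x_input")
h = layers.Dense(16, activation="relu")(x)
h = layers.Dense(16, activation="relu")(h)
y_pred = layers.Dense(n_actions, activation="softmax", name="y_pred")(h)
model = ReinforceModel(inputs=x, outputs=y_pred)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss=None)

# One illustrative update on a fake 20-step episode.
states = np.random.randn(20, obs_dim).astype("float32")
actions = to_categorical(np.random.randint(n_actions, size=20), num_classes=n_actions)
returns = np.random.randn(20, 1).astype("float32")
model.train_on_batch(x=(states, returns), y=actions)

Because whatever is fed to train_on_batch arrives unchanged in train_step, packing the discounted returns into x is what allows the per-step weighting inside the custom loss.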
test_return.py
ADDED
@@ -0,0 +1,16 @@
+ from collections import deque
+ import numpy as np
+ returns = deque(maxlen=20)
+ rewards = [1,1,1,1,1]
+ n_steps = len(rewards)
+
+ for t in range(n_steps)[::-1]:
+     print("Step=======",t)
+     disc_return_t = (returns[0] if len(returns)>0 else 0)
+     print("return",disc_return_t)
+     print("reward",rewards[t] )
+     returns.appendleft( 0.95 * disc_return_t +rewards[t] )
+     print("appended ret",returns )
+
+ returns = np.array(returns)
+ print(returns)
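This scratch file walks the deque backwards to build the discounted return G_t = r_t + gamma * G_{t+1}; with rewards [1,1,1,1,1] and gamma = 0.95 it ends with [4.52438125, 3.709875, 2.8525, 1.95, 1.0]. A quick closed-form check (not part of the committed file) reproduces the same numbers up to float rounding:

# Closed-form check of the deque loop: G_t = sum_k gamma**k * r[t+k].
# With rewards [1,1,1,1,1] and gamma 0.95 this gives
# [4.52438125, 3.709875, 2.8525, 1.95, 1.0], matching the script's output.
import numpy as np

gamma = 0.95
rewards = [1, 1, 1, 1, 1]
closed_form = [sum(gamma**k * r for k, r in enumerate(rewards[t:]))
               for t in range(len(rewards))]
print(np.array(closed_form))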