A2C trading results
fin_rl_a2c_v1.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
fin_rl_policy_gradiant_v1.ipynb → fin_rl_policy_gradient_v1.ipynb
RENAMED
@@ -6,10 +6,10 @@
  "id": "nwaAZRu1NTiI"
  },
  "source": [
- "#
+ "# Policy Gradient\n",
  "\n",
  "\n",
- "#### This version implements
+ "#### This version implements Policy Gradient using a custom enviroment (Unit 4)"
  ]
  },
  {
@@ -24,25 +24,17 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 44,
  "metadata": {
  "id": "LNXxxKojNTiL"
  },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "2022-12-27 12:47:16.481995: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
- "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
- "\n"
- ]
- }
- ],
+ "outputs": [],
  "source": [
  "import tensorflow as tf\n",
- "from tensorflow.keras import layers\n",
+ "from tensorflow.keras import layers, Model, Input\n",
  "from tensorflow.keras.utils import to_categorical\n",
+ "import tensorflow.keras.backend as K\n",
+ "\n",
  "import gym\n",
  "from gym import spaces\n",
  "from gym.utils import seeding\n",
@@ -66,63 +58,123 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# custom model to be able to run a custom loss with parameters\n",
+ "class CustomModel(tf.keras.Model):\n",
+ " def custom_loss(self,y, y_pred, d_returns):\n",
+ " log_like = y * K.log(y_pred)\n",
+ " # K.print_tensor(d_returns)\n",
+ " return K.sum(-log_like * d_returns )\n",
+ " \n",
+ " def train_step(self, data):\n",
+ " # Unpack the data. Its structure depends on your model and\n",
+ " # on what you pass to `fit()`.\n",
+ " if len(data) == 3:\n",
+ " x, y, sample_weight = data\n",
+ " else:\n",
+ " sample_weight = None\n",
+ " x, y = data\n",
+ "\n",
+ " # check if we passed the d_return\n",
+ " if isinstance(x, tuple):\n",
+ " x, d_return = x\n",
+ "\n",
+ " with tf.GradientTape() as tape:\n",
+ " y_pred = self(x, training=True) # Forward pass\n",
+ " # Compute the loss value.\n",
+ " y = tf.cast(y, tf.float32)\n",
+ " loss = self.custom_loss(y, y_pred, d_return)\n",
+ "\n",
+ " # Compute gradients\n",
+ " trainable_vars = self.trainable_variables\n",
+ " gradients = tape.gradient(loss, trainable_vars)\n",
+ "\n",
+ " # Update weights\n",
+ " self.optimizer.apply_gradients(zip(gradients, trainable_vars))\n",
+ "\n",
+ " # Update the metrics.\n",
+ " # Metrics are configured in `compile()`.\n",
+ " self.compiled_metrics.update_state(y, y_pred, sample_weight=sample_weight)\n",
+ "\n",
+ " # Return a dict mapping metric names to current value.\n",
+ " # Note that it will include the loss (tracked in self.metrics).\n",
+ " return {m.name: m.result() for m in self.metrics}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
  "metadata": {},
  "outputs": [],
  "source": [
  "class Policy:\n",
- " def __init__(self, env=None):\n",
+ " def __init__(self, env=None, action_size=2):\n",
  "\n",
- " self.action_size =
+ " self.action_size = action_size\n",
  "\n",
  " # Hyperparameters\n",
  " self.gamma = 0.95 # Discount rate\n",
- "
- " self.
- " self.epsilon_decay = 0.95 # Decay rate for epsilon\n",
- " self.update_rate = 5 # Number of steps until updating the target network\n",
- " self.batch_size = 200\n",
- " self.learning_rate = 1e-4\n",
+ "\n",
+ " self.learning_rate = 1e-3\n",
  " \n",
- "
- " self.model.summary()\n",
+ " # Construct DQN models\n",
  " self.env = env\n",
+ " self.action_size = action_size\n",
+ " self.action_space = [i for i in range(action_size)]\n",
+ " print(\"action space\",self.action_space)\n",
+ " # self.saved_log_probs = None\n",
+ " self.model= self._build_model()\n",
+ " self.model.summary()\n",
  "\n",
- " self.history = None\n",
- " self.scaler = None\n",
  "\n",
  " def _build_model(self):\n",
- "
- " \n",
- "
- "
- "
- "
- "
+ " x = Input(shape=(4,), name='x_input')\n",
+ " # y_true = Input( shape=(2,), name='y_true' )\n",
+ " d_returns = Input(shape=[1], name='d_returns')\n",
+ "\n",
+ " l = layers.Dense(16, activation = 'relu')(x)\n",
+ " l = layers.Dense(16, activation = 'relu')(l)\n",
+ " y_pred = layers.Dense(self.action_size, activation = 'softmax', name='y_pred')(l)\n",
  " \n",
  " optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)\n",
- "
- " #
- "
+ "\n",
+ " # model_train = Model( inputs=[x], outputs=[y_pred], name='train_only' )\n",
+ " model_train = CustomModel( inputs=x, outputs=y_pred, name='train_only' )\n",
+ " # model_predict = Model( inputs=x, outputs=y_pred, name='predict_only' )\n",
+ " model_train.compile(loss=None, optimizer=optimizer, metrics = ['accuracy'])\n",
+ " # use run_eagerly to print values inside the loss function to debug\n",
+ " # model_train.compile(loss=None, optimizer=optimizer, metrics = ['accuracy'], run_eagerly = True)\n",
+ "\n",
+ " return model_train\n",
  "\n",
  " def act(self, state):\n",
- " probs = self.model.predict([state])\n",
- " action =
- " return action.numpy()[0], tf.math.log(probs[0][action])\n",
+ " probs = self.model.predict(np.array([state]), verbose=0)[0]\n",
+ " action = np.random.choice(self.action_space, p=probs)\n",
  "\n",
- "
+ " return action\n",
+ "\n",
+ " # this implements the reinforce \n",
+ " def learn(self, n_training_episodes=None, max_t=None, print_every=100):\n",
  " # Help us to calculate the score during the training\n",
  " scores_deque = deque(maxlen=100)\n",
  " scores = []\n",
  " # Line 3 of pseudocode\n",
  " for i_episode in range(1, n_training_episodes+1):\n",
- " saved_log_probs = []\n",
+ " # saved_log_probs = []\n",
+ " saved_actions = []\n",
+ " saved_state = []\n",
  " rewards = []\n",
  " state = self.env.reset()\n",
  " # Line 4 of pseudocode\n",
  " for t in range(max_t):\n",
- "
- "
+ " saved_state.append(state)\n",
+ " action = self.act(state)\n",
+ " # action, log_prob = self.act(state)\n",
+ " # saved_log_probs.append(log_prob)\n",
+ " saved_actions.append(action)\n",
  " state, reward, done, _ = self.env.step(action)\n",
  " rewards.append(reward)\n",
  " if done:\n",
@@ -165,7 +217,7 @@
  " ## a normal python list would instead require O(N) to do this.\n",
  " for t in range(n_steps)[::-1]:\n",
  " disc_return_t = (returns[0] if len(returns)>0 else 0)\n",
- " returns.appendleft( gamma*disc_return_t + rewards[t] ) \n",
+ " returns.appendleft( self.gamma*disc_return_t + rewards[t] ) \n",
  " \n",
  " ## standardization of the returns is employed to make training more stable\n",
  " eps = np.finfo(np.float32).eps.item()\n",
@@ -173,22 +225,32 @@
  " # added to the standard deviation of the returns to avoid numerical instabilities \n",
  " returns = np.array(returns)\n",
  " returns = (returns - returns.mean()) / (returns.std() + eps)\n",
+ " # self.saved_log_probs = saved_log_probs\n",
  " \n",
  " # Line 7:\n",
- "
- "
- "
- "
+ " saved_state = np.array(saved_state)\n",
+ " # print(\"Saved state\", saved_state, saved_state.shape)\n",
+ " saved_actions = np.array(to_categorical(saved_actions, num_classes=self.action_size))\n",
+ " # print(\"Saved actions\", saved_actions, saved_actions.shape)\n",
+ " returns = returns.reshape(-1,1)\n",
+ " # print(\"Returns\", returns, returns.shape)\n",
+ " # this is the trick part, we send a tuple so the CustomModel is able to split the x and use \n",
+ " # the returns inside to calculate the custom loss\n",
+ " self.model.train_on_batch(x=(saved_state,returns), y=saved_actions)\n",
+ "\n",
+ " # policy_loss = []\n",
+ " # for action, log_prob, disc_return in zip(saved_actions, saved_log_probs, returns):\n",
+ " # policy_loss.append(-log_prob * disc_return)\n",
+ " # policy_loss = torch.cat(policy_loss).sum()\n",
  " \n",
- " # Line 8: gradient descent
+ " # # Line 8: PyTorch prefers gradient descent \n",
  " # optimizer.zero_grad()\n",
- " policy_loss.backward()\n",
- " self.model.train_on_batch()\n",
+ " # policy_loss.backward()\n",
  " # optimizer.step()\n",
  " \n",
  " if i_episode % print_every == 0:\n",
  " print('Episode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))\n",
- "
+ " \n",
  " return scores\n",
  "\n",
  "\n",
@@ -213,7 +275,7 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 47,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -445,15 +507,15 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 48,
  "metadata": {},
  "outputs": [
  {
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "
- "
+ "3067\n",
+ "1918\n"
  ]
  }
  ],
@@ -476,17 +538,75 @@
  },
  {
  "cell_type": "code",
- "execution_count":
+ "execution_count": 49,
  "metadata": {},
- "outputs": [
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "action space [0, 1, 2]\n",
+ "Model: \"train_only\"\n",
+ "_________________________________________________________________\n",
+ " Layer (type) Output Shape Param # \n",
+ "=================================================================\n",
+ " x_input (InputLayer) [(None, 4)] 0 \n",
+ " \n",
+ " dense_22 (Dense) (None, 16) 80 \n",
+ " \n",
+ " dense_23 (Dense) (None, 16) 272 \n",
+ " \n",
+ " y_pred (Dense) (None, 3) 51 \n",
+ " \n",
+ "=================================================================\n",
+ "Total params: 403\n",
+ "Trainable params: 403\n",
+ "Non-trainable params: 0\n",
+ "_________________________________________________________________\n",
+ "Episode 100\tAverage Score: -180.05\n",
+ "Episode 200\tAverage Score: -164.72\n",
+ "Episode 300\tAverage Score: -81.03\n",
+ "Episode 400\tAverage Score: -117.40\n",
+ "Episode 500\tAverage Score: -182.76\n",
+ "Episode 600\tAverage Score: -92.27\n",
+ "Episode 700\tAverage Score: -207.78\n",
+ "Episode 800\tAverage Score: -232.02\n",
+ "Episode 900\tAverage Score: -29.72\n",
+ "Episode 1000\tAverage Score: -44.37\n",
+ "Episode 1100\tAverage Score: -60.61\n",
+ "Episode 1200\tAverage Score: -67.30\n",
+ "Episode 1300\tAverage Score: -36.28\n",
+ "Episode 1400\tAverage Score: -60.42\n",
+ "Episode 1500\tAverage Score: -93.99\n",
+ "Episode 1600\tAverage Score: -70.92\n",
+ "Episode 1700\tAverage Score: -88.01\n",
+ "Episode 1800\tAverage Score: -21.69\n",
+ "Episode 1900\tAverage Score: -66.15\n",
+ "Episode 2000\tAverage Score: -96.49\n",
+ "Episode 2100\tAverage Score: -33.40\n",
+ "Episode 2200\tAverage Score: -25.62\n",
+ "Episode 2300\tAverage Score: -46.25\n",
+ "Episode 2400\tAverage Score: -63.88\n",
+ "Episode 2500\tAverage Score: -29.43\n",
+ "Episode 2600\tAverage Score: -19.85\n",
+ "Episode 2700\tAverage Score: -53.53\n",
+ "Episode 2800\tAverage Score: -42.98\n",
+ "Episode 2900\tAverage Score: -50.12\n",
+ "Episode 3000\tAverage Score: -27.25\n"
+ ]
+ }
+ ],
  "source": [
  "# create env\n",
  "max_steps = 20 \n",
  "env = CustTradingEnv(df=eth_train, max_steps=max_steps)\n",
  "\n",
- "model = Policy(env=env)\n",
- "#
- "
+ "model = Policy(env=env, action_size=3)\n",
+ "# model.learn(total_steps=6_000)\n",
+ "\n",
+ "model.learn(n_training_episodes=3000, max_t=20, print_every=100)\n",
+ "# model.learn(n_training_episodes=1000, max_t=1000, print_every=100)\n",
+ "env.close()\n"
  ]
  },
  {
@@ -495,8 +615,8 @@
  "metadata": {},
  "outputs": [],
  "source": [
- "model.save(\"./alt/
- "joblib.dump(env.get_scaler(),\"./alt/
+ "model.save(\"./alt/fin_rl_policy_gradient_v1\")\n",
+ "joblib.dump(env.get_scaler(),\"./alt/fin_rl_policy_gradient_v1.h5_scaler\")\n"
  ]
  },
  {
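The heart of this diff is the CustomModel.train_step override plus the (states, discounted returns) tuple passed as x, which lets a plain Keras model apply the REINFORCE loss -sum(log pi(a|s) * G_t) without a second loss input. Below is a self-contained sketch of that pattern only, assuming TF 2.x; the 4-feature observation, 3 actions, and the random stand-in batch are placeholders for illustration, not the notebook's CustTradingEnv data.

# Standalone sketch (illustration, not the notebook's code): the same
# custom-train_step trick used by CustomModel above, with the REINFORCE
# loss -sum(log pi(a|s) * G_t). Observation size 4, 3 actions and the
# random stand-in batch are assumptions.
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Input
from tensorflow.keras.utils import to_categorical

class ReinforceModel(tf.keras.Model):
    def train_step(self, data):
        # train_on_batch hands back what we passed in: x=(states, returns), y=one-hot actions
        if len(data) == 3:
            (states, d_returns), actions, _ = data
        else:
            (states, d_returns), actions = data
        with tf.GradientTape() as tape:
            probs = self(states, training=True)                       # pi(a|s)
            log_like = tf.cast(actions, tf.float32) * tf.math.log(probs)
            loss = tf.reduce_sum(-log_like * d_returns)               # policy-gradient loss
        grads = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return {"loss": loss}

obs_dim, n_actions = 4, 3
x = Input(shape=(obs_dim,), name="x_input")
h = layers.Dense(16, activation="relu")(x)
h = layers.Dense(16, activation="relu")(h)
y_pred = layers.Dense(n_actions, activation="softmax", name="y_pred")(h)
model = ReinforceModel(inputs=x, outputs=y_pred)
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3), loss=None)

# One illustrative update on a fake 20-step episode.
states = np.random.randn(20, obs_dim).astype("float32")
actions = to_categorical(np.random.randint(n_actions, size=20), num_classes=n_actions)
returns = np.random.randn(20, 1).astype("float32")
model.train_on_batch(x=(states, returns), y=actions)

Because whatever is fed to train_on_batch arrives unchanged in train_step, packing the discounted returns into x is what allows the per-step weighting inside the custom loss.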
test_return.py
ADDED
@@ -0,0 +1,16 @@
+ from collections import deque
+ import numpy as np
+ returns = deque(maxlen=20)
+ rewards = [1,1,1,1,1]
+ n_steps = len(rewards)
+
+ for t in range(n_steps)[::-1]:
+     print("Step=======",t)
+     disc_return_t = (returns[0] if len(returns)>0 else 0)
+     print("return",disc_return_t)
+     print("reward",rewards[t] )
+     returns.appendleft( 0.95 * disc_return_t +rewards[t] )
+     print("appended ret",returns )
+
+ returns = np.array(returns)
+ print(returns)
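This scratch file walks the deque backwards to build the discounted return G_t = r_t + gamma * G_{t+1}; with rewards [1,1,1,1,1] and gamma = 0.95 it ends with [4.52438125, 3.709875, 2.8525, 1.95, 1.0]. A quick closed-form check (not part of the committed file) reproduces the same numbers up to float rounding:

# Closed-form check of the deque loop: G_t = sum_k gamma**k * r[t+k].
# With rewards [1,1,1,1,1] and gamma 0.95 this gives
# [4.52438125, 3.709875, 2.8525, 1.95, 1.0], matching the script's output.
import numpy as np

gamma = 0.95
rewards = [1, 1, 1, 1, 1]
closed_form = [sum(gamma**k * r for k, r in enumerate(rewards[t:]))
               for t in range(len(rewards))]
print(np.array(closed_form))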