{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "A100",
"authorship_tag": "ABX9TyN71WxG6t24pCmMB3EAti2l",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"6a9a993a2bde4bc7acdd44e35ee8a423": {
"model_module": "@jupyter-widgets/controls",
"model_name": "VBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "VBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "VBoxView",
"box_style": "",
"children": [
"IPY_MODEL_63b857b0db6f4b5bbb55230dd1e0f66e",
"IPY_MODEL_ae1f1250131c4283b929d3108e581926"
],
"layout": "IPY_MODEL_e8190f4f922d4a1c9f8c730604c7e806"
}
},
"63b857b0db6f4b5bbb55230dd1e0f66e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_2b5adaa739be47f088c2a829508a1d43",
"placeholder": "",
"style": "IPY_MODEL_fa697fde7f024d07aee37a56b55fbf59",
"value": "0.472 MB of 0.472 MB uploaded\r"
}
},
"ae1f1250131c4283b929d3108e581926": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9dbb0157705f4efb90649e4468708109",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_36f393f4de7c434c8480fbf195d0ac8b",
"value": 1
}
},
"e8190f4f922d4a1c9f8c730604c7e806": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"2b5adaa739be47f088c2a829508a1d43": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"fa697fde7f024d07aee37a56b55fbf59": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"9dbb0157705f4efb90649e4468708109": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"36f393f4de7c434c8480fbf195d0ac8b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"0a85aa309bda4b9abf9240eeb8f403f0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "VBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "VBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "VBoxView",
"box_style": "",
"children": [
"IPY_MODEL_9df202bad1e84ef8bc68982042d8831f",
"IPY_MODEL_7681267bc11d410ebc3954064d1cd416",
"IPY_MODEL_eaf3299530ba4f68a75b107d54bcd027",
"IPY_MODEL_2b4660cfb7c24a269c8dda2a89c7fcef"
],
"layout": "IPY_MODEL_fb800a6d97954553b45cfb9c712dab9c"
}
},
"38c9ff2937f142fdaf2fdf488fda9656": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9c102c35b48a454d9e83f61507097b4c",
"placeholder": "",
"style": "IPY_MODEL_aa87be83bcc24e8aad06ffa648d1a057",
"value": "
Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. "
}
},
"1e8f3c64ce81450994fc21b78e3aba76": {
"model_module": "@jupyter-widgets/controls",
"model_name": "PasswordModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "PasswordModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "PasswordView",
"continuous_update": true,
"description": "Token:",
"description_tooltip": null,
"disabled": false,
"layout": "IPY_MODEL_a3eee979a8d544e8bdb76682d4e1a89f",
"placeholder": "",
"style": "IPY_MODEL_7089322af59a42d4977b3251f0b7e2cf",
"value": ""
}
},
"c80588db8c264ec0a4c17c890c213963": {
"model_module": "@jupyter-widgets/controls",
"model_name": "CheckboxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "CheckboxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "CheckboxView",
"description": "Add token as git credential?",
"description_tooltip": null,
"disabled": false,
"indent": true,
"layout": "IPY_MODEL_057290b3b190427c8384d5a271bac2b5",
"style": "IPY_MODEL_53c0c50ce74d425d8125d70aeead96c8",
"value": true
}
},
"5aa58b3190a148b1b9bbe28acc811192": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ButtonModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ButtonModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ButtonView",
"button_style": "",
"description": "Login",
"disabled": false,
"icon": "",
"layout": "IPY_MODEL_5bd49cf339f64cf2b46079cf75bda5fd",
"style": "IPY_MODEL_c78516a5de83466f884572700ce225d1",
"tooltip": ""
}
},
"12a46681a6004746b6869569c65b71a2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9444ddb29edd421da3d74baf0cece75d",
"placeholder": "",
"style": "IPY_MODEL_20cdbaaf3958455c862a2756e92e8f92",
"value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. "
}
},
"fb800a6d97954553b45cfb9c712dab9c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": "center",
"align_self": null,
"border": null,
"bottom": null,
"display": "flex",
"flex": null,
"flex_flow": "column",
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "50%"
}
},
"9c102c35b48a454d9e83f61507097b4c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"aa87be83bcc24e8aad06ffa648d1a057": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"a3eee979a8d544e8bdb76682d4e1a89f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"7089322af59a42d4977b3251f0b7e2cf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"057290b3b190427c8384d5a271bac2b5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"53c0c50ce74d425d8125d70aeead96c8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"5bd49cf339f64cf2b46079cf75bda5fd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c78516a5de83466f884572700ce225d1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ButtonStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ButtonStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"button_color": null,
"font_weight": ""
}
},
"9444ddb29edd421da3d74baf0cece75d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"20cdbaaf3958455c862a2756e92e8f92": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"eb83010cb46043268a8b36f8f25c35e7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_8247f0e0fe2d48bdb1cd9627f6e53417",
"placeholder": "",
"style": "IPY_MODEL_ffb23aea10c94126bd8bdc54b265908d",
"value": "Connecting..."
}
},
"8247f0e0fe2d48bdb1cd9627f6e53417": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ffb23aea10c94126bd8bdc54b265908d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"9df202bad1e84ef8bc68982042d8831f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_075428eaefb44bcbb96d96d2019deeef",
"placeholder": "",
"style": "IPY_MODEL_c75f2dcbcd724ba0aed77744c6943faf",
"value": "Token is valid (permission: write)."
}
},
"7681267bc11d410ebc3954064d1cd416": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_6dba6d0fdf464e84a1319e793974a455",
"placeholder": "",
"style": "IPY_MODEL_d1b60ca0cb6947a7afb1007de23f59aa",
"value": "Your token has been saved in your configured git credential helpers (store)."
}
},
"eaf3299530ba4f68a75b107d54bcd027": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_3f86a6ed43c14dbeb965535c44a114bd",
"placeholder": "",
"style": "IPY_MODEL_f458884d36c64771ba0011bc94017dcf",
"value": "Your token has been saved to /root/.cache/huggingface/token"
}
},
"2b4660cfb7c24a269c8dda2a89c7fcef": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_e902c5cb7e884850ad1ba65fe7300a7a",
"placeholder": "",
"style": "IPY_MODEL_cd97d20ae2e742ed9476c0829f1ee586",
"value": "Login successful"
}
},
"075428eaefb44bcbb96d96d2019deeef": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c75f2dcbcd724ba0aed77744c6943faf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"6dba6d0fdf464e84a1319e793974a455": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d1b60ca0cb6947a7afb1007de23f59aa": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"3f86a6ed43c14dbeb965535c44a114bd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f458884d36c64771ba0011bc94017dcf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"e902c5cb7e884850ad1ba65fe7300a7a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"cd97d20ae2e742ed9476c0829f1ee586": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ImSfwIQxXDmD",
"outputId": "d7b6c771-5fe8-4df4-9727-98373f3c92c1"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/drive\n"
]
}
],
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"# Directory where the dataset will be stored\n",
"dataset_dir = \"TinyStories4\"\n",
"\n",
"# Create the TinyStories directory if it doesn't exist\n",
"os.makedirs(dataset_dir, exist_ok=True)\n",
"\n",
"# URL of the dataset archive\n",
"dataset_url = \"https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz\"\n",
"# Name of the file to save the downloaded dataset archive\n",
"dataset_archive_path = os.path.join(dataset_dir, \"TinyStories_all_data.tar.gz\")\n",
"\n",
"# Download the archive\n",
"if not os.path.exists(dataset_archive_path):\n",
" os.system(f\"wget {dataset_url} -O {dataset_archive_path}\")\n",
"\n",
"# Extract the archive into the TinyStories directory\n",
"os.system(f\"tar -xzf {dataset_archive_path} -C {dataset_dir}\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "L9gB6U-KXHZY",
"outputId": "8c46b020-3881-4436-bf82-9e951a1a15be"
},
"execution_count": 2,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0"
]
},
"metadata": {},
"execution_count": 2
}
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import glob\n",
"import json\n",
"\n",
"shard_filenames = sorted(glob.glob(os.path.join('TinyStories4', \"*.json\")))\n",
"with open(shard_filenames[0], \"r\") as f:\n",
" data = json.load(f)\n",
"\n",
"stories = [x['story'] for x in data]\n",
"text = \"\\n\".join(stories)"
],
"metadata": {
"id": "uXrqxXeyXRwR"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"len(text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3Isc2XRfYGW6",
"outputId": "d522f4c4-3e81-4714-cafe-a8dc393ae512"
},
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"77586884"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"source": [
"import string\n",
"\n",
"# Define the set of characters to remove\n",
"remove_chars = \"\\$%&*+-/;`|~ éñ–—…\"\n",
"\n",
"# Create a translation table that maps the characters to be removed to None\n",
"trans_table = str.maketrans(remove_chars, ' ' * len(remove_chars))\n",
"\n",
"# Remove the characters from the string using the translation table\n",
"text = text.translate(trans_table)"
],
"metadata": {
"id": "7PlaC0WIYeD-"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"### vocab size and characters used\n",
"\n",
"chars = sorted(list(set(text)))\n",
"vocab_size = len(chars)\n",
"print(''.join(chars))\n",
"print(vocab_size)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DpPYXMqJZkU7",
"outputId": "66cbd95c-4342-48ff-ce7f-de321d7d0501"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\t\n",
" 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz \n",
"66\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import torch\n",
"import torch.nn as nn\n",
"from torch.nn import functional as F\n",
"torch.manual_seed(1137)\n",
"\n",
"# hyperparameters\n",
"batch_size = 64 # how many independent sequences will we process in parallel?\n",
"block_size = 256 # what is the maximum context length for predictions?\n",
"max_iters = 6000\n",
"eval_interval = 100\n",
"learning_rate = 1e-3\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"eval_iters = 200\n",
"n_embed = 384\n",
"n_head = 6\n",
"n_layer = 6\n",
"dropout = 0.0\n",
"# ------------"
],
"metadata": {
"id": "FQr01U7Hahth"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"### character encoding and decoding function, stoi : string to integer, itos: integer to string\n",
"\n",
"import torch\n",
"\n",
"stoi = { ch:i for i, ch in enumerate(chars) }\n",
"itos = {i:ch for i,ch in enumerate(chars)}\n",
"encode = lambda s: [stoi[c] for c in s]\n",
"decode = lambda l: \"\".join([itos[x] for x in l])\n",
"\n",
"data = torch.tensor(encode(text), dtype = torch.long)"
],
"metadata": {
"id": "lQcB-pEzZvoh"
},
"execution_count": 8,
"outputs": []
},
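{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above builds a character-level tokenizer: `stoi`/`itos` map between characters and integer ids, and the whole cleaned corpus is encoded into a single `torch.long` tensor. A minimal sketch of how `encode`/`decode` behave (illustrative only; the sample string is not from the original run):\n",
"\n",
"```python\n",
"sample = \"Once upon a time\"\n",
"ids = encode(sample)          # one integer id per character\n",
"assert decode(ids) == sample  # decoding reverses encoding exactly\n",
"```"
]
},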
{
"cell_type": "code",
"source": [
"### creating the train and text splits\n",
"\n",
"n = int(0.9*len(data))\n",
"train_data = data[:n]\n",
"val_data = data[n:]"
],
"metadata": {
"id": "u6wCdZ5waJNy"
},
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# data loading\n",
"\n",
"def get_batch(split):\n",
" # generate a small batch of data of inputs x and targets y\n",
" data = train_data if split == 'train' else val_data\n",
" ix = torch.randint(len(data) - block_size, (batch_size,))\n",
" x = torch.stack([data[i:i+block_size] for i in ix])\n",
" y = torch.stack([data[i+1:i+block_size+1] for i in ix])\n",
" return x, y"
],
"metadata": {
"id": "F86Zm98WaS8a"
},
"execution_count": 10,
"outputs": []
},
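{
"cell_type": "markdown",
"metadata": {},
"source": [
"`get_batch` samples `batch_size` random windows of length `block_size`; the targets `y` are the inputs shifted one character to the right, so position `t` of `y` is the next-character label for the first `t+1` characters of `x`. A quick shape check, assuming the hyperparameters above (illustrative, not part of the original run):\n",
"\n",
"```python\n",
"xb, yb = get_batch('train')\n",
"print(xb.shape, yb.shape)  # torch.Size([64, 256]) torch.Size([64, 256]) -> (batch_size, block_size)\n",
"```"
]
},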
{
"cell_type": "code",
"source": [
"class RMSNorm(nn.Module): #### x * (sqrt(mean(x**2))\n",
" def __init__(self, n_embed, eps = 1e-6):\n",
" super().__init__()\n",
" self.eps = eps\n",
" self.weight = nn.Parameter(torch.ones(n_embed))\n",
"\n",
" def _norm(self, x):\n",
" return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)\n",
"\n",
" def forward(self, x):\n",
" output = self._norm(x.float()).type_as(x)\n",
" return output * self.weight"
],
"metadata": {
"id": "_93TwyTwaknl"
},
"execution_count": 11,
"outputs": []
},
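{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `RMSNorm` layer above normalises each embedding vector by its root-mean-square instead of subtracting the mean and dividing by the standard deviation (as LayerNorm does), and scales by a learned per-channel weight with no bias:\n",
"\n",
"$$\\mathrm{RMSNorm}(x) = \\frac{x}{\\sqrt{\\tfrac{1}{d}\\sum_{i=1}^{d} x_i^2 + \\epsilon}} \\odot w,$$\n",
"\n",
"where $d$ is `n_embed`, $\\epsilon$ is the small constant `eps`, and $w$ is `self.weight`."
]
},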
{
"cell_type": "code",
"source": [
"class MoeLayer(nn.Module):\n",
" def __init__(self, experts, gate, k=2):\n",
" super().__init__()\n",
" assert len(experts) > 0\n",
" self.experts = nn.ModuleList(experts)\n",
" self.gate = gate\n",
" self.k = k\n",
"\n",
" def forward(self, inputs: torch.Tensor):\n",
" inputs_squashed = inputs.view(-1, inputs.shape[-1])\n",
" gate_logits = self.gate(inputs_squashed)\n",
" weights, selected_experts = torch.topk(\n",
" gate_logits, self.k\n",
" )\n",
" weights = nn.functional.softmax(\n",
" weights,\n",
" dim=1,\n",
" dtype=torch.float,\n",
" ).type_as(inputs)\n",
" results = torch.zeros_like(inputs_squashed)\n",
" for i, expert in enumerate(self.experts):\n",
" batch_idx, nth_expert = torch.where(selected_experts == i)\n",
" results[batch_idx] += weights[batch_idx, nth_expert, None] * expert(\n",
" inputs_squashed[batch_idx]\n",
" )\n",
" return results.view_as(inputs)"
],
"metadata": {
"id": "KYiPeTfua3NM"
},
"execution_count": 12,
"outputs": []
},
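{
"cell_type": "markdown",
"metadata": {},
"source": [
"`MoeLayer` implements token-level top-k routing: every token's embedding is scored by the linear `gate`, the `k=2` highest-scoring experts are selected, their gate scores are renormalised with a softmax over just those `k` entries, and the layer output is the weighted sum of those experts' outputs. A tiny illustration of the routing step on made-up logits (assumes nothing beyond `torch`):\n",
"\n",
"```python\n",
"import torch\n",
"\n",
"# two tokens, four experts\n",
"gate_logits = torch.tensor([[1.0, 0.2, 3.0, -1.0],\n",
"                            [0.5, 2.0, 0.1,  0.4]])\n",
"weights, selected_experts = torch.topk(gate_logits, k=2)  # top-2 experts per token\n",
"weights = torch.softmax(weights, dim=1)                   # renormalise over the selected experts only\n",
"print(selected_experts)  # tensor([[2, 0], [1, 0]])\n",
"print(weights)           # rows sum to 1\n",
"```"
]
},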
{
"cell_type": "code",
"source": [
"class Head(nn.Module):\n",
" def __init__(self, head_size):\n",
" super().__init__()\n",
" self.key = nn.Linear(n_embed, head_size, bias = False)\n",
" self.query = nn.Linear(n_embed, head_size, bias = False)\n",
" self.value = nn.Linear(n_embed, head_size, bias = False)\n",
" self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, x):\n",
" B, T, C = x.shape\n",
" k = self.key(x)\n",
" q = self.query(x)\n",
" wei = q @ k.transpose(-2, -1) * C**-0.5\n",
" wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))\n",
" wei = F.softmax(wei, dim=-1)\n",
" wei = self.dropout(wei)\n",
" v = self.value(x)\n",
" out = wei @ v\n",
" return out\n",
"\n",
"class MulitHeadAttention(nn.Module):\n",
" def __init__(self, num_heads, head_size):\n",
" super().__init__()\n",
" self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])\n",
" self.proj = nn.Linear(n_embed, n_embed)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, x):\n",
" x = torch.cat([head(x) for head in self.heads], dim=-1)\n",
" out = self.dropout(self.proj(x))\n",
" return out\n",
"\n",
"\n",
"class Expert(nn.Module):\n",
" def __init__(self, n_embed):\n",
" super().__init__()\n",
" self.net = nn.Sequential(\n",
" nn.Linear(n_embed, 4* n_embed),\n",
" nn.SiLU(),\n",
" nn.Linear(4 * n_embed, n_embed),\n",
" nn.Dropout(dropout))\n",
"\n",
" def forward(self, x):\n",
" return self.net(x)\n",
"\n",
"class Block(nn.Module):\n",
" def __init__(self, n_embed, n_head, num_experts=4):\n",
" super().__init__()\n",
" self.sa_head= MulitHeadAttention(n_head, n_embed//n_head)\n",
" self.ffw = MoeLayer(\n",
" experts=[Expert(n_embed) for _ in range(num_experts)],\n",
" gate=nn.Linear(n_embed, num_experts, bias=False),\n",
" )\n",
"\n",
"# self.ffw= FeedForward(n_embed)\n",
" self.ln1 = RMSNorm(n_embed)\n",
" self.ln2 = RMSNorm(n_embed)\n",
"\n",
" def forward(self, x):\n",
" x = x + self.sa_head(self.ln1(x))\n",
" x = x+self.ffw(self.ln2(x))\n",
" return x\n",
"\n",
"\n",
"class Transformer(nn.Module):\n",
" def __init__(self):\n",
" super().__init__()\n",
"\n",
" self.token_embedding_table = nn.Embedding(vocab_size, n_embed, device=device)\n",
" self.position_embedding_table = nn.Embedding(block_size, n_embed, device=device)\n",
" self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])\n",
" self.lm_head = nn.Linear(n_embed, vocab_size)\n",
"\n",
"\n",
" def forward(self, idx, targets=None):\n",
" B, T = idx.shape\n",
"\n",
" token_emb = self.token_embedding_table(idx)\n",
" pos_emb = self.position_embedding_table(torch.arange(T).to(device))\n",
" x = token_emb + pos_emb\n",
" x = self.blocks(x)\n",
" logits = self.lm_head(x)\n",
" if targets == None:\n",
" loss = None\n",
" else:\n",
" B, T, C = logits.shape\n",
" logits = logits.view(B*T, C)\n",
" targets = targets.view(B*T)\n",
" loss = F.cross_entropy(logits, targets)\n",
" return logits, loss\n",
"\n",
" def generate(self, idx, max_new_tokes):\n",
" for _ in range(max_new_tokes):\n",
" idx_cond = idx[:, -block_size:]\n",
" logits, loss = self(idx_cond)\n",
" logits = logits[:, -1, :]\n",
" probs = F.softmax(logits, dim = -1)\n",
" idx_next = torch.multinomial(probs, num_samples = 1)\n",
" idx = torch.cat((idx, idx_next), dim = 1)\n",
" return idx"
],
"metadata": {
"id": "lJi2xz0obLHn"
},
"execution_count": 13,
"outputs": []
},
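{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the model is trained, the `generate` method defined above can sample stories character by character: it repeatedly crops the context to the last `block_size` tokens, takes the logits for the final position, and samples the next character from the softmax distribution. A minimal usage sketch (assumes the model has already been moved to `device` and trained; the single id-0 starting token is an arbitrary choice):\n",
"\n",
"```python\n",
"context = torch.zeros((1, 1), dtype=torch.long, device=device)\n",
"with torch.no_grad():\n",
"    sample_ids = model.generate(context, 500)[0].tolist()\n",
"print(decode(sample_ids))\n",
"```"
]
},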
{
"cell_type": "code",
"source": [
"model = Transformer()\n",
"optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate)\n",
"print(sum(p.numel() for p in model.parameters()), 'total parameters')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "M-R7m5FZbkbl",
"outputId": "96c6cfe5-dede-4c9d-8c20-9411a5e3e966"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"32061762 total parameters\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"model.eval()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vmZb3UxWboZ9",
"outputId": "25f33004-7034-4fe8-af18-33477e566804"
},
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Transformer(\n",
" (token_embedding_table): Embedding(66, 384)\n",
" (position_embedding_table): Embedding(256, 384)\n",
" (blocks): Sequential(\n",
" (0): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (1): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (2): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (3): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (4): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (5): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" )\n",
" (lm_head): Linear(in_features=384, out_features=66, bias=True)\n",
")"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "code",
"source": [
"!pip install wandb"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RroEewThbtDy",
"outputId": "1974cda2-feee-49c5-e27d-c400255a822c"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting wandb\n",
" Downloading wandb-0.16.4-py3-none-any.whl (2.2 MB)\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/2.2 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.3/2.2 MB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m35.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: Click!=8.0.0,>=7.1 in /usr/local/lib/python3.10/dist-packages (from wandb) (8.1.7)\n",
"Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)\n",
" Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m195.4/195.4 kB\u001b[0m \u001b[31m25.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: requests<3,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb) (2.31.0)\n",
"Requirement already satisfied: psutil>=5.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb) (5.9.5)\n",
"Collecting sentry-sdk>=1.0.0 (from wandb)\n",
" Downloading sentry_sdk-1.43.0-py2.py3-none-any.whl (264 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m264.6/264.6 kB\u001b[0m \u001b[31m29.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting docker-pycreds>=0.4.0 (from wandb)\n",
" Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.10/dist-packages (from wandb) (6.0.1)\n",
"Collecting setproctitle (from wandb)\n",
" Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from wandb) (67.7.2)\n",
"Requirement already satisfied: appdirs>=1.4.3 in /usr/local/lib/python3.10/dist-packages (from wandb) (1.4.4)\n",
"Requirement already satisfied: protobuf!=4.21.0,<5,>=3.19.0 in /usr/local/lib/python3.10/dist-packages (from wandb) (3.20.3)\n",
"Requirement already satisfied: six>=1.4.0 in /usr/local/lib/python3.10/dist-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
"Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)\n",
" Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.0.0->wandb) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.0.0->wandb) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.0.0->wandb) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.0.0->wandb) (2024.2.2)\n",
"Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb)\n",
" Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
"Installing collected packages: smmap, setproctitle, sentry-sdk, docker-pycreds, gitdb, GitPython, wandb\n",
"Successfully installed GitPython-3.1.42 docker-pycreds-0.4.0 gitdb-4.0.11 sentry-sdk-1.43.0 setproctitle-1.3.3 smmap-5.0.1 wandb-0.16.4\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import wandb\n",
"!wandb login"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "E-rS2QXCcdpE",
"outputId": "c79ac3b0-b729-45e4-ae1d-ed682cc1684f"
},
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: \n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"@torch.no_grad()\n",
"def estimate_loss():\n",
" out = {}\n",
" model.eval()\n",
" for split in ['train', 'val']:\n",
" losses = torch.zeros(eval_iters)\n",
" for k in range(eval_iters):\n",
" X, Y = get_batch(split)\n",
" X = X.to(device)\n",
" Y = Y.to(device)\n",
" logits, loss = model(X, Y)\n",
" losses[k] = loss.item()\n",
" out[split] = losses.mean()\n",
" model.train()\n",
" return out"
],
"metadata": {
"id": "oNzq5Vkccmk0"
},
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model = model.to(device)\n",
"optimizer = torch.optim.AdamW(model.parameters(),lr=1e-4)\n",
"\n",
"wandb.init(\n",
" # set the wandb project where this run will be logged\n",
" project=\"mixture of experts\",\n",
"\n",
" # track hyperparameters and run metadata\n",
" config={\n",
" 'batch_size': batch_size,\n",
" 'block_size': block_size,\n",
" 'max_iters': max_iters,\n",
" 'eval_interval': eval_interval,\n",
" 'learning_rate': learning_rate,\n",
" 'device': device,\n",
" 'eval_iters': eval_iters,\n",
" 'n_embed': n_embed,\n",
" 'n_head': n_head,\n",
" 'n_layer': n_layer,\n",
" 'dropout': dropout\n",
" }\n",
")\n",
"\n",
"wandb.watch(model)\n",
"\n",
"for iter in range(max_iters):\n",
"\n",
" # every once in a while evaluate the loss on train and val sets\n",
" if iter % 100 == 0 or iter == max_iters - 1:\n",
" losses = estimate_loss()\n",
" print(f\"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n",
" wandb.log({'train_loss': losses['train'], 'val_loss': losses['val']}, step=iter)\n",
"\n",
" # sample a batch of data\n",
" xb, yb = get_batch('train')\n",
" xb = xb.to(device)\n",
" yb = yb.to(device)\n",
"\n",
" # evaluate the loss\n",
" logits, loss = model(xb, yb)\n",
" optimizer.zero_grad(set_to_none=True)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
"model_dir = 'Mixture of experts models'\n",
"os.makedirs(model_dir, exist_ok=True)\n",
"final_model_path = os.path.join(model_dir, 'moe2_model.pth')\n",
"torch.save(model.state_dict(), final_model_path)\n",
"print('Final trained model saved!')\n",
"\n",
"wandb.finish()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"6a9a993a2bde4bc7acdd44e35ee8a423",
"63b857b0db6f4b5bbb55230dd1e0f66e",
"ae1f1250131c4283b929d3108e581926",
"e8190f4f922d4a1c9f8c730604c7e806",
"2b5adaa739be47f088c2a829508a1d43",
"fa697fde7f024d07aee37a56b55fbf59",
"9dbb0157705f4efb90649e4468708109",
"36f393f4de7c434c8480fbf195d0ac8b"
]
},
"id": "J-PJuS8qdCSW",
"outputId": "4b4a6adb-6df8-4b62-e082-fb3dbd6c0a0b"
},
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mdruvithlgowda00\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"Tracking run with wandb version 0.16.4"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"Run data is saved locally in /content/wandb/run-20240323_052806-r801x9fx
"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"Syncing run distinctive-plasma-3 to Weights & Biases (docs)
"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
" View project at https://wandb.ai/druvithlgowda00/Mixture%20of%20experts"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
" View run at https://wandb.ai/druvithlgowda00/Mixture%20of%20experts/runs/r801x9fx"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"step 0: train loss 4.5065, val loss 4.5070\n",
"step 100: train loss 2.2867, val loss 2.2864\n",
"step 200: train loss 2.2622, val loss 2.2609\n",
"step 300: train loss 2.2438, val loss 2.2433\n",
"step 400: train loss 2.2149, val loss 2.2150\n",
"step 500: train loss 2.1553, val loss 2.1571\n",
"step 600: train loss 1.9616, val loss 1.9618\n",
"step 700: train loss 1.7527, val loss 1.7581\n",
"step 800: train loss 1.6343, val loss 1.6381\n",
"step 900: train loss 1.5419, val loss 1.5456\n",
"step 1000: train loss 1.4781, val loss 1.4763\n",
"step 1100: train loss 1.4095, val loss 1.4159\n",
"step 1200: train loss 1.3634, val loss 1.3671\n",
"step 1300: train loss 1.3279, val loss 1.3312\n",
"step 1400: train loss 1.2929, val loss 1.2962\n",
"step 1500: train loss 1.2640, val loss 1.2644\n",
"step 1600: train loss 1.2410, val loss 1.2439\n",
"step 1700: train loss 1.2074, val loss 1.2121\n",
"step 1800: train loss 1.1888, val loss 1.1913\n",
"step 1900: train loss 1.1664, val loss 1.1709\n",
"step 2000: train loss 1.1527, val loss 1.1543\n",
"step 2100: train loss 1.1329, val loss 1.1347\n",
"step 2200: train loss 1.1160, val loss 1.1157\n",
"step 2300: train loss 1.1030, val loss 1.1041\n",
"step 2400: train loss 1.0875, val loss 1.0952\n",
"step 2500: train loss 1.0799, val loss 1.0785\n",
"step 2600: train loss 1.0638, val loss 1.0637\n",
"step 2700: train loss 1.0549, val loss 1.0567\n",
"step 2800: train loss 1.0404, val loss 1.0420\n",
"step 2900: train loss 1.0337, val loss 1.0325\n",
"step 3000: train loss 1.0187, val loss 1.0238\n",
"step 3100: train loss 1.0117, val loss 1.0132\n",
"step 3200: train loss 0.9958, val loss 0.9977\n",
"step 3300: train loss 0.9885, val loss 0.9962\n",
"step 3400: train loss 0.9853, val loss 0.9887\n",
"step 3500: train loss 0.9733, val loss 0.9780\n",
"step 3600: train loss 0.9689, val loss 0.9692\n",
"step 3700: train loss 0.9575, val loss 0.9612\n",
"step 3800: train loss 0.9484, val loss 0.9521\n",
"step 3900: train loss 0.9464, val loss 0.9450\n",
"step 4000: train loss 0.9367, val loss 0.9380\n",
"step 4100: train loss 0.9293, val loss 0.9339\n",
"step 4200: train loss 0.9286, val loss 0.9263\n",
"step 4300: train loss 0.9167, val loss 0.9206\n",
"step 4400: train loss 0.9121, val loss 0.9124\n",
"step 4500: train loss 0.9069, val loss 0.9113\n",
"step 4600: train loss 0.9024, val loss 0.9044\n",
"step 4700: train loss 0.8984, val loss 0.8985\n",
"step 4800: train loss 0.8916, val loss 0.8938\n",
"step 4900: train loss 0.8889, val loss 0.8910\n",
"step 5000: train loss 0.8830, val loss 0.8821\n",
"step 5100: train loss 0.8778, val loss 0.8809\n",
"step 5200: train loss 0.8714, val loss 0.8767\n",
"step 5300: train loss 0.8702, val loss 0.8739\n",
"step 5400: train loss 0.8626, val loss 0.8682\n",
"step 5500: train loss 0.8589, val loss 0.8628\n",
"step 5600: train loss 0.8545, val loss 0.8597\n",
"step 5700: train loss 0.8561, val loss 0.8583\n",
"step 5800: train loss 0.8486, val loss 0.8542\n",
"step 5900: train loss 0.8436, val loss 0.8506\n",
"step 5999: train loss 0.8443, val loss 0.8458\n",
"Final trained model saved!\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "6a9a993a2bde4bc7acdd44e35ee8a423"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"\n",
"Run history:
train_loss | █▄▄▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ |
val_loss | █▄▄▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ |
Run summary:
train_loss | 0.84435 |
val_loss | 0.84581 |
"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
" View run distinctive-plasma-3 at: https://wandb.ai/druvithlgowda00/Mixture%20of%20experts/runs/r801x9fx
Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"Find logs at: ./wandb/run-20240323_052806-r801x9fx/logs
"
]
},
"metadata": {}
}
]
},
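{
"cell_type": "markdown",
"source": [
"The cell above saves only the model's `state_dict`. Below is a minimal sketch, not part of the original run, of how that checkpoint could be reloaded later for inference; it assumes `model` and `device` are the same objects built earlier in this notebook, so only the weights come from the saved file."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: reload the checkpoint saved above for later inference.\n",
"# Assumes `model` and `device` are the ones defined earlier in this notebook;\n",
"# only the weights themselves come from the file on disk.\n",
"import os\n",
"import torch\n",
"\n",
"checkpoint_path = os.path.join('Mixture of experts models', 'moe2_model.pth')\n",
"state_dict = torch.load(checkpoint_path, map_location=device)\n",
"model.load_state_dict(state_dict)\n",
"model.eval()  # disable dropout before generating\n",
"print('checkpoint reloaded from', checkpoint_path)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},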
{
"cell_type": "code",
"source": [
"# generation\n",
"d = 'once'\n",
"context = torch.tensor(encode(d), dtype=torch.long, device=device).unsqueeze(0)\n",
"print(decode(model.generate(context, max_new_tokes=500)[0].tolist()))\n"
],
"metadata": {
"id": "FyJODzaHezJa",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "318940c5-e203-4f4f-bfde-a114edaff372"
},
"execution_count": 26,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"once to go home \n",
"Once upon a time there was a little girl named Lily She loved to play with her together in the tracks One day her mom made a little here with a big sea She had answed \n",
" Let s play as play white The tree was crying and continued but then we down the sea tost inside the spick replied That dog for it is thought about \n",
"Then it s house The boy and the man fell down about the pole and excited away The wind was go away to and Tom again They excited to put the circle \n"
]
}
]
},
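{
"cell_type": "markdown",
"source": [
"A small, hedged variation on the generation cell above: loop over a few prompts and reuse the same `encode`, `decode`, and `model.generate` helpers defined earlier. The prompt strings are illustrative and were not part of the original run; the keyword argument mirrors the call above."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: sample continuations for several prompts with the same generate() call used above.\n",
"# `encode`, `decode`, `model`, and `device` come from earlier cells; the prompts are made up.\n",
"import torch\n",
"\n",
"prompts = ['once upon a time', 'the little dog']\n",
"for p in prompts:\n",
"    ctx = torch.tensor(encode(p), dtype=torch.long, device=device).unsqueeze(0)\n",
"    with torch.no_grad():\n",
"        out = model.generate(ctx, max_new_tokes=200)[0].tolist()  # keyword matches the cell above\n",
"    print(decode(out))\n",
"    print('-' * 40)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},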
{
"cell_type": "code",
"source": [
"#push the weights to hugging_face.hub\n",
"!pip install transformers huggingface_hub"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ie3l0Ks79VWV",
"outputId": "161fb7ba-9b18-43dc-e2cb-8577484ad939"
},
"execution_count": 27,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.38.2)\n",
"Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.20.3)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.25.2)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.12.25)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.2)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.2)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.2)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (2023.6.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.10.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.2.2)\n"
]
}
]
},
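{
"cell_type": "markdown",
"source": [
"The intended flow from here is: install the client libraries (above), log in (next cell), create a model repo on the Hub, and add the checkpoint and logs to it. Below is a hedged sketch of the repo-creation step; the repo name mirrors the `Tiny_StoriesMoE` folder used later, the full repo id depends on the logged-in account, and the cell assumes the login below has already completed."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: create (or reuse) a model repo on the Hub for the checkpoint.\n",
"# Run this after notebook_login() below (or pass an explicit token).\n",
"from huggingface_hub import create_repo\n",
"\n",
"repo_url = create_repo('Tiny_StoriesMoE', repo_type='model', exist_ok=True)\n",
"print(repo_url)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},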
{
"cell_type": "code",
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 157,
"referenced_widgets": [
"0a85aa309bda4b9abf9240eeb8f403f0",
"38c9ff2937f142fdaf2fdf488fda9656",
"1e8f3c64ce81450994fc21b78e3aba76",
"c80588db8c264ec0a4c17c890c213963",
"5aa58b3190a148b1b9bbe28acc811192",
"12a46681a6004746b6869569c65b71a2",
"fb800a6d97954553b45cfb9c712dab9c",
"9c102c35b48a454d9e83f61507097b4c",
"aa87be83bcc24e8aad06ffa648d1a057",
"a3eee979a8d544e8bdb76682d4e1a89f",
"7089322af59a42d4977b3251f0b7e2cf",
"057290b3b190427c8384d5a271bac2b5",
"53c0c50ce74d425d8125d70aeead96c8",
"5bd49cf339f64cf2b46079cf75bda5fd",
"c78516a5de83466f884572700ce225d1",
"9444ddb29edd421da3d74baf0cece75d",
"20cdbaaf3958455c862a2756e92e8f92",
"eb83010cb46043268a8b36f8f25c35e7",
"8247f0e0fe2d48bdb1cd9627f6e53417",
"ffb23aea10c94126bd8bdc54b265908d",
"9df202bad1e84ef8bc68982042d8831f",
"7681267bc11d410ebc3954064d1cd416",
"eaf3299530ba4f68a75b107d54bcd027",
"2b4660cfb7c24a269c8dda2a89c7fcef",
"075428eaefb44bcbb96d96d2019deeef",
"c75f2dcbcd724ba0aed77744c6943faf",
"6dba6d0fdf464e84a1319e793974a455",
"d1b60ca0cb6947a7afb1007de23f59aa",
"3f86a6ed43c14dbeb965535c44a114bd",
"f458884d36c64771ba0011bc94017dcf",
"e902c5cb7e884850ad1ba65fe7300a7a",
"cd97d20ae2e742ed9476c0829f1ee586"
]
},
"id": "7EUxRlt0ARds",
"outputId": "c18522ee-f346-460a-925f-d421ac92be58"
},
"execution_count": 28,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"VBox(children=(HTML(value='
\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mshutil\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmove\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"wandb\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Tiny_StoriesMoE\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/usr/lib/python3.10/shutil.py\u001b[0m in \u001b[0;36mmove\u001b[0;34m(src, dst, copy_function)\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreal_dst\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 814\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Destination path '%s' already exists\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mreal_dst\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 815\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreal_dst\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mError\u001b[0m: Destination path 'Tiny_StoriesMoE/wandb' already exists"
]
}
]
},
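{
"cell_type": "markdown",
"source": [
"The move above failed because `Tiny_StoriesMoE/wandb` was left over from an earlier attempt. As a hedged sketch, the same move can be guarded by checking the destination first (paths are the ones used above)."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: move the wandb logs into the repo folder only if they are not already there.\n",
"import os\n",
"import shutil\n",
"\n",
"src = 'wandb'\n",
"dst = os.path.join('Tiny_StoriesMoE', 'wandb')\n",
"if os.path.exists(src) and not os.path.exists(dst):\n",
"    shutil.move(src, dst)\n",
"    print('moved', src, '->', dst)\n",
"else:\n",
"    print('skipped: destination already exists or source is missing')"
],
"metadata": {},
"execution_count": null,
"outputs": []
},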
{
"cell_type": "code",
"source": [
"repo.push_to_hub(commit_message=\"wandb logs\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 385
},
"id": "lAJTdA1QBslH",
"outputId": "1aa73e28-0bf3-4a93-8faf-89196c251f68"
},
"execution_count": 39,
"outputs": [
{
"output_type": "error",
"ename": "IsADirectoryError",
"evalue": "[Errno 21] Is a directory: '/content/Tiny_StoriesMoE/wandb/latest-run'",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIsADirectoryError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrepo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpush_to_hub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommit_message\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"wandb logs\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mpush_to_hub\u001b[0;34m(self, commit_message, blocking, clean_ok, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1320\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Repo currently clean. Ignoring push_to_hub\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1321\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1322\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgit_add\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mauto_lfs_track\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1323\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgit_commit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommit_message\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1324\u001b[0m return self.git_push(\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mgit_add\u001b[0;34m(self, pattern, auto_lfs_track)\u001b[0m\n\u001b[1;32m 1024\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1025\u001b[0m \u001b[0;31m# Read the remaining files and track them if they're binary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1026\u001b[0;31m \u001b[0mtracked_files\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_track_binary_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpattern\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1027\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1028\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtracked_files\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mauto_track_binary_files\u001b[0;34m(self, pattern)\u001b[0m\n\u001b[1;32m 918\u001b[0m )\n\u001b[1;32m 919\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 920\u001b[0;31m \u001b[0mis_binary\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mis_binary_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_to_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 921\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 922\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_binary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mis_binary_file\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m 228\u001b[0m \"\"\"\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1024\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Read a maximum of 10MB\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIsADirectoryError\u001b[0m: [Errno 21] Is a directory: '/content/Tiny_StoriesMoE/wandb/latest-run'"
]
}
]
},
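{
"cell_type": "markdown",
"source": [
"`Repository.push_to_hub` failed while auto-tracking binary files because `wandb/latest-run` is a symlink to a run directory. A hedged alternative sketch: upload the folder over HTTP with `HfApi.upload_folder` and skip the offending paths via `ignore_patterns`. The repo id is a placeholder (replace the username) and the cell assumes the earlier `notebook_login()` succeeded."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: upload the local folder with the HTTP-based API instead of the git-based Repository,\n",
"# skipping the wandb symlink/run folders that broke auto_track_binary_files above.\n",
"# The repo id is a placeholder -- replace <username> with the actual Hub account.\n",
"from huggingface_hub import HfApi\n",
"\n",
"api = HfApi()\n",
"api.upload_folder(\n",
"    folder_path='Tiny_StoriesMoE',\n",
"    repo_id='<username>/Tiny_StoriesMoE',\n",
"    repo_type='model',\n",
"    commit_message='wandb logs',\n",
"    ignore_patterns=['**/latest-run*'],\n",
")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},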
{
"cell_type": "code",
"source": [
"shutil.move(\"Mixture of experts models/moe2_model.pth\", \"Tiny_StoriesMoE\")"
],
"metadata": {
"id": "uFYn9p-FCXv8"
},
"execution_count": null,
"outputs": []
}
]
} | |