{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "A100",
"authorship_tag": "ABX9TyN71WxG6t24pCmMB3EAti2l",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU",
"widgets": {
"application/vnd.jupyter.widget-state+json": {
"6a9a993a2bde4bc7acdd44e35ee8a423": {
"model_module": "@jupyter-widgets/controls",
"model_name": "VBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "VBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "VBoxView",
"box_style": "",
"children": [
"IPY_MODEL_63b857b0db6f4b5bbb55230dd1e0f66e",
"IPY_MODEL_ae1f1250131c4283b929d3108e581926"
],
"layout": "IPY_MODEL_e8190f4f922d4a1c9f8c730604c7e806"
}
},
"63b857b0db6f4b5bbb55230dd1e0f66e": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_2b5adaa739be47f088c2a829508a1d43",
"placeholder": "",
"style": "IPY_MODEL_fa697fde7f024d07aee37a56b55fbf59",
"value": "0.472 MB of 0.472 MB uploaded\r"
}
},
"ae1f1250131c4283b929d3108e581926": {
"model_module": "@jupyter-widgets/controls",
"model_name": "FloatProgressModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "FloatProgressModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ProgressView",
"bar_style": "",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9dbb0157705f4efb90649e4468708109",
"max": 1,
"min": 0,
"orientation": "horizontal",
"style": "IPY_MODEL_36f393f4de7c434c8480fbf195d0ac8b",
"value": 1
}
},
"e8190f4f922d4a1c9f8c730604c7e806": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"2b5adaa739be47f088c2a829508a1d43": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"fa697fde7f024d07aee37a56b55fbf59": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"9dbb0157705f4efb90649e4468708109": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"36f393f4de7c434c8480fbf195d0ac8b": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ProgressStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ProgressStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"bar_color": null,
"description_width": ""
}
},
"0a85aa309bda4b9abf9240eeb8f403f0": {
"model_module": "@jupyter-widgets/controls",
"model_name": "VBoxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "VBoxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "VBoxView",
"box_style": "",
"children": [
"IPY_MODEL_9df202bad1e84ef8bc68982042d8831f",
"IPY_MODEL_7681267bc11d410ebc3954064d1cd416",
"IPY_MODEL_eaf3299530ba4f68a75b107d54bcd027",
"IPY_MODEL_2b4660cfb7c24a269c8dda2a89c7fcef"
],
"layout": "IPY_MODEL_fb800a6d97954553b45cfb9c712dab9c"
}
},
"38c9ff2937f142fdaf2fdf488fda9656": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9c102c35b48a454d9e83f61507097b4c",
"placeholder": "",
"style": "IPY_MODEL_aa87be83bcc24e8aad06ffa648d1a057",
"value": "
Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. "
}
},
"1e8f3c64ce81450994fc21b78e3aba76": {
"model_module": "@jupyter-widgets/controls",
"model_name": "PasswordModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "PasswordModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "PasswordView",
"continuous_update": true,
"description": "Token:",
"description_tooltip": null,
"disabled": false,
"layout": "IPY_MODEL_a3eee979a8d544e8bdb76682d4e1a89f",
"placeholder": "",
"style": "IPY_MODEL_7089322af59a42d4977b3251f0b7e2cf",
"value": ""
}
},
"c80588db8c264ec0a4c17c890c213963": {
"model_module": "@jupyter-widgets/controls",
"model_name": "CheckboxModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "CheckboxModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "CheckboxView",
"description": "Add token as git credential?",
"description_tooltip": null,
"disabled": false,
"indent": true,
"layout": "IPY_MODEL_057290b3b190427c8384d5a271bac2b5",
"style": "IPY_MODEL_53c0c50ce74d425d8125d70aeead96c8",
"value": true
}
},
"5aa58b3190a148b1b9bbe28acc811192": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ButtonModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ButtonModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "ButtonView",
"button_style": "",
"description": "Login",
"disabled": false,
"icon": "",
"layout": "IPY_MODEL_5bd49cf339f64cf2b46079cf75bda5fd",
"style": "IPY_MODEL_c78516a5de83466f884572700ce225d1",
"tooltip": ""
}
},
"12a46681a6004746b6869569c65b71a2": {
"model_module": "@jupyter-widgets/controls",
"model_name": "HTMLModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "HTMLModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "HTMLView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_9444ddb29edd421da3d74baf0cece75d",
"placeholder": "",
"style": "IPY_MODEL_20cdbaaf3958455c862a2756e92e8f92",
"value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. "
}
},
"fb800a6d97954553b45cfb9c712dab9c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": "center",
"align_self": null,
"border": null,
"bottom": null,
"display": "flex",
"flex": null,
"flex_flow": "column",
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": "50%"
}
},
"9c102c35b48a454d9e83f61507097b4c": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"aa87be83bcc24e8aad06ffa648d1a057": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"a3eee979a8d544e8bdb76682d4e1a89f": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"7089322af59a42d4977b3251f0b7e2cf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"057290b3b190427c8384d5a271bac2b5": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"53c0c50ce74d425d8125d70aeead96c8": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"5bd49cf339f64cf2b46079cf75bda5fd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c78516a5de83466f884572700ce225d1": {
"model_module": "@jupyter-widgets/controls",
"model_name": "ButtonStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "ButtonStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"button_color": null,
"font_weight": ""
}
},
"9444ddb29edd421da3d74baf0cece75d": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"20cdbaaf3958455c862a2756e92e8f92": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"eb83010cb46043268a8b36f8f25c35e7": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_8247f0e0fe2d48bdb1cd9627f6e53417",
"placeholder": "",
"style": "IPY_MODEL_ffb23aea10c94126bd8bdc54b265908d",
"value": "Connecting..."
}
},
"8247f0e0fe2d48bdb1cd9627f6e53417": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"ffb23aea10c94126bd8bdc54b265908d": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"9df202bad1e84ef8bc68982042d8831f": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_075428eaefb44bcbb96d96d2019deeef",
"placeholder": "",
"style": "IPY_MODEL_c75f2dcbcd724ba0aed77744c6943faf",
"value": "Token is valid (permission: write)."
}
},
"7681267bc11d410ebc3954064d1cd416": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_6dba6d0fdf464e84a1319e793974a455",
"placeholder": "",
"style": "IPY_MODEL_d1b60ca0cb6947a7afb1007de23f59aa",
"value": "Your token has been saved in your configured git credential helpers (store)."
}
},
"eaf3299530ba4f68a75b107d54bcd027": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_3f86a6ed43c14dbeb965535c44a114bd",
"placeholder": "",
"style": "IPY_MODEL_f458884d36c64771ba0011bc94017dcf",
"value": "Your token has been saved to /root/.cache/huggingface/token"
}
},
"2b4660cfb7c24a269c8dda2a89c7fcef": {
"model_module": "@jupyter-widgets/controls",
"model_name": "LabelModel",
"model_module_version": "1.5.0",
"state": {
"_dom_classes": [],
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "LabelModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/controls",
"_view_module_version": "1.5.0",
"_view_name": "LabelView",
"description": "",
"description_tooltip": null,
"layout": "IPY_MODEL_e902c5cb7e884850ad1ba65fe7300a7a",
"placeholder": "",
"style": "IPY_MODEL_cd97d20ae2e742ed9476c0829f1ee586",
"value": "Login successful"
}
},
"075428eaefb44bcbb96d96d2019deeef": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"c75f2dcbcd724ba0aed77744c6943faf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"6dba6d0fdf464e84a1319e793974a455": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"d1b60ca0cb6947a7afb1007de23f59aa": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"3f86a6ed43c14dbeb965535c44a114bd": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"f458884d36c64771ba0011bc94017dcf": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
},
"e902c5cb7e884850ad1ba65fe7300a7a": {
"model_module": "@jupyter-widgets/base",
"model_name": "LayoutModel",
"model_module_version": "1.2.0",
"state": {
"_model_module": "@jupyter-widgets/base",
"_model_module_version": "1.2.0",
"_model_name": "LayoutModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "LayoutView",
"align_content": null,
"align_items": null,
"align_self": null,
"border": null,
"bottom": null,
"display": null,
"flex": null,
"flex_flow": null,
"grid_area": null,
"grid_auto_columns": null,
"grid_auto_flow": null,
"grid_auto_rows": null,
"grid_column": null,
"grid_gap": null,
"grid_row": null,
"grid_template_areas": null,
"grid_template_columns": null,
"grid_template_rows": null,
"height": null,
"justify_content": null,
"justify_items": null,
"left": null,
"margin": null,
"max_height": null,
"max_width": null,
"min_height": null,
"min_width": null,
"object_fit": null,
"object_position": null,
"order": null,
"overflow": null,
"overflow_x": null,
"overflow_y": null,
"padding": null,
"right": null,
"top": null,
"visibility": null,
"width": null
}
},
"cd97d20ae2e742ed9476c0829f1ee586": {
"model_module": "@jupyter-widgets/controls",
"model_name": "DescriptionStyleModel",
"model_module_version": "1.5.0",
"state": {
"_model_module": "@jupyter-widgets/controls",
"_model_module_version": "1.5.0",
"_model_name": "DescriptionStyleModel",
"_view_count": null,
"_view_module": "@jupyter-widgets/base",
"_view_module_version": "1.2.0",
"_view_name": "StyleView",
"description_width": ""
}
}
}
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"
"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ImSfwIQxXDmD",
"outputId": "d7b6c771-5fe8-4df4-9727-98373f3c92c1"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Mounted at /content/drive\n"
]
}
],
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive')"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"# Directory where the dataset will be stored\n",
"dataset_dir = \"TinyStories4\"\n",
"\n",
"# Create the TinyStories directory if it doesn't exist\n",
"os.makedirs(dataset_dir, exist_ok=True)\n",
"\n",
"# URL of the dataset archive\n",
"dataset_url = \"https://huggingface.co/datasets/roneneldan/TinyStories/resolve/main/TinyStories_all_data.tar.gz\"\n",
"# Name of the file to save the downloaded dataset archive\n",
"dataset_archive_path = os.path.join(dataset_dir, \"TinyStories_all_data.tar.gz\")\n",
"\n",
"# Download the archive\n",
"if not os.path.exists(dataset_archive_path):\n",
" os.system(f\"wget {dataset_url} -O {dataset_archive_path}\")\n",
"\n",
"# Extract the archive into the TinyStories directory\n",
"os.system(f\"tar -xzf {dataset_archive_path} -C {dataset_dir}\")\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "L9gB6U-KXHZY",
"outputId": "8c46b020-3881-4436-bf82-9e951a1a15be"
},
"execution_count": 2,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"0"
]
},
"metadata": {},
"execution_count": 2
}
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import glob\n",
"import json\n",
"\n",
"shard_filenames = sorted(glob.glob(os.path.join('TinyStories4', \"*.json\")))\n",
"with open(shard_filenames[0], \"r\") as f:\n",
" data = json.load(f)\n",
"\n",
"stories = [x['story'] for x in data]\n",
"text = \"\\n\".join(stories)"
],
"metadata": {
"id": "uXrqxXeyXRwR"
},
"execution_count": 3,
"outputs": []
},
{
"cell_type": "code",
"source": [
"len(text)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3Isc2XRfYGW6",
"outputId": "d522f4c4-3e81-4714-cafe-a8dc393ae512"
},
"execution_count": 4,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"77586884"
]
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"source": [
"import string\n",
"\n",
"# Define the set of characters to remove\n",
"remove_chars = \"\\$%&*+-/;`|~ éñ–—…\"\n",
"\n",
"# Create a translation table that maps the characters to be removed to None\n",
"trans_table = str.maketrans(remove_chars, ' ' * len(remove_chars))\n",
"\n",
"# Remove the characters from the string using the translation table\n",
"text = text.translate(trans_table)"
],
"metadata": {
"id": "7PlaC0WIYeD-"
},
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"source": [
"### vocab size and characters used\n",
"\n",
"chars = sorted(list(set(text)))\n",
"vocab_size = len(chars)\n",
"print(''.join(chars))\n",
"print(vocab_size)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DpPYXMqJZkU7",
"outputId": "66cbd95c-4342-48ff-ce7f-de321d7d0501"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\t\n",
" 0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz \n",
"66\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import torch\n",
"import torch.nn as nn\n",
"from torch.nn import functional as F\n",
"torch.manual_seed(1137)\n",
"\n",
"# hyperparameters\n",
"batch_size = 64 # how many independent sequences will we process in parallel?\n",
"block_size = 256 # what is the maximum context length for predictions?\n",
"max_iters = 6000\n",
"eval_interval = 100\n",
"learning_rate = 1e-3\n",
"device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
"eval_iters = 200\n",
"n_embed = 384\n",
"n_head = 6\n",
"n_layer = 6\n",
"dropout = 0.0\n",
"# ------------"
],
"metadata": {
"id": "FQr01U7Hahth"
},
"execution_count": 7,
"outputs": []
},
{
"cell_type": "code",
"source": [
"### character encoding and decoding function, stoi : string to integer, itos: integer to string\n",
"\n",
"import torch\n",
"\n",
"stoi = { ch:i for i, ch in enumerate(chars) }\n",
"itos = {i:ch for i,ch in enumerate(chars)}\n",
"encode = lambda s: [stoi[c] for c in s]\n",
"decode = lambda l: \"\".join([itos[x] for x in l])\n",
"\n",
"data = torch.tensor(encode(text), dtype = torch.long)"
],
"metadata": {
"id": "lQcB-pEzZvoh"
},
"execution_count": 8,
"outputs": []
},
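{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above builds a character-level tokenizer: `stoi`/`itos` map between characters and integer ids, and the whole cleaned corpus is encoded into a single `torch.long` tensor. A minimal sketch of how `encode`/`decode` behave (illustrative only; the sample string is not from the original run):\n",
"\n",
"```python\n",
"sample = \"Once upon a time\"\n",
"ids = encode(sample)          # one integer id per character\n",
"assert decode(ids) == sample  # decoding reverses encoding exactly\n",
"```"
]
},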
{
"cell_type": "code",
"source": [
"### creating the train and text splits\n",
"\n",
"n = int(0.9*len(data))\n",
"train_data = data[:n]\n",
"val_data = data[n:]"
],
"metadata": {
"id": "u6wCdZ5waJNy"
},
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# data loading\n",
"\n",
"def get_batch(split):\n",
" # generate a small batch of data of inputs x and targets y\n",
" data = train_data if split == 'train' else val_data\n",
" ix = torch.randint(len(data) - block_size, (batch_size,))\n",
" x = torch.stack([data[i:i+block_size] for i in ix])\n",
" y = torch.stack([data[i+1:i+block_size+1] for i in ix])\n",
" return x, y"
],
"metadata": {
"id": "F86Zm98WaS8a"
},
"execution_count": 10,
"outputs": []
},
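{
"cell_type": "markdown",
"metadata": {},
"source": [
"`get_batch` samples `batch_size` random windows of length `block_size`; the targets `y` are the inputs shifted one character to the right, so position `t` of `y` is the next-character label for the first `t+1` characters of `x`. A quick shape check, assuming the hyperparameters above (illustrative, not part of the original run):\n",
"\n",
"```python\n",
"xb, yb = get_batch('train')\n",
"print(xb.shape, yb.shape)  # torch.Size([64, 256]) torch.Size([64, 256]) -> (batch_size, block_size)\n",
"```"
]
},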
{
"cell_type": "code",
"source": [
"class RMSNorm(nn.Module): #### x * (sqrt(mean(x**2))\n",
" def __init__(self, n_embed, eps = 1e-6):\n",
" super().__init__()\n",
" self.eps = eps\n",
" self.weight = nn.Parameter(torch.ones(n_embed))\n",
"\n",
" def _norm(self, x):\n",
" return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)\n",
"\n",
" def forward(self, x):\n",
" output = self._norm(x.float()).type_as(x)\n",
" return output * self.weight"
],
"metadata": {
"id": "_93TwyTwaknl"
},
"execution_count": 11,
"outputs": []
},
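{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `RMSNorm` layer above normalises each embedding vector by its root-mean-square instead of subtracting the mean and dividing by the standard deviation (as LayerNorm does), and scales by a learned per-channel weight with no bias:\n",
"\n",
"$$\\mathrm{RMSNorm}(x) = \\frac{x}{\\sqrt{\\tfrac{1}{d}\\sum_{i=1}^{d} x_i^2 + \\epsilon}} \\odot w,$$\n",
"\n",
"where $d$ is `n_embed`, $\\epsilon$ is the small constant `eps`, and $w$ is `self.weight`."
]
},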
{
"cell_type": "code",
"source": [
"class MoeLayer(nn.Module):\n",
" def __init__(self, experts, gate, k=2):\n",
" super().__init__()\n",
" assert len(experts) > 0\n",
" self.experts = nn.ModuleList(experts)\n",
" self.gate = gate\n",
" self.k = k\n",
"\n",
" def forward(self, inputs: torch.Tensor):\n",
" inputs_squashed = inputs.view(-1, inputs.shape[-1])\n",
" gate_logits = self.gate(inputs_squashed)\n",
" weights, selected_experts = torch.topk(\n",
" gate_logits, self.k\n",
" )\n",
" weights = nn.functional.softmax(\n",
" weights,\n",
" dim=1,\n",
" dtype=torch.float,\n",
" ).type_as(inputs)\n",
" results = torch.zeros_like(inputs_squashed)\n",
" for i, expert in enumerate(self.experts):\n",
" batch_idx, nth_expert = torch.where(selected_experts == i)\n",
" results[batch_idx] += weights[batch_idx, nth_expert, None] * expert(\n",
" inputs_squashed[batch_idx]\n",
" )\n",
" return results.view_as(inputs)"
],
"metadata": {
"id": "KYiPeTfua3NM"
},
"execution_count": 12,
"outputs": []
},
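{
"cell_type": "markdown",
"metadata": {},
"source": [
"`MoeLayer` implements token-level top-k routing: every token's embedding is scored by the linear `gate`, the `k=2` highest-scoring experts are selected, their gate scores are renormalised with a softmax over just those `k` entries, and the layer output is the weighted sum of those experts' outputs. A tiny illustration of the routing step on made-up logits (assumes nothing beyond `torch`):\n",
"\n",
"```python\n",
"import torch\n",
"\n",
"# two tokens, four experts\n",
"gate_logits = torch.tensor([[1.0, 0.2, 3.0, -1.0],\n",
"                            [0.5, 2.0, 0.1,  0.4]])\n",
"weights, selected_experts = torch.topk(gate_logits, k=2)  # top-2 experts per token\n",
"weights = torch.softmax(weights, dim=1)                   # renormalise over the selected experts only\n",
"print(selected_experts)  # tensor([[2, 0], [1, 0]])\n",
"print(weights)           # rows sum to 1\n",
"```"
]
},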
{
"cell_type": "code",
"source": [
"class Head(nn.Module):\n",
" def __init__(self, head_size):\n",
" super().__init__()\n",
" self.key = nn.Linear(n_embed, head_size, bias = False)\n",
" self.query = nn.Linear(n_embed, head_size, bias = False)\n",
" self.value = nn.Linear(n_embed, head_size, bias = False)\n",
" self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, x):\n",
" B, T, C = x.shape\n",
" k = self.key(x)\n",
" q = self.query(x)\n",
" wei = q @ k.transpose(-2, -1) * C**-0.5\n",
" wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))\n",
" wei = F.softmax(wei, dim=-1)\n",
" wei = self.dropout(wei)\n",
" v = self.value(x)\n",
" out = wei @ v\n",
" return out\n",
"\n",
"class MulitHeadAttention(nn.Module):\n",
" def __init__(self, num_heads, head_size):\n",
" super().__init__()\n",
" self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])\n",
" self.proj = nn.Linear(n_embed, n_embed)\n",
" self.dropout = nn.Dropout(dropout)\n",
"\n",
" def forward(self, x):\n",
" x = torch.cat([head(x) for head in self.heads], dim=-1)\n",
" out = self.dropout(self.proj(x))\n",
" return out\n",
"\n",
"\n",
"class Expert(nn.Module):\n",
" def __init__(self, n_embed):\n",
" super().__init__()\n",
" self.net = nn.Sequential(\n",
" nn.Linear(n_embed, 4* n_embed),\n",
" nn.SiLU(),\n",
" nn.Linear(4 * n_embed, n_embed),\n",
" nn.Dropout(dropout))\n",
"\n",
" def forward(self, x):\n",
" return self.net(x)\n",
"\n",
"class Block(nn.Module):\n",
" def __init__(self, n_embed, n_head, num_experts=4):\n",
" super().__init__()\n",
" self.sa_head= MulitHeadAttention(n_head, n_embed//n_head)\n",
" self.ffw = MoeLayer(\n",
" experts=[Expert(n_embed) for _ in range(num_experts)],\n",
" gate=nn.Linear(n_embed, num_experts, bias=False),\n",
" )\n",
"\n",
"# self.ffw= FeedForward(n_embed)\n",
" self.ln1 = RMSNorm(n_embed)\n",
" self.ln2 = RMSNorm(n_embed)\n",
"\n",
" def forward(self, x):\n",
" x = x + self.sa_head(self.ln1(x))\n",
" x = x+self.ffw(self.ln2(x))\n",
" return x\n",
"\n",
"\n",
"class Transformer(nn.Module):\n",
" def __init__(self):\n",
" super().__init__()\n",
"\n",
" self.token_embedding_table = nn.Embedding(vocab_size, n_embed, device=device)\n",
" self.position_embedding_table = nn.Embedding(block_size, n_embed, device=device)\n",
" self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])\n",
" self.lm_head = nn.Linear(n_embed, vocab_size)\n",
"\n",
"\n",
" def forward(self, idx, targets=None):\n",
" B, T = idx.shape\n",
"\n",
" token_emb = self.token_embedding_table(idx)\n",
" pos_emb = self.position_embedding_table(torch.arange(T).to(device))\n",
" x = token_emb + pos_emb\n",
" x = self.blocks(x)\n",
" logits = self.lm_head(x)\n",
" if targets == None:\n",
" loss = None\n",
" else:\n",
" B, T, C = logits.shape\n",
" logits = logits.view(B*T, C)\n",
" targets = targets.view(B*T)\n",
" loss = F.cross_entropy(logits, targets)\n",
" return logits, loss\n",
"\n",
" def generate(self, idx, max_new_tokes):\n",
" for _ in range(max_new_tokes):\n",
" idx_cond = idx[:, -block_size:]\n",
" logits, loss = self(idx_cond)\n",
" logits = logits[:, -1, :]\n",
" probs = F.softmax(logits, dim = -1)\n",
" idx_next = torch.multinomial(probs, num_samples = 1)\n",
" idx = torch.cat((idx, idx_next), dim = 1)\n",
" return idx"
],
"metadata": {
"id": "lJi2xz0obLHn"
},
"execution_count": 13,
"outputs": []
},
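{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the model is trained, the `generate` method defined above can sample stories character by character: it repeatedly crops the context to the last `block_size` tokens, takes the logits for the final position, and samples the next character from the softmax distribution. A minimal usage sketch (assumes the model has already been moved to `device` and trained; the single id-0 starting token is an arbitrary choice):\n",
"\n",
"```python\n",
"context = torch.zeros((1, 1), dtype=torch.long, device=device)\n",
"with torch.no_grad():\n",
"    sample_ids = model.generate(context, 500)[0].tolist()\n",
"print(decode(sample_ids))\n",
"```"
]
},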
{
"cell_type": "code",
"source": [
"model = Transformer()\n",
"optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate)\n",
"print(sum(p.numel() for p in model.parameters()), 'total parameters')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "M-R7m5FZbkbl",
"outputId": "96c6cfe5-dede-4c9d-8c20-9411a5e3e966"
},
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"32061762 total parameters\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"model.eval()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "vmZb3UxWboZ9",
"outputId": "25f33004-7034-4fe8-af18-33477e566804"
},
"execution_count": 15,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Transformer(\n",
" (token_embedding_table): Embedding(66, 384)\n",
" (position_embedding_table): Embedding(256, 384)\n",
" (blocks): Sequential(\n",
" (0): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (1): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (2): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (3): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (4): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" (5): Block(\n",
" (sa_head): MulitHeadAttention(\n",
" (heads): ModuleList(\n",
" (0-5): 6 x Head(\n",
" (key): Linear(in_features=384, out_features=64, bias=False)\n",
" (query): Linear(in_features=384, out_features=64, bias=False)\n",
" (value): Linear(in_features=384, out_features=64, bias=False)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" (proj): Linear(in_features=384, out_features=384, bias=True)\n",
" (dropout): Dropout(p=0.0, inplace=False)\n",
" )\n",
" (ffw): MoeLayer(\n",
" (experts): ModuleList(\n",
" (0-3): 4 x Expert(\n",
" (net): Sequential(\n",
" (0): Linear(in_features=384, out_features=1536, bias=True)\n",
" (1): SiLU()\n",
" (2): Linear(in_features=1536, out_features=384, bias=True)\n",
" (3): Dropout(p=0.0, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (gate): Linear(in_features=384, out_features=4, bias=False)\n",
" )\n",
" (ln1): RMSNorm()\n",
" (ln2): RMSNorm()\n",
" )\n",
" )\n",
" (lm_head): Linear(in_features=384, out_features=66, bias=True)\n",
")"
]
},
"metadata": {},
"execution_count": 15
}
]
},
{
"cell_type": "code",
"source": [
"!pip install wandb"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "RroEewThbtDy",
"outputId": "1974cda2-feee-49c5-e27d-c400255a822c"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting wandb\n",
" Downloading wandb-0.16.4-py3-none-any.whl (2.2 MB)\n",
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/2.2 MB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.3/2.2 MB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m35.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: Click!=8.0.0,>=7.1 in /usr/local/lib/python3.10/dist-packages (from wandb) (8.1.7)\n",
"Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)\n",
" Downloading GitPython-3.1.42-py3-none-any.whl (195 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m195.4/195.4 kB\u001b[0m \u001b[31m25.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: requests<3,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb) (2.31.0)\n",
"Requirement already satisfied: psutil>=5.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb) (5.9.5)\n",
"Collecting sentry-sdk>=1.0.0 (from wandb)\n",
" Downloading sentry_sdk-1.43.0-py2.py3-none-any.whl (264 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m264.6/264.6 kB\u001b[0m \u001b[31m29.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting docker-pycreds>=0.4.0 (from wandb)\n",
" Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n",
"Requirement already satisfied: PyYAML in /usr/local/lib/python3.10/dist-packages (from wandb) (6.0.1)\n",
"Collecting setproctitle (from wandb)\n",
" Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from wandb) (67.7.2)\n",
"Requirement already satisfied: appdirs>=1.4.3 in /usr/local/lib/python3.10/dist-packages (from wandb) (1.4.4)\n",
"Requirement already satisfied: protobuf!=4.21.0,<5,>=3.19.0 in /usr/local/lib/python3.10/dist-packages (from wandb) (3.20.3)\n",
"Requirement already satisfied: six>=1.4.0 in /usr/local/lib/python3.10/dist-packages (from docker-pycreds>=0.4.0->wandb) (1.16.0)\n",
"Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)\n",
" Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.0.0->wandb) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.0.0->wandb) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.0.0->wandb) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3,>=2.0.0->wandb) (2024.2.2)\n",
"Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb)\n",
" Downloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
"Installing collected packages: smmap, setproctitle, sentry-sdk, docker-pycreds, gitdb, GitPython, wandb\n",
"Successfully installed GitPython-3.1.42 docker-pycreds-0.4.0 gitdb-4.0.11 sentry-sdk-1.43.0 setproctitle-1.3.3 smmap-5.0.1 wandb-0.16.4\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import wandb\n",
"!wandb login"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "E-rS2QXCcdpE",
"outputId": "c79ac3b0-b729-45e4-ae1d-ed682cc1684f"
},
"execution_count": 17,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[34m\u001b[1mwandb\u001b[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: You can find your API key in your browser here: https://wandb.ai/authorize\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: \n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"@torch.no_grad()\n",
"def estimate_loss():\n",
" out = {}\n",
" model.eval()\n",
" for split in ['train', 'val']:\n",
" losses = torch.zeros(eval_iters)\n",
" for k in range(eval_iters):\n",
" X, Y = get_batch(split)\n",
" X = X.to(device)\n",
" Y = Y.to(device)\n",
" logits, loss = model(X, Y)\n",
" losses[k] = loss.item()\n",
" out[split] = losses.mean()\n",
" model.train()\n",
" return out"
],
"metadata": {
"id": "oNzq5Vkccmk0"
},
"execution_count": 18,
"outputs": []
},
{
"cell_type": "code",
"source": [
"model = model.to(device)\n",
"optimizer = torch.optim.AdamW(model.parameters(),lr=1e-4)\n",
"\n",
"wandb.init(\n",
" # set the wandb project where this run will be logged\n",
" project=\"mixture of experts\",\n",
"\n",
" # track hyperparameters and run metadata\n",
" config={\n",
" 'batch_size': batch_size,\n",
" 'block_size': block_size,\n",
" 'max_iters': max_iters,\n",
" 'eval_interval': eval_interval,\n",
" 'learning_rate': learning_rate,\n",
" 'device': device,\n",
" 'eval_iters': eval_iters,\n",
" 'n_embed': n_embed,\n",
" 'n_head': n_head,\n",
" 'n_layer': n_layer,\n",
" 'dropout': dropout\n",
" }\n",
")\n",
"\n",
"wandb.watch(model)\n",
"\n",
"for iter in range(max_iters):\n",
"\n",
" # every once in a while evaluate the loss on train and val sets\n",
" if iter % 100 == 0 or iter == max_iters - 1:\n",
" losses = estimate_loss()\n",
" print(f\"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}\")\n",
" wandb.log({'train_loss': losses['train'], 'val_loss': losses['val']}, step=iter)\n",
"\n",
" # sample a batch of data\n",
" xb, yb = get_batch('train')\n",
" xb = xb.to(device)\n",
" yb = yb.to(device)\n",
"\n",
" # evaluate the loss\n",
" logits, loss = model(xb, yb)\n",
" optimizer.zero_grad(set_to_none=True)\n",
" loss.backward()\n",
" optimizer.step()\n",
"\n",
"model_dir = 'Mixture of experts models'\n",
"os.makedirs(model_dir, exist_ok=True)\n",
"final_model_path = os.path.join(model_dir, 'moe2_model.pth')\n",
"torch.save(model.state_dict(), final_model_path)\n",
"print('Final trained model saved!')\n",
"\n",
"wandb.finish()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000,
"referenced_widgets": [
"6a9a993a2bde4bc7acdd44e35ee8a423",
"63b857b0db6f4b5bbb55230dd1e0f66e",
"ae1f1250131c4283b929d3108e581926",
"e8190f4f922d4a1c9f8c730604c7e806",
"2b5adaa739be47f088c2a829508a1d43",
"fa697fde7f024d07aee37a56b55fbf59",
"9dbb0157705f4efb90649e4468708109",
"36f393f4de7c434c8480fbf195d0ac8b"
]
},
"id": "J-PJuS8qdCSW",
"outputId": "4b4a6adb-6df8-4b62-e082-fb3dbd6c0a0b"
},
"execution_count": 19,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mdruvithlgowda00\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"Tracking run with wandb version 0.16.4"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"Run data is saved locally in /content/wandb/run-20240323_052806-r801x9fx
"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"Syncing run distinctive-plasma-3 to Weights & Biases (docs)
"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
" View project at https://wandb.ai/druvithlgowda00/Mixture%20of%20experts"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
" View run at https://wandb.ai/druvithlgowda00/Mixture%20of%20experts/runs/r801x9fx"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"step 0: train loss 4.5065, val loss 4.5070\n",
"step 100: train loss 2.2867, val loss 2.2864\n",
"step 200: train loss 2.2622, val loss 2.2609\n",
"step 300: train loss 2.2438, val loss 2.2433\n",
"step 400: train loss 2.2149, val loss 2.2150\n",
"step 500: train loss 2.1553, val loss 2.1571\n",
"step 600: train loss 1.9616, val loss 1.9618\n",
"step 700: train loss 1.7527, val loss 1.7581\n",
"step 800: train loss 1.6343, val loss 1.6381\n",
"step 900: train loss 1.5419, val loss 1.5456\n",
"step 1000: train loss 1.4781, val loss 1.4763\n",
"step 1100: train loss 1.4095, val loss 1.4159\n",
"step 1200: train loss 1.3634, val loss 1.3671\n",
"step 1300: train loss 1.3279, val loss 1.3312\n",
"step 1400: train loss 1.2929, val loss 1.2962\n",
"step 1500: train loss 1.2640, val loss 1.2644\n",
"step 1600: train loss 1.2410, val loss 1.2439\n",
"step 1700: train loss 1.2074, val loss 1.2121\n",
"step 1800: train loss 1.1888, val loss 1.1913\n",
"step 1900: train loss 1.1664, val loss 1.1709\n",
"step 2000: train loss 1.1527, val loss 1.1543\n",
"step 2100: train loss 1.1329, val loss 1.1347\n",
"step 2200: train loss 1.1160, val loss 1.1157\n",
"step 2300: train loss 1.1030, val loss 1.1041\n",
"step 2400: train loss 1.0875, val loss 1.0952\n",
"step 2500: train loss 1.0799, val loss 1.0785\n",
"step 2600: train loss 1.0638, val loss 1.0637\n",
"step 2700: train loss 1.0549, val loss 1.0567\n",
"step 2800: train loss 1.0404, val loss 1.0420\n",
"step 2900: train loss 1.0337, val loss 1.0325\n",
"step 3000: train loss 1.0187, val loss 1.0238\n",
"step 3100: train loss 1.0117, val loss 1.0132\n",
"step 3200: train loss 0.9958, val loss 0.9977\n",
"step 3300: train loss 0.9885, val loss 0.9962\n",
"step 3400: train loss 0.9853, val loss 0.9887\n",
"step 3500: train loss 0.9733, val loss 0.9780\n",
"step 3600: train loss 0.9689, val loss 0.9692\n",
"step 3700: train loss 0.9575, val loss 0.9612\n",
"step 3800: train loss 0.9484, val loss 0.9521\n",
"step 3900: train loss 0.9464, val loss 0.9450\n",
"step 4000: train loss 0.9367, val loss 0.9380\n",
"step 4100: train loss 0.9293, val loss 0.9339\n",
"step 4200: train loss 0.9286, val loss 0.9263\n",
"step 4300: train loss 0.9167, val loss 0.9206\n",
"step 4400: train loss 0.9121, val loss 0.9124\n",
"step 4500: train loss 0.9069, val loss 0.9113\n",
"step 4600: train loss 0.9024, val loss 0.9044\n",
"step 4700: train loss 0.8984, val loss 0.8985\n",
"step 4800: train loss 0.8916, val loss 0.8938\n",
"step 4900: train loss 0.8889, val loss 0.8910\n",
"step 5000: train loss 0.8830, val loss 0.8821\n",
"step 5100: train loss 0.8778, val loss 0.8809\n",
"step 5200: train loss 0.8714, val loss 0.8767\n",
"step 5300: train loss 0.8702, val loss 0.8739\n",
"step 5400: train loss 0.8626, val loss 0.8682\n",
"step 5500: train loss 0.8589, val loss 0.8628\n",
"step 5600: train loss 0.8545, val loss 0.8597\n",
"step 5700: train loss 0.8561, val loss 0.8583\n",
"step 5800: train loss 0.8486, val loss 0.8542\n",
"step 5900: train loss 0.8436, val loss 0.8506\n",
"step 5999: train loss 0.8443, val loss 0.8458\n",
"Final trained model saved!\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\\r'), FloatProgress(value=1.0, max=1.0)))"
],
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "6a9a993a2bde4bc7acdd44e35ee8a423"
}
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"\n",
"Run history:
train_loss | █▄▄▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ |
val_loss | █▄▄▄▃▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ |
Run summary:
train_loss | 0.84435 |
val_loss | 0.84581 |
"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
" View run distinctive-plasma-3 at: https://wandb.ai/druvithlgowda00/Mixture%20of%20experts/runs/r801x9fx
Synced 5 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)"
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
""
],
"text/html": [
"Find logs at: ./wandb/run-20240323_052806-r801x9fx/logs
"
]
},
"metadata": {}
}
]
},
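{
"cell_type": "markdown",
"source": [
"The cell above saves only the model's `state_dict`. Below is a minimal sketch, not part of the original run, of how that checkpoint could be reloaded later for inference; it assumes `model` and `device` are the same objects built earlier in this notebook, so only the weights come from the saved file."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: reload the checkpoint saved above for later inference.\n",
"# Assumes `model` and `device` are the ones defined earlier in this notebook;\n",
"# only the weights themselves come from the file on disk.\n",
"import os\n",
"import torch\n",
"\n",
"checkpoint_path = os.path.join('Mixture of experts models', 'moe2_model.pth')\n",
"state_dict = torch.load(checkpoint_path, map_location=device)\n",
"model.load_state_dict(state_dict)\n",
"model.eval()  # disable dropout before generating\n",
"print('checkpoint reloaded from', checkpoint_path)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},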
{
"cell_type": "code",
"source": [
"# generation\n",
"d = 'once'\n",
"context = torch.tensor(encode(d), dtype=torch.long, device=device).unsqueeze(0)\n",
"print(decode(model.generate(context, max_new_tokes=500)[0].tolist()))\n"
],
"metadata": {
"id": "FyJODzaHezJa",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "318940c5-e203-4f4f-bfde-a114edaff372"
},
"execution_count": 26,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"once to go home \n",
"Once upon a time there was a little girl named Lily She loved to play with her together in the tracks One day her mom made a little here with a big sea She had answed \n",
" Let s play as play white The tree was crying and continued but then we down the sea tost inside the spick replied That dog for it is thought about \n",
"Then it s house The boy and the man fell down about the pole and excited away The wind was go away to and Tom again They excited to put the circle \n"
]
}
]
},
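{
"cell_type": "markdown",
"source": [
"A small, hedged variation on the generation cell above: loop over a few prompts and reuse the same `encode`, `decode`, and `model.generate` helpers defined earlier. The prompt strings are illustrative and were not part of the original run; the keyword argument mirrors the call above."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: sample continuations for several prompts with the same generate() call used above.\n",
"# `encode`, `decode`, `model`, and `device` come from earlier cells; the prompts are made up.\n",
"import torch\n",
"\n",
"prompts = ['once upon a time', 'the little dog']\n",
"for p in prompts:\n",
"    ctx = torch.tensor(encode(p), dtype=torch.long, device=device).unsqueeze(0)\n",
"    with torch.no_grad():\n",
"        out = model.generate(ctx, max_new_tokes=200)[0].tolist()  # keyword matches the cell above\n",
"    print(decode(out))\n",
"    print('-' * 40)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},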
{
"cell_type": "code",
"source": [
"#push the weights to hugging_face.hub\n",
"!pip install transformers huggingface_hub"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ie3l0Ks79VWV",
"outputId": "161fb7ba-9b18-43dc-e2cb-8577484ad939"
},
"execution_count": 27,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.38.2)\n",
"Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.10/dist-packages (0.20.3)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.13.1)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.25.2)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (24.0)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.12.25)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n",
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.15.2)\n",
"Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.4.2)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.2)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (2023.6.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface_hub) (4.10.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.2)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.6)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2024.2.2)\n"
]
}
]
},
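{
"cell_type": "markdown",
"source": [
"The intended flow from here is: install the client libraries (above), log in (next cell), create a model repo on the Hub, and add the checkpoint and logs to it. Below is a hedged sketch of the repo-creation step; the repo name mirrors the `Tiny_StoriesMoE` folder used later, the full repo id depends on the logged-in account, and the cell assumes the login below has already completed."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: create (or reuse) a model repo on the Hub for the checkpoint.\n",
"# Run this after notebook_login() below (or pass an explicit token).\n",
"from huggingface_hub import create_repo\n",
"\n",
"repo_url = create_repo('Tiny_StoriesMoE', repo_type='model', exist_ok=True)\n",
"print(repo_url)"
],
"metadata": {},
"execution_count": null,
"outputs": []
},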
{
"cell_type": "code",
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 157,
"referenced_widgets": [
"0a85aa309bda4b9abf9240eeb8f403f0",
"38c9ff2937f142fdaf2fdf488fda9656",
"1e8f3c64ce81450994fc21b78e3aba76",
"c80588db8c264ec0a4c17c890c213963",
"5aa58b3190a148b1b9bbe28acc811192",
"12a46681a6004746b6869569c65b71a2",
"fb800a6d97954553b45cfb9c712dab9c",
"9c102c35b48a454d9e83f61507097b4c",
"aa87be83bcc24e8aad06ffa648d1a057",
"a3eee979a8d544e8bdb76682d4e1a89f",
"7089322af59a42d4977b3251f0b7e2cf",
"057290b3b190427c8384d5a271bac2b5",
"53c0c50ce74d425d8125d70aeead96c8",
"5bd49cf339f64cf2b46079cf75bda5fd",
"c78516a5de83466f884572700ce225d1",
"9444ddb29edd421da3d74baf0cece75d",
"20cdbaaf3958455c862a2756e92e8f92",
"eb83010cb46043268a8b36f8f25c35e7",
"8247f0e0fe2d48bdb1cd9627f6e53417",
"ffb23aea10c94126bd8bdc54b265908d",
"9df202bad1e84ef8bc68982042d8831f",
"7681267bc11d410ebc3954064d1cd416",
"eaf3299530ba4f68a75b107d54bcd027",
"2b4660cfb7c24a269c8dda2a89c7fcef",
"075428eaefb44bcbb96d96d2019deeef",
"c75f2dcbcd724ba0aed77744c6943faf",
"6dba6d0fdf464e84a1319e793974a455",
"d1b60ca0cb6947a7afb1007de23f59aa",
"3f86a6ed43c14dbeb965535c44a114bd",
"f458884d36c64771ba0011bc94017dcf",
"e902c5cb7e884850ad1ba65fe7300a7a",
"cd97d20ae2e742ed9476c0829f1ee586"
]
},
"id": "7EUxRlt0ARds",
"outputId": "c18522ee-f346-460a-925f-d421ac92be58"
},
"execution_count": 28,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"VBox(children=(HTML(value='
\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mshutil\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mshutil\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmove\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"wandb\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Tiny_StoriesMoE\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/usr/lib/python3.10/shutil.py\u001b[0m in \u001b[0;36mmove\u001b[0;34m(src, dst, copy_function)\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreal_dst\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 814\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Destination path '%s' already exists\"\u001b[0m \u001b[0;34m%\u001b[0m \u001b[0mreal_dst\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 815\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrename\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreal_dst\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mError\u001b[0m: Destination path 'Tiny_StoriesMoE/wandb' already exists"
]
}
]
},
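{
"cell_type": "markdown",
"source": [
"The move above failed because `Tiny_StoriesMoE/wandb` was left over from an earlier attempt. As a hedged sketch, the same move can be guarded by checking the destination first (paths are the ones used above)."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: move the wandb logs into the repo folder only if they are not already there.\n",
"import os\n",
"import shutil\n",
"\n",
"src = 'wandb'\n",
"dst = os.path.join('Tiny_StoriesMoE', 'wandb')\n",
"if os.path.exists(src) and not os.path.exists(dst):\n",
"    shutil.move(src, dst)\n",
"    print('moved', src, '->', dst)\n",
"else:\n",
"    print('skipped: destination already exists or source is missing')"
],
"metadata": {},
"execution_count": null,
"outputs": []
},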
{
"cell_type": "code",
"source": [
"repo.push_to_hub(commit_message=\"wandb logs\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 385
},
"id": "lAJTdA1QBslH",
"outputId": "1aa73e28-0bf3-4a93-8faf-89196c251f68"
},
"execution_count": 39,
"outputs": [
{
"output_type": "error",
"ename": "IsADirectoryError",
"evalue": "[Errno 21] Is a directory: '/content/Tiny_StoriesMoE/wandb/latest-run'",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIsADirectoryError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mrepo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpush_to_hub\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommit_message\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"wandb logs\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mpush_to_hub\u001b[0;34m(self, commit_message, blocking, clean_ok, auto_lfs_prune)\u001b[0m\n\u001b[1;32m 1320\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Repo currently clean. Ignoring push_to_hub\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1321\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1322\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgit_add\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mauto_lfs_track\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1323\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgit_commit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcommit_message\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1324\u001b[0m return self.git_push(\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mgit_add\u001b[0;34m(self, pattern, auto_lfs_track)\u001b[0m\n\u001b[1;32m 1024\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1025\u001b[0m \u001b[0;31m# Read the remaining files and track them if they're binary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1026\u001b[0;31m \u001b[0mtracked_files\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mextend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mauto_track_binary_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpattern\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1027\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1028\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtracked_files\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mauto_track_binary_files\u001b[0;34m(self, pattern)\u001b[0m\n\u001b[1;32m 918\u001b[0m )\n\u001b[1;32m 919\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 920\u001b[0;31m \u001b[0mis_binary\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mis_binary_file\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpath_to_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 921\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 922\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mis_binary\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.10/dist-packages/huggingface_hub/repository.py\u001b[0m in \u001b[0;36mis_binary_file\u001b[0;34m(filename)\u001b[0m\n\u001b[1;32m 228\u001b[0m \"\"\"\n\u001b[1;32m 229\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 230\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfilename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 231\u001b[0m \u001b[0mcontent\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;36m1024\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# Read a maximum of 10MB\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 232\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mIsADirectoryError\u001b[0m: [Errno 21] Is a directory: '/content/Tiny_StoriesMoE/wandb/latest-run'"
]
}
]
},
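{
"cell_type": "markdown",
"source": [
"`Repository.push_to_hub` failed while auto-tracking binary files because `wandb/latest-run` is a symlink to a run directory. A hedged alternative sketch: upload the folder over HTTP with `HfApi.upload_folder` and skip the offending paths via `ignore_patterns`. The repo id is a placeholder (replace the username) and the cell assumes the earlier `notebook_login()` succeeded."
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Sketch: upload the local folder with the HTTP-based API instead of the git-based Repository,\n",
"# skipping the wandb symlink/run folders that broke auto_track_binary_files above.\n",
"# The repo id is a placeholder -- replace <username> with the actual Hub account.\n",
"from huggingface_hub import HfApi\n",
"\n",
"api = HfApi()\n",
"api.upload_folder(\n",
"    folder_path='Tiny_StoriesMoE',\n",
"    repo_id='<username>/Tiny_StoriesMoE',\n",
"    repo_type='model',\n",
"    commit_message='wandb logs',\n",
"    ignore_patterns=['**/latest-run*'],\n",
")"
],
"metadata": {},
"execution_count": null,
"outputs": []
},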
{
"cell_type": "code",
"source": [
"shutil.move(\"Mixture of experts models/moe2_model.pth\", \"Tiny_StoriesMoE\")"
],
"metadata": {
"id": "uFYn9p-FCXv8"
},
"execution_count": null,
"outputs": []
}
]
} | |