{
 "cells": [
  {
   "cell_type": "code",
   "source": [
    "# Install Unsloth, Xformers (Flash Attention) and the other runtime packages.\n",
    "# Tip: make `%%capture` the first line of this cell to hide the installer output.\n",
    "%pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\" --quiet\n",
    "\n",
    "# Xformers has to match the installed Torch: Torch older than 2.4.0 gets xformers 0.0.27.\n",
    "from torch import __version__; from packaging.version import Version as V\n",
    "xformers = \"xformers==0.0.27\" if V(__version__) < V(\"2.4.0\") else \"xformers\"\n",
    "%pip install --no-deps {xformers} \"trl<0.9.0\" peft accelerate bitsandbytes triton --quiet\n",
    "\n",
    "%pip install peft --quiet\n",
    "# transformers is capped below 4.45.0; reason: https://github.com/unslothai/unsloth/issues/1061\n",
    "%pip install --upgrade --no-cache-dir \"transformers<4.45.0\" --quiet\n",
    "\n",
    "%pip install -q gradio"
   ],
   "metadata": {
    "id": "g0gl_TBTXRYC",
    "outputId": "67222684-6f4f-4027-d8a5-32788590081c",
    "colab": {
     "base_uri": "https://localhost:8080/"
    }
   },
   "execution_count": 1,
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
      " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
      " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
     ]
    }
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "import random\n",
    "import time\n",
    "\n",
    "import gradio as gr\n",
    "from unsloth import FastLanguageModel\n",
    "import torch\n",
    "\n",
    "# ---- Configuration -------------------------------------------------------\n",
    "max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n",
    "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
    "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n",
    "max_new_tokens = 200 # Upper bound on the number of tokens generated per request.\n",
    "\n",
    "# Never paste a token into the notebook: read it from the HF_TOKEN environment\n",
    "# variable instead. None is passed when the variable is unset or empty.\n",
    "huggingface_token = os.environ.get(\"HF_TOKEN\") or None\n",
    "\n",
    "# ---- Model ---------------------------------------------------------------\n",
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name = \"traversaal-llm-regional-languages/Urdu_Llama3_2_4bit_PF25_adapter\", # YOUR MODEL YOU USED FOR TRAINING\n",
    "    max_seq_length = max_seq_length,\n",
    "    dtype = dtype,\n",
    "    load_in_4bit = load_in_4bit,\n",
    "    token = huggingface_token,\n",
    ")\n",
    "FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
    "\n",
    "# ---- Prompting -----------------------------------------------------------\n",
    "# {0} is the instruction, {1} is the user's input; the model continues after \"Output: \".\n",
    "alpaca_prompt = \"\"\"{0}\\nInput: {1}\\nOutput: \"\"\"\n",
    "\n",
    "# Fixed instruction sent with every request\n",
    "# (Urdu for: \"Write a short paragraph about the given topic.\").\n",
    "urdu_instruction = \"دیئے گئے موضوع کے بارے میں ایک مختصر پیراگراف لکھیں۔\"\n",
    "\n",
    "\n",
    "def generate_text(prompt: str) -> str:\n",
    "    \"\"\"Return the model's reply to ``prompt``.\n",
    "\n",
    "    Args:\n",
    "        prompt: Topic or question entered by the user.\n",
    "\n",
    "    Returns:\n",
    "        The newly generated text with surrounding whitespace stripped, or an\n",
    "        empty string when ``prompt`` is empty or contains only whitespace.\n",
    "    \"\"\"\n",
    "    # Nothing to answer: skip the (expensive) model call entirely.\n",
    "    if not prompt or not prompt.strip():\n",
    "        return \"\"\n",
    "\n",
    "    # Fill the template with the fixed instruction and the user's input.\n",
    "    formatted_prompt = alpaca_prompt.format(urdu_instruction, prompt)\n",
    "\n",
    "    # Tokenize the prompt and move tensors to GPU\n",
    "    inputs = tokenizer([formatted_prompt], return_tensors=\"pt\").to(\"cuda\")\n",
    "\n",
    "    # Generate output from the model\n",
    "    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)\n",
    "\n",
    "    # The returned sequence is the prompt tokens followed by the new tokens.\n",
    "    # Drop the prompt by length instead of splitting the decoded text on\n",
    "    # \"Output:\", which discarded part of the answer whenever the model itself\n",
    "    # produced the text \"Output:\".\n",
    "    prompt_length = inputs[\"input_ids\"].shape[1]\n",
    "    generated_tokens = outputs[0][prompt_length:]\n",
    "\n",
    "    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()\n",
    "\n",
    "\n",
    "# ---- Gradio UI -----------------------------------------------------------\n",
    "iface = gr.Interface(\n",
    "    fn=generate_text,\n",
    "    inputs=gr.Textbox(lines=2, placeholder=\"Enter your prompt here...\"),\n",
    "    examples=[\n",
    "        'میں کراچی جانا چاہتا ہوں، وہاں کے کچھ بہترین مقامات کون سے ہیں؟',\n",
    "        'amazing food locations in Singapore',\n",
    "        'best activities in London',\n",
    "    ],\n",
    "    outputs=\"text\",\n",
    "    title=\"Urdu Chatbot - Powered by traversaal-urdu-llama-3.2-1b\",\n",
    "    description=\"Ask me anything in Urdu!\",\n",
    ")\n",
    "\n",
    "iface.launch()\n"
   ],
   "metadata": {
    "id": "SM6OLuM5gve7",
    "outputId":
"a3512ee6-8f5f-40c5-d792-1c7d34bbe2e2", "colab": { "base_uri": "https://localhost:8080/", "height": 796 } }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n", "==((====))== Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.\n", " \\\\ /| GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.\n", "O^O/ \\_/ \\ Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.\n", "\\ / Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]\n", " \"-____-\" Free Apache license: http://github.com/unslothai/unsloth\n", "Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "Unsloth 2024.9.post4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n", "\n", "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n", "* Running on public URL: https://8d8a38dbca08b1f69c.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "" ], "text/html": [ "
" ] }, "metadata": {} }, { "output_type": "execute_result", "data": { "text/plain": [] }, "metadata": {}, "execution_count": 2 } ] }, { "cell_type": "code", "source": [], "metadata": { "id": "t1Zk6rKQoeYc" }, "execution_count": null, "outputs": [] } ], "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "accelerator": "GPU" }, "nbformat": 4, "nbformat_minor": 0 }