{
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "# %%capture\n",
        "# # Installs Unsloth, Xformers (Flash Attention) and all other packages!\n",
        "!pip install \"unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git\" --quiet\n",
        "\n",
        "# We have to check which Torch version for Xformers (2.3 -> 0.0.27)\n",
        "from torch import __version__; from packaging.version import Version as V\n",
        "xformers = \"xformers==0.0.27\" if V(__version__) < V(\"2.4.0\") else \"xformers\"\n",
        "!pip install --no-deps {xformers} \"trl<0.9.0\" peft accelerate bitsandbytes triton --quiet\n",
        "\n",
        "!pip install peft --quiet\n",
        "!pip install --upgrade --no-cache-dir \"transformers<4.45.0\"  --quiet # Reason: https://github.com/unslothai/unsloth/issues/1061\n",
        "\n",
        "!pip install -q gradio"
      ],
      "metadata": {
        "id": "g0gl_TBTXRYC",
        "outputId": "67222684-6f4f-4027-d8a5-32788590081c",
        "colab": {
          "base_uri": "https://localhost:8080/"
        }
      },
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "  Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
            "  Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
            "  Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import gradio as gr\n",
        "import random\n",
        "import time\n",
        "import os\n",
        "from unsloth import FastLanguageModel\n",
        "import torch\n",
        "max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!\n",
        "dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+\n",
        "load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.\n",
        "\n",
        "huggingface_token = \"\"\n",
        "\n",
        "if True:\n",
        "    from unsloth import FastLanguageModel\n",
        "    model, tokenizer = FastLanguageModel.from_pretrained(\n",
        "        model_name = \"traversaal-llm-regional-languages/Urdu_Llama3_2_4bit_PF25_adapter\", # YOUR MODEL YOU USED FOR TRAINING\n",
        "        max_seq_length = max_seq_length,\n",
        "        dtype = dtype,\n",
        "        load_in_4bit = load_in_4bit,\n",
        "        token = huggingface_token,\n",
        "    )\n",
        "    FastLanguageModel.for_inference(model) # Enable native 2x faster inference\n",
        "\n",
        "\n",
        "alpaca_prompt = \"\"\"{0}\\nInput: {1}\\nOutput: \"\"\"\n",
        "\n",
        "def generate_text(prompt):\n",
        "    # Format the prompt with instruction and input, and leave output prompt blank\n",
        "    formatted_prompt = alpaca_prompt.format(\n",
        "        \"دیئے گئے موضوع کے بارے میں ایک مختصر پیراگراف لکھیں۔\",  # instruction\n",
        "        prompt  # user input\n",
        "    )\n",
        "\n",
        "    # Tokenize the prompt and move tensors to GPU\n",
        "    inputs = tokenizer([formatted_prompt], return_tensors=\"pt\").to(\"cuda\")\n",
        "\n",
        "    # Generate output from the model\n",
        "    outputs = model.generate(**inputs, max_new_tokens=200, use_cache=True)\n",
        "\n",
        "    # Decode the output and remove the instruction + input part\n",
        "    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
        "\n",
        "    # Remove the prompt part by splitting on \"Output:\" and returning only generated part\n",
        "    result = generated_text.split(\"Output:\")[-1].strip()\n",
        "\n",
        "    return result\n",
        "\n",
        "iface = gr.Interface(\n",
        "    fn=generate_text,\n",
        "    inputs=gr.Textbox(lines=2, placeholder=\"Enter your prompt here...\"),\n",
        "    examples=['میں کراچی جانا چاہتا ہوں، وہاں کے کچھ بہترین مقامات کون سے ہیں؟',\n",
        "              'amazing food locations in Singapore',\n",
        "              'best activities in London'],\n",
        "    outputs=\"text\",\n",
        "    title=\"Urdu Chatbot - Powered by traversaal-urdu-llama-3.2-1b\",\n",
        "    description=\"Ask me anything in Urdu!\",\n",
        ")\n",
        "\n",
        "iface.launch()\n"
      ],
      "metadata": {
        "id": "SM6OLuM5gve7",
        "outputId": "a3512ee6-8f5f-40c5-d792-1c7d34bbe2e2",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 796
        }
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n",
            "==((====))==  Unsloth 2024.9.post4: Fast Llama patching. Transformers = 4.44.2.\n",
            "   \\\\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.\n",
            "O^O/ \\_/ \\    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.\n",
            "\\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]\n",
            " \"-____-\"     Free Apache license: http://github.com/unslothai/unsloth\n",
            "Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "Unsloth 2024.9.post4 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n",
            "\n",
            "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n",
            "* Running on public URL: https://8d8a38dbca08b1f69c.gradio.live\n",
            "\n",
            "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "<div><iframe src=\"https://8d8a38dbca08b1f69c.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": []
          },
          "metadata": {},
          "execution_count": 2
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "t1Zk6rKQoeYc"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}