import gradio as gr
import os
from transformers import AutoTokenizer
import transformers
import torch
from huggingface_hub import InferenceClient

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.

# endpoint_url = os.getenv('url')
hf_token = os.getenv('hf_token')

# # Streaming client (dedicated inference endpoint); kept for reference.
# client = InferenceClient(endpoint_url, token=hf_token)
# gen_kwargs = dict(
#     max_new_tokens=1024,
#     top_k=50,
#     top_p=0.9,
#     temperature=0.5,
#     repetition_penalty=1.2,  # 1.02
#     stop=["\nUser:", "<|endoftext|>", ""],
# )

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
pipeline = transformers.pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-1B",
    # model="traversaal-llm-regional-languages/Unsloth_Urdu_Llama3_1_FP16_PF100",
    torch_dtype=torch.float16,
    device_map="auto",
    model_kwargs={"load_in_4bit": load_in_4bit},  # quantization flags belong in model_kwargs, not pipeline kwargs
)


def predict(prompt):
    chat_prompt = f"""
### Instruction:
You are a chatbot. Provide answers with your best knowledge. Don't say you don't know unless you really don't.
### Input:
{prompt}
### Response:
"""
    sequences = pipeline(
        chat_prompt,  # was `prompt`, which silently dropped the instruction template
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=256,  # bound new tokens; max_length=100 could be shorter than the prompt itself
        return_full_text=False,  # return only the completion, not the prompt template
    )
    response = ''
    for seq in sequences:
        response += seq['generated_text']
    return response


# def generate_text(prompt):
#     """Generates text by streaming from the Hugging Face Inference API."""
#     chat_prompt = f"""
# ### Instruction:
# You are a chatbot. Chat in Urdu. Provide answers with your best knowledge. Don't say you don't know unless you really don't.
# ### Input:
# {prompt}
# ### Response:
# """
#     stream = client.text_generation(chat_prompt, stream=True, details=True, **gen_kwargs)
#     generated_text = ""
#     for r in stream:
#         if r.token.special:
#             continue
#         if r.token.text in gen_kwargs["stop"]:
#             break
#         generated_text += r.token.text
#         yield generated_text


iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    examples=[
        'میں کراچی جانا چاہتا ہوں، وہاں کے کچھ بہترین مقامات کون سے ہیں؟',
        'amazing food locations in Singapore',
        'best activities in London',
    ],
    outputs="text",
    title="Urdu Chatbot - Powered by traversaal-urdu-llama-3.1-8b",
    description="Ask me anything in Urdu!",
)

iface.launch()
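
# A hedged alternative for the 4-bit load above: recent transformers releases
# prefer an explicit BitsAndBytesConfig (requires the bitsandbytes package)
# over a bare load_in_4bit flag. A minimal sketch, assuming bitsandbytes is
# installed:
#
# from transformers import BitsAndBytesConfig
#
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.float16,  # match the torch_dtype used above
# )
# pipeline = transformers.pipeline(
#     "text-generation",
#     model="meta-llama/Llama-3.2-1B",
#     device_map="auto",
#     model_kwargs={"quantization_config": quant_config},
# )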