import os
from threading import Thread  # used to run model.generate in the background while streaming

import torch
import gradio as gr
from unsloth import FastLanguageModel  # Unsloth recommends importing it before transformers so its patches apply
from transformers import TextIteratorStreamer
from huggingface_hub import login
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
hf_token = os.getenv('hf_token')  # expected to be configured as a secret on the Space
login(token=hf_token)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="traversaal-llm-regional-languages/Unsloth_Urdu_Llama3_1_4bit_PF100",
    max_seq_length=max_seq_length,
    dtype=dtype,  # None = auto-detect; the string 'Auto' is not a valid dtype
    load_in_4bit=load_in_4bit,
)
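# Per the Unsloth docs, for_inference() switches the patched model into its
# faster native inference mode (they advertise roughly 2x faster generation).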
FastLanguageModel.for_inference(model)
chat_prompt = """
### Instruction:
You are a chatbot. Provide answers with your best knowledge in Urdu only. Don't say you don't know unless you really don't know.
### Input:
{prompt}
### Response:
"""
def generate_response(query):
    """Stream the model's answer to `query`, yielding the text generated so far."""
    prompt = chat_prompt.format(prompt=query)
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
    # TextIteratorStreamer turns generate() output into an iterator of decoded text chunks.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs,  # BatchEncoding is a mapping, so this spreads input_ids / attention_mask
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
        repetition_penalty=1.2,  # was 1.02
    )
    # Run generation in a background thread so the streamer can be consumed here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    generated_text = ""
    for new_text in streamer:
        # Defensive trim in case an EOS token slips through skip_special_tokens.
        if new_text.endswith(tokenizer.eos_token):
            new_text = new_text[: -len(tokenizer.eos_token)]
        generated_text += new_text
        yield generated_text
    thread.join()
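# Because generate_response is a generator, Gradio streams each yielded string
# to the output textbox, so the running reply appears token by token.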
iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    examples=[
        'میں کراچی جانا چاہتا ہوں، وہاں کے کچھ بہترین مقامات کون سے ہیں؟',  # "I want to go to Karachi; what are some of the best places there?"
        'amazing food locations in Singapore',
        'best activities in London',
    ],
    outputs="text",
    title="Urdu Chatbot - Powered by traversaal-urdu-llama-3.1-8b",
    description="Ask me anything in Urdu!",
)
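# Note: older Gradio releases (3.x) only stream generator outputs when the
# queue is enabled, e.g. iface.queue().launch(); recent versions queue by default.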
iface.launch()