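"""Gradio demo: an Urdu chatbot that streams responses from an Unsloth 4-bit Llama 3.1 model."""
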
import os
from threading import Thread

from unsloth import FastLanguageModel  # Import before transformers so Unsloth's patches apply.

import gradio as gr
import torch
from huggingface_hub import login
from transformers import TextIteratorStreamer


max_seq_length = 2048  # Choose any; Unsloth supports RoPE scaling internally.
dtype = None  # None for auto detection. Float16 for Tesla T4/V100, Bfloat16 for Ampere+.
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.


# Authenticate with the Hugging Face Hub using a token stored in the environment.
hf_token = os.getenv('hf_token')
login(token=hf_token)


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="traversaal-llm-regional-languages/Unsloth_Urdu_Llama3_1_4bit_PF100",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
FastLanguageModel.for_inference(model)  # Switch to Unsloth's optimized inference mode.
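# Note: 4-bit loading goes through bitsandbytes, which in practice requires a CUDA GPU;
# on a CPU-only host, set load_in_4bit = False and expect much slower generation.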



chat_prompt = """
### Instruction:
You are a chatbot. Provide answers with your best knowledge in Urdu only. Don't say you don't know unless you really don't know.
### Input:
{prompt}
### Response:
"""


def generate_response(query):
    prompt = chat_prompt.format(prompt=query)
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")

    # Stream decoded tokens as they are produced so the UI can update incrementally.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,
        top_p=0.95,
        top_k=50,
        temperature=0.7,
        repetition_penalty=1.2,
    )

    # Run generation on a background thread; this thread consumes the streamer and
    # yields the accumulated text so the answer grows progressively in the UI.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    generated_text = ""
    for new_text in streamer:
        # skip_special_tokens usually strips the EOS token, but trim it defensively.
        if new_text.endswith(tokenizer.eos_token):
            new_text = new_text[: len(new_text) - len(tokenizer.eos_token)]
        generated_text += new_text
        yield generated_text
    thread.join()
        


iface = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    examples=['میں کراچی جانا چاہتا ہوں، وہاں کے کچھ بہترین مقامات کون سے ہیں؟', 'amazing food locations in Singapore', 'best activities in London'],
    outputs="text",
    title="Urdu Chatbot - Powered by traversaal-urdu-llama-3.1-8b",
    description="Ask me anything in Urdu!",
)

iface.launch()
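# Because generate_response is a generator, Gradio streams each yielded string to
# the output textbox as the model produces tokens (queueing is enabled by default
# in recent Gradio versions).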