import os

import gradio as gr
import torch
import transformers
from transformers import AutoTokenizer
from huggingface_hub import InferenceClient  # used only by the commented-out streaming client below
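# Dependency note (hedged): this script assumes gradio, transformers, torch,
# accelerate (needed for device_map="auto") and bitsandbytes (needed for 4-bit
# loading) are installed, e.g. via the Space's requirements.txt.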


# Model-loading settings (currently unused by the pipeline call below).
max_seq_length = 2048  # Choose any; RoPE scaling is supported automatically.
dtype = None  # None for auto detection. float16 for Tesla T4/V100, bfloat16 for Ampere+.
load_in_4bit = True  # Use 4-bit quantization to reduce memory usage. Can be False.


#endpoint_url = os.getenv('url')

hf_token = os.getenv('hf_token')
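# The token is read from the environment (e.g. a Space secret) and is needed for
# gated models such as meta-llama/Llama-3.2-1B. A minimal sanity check:
if hf_token is None:
    print("Warning: hf_token is not set; downloading gated models may fail.")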

# # Streaming Client
# client = InferenceClient(endpoint_url, token=hf_token)

# gen_kwargs = dict(
#     max_new_tokens=1024,
#     top_k=50,
#     top_p=0.9,
#     temperature=0.5,
#     repetition_penalty=1.2, #1.02
#     stop= ["\nUser:", "<|endoftext|>", "</s>"],
# )


tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", token=hf_token)

pipeline = transformers.pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-1B",
    # model="traversaal-llm-regional-languages/Unsloth_Urdu_Llama3_1_FP16_PF100",
    torch_dtype=torch.float16,
    device_map="auto",
    # 4-bit loading is a model-loading argument, so it must go through model_kwargs
    # rather than being passed to the pipeline directly; requires bitsandbytes.
    model_kwargs={"load_in_4bit": True},
    token=hf_token,
)
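# Hedged alternative: on recent transformers releases the preferred way to request
# 4-bit loading is a BitsAndBytesConfig passed via model_kwargs (a sketch, assuming
# bitsandbytes and a CUDA GPU are available):
#
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
# pipeline = transformers.pipeline(
#     "text-generation",
#     model="meta-llama/Llama-3.2-1B",
#     model_kwargs={"quantization_config": bnb_config},
#     device_map="auto",
#     token=hf_token,
# )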



def predict(prompt):
    # Wrap the raw user input in the instruction template before generation.
    chat_prompt = f"""
### Instruction:
You are a chatbot. Provide answers with your best knowledge. Don't say you don't know unless you really don't.

### Input:
{prompt}

### Response:
"""
    sequences = pipeline(
        chat_prompt,  # pass the templated prompt, not the raw user input
        do_sample=True,
        temperature=0.2,
        top_p=0.9,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_new_tokens=100,      # cap newly generated tokens instead of total length
        return_full_text=False,  # return only the model's continuation, not the prompt
    )
    response = ''
    for seq in sequences:
        response += seq['generated_text']
    return response
# def generate_text(prompt):
#     """Generates text using the Hugging Face Inference API."""
#     chat_prompt = f"""

# ### Instruction:
# You are a chatbot. Chat in Urdu. Provide answers with your best knowledge. Don't say you don't know unless you really don't

# ### Input:
# {prompt}

# ### Response:
# ""
# """
#     stream = client.text_generation(chat_prompt, stream=True, details=True, **gen_kwargs)
#     generated_text = ""
#     for r in stream:
#         if r.token.special:
#             continue
#         if r.token.text in gen_kwargs["stop"]:
#             break
#         generated_text += r.token.text
#         yield generated_text
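
# Hedged sketch: a streaming variant of predict() for the local pipeline, using
# transformers' TextIteratorStreamer so Gradio can display tokens as they arrive.
# The name predict_stream and the generation settings are illustrative only.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def predict_stream(prompt):
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     inputs = tokenizer(prompt, return_tensors="pt").to(pipeline.model.device)
#     Thread(target=pipeline.model.generate,
#            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=100)).start()
#     text = ""
#     for token in streamer:
#         text += token
#         yield text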

iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="text",
    examples=[
        'میں کراچی جانا چاہتا ہوں، وہاں کے کچھ بہترین مقامات کون سے ہیں؟',
        'amazing food locations in Singapore',
        'best activities in London',
    ],
    title="Urdu Chatbot - Powered by traversaal-urdu-llama-3.1-8b",
    description="Ask me anything in Urdu!",
)

iface.launch()