HAITAME LAFRAME committed on
Commit 2bbaa94 · verified · 1 Parent(s): 4da74c8

Update app.py

Files changed (1)
  1. app.py +41 -55
app.py CHANGED
@@ -1,4 +1,8 @@
-import gradio as gr
+import subprocess
+import sys
+import os
+
+# Check whether torch is installed; install it if not
 try:
     import torch
 except ImportError:
@@ -6,37 +10,36 @@ except ImportError:
     subprocess.run([sys.executable, "-m", "pip", "install", "torch"], check=True)
     import torch
 
-
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    TextIteratorStreamer,
-)
-import os
-from threading import Thread
-import spaces
-import time
-import subprocess
-
+# Install flash-attn
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
 )
 
-token = os.environ["HF_TOKEN"]
+import gradio as gr
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    TextIteratorStreamer,
+)
+from threading import Thread
 
+# Get the Hugging Face authentication token
+token = os.getenv("HF_TOKEN")
+if not token:
+    raise ValueError("The HF_TOKEN authentication token is not set.")
 
+# Load the model and the tokenizer
 model = AutoModelForCausalLM.from_pretrained(
     "HaitameLaf/Phi3-Game16bit",
-    token=token,
+    use_auth_token=token,
     trust_remote_code=True,
 )
-tok = AutoTokenizer.from_pretrained("HaitameLaf/Phi3-Game16bit", token=token)
-terminators = [
-    tok.eos_token_id,
-]
+tok = AutoTokenizer.from_pretrained("HaitameLaf/Phi3-Game16bit", use_auth_token=token)
+terminators = [tok.eos_token_id]
 
+# Check GPU availability
 if torch.cuda.is_available():
     device = torch.device("cuda")
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
@@ -45,33 +48,24 @@ else:
     print("Using CPU")
 
 model = model.to(device)
-# Dispatch Errors
-
 
-@spaces.GPU(duration=60)
+# Chat function
 def chat(message, history, temperature, do_sample, max_tokens):
-    chat = []
-    for item in history:
-        chat.append({"role": "user", "content": item[0]})
-        if item[1] is not None:
-            chat.append({"role": "assistant", "content": item[1]})
+    chat = [{"role": "user", "content": item[0]} for item in history]
+    chat.extend({"role": "assistant", "content": item[1]} for item in history if item[1])
     chat.append({"role": "user", "content": message})
     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     model_inputs = tok([messages], return_tensors="pt").to(device)
-    streamer = TextIteratorStreamer(
-        tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
-    )
-    generate_kwargs = dict(
-        model_inputs,
-        streamer=streamer,
-        max_new_tokens=max_tokens,
-        do_sample=True,
-        temperature=temperature,
-        eos_token_id=terminators,
-    )
-
-    if temperature == 0:
-        generate_kwargs["do_sample"] = False
+    streamer = TextIteratorStreamer(tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+
+    generate_kwargs = {
+        "input_ids": model_inputs.input_ids,
+        "streamer": streamer,
+        "max_new_tokens": max_tokens,
+        "do_sample": do_sample,
+        "temperature": temperature,
+        "eos_token_id": terminators,
+    }
 
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
@@ -83,30 +77,22 @@ def chat(message, history, temperature, do_sample, max_tokens):
 
     yield partial_text
 
-
+# Gradio configuration
 demo = gr.ChatInterface(
     fn=chat,
     examples=[["Write me a poem about Machine Learning."]],
-    # multimodal=False,
     additional_inputs_accordion=gr.Accordion(
         label="⚙️ Parameters", open=False, render=False
     ),
     additional_inputs=[
-        gr.Slider(
-            minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
-        ),
+        gr.Slider(minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature"),
         gr.Checkbox(label="Sampling", value=True),
-        gr.Slider(
-            minimum=128,
-            maximum=4096,
-            step=1,
-            value=512,
-            label="Max new tokens",
-            render=False,
-        ),
+        gr.Slider(minimum=128, maximum=4096, step=1, value=512, label="Max new tokens"),
     ],
     stop_btn="Stop Generation",
     title="Chat With LLMs",
     description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)",
 )
-demo.launch()
+
+if __name__ == "__main__":
+    demo.launch()
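
Note on the rewritten history handling: the new list comprehension in chat() collects every user turn first and then extends the list with the assistant turns, so a multi-turn history reaches tok.apply_chat_template() with the roles grouped rather than alternating. Below is a minimal sketch of an order-preserving rebuild, assuming history is the list of [user_message, assistant_reply] pairs that gr.ChatInterface passes in; the helper name build_chat is hypothetical and not part of this commit.

# Hypothetical helper, not part of the commit: rebuilds the prompt history
# while keeping user/assistant turns interleaved, assuming each history item
# is a [user_message, assistant_reply] pair from gr.ChatInterface.
def build_chat(history, message):
    chat = []
    for user_msg, assistant_msg in history:
        chat.append({"role": "user", "content": user_msg})
        if assistant_msg is not None:
            chat.append({"role": "assistant", "content": assistant_msg})
    chat.append({"role": "user", "content": message})
    return chat

This mirrors the explicit loop that the removed version of chat() used, expressed as a standalone helper.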