ffreemt committed
Commit 2dd6f73
1 Parent(s): 51784c0

Update llama2-13b

Files changed (2)
  1. app.py +100 -142
  2. requirements.txt +6 -5
app.py CHANGED
@@ -3,10 +3,9 @@
 # ruff: noqa: E501
 import os
 import time
-from dataclasses import asdict, dataclass
+from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from types import SimpleNamespace
-from urllib.parse import urlparse
 
 import gradio as gr
 import psutil
@@ -14,7 +13,9 @@ from about_time import about_time
 
 # from ctransformers import AutoConfig, AutoModelForCausalLM
 from ctransformers import AutoModelForCausalLM
-from huggingface_hub import hf_hub_download
+
+# from huggingface_hub import hf_hub_download
+from dl_hf_model import dl_hf_model
 from loguru import logger
 
 filename_list = [
@@ -35,15 +36,58 @@ filename_list = [
 ]
 
 URL = "https://huggingface.co/TheBloke/Wizard-Vicuna-7B-Uncensored-GGML/raw/main/Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_K_M.bin"  # 4.05G
-MODEL_FILENAME = Path(URL).name
-MODEL_FILENAME = filename_list[0]  # q2_K 4.05G
-MODEL_FILENAME = filename_list[5]  # q4_1 4.21
 
-REPO_ID = "/".join(
-    urlparse(URL).path.strip("/").split("/")[:2]
-)  # TheBloke/Wizard-Vicuna-7B-Uncensored-GGML
+url = "https://huggingface.co/savvamadar/ggml-gpt4all-j-v1.3-groovy/blob/main/ggml-gpt4all-j-v1.3-groovy.bin"
+url = "https://huggingface.co/TheBloke/Llama-2-13B-GGML/blob/main/llama-2-13b.ggmlv3.q4_K_S.bin"  # 7.37G
+url = "https://huggingface.co/localmodels/Llama-2-13B-Chat-ggml/blob/main/llama-2-13b-chat.ggmlv3.q4_K_S.bin"  # 7.37G
+url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin"  # 6.93G
+
+prompt_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+### Instruction: {user_prompt}
+
+### Response:
+"""
+
+prompt_template_qa = """Question: {question}
+Answer: Let's work this out in a step by step way to be sure we have the right answer."""
+
+prompt_template = """System: You are a helpful,
+respectful and honest assistant. Always answer as
+helpfully as possible, while being safe. Your answers
+should not include any harmful, unethical, racist,
+sexist, toxic, dangerous, or illegal content. Please
+ensure that your responses are socially unbiased and
+positive in nature. If a question does not make any
+sense, or is not factually coherent, explain why instead
+of answering something not correct. If you don't know
+the answer to a question, please don't share false
+information.
+User: {prompt}
+Assistant: """
+
+prompt_prefix = [elm.split(":")[0] + ":" for elm in prompt_template.splitlines()]
+
+logger.debug(f"{prompt_prefix=}")
+
+model_loc, file_size = dl_hf_model(url)
+
+logger.debug(f"{model_loc} {file_size}GB")
+
+cpu_count = psutil.cpu_count(logical=False)
+logger.debug(f"{cpu_count=}")
+
+logger.info("load llm")
+_ = Path(model_loc).absolute().as_posix()
+logger.debug(f"model_file: {_}, exists: {Path(_).exists()}")
+LLM = None
+LLM = AutoModelForCausalLM.from_pretrained(
+    model_loc,
+    model_type="llama",  # "starcoder", AutoConfig.from_pretrained(REPO_ID)
+    threads=cpu_count,
+)
 
-DESTINATION_FOLDER = "models"
+logger.info("done load llm")
 
 os.environ["TZ"] = "Asia/Shanghai"
 try:
@@ -57,10 +101,37 @@ ns = SimpleNamespace(
     generator=[],
 )
 
-default_system_prompt = "A conversation between a user and an LLM-based AI assistant named Local Assistant. Local Assistant gives helpful and honest answers."
 
-user_prefix = "[user]: "
-assistant_prefix = "[assistant]: "
+@dataclass
+class GenerationConfig:
+    temperature: float = 0.7
+    top_k: int = 0
+    top_p: float = 0.9
+    repetition_penalty: float = 1.0
+    max_new_tokens: int = 512
+    seed: int = 42
+    reset: bool = False
+    stream: bool = True
+    threads: int = psutil.cpu_count(logical=False)  # type: ignore
+    stop: list[str] = field(default_factory=lambda: prompt_prefix[1:2])
+
+
+def generate(
+    prompt: str,
+    llm: AutoModelForCausalLM = LLM,
+    generation_config: GenerationConfig = GenerationConfig(),
+):
+    """Run model inference, will return a Generator if streaming is true."""
+    # if not user_prompt.strip():
+    _ = prompt_template.format(prompt=prompt)
+    print(_)
+    return llm(
+        _,
+        **asdict(generation_config),
+    )
+
+
+logger.debug(f"{asdict(GenerationConfig())=}")
 
 
 def predict_str(prompt, bot):  # bot is in fact bot_history
@@ -74,10 +145,7 @@ def predict_str(prompt, bot): # bot is in fact bot_history
     try:
         # user_prompt = prompt
        generator = generate(
-            LLM,
-            GENERATION_CONFIG,
-            system_prompt=default_system_prompt,
-            user_prompt=prompt.strip(),
+            prompt,
        )
 
        ns.generator = generator  # for .then
@@ -100,8 +168,6 @@ def bot_str(bot):
     else:
         bot = [["Something is wrong", ""]]
 
-    print(assistant_prefix, end=" ", flush=True)
-
     response = ""
 
     flag = 1
@@ -128,15 +194,12 @@ def predict(prompt, bot):
     try:
         # user_prompt = prompt
        generator = generate(
-            LLM,
-            GENERATION_CONFIG,
-            system_prompt=default_system_prompt,
-            user_prompt=prompt.strip(),
+            prompt,
        )
 
        ns.generator = generator  # for .then
 
-        print(assistant_prefix, end=" ", flush=True)
+        print("--", end=" ", flush=True)
 
        response = ""
        buff.update(value="diggin...")
@@ -183,15 +246,13 @@ def predict_api(prompt):
            seed=42,
            reset=False,  # reset history (cache)
            stream=True,  # TODO stream=False and generator
-            threads=os.cpu_count() // 2,  # type: ignore  # adjust for your CPU
-            stop=["<|im_end|>", "|<"],
+            threads=psutil.cpu_count(logical=False),  # type: ignore  # adjust for your CPU
+            stop=prompt_prefix[1:2],
        )
 
-        # TODO: stream does not make sense in api?
        generator = generate(
-            LLM, _, system_prompt=default_system_prompt, user_prompt=prompt.strip()
+            prompt,
        )
-        print(assistant_prefix, end=" ", flush=True)
 
        response = ""
        buff.update(value="diggin...")
@@ -211,113 +272,6 @@ def predict_api(prompt):
     return response
 
 
-def download_quant(destination_folder: str, repo_id: str, model_filename: str):
-    local_path = os.path.abspath(destination_folder)
-    return hf_hub_download(
-        repo_id=repo_id,
-        filename=model_filename,
-        local_dir=local_path,
-        local_dir_use_symlinks=True,
-    )
-
-
-@dataclass
-class GenerationConfig:
-    temperature: float
-    top_k: int
-    top_p: float
-    repetition_penalty: float
-    max_new_tokens: int
-    seed: int
-    reset: bool
-    stream: bool
-    threads: int
-    stop: list[str]
-
-
-def format_prompt(system_prompt: str, user_prompt: str):
-    """Format prompt based on: https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py."""
-    # TODO: fix prompts
-
-    system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
-    user_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
-    assistant_prompt = "<|im_start|>assistant\n"
-
-    return f"{system_prompt}{user_prompt}{assistant_prompt}"
-
-
-def generate(
-    llm: AutoModelForCausalLM,
-    generation_config: GenerationConfig,
-    system_prompt: str = default_system_prompt,
-    user_prompt: str = "",
-):
-    """Run model inference, will return a Generator if streaming is true."""
-    # if not user_prompt.strip():
-    return llm(
-        format_prompt(
-            system_prompt,
-            user_prompt,
-        ),
-        **asdict(generation_config),
-    )
-
-
-# if "mpt" in model_filename:
-#     config = AutoConfig.from_pretrained("mosaicml/mpt-30b-cha t", context_length=8192)
-#     llm = AutoModelForCausalLM.from_pretrained(
-#         os.path.abspath(f"models/{model_filename}"),
-#         model_type="mpt",
-#         config=config,
-#     )

-# https://huggingface.co/spaces/matthoffner/wizardcoder-ggml/blob/main/main.py
-_ = """
-llm = AutoModelForCausalLM.from_pretrained(
-    "TheBloke/WizardCoder-15B-1.0-GGML",
-    model_file="WizardCoder-15B-1.0.ggmlv3.q4_0.bin",
-    model_type="starcoder",
-    threads=8
-)
-# """
-
-logger.info(f"start dl, {REPO_ID=}, {MODEL_FILENAME=}, {DESTINATION_FOLDER=}")
-download_quant(DESTINATION_FOLDER, REPO_ID, MODEL_FILENAME)
-logger.info("done dl")
-
-logger.debug(f"{os.cpu_count()=} {psutil.cpu_count(logical=False)=}")
-cpu_count = os.cpu_count() // 2  # type: ignore
-cpu_count = psutil.cpu_count(logical=False)
-
-logger.debug(f"{cpu_count=}")
-
-logger.info("load llm")
-
-_ = Path("models", MODEL_FILENAME).absolute().as_posix()
-logger.debug(f"model_file: {_}, exists: {Path(_).exists()}")
-LLM = AutoModelForCausalLM.from_pretrained(
-    # "TheBloke/WizardCoder-15B-1.0-GGML",
-    REPO_ID,  # DESTINATION_FOLDER,  # model_path_or_repo_id: str required
-    model_file=_,
-    model_type="llama",  # "starcoder", AutoConfig.from_pretrained(REPO_ID)
-    threads=cpu_count,
-)
-
-logger.info("done load llm")
-
-GENERATION_CONFIG = GenerationConfig(
-    temperature=0.2,
-    top_k=0,
-    top_p=0.9,
-    repetition_penalty=1.0,
-    max_new_tokens=512,  # adjust as needed
-    seed=42,
-    reset=False,  # reset history (cache)
-    stream=True,  # streaming per word/token
-    threads=cpu_count,
-    stop=["<|im_end|>", "|<"],  # TODO possible fix of stop
-)
-
 css = """
 .importantButton {
     background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
@@ -332,6 +286,8 @@ css = """
 """
 etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
 examples = [
+    ["What NFL team won the Super Bowl in the year Justin Bieber was born?"],
+    ["What NFL team won the Super Bowl in the year Justin Bieber was born? Think step by step."],
     ["How to pick a lock? Provide detailed steps."],
     ["Explain the plot of Cinderella in a sentence."],
     [
@@ -364,7 +320,7 @@
 
 with gr.Blocks(
     # title="mpt-30b-chat-ggml",
-    title=f"{MODEL_FILENAME}",
+    title=f"{Path(model_loc).name}",
     theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
     css=css,
 ) as block:
@@ -373,7 +329,7 @@
     #     """<center><a href="https://huggingface.co/spaces/mikeee/mpt-30b-chat?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate"></a> and spin a CPU UPGRADE to avoid the queue</center>"""
     # )
     gr.Markdown(
-        f"""<h5><center><{REPO_ID}>{MODEL_FILENAME}</center></h4>
+        f"""<h5><center>{Path(model_loc).name}</center></h5>
     The bot only speaks English.
 
     Most examples are meant for another model.
@@ -404,7 +360,7 @@
         with gr.Column(scale=2):
             system = gr.Textbox(
                 label="System Prompt",
-                value=default_system_prompt,
+                value=prompt_template,
                 show_label=False,
             ).style(container=False)
         with gr.Column():
@@ -421,7 +377,7 @@
 
     # with gr.Row():
     with gr.Accordion("Disclaimer", open=False):
-        _ = "-".join(MODEL_FILENAME.split("-")[:2])
+        _ = Path(model_loc).name
        gr.Markdown(
            f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
            "factually accurate information. {_} was trained on various public datasets; while great efforts "
@@ -449,7 +405,8 @@
     #     """
     msg.submit(
        # fn=conversation.user_turn,
-        fn=predict_str,
+        # fn=predict_str,
+        fn=predict,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=True,
@@ -457,7 +414,8 @@
        api_name="predict",
     ).then(bot_str, chatbot, chatbot)
     submit.click(
-        fn=lambda x, y: ("",) + predict_str(x, y)[1:],  # clear msg
+        # fn=lambda x, y: ("",) + predict_str(x, y)[1:],  # clear msg
+        fn=lambda x, y: ("",) + predict(x, y)[1:],  # clear msg
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=True,
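
For reference, the load-and-generate flow that app.py switches to in this commit can be exercised on its own roughly as follows. This is a minimal sketch, not part of the commit: the shortened system prompt and the ["User:"] stop list are simplifications of the values in app.py, and it assumes, as the code above does, that dl_hf_model(url) returns (local_path, size_in_GB).

# sketch.py -- stand-alone version of the new flow (simplified; assumptions noted above)
from dataclasses import asdict, dataclass, field

import psutil
from ctransformers import AutoModelForCausalLM
from dl_hf_model import dl_hf_model

url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/blob/main/llama-2-13b-chat.ggmlv3.q3_K_L.bin"  # 6.93G

# simplified prompt template (app.py uses a longer system message)
prompt_template = """System: You are a helpful, respectful and honest assistant.
User: {prompt}
Assistant: """


@dataclass
class GenerationConfig:
    temperature: float = 0.7
    top_k: int = 0
    top_p: float = 0.9
    repetition_penalty: float = 1.0
    max_new_tokens: int = 512
    seed: int = 42
    reset: bool = False
    stream: bool = True  # yield text chunks as they are produced
    threads: int = psutil.cpu_count(logical=False) or 2  # physical cores
    stop: list[str] = field(default_factory=lambda: ["User:"])  # simplified stop list


if __name__ == "__main__":
    model_loc, file_size = dl_hf_model(url)  # download (or reuse) the GGML file
    llm = AutoModelForCausalLM.from_pretrained(
        model_loc,
        model_type="llama",
        threads=psutil.cpu_count(logical=False) or 2,
    )
    prompt = prompt_template.format(prompt="Explain the plot of Cinderella in a sentence.")
    # stream=True makes the ctransformers call return a generator of text chunks
    for chunk in llm(prompt, **asdict(GenerationConfig())):
        print(chunk, end="", flush=True)
    print()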
requirements.txt CHANGED
@@ -1,7 +1,8 @@
-ctransformers==0.2.10
-transformers==4.30.2
-huggingface_hub
+ctransformers  # ==0.2.10
+transformers  # ==4.30.2
+# huggingface_hub
 gradio
 loguru
-about-time
-psutil
+# about-time
+psutil
+dl-hf-model