Samuel L Meyers committed on
Commit
4d31c25
·
1 Parent(s): e96f7f5
Files changed (3) hide show
  1. app.py +68 -8
  2. packages.txt +1 -0
  3. requirements.txt +7 -7
app.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
3
  import json
@@ -73,8 +76,9 @@ import os
73
  def merge_descriptions_to_prompt(mi, d1, d2):
74
  from together import Together
75
  tog = Together(api_key=os.getenv("TOGETHER_KEY"))
76
- res = tog.completions.create(prompt=f"""Describe what would result if the following two descriptions were describing one thing.
77
  ### Description 1:
 
78
  ```text
79
  {d1}
80
  ```
@@ -89,7 +93,9 @@ Merge-Specific Instructions:
89
  Ensure you end your output with ```\\n
90
  ---
91
  Complete Description:
92
- ```text""", model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
 
 
93
  return res.choices[0].text.split("```")[0]
94
 
95
  def xform_image_description(img, inst):
@@ -97,7 +103,7 @@ def xform_image_description(img, inst):
97
  from together import Together
98
  desc = dual_images(img)
99
  tog = Together(api_key=os.getenv("TOGETHER_KEY"))
100
- prompt=f"""Describe the image in aggressively verbose detail. I must know every freckle upon a man's brow and each blade of the grass intimately.\nDescription: ```text\n{desc}\n```\nInstructions:\n```text\n{inst}\n```\n\n\n---\nDetailed Description:\n```text"""
101
  res = tog.completions.create(prompt=prompt, model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
102
  return res.choices[0].text[len(prompt):].split("```")[0]
103
 
@@ -275,18 +281,18 @@ with gr.Blocks() as arch_room:
275
  gr.Markdown(f"""
276
  ## Arcanistry
277
 
278
-
279
  *POOF* -- You walk in, to a cloudy room filled with heavy smoke. In the center of the room rests a waist-height table. Upon the table, you see a... You don't understand... It's dark and light and cold and warm but... As you extend your hand, you hear the voice travel up your arm and into your ears...
280
 
281
  ---
282
- """)
283
  with gr.Row():
284
- cdd = gr.Code("""### Human
285
  I require a Python script that serves a simple file server in Python over MongoDB.
286
 
287
  ### Wizard
288
  Sure! Here's the script:
289
- ```python""", language="markdown")
290
  with gr.Row():
291
  wzs = gr.Code(json.dumps({
292
  'token': '<|wizard|>',
@@ -301,4 +307,58 @@ Sure! Here's the script:
301
 
302
  with gr.TabbedInterface([ifc_imgprompt2text, c_ifc := gr.ChatInterface(chat, chatbot=chatbot, submit_btn=gr.Button(scale=1)), gr.ChatInterface(wizard_chat), arch_room], ["Prompt & Image 2 Text", "Chat w/ Llama 3 70b", "Chat w/ WizardLM 8x22B", "Arcanistry"]) as ifc:
303
  shrd = gr.JSON(visible=False)
304
- ifc.launch(share=False, debug=True, show_error=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+
4
  import gradio as gr
5
  from transformers import AutoTokenizer, AutoModelForCausalLM
6
  import json
 
76
  def merge_descriptions_to_prompt(mi, d1, d2):
77
  from together import Together
78
  tog = Together(api_key=os.getenv("TOGETHER_KEY"))
79
+ res = tog.completions.create(prompt=f""" """Describe what would result if the following two descriptions were describing one thing.
80
  ### Description 1:
81
+ """ """
82
  ```text
83
  {d1}
84
  ```
 
93
  Ensure you end your output with ```\\n
94
  ---
95
  Complete Description:
96
+ ```text"""
97
+
98
+ """, model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
99
  return res.choices[0].text.split("```")[0]
100
 
101
  def xform_image_description(img, inst):
 
103
  from together import Together
104
  desc = dual_images(img)
105
  tog = Together(api_key=os.getenv("TOGETHER_KEY"))
106
+ prompt=f""" """Describe the image in aggressively verbose detail. I must know every freckle upon a man's brow and each blade of the grass intimately.\nDescription: ```text\n{desc}\n```\nInstructions:\n```text\n{inst}\n```\n\n\n---\nDetailed Description:\n```text """ """
107
  res = tog.completions.create(prompt=prompt, model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
108
  return res.choices[0].text[len(prompt):].split("```")[0]
109
 
 
281
  gr.Markdown(f"""
282
  ## Arcanistry
283
 
284
+ """
285
  *POOF* -- You walk in, to a cloudy room filled with heavy smoke. In the center of the room rests a waist-height table. Upon the table, you see a... You don't understand... It's dark and light and cold and warm but... As you extend your hand, you hear the voice travel up your arm and into your ears...
286
 
287
  ---
288
+ """ """)
289
  with gr.Row():
290
+ cdd = gr.Code(""" """### Human
291
  I require a Python script that serves a simple file server in Python over MongoDB.
292
 
293
  ### Wizard
294
  Sure! Here's the script:
295
+ ```python""" """, language="markdown")
296
  with gr.Row():
297
  wzs = gr.Code(json.dumps({
298
  'token': '<|wizard|>',
 
307
 
308
  with gr.TabbedInterface([ifc_imgprompt2text, c_ifc := gr.ChatInterface(chat, chatbot=chatbot, submit_btn=gr.Button(scale=1)), gr.ChatInterface(wizard_chat), arch_room], ["Prompt & Image 2 Text", "Chat w/ Llama 3 70b", "Chat w/ WizardLM 8x22B", "Arcanistry"]) as ifc:
309
  shrd = gr.JSON(visible=False)
310
+ ifc.launch(share=False, debug=True, show_error=True) """
311
+
312
+ from transformers import AutoTokenizer, AutoModelForCausalLM
313
+ import gradio as gr
314
+ import spaces
315
+ from PIL import Image
316
+ import hashlib
317
+ import base64
318
+
319
def load_md2():
    """Load the moondream2 vision-language model (pinned revision) onto CPU.

    Returns:
        The loaded ``AutoModelForCausalLM`` instance, kept on CPU until a
        request moves it to the GPU.
    """
    # trust_remote_code is required: moondream2 ships its own modeling code
    # in the model repo.  The revision pin keeps deployments reproducible.
    return AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2",
        device_map="cpu",
        trust_remote_code=True,
        revision="2025-01-09",
    )
322
+
323
# Module-level singleton: the moondream2 model is loaded once at import time
# and shared by every request handler in this process.
# NOTE: a bare `global md2` statement previously preceded this assignment;
# `global` is meaningless at module scope (names assigned here are already
# module globals), so the no-op statement has been removed.
md2 = load_md2()
326
+
327
@spaces.GPU()
def moondream2(question, image, history=None):
    """Run the shared moondream2 model on *image*.

    If *question* is a non-empty string the model answers it; otherwise the
    model produces a caption.  The result is appended to a copy of *history*
    (if given) as a record that also fingerprints the input image.

    Returns:
        The updated history list, twice — one copy feeds the visible JSON
        output component, the other the hidden history state component.
    """
    global md2
    model = md2

    # Occupy the GPU only for the duration of this request.
    model.cuda()

    # Fingerprint the input: SHA-256 hash and base64 of the raw bytes of a
    # 224x224 nearest-neighbour thumbnail.  (These are raw pixel bytes, not
    # an encoded image file.)
    bts = image.resize((224, 224), Image.NEAREST).tobytes()
    hsh = hashlib.sha256(bts).hexdigest()
    b64 = base64.b64encode(bts).decode('utf-8')

    has_question = question is not None and question != ""
    res = model.query(image, question) if has_question else model.caption(image)

    # Park the model back on the CPU so the GPU is free between requests.
    model.cpu()

    ress = list(history) if history is not None else []
    ress.append({
        "answer": res if has_question else None,
        "caption": res if not has_question else None,
        "sha256": hsh,
        "image_b64": b64,
    })
    return ress, ress
347
+
348
def gui():
    """Assemble the Gradio interface for moondream2 and start the server
    (local only — sharing is disabled)."""
    with gr.Blocks() as demo:
        with gr.Row():
            image_in = gr.Image(label="input", type="pil", elem_id="imgs")
        with gr.Row():
            prompt_in = gr.Textbox(label="prompt")
        with gr.Row():
            run_btn = gr.Button("Run")
        with gr.Row():
            output_json = gr.JSON(label="output")
        # Hidden row: carries the running history between clicks without
        # rendering it to the user.
        with gr.Row(visible=False):
            history_state = gr.JSON(label="history")
        run_btn.click(
            moondream2,
            inputs=[prompt_in, image_in, history_state],
            outputs=[output_json, history_state],
        )
    demo.launch(share=False)


if __name__ == "__main__":
    gui()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ libvips-dev
requirements.txt CHANGED
@@ -1,9 +1,9 @@
1
- gradio==4.31.3
2
- transformers==4.40.2
3
- accelerate==0.30.1
4
- einops==0.8.0
5
- pillow==10.3.0
6
- together==1.1.5
7
  torch
8
  torchvision
9
- accelerate==0.30.1
 
1
+ gradio
2
+ transformers
3
+ accelerate
4
+ einops
5
+ pillow
6
+ together
7
  torch
8
  torchvision
9
+ pyvips