Samuel L Meyers committed on
Commit
4d31c25
·
1 Parent(s): e96f7f5
Files changed (3) hide show
  1. app.py +68 -8
  2. packages.txt +1 -0
  3. requirements.txt +7 -7
app.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
3
  import json
@@ -73,8 +76,9 @@ import os
73
  def merge_descriptions_to_prompt(mi, d1, d2):
74
  from together import Together
75
  tog = Together(api_key=os.getenv("TOGETHER_KEY"))
76
- res = tog.completions.create(prompt=f"""Describe what would result if the following two descriptions were describing one thing.
77
  ### Description 1:
 
78
  ```text
79
  {d1}
80
  ```
@@ -89,7 +93,9 @@ Merge-Specific Instructions:
89
  Ensure you end your output with ```\\n
90
  ---
91
  Complete Description:
92
- ```text""", model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
 
 
93
  return res.choices[0].text.split("```")[0]
94
 
95
  def xform_image_description(img, inst):
@@ -97,7 +103,7 @@ def xform_image_description(img, inst):
97
  from together import Together
98
  desc = dual_images(img)
99
  tog = Together(api_key=os.getenv("TOGETHER_KEY"))
100
- prompt=f"""Describe the image in aggressively verbose detail. I must know every freckle upon a man's brow and each blade of the grass intimately.\nDescription: ```text\n{desc}\n```\nInstructions:\n```text\n{inst}\n```\n\n\n---\nDetailed Description:\n```text"""
101
  res = tog.completions.create(prompt=prompt, model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
102
  return res.choices[0].text[len(prompt):].split("```")[0]
103
 
@@ -275,18 +281,18 @@ with gr.Blocks() as arch_room:
275
  gr.Markdown(f"""
276
  ## Arcanistry
277
 
278
-
279
  *POOF* -- You walk in, to a cloudy room filled with heavy smoke. In the center of the room rests a waist-height table. Upon the table, you see a... You don't understand... It's dark and light and cold and warm but... As you extend your hand, you hear the voice travel up your arm and into your ears...
280
 
281
  ---
282
- """)
283
  with gr.Row():
284
- cdd = gr.Code("""### Human
285
  I require a Python script that serves a simple file server in Python over MongoDB.
286
 
287
  ### Wizard
288
  Sure! Here's the script:
289
- ```python""", language="markdown")
290
  with gr.Row():
291
  wzs = gr.Code(json.dumps({
292
  'token': '<|wizard|>',
@@ -301,4 +307,58 @@ Sure! Here's the script:
301
 
302
  with gr.TabbedInterface([ifc_imgprompt2text, c_ifc := gr.ChatInterface(chat, chatbot=chatbot, submit_btn=gr.Button(scale=1)), gr.ChatInterface(wizard_chat), arch_room], ["Prompt & Image 2 Text", "Chat w/ Llama 3 70b", "Chat w/ WizardLM 8x22B", "Arcanistry"]) as ifc:
303
  shrd = gr.JSON(visible=False)
304
- ifc.launch(share=False, debug=True, show_error=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+
4
  import gradio as gr
5
  from transformers import AutoTokenizer, AutoModelForCausalLM
6
  import json
 
76
  def merge_descriptions_to_prompt(mi, d1, d2):
77
  from together import Together
78
  tog = Together(api_key=os.getenv("TOGETHER_KEY"))
79
+ res = tog.completions.create(prompt=f""" """Describe what would result if the following two descriptions were describing one thing.
80
  ### Description 1:
81
+ """ """
82
  ```text
83
  {d1}
84
  ```
 
93
  Ensure you end your output with ```\\n
94
  ---
95
  Complete Description:
96
+ ```text"""
97
+
98
+ """, model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
99
  return res.choices[0].text.split("```")[0]
100
 
101
  def xform_image_description(img, inst):
 
103
  from together import Together
104
  desc = dual_images(img)
105
  tog = Together(api_key=os.getenv("TOGETHER_KEY"))
106
+ prompt=f""" """Describe the image in aggressively verbose detail. I must know every freckle upon a man's brow and each blade of the grass intimately.\nDescription: ```text\n{desc}\n```\nInstructions:\n```text\n{inst}\n```\n\n\n---\nDetailed Description:\n```text """ """
107
  res = tog.completions.create(prompt=prompt, model="meta-llama/Meta-Llama-3-70B", stop=["```"], max_tokens=1024)
108
  return res.choices[0].text[len(prompt):].split("```")[0]
109
 
 
281
  gr.Markdown(f"""
282
  ## Arcanistry
283
 
284
+ """
285
  *POOF* -- You walk in, to a cloudy room filled with heavy smoke. In the center of the room rests a waist-height table. Upon the table, you see a... You don't understand... It's dark and light and cold and warm but... As you extend your hand, you hear the voice travel up your arm and into your ears...
286
 
287
  ---
288
+ """ """)
289
  with gr.Row():
290
+ cdd = gr.Code(""" """### Human
291
  I require a Python script that serves a simple file server in Python over MongoDB.
292
 
293
  ### Wizard
294
  Sure! Here's the script:
295
+ ```python""" """, language="markdown")
296
  with gr.Row():
297
  wzs = gr.Code(json.dumps({
298
  'token': '<|wizard|>',
 
307
 
308
  with gr.TabbedInterface([ifc_imgprompt2text, c_ifc := gr.ChatInterface(chat, chatbot=chatbot, submit_btn=gr.Button(scale=1)), gr.ChatInterface(wizard_chat), arch_room], ["Prompt & Image 2 Text", "Chat w/ Llama 3 70b", "Chat w/ WizardLM 8x22B", "Arcanistry"]) as ifc:
309
  shrd = gr.JSON(visible=False)
310
+ ifc.launch(share=False, debug=True, show_error=True) """
311
+
312
+ from transformers import AutoTokenizer, AutoModelForCausalLM
313
+ import gradio as gr
314
+ import spaces
315
+ from PIL import Image
316
+ import hashlib
317
+ import base64
318
+
319
def load_md2():
    """Load the moondream2 vision-language model (pinned revision) onto CPU.

    Returns:
        The loaded ``AutoModelForCausalLM`` instance, kept on CPU until a
        request moves it to the GPU.
    """
    # trust_remote_code is required: moondream2 ships its own modeling code
    # in the model repo.  The revision pin keeps deployments reproducible.
    return AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2",
        device_map="cpu",
        trust_remote_code=True,
        revision="2025-01-09",
    )
322
+
323
# Module-level singleton: the moondream2 model is loaded once at import time
# and shared by every request handler in this process.
# NOTE: a bare `global md2` statement previously preceded this assignment;
# `global` is meaningless at module scope (names assigned here are already
# module globals), so the no-op statement has been removed.
md2 = load_md2()
326
+
327
@spaces.GPU()
def moondream2(question, image, history=None):
    """Run the shared moondream2 model on *image*.

    If *question* is a non-empty string the model answers it; otherwise the
    model produces a caption.  The result is appended to a copy of *history*
    (if given) as a record that also fingerprints the input image.

    Returns:
        The updated history list, twice — one copy feeds the visible JSON
        output component, the other the hidden history state component.
    """
    global md2
    model = md2

    # Occupy the GPU only for the duration of this request.
    model.cuda()

    # Fingerprint the input: SHA-256 hash and base64 of the raw bytes of a
    # 224x224 nearest-neighbour thumbnail.  (These are raw pixel bytes, not
    # an encoded image file.)
    bts = image.resize((224, 224), Image.NEAREST).tobytes()
    hsh = hashlib.sha256(bts).hexdigest()
    b64 = base64.b64encode(bts).decode('utf-8')

    has_question = question is not None and question != ""
    res = model.query(image, question) if has_question else model.caption(image)

    # Park the model back on the CPU so the GPU is free between requests.
    model.cpu()

    ress = list(history) if history is not None else []
    ress.append({
        "answer": res if has_question else None,
        "caption": res if not has_question else None,
        "sha256": hsh,
        "image_b64": b64,
    })
    return ress, ress
347
+
348
def gui():
    """Assemble the Gradio interface for moondream2 and start the server
    (local only — sharing is disabled)."""
    with gr.Blocks() as demo:
        with gr.Row():
            image_in = gr.Image(label="input", type="pil", elem_id="imgs")
        with gr.Row():
            prompt_in = gr.Textbox(label="prompt")
        with gr.Row():
            run_btn = gr.Button("Run")
        with gr.Row():
            output_json = gr.JSON(label="output")
        # Hidden row: carries the running history between clicks without
        # rendering it to the user.
        with gr.Row(visible=False):
            history_state = gr.JSON(label="history")
        run_btn.click(
            moondream2,
            inputs=[prompt_in, image_in, history_state],
            outputs=[output_json, history_state],
        )
    demo.launch(share=False)


if __name__ == "__main__":
    gui()
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ libvips-dev
requirements.txt CHANGED
@@ -1,9 +1,9 @@
1
- gradio==4.31.3
2
- transformers==4.40.2
3
- accelerate==0.30.1
4
- einops==0.8.0
5
- pillow==10.3.0
6
- together==1.1.5
7
  torch
8
  torchvision
9
- accelerate==0.30.1
 
1
+ gradio
2
+ transformers
3
+ accelerate
4
+ einops
5
+ pillow
6
+ together
7
  torch
8
  torchvision
9
+ pyvips