hgdgng committed
Commit: 903f1a6
Parent: 1e50a4a

Update app.py

Files changed (1): app.py (+23 -8)
app.py CHANGED
@@ -1,28 +1,43 @@
 import requests
 import torch
 from PIL import Image
-from transformers import MllamaForConditionalGeneration, AutoProcessor
+from transformers import LlamaForConditionalGeneration, AutoProcessor
 
+# Define the model ID, replace with the correct ID if needed
 model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
 
-model = MllamaForConditionalGeneration.from_pretrained(
+# Load the model in bfloat16 or float16 if needed
+model = LlamaForConditionalGeneration.from_pretrained(
     model_id,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
+    torch_dtype=torch.bfloat16,  # Change to torch.float16 if hardware doesn't support bfloat16
+    device_map="auto",  # Automatically selects the appropriate device
 )
+
+# Load the processor
 processor = AutoProcessor.from_pretrained(model_id)
 
+# Define an image URL
 url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+
+# Fetch the image using requests
 image = Image.open(requests.get(url, stream=True).raw)
 
+# Define the messages in a format the model understands (adjust as needed)
 messages = [
     {"role": "user", "content": [
-        {"type": "image"},
-        {"type": "text", "text": "If I had to write a haiku for this one, it would be: "}
+        {"type": "image"},  # This indicates that the input contains an image
+        {"type": "text", "text": "Can you please describe this image in one sentence?"}
     ]}
 ]
+
+# Generate input text with the processor
 input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
+
+# Process the image and input text, prepare them for the model
 inputs = processor(image, input_text, return_tensors="pt").to(model.device)
 
-output = model.generate(**inputs, max_new_tokens=30)
-print(processor.decode(output[0]))
+# Run the model to generate a response
+output = model.generate(**inputs, max_new_tokens=70)
+
+# Decode and print the output
+print(processor.decode(output[0][inputs["input_ids"].shape[-1]:]))
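For reference, a minimal runnable sketch of the updated script follows. The class that transformers actually ships for the Llama 3.2 vision checkpoints is MllamaForConditionalGeneration (as in the previous version of this file); the LlamaForConditionalGeneration name introduced in this commit would fail to import. The bf16/fp16 fallback check and skip_special_tokens=True are illustrative additions not present in the commit, and loading the gated meta-llama repo assumes the license has been accepted and an access token is configured.

import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Gated repo: requires an accepted license and a Hugging Face access token
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Prefer bfloat16, fall back to float16 on GPUs without bf16 support (illustrative addition)
dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16

model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=dtype,
    device_map="auto",  # place the weights on the available device(s)
)
processor = AutoProcessor.from_pretrained(model_id)

# Fetch an example image
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Chat-style prompt: an image placeholder followed by the text instruction
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Can you please describe this image in one sentence?"},
    ]}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, return_tensors="pt").to(model.device)

# Generate, then decode only the tokens produced after the prompt
output = model.generate(**inputs, max_new_tokens=70)
print(processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))

Slicing the output at inputs["input_ids"].shape[-1] keeps only the newly generated tokens, so the prompt is not echoed back in the printed response.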