google/owlv2-base-patch16-finetuned

from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
from transformers import Owlv2Processor, Owlv2ForObjectDetection


# Zero-shot object detection with a locally stored OWLv2 checkpoint: load the
# processor and model, run one image against a list of text queries, and print
# the detections that pass the score threshold.

# Local checkpoint directory; the processor and the model are both loaded from
# it, so the path is defined once.
MODEL_DIR = "/projects/TianchiCup/scenic/owlv2-base-patch16-finetuned/"

# Make GPU 1 the current CUDA device so the bare .cuda() calls below target it.
torch.cuda.set_device(1)

processor = AutoProcessor.from_pretrained(MODEL_DIR)

model = AutoModelForZeroShotObjectDetection.from_pretrained(MODEL_DIR)
model.cuda()
model.eval()

# Force three channels so greyscale / CMYK / RGBA files are handled the same
# way as ordinary RGB JPEGs by the processor.
image = Image.open('/projects/TianchiCup/scenic/000000039769.jpg').convert("RGB")

# One inner list of text queries per image in the batch.
texts = [["a photo of a cat", "a photo of a dog"]]
inputs = processor(text=texts, images=image, return_tensors="pt")
inputs = {key: value.cuda() for key, value in inputs.items()}

# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs)

# Target image sizes (height, width) to rescale box predictions [batch_size, 2].
# The OWLv2 image processor pads the image to a square before resizing (see the
# Transformers OWLv2 documentation), so the predicted boxes are relative to
# that square. Rescaling with a square of side max(width, height) therefore
# gives coordinates in the original image's pixel space; rescaling with the raw
# (height, width) would squash boxes along the shorter side.
padded_side = max(image.size)
target_sizes = torch.tensor([[padded_side, padded_side]], dtype=torch.float32)

print("Model Outputs:", outputs)
# Convert outputs (bounding boxes and class logits) into per-image results
# holding "boxes", "scores" and "labels", keeping only scores above threshold.
# NOTE(review): if this comes back empty, inspect the raw logits printed above
# to check that the locally converted checkpoint loaded its weights correctly
# -- unverified from this script alone.
results = processor.post_process_object_detection(outputs=outputs, threshold=0.1, target_sizes=target_sizes)
print("Post-processing Results:", results)


i = 0  # Retrieve predictions for the first image for the corresponding text queries
text = texts[i]
boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
print(boxes)
print(scores)
print(labels)
# Print detected objects and rescaled box coordinates
for box, score, label in zip(boxes, scores, labels):
    # A distinct name for the coordinate avoids shadowing the image index `i`.
    box = [round(coord, 2) for coord in box.tolist()]

    print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
the output is:
What went wrong?
google/owlv2-base-patch16-finetuned

No detection results are returned.