Spaces:

pg56714
/

YoloWGDinoArena

Sleeping

App Files Files Community

YoloWGDinoArena / app.py

pg56714

Update app.py

b235a0b verified 7 months ago

raw

history blame contribute delete

7.15 kB

	from typing import List
	import cv2
	import gradio as gr
	import numpy as np
	import supervision as sv
	from inference.models import YOLOWorld
	from PIL import Image
	import warnings

	warnings.filterwarnings("ignore")
	from groundingdino.util.inference import annotate as gd_annotate
	from groundingdino.util.inference import predict, load_model
	import groundingdino.datasets.transforms as T

	# Markdown text shown at the top of the demo page (passed to gr.Markdown in
	# the layout section): title, credits for the libraries/models used, and a
	# link to the source repository.
	MARKDOWN = """
	# YoloWGDinoArena

	Powered by Roboflow [Inference](https://github.com/roboflow/inference) and
	[Supervision](https://github.com/roboflow/supervision) and [YOLO-World](https://github.com/AILab-CVC/YOLO-World) and [Grounding DINO](https://github.com/IDEA-Research/GroundingDINO).
	\n
	Github Source Code: [Link](https://github.com/pg56714/YoloWGDinoArena)
	"""

	# GroundingDINO
	# Relative paths handed to load_model(): the model configuration module and
	# the checkpoint weights file.
	config_file = "./groundingdino/config/GroundingDINO_SwinT_OGC.py"
	# NOTE(review): "ckpt_filenmae" is a misspelling of "ckpt_filename"; the name
	# is left as-is because the load_model() call refers to it.
	ckpt_filenmae = "./weights/groundingdino_swint_ogc.pth"


	def image_transform_grounding(init_image):
	    """Build the GroundingDINO model input for an image.

	    The image goes through a three-step pipeline: ``RandomResize`` with a
	    single target size of 800 (``max_size=1333``), ``ToTensor`` and
	    ``Normalize`` with the per-channel mean/std constants below.

	    Args:
	        init_image: Source image (``run_grounding`` passes a PIL image).

	    Returns:
	        A ``(init_image, transformed)`` tuple: the unchanged input image and
	        the output of the transform pipeline.
	    """
	    steps = [
	        T.RandomResize([800], max_size=1333),
	        T.ToTensor(),
	        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
	    ]
	    preprocess = T.Compose(steps)
	    # The composed transform works on (image, target) pairs; there is no
	    # target here, so None is passed and the returned target is discarded.
	    transformed, _ = preprocess(init_image, None)
	    return init_image, transformed


	def image_transform_grounding_for_vis(init_image):
	    """Resize an image for visualization.

	    Applies only the ``RandomResize([800], max_size=1333)`` step, i.e. the
	    same resize used for the model input but without tensor conversion or
	    normalization.

	    Args:
	        init_image: Source image (``run_grounding`` passes a PIL image).

	    Returns:
	        The resized image.
	    """
	    resize_only = T.Compose([T.RandomResize([800], max_size=1333)])
	    # The transform returns an (image, target) pair; only the image matters.
	    resized, _ = resize_only(init_image, None)
	    return resized


	# Load the GroundingDINO model once at import time from the config and
	# checkpoint paths above; run_grounding() uses this module-level instance.
	model = load_model(config_file, ckpt_filenmae)


	def run_grounding(input_image, grounding_caption, box_threshold, text_threshold):
	    """Run GroundingDINO on an image and return it with detections drawn.

	    Args:
	        input_image: Image as a numpy array; it is cast to ``uint8`` and
	            interpreted as RGB.
	        grounding_caption: Text prompt passed to ``predict``.
	        box_threshold: Box threshold passed to ``predict``.
	        text_threshold: Text threshold passed to ``predict``.

	    Returns:
	        A PIL image (the resized visualization image) with the predicted
	        boxes, logits and phrases drawn on it.

	    Raises:
	        gr.Error: If no input image was provided.
	    """
	    # Without an uploaded image the input is None; fail with a readable
	    # message instead of an AttributeError on ``.astype``.
	    if input_image is None:
	        raise gr.Error("Please provide an input image.")

	    init_image = Image.fromarray(input_image.astype("uint8"), "RGB")

	    # Normalized input for the model, and a resize-only copy to draw on.
	    _, image_tensor = image_transform_grounding(init_image)
	    image_pil: Image.Image = image_transform_grounding_for_vis(init_image)

	    boxes, logits, phrases = predict(
	        model,
	        image_tensor,
	        grounding_caption,
	        box_threshold,
	        text_threshold,
	        device="cpu",
	    )
	    annotated_frame = gd_annotate(
	        image_source=np.asarray(image_pil),
	        boxes=boxes,
	        logits=logits,
	        phrases=phrases,
	    )
	    # The annotated frame is treated as BGR and converted to RGB for PIL.
	    image_with_box = Image.fromarray(cv2.cvtColor(annotated_frame, cv2.COLOR_BGR2RGB))

	    return image_with_box


	# GroundingDINO threshold sliders. Both span 0.0-1.0 in steps of 0.001 and
	# start at 0.25. They are created here and placed into the page later via
	# their .render() calls in the layout section.
	box_threshold = gr.Slider(
	    minimum=0.0, maximum=1.0, step=0.001, value=0.25, label="Box Threshold"
	)
	text_threshold = gr.Slider(
	    minimum=0.0, maximum=1.0, step=0.001, value=0.25, label="Text Threshold"
	)

	# -----------------------------------------------------------------------------------------------------------

	# YOLO-WORLD
	# -----------------------------------------------------------------------------------------------------------
	# YOLO-World model instance, created once at import time and shared by every
	# process_image() call (which sets its classes per request).
	YOLO_WORLD_MODEL = YOLOWorld(model_id="yolo_world/l")

	# Supervision annotators used by annotate_image() to draw boxes and labels.
	BOUNDING_BOX_ANNOTATOR = sv.BoxAnnotator()
	LABEL_ANNOTATOR = sv.LabelAnnotator()


	def process_categories(categories: str) -> List[str]:
	    """Split a comma-separated string into a list of category names.

	    Surrounding whitespace is stripped from every name. Empty entries, which
	    arise from an empty string or from doubled/trailing commas, are dropped
	    so that no blank class name is produced.

	    Args:
	        categories: Comma-separated category names, e.g. ``"cat, dog"``.

	    Returns:
	        The non-empty, stripped category names in their original order.
	    """
	    stripped = (category.strip() for category in categories.split(","))
	    return [category for category in stripped if category]


	def annotate_image(
	    input_image: np.ndarray,
	    detections: sv.Detections,
	    categories: List[str],
	    with_confidence: bool = True,
	) -> np.ndarray:
	    """Draw bounding boxes and text labels for detections on an image.

	    Args:
	        input_image: Image to annotate.
	        detections: Detections whose ``class_id`` values index into
	            ``categories``.
	        categories: Category names, looked up by class id.
	        with_confidence: When true, each label is ``"<name>: <conf>"`` with
	            the confidence formatted to three decimals; otherwise the label
	            is the name alone.

	    Returns:
	        The image returned by the label annotator after the box annotator
	        has been applied.
	    """
	    labels = []
	    for class_id, confidence in zip(detections.class_id, detections.confidence):
	        name = f"{categories[class_id]}"
	        if with_confidence:
	            labels.append(f"{name}: {confidence:.3f}")
	        else:
	            labels.append(name)

	    boxed_image = BOUNDING_BOX_ANNOTATOR.annotate(input_image, detections)
	    return LABEL_ANNOTATOR.annotate(boxed_image, detections, labels=labels)


	def process_image(
	    input_image: np.ndarray,
	    categories: str,
	    confidence_threshold: float,
	    nms_threshold: float,
	    with_confidence: bool = True,
	) -> np.ndarray:
	    """Run YOLO-World on an image and return it with detections drawn.

	    Args:
	        input_image: Image as a numpy array; converted RGB -> BGR before
	            annotation and back to RGB for the result.
	        categories: Comma-separated category names to detect.
	        confidence_threshold: Confidence passed to the model's ``infer``.
	        nms_threshold: Threshold for class-agnostic non-maximum suppression.
	        with_confidence: Whether labels include the confidence value.

	    Returns:
	        The annotated image as an RGB numpy array.

	    Raises:
	        gr.Error: If no input image or no category name was provided.
	    """
	    # Without an uploaded image the input is None; report it clearly
	    # instead of failing somewhere inside the model or OpenCV.
	    if input_image is None:
	        raise gr.Error("Please provide an input image.")

	    # Drop blank names so the model is never given an empty class prompt.
	    class_names = [name for name in process_categories(categories) if name]
	    if not class_names:
	        raise gr.Error("Please enter at least one category.")

	    YOLO_WORLD_MODEL.set_classes(class_names)
	    results = YOLO_WORLD_MODEL.infer(input_image, confidence=confidence_threshold)
	    detections = sv.Detections.from_inference(results).with_nms(
	        class_agnostic=True, threshold=nms_threshold
	    )

	    # Annotate a BGR copy of the input, then convert back for display.
	    output_image = cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR)
	    output_image = annotate_image(
	        input_image=output_image,
	        detections=detections,
	        categories=class_names,
	        with_confidence=with_confidence,
	    )

	    return cv2.cvtColor(output_image, cv2.COLOR_BGR2RGB)


	# Confidence threshold slider for YOLO-World; its value is passed to
	# process_image() as confidence_threshold.
	# The step is 0.001 so that the default of 0.005 lies on the slider's grid;
	# with a step of 0.01 the default could not be selected again once moved.
	confidence_threshold_component = gr.Slider(
	    minimum=0,
	    maximum=1.0,
	    value=0.005,
	    step=0.001,
	    label="Confidence Threshold",
	    # info=(
	    # "The confidence threshold for the YOLO-World model. Lower the threshold to "
	    # "reduce false negatives, enhancing the model's sensitivity to detect "
	    # "sought-after objects. Conversely, increase the threshold to minimize false "
	    # "positives, preventing the model from identifying objects it shouldn't."
	    # ),
	)

	# IoU threshold slider for YOLO-World; its value is passed to
	# process_image() as nms_threshold (used for non-maximum suppression).
	iou_threshold_component = gr.Slider(
	    label="IoU Threshold",
	    value=0.1,
	    minimum=0,
	    maximum=1.0,
	    step=0.01,
	    # info=(
	    # "The Intersection over Union (IoU) threshold for non-maximum suppression. "
	    # "Decrease the value to lessen the occurrence of overlapping bounding boxes, "
	    # "making the detection process stricter. On the other hand, increase the value "
	    # "to allow more overlapping bounding boxes, accommodating a broader range of "
	    # "detections."
	    # ),
	)

	# -----------------------------------------------------------------------------------------------------------

	# View
	# Page layout and event wiring: one input image, one output image per model,
	# a shared category textbox, and a single submit button.
	with gr.Blocks() as demo:
	gr.Markdown(MARKDOWN)
	with gr.Row():
	# The input is delivered to the handlers as a numpy array.
	input_image_component = gr.Image(type="numpy", label="Input Image")
	yolo_world_output_image_component = gr.Image(
	type="numpy", label="YOLO-WORLD Output"
	)
	# NOTE(review): "dion" in this name looks like a typo for "dino".
	grounding_dion_output_image_component = gr.Image(
	type="pil", label="GroundingDINO Output"
	)
	with gr.Row():
	# The same text is passed to both models (see the click handlers below).
	image_text_component = gr.Textbox(
	label="Categories",
	placeholder="you can input multiple words with comma (,)",
	scale=7,
	)
	submit_button_component = gr.Button(value="Submit", scale=1, variant="primary")

	with gr.Column():
	# The sliders were created at module level; .render() places them here.
	with gr.Accordion("YOLO-World", open=False):
	confidence_threshold_component.render()
	iou_threshold_component.render()

	with gr.Accordion("GroundingDINO", open=False):
	box_threshold.render()
	text_threshold.render()

	# Two handlers are registered on the same button: the first runs YOLO-World,
	# the second runs GroundingDINO, each writing to its own output image.
	submit_button_component.click(
	fn=process_image,
	inputs=[
	input_image_component,
	image_text_component,
	confidence_threshold_component,
	iou_threshold_component,
	],
	outputs=[
	yolo_world_output_image_component,
	],
	)

	submit_button_component.click(
	fn=run_grounding,
	inputs=[
	input_image_component,
	image_text_component,
	box_threshold,
	text_threshold,
	],
	outputs=[
	grounding_dion_output_image_component,
	],
	)

	# Start the app with debug disabled and show_error enabled.
	# The commented-out variant below additionally passes max_threads=1.
	# demo.launch(debug=False, show_error=True, max_threads=1)
	demo.launch(debug=False, show_error=True)